├── README.md
├── corpus
    ├── cot.py
    ├── kg-to-text.py
    ├── pa_ablation.py
    ├── pa_construct_chatgpt.py
    ├── pa_construct_mistral.py
    ├── pa_filter.py
    └── summary.py
├── figs
    └── 1.png
├── inference
    ├── closed
    │   ├── answer
    │   │   └── answer.py
    │   └── rewrite
    │   │   ├── infer_chain.py
    │   │   ├── infer_pa.py
    │   │   ├── infer_summary.py
    │   │   └── infer_text.py
    └── open
    │   ├── answer
    │       ├── answer.py
    │       └── answer_no.py
    │   ├── linearize.py
    │   ├── process_freebase.py
    │   ├── query_interface.py
    │   ├── retrieve
    │       ├── 2hop
    │       │   ├── 2hop.py
    │       │   ├── format.py
    │       │   ├── format
    │       │   │   ├── GraphQuestions.json
    │       │   │   └── grailqa.json
    │       │   ├── query_interface.py
    │       │   └── sim_compute.py
    │       └── bm25
    │       │   ├── build_index_sparse.sh
    │       │   ├── format.py
    │       │   ├── format
    │       │       ├── GraphQuestions.json
    │       │       └── grailqa.json
    │       │   ├── run_search_sparse.sh
    │       │   └── search.py
    │   └── rewrite
    │       ├── infer_chain.py
    │       ├── infer_pa.py
    │       ├── infer_summary.py
    │       └── infer_text.py
├── instruction-tuning
    ├── build_dataset.py
    ├── ds_zero2_no_offload.json
    ├── merge.py
    ├── run_clm_sft_with_peft-7b.py
    ├── run_clm_sft_with_peft-8b.py
    ├── run_dpo-step.sh
    ├── run_dpo.py
    ├── run_dpo.sh
    ├── run_llama-7b.sh
    └── run_llama-8b.sh
├── requirement1.txt
├── requirement2.txt
└── subgraph
    ├── GraphQuestions
        ├── gold
        │   └── test.json
        ├── graph_query.py
        ├── query_interface.py
        └── sparql_utils
        │   ├── load_kb.py
        │   ├── misc.py
        │   ├── sparql_engine.py
        │   ├── sparql_executor.py
        │   └── value_class.py
    ├── gold_graph.py
    └── grailqa
        ├── gold
            └── test.json
        ├── graph_query.py
        ├── query_interface.py
        └── sparql_utils
            ├── load_kb.py
            ├── misc.py
            ├── sparql_engine.py
            ├── sparql_executor.py
            └── value_class.py


/corpus/cot.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import math
  3 | import copy
  4 | import time
  5 | import random
  6 | from openai import OpenAI
  7 | from tqdm import tqdm
  8 | import os
  9 | 
 10 | client=OpenAI(api_key='YOUR KEY')
 11 | 
 12 | interval=200
 13 | DATA='GraphQuestions'
 14 | # set EX_RATE
 15 | if DATA in ['GraphQuestions','WebQSP']:
 16 |     EX_RATE=1
 17 | if DATA in ['grailqa']:
 18 |     EX_RATE=0.5
 19 |     
 20 | train=json.load(open('../subgraph/'+DATA+'/graph/train.json','r',encoding='utf-8'))
 21 | 
 22 | os.makedirs(DATA+'/finetune/'+DATA+'/CoT/train/',exist_ok=True)
 23 | os.makedirs(DATA+'/finetune/'+DATA+'/CoT/middle/',exist_ok=True)
 24 | 
 25 | kr_prompt='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 26 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 27 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 28 | Reason 1: I need to know which routed drug contains oxybutynin chloride 5 extended release film coated tablet.
 29 | Knowledge 1: "Oxybutynin Chloride Oral" is a type of routed drug and "Oxybutynin chloride 5 extended release film coated tablet" is one of the marketed formulations of "Oxybutynin Chloride Oral".
 30 | 
 31 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 32 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT1) (CVT1, education.education.institution, University of Michigan) (CVT1, education.education.institution, Stanford University)
 33 | Question: where did the founder of google go to college?
 34 | Reason 1: I need to know who is the founder of google.
 35 | Knowledge 1: Sergey Brin and Larry Page is the founder of google.
 36 | Reason 2: I need to know Sergey Brin and Larry Page go to which the university.
 37 | Knowledge 2: Sergey Brin studied at the University of Maryland, College Park and Stanford University. Larry Page studied at the University of Michigan and Stanford University.
 38 | 
 39 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 40 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 41 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 42 | Reason 1: I need to know the musical genre of the albums confessions tour.
 43 | Knowledge 1: The album confessions tour is associated with the genre Rock music and Electronica.
 44 | Reason 2: I need to know the parent genre of Rock music and Electronica.
 45 | Knowledge 2: The parent genre of Rock music is Folk music. The parent genre of Electronica is House music.
 46 | 
 47 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 48 | Triples: {triple}
 49 | Question: {ques}
 50 | '''
 51 | 
 52 | kr_prompt1='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 53 | Triples: {triple}
 54 | Question: {ques}
 55 | '''
 56 | 
 57 | ans_prompt='''Below are the facts that might be relevant to answer the question:
 58 | {knowledge}
 59 | Question: {ques}
 60 | Answer:'''
 61 | 
 62 | num_dict = {
 63 |         '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 64 |         '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
 65 |     }
 66 | 
 67 | def getResponse(prompt,max_retries=10):
 68 |     # set retries
 69 |     retries=0
 70 |     while retries < max_retries:
 71 |         try:
 72 |             res = client.chat.completions.create(
 73 |                 model='gpt-3.5-turbo',
 74 |                 #model='gpt-4',
 75 |                 messages=[
 76 |                     {'role': 'user', 'content': prompt}
 77 |                 ],
 78 |                 temperature=0,
 79 |             )
 80 |             return res.choices[0].message.content
 81 |         except Exception as e:
 82 |             print(f"An error occurred: {e}")
 83 |             print("Retrying in 1 minutes...")
 84 |             retries += 1
 85 |             time.sleep(60)
 86 |     return ''
 87 | 
 88 | data=[]
 89 | resume=0
 90 | #data=json.load(open('train-'+str(resume)+'.json','r',encoding='utf-8'))
 91 | index=resume
 92 | for sample in tqdm(train[resume:]):
 93 |     index+=1
 94 |     # gold graph
 95 |     gold_g=set()
 96 |     for i in sample['restrict_graph']:
 97 |         for j in i:
 98 |             temp='('+j[0]+', '+j[1]+', '+j[2]+')'
 99 |             gold_g.add(temp)
100 |     # shuffle gold graph
101 |     gold_g=list(gold_g)
102 |     random.shuffle(gold_g)
103 |     
104 |     # extend graph
105 |     extend=set()
106 |     for i in sample["ex_graph"]:
107 |         for j in i:
108 |             temp='('+j[0]+', '+j[1]+', '+j[2]+')'
109 |             if temp not in gold_g:
110 |                 extend.add(temp)
111 |     extend=list(extend)
112 |     random.shuffle(extend)
113 |     
114 |     # extend number filter
115 |     ex_filter=set()
116 |     NUM=math.ceil(len(gold_g)*EX_RATE) 
117 |     # first use no CVT triple
118 |     for i in extend:
119 |         if 'CVT' not in i:
120 |             ex_filter.add(i)
121 |         if len(ex_filter)==NUM:
122 |             break
123 |     # add CVT triple
124 |     if len(ex_filter)<NUM: 
125 |         for i in extend:
126 |             if 'CVT' in i:
127 |                 ex_filter.add(i)
128 |             if len(ex_filter)==NUM:
129 |                 break      
130 |     
131 |     # noisy graph
132 |     noisy=set(gold_g).union(ex_filter)
133 |     # random shuffle
134 |     noisy=list(noisy)
135 |     random.shuffle(noisy)
136 |     # noisy graph string
137 |     noisy_string=''
138 |     for i in noisy:
139 |         noisy_string=noisy_string+i+' '
140 |     
141 |     # data generation
142 |     # knowledge rewriter
143 |     knowledge=getResponse(kr_prompt.format(triple=noisy_string.strip(),ques=sample["question"]))
144 |     print(kr_prompt.format(triple=noisy_string.strip(),ques=sample["question"]))
145 |     print(knowledge)
146 |     # extract knowledge
147 |     know=''
148 |     for line in knowledge.split('\n'):
149 |         if not line.startswith('Reason') and len(line.strip())!=0:
150 |             if 'Knowledge ' in line:
151 |                 know=know+line.replace('Knowledge ','')[3:]+'\n'
152 |             else:
153 |                 know=know+line+'\n'
154 |     # format knowledge
155 |     format_know=''
156 |     for line in knowledge.split('\n'):
157 |         if line.startswith('Knowledge') or line.startswith('Reason'):
158 |             format_know=format_know+line+'\n'
159 |     # skip null format knowledge
160 |     if len(format_know.strip())==0:
161 |         continue
162 |     # knowledge augmented response
163 |     answer=getResponse(ans_prompt.format(knowledge=know.strip(),ques=sample["question"]))
164 |     print(ans_prompt.format(knowledge=know.strip(),ques=sample["question"]))
165 |     print(answer)
166 |     
167 |     # gold answer extraction
168 |     if DATA=='WebQSP':
169 |         gold=sample["answer"]
170 |     else:
171 |         gold=[]
172 |         for i in sample["answer"]:
173 |             if i.get("entity_name"):
174 |                 gold.append(i["entity_name"])
175 |             else:
176 |                 if i["answer_argument"].isdigit() and num_dict.get(i["answer_argument"]):
177 |                     gold.append(num_dict[i["answer_argument"]])
178 |                 else:
179 |                     gold.append(i["answer_argument"])
180 | 
181 |     # result
182 |     result=''
183 |     FLAG=True
184 |     for i in gold:
185 |         if i.lower() not in answer.lower():
186 |             FLAG=False
187 |             break
188 |     if FLAG:
189 |         result='correct'
190 |     else:
191 |         result='incorrect'
192 |     
193 |     # record
194 |     if FLAG:
195 |         samdict=dict()
196 |         samdict['question']=sample['question']
197 |         samdict['graph']=list(gold_g)
198 |         samdict['ex_graph']=noisy
199 |         samdict['know_prompt']=kr_prompt1.format(triple=noisy_string.strip(),ques=sample["question"])
200 |         samdict['knowledge']=format_know.strip()
201 |         samdict['answer']=gold
202 |         samdict['response']=answer
203 |         data.append(samdict)
204 |     if index%interval==0:
205 |         json.dump(data,open(DATA+'/finetune/'+DATA+'/CoT/middle/all-'+str(index)+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
206 | 
207 | # save
208 | random.shuffle(data)
209 | json.dump(data,open(DATA+'/finetune/'+DATA+'/CoT/all.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
210 | 
211 | # convert data to specific template
212 | train_num=int(len(data)*0.9)
213 | train=[]
214 | dev=[]
215 | for i in data[:train_num]:
216 |     temp=dict()
217 |     temp["instruction"]=i['know_prompt']
218 |     temp["input"]=''
219 |     temp["output"]=i['knowledge']
220 |     train.append(temp)
221 |     
222 | for i in data[train_num:]:
223 |     temp=dict()
224 |     temp["instruction"]=i['know_prompt']
225 |     temp["input"]=''
226 |     temp["output"]=i['knowledge']
227 |     dev.append(temp)
228 |     
229 | json.dump(train,open(DATA+'/finetune/'+DATA+'/CoT/train/train.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
230 | json.dump(dev,open(DATA+'/finetune/'+DATA+'/CoT/dev.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
231 | 


--------------------------------------------------------------------------------
/corpus/kg-to-text.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import math
  3 | import copy
  4 | import time
  5 | import random
  6 | from openai import OpenAI
  7 | from tqdm import tqdm
  8 | import os
  9 | 
 10 | interval=500
 11 | DATA='GraphQuestions'
 12 | # set EX_RATE
 13 | if DATA in ['GraphQuestions','WebQSP']:
 14 |     EX_RATE=1
 15 | if DATA in ['grailqa']:
 16 |     EX_RATE=0.5
 17 | 
 18 | train=json.load(open('../subgraph/'+DATA+'/graph/train.json','r',encoding='utf-8'))
 19 | 
 20 | os.makedirs(DATA+'/finetune/'+DATA+'/kg-to-text/train/',exist_ok=True)
 21 | os.makedirs(DATA+'/finetune/'+DATA+'/kg-to-text/middle/',exist_ok=True)
 22 | 
 23 | # set client
 24 | client=OpenAI(api_key='YOUR KEY')
 25 | 
 26 | kr_prompt='''Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin). The sentence is: Oxybutynin Oral is a medication that is administered orally. It is marketed in the form of Oxybutynin chloride 5 extended release film coated tablets. Another marketed formulation is Oxybutynin Chloride Oral. Furthermore, Oxybutynin chloride 5 extended release film coated tablet is a formulation of Oxybutynin.
 27 | 
 28 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University). The sentence is: Google was founded by Sergey Brin and Larry Page. Sergey Brin was educated at the University of Maryland, College Park, while Larry Page was educated at the University of Michigan and Stanford University.
 29 | 
 30 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.). The sentence is: Rock music, which is a subgenre of Folk music, includes artists like Outkast and albums such as "The Confessions Tour". Conversely, Electronica is a daughter genre of House music with artists like Bright Eyes and t.A.T.u., and also features albums like "The Confessions Tour".
 31 | 
 32 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: {graph}. The sentence is: '''
 33 | 
 34 | 
 35 | kr_prompt1='Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: {graph}. The sentence is: '
 36 | 
 37 | ans_prompt='''Below are the facts that might be relevant to answer the question:
 38 | {knowledge}
 39 | Question: {ques}
 40 | Answer:'''
 41 | 
 42 | num_dict = {
 43 |         '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 44 |         '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
 45 |     }
 46 | 
 47 | def getResponse(prompt,max_retries=10):
 48 |     # set retries
 49 |     retries=0
 50 |     while retries < max_retries:
 51 |         try:
 52 |             res = client.chat.completions.create(
 53 |                 model='gpt-3.5-turbo',
 54 |                 #model='gpt-4',
 55 |                 messages=[
 56 |                     {'role': 'user', 'content': prompt}
 57 |                 ],
 58 |                 temperature=0,
 59 |             )
 60 |             return res.choices[0].message.content
 61 |         except Exception as e:
 62 |             print(f"An error occurred: {e}")
 63 |             print("Retrying in 1 minutes...")
 64 |             retries += 1
 65 |             time.sleep(60)
 66 |     return ''
 67 | 
 68 | data=[]
 69 | resume=0
 70 | #data=json.load(open('train-'+str(resume)+'.json','r',encoding='utf-8'))
 71 | index=resume
 72 | for sample in tqdm(train[resume:]):
 73 |     index+=1
 74 |     # gold graph
 75 |     gold_g=set()
 76 |     for i in sample['restrict_graph']:
 77 |         for j in i:
 78 |             temp='('+j[0]+', '+j[1]+', '+j[2]+')'
 79 |             gold_g.add(temp)
 80 |     # shuffle gold graph
 81 |     gold_g=list(gold_g)
 82 |     random.shuffle(gold_g)
 83 |     
 84 |     # extend graph
 85 |     extend=set()
 86 |     for i in sample["ex_graph"]:
 87 |         for j in i:
 88 |             temp='('+j[0]+', '+j[1]+', '+j[2]+')'
 89 |             if temp not in gold_g:
 90 |                 extend.add(temp)
 91 |     extend=list(extend)
 92 |     random.shuffle(extend)
 93 |     
 94 |     # extend number filter
 95 |     ex_filter=set()
 96 |     NUM=math.ceil(len(gold_g)*EX_RATE) 
 97 |     # first use no CVT triple
 98 |     for i in extend:
 99 |         if 'CVT' not in i:
100 |             ex_filter.add(i)
101 |         if len(ex_filter)==NUM:
102 |             break
103 |     # add CVT triple
104 |     if len(ex_filter)<NUM: 
105 |         for i in extend:
106 |             if 'CVT' in i:
107 |                 ex_filter.add(i)
108 |             if len(ex_filter)==NUM:
109 |                 break      
110 |     
111 |     # noisy graph
112 |     noisy=set(gold_g).union(ex_filter)
113 |     # random shuffle
114 |     noisy=list(noisy)
115 |     random.shuffle(noisy)
116 |     # noisy graph string
117 |     noisy_string=''
118 |     for i in noisy:
119 |         noisy_string=noisy_string+i+' '
120 |     
121 |     # data generation
122 |     # knowledge rewriter
123 |     knowledge=getResponse(kr_prompt.format(graph=noisy_string.strip()))
124 |     print(kr_prompt.format(graph=noisy_string.strip()))
125 |     print(knowledge)
126 |     # skip null format knowledge
127 |     if len(knowledge.strip())==0:
128 |         continue
129 |     # knowledge augmented response
130 |     answer=getResponse(ans_prompt.format(knowledge=knowledge.strip(),ques=sample["question"]))
131 |     print(ans_prompt.format(knowledge=knowledge.strip(),ques=sample["question"]))
132 |     print(answer)
133 |     
134 |     # gold answer extraction
135 |     if DATA=='WebQSP':
136 |         gold=sample["answer"]
137 |     else:
138 |         gold=[]
139 |         for i in sample["answer"]:
140 |             if i.get("entity_name"):
141 |                 gold.append(i["entity_name"])
142 |             else:
143 |                 if i["answer_argument"].isdigit() and num_dict.get(i["answer_argument"]):
144 |                     gold.append(num_dict[i["answer_argument"]])
145 |                 else:
146 |                     gold.append(i["answer_argument"])
147 | 
148 |     # result
149 |     result=''
150 |     FLAG=True
151 |     for i in gold:
152 |         if i.lower() not in answer.lower():
153 |             FLAG=False
154 |             break
155 |     if FLAG:
156 |         result='correct'
157 |     else:
158 |         result='incorrect'
159 |     
160 |     # record
161 |     if FLAG:
162 |         samdict=dict()
163 |         samdict['question']=sample['question']
164 |         samdict['graph']=list(gold_g)
165 |         samdict['ex_graph']=noisy
166 |         samdict['know_prompt']=kr_prompt1.format(graph=noisy_string.strip())
167 |         samdict['knowledge']=knowledge.strip()
168 |         samdict['answer']=gold
169 |         samdict['response']=answer
170 |         data.append(samdict)
171 |     if index%interval==0:
172 |         json.dump(data,open(DATA+'/finetune/'+DATA+'/kg-to-text/middle/all-'+str(index)+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
173 | 
174 | # save
175 | random.shuffle(data)
176 | json.dump(data,open(DATA+'/finetune/'+DATA+'/kg-to-text/all.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
177 | 
178 | # convert data to specific template
179 | train_num=int(len(data)*0.9)
180 | train=[]
181 | dev=[]
182 | for i in data[:train_num]:
183 |     temp=dict()
184 |     temp["instruction"]=i['know_prompt']
185 |     temp["input"]=''
186 |     temp["output"]=i['knowledge']
187 |     train.append(temp)
188 |     
189 | for i in data[train_num:]:
190 |     temp=dict()
191 |     temp["instruction"]=i['know_prompt']
192 |     temp["input"]=''
193 |     temp["output"]=i['knowledge']
194 |     dev.append(temp)
195 |     
196 | json.dump(train,open(DATA+'/finetune/'+DATA+'/kg-to-text/train/train.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
197 | json.dump(dev,open(DATA+'/finetune/'+DATA+'/kg-to-text/dev.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
198 | 


--------------------------------------------------------------------------------
/corpus/pa_ablation.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import random
 4 | 
 5 | 
 6 | DATA='GraphQuestions'
 7 | # llm: Meta-Llama-3-8B-Instruct, llama-2-7b-chat-hf
 8 | LLM='llama-2-7b-chat-hf'
 9 | 
10 | data=json.load(open(DATA+'/PA-Mistral/CoT/'+LLM+'/sample.json','r',encoding='utf-8'))
11 | 
12 | ablation=[]
13 | 
14 | kr_prompt='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
15 | Triples: {triple}
16 | Question: {ques}
17 | '''
18 | 
19 | for sample in data:
20 |     if sample['source']=='single knowledge':
21 |         continue
22 |     if len(sample['prefer'])!=0:
23 |         verdict=sample['prefer'].split('\n')[-1]
24 |         if verdict=='A':
25 |             temp=dict()
26 |             temp['prompt']=kr_prompt.format(triple=sample["noisy"],ques=sample["question"])
27 |             temp['chosen']=sample["output_list"][0]
28 |             temp["rejected"]=sample["output_list"][1]
29 |             ablation.append(temp)
30 |         if verdict=='B':
31 |             temp=dict()
32 |             temp['prompt']=kr_prompt.format(triple=sample["noisy"],ques=sample["question"])
33 |             temp['chosen']=sample["output_list"][1]
34 |             temp["rejected"]=sample["output_list"][0]
35 |             ablation.append(temp)        
36 |     else:
37 |         num1=0
38 |         num2=0
39 |         for a in sample["answer"]:
40 |             if a in sample["output_list"][0]:
41 |                 num1+=1
42 |             if a in sample["output_list"][1]:
43 |                 num2+=1    
44 |         if num1>num2:
45 |             temp=dict()
46 |             temp['prompt']=kr_prompt.format(triple=sample["noisy"],ques=sample["question"])
47 |             temp['chosen']=sample["output_list"][0]
48 |             temp["rejected"]=sample["output_list"][1]
49 |             ablation.append(temp)
50 |         if num1<num2:     
51 |             temp=dict()
52 |             temp['prompt']=kr_prompt.format(triple=sample["noisy"],ques=sample["question"])
53 |             temp['chosen']=sample["output_list"][1]
54 |             temp["rejected"]=sample["output_list"][0]
55 |             ablation.append(temp)   
56 | 
57 | print(len(ablation))
58 | os.makedirs(DATA+'/PA-Mistral-ablation/CoT/'+LLM,exist_ok = True)
59 | random.shuffle(ablation)
60 | train_num=int(len(ablation)*0.9)
61 | json.dump(ablation[:train_num],open(DATA+'/PA-Mistral-ablation/CoT/'+LLM+'/train.json','w',encoding='utf-8'),ensure_ascii=False,indent=2)
62 | json.dump(ablation[train_num:],open(DATA+'/PA-Mistral-ablation/CoT/'+LLM+'/dev.json','w',encoding='utf-8'),ensure_ascii=False,indent=2)


--------------------------------------------------------------------------------
/corpus/pa_filter.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | from transformers import AutoTokenizer
 4 | from tqdm import tqdm
 5 | 
 6 | # grailqa, GraphQuestions
 7 | DATA='grailqa'
 8 | # llama-2-7b-chat-hf, bloom-3b, Meta-Llama-3-8B-Instruct
 9 | LLM='Meta-Llama-3-8B-Instruct'
10 | 
11 | data=json.load(open(DATA+'/PA-Mistral/CoT/'+LLM+'/middle/DPO-8000.json','r',encoding='utf-8'))
12 | tokenizer = AutoTokenizer.from_pretrained('../../pretrain/'+LLM)
13 | 
14 | def detect_repeated_text(text, threshold=5):
15 |     words = text.split()
16 |     word_count = {}
17 | 
18 |     for word in words:
19 |         word_count[word] = word_count.get(word, 0) + 1
20 | 
21 |     repeated_words = [word for word, count in word_count.items() if count > threshold]
22 |     
23 |     if len(repeated_words)>threshold:
24 |         return True
25 |     else:
26 |         return False
27 |     
28 |     return repeated_words
29 | 
30 | # filter too long sequence
31 | data1=[]
32 | num=0
33 | for sample in tqdm(data):
34 |     
35 |     p_l=len(tokenizer(sample['prompt'],return_tensors="pt")["input_ids"][0])
36 |     c_l=len(tokenizer(sample['chosen'],return_tensors="pt")["input_ids"][0])
37 |     r_l=len(tokenizer(sample['rejected'],return_tensors="pt")["input_ids"][0])
38 |     t_l=max(c_l,r_l)
39 |     s_l=p_l+t_l
40 |     
41 |     if p_l>1024 or c_l>512 or r_l>1024 or s_l>2048:
42 |         continue
43 |         
44 |     chosen=sample['chosen'].strip().split('\n')
45 |     if len(chosen)>6:
46 |         continue
47 |     FLAG=True
48 |     for index,line in enumerate(chosen):
49 |         if index%2==0:
50 |             if not line.startswith('Reason '):
51 |                 FLAG=False
52 |                 break
53 |         else:
54 |             if not line.startswith('Knowledge '):
55 |                 FLAG=False
56 |                 break            
57 |     if FLAG:
58 |         data1.append(sample)
59 |     
60 |     #if detect_repeated_text(sample['chosen']):
61 |     #    print(sample)
62 |     
63 | print(len(data1))
64 | 
65 | # divide into train and dev
66 | random.shuffle(data1)
67 | train_num=int(len(data1)*0.9)
68 | json.dump(data1[:train_num],open(DATA+'/PA-Mistral/CoT/'+LLM+'/train.json','w',encoding='utf-8'),ensure_ascii=False,indent=2)
69 | json.dump(data1[train_num:],open(DATA+'/PA-Mistral/CoT/'+LLM+'/dev.json','w',encoding='utf-8'),ensure_ascii=False,indent=2)
70 | 


--------------------------------------------------------------------------------
/corpus/summary.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import math
  3 | import copy
  4 | import time
  5 | import random
  6 | from openai import OpenAI
  7 | from tqdm import tqdm
  8 | import os
  9 | 
 10 | interval=500
 11 | DATA='GraphQuestions'
 12 | # set EX_RATE
 13 | if DATA in ['GraphQuestions','WebQSP']:
 14 |     EX_RATE=1
 15 | if DATA in ['grailqa']:
 16 |     EX_RATE=0.5
 17 | 
 18 | # set client
 19 | client=OpenAI(api_key='YOUR KEY')
 20 |     
 21 | train=json.load(open('../subgraph/'+DATA+'/graph/train.json','r',encoding='utf-8'))
 22 | 
 23 | os.makedirs(DATA+'/finetune/'+DATA+'/summary/train/',exist_ok=True)
 24 | os.makedirs(DATA+'/finetune/'+DATA+'/summary/middle/',exist_ok=True)
 25 | 
 26 | kr_prompt='''Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 27 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 28 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 29 | Knowledge: "Oxybutynin Chloride Oral" is a type of routed drug and "Oxybutynin chloride 5 extended release film coated tablet" is one of the marketed formulations of "Oxybutynin Chloride Oral".
 30 | 
 31 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 32 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University)
 33 | Question: where did the founder of google go to college?
 34 | Knowledge: The founders of Google are Sergey Brin and Larry Page. Sergey Brin attended the University of Maryland, College Park for his education, while Larry Page attended both the University of Michigan and Stanford University.
 35 | 
 36 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 37 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 38 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 39 | Knowledge: The album "The Confessions Tour" is associated with both the Rock music and Electronica genres. Rock music is the parent genre of Folk music, while Electronica is the parent genre of House music.
 40 | 
 41 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 42 | Triples: {triple}
 43 | Question: {ques}
 44 | Knowledge: '''
 45 | 
 46 | kr_prompt1='''Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 47 | Triples: {triple}
 48 | Question: {ques}
 49 | Knowledge: '''
 50 | 
 51 | ans_prompt='''Below are the facts that might be relevant to answer the question:
 52 | {knowledge}
 53 | Question: {ques}
 54 | Answer:'''
 55 | 
 56 | num_dict = {
 57 |         '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 58 |         '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
 59 |     }
 60 | 
 61 | def getResponse(prompt,max_retries=10):
 62 |     # set retries
 63 |     retries=0
 64 |     while retries < max_retries:
 65 |         try:
 66 |             res = client.chat.completions.create(
 67 |                 model='gpt-3.5-turbo',
 68 |                 #model='gpt-4',
 69 |                 messages=[
 70 |                     {'role': 'user', 'content': prompt}
 71 |                 ],
 72 |                 temperature=0,
 73 |             )
 74 |             return res.choices[0].message.content
 75 |         except Exception as e:
 76 |             print(f"An error occurred: {e}")
 77 |             print("Retrying in 1 minutes...")
 78 |             retries += 1
 79 |             time.sleep(60)
 80 |     return ''
 81 | 
 82 | data=[]
 83 | resume=0
 84 | #data=json.load(open('train-'+str(resume)+'.json','r',encoding='utf-8'))
 85 | index=resume
 86 | for sample in tqdm(train[resume:]):
 87 |     index+=1
 88 |     if index%interval==0:
 89 |         json.dump(data,open(DATA+'/finetune/'+DATA+'/summary/middle/all-'+str(index)+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
 90 |     # gold graph
 91 |     gold_g=set()
 92 |     for i in sample['restrict_graph']:
 93 |         for j in i:
 94 |             temp='('+j[0]+', '+j[1]+', '+j[2]+')'
 95 |             gold_g.add(temp)
 96 |     # shuffle gold graph
 97 |     gold_g=list(gold_g)
 98 |     random.shuffle(gold_g)
 99 |     
100 |     # extend graph
101 |     extend=set()
102 |     for i in sample["ex_graph"]:
103 |         for j in i:
104 |             temp='('+j[0]+', '+j[1]+', '+j[2]+')'
105 |             if temp not in gold_g:
106 |                 extend.add(temp)
107 |     extend=list(extend)
108 |     random.shuffle(extend)
109 |     
110 |     # extend number filter
111 |     ex_filter=set()
112 |     NUM=math.ceil(len(gold_g)*EX_RATE)
113 |     # first use no CVT triple
114 |     for i in extend:
115 |         if 'CVT' not in i:
116 |             ex_filter.add(i)
117 |         if len(ex_filter)==NUM:
118 |             break
119 |     # add CVT triple
120 |     if len(ex_filter)<NUM: 
121 |         for i in extend:
122 |             if 'CVT' in i:
123 |                 ex_filter.add(i)
124 |             if len(ex_filter)==NUM:
125 |                 break      
126 |     
127 |     # noisy graph
128 |     noisy=set(gold_g).union(ex_filter)
129 |     # random shuffle
130 |     noisy=list(noisy)
131 |     random.shuffle(noisy)
132 |     # noisy graph string
133 |     noisy_string=''
134 |     for i in noisy:
135 |         noisy_string=noisy_string+i+' '
136 |     
137 |     # data generation
138 |     # knowledge rewriter
139 |     knowledge=getResponse(kr_prompt.format(triple=noisy_string.strip(),ques=sample["question"]))
140 |     print(kr_prompt.format(triple=noisy_string.strip(),ques=sample["question"]))
141 |     print(knowledge)
142 |     # knowledge augmented response
143 |     answer=getResponse(ans_prompt.format(knowledge=knowledge.strip(),ques=sample["question"]))
144 |     print(ans_prompt.format(knowledge=knowledge.strip(),ques=sample["question"]))
145 |     print(answer)
146 |     
147 |     # gold answer extraction
148 |     if DATA=='WebQSP':
149 |         gold=sample["answer"]
150 |     else:
151 |         gold=[]
152 |         for i in sample["answer"]:
153 |             if i.get("entity_name"):
154 |                 gold.append(i["entity_name"])
155 |             else:
156 |                 gold.append(i["answer_argument"])
157 |     
158 |     # gold number
159 |     gold_num=[]
160 |     for i in gold:
161 |         if i.isdigit() and num_dict.get(i):
162 |             gold_num.append(num_dict[i])
163 | 
164 |     # result
165 |     cor=0
166 |     FLAG=False
167 |     FLAG1=True
168 |     FLAG2=False
169 |     # judge use gold_num or gold
170 |     for i in gold_num:
171 |         if i.lower() in answer.lower():
172 |             FLAG2=True
173 |             break
174 |     if FLAG2:
175 |         gold_ans=gold_num
176 |     else:
177 |         gold_ans=gold
178 | 
179 |     # result
180 |     result=''
181 |     FLAG=True
182 |     for i in gold_ans:
183 |         if i.lower() not in answer.lower():
184 |             FLAG=False
185 |             break
186 |     if FLAG:
187 |         result='correct'
188 |     else:
189 |         result='incorrect'
190 |     
191 |     # record
192 |     if FLAG:
193 |         samdict=dict()
194 |         samdict['question']=sample['question']
195 |         samdict['graph']=list(gold_g)
196 |         samdict['ex_graph']=noisy
197 |         samdict['know_prompt']=kr_prompt1.format(triple=noisy_string.strip(),ques=sample["question"])
198 |         samdict['knowledge']=knowledge.strip()
199 |         samdict['answer']=gold_ans
200 |         samdict['response']=answer
201 |         data.append(samdict)
202 | 
203 | # save
204 | random.shuffle(data)
205 | json.dump(data,open(DATA+'/finetune/'+DATA+'/summary/all.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
206 | 
207 | # convert data to specific template
208 | train_num=int(len(data)*0.9)
209 | train=[]
210 | dev=[]
211 | for i in data[:train_num]:
212 |     temp=dict()
213 |     temp["instruction"]=i['know_prompt']
214 |     temp["input"]=''
215 |     temp["output"]=i['knowledge']
216 |     train.append(temp)
217 |     
218 | for i in data[train_num:]:
219 |     temp=dict()
220 |     temp["instruction"]=i['know_prompt']
221 |     temp["input"]=''
222 |     temp["output"]=i['knowledge']
223 |     dev.append(temp)
224 |     
225 | json.dump(train,open(DATA+'/finetune/'+DATA+'/summary/train/train.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
226 | json.dump(dev,open(DATA+'/finetune/'+DATA+'/summary/dev.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
227 | 


--------------------------------------------------------------------------------
/figs/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wuyike2000/CoTKR/52c0bf01eba3858ec6f2ead68649273abdaeca31/figs/1.png


--------------------------------------------------------------------------------
/inference/closed/answer/answer.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=256
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # rewrite llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | # answer llm: Mistral-7B-Instruct-v0.3, chatgpt
 30 | ANS='Mistral-7B-Instruct-v0.3'
 31 | # knowledge representation: pa-chatgpt, pa-mistral, chain, summary, text, triple
 32 | KR='chain'
 33 | 
 34 | # load different knowledge representation
 35 | if KR=='triple':
 36 |     test=json.load(open('../../../subgraph/'+DATA+'/gold/test.json','r',encoding='utf-8'))
 37 | else:
 38 |     test=json.load(open('../rewrite/result/'+DATA+'/'+LLM+'/'+KR+'.json','r',encoding='utf-8'))
 39 |     
 40 | # set client
 41 | client=OpenAI(api_key='YOUR KEY')
 42 | 
 43 | ans_chain_prompt='''Your task is to answer the question based on the reasoning chain that might be relevant. Try to use the original words from the given knowledge to answer the question. But if it is not useful, just ignore it and generate your own guess.
 44 | {knowledge}
 45 | Question: {ques}
 46 | Answer: '''
 47 | 
 48 | ans_prompt='''Your task is to answer the question based on the knowledge that might be relevant. Try to use the original words from the given knowledge to answer the question. But if it is not useful, just ignore it and generate your own guess.
 49 | {knowledge}
 50 | Question: {ques}
 51 | Answer: '''
 52 | 
 53 | num_dict = {
 54 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 55 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 56 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 57 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 58 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 59 | }
 60 |    
 61 | # path for ANS
 62 | LLM_PATH='../../../../pretrain/'+ANS
 63 | # path for tokenizer
 64 | TOKENIZER_PATH='../../../../pretrain/'+ANS
 65 | # graph
 66 | origin=json.load(open('../../../subgraph/'+DATA+'/gold/test.json','r',encoding='utf-8'))
 67 | # result
 68 | if KR=='triple':
 69 |     result='result/'+ANS+'/'+DATA+'/triple.json'
 70 |     os.makedirs('result/'+ANS+'/'+DATA,exist_ok = True)
 71 |     log_file='log/'+ANS+'/'+DATA+'/triple.log'
 72 |     os.makedirs('log/'+ANS+'/'+DATA,exist_ok = True)
 73 | else:
 74 |     result='result/'+ANS+'/'+DATA+'/'+LLM+'/'+KR+'.json'
 75 |     os.makedirs('result/'+ANS+'/'+DATA+'/'+LLM,exist_ok = True)
 76 |     log_file='log/'+ANS+'/'+DATA+'/'+LLM+'/'+KR+'.log'
 77 |     os.makedirs('log/'+ANS+'/'+DATA+'/'+LLM,exist_ok = True)
 78 | 
 79 | # redirect output to log
 80 | sys.stdout = open(log_file, 'w')
 81 | 
 82 | if ANS!='chatgpt':
 83 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 84 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 85 | 
 86 | def getResponse(prompt,max_retries=10):
 87 |     # set retries
 88 |     retries=0
 89 |     while retries < max_retries:
 90 |         try:
 91 |             res = client.chat.completions.create(
 92 |                 model='gpt-3.5-turbo',
 93 |                 #model='gpt-4',
 94 |                 messages=[
 95 |                     {'role': 'user', 'content': prompt}
 96 |                 ],
 97 |                 temperature=0,
 98 |             )
 99 |             return res.choices[0].message.content
100 |         except Exception as e:
101 |             print(f"An error occurred: {e}")
102 |             print("Retrying in 1 minutes...")
103 |             retries += 1
104 |             time.sleep(60)
105 |     return ''
106 | 
107 | def LLMResponse(prompt,llm,tokenizer,cuda):
108 |     inputs = tokenizer(prompt,return_tensors="pt")
109 |     generation_output = llm.generate(
110 |             input_ids=inputs["input_ids"].to(cuda),
111 |             attention_mask=inputs['attention_mask'].to(cuda),
112 |             eos_token_id=tokenizer.eos_token_id,
113 |             pad_token_id=tokenizer.eos_token_id,
114 |             generation_config=generation_config
115 |         )
116 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
117 |     response = output.split(prompt)[-1].strip()
118 |     return response
119 | 
120 | def format(knowledge):
121 |     # format knowledge
122 |     if knowledge.startswith('Your task is to '):
123 |         # format knowledge: chain, pa
124 |         if 'Question: ' in knowledge:
125 |             knowledge=knowledge.split('\n')
126 |             knowledge='\n'.join(knowledge[3:])
127 |     
128 |         # format knowledge: summary
129 |         if 'Knowledge: ' in knowledge:
130 |             knowledge=knowledge.split('Knowledge: ')[1]
131 |         
132 |         # format knowledge: text
133 |         if 'The sentence is: ' in knowledge:
134 |             knowledge=knowledge.split('The sentence is: ')[1]
135 |     return knowledge
136 | 
137 | index=0
138 | f1=0
139 | acc=0
140 | EM=0
141 | data=[]
142 | for sample,sample1 in tqdm(zip(origin,test),total=len(origin)):
143 |     index+=1
144 |     if KR=='triple':
145 |         knowledge=sample1["triples"]
146 |     else:
147 |         knowledge=sample1["knowledge"]
148 |     
149 |     # format knowledge
150 |     knowledge=format(knowledge)
151 |     
152 |     # knowledge augmented response
153 |     if KR in ['pa-chatgpt','pa-mistral','chain']:
154 |         prompt=ans_chain_prompt.format(knowledge=knowledge.strip(),ques=sample["question"])
155 |     else:
156 |         prompt=ans_prompt.format(knowledge=knowledge.strip(),ques=sample["question"])
157 | 
158 |     if ANS!='chatgpt':
159 |         answer=LLMResponse(prompt,llm,tokenizer,'cuda:0')
160 |     else:
161 |         answer=getResponse(prompt)
162 |     print(prompt)
163 |     print(answer)
164 |     
165 |     # gold answer extraction
166 |     gold=sample["answer"]
167 |     
168 |     # gold number
169 |     gold_num=[]
170 |     for i in gold:
171 |         if i.isdigit() and num_dict.get(i):
172 |             gold_num.append(num_dict[i])
173 | 
174 |     # result
175 |     cor=0
176 |     FLAG=False
177 |     FLAG1=True
178 |     FLAG2=False
179 |     # judge use gold_num or gold
180 |     for i in gold_num:
181 |         if i.lower() in answer.lower():
182 |             FLAG2=True
183 |             break
184 |     if FLAG2:
185 |         gold_ans=gold_num
186 |     else:
187 |         gold_ans=gold
188 |     if len(answer)!=0:
189 |         for i in gold_ans:
190 |             if i.lower() in answer.lower():
191 |                 FLAG=True
192 |                 cor+=1
193 |             if i.lower() not in answer.lower():
194 |                 FLAG1=False      
195 |         if FLAG:
196 |             acc+=1
197 |         f1+=cor/len(gold_ans)
198 |         if FLAG1:
199 |             EM+=1    
200 |     else:
201 |         FLAG=False
202 |         FLAG1=False
203 |         cor=0
204 | 
205 |     # record
206 |     temp=dict()
207 |     temp['question']=sample['question']
208 |     temp['answer']=gold_ans
209 |     temp['graph']=sample["triples"]
210 |     temp['knowledge']=knowledge
211 |     temp['response']=answer
212 |     if FLAG:
213 |         temp['accuracy']=1
214 |     else:
215 |         temp['accuracy']=0
216 |     temp['f1']=cor/len(gold)
217 |     if FLAG1:
218 |         temp['EM']=1
219 |     else:
220 |         temp['EM']=0 
221 |     data.append(temp)
222 |     print('Current Accuracy: {}'.format(acc/index))
223 |     print('Current F1: {}'.format(f1/index))
224 |     print('Current EM: {}'.format(EM/index))
225 |     sys.stdout.flush()
226 | 
227 | print('Accuracy: {}'.format(acc/len(test)))
228 | print('F1: {}'.format(f1/len(test)))
229 | print('EM: {}'.format(EM/len(test)))
230 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)
231 | 


--------------------------------------------------------------------------------
/inference/closed/rewrite/infer_chain.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | 
 30 | # set client
 31 | client=OpenAI(api_key='YOUR KEY')
 32 | 
 33 | test=json.load(open('../../../subgraph/'+DATA+'/gold/test.json','r',encoding='utf-8'))
 34 | 
 35 | kr_prompt_llm='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 36 | Triples: {triple}
 37 | Question: {ques}
 38 | '''
 39 | 
 40 | kr_prompt_gpt='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 41 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 42 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 43 | Reason 1: I need to know which routed drug has the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 44 | Knowledge 1: The routed drugs Oxybutynin Oral and Oxybutynin Chloride Oral have the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 45 | 
 46 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 47 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University)
 48 | Question: where did the founder of google go to college?
 49 | Reason 1: I need to know who the founders of Google are.
 50 | Knowledge 1: The founders of Google are Sergey Brin and Larry Page.
 51 | Reason 2: I need to know where Sergey Brin and Larry Page went to college.
 52 | Knowledge 2: Sergey Brin went to the University of Maryland, College Park for college. Larry Page went to the University of Michigan and Stanford University for college.
 53 | 
 54 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 55 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 56 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 57 | Reason 1: I need to know the musical genre of the albums confessions tour.
 58 | Knowledge 1: The album confessions tour is associated with the genre Rock music and Electronica.
 59 | Reason 2: I need to know the parent genre of Rock music and Electronica.
 60 | Knowledge 2: The parent genre of Rock music is Folk music. The parent genre of Electronica is House music.
 61 | 
 62 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 63 | Triples: {triple}
 64 | Question: {ques}
 65 | '''
 66 | 
 67 | num_dict = {
 68 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 69 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 70 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 71 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 72 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 73 | }
 74 |    
 75 | if LLM!='chatgpt':
 76 |     # path for LLM
 77 |     LLM_PATH='../../../../pretrain/'+LLM
 78 |     # path for tokenizer
 79 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 80 |     # path for lora
 81 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/CoT/'+LLM+'/best_model'
 82 |     # load tokenizer and llm
 83 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 84 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 85 |     # merge peft into base LLM
 86 |     if PEFT_PATH:
 87 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
 88 | 
 89 | # result
 90 | result='result/'+DATA+'/'+LLM+'/chain.json'
 91 | os.makedirs('result/'+DATA+'/'+LLM,exist_ok = True)
 92 | log_file='log/'+DATA+'/'+LLM+'/chain.log'
 93 | os.makedirs('log/'+DATA+'/'+LLM,exist_ok = True)
 94 | 
 95 | # redirect output to log
 96 | sys.stdout = open(log_file, 'w')
 97 | 
 98 | def getResponse(prompt,max_retries=10):
 99 |     # set retries
100 |     retries=0
101 |     while retries < max_retries:
102 |         try:
103 |             res = client.chat.completions.create(
104 |                 model='gpt-3.5-turbo',
105 |                 #model='gpt-4',
106 |                 messages=[
107 |                     {'role': 'user', 'content': prompt}
108 |                 ],
109 |                 temperature=0,
110 |             )
111 |             return res.choices[0].message.content
112 |         except Exception as e:
113 |             print(f"An error occurred: {e}")
114 |             print("Retrying in 1 minutes...")
115 |             retries += 1
116 |             time.sleep(60)
117 |     return ''
118 |             
119 | def LLMResponse(prompt,llm,tokenizer,cuda):
120 |     inputs = tokenizer(prompt,return_tensors="pt")
121 |     generation_output = llm.generate(
122 |             input_ids=inputs["input_ids"].to(cuda),
123 |             attention_mask=inputs['attention_mask'].to(cuda),
124 |             eos_token_id=tokenizer.eos_token_id,
125 |             pad_token_id=tokenizer.eos_token_id,
126 |             generation_config=generation_config
127 |         )
128 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
129 |     response = output.split(prompt)[-1].strip()
130 |     return response
131 | 
132 | data=[]
133 | for sample in tqdm(test):
134 | 
135 |     # knowledge rewriter
136 |     if len(sample["triples"])!=0:
137 |         if LLM!='chatgpt':
138 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]),llm,tokenizer,'cuda:0')
139 |             print(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]))
140 |             print(knowledge)
141 |         else:
142 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
143 |             print(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
144 |             print(knowledge)            
145 |     else:
146 |         knowledge=''
147 | 
148 |     # record
149 |     temp=dict()
150 |     temp['question']=sample['question']
151 |     temp['answer']=sample["answer"]
152 |     temp['graph']=sample["triples"]
153 |     temp['knowledge']=knowledge
154 |     data.append(temp)
155 | 
156 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)


--------------------------------------------------------------------------------
/inference/closed/rewrite/infer_pa.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | 
 30 | # set client
 31 | client=OpenAI(api_key='YOUR KEY')
 32 | 
 33 | test=json.load(open('../../../subgraph/'+DATA+'/gold/test.json','r',encoding='utf-8'))
 34 | 
 35 | kr_prompt_llm='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 36 | Triples: {triple}
 37 | Question: {ques}
 38 | '''
 39 | 
 40 | kr_prompt_gpt='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 41 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 42 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 43 | Reason 1: I need to know which routed drug has the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 44 | Knowledge 1: The routed drugs Oxybutynin Oral and Oxybutynin Chloride Oral have the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 45 | 
 46 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 47 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University)
 48 | Question: where did the founder of google go to college?
 49 | Reason 1: I need to know who the founders of Google are.
 50 | Knowledge 1: The founders of Google are Sergey Brin and Larry Page.
 51 | Reason 2: I need to know where Sergey Brin and Larry Page went to college.
 52 | Knowledge 2: Sergey Brin went to the University of Maryland, College Park for college. Larry Page went to the University of Michigan and Stanford University for college.
 53 | 
 54 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 55 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 56 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 57 | Reason 1: I need to know the musical genre of the albums confessions tour.
 58 | Knowledge 1: The album confessions tour is associated with the genre Rock music and Electronica.
 59 | Reason 2: I need to know the parent genre of Rock music and Electronica.
 60 | Knowledge 2: The parent genre of Rock music is Folk music. The parent genre of Electronica is House music.
 61 | 
 62 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 63 | Triples: {triple}
 64 | Question: {ques}
 65 | '''
 66 | 
 67 | num_dict = {
 68 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 69 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 70 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 71 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 72 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 73 | }
 74 |    
 75 | if LLM!='chatgpt':
 76 |     # path for LLM
 77 |     LLM_PATH='../../../instruction-tuning/output-'+DATA+'/sft/CoT/'+LLM
 78 |     # path for tokenizer
 79 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 80 |     # path for lora
 81 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/PA-chatgpt/CoT/'+LLM+'/best_model'
 82 |     print(PEFT_PATH)
 83 |     # load tokenizer and llm
 84 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 85 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 86 |     # merge peft into base LLM
 87 |     if PEFT_PATH:
 88 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
 89 | 
 90 | # result
 91 | result='result/'+DATA+'/'+LLM+'/pa-chatgpt.json'
 92 | os.makedirs('result/'+DATA+'/'+LLM,exist_ok = True)
 93 | log_file='log/'+DATA+'/'+LLM+'/pa-chatgpt.log'
 94 | os.makedirs('log/'+DATA+'/'+LLM,exist_ok = True)
 95 | 
 96 | # redirect output to log
 97 | sys.stdout = open(log_file, 'w')
 98 | 
 99 | def getResponse(prompt,max_retries=10):
100 |     # set retries
101 |     retries=0
102 |     while retries < max_retries:
103 |         try:
104 |             res = client.chat.completions.create(
105 |                 model='gpt-3.5-turbo',
106 |                 #model='gpt-4',
107 |                 messages=[
108 |                     {'role': 'user', 'content': prompt}
109 |                 ],
110 |                 temperature=0,
111 |             )
112 |             return res.choices[0].message.content
113 |         except Exception as e:
114 |             print(f"An error occurred: {e}")
115 |             print("Retrying in 1 minutes...")
116 |             retries += 1
117 |             time.sleep(60)
118 |     return ''
119 |             
120 | def LLMResponse(prompt,llm,tokenizer,cuda):
121 |     inputs = tokenizer(prompt,return_tensors="pt")
122 |     generation_output = llm.generate(
123 |             input_ids=inputs["input_ids"].to(cuda),
124 |             attention_mask=inputs['attention_mask'].to(cuda),
125 |             eos_token_id=tokenizer.eos_token_id,
126 |             pad_token_id=tokenizer.eos_token_id,
127 |             generation_config=generation_config
128 |         )
129 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
130 |     response = output.split(prompt)[-1].strip()
131 |     return response
132 | 
133 | data=[]
134 | for sample in tqdm(test):
135 | 
136 |     # knowledge rewriter
137 |     if len(sample["triples"])!=0:
138 |         if LLM!='chatgpt':
139 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]),llm,tokenizer,'cuda:0')
140 |             print(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]))
141 |             print(knowledge)
142 |         else:
143 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
144 |             print(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
145 |             print(knowledge)            
146 |     else:
147 |         knowledge=''
148 | 
149 |     # record
150 |     temp=dict()
151 |     temp['question']=sample['question']
152 |     temp['answer']=sample["answer"]
153 |     temp['graph']=sample["triples"]
154 |     temp['knowledge']=knowledge
155 |     data.append(temp)
156 | 
157 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)


--------------------------------------------------------------------------------
/inference/closed/rewrite/infer_summary.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | 
 30 | # set client
 31 | client=OpenAI(api_key='YOUR KEY')
 32 | 
 33 | test=json.load(open('../../../subgraph/'+DATA+'/gold/test.json','r',encoding='utf-8'))
 34 | 
 35 | kr_prompt_llm='''Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 36 | Triples: {triple}
 37 | Question: {ques}
 38 | Knowledge: '''
 39 | 
 40 | kr_prompt_gpt='''Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 41 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 42 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 43 | Knowledge: The Oxybutynin chloride 5 extended release film coated tablet is a marketed formulation of the routed drugs Oxybutynin Oral and Oxybutynin Chloride Oral.
 44 | 
 45 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 46 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University)
 47 | Question: where did the founder of google go to college?
 48 | Knowledge: The founders of Google are Sergey Brin and Larry Page. Sergey Brin attended the University of Maryland, College Park for his education. Larry Page attended the University of Michigan and Stanford University for his education.
 49 | 
 50 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 51 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 52 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 53 | Knowledge: The album confessions tour is associated with the genre Rock music and Electronica. The parent genre of Rock music is Folk music. The parent genre of Electronica is House music.
 54 | 
 55 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 56 | Triples: {triple}
 57 | Question: {ques}
 58 | Knowledge: '''
 59 | 
 60 | num_dict = {
 61 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 62 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 63 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 64 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 65 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 66 | }
 67 |    
 68 | if LLM!='chatgpt':
 69 |     # path for LLM
 70 |     LLM_PATH='../../../../pretrain/'+LLM
 71 |     # path for tokenizer
 72 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 73 |     # path for lora
 74 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/summary/'+LLM+'/best_model'
 75 |     # load tokenizer and llm
 76 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 77 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 78 |     # merge peft into base LLM
 79 |     if PEFT_PATH:
 80 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
 81 |     
 82 | # result
 83 | result='result/'+DATA+'/'+LLM+'/summary.json'
 84 | os.makedirs('result/'+DATA+'/'+LLM,exist_ok = True)
 85 | log_file='log/'+DATA+'/'+LLM+'/summary.log'
 86 | os.makedirs('log/'+DATA+'/'+LLM,exist_ok = True)
 87 | 
 88 | # redirect output to log
 89 | sys.stdout = open(log_file, 'w')
 90 | 
 91 | def getResponse(prompt,max_retries=10):
 92 |     # set retries
 93 |     retries=0
 94 |     while retries < max_retries:
 95 |         try:
 96 |             res = client.chat.completions.create(
 97 |                 model='gpt-3.5-turbo',
 98 |                 #model='gpt-4',
 99 |                 messages=[
100 |                     {'role': 'user', 'content': prompt}
101 |                 ],
102 |                 temperature=0,
103 |             )
104 |             return res.choices[0].message.content
105 |         except Exception as e:
106 |             print(f"An error occurred: {e}")
107 |             print("Retrying in 1 minutes...")
108 |             retries += 1
109 |             time.sleep(60)
110 |     return ''
111 |             
112 | def LLMResponse(prompt,llm,tokenizer,cuda):
113 |     inputs = tokenizer(prompt,return_tensors="pt")
114 |     generation_output = llm.generate(
115 |             input_ids=inputs["input_ids"].to(cuda),
116 |             attention_mask=inputs['attention_mask'].to(cuda),
117 |             eos_token_id=tokenizer.eos_token_id,
118 |             pad_token_id=tokenizer.eos_token_id,
119 |             generation_config=generation_config
120 |         )
121 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
122 |     response = output.split(prompt)[-1].strip()
123 |     return response
124 | 
125 | data=[]
126 | for sample in tqdm(test):
127 | 
128 |     # knowledge rewriter
129 |     if len(sample["triples"])!=0:
130 |         if LLM!='chatgpt':
131 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]),llm,tokenizer,'cuda:0')
132 |             print(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]))
133 |             print(knowledge)
134 |         else:
135 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
136 |             print(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
137 |             print(knowledge)            
138 |     else:
139 |         knowledge=''
140 | 
141 |     # record
142 |     temp=dict()
143 |     temp['question']=sample['question']
144 |     temp['answer']=sample["answer"]
145 |     temp['graph']=sample["triples"]
146 |     temp['knowledge']=knowledge
147 |     data.append(temp)
148 | 
149 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)


--------------------------------------------------------------------------------
/inference/closed/rewrite/infer_text.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | 
 30 | # set client
 31 | client=OpenAI(api_key='YOUR KEY')
 32 | 
 33 | test=json.load(open('../../../subgraph/'+DATA+'/gold/test.json','r',encoding='utf-8'))
 34 | 
 35 | kr_prompt_llm='''Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: {triple}. The sentence is: '''
 36 | 
 37 | kr_prompt_gpt='''Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin). The sentence is: Oxybutynin Oral is a medication that is administered orally. It is marketed in the form of Oxybutynin chloride 5 extended release film coated tablets. Another marketed formulation is Oxybutynin Chloride Oral. Furthermore, Oxybutynin chloride 5 extended release film coated tablet is a formulation of Oxybutynin.
 38 | 
 39 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University). The sentence is: Google was founded by Sergey Brin and Larry Page. Sergey Brin was educated at the University of Maryland, College Park, while Larry Page was educated at the University of Michigan and Stanford University.
 40 | 
 41 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.). The sentence is: Rock music, which is a subgenre of Folk music, includes artists like Outkast and albums such as "The Confessions Tour". Conversely, Electronica is a daughter genre of House music with artists like Bright Eyes and t.A.T.u., and also features albums like "The Confessions Tour".
 42 | 
 43 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: {triple}. The sentence is: '''
 44 | 
 45 | num_dict = {
 46 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 47 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 48 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 49 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 50 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 51 | }
 52 |    
 53 | if LLM!='chatgpt':
 54 |     # path for LLM
 55 |     LLM_PATH='../../../../pretrain/'+LLM
 56 |     # path for tokenizer
 57 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 58 |     # path for lora
 59 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/kg-to-text/'+LLM+'/best_model'
 60 |     # load tokenizer and llm
 61 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 62 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 63 |     # merge peft into base LLM
 64 |     if PEFT_PATH:
 65 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
 66 |         
 67 | # result
 68 | result='result/'+DATA+'/'+LLM+'/text.json'
 69 | os.makedirs('result/'+DATA+'/'+LLM,exist_ok = True)
 70 | log_file='log/'+DATA+'/'+LLM+'/text.log'
 71 | os.makedirs('log/'+DATA+'/'+LLM,exist_ok = True)
 72 | 
 73 | # redirect output to log
 74 | sys.stdout = open(log_file, 'w')
 75 | 
 76 | def getResponse(prompt,max_retries=10):
 77 |     # set retries
 78 |     retries=0
 79 |     while retries < max_retries:
 80 |         try:
 81 |             res = client.chat.completions.create(
 82 |                 model='gpt-3.5-turbo',
 83 |                 #model='gpt-4',
 84 |                 messages=[
 85 |                     {'role': 'user', 'content': prompt}
 86 |                 ],
 87 |                 temperature=0,
 88 |             )
 89 |             return res.choices[0].message.content
 90 |         except Exception as e:
 91 |             print(f"An error occurred: {e}")
 92 |             print("Retrying in 1 minutes...")
 93 |             retries += 1
 94 |             time.sleep(60)
 95 |     return ''
 96 |             
 97 | def LLMResponse(prompt,llm,tokenizer,cuda):
 98 |     inputs = tokenizer(prompt,return_tensors="pt",max_length=4096,truncation=True)
 99 |     generation_output = llm.generate(
100 |             input_ids=inputs["input_ids"].to(cuda),
101 |             attention_mask=inputs['attention_mask'].to(cuda),
102 |             eos_token_id=tokenizer.eos_token_id,
103 |             pad_token_id=tokenizer.eos_token_id,
104 |             generation_config=generation_config
105 |         )
106 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
107 |     response = output.split(prompt)[-1].strip()
108 |     return response
109 |     
110 | data=[]
111 | for sample in tqdm(test):
112 | 
113 |     # knowledge rewriter
114 |     if len(sample["triples"])!=0:
115 |         if LLM!='chatgpt':
116 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"]),llm,tokenizer,'cuda:0')
117 |             print(kr_prompt_llm.format(triple=sample["triples"]))
118 |             print(knowledge)
119 |         else:
120 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"]))
121 |             print(kr_prompt_gpt.format(triple=sample["triples"]))
122 |             print(knowledge)            
123 |     else:
124 |         knowledge=''
125 | 
126 |     # record
127 |     temp=dict()
128 |     temp['question']=sample['question']
129 |     temp['answer']=sample["answer"]
130 |     temp['graph']=sample["triples"]
131 |     temp['knowledge']=knowledge
132 |     data.append(temp)
133 | 
134 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)


--------------------------------------------------------------------------------
/inference/open/answer/answer.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=256
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # rewrite llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | # answer llm: Mistral-7B-Instruct-v0.3, chatgpt
 30 | ANS='Mistral-7B-Instruct-v0.3'
 31 | # retrieve method: bm25, 2hop
 32 | MODE='2hop'
 33 | # knowledge representation: pa-chatgpt, pa-mistral, chain, summary, text, triple
 34 | KR='chain'
 35 | 
 36 | # load different knowledge representation
 37 | if KR=='triple':
 38 |     test=json.load(open('../retrieve/'+MODE+'/format/'+DATA+'.json','r',encoding='utf-8'))
 39 | else:
 40 |     test=json.load(open('../rewrite/result/'+DATA+'/'+MODE+'/'+LLM+'/'+KR+'.json','r',encoding='utf-8'))
 41 |     
 42 | # set client
 43 | client=OpenAI(api_key='YOUR KEY')
 44 | 
 45 | ans_chain_prompt='''Your task is to answer the question based on the reasoning chain that might be relevant. Try to use the original words from the given knowledge to answer the question. But if it is not useful, just ignore it and generate your own guess.
 46 | {knowledge}
 47 | Question: {ques}
 48 | Answer: '''
 49 | 
 50 | ans_prompt='''Your task is to answer the question based on the knowledge that might be relevant. Try to use the original words from the given knowledge to answer the question. But if it is not useful, just ignore it and generate your own guess.
 51 | {knowledge}
 52 | Question: {ques}
 53 | Answer: '''
 54 | 
 55 | num_dict = {
 56 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 57 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 58 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 59 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 60 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 61 | }
 62 |    
 63 | # path for ANS
 64 | LLM_PATH='../../../../pretrain/'+ANS
 65 | # path for tokenizer
 66 | TOKENIZER_PATH='../../../../pretrain/'+ANS
 67 | # graph
 68 | origin=json.load(open('../../../subgraph/'+DATA+'/gold/test.json','r',encoding='utf-8'))
 69 | # result
 70 | if KR=='triple':
 71 |     result='result/'+ANS+'/'+DATA+'/'+MODE+'/triple.json'
 72 |     os.makedirs('result/'+ANS+'/'+DATA+'/'+MODE,exist_ok = True)
 73 |     log_file='log/'+ANS+'/'+DATA+'/'+MODE+'/triple.log'
 74 |     os.makedirs('log/'+ANS+'/'+DATA+'/'+MODE,exist_ok = True)
 75 | else:
 76 |     result='result/'+ANS+'/'+DATA+'/'+MODE+'/'+LLM+'/'+KR+'.json'
 77 |     os.makedirs('result/'+ANS+'/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 78 |     log_file='log/'+ANS+'/'+DATA+'/'+MODE+'/'+LLM+'/'+KR+'.log'
 79 |     os.makedirs('log/'+ANS+'/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 80 | 
 81 | # redirect output to log
 82 | sys.stdout = open(log_file, 'w')
 83 | 
 84 | if ANS!='chatgpt':
 85 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 86 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 87 | 
 88 | def getResponse(prompt,max_retries=10):
 89 |     # set retries
 90 |     retries=0
 91 |     while retries < max_retries:
 92 |         try:
 93 |             res = client.chat.completions.create(
 94 |                 model='gpt-3.5-turbo',
 95 |                 #model='gpt-4',
 96 |                 messages=[
 97 |                     {'role': 'user', 'content': prompt}
 98 |                 ],
 99 |                 temperature=0,
100 |             )
101 |             return res.choices[0].message.content
102 |         except Exception as e:
103 |             print(f"An error occurred: {e}")
104 |             print("Retrying in 1 minutes...")
105 |             retries += 1
106 |             time.sleep(60)
107 |     return ''
108 | 
109 | def LLMResponse(prompt,llm,tokenizer,cuda):
110 |     inputs = tokenizer(prompt,return_tensors="pt")
111 |     generation_output = llm.generate(
112 |             input_ids=inputs["input_ids"].to(cuda),
113 |             attention_mask=inputs['attention_mask'].to(cuda),
114 |             eos_token_id=tokenizer.eos_token_id,
115 |             pad_token_id=tokenizer.eos_token_id,
116 |             generation_config=generation_config
117 |         )
118 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
119 |     response = output.split(prompt)[-1].strip()
120 |     return response
121 | 
122 | def format(knowledge):
123 |     # format knowledge
124 |     if knowledge.startswith('Your task is to '):
125 |         # format knowledge: chain, pa
126 |         if 'Question: ' in knowledge:
127 |             knowledge=knowledge.split('\n')
128 |             knowledge='\n'.join(knowledge[3:])
129 |     
130 |         # format knowledge: summary
131 |         if 'Knowledge: ' in knowledge:
132 |             knowledge=knowledge.split('Knowledge: ')[1]
133 |         
134 |         # format knowledge: text
135 |         if 'The sentence is: ' in knowledge:
136 |             knowledge=knowledge.split('The sentence is: ')[1]
137 |     return knowledge
138 | 
139 | index=0
140 | f1=0
141 | acc=0
142 | EM=0
143 | data=[]
144 | for sample,sample1 in tqdm(zip(origin,test),total=len(origin)):
145 |     index+=1
146 |     if KR=='triple':
147 |         knowledge=sample1["triples"]
148 |     else:
149 |         knowledge=sample1["knowledge"]
150 |     
151 |     # format knowledge
152 |     knowledge=format(knowledge)
153 |     
154 |     # knowledge augmented response
155 |     if KR in ['pa-chatgpt','pa-mistral','chain']:
156 |         prompt=ans_chain_prompt.format(knowledge=knowledge.strip(),ques=sample["question"])
157 |     else:
158 |         prompt=ans_prompt.format(knowledge=knowledge.strip(),ques=sample["question"])
159 | 
160 |     if ANS!='chatgpt':
161 |         answer=LLMResponse(prompt,llm,tokenizer,'cuda:0')
162 |     else:
163 |         answer=getResponse(prompt)
164 |     print(prompt)
165 |     print(answer)
166 |     
167 |     # gold answer extraction
168 |     gold=sample["answer"]
169 |     
170 |     # gold number
171 |     gold_num=[]
172 |     for i in gold:
173 |         if i.isdigit() and num_dict.get(i):
174 |             gold_num.append(num_dict[i])
175 | 
176 |     # result
177 |     cor=0
178 |     FLAG=False
179 |     FLAG1=True
180 |     FLAG2=False
181 |     # judge use gold_num or gold
182 |     for i in gold_num:
183 |         if i.lower() in answer.lower():
184 |             FLAG2=True
185 |             break
186 |     if FLAG2:
187 |         gold_ans=gold_num
188 |     else:
189 |         gold_ans=gold
190 |     if len(answer)!=0:
191 |         for i in gold_ans:
192 |             if i.lower() in answer.lower():
193 |                 FLAG=True
194 |                 cor+=1
195 |             if i.lower() not in answer.lower():
196 |                 FLAG1=False      
197 |         if FLAG:
198 |             acc+=1
199 |         f1+=cor/len(gold_ans)
200 |         if FLAG1:
201 |             EM+=1    
202 |     else:
203 |         FLAG=False
204 |         FLAG1=False
205 |         cor=0
206 | 
207 |     # record
208 |     temp=dict()
209 |     temp['question']=sample['question']
210 |     temp['answer']=gold_ans
211 |     temp['graph']=sample["triples"]
212 |     temp['knowledge']=knowledge
213 |     temp['response']=answer
214 |     if FLAG:
215 |         temp['accuracy']=1
216 |     else:
217 |         temp['accuracy']=0
218 |     temp['f1']=cor/len(gold)
219 |     if FLAG1:
220 |         temp['EM']=1
221 |     else:
222 |         temp['EM']=0 
223 |     data.append(temp)
224 |     print('Current Accuracy: {}'.format(acc/index))
225 |     print('Current F1: {}'.format(f1/index))
226 |     print('Current EM: {}'.format(EM/index))
227 |     sys.stdout.flush()
228 | 
229 | print('Accuracy: {}'.format(acc/len(test)))
230 | print('F1: {}'.format(f1/len(test)))
231 | print('EM: {}'.format(EM/len(test)))
232 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)
233 | 


--------------------------------------------------------------------------------
/inference/open/answer/answer_no.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=256
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # chatgpt, Mistral-7B-Instruct-v0.3
 28 | ANS='chatgpt'
 29 | 
 30 | # path for LLM
 31 | LLM_PATH='../../../../pretrain/'+ANS
 32 | # path for tokenizer
 33 | TOKENIZER_PATH='../../../../pretrain/'+ANS
 34 | 
 35 | if ANS!='chatgpt':
 36 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 37 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 38 | 
 39 | # set client
 40 | client=OpenAI(api_key='YOUR KEY')
 41 | 
 42 | test=json.load(open('../retrieve/2hop/format/'+DATA+'.json','r',encoding='utf-8'))
 43 | 
 44 | ans_prompt='''Question: {ques}
 45 | Answer: '''
 46 | 
 47 | num_dict = {
 48 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 49 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 50 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 51 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 52 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 53 | }
 54 |    
 55 | # result
 56 | result='result/'+ANS+'/'+DATA+'/no_knowledge.json'
 57 | os.makedirs('result/'+ANS+'/'+DATA,exist_ok = True)
 58 | log_file='log/'+ANS+'/'+DATA+'/no_knowledge.log'
 59 | os.makedirs('result/'+ANS+'/'+DATA,exist_ok = True)
 60 | 
 61 | # redirect output to log
 62 | sys.stdout = open(log_file, 'w')
 63 | 
 64 | def getResponse(prompt,max_retries=10):
 65 |     # set retries
 66 |     retries=0
 67 |     while retries < max_retries:
 68 |         try:
 69 |             res = client.chat.completions.create(
 70 |                 model='gpt-3.5-turbo',
 71 |                 #model='gpt-4',
 72 |                 messages=[
 73 |                     {'role': 'user', 'content': prompt}
 74 |                 ],
 75 |                 temperature=0,
 76 |             )
 77 |             return res.choices[0].message.content
 78 |         except Exception as e:
 79 |             print(f"An error occurred: {e}")
 80 |             print("Retrying in 1 minutes...")
 81 |             retries += 1
 82 |             time.sleep(60)
 83 |     return ''
 84 | 
 85 | def LLMResponse(prompt,llm,tokenizer,cuda):
 86 |     inputs = tokenizer(prompt,return_tensors="pt")
 87 |     generation_output = llm.generate(
 88 |             input_ids=inputs["input_ids"].to(cuda),
 89 |             attention_mask=inputs['attention_mask'].to(cuda),
 90 |             eos_token_id=tokenizer.eos_token_id,
 91 |             pad_token_id=tokenizer.eos_token_id,
 92 |             generation_config=generation_config
 93 |         )
 94 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
 95 |     response = output.split(prompt)[-1].strip()
 96 |     return response
 97 | 
 98 | index=0
 99 | f1=0
100 | acc=0
101 | EM=0
102 | data=[]
103 | for sample in tqdm(test):
104 |     index+=1
105 |     
106 |     # response
107 |     if ANS!='chatgpt':
108 |         answer=LLMResponse(ans_prompt.format(ques=sample["question"]),llm,tokenizer,'cuda:0')
109 |     else:
110 |         answer=getResponse(ans_prompt.format(ques=sample["question"]))
111 |     print(ans_prompt.format(ques=sample["question"]))
112 |     print(answer)
113 |     
114 |     # gold answer extraction
115 |     gold=sample["answer"]
116 |     
117 |     # gold number
118 |     gold_num=[]
119 |     for i in gold:
120 |         if i.isdigit() and num_dict.get(i):
121 |             gold_num.append(num_dict[i])
122 | 
123 |     # result
124 |     cor=0
125 |     FLAG=False
126 |     FLAG1=True
127 |     FLAG2=False
128 |     # judge use gold_num or gold
129 |     for i in gold_num:
130 |         if i.lower() in answer.lower():
131 |             FLAG2=True
132 |             break
133 |     if FLAG2:
134 |         gold_ans=gold_num
135 |     else:
136 |         gold_ans=gold
137 |     if len(answer)!=0:
138 |         for i in gold_ans:
139 |             if i.lower() in answer.lower():
140 |                 FLAG=True
141 |                 cor+=1
142 |             if i.lower() not in answer.lower():
143 |                 FLAG1=False      
144 |         if FLAG:
145 |             acc+=1
146 |         f1+=cor/len(gold_ans)
147 |         if FLAG1:
148 |             EM+=1    
149 |     else:
150 |         FLAG=False
151 |         FLAG1=False
152 |         cor=0
153 | 
154 |     # record
155 |     temp=dict()
156 |     temp['question']=sample['question']
157 |     temp['answer']=gold_ans
158 |     temp['response']=answer
159 |     if FLAG:
160 |         temp['accuracy']=1
161 |     else:
162 |         temp['accuracy']=0
163 |     temp['f1']=cor/len(gold)
164 |     if FLAG1:
165 |         temp['EM']=1
166 |     else:
167 |         temp['EM']=0
168 |     data.append(temp)
169 |     print('Current Accuracy: {}'.format(acc/index))
170 |     print('Current F1: {}'.format(f1/index))
171 |     print('Current EM: {}'.format(EM/index))
172 |     sys.stdout.flush()
173 | 
174 | print('Accuracy: {}'.format(acc/len(test)))
175 | print('F1: {}'.format(f1/len(test)))
176 | print('EM: {}'.format(EM/len(test)))
177 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)
178 | 


--------------------------------------------------------------------------------
/inference/open/linearize.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Modifications Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
  3 | # All rights reserved.
  4 | #
  5 | # This source code is licensed under the license found in the
  6 | # LICENSE file in the root directory of this source tree.
  7 | 
  8 | import os
  9 | import csv
 10 | from tqdm import tqdm
 11 | from collections import defaultdict
 12 | from query_interface import query_ent_name
 13 | 
 14 | class Relation:
 15 |     def __init__(self, line):
 16 |         if line is None:
 17 |             self.subj = self.rel = self.obj = None
 18 |             return
 19 |         e1, rel, e2 = line.strip().split("\t")
 20 |         self.subj = e1
 21 |         self.rel = rel
 22 |         self.obj = e2
 23 |     
 24 |     def __hash__(self):
 25 |         return hash((self.subj, self.rel, self.obj))
 26 |         
 27 |     def _filter_relation(self):
 28 |         relation = self.rel
 29 |         if relation == "type.object.name":
 30 |             return True
 31 |         return False
 32 | 
 33 |     def should_ignore(self, id2name_dict):
 34 |         if self._filter_relation():
 35 |             return True
 36 |         return False
 37 |     
 38 |     def __repr__(self):
 39 |         return f"Subj: {self.subj}; Rel: {self.rel}; Obj: {self.obj}"
 40 | 
 41 | '''
 42 | # query freebase, too slow
 43 | def convert_relation_to_text(relation, entity_names):
 44 |     if isinstance(relation, Relation):
 45 |         subj, rel, obj = relation.subj, relation.rel, relation.obj
 46 |     else:
 47 |         subj, rel, obj = relation
 48 | 
 49 |     # subject
 50 |     # check whether it is literal
 51 |     # literal
 52 |     if subj[:2] not in ['m.','n.','g.']:
 53 |         subj_surface = subj.replace('-08:00','')
 54 |         subj_str = subj.replace('-08:00','')
 55 |     # entity
 56 |     else:
 57 |         if subj in entity_names:
 58 |             subj_surface = entity_names[subj]
 59 |             subj_str = entity_names[subj]
 60 |         else:
 61 |             subj_surface = query_ent_name(subj)
 62 |             if not subj_surface:
 63 |                 subj_surface = subj
 64 |                 subj_str = ""
 65 |             else:
 66 |                 subj_str = subj_surface
 67 |         
 68 |     # object
 69 |     # check whether it is literal
 70 |     # literal
 71 |     if obj[:2] not in ['m.','n.','g.']:
 72 |         obj_surface = obj.replace('-08:00','')
 73 |         obj_str = obj.replace('-08:00','')
 74 |     # entity
 75 |     else:
 76 |         if obj in entity_names:
 77 |             obj_surface = entity_names[obj]
 78 |             obj_str = entity_names[obj]
 79 |         else:
 80 |             obj_surface = query_ent_name(obj)
 81 |             if not obj_surface:
 82 |                 obj_surface = obj
 83 |                 obj_str = ""
 84 |             else:
 85 |                 obj_str = obj_surface
 86 |             
 87 |     # relation
 88 |     # e.g. film.film.other_crew
 89 |     # replace '.' and '_' with ' '
 90 |     rel_surface = rel.replace('.', ' ')
 91 |     rel_surface = rel_surface.replace('_', ' ')
 92 |     
 93 |     triple_form = '('+', '.join([subj_surface,rel,obj_surface])+')'
 94 |     text_form = ""
 95 |     if len(subj_str)!=0:
 96 |         text_form = text_form+subj_str+" "
 97 |     if len(obj_str)!=0:
 98 |         text_form = text_form+rel_surface+" "+obj_str+' .'
 99 |     else:
100 |         text_form=text_form+rel_surface+' .'
101 |     return triple_form, text_form
102 | '''
103 |  
104 | def convert_relation_to_text(relation, entity_names):
105 |     if isinstance(relation, Relation):
106 |         subj, rel, obj = relation.subj, relation.rel, relation.obj
107 |     else:
108 |         subj, rel, obj = relation
109 | 
110 |     # subject
111 |     if subj in entity_names:
112 |         subj_surface = entity_names[subj]
113 |         subj_str = entity_names[subj]
114 |     else:
115 |         subj_surface = subj
116 |         subj_str = ""
117 |         
118 |     # object
119 |     if obj in entity_names:
120 |         obj_surface = entity_names[obj]
121 |         obj_str = entity_names[obj]
122 |     else:
123 |         obj_surface = obj
124 |         obj_str = ""
125 |             
126 |     # relation
127 |     # e.g. film.film.other_crew
128 |     # replace '.' and '_' with ' '
129 |     rel_surface = rel.replace('.', ' ')
130 |     rel_surface = rel_surface.replace('_', ' ')
131 |     
132 |     triple_form = '('+', '.join([subj_surface,rel,obj_surface])+')'
133 |     text_form = ""
134 |     if len(subj_str)!=0:
135 |         text_form = text_form+subj_str+" "
136 |     if len(obj_str)!=0:
137 |         text_form = text_form+rel_surface+" "+obj_str+' .'
138 |     else:
139 |         text_form=text_form+rel_surface+' .'
140 |     return triple_form, text_form
141 | 
142 | # replace "{name} v2" to "{name}"
143 | def get_raw_name(name_wversion):
144 |     dict_name = name_wversion.split(" ")
145 |     if dict_name[-1].startswith("v") and dict_name[-1][1:].isnumeric():
146 |         dict_name = " ".join(dict_name[:-1])
147 |     else:
148 |         dict_name = " ".join(dict_name)
149 |     return dict_name
150 | 
151 | 
152 | def load_nameid_dict(file_dir, lower):
153 |     print("Loading name2id and id2name dict...")
154 |     name2id_dict = defaultdict(list)
155 |     id2name_dict = {}
156 |     for file in tqdm(os.listdir(file_dir)):
157 |         with open(os.path.join(file_dir, file), 'r') as rf:
158 |             data_input = csv.reader(rf, delimiter="\t")
159 |             for row in data_input:
160 |                 if lower:
161 |                     procesed_name = row[2].lower()
162 |                 else:
163 |                     procesed_name = row[2]
164 |                 name2id_dict[procesed_name].append(row[0])
165 |                 id2name_dict[row[0]] = procesed_name
166 |     return name2id_dict, id2name_dict
167 | 


--------------------------------------------------------------------------------
/inference/open/retrieve/2hop/2hop.py:
--------------------------------------------------------------------------------
  1 | from query_interface import query_ent_name,query_1hop_relation,query_2hop_relation,get_1hop_chain,get_2hop_chain,get_2hop_triples
  2 | import json
  3 | import csv
  4 | import re
  5 | import os
  6 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
  7 | from tqdm import tqdm
  8 | from sim_compute import Similarity
  9 | 
 10 | DATA='GraphQuestions'
 11 | K=100
 12 | 
 13 | def has_digit(input_string):
 14 |     for char in input_string:
 15 |         if char.isdigit():
 16 |             return True
 17 |     return False
 18 |     
 19 | def replace_entity(string, entity, replacement):
 20 |     pattern = re.compile(re.escape(entity), re.IGNORECASE)
 21 |     new_string = re.sub(pattern, replacement, string)
 22 |     return new_string
 23 | 
 24 | # construct name dict
 25 | id2name = {}
 26 | for file_name in tqdm(os.listdir('../../Freebase/id2name_parts')):
 27 |     with open(os.path.join('../../Freebase/id2name_parts', file_name), 'r') as rf:
 28 |         data_input = csv.reader(rf, delimiter="\t")
 29 |         for row in data_input:
 30 |             id2name[row[0]] = row[2]
 31 |             
 32 | def convert_names(triples):
 33 |     triplelist=[]
 34 |     for t in triples:
 35 |         # literal
 36 |         if t[0][0:2] not in ['m.','n.','g.']:
 37 |             head=t[0].replace('-08:00','') 
 38 |         # entity
 39 |         else:
 40 |             if id2name.get(t[0]):
 41 |                 head=id2name[t[0]].replace('-08:00','')
 42 |             else:
 43 |                 temp=query_ent_name(t[0])
 44 |                 if temp:
 45 |                     head=temp.replace('-08:00','')
 46 |                 else:
 47 |                     head=t[0]
 48 |         # literal
 49 |         if t[2][0:2] not in ['m.','n.','g.']:
 50 |             tail=t[2].replace('-08:00','') 
 51 |         # entity
 52 |         else:
 53 |             if id2name.get(t[2]):
 54 |                 tail=id2name[t[2]].replace('-08:00','')
 55 |             else:
 56 |                 temp=query_ent_name(t[2])
 57 |                 if temp:
 58 |                     tail=temp.replace('-08:00','')
 59 |                 else:
 60 |                     tail=t[2]     
 61 |         triplelist.append('('+head+', '+t[1]+', '+tail+')')
 62 |     return triplelist
 63 | 
 64 | # similarity model
 65 | sim_model=Similarity()
 66 | 
 67 | retrieve_subgraph=[]
 68 | # retrieve metrics
 69 | accuracy=0
 70 | recall=0
 71 | # query neighbour 
 72 | dataset=json.load(open('../../../../subgraph/'+DATA+'/data/test.json','r',encoding='utf-8'))
 73 | for index,sample in tqdm(enumerate(dataset)):
 74 |     if DATA in ['grailqa','GraphQuestions']:
 75 |         # question, entity, head, answer
 76 |         question=sample["question"]
 77 |         entity=[]
 78 |         head=[]
 79 |         for n in sample["graph_query"]["nodes"]:
 80 |             if n["node_type"]=="entity":
 81 |                 entity.append(n["friendly_name"])
 82 |                 head.append(n["id"])
 83 |                 id2name[n["id"]]=n["friendly_name"]
 84 |         # replace entity surface to [MASK]
 85 |         question_mask=question
 86 |         for i in entity:
 87 |             question_mask=replace_entity(question_mask,i,'[MASK]')
 88 |         answer=[]
 89 |         for i in sample["answer"]:   
 90 |             if i.get("entity_name"):
 91 |                 answer.append(i["entity_name"])
 92 |                 id2name[i["answer_argument"]]=i["entity_name"]
 93 |             else:
 94 |                 answer.append(i["answer_argument"]) 
 95 |     if DATA in ['WebQSP']:
 96 |         # question, entity, head, answer
 97 |         question=sample["question"]
 98 |         entity=[sample["headname"]]
 99 |         question_mask=replace_entity(question,entity[0],'[MASK]')
100 |         head=[sample["headmid"]]
101 |         answer=sample["answername"].split('|')
102 |         id2name[head[0]]=entity[0]
103 |         for ansmid,ans in zip(sample["answermid"].split('|'),sample["answername"].split('|')):
104 |             id2name[ansmid]=ans
105 |     # if no head entity, no retrieve result
106 |     if len(head)==0:
107 |         graphdict=dict()
108 |         graphdict['question']=question
109 |         graphdict['triples']=[]
110 |         graphdict['answers']=answer
111 |         retrieve_subgraph.append(graphdict)
112 |         continue
113 |     # query 1 hop and 2 hop relation
114 |     reset=set()
115 |     en2rel=dict()
116 |     for ent in head:
117 |         rel_1hop=query_1hop_relation(ent)
118 |         chain_2hop=query_2hop_relation(ent)
119 |         rel_2hop=set()
120 |         for i in chain_2hop:
121 |             rel_2hop.add(i[0]+' '+i[1])
122 |         en2rel[ent]=rel_1hop|rel_2hop
123 |         reset=reset|rel_1hop
124 |         reset=reset|rel_2hop
125 |     # if no relation, no retrieve result
126 |     if len(reset)==0:
127 |         graphdict=dict()
128 |         graphdict['question']=question
129 |         graphdict['triples']=[]
130 |         graphdict['answers']=answer
131 |         retrieve_subgraph.append(graphdict)
132 |         continue
133 |     # select most similar relation chain
134 |     sort_re=sim_model.compute(question,list(reset))
135 |     # sample from KG
136 |     triples=[]
137 |     for rechain in sort_re:
138 |         temp_h=[]
139 |         for ent,rel in en2rel.items():
140 |             if rechain in rel:
141 |                 temp_h.append(ent)
142 |         if ' ' in rechain:
143 |             for h in temp_h:
144 |                 triples.extend(convert_names(get_2hop_chain(h,rechain.split(' '))))
145 |         else:
146 |             for h in temp_h:
147 |                 triples.extend(convert_names(get_1hop_chain(h,rechain)))
148 |         if len(set(triples))>=K:
149 |             break
150 |     # avoid redundant triples
151 |     triples1=[]
152 |     for i in triples[:K]:
153 |         if i not in triples1:
154 |             triples1.append(i)
155 |     contents=' '.join(triples1[:50])
156 |     # calculate retrieve metrics
157 |     FLAG=False
158 |     temp_r=0
159 |     for a in answer:
160 |         if a.lower() in contents.lower():
161 |             FLAG=True
162 |             temp_r+=1
163 |     if FLAG:
164 |         accuracy+=1
165 |         recall+=temp_r/len(answer)
166 |     graphdict=dict()
167 |     graphdict['question']=question
168 |     graphdict['triples']=triples1
169 |     graphdict['answers']=answer
170 |     retrieve_subgraph.append(graphdict)
171 |     print('*'*30,'Current Retrieve Results','*'*30)
172 |     print('Accuracy:',accuracy/(index+1))
173 |     print('Recall:',recall/(index+1))
174 | 
175 | # save retrieve results
176 | os.makedirs('results', exist_ok=True)
177 | json.dump(retrieve_subgraph,open('results/'+DATA+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
178 | print('*'*30,'Retrieve Results','*'*30)
179 | print('Accuracy:',accuracy/len(dataset))
180 | print('Recall:',recall/len(dataset))


--------------------------------------------------------------------------------
/inference/open/retrieve/2hop/format.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import re
 4 | 
 5 | # grailqa, GraphQuestions, WebQSP
 6 | DATA='grailqa'
 7 | NUM=30
 8 | 
 9 | def has_digit(input_string):
10 |     for char in input_string:
11 |         if char.isdigit():
12 |             return True
13 |     return False
14 |     
15 | with open('results/'+DATA+'.json', 'r') as rf:
16 |     documents = json.load(rf)
17 | 
18 | accuracy=0
19 | recall=0
20 | result=[]
21 | for doc in documents:
22 |     question=doc["question"]
23 |     answer=doc["answers"]
24 |     # avoid redundant triples
25 |     triplelist=doc["triples"]
26 |     triplelist1=[]
27 |     for i in triplelist:
28 |         if len(i)>100:
29 |             continue
30 |         if len(i.split(', '))<3:
31 |             continue
32 |         rel=i.split(', ')[1]
33 |         # skip relations
34 |         if rel.startswith('common') or rel.startswith('type.object') or rel.startswith('freebase') or rel.endswith('type') or rel.endswith('label'):
35 |             continue
36 |         #print(i.split(', '))
37 |         # skip triples with too long object
38 |         if i not in triplelist1:
39 |             triplelist1.append(i)
40 |     # construct triple string
41 |     triples=' '.join(triplelist1[:NUM])
42 |     # convert mid to cvt
43 |     candidate=re.findall(r'm\.[\da-zA-Z_]+', triples)
44 |     candidate.extend(re.findall(r'g\.[\da-zA-Z_]+', triples))
45 |     candidate.extend(re.findall(r'n\.[\da-zA-Z_]+', triples))
46 |     cvtmid=[]
47 |     for i in candidate:
48 |         if has_digit(i):
49 |             if i not in cvtmid:
50 |                 cvtmid.append(i)
51 |     cvt_num=1
52 |     for i in cvtmid:
53 |         triples=triples.replace(i,'CVT'+str(cvt_num))
54 |         cvt_num+=1
55 |     samdict=dict()
56 |     samdict["question"]=question
57 |     samdict["answer"]=answer
58 |     samdict["triples"]=triples
59 |     result.append(samdict)
60 |     FLAG=False
61 |     r=0
62 |     for i in answer:
63 |         if i in triples:
64 |             FLAG=True
65 |             r+=1
66 |     if FLAG:
67 |         accuracy+=1
68 |         recall+=r/len(answer)
69 | 
70 | json.dump(result,open('format/'+DATA+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
71 | print('Accuracy:',accuracy/len(result))
72 | print('Recall:',recall/len(result))
73 | 


--------------------------------------------------------------------------------
/inference/open/retrieve/2hop/sim_compute.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 3 | from sentence_transformers import SentenceTransformer, util
 4 | import torch
 5 | import torch.nn.functional as F
 6 | 
 7 | class Similarity:
 8 |     def __init__(self):
 9 |         # Load model
10 |         self.model = SentenceTransformer('../../../../../pretrain/all-MiniLM-L6-v2',device='cuda:0')
11 | 
12 |     def compute(self, query, relations):
13 |         embedding1 = self.model.encode(query, show_progress_bar=False,device='cuda:0',convert_to_tensor=True)
14 |         embedding2 = self.model.encode(relations,batch_size=1024,show_progress_bar=False, device='cuda:0',convert_to_tensor=True)
15 |         cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)[0]
16 |         sim_relations = list(zip(cosine_scores.tolist(), relations))
17 |         sim_relations = sorted(sim_relations, key=lambda x: x[0], reverse=True)
18 |         sorted_relations = [relation for _, relation in sim_relations]
19 |         return sorted_relations


--------------------------------------------------------------------------------
/inference/open/retrieve/bm25/build_index_sparse.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Build the index for the general knowledge base using pyserini.
 4 | 
 5 | Freebase="../../Freebase/processed"
 6 | 
 7 | python -m pyserini.index.lucene \
 8 |   --collection JsonCollection \
 9 |   --input ../../Freebase/processed/document \
10 |   --index index \
11 |   --generator DefaultLuceneDocumentGenerator \
12 |   --threads 10 \
13 |   --storePositions --storeDocvectors --storeRaw
14 | 


--------------------------------------------------------------------------------
/inference/open/retrieve/bm25/format.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import re
 4 | 
 5 | DATA='grailqa'
 6 | NUM=20
 7 | 
 8 | def has_digit(input_string):
 9 |     for char in input_string:
10 |         if char.isdigit():
11 |             return True
12 |     return False
13 | 
14 | with open('../../../../subgraph/'+DATA+'/data/test.json', 'r') as rf:
15 |     data = json.load(rf)
16 |     
17 | with open('results/'+DATA+'.json', 'r') as rf:
18 |     documents = json.load(rf)
19 | 
20 | result=[]
21 | for sample,doc in zip(data,documents):
22 |     if DATA in ['WebQSP']:
23 |         question=sample["question"]
24 |         answer=sample["answername"].split('|')
25 |     if DATA in ['GraphQuestions','grailqa']:
26 |         question=sample["question"]
27 |         answer=[]
28 |         for i in sample["answer"]:   
29 |             if i.get("entity_name"):
30 |                 answer.append(i["entity_name"])
31 |             else:
32 |                 answer.append(i["answer_argument"])
33 |     doclist=doc["documents"]
34 |     triple_str=''
35 |     for d in doclist:
36 |         triple_str=triple_str+d["triples"]+' '
37 |     triple_str=triple_str[:-1]
38 |     # avoid redundant triples
39 |     triplelist=triple_str.split(') (')
40 |     triplelist[0]=triplelist[0][1:]
41 |     triplelist[-1]=triplelist[-1][:-1]
42 |     triplelist1=[]
43 |     for i in triplelist:
44 |         if len(i)>100:
45 |             continue
46 |         if len(i.split(', '))<3:
47 |             continue
48 |         rel=i.split(', ')[1]
49 |         # skip relations
50 |         if rel.startswith('common') or rel.startswith('type.object') or rel.startswith('freebase') or rel.endswith('type') or rel.endswith('label'):
51 |             continue
52 |         #print(i.split(', '))
53 |         # skip triples with too long object
54 |         if i not in triplelist1:
55 |             triplelist1.append(i)
56 |     # construct triple string
57 |     triples=''
58 |     for i in triplelist1[:NUM]:
59 |         triples=triples+'('+i+') '
60 |     triples=triples[:-1]
61 |     # convert mid to cvt
62 |     candidate=re.findall(r'm\.[\da-zA-Z_]+', triples)
63 |     candidate.extend(re.findall(r'g\.[\da-zA-Z_]+', triples))
64 |     candidate.extend(re.findall(r'n\.[\da-zA-Z_]+', triples))
65 |     cvtmid=[]
66 |     cvt_num=1
67 |     for i in candidate:
68 |         if has_digit(i):
69 |             if i not in cvtmid:
70 |                 cvtmid.append(i)
71 |     for i in cvtmid:
72 |         triples=triples.replace(i,'CVT'+str(cvt_num))
73 |         cvt_num+=1
74 |     samdict=dict()
75 |     samdict["question"]=question
76 |     samdict["answer"]=answer
77 |     samdict["triples"]=triples
78 |     result.append(samdict)
79 | 
80 | json.dump(result,open('format/'+DATA+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)


--------------------------------------------------------------------------------
/inference/open/retrieve/bm25/run_search_sparse.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | index_name="index_bm25"
 4 | dataset='grailqa'
 5 | output="results/${dataset}.json"
 6 | 
 7 | python search.py \
 8 |     --dataset ${dataset} \
 9 |     --query_data_path ../../../../subgraph/${dataset}/data/test.json \
10 |     --index_name ${index_name} \
11 |     --output ${output} \
12 |     --top_k 100 \
13 |     --k1 0.4 \
14 |     --b 0.4 \
15 |     --num_process 10 \
16 |     --eval \
17 |     --save
18 | 


--------------------------------------------------------------------------------
/inference/open/retrieve/bm25/search.py:
--------------------------------------------------------------------------------
  1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
  2 | # SPDX-License-Identifier: CC-BY-NC-4.0
  3 | 
  4 | from pyserini.search.lucene import LuceneSearcher
  5 | import json
  6 | from tqdm import tqdm
  7 | import os
  8 | import re
  9 | import argparse
 10 | import pickle
 11 | import multiprocessing.pool
 12 | from functools import partial
 13 | from collections import defaultdict
 14 | from pyserini.index import IndexReader
 15 |         
 16 | def has_digit(input_string):
 17 |     for char in input_string:
 18 |         if char.isdigit():
 19 |             return True
 20 |     return False
 21 | 
 22 | class Bm25Searcher:
 23 |     def __init__(self, index_dir, args):
 24 |         self.index_dir = index_dir
 25 |         self.args = args
 26 |         self.searcher = LuceneSearcher(index_dir)
 27 |         self.searcher.set_bm25(args.k1, args.b)
 28 |         self.index_reader=IndexReader(index_dir)
 29 |         if len(args.ignore_string) > 0:
 30 |             self.ignore_list = args.ignore_string.split(',')
 31 |             print(f'ignore list: {self.ignore_list}')
 32 |         else:
 33 |             self.ignore_list = []
 34 |         
 35 |         # load documents for post process
 36 |         '''
 37 |         self.mid2doc=dict()
 38 |         for path in tqdm(os.listdir(self.args.documents)):
 39 |             with open(self.args.documents+'/'+path,'r',encoding='utf-8') as f:
 40 |                 for line in f:
 41 |                     try:
 42 |                         data=json.loads(line)
 43 |                         if self.mid2doc.get(data["mid"]) is None:
 44 |                             self.mid2doc[data["mid"]]=[]
 45 |                         self.mid2doc[data["mid"]].append(data["id"])
 46 |                     except:
 47 |                         continue
 48 |         with open('mid2doc.pickle', 'wb') as f:
 49 |             pickle.dump(self.mid2doc, f)
 50 |         '''
 51 |         with open('mid2doc.pickle', 'rb') as f:
 52 |             self.mid2doc = pickle.load(f)   
 53 |  
 54 |     def perform_search(self, sample, top_k, ques_id):
 55 |         if self.args.dataset in ['WebQSP']:
 56 |             query=sample["question"]
 57 |             head=set()
 58 |             head.add(sample["headmid"])
 59 |         if self.args.dataset in ['GraphQuestions','grailqa']:
 60 |             query=sample["question"]
 61 |             head=set()
 62 |             for n in sample["graph_query"]["nodes"]:
 63 |                 if n["node_type"]=="entity":
 64 |                     head.add(n["id"])
 65 |         for string in self.ignore_list:
 66 |             query = query.replace(string, ' ')
 67 |         query = query.strip()
 68 |         
 69 |         # get relevant document for head entity
 70 |         docid=[]
 71 |         for i in head:
 72 |             if self.mid2doc.get(i):
 73 |                 docid.extend(self.mid2doc[i])
 74 |         # first search using relevant document
 75 |         id_score=[]
 76 |         for i in docid:
 77 |             score = self.index_reader.compute_query_document_score(i, query)
 78 |             id_score.append([score,i])
 79 |         id_score=sorted(id_score,key=lambda x: x[0], reverse=True)
 80 |         documents=[]
 81 |         for i in id_score[:top_k]:
 82 |             raw_data=self.searcher.doc(i[1])
 83 |             documents.append(json.loads(raw_data.raw()))
 84 | 
 85 |         # search
 86 |         if len(documents)<top_k:
 87 |             results = self.searcher.search(query, k=200)
 88 |             for result in results:
 89 |                 try:
 90 |                     doc_dict = json.loads(result.lucene_document.get('raw'))
 91 |                     if doc_dict not in documents:
 92 |                         documents.append(doc_dict)
 93 |                         if len(documents)>=top_k:
 94 |                             break
 95 |                 except:
 96 |                     continue
 97 |         context = dict()
 98 |         context['documents']=documents
 99 |         context['id']=ques_id
100 |         return context
101 | 
102 | def search_all(process_idx, num_process, searcher, args):
103 |     # load dataset
104 |     with open(args.query_data_path, 'r') as rf:
105 |         data = json.load(rf)
106 | 
107 |     output_data = []
108 |     for i, data_i in tqdm(enumerate(data)):
109 |         if i % num_process != process_idx:
110 |             continue
111 |         # search
112 |         output_i = searcher.perform_search(data_i, args.top_k,i)
113 |         output_data.append(output_i)
114 |     return output_data
115 |     
116 | def eval_top_k_one(documents, answer,top_k):
117 |     recall = 0
118 |     # merge into context
119 |     context=''
120 |     for doc in documents['documents'][:top_k]:
121 |         context+=doc['triples']
122 |     for ans in answer:
123 |         if ans.lower() in context.lower():
124 |             recall += 1
125 |     return recall / (len(answer) + 1e-8)    
126 | 
127 | def eval_top_k(output_data, answers,top_k_list=[1,2,3,4,5,6,7,8,9,10]):
128 |     print("*"*30,"Evaluate the Retrieval Result","*"*30)
129 |     hits_dict = defaultdict(int)
130 |     recall_dict = defaultdict(float)
131 |     top_k_list = [k for k in top_k_list if k <= len(output_data[0]['documents'])]
132 |     for documents,answer in tqdm(zip(output_data,answers)):
133 |         for k in top_k_list:
134 |             recall = eval_top_k_one(documents, answer,k)
135 |             if recall > 0:
136 |                 hits_dict[k] += 1
137 |             recall_dict[k] += recall
138 |     for k in top_k_list:
139 |         print("Top {}".format(k), 
140 |               "Hits: ", round(hits_dict[k] * 100 / len(output_data), 1), 
141 |               "Recall: ", round(recall_dict[k] * 100 / len(output_data), 1))
142 | 
143 | # argparse for root_dir, index_dir, query_data_path, output_dir
144 | parser = argparse.ArgumentParser(description='Search using pySerini')
145 | parser.add_argument("--dataset", type=str, default='WebQSP',
146 |                     help="KBQA dataset")
147 | parser.add_argument("--documents", type=str, default='../../Freebase/processed/document',
148 |                     help="documents directory")                    
149 | parser.add_argument("--index_name", type=str, default='Wikidata',
150 |                     help="directory to store the search index")
151 | parser.add_argument("--query_data_path", type=str, default='',
152 |                     help="directory to store the queries")
153 | parser.add_argument("--output", type=str, default='',
154 |                     help="directory to store the retrieved output")
155 | parser.add_argument("--num_process", type=int, default=10,
156 |                     help="number of processes to use for multi-threading")
157 | parser.add_argument("--top_k", type=int, default=150,
158 |                     help="number of passages to be retrieved for each query")
159 | parser.add_argument("--ignore_string", type=str, default="",
160 |                     help="string to ignore in the query, split by comma")
161 | parser.add_argument("--b", type=float, default=0.4,
162 |                     help="parameter of BM25")
163 | parser.add_argument("--k1", type=float, default=0.9,
164 |                     help="parameter of BM25")
165 | parser.add_argument("--save", action="store_true",
166 |                     help="whether to save the output")
167 | parser.add_argument("--eval", action="store_true",
168 |                     help="whether to evaluate the output")
169 | args = parser.parse_args()
170 | 
171 | 
172 | if __name__ == '__main__':
173 |     index_dir = args.index_name
174 |     searcher = Bm25Searcher(index_dir, args)
175 | 
176 |     num_process = args.num_process
177 |     pool = multiprocessing.pool.ThreadPool(processes=num_process)
178 |     sampleData = [x for x in range(num_process)]
179 |     search_all_part = partial(search_all, 
180 |                                 searcher = searcher,
181 |                                 num_process = num_process,
182 |                                 args = args)
183 |     results = pool.map(search_all_part, sampleData)
184 |     pool.close()
185 | 
186 |     output_data = []
187 |     for result in results:
188 |         output_data += result
189 | 
190 |     # sort the output data by question id
191 |     output_data = sorted(output_data, key=lambda item: item['id'])
192 |     if args.eval:
193 |         # load answer from original data
194 |         answers=[]
195 |         with open(args.query_data_path, 'r') as rf:
196 |             dataset = json.load(rf)
197 |         if args.dataset in ['WebQSP']:
198 |             for sample in dataset:  
199 |                 answers.append(sample["answername"].split('|'))
200 |         if args.dataset in ['GraphQuestions','grailqa']:
201 |             for sample in dataset:
202 |                 answer=[]  
203 |                 for i in sample["answer"]:   
204 |                     if i.get("entity_name"):
205 |                         answer.append(i["entity_name"])
206 |                     else:
207 |                         answer.append(i["answer_argument"]) 
208 |                 answers.append(answer)
209 |         # evaluate output                   
210 |         eval_top_k(output_data, answers, top_k_list=[1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100])
211 |     
212 |     # truncate documents into 10 documents
213 |     for i in output_data:
214 |         i['documents'] = i['documents'][:10]
215 | 
216 |     # save output data
217 |     # create output dir recursively if not exist
218 |     if args.save:
219 |         os.makedirs('results', exist_ok=True)
220 |         print("saving output data to {}".format(args.output))
221 |         with open(args.output, "w") as wf:
222 |             json.dump(output_data, wf, indent=2, ensure_ascii=False)
223 | 


--------------------------------------------------------------------------------
/inference/open/rewrite/infer_chain.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='chatgpt'
 29 | # retrieve method: bm25, 2hop
 30 | MODE='2hop'
 31 | 
 32 | # set client
 33 | client=OpenAI(api_key='YOUR KEY')
 34 | 
 35 | test=json.load(open('../retrieve/'+MODE+'/format/'+DATA+'.json','r',encoding='utf-8'))
 36 | 
 37 | kr_prompt_llm='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 38 | Triples: {triple}
 39 | Question: {ques}
 40 | '''
 41 | 
 42 | kr_prompt_gpt='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 43 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 44 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 45 | Reason 1: I need to know which routed drug has the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 46 | Knowledge 1: The routed drugs Oxybutynin Oral and Oxybutynin Chloride Oral have the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 47 | 
 48 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 49 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University)
 50 | Question: where did the founder of google go to college?
 51 | Reason 1: I need to know who the founders of Google are.
 52 | Knowledge 1: The founders of Google are Sergey Brin and Larry Page.
 53 | Reason 2: I need to know where Sergey Brin and Larry Page went to college.
 54 | Knowledge 2: Sergey Brin went to the University of Maryland, College Park for college. Larry Page went to the University of Michigan and Stanford University for college.
 55 | 
 56 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 57 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 58 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 59 | Reason 1: I need to know the musical genre of the albums confessions tour.
 60 | Knowledge 1: The album confessions tour is associated with the genre Rock music and Electronica.
 61 | Reason 2: I need to know the parent genre of Rock music and Electronica.
 62 | Knowledge 2: The parent genre of Rock music is Folk music. The parent genre of Electronica is House music.
 63 | 
 64 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 65 | Triples: {triple}
 66 | Question: {ques}
 67 | '''
 68 | 
 69 | num_dict = {
 70 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 71 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 72 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 73 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 74 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 75 | }
 76 |    
 77 | if LLM!='chatgpt':
 78 |     # path for LLM
 79 |     LLM_PATH='../../../../pretrain/'+LLM
 80 |     # path for tokenizer
 81 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 82 |     # path for lora
 83 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/CoT/'+LLM+'/best_model'
 84 |     # load tokenizer and llm
 85 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 86 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 87 |     # merge peft into base LLM
 88 |     if PEFT_PATH:
 89 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
 90 |         
 91 | # result
 92 | result='result/'+DATA+'/'+MODE+'/'+LLM+'/chain.json'
 93 | os.makedirs('result/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 94 | log_file='log/'+DATA+'/'+MODE+'/'+LLM+'/chain.log'
 95 | os.makedirs('log/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 96 | 
 97 | # redirect output to log
 98 | sys.stdout = open(log_file, 'w')
 99 | 
100 | def getResponse(prompt,max_retries=10):
101 |     # set retries
102 |     retries=0
103 |     while retries < max_retries:
104 |         try:
105 |             res = client.chat.completions.create(
106 |                 model='gpt-3.5-turbo',
107 |                 #model='gpt-4',
108 |                 messages=[
109 |                     {'role': 'user', 'content': prompt}
110 |                 ],
111 |                 temperature=0,
112 |             )
113 |             return res.choices[0].message.content
114 |         except Exception as e:
115 |             print(f"An error occurred: {e}")
116 |             print("Retrying in 1 minutes...")
117 |             retries += 1
118 |             time.sleep(60)
119 |     return ''
120 |             
121 | def LLMResponse(prompt,llm,tokenizer,cuda):
122 |     inputs = tokenizer(prompt,return_tensors="pt")
123 |     generation_output = llm.generate(
124 |             input_ids=inputs["input_ids"].to(cuda),
125 |             attention_mask=inputs['attention_mask'].to(cuda),
126 |             eos_token_id=tokenizer.eos_token_id,
127 |             pad_token_id=tokenizer.eos_token_id,
128 |             generation_config=generation_config
129 |         )
130 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
131 |     response = output.split(prompt)[-1].strip()
132 |     return response
133 | 
134 | data=[]
135 | for sample in tqdm(test):
136 | 
137 |     # knowledge rewriter
138 |     if len(sample["triples"])!=0:
139 |         if LLM!='chatgpt':
140 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]),llm,tokenizer,'cuda:0')
141 |             print(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]))
142 |             print(knowledge)
143 |         else:
144 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
145 |             print(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
146 |             print(knowledge)            
147 |     else:
148 |         knowledge=''
149 | 
150 |     # record
151 |     temp=dict()
152 |     temp['question']=sample['question']
153 |     temp['answer']=sample["answer"]
154 |     temp['graph']=sample["triples"]
155 |     temp['knowledge']=knowledge
156 |     data.append(temp)
157 | 
158 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)


--------------------------------------------------------------------------------
/inference/open/rewrite/infer_pa.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | # retrieve method: bm25, 2hop
 30 | MODE='2hop'
 31 | 
 32 | # set client
 33 | client=OpenAI(api_key='YOUR KEY')
 34 | 
 35 | test=json.load(open('../retrieve/'+MODE+'/format/'+DATA+'.json','r',encoding='utf-8'))
 36 | 
 37 | kr_prompt_llm='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 38 | Triples: {triple}
 39 | Question: {ques}
 40 | '''
 41 | 
 42 | kr_prompt_gpt='''Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 43 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 44 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 45 | Reason 1: I need to know which routed drug has the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 46 | Knowledge 1: The routed drugs Oxybutynin Oral and Oxybutynin Chloride Oral have the marketed formulation of oxybutynin chloride 5 extended release film coated tablet.
 47 | 
 48 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 49 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University)
 50 | Question: where did the founder of google go to college?
 51 | Reason 1: I need to know who the founders of Google are.
 52 | Knowledge 1: The founders of Google are Sergey Brin and Larry Page.
 53 | Reason 2: I need to know where Sergey Brin and Larry Page went to college.
 54 | Knowledge 2: Sergey Brin went to the University of Maryland, College Park for college. Larry Page went to the University of Michigan and Stanford University for college.
 55 | 
 56 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 57 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 58 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 59 | Reason 1: I need to know the musical genre of the albums confessions tour.
 60 | Knowledge 1: The album confessions tour is associated with the genre Rock music and Electronica.
 61 | Reason 2: I need to know the parent genre of Rock music and Electronica.
 62 | Knowledge 2: The parent genre of Rock music is Folk music. The parent genre of Electronica is House music.
 63 | 
 64 | Your task is to summarize the relevant information that is helpful to answer the question from the following triples. Please think step by step and iteratively generate the reasoning chain and the corresponding knowledge.
 65 | Triples: {triple}
 66 | Question: {ques}
 67 | '''
 68 | 
 69 | num_dict = {
 70 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 71 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 72 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 73 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 74 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 75 | }
 76 | 
 77 | if LLM!='chatgpt':
 78 |     # path for LLM
 79 |     LLM_PATH='../../../instruction-tuning/output-'+DATA+'/sft/CoT/'+LLM
 80 |     # path for tokenizer
 81 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 82 |     # path for lora
 83 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/PA-chatgpt/CoT/'+LLM+'/best_model'
 84 |     print(PEFT_PATH)
 85 |     # load tokenizer and llm
 86 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 87 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 88 |     # merge peft into base LLM
 89 |     if PEFT_PATH:
 90 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
 91 |     
 92 | # result
 93 | result='result/'+DATA+'/'+MODE+'/'+LLM+'/pa-chatgpt.json'
 94 | os.makedirs('result/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 95 | log_file='log/'+DATA+'/'+MODE+'/'+LLM+'/pa-chatgpt.log'
 96 | os.makedirs('log/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 97 | 
 98 | # redirect output to log
 99 | sys.stdout = open(log_file, 'w')
100 | 
101 | def getResponse(prompt,max_retries=10):
102 |     # set retries
103 |     retries=0
104 |     while retries < max_retries:
105 |         try:
106 |             res = client.chat.completions.create(
107 |                 model='gpt-3.5-turbo',
108 |                 #model='gpt-4',
109 |                 messages=[
110 |                     {'role': 'user', 'content': prompt}
111 |                 ],
112 |                 temperature=0,
113 |             )
114 |             return res.choices[0].message.content
115 |         except Exception as e:
116 |             print(f"An error occurred: {e}")
117 |             print("Retrying in 1 minutes...")
118 |             retries += 1
119 |             time.sleep(60)
120 |     return ''
121 |             
122 | def LLMResponse(prompt,llm,tokenizer,cuda):
123 |     inputs = tokenizer(prompt,return_tensors="pt")
124 |     generation_output = llm.generate(
125 |             input_ids=inputs["input_ids"].to(cuda),
126 |             attention_mask=inputs['attention_mask'].to(cuda),
127 |             eos_token_id=tokenizer.eos_token_id,
128 |             pad_token_id=tokenizer.eos_token_id,
129 |             generation_config=generation_config
130 |         )
131 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
132 |     response = output.split(prompt)[-1].strip()
133 |     return response
134 | 
135 | data=[]
136 | for sample in tqdm(test):
137 | 
138 |     # knowledge rewriter
139 |     if len(sample["triples"])!=0:
140 |         if LLM!='chatgpt':
141 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]),llm,tokenizer,'cuda:0')
142 |             print(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]))
143 |             print(knowledge)
144 |         else:
145 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
146 |             print(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
147 |             print(knowledge)            
148 |     else:
149 |         knowledge=''
150 | 
151 |     # record
152 |     temp=dict()
153 |     temp['question']=sample['question']
154 |     temp['answer']=sample["answer"]
155 |     temp['graph']=sample["triples"]
156 |     temp['knowledge']=knowledge
157 |     data.append(temp)
158 | 
159 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)
160 | 


--------------------------------------------------------------------------------
/inference/open/rewrite/infer_summary.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: GraphQuestions, grailqa, WebQSP
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | # retrieve method: bm25, 2hop
 30 | MODE='2hop'
 31 | 
 32 | # set client
 33 | client=OpenAI(api_key='YOUR KEY')
 34 | 
 35 | test=json.load(open('../retrieve/'+MODE+'/format/'+DATA+'.json','r',encoding='utf-8'))
 36 | 
 37 | kr_prompt_llm='''Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 38 | Triples: {triple}
 39 | Question: {ques}
 40 | Knowledge: '''
 41 | 
 42 | kr_prompt_gpt='''Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 43 | Triples: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin)
 44 | Question: oxybutynin chloride 5 extended release film coated tablet is the ingredients of what routed drug?
 45 | Knowledge: The Oxybutynin chloride 5 extended release film coated tablet is a marketed formulation of the routed drugs Oxybutynin Oral and Oxybutynin Chloride Oral.
 46 | 
 47 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 48 | Triples: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University)
 49 | Question: where did the founder of google go to college?
 50 | Knowledge: The founders of Google are Sergey Brin and Larry Page. Sergey Brin attended the University of Maryland, College Park for his education. Larry Page attended the University of Michigan and Stanford University for his education.
 51 | 
 52 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 53 | Triples: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.)
 54 | Question: the albums confessions tour is part of what parent genre of a musical genre?
 55 | Knowledge: The album confessions tour is associated with the genre Rock music and Electronica. The parent genre of Rock music is Folk music. The parent genre of Electronica is House music.
 56 | 
 57 | Your task is to summarize the relevant knowledge that is helpful to answer the question from the following triples.
 58 | Triples: {triple}
 59 | Question: {ques}
 60 | Knowledge: '''
 61 | 
 62 | 
 63 | num_dict = {
 64 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 65 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 66 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 67 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 68 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 69 | }
 70 |    
 71 | if LLM!='chatgpt':
 72 |     # path for LLM
 73 |     LLM_PATH='../../../../pretrain/'+LLM
 74 |     # path for tokenizer
 75 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 76 |     # path for lora
 77 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/summary/'+LLM+'/best_model'
 78 |     # load tokenizer and llm
 79 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 80 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 81 |     # merge peft into base LLM
 82 |     if PEFT_PATH:
 83 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')    
 84 |     
 85 | # result
 86 | result='result/'+DATA+'/'+MODE+'/'+LLM+'/summary.json'
 87 | os.makedirs('result/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 88 | log_file='log/'+DATA+'/'+MODE+'/'+LLM+'/summary.log'
 89 | os.makedirs('log/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 90 | 
 91 | # redirect output to log
 92 | sys.stdout = open(log_file, 'w')
 93 | 
 94 | def getResponse(prompt,max_retries=10):
 95 |     # set retries
 96 |     retries=0
 97 |     while retries < max_retries:
 98 |         try:
 99 |             res = client.chat.completions.create(
100 |                 model='gpt-3.5-turbo',
101 |                 #model='gpt-4',
102 |                 messages=[
103 |                     {'role': 'user', 'content': prompt}
104 |                 ],
105 |                 temperature=0,
106 |             )
107 |             return res.choices[0].message.content
108 |         except Exception as e:
109 |             print(f"An error occurred: {e}")
110 |             print("Retrying in 1 minutes...")
111 |             retries += 1
112 |             time.sleep(60)
113 |     return ''
114 |             
115 | def LLMResponse(prompt,llm,tokenizer,cuda):
116 |     inputs = tokenizer(prompt,return_tensors="pt")
117 |     generation_output = llm.generate(
118 |             input_ids=inputs["input_ids"].to(cuda),
119 |             attention_mask=inputs['attention_mask'].to(cuda),
120 |             eos_token_id=tokenizer.eos_token_id,
121 |             pad_token_id=tokenizer.eos_token_id,
122 |             generation_config=generation_config
123 |         )
124 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
125 |     response = output.split(prompt)[-1].strip()
126 |     return response
127 | 
128 | data=[]
129 | for sample in tqdm(test):
130 |     
131 |     # knowledge rewriter
132 |     if len(sample["triples"])!=0:
133 |         if LLM!='chatgpt':
134 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]),llm,tokenizer,'cuda:0')
135 |             print(kr_prompt_llm.format(triple=sample["triples"],ques=sample["question"]))
136 |             print(knowledge)
137 |         else:
138 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
139 |             print(kr_prompt_gpt.format(triple=sample["triples"],ques=sample["question"]))
140 |             print(knowledge)            
141 |     else:
142 |         knowledge=''
143 |     
144 |     # record
145 |     temp=dict()
146 |     temp['question']=sample['question']
147 |     temp['answer']=sample["answer"]
148 |     temp['graph']=sample["triples"]
149 |     temp['knowledge']=knowledge
150 |     data.append(temp)
151 | 
152 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)
153 | 


--------------------------------------------------------------------------------
/inference/open/rewrite/infer_text.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
  3 | import json
  4 | import random
  5 | from tqdm import tqdm
  6 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
  7 | import torch
  8 | from peft import PeftModel
  9 | import sys
 10 | import openai
 11 | import time
 12 | from openai import OpenAI
 13 | 
 14 | # generation config
 15 | generation_config = GenerationConfig(
 16 |         temperature=0.01,
 17 |         top_k=40,
 18 |         top_p=0.9,
 19 |         do_sample=True,
 20 |         num_beams=1,
 21 |         repetition_penalty=1.1,
 22 |         max_new_tokens=1024
 23 | )
 24 | 
 25 | # dataset: grailqa, GraphQuestions
 26 | DATA='grailqa'
 27 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct, chatgpt
 28 | LLM='Meta-Llama-3-8B-Instruct'
 29 | # retrieve method: bm25, 2hop
 30 | MODE='2hop'
 31 | 
 32 | # set client
 33 | client=OpenAI(api_key='YOUR KEY')
 34 | 
 35 | test=json.load(open('../retrieve/'+MODE+'/format/'+DATA+'.json','r',encoding='utf-8'))
 36 | 
 37 | kr_prompt_llm='''Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: {triple}. The sentence is: '''
 38 | 
 39 | kr_prompt_gpt='''Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Oxybutynin Oral, medicine.routed_drug.route_of_administration, Oral administration) (Oxybutynin Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin Chloride Oral, medicine.routed_drug.marketed_formulations, Oxybutynin chloride 5 extended release film coated tablet) (Oxybutynin chloride 5 extended release film coated tablet, medicine.drug_formulation.formulation_of, Oxybutynin). The sentence is: Oxybutynin Oral is a medication that is administered orally. It is marketed in the form of Oxybutynin chloride 5 extended release film coated tablets. Another marketed formulation is Oxybutynin Chloride Oral. Furthermore, Oxybutynin chloride 5 extended release film coated tablet is a formulation of Oxybutynin.
 40 | 
 41 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Google, organization.organization.founders, Sergey Brin) (Sergey Brin, people.person.education, CVT1) (CVT1, education.education.institution, University of Maryland, College Park) (Google, organization.organization.founders, Larry Page) (Larry Page, people.person.education, CVT2) (CVT2, education.education.institution, University of Michigan) (CVT2, education.education.institution, Stanford University). The sentence is: Google was founded by Sergey Brin and Larry Page. Sergey Brin was educated at the University of Maryland, College Park, while Larry Page was educated at the University of Michigan and Stanford University.
 42 | 
 43 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: (Rock music, music.genre.artists, Outkast) (Rock music, music.genre.parent_genre, Folk music) (Rock music, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, Bright Eyes) (Electronica, music.genre.parent_genre, House music) (Electronica, music.genre.albums, The Confessions Tour) (Electronica, music.genre.artists, t.A.T.u.). The sentence is: Rock music, which is a subgenre of Folk music, includes artists like Outkast and albums such as "The Confessions Tour". Conversely, Electronica is a daughter genre of House music with artists like Bright Eyes and t.A.T.u., and also features albums like "The Confessions Tour".
 44 | 
 45 | Your task is to transform a knowledge graph to a sentence or multiple sentences. The knowledge graph is: {triple}. The sentence is: '''
 46 | 
 47 | num_dict = {
 48 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
 49 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
 50 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
 51 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
 52 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
 53 | }
 54 |    
 55 | if LLM!='chatgpt':
 56 |     # path for LLM
 57 |     LLM_PATH='../../../../pretrain/'+LLM
 58 |     # path for tokenizer
 59 |     TOKENIZER_PATH='../../../../pretrain/'+LLM
 60 |     # path for lora
 61 |     PEFT_PATH='../../../instruction-tuning/output-'+DATA+'/kg-to-text/'+LLM+'/best_model'
 62 |     # load tokenizer and llm
 63 |     tokenizer=AutoTokenizer.from_pretrained(TOKENIZER_PATH)
 64 |     llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
 65 |     # merge peft into base LLM
 66 |     if PEFT_PATH:
 67 |         llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
 68 |     
 69 | # result
 70 | result='result/'+DATA+'/'+MODE+'/'+LLM+'/text.json'
 71 | os.makedirs('result/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 72 | log_file='log/'+DATA+'/'+MODE+'/'+LLM+'/text.log'
 73 | os.makedirs('log/'+DATA+'/'+MODE+'/'+LLM,exist_ok = True)
 74 | 
 75 | # redirect output to log
 76 | sys.stdout = open(log_file, 'w')
 77 | 
 78 | def getResponse(prompt,max_retries=10):
 79 |     # set retries
 80 |     retries=0
 81 |     while retries < max_retries:
 82 |         try:
 83 |             res = client.chat.completions.create(
 84 |                 model='gpt-3.5-turbo',
 85 |                 #model='gpt-4',
 86 |                 messages=[
 87 |                     {'role': 'user', 'content': prompt}
 88 |                 ],
 89 |                 temperature=0,
 90 |             )
 91 |             return res.choices[0].message.content
 92 |         except Exception as e:
 93 |             print(f"An error occurred: {e}")
 94 |             print("Retrying in 1 minutes...")
 95 |             retries += 1
 96 |             time.sleep(60)
 97 |     return ''
 98 |             
 99 | def LLMResponse(prompt,llm,tokenizer,cuda):
100 |     inputs = tokenizer(prompt,return_tensors="pt")
101 |     generation_output = llm.generate(
102 |             input_ids=inputs["input_ids"].to(cuda),
103 |             attention_mask=inputs['attention_mask'].to(cuda),
104 |             eos_token_id=tokenizer.eos_token_id,
105 |             pad_token_id=tokenizer.eos_token_id,
106 |             generation_config=generation_config
107 |         )
108 |     output = tokenizer.decode(generation_output[0],skip_special_tokens=True)
109 |     response = output.split(prompt)[-1].strip()
110 |     return response
111 | 
112 | data=[]
113 | for sample in tqdm(test):
114 | 
115 |     # knowledge rewriter
116 |     if len(sample["triples"])!=0:
117 |         if LLM!='chatgpt':
118 |             knowledge=LLMResponse(kr_prompt_llm.format(triple=sample["triples"]),llm,tokenizer,'cuda:0')
119 |             print(kr_prompt_llm.format(triple=sample["triples"]))
120 |             print(knowledge)
121 |         else:
122 |             knowledge=getResponse(kr_prompt_gpt.format(triple=sample["triples"]))
123 |             print(kr_prompt_gpt.format(triple=sample["triples"]))
124 |             print(knowledge)            
125 |     else:
126 |         knowledge=''
127 | 
128 |     # record
129 |     temp=dict()
130 |     temp['question']=sample['question']
131 |     temp['answer']=sample["answer"]
132 |     temp['graph']=sample["triples"]
133 |     temp['knowledge']=knowledge
134 |     data.append(temp)
135 | 
136 | json.dump(data,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)
137 | 


--------------------------------------------------------------------------------
/instruction-tuning/ds_zero2_no_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |         "enabled": "auto",
 4 |         "loss_scale": 0,
 5 |         "loss_scale_window": 100,
 6 |         "initial_scale_power": 16,
 7 |         "hysteresis": 2,
 8 |         "min_loss_scale": 1e-10
 9 |     },
10 | 
11 |     "zero_optimization": {
12 |         "stage": 2,
13 |         "allgather_partitions": true,
14 |         "allgather_bucket_size": 1e8,
15 |         "overlap_comm": true,
16 |         "reduce_scatter": true,
17 |         "reduce_bucket_size": 1e8,
18 |         "contiguous_gradients": true
19 |     },
20 | 
21 |     "gradient_accumulation_steps": "auto",
22 |     "gradient_clipping": "auto",
23 |     "steps_per_print": 2000,
24 |     "train_batch_size": "auto",
25 |     "train_micro_batch_size_per_gpu": "auto",
26 |     "wall_clock_breakdown": false
27 | }
28 | 


--------------------------------------------------------------------------------
/instruction-tuning/merge.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 4 | from transformers import GenerationConfig,AutoModelForCausalLM,AutoTokenizer,AutoModel
 5 | from peft import PeftModel
 6 | 
 7 | # dataset: GraphQuestions, grailqa
 8 | DATA='grailqa'
 9 | # llm: llama-2-7b-chat-hf, Meta-Llama-3-8B-Instruct
10 | LLM='llama-2-7b-chat-hf'
11 | # mode
12 | MODE='CoT'
13 | # path for LLM
14 | LLM_PATH='../../pretrain/'+LLM
15 | # path for tokenizer
16 | TOKENIZER_PATH='../../pretrain/'+LLM
17 | # path for lora
18 | PEFT_PATH='output-'+DATA+'/'+MODE+'/'+LLM+'/best_model'
19 | # result
20 | result='output-'+DATA+'/sft/'+MODE+'/'+LLM
21 | 
22 | tokenizer=AutoTokenizer.from_pretrained(LLM_PATH)
23 | llm=AutoModelForCausalLM.from_pretrained(LLM_PATH,torch_dtype=torch.float16,low_cpu_mem_usage=True,device_map='cuda:0')
24 | llm=PeftModel.from_pretrained(llm, PEFT_PATH,torch_dtype=torch.float16,device_map='cuda:0')
25 | llm=llm.merge_and_unload()
26 | llm.save_pretrained(result)
27 | 


--------------------------------------------------------------------------------
/instruction-tuning/run_dpo-step.sh:
--------------------------------------------------------------------------------
 1 | llm=Meta-Llama-3-8B-Instruct
 2 | data=GraphQuestions
 3 | MODE=CoT
 4 | dataset=${data}/PA-chatgpt/${MODE}/${llm}
 5 | load_in_kbits=16
 6 | train_file=$dataset/train.json
 7 | validation_file=$dataset/dev.json
 8 | gpu_id='1'
 9 | train_batch_size=1
10 | eval_batch_size=1
11 | accumulation_steps=128
12 | epoch=1
13 | node=1
14 | max_prompt_length=2048
15 | max_target_length=2048
16 | max_seq_length=4096
17 | 
18 | lr=1e-4
19 | lora_rank=64
20 | lora_alpha=128
21 | lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
22 | lora_dropout=0.05
23 | pretrained_model=output-${data}/sft/${MODE}/${llm}
24 | chinese_tokenizer_path=../../pretrain/${llm}
25 | per_device_train_batch_size=${train_batch_size}
26 | per_device_eval_batch_size=${eval_batch_size}
27 | gradient_accumulation_steps=${accumulation_steps}
28 | output_dir=output-$dataset
29 | modules_to_save="embed_tokens,lm_head"
30 | deepspeed_config_file=ds_zero2_no_offload.json
31 | 
32 | CUDA_VISIBLE_DEVICES=${gpu_id} torchrun --master_port 28610 --nnodes 1 --nproc_per_node ${node} run_dpo.py \
33 |     --model_name_or_path ${pretrained_model} \
34 |     --tokenizer_name_or_path ${chinese_tokenizer_path} \
35 |     --train_file ${train_file} \
36 |     --validation_file ${validation_file} \
37 |     --per_device_train_batch_size ${per_device_train_batch_size} \
38 |     --per_device_eval_batch_size ${per_device_eval_batch_size} \
39 |     --do_train \
40 |     --do_eval \
41 |     --seed $RANDOM \
42 |     --fp16 \
43 |     --num_train_epochs ${epoch} \
44 |     --lr_scheduler_type cosine \
45 |     --learning_rate ${lr} \
46 |     --warmup_ratio 0.03 \
47 |     --weight_decay 0 \
48 |     --logging_strategy steps \
49 |     --logging_steps 5 \
50 |     --save_strategy steps \
51 |     --save_steps 5 \
52 |     --evaluation_strategy no \
53 |     --gradient_accumulation_steps ${gradient_accumulation_steps} \
54 |     --preprocessing_num_workers 8 \
55 |     --max_prompt_length ${max_prompt_length} \
56 |     --max_target_length ${max_target_length} \
57 |     --max_seq_length ${max_seq_length} \
58 |     --output_dir ${output_dir} \
59 |     --save_safetensors False \
60 |     --overwrite_output_dir \
61 |     --ddp_timeout 30000 \
62 |     --logging_first_step True \
63 |     --lora_rank ${lora_rank} \
64 |     --lora_alpha ${lora_alpha} \
65 |     --trainable ${lora_trainable} \
66 |     --lora_dropout ${lora_dropout} \
67 |     --torch_dtype float16 \
68 |     --load_in_kbits ${load_in_kbits} \
69 |     --gradient_checkpointing \
70 |     --ddp_find_unused_parameters False \
71 |     --report_to none 
72 | 
73 | 


--------------------------------------------------------------------------------
/instruction-tuning/run_dpo.sh:
--------------------------------------------------------------------------------
 1 | llm=Meta-Llama-3-8B-Instruct
 2 | data=GraphQuestions
 3 | MODE=CoT
 4 | dataset=${data}/PA-chatgpt/${MODE}/${llm}
 5 | load_in_kbits=16
 6 | train_file=$dataset/train.json
 7 | validation_file=$dataset/dev.json
 8 | gpu_id='0'
 9 | train_batch_size=1
10 | eval_batch_size=1
11 | accumulation_steps=128
12 | epoch=10
13 | node=1
14 | max_prompt_length=2048
15 | max_target_length=2048
16 | max_seq_length=4096
17 | 
18 | lr=1e-4
19 | lora_rank=64
20 | lora_alpha=128
21 | lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
22 | lora_dropout=0.05
23 | pretrained_model=output-${data}/sft/${MODE}/${llm}
24 | chinese_tokenizer_path=../../pretrain/${llm}
25 | per_device_train_batch_size=${train_batch_size}
26 | per_device_eval_batch_size=${eval_batch_size}
27 | gradient_accumulation_steps=${accumulation_steps}
28 | output_dir=output-$dataset
29 | modules_to_save="embed_tokens,lm_head"
30 | deepspeed_config_file=ds_zero2_no_offload.json
31 | 
32 | CUDA_VISIBLE_DEVICES=${gpu_id} torchrun --master_port 29920 --nnodes 1 --nproc_per_node ${node} run_dpo.py \
33 |     --model_name_or_path ${pretrained_model} \
34 |     --tokenizer_name_or_path ${chinese_tokenizer_path} \
35 |     --train_file ${train_file} \
36 |     --validation_file ${validation_file} \
37 |     --per_device_train_batch_size ${per_device_train_batch_size} \
38 |     --per_device_eval_batch_size ${per_device_eval_batch_size} \
39 |     --do_train \
40 |     --do_eval \
41 |     --seed $RANDOM \
42 |     --fp16 \
43 |     --num_train_epochs ${epoch} \
44 |     --lr_scheduler_type cosine \
45 |     --learning_rate ${lr} \
46 |     --warmup_ratio 0.03 \
47 |     --weight_decay 0 \
48 |     --logging_strategy steps \
49 |     --logging_steps 10 \
50 |     --save_strategy epoch \
51 |     --save_total_limit 10 \
52 |     --evaluation_strategy epoch \
53 |     --gradient_accumulation_steps ${gradient_accumulation_steps} \
54 |     --preprocessing_num_workers 8 \
55 |     --max_prompt_length ${max_prompt_length} \
56 |     --max_target_length ${max_target_length} \
57 |     --max_seq_length ${max_seq_length} \
58 |     --output_dir ${output_dir} \
59 |     --save_safetensors False \
60 |     --overwrite_output_dir \
61 |     --ddp_timeout 30000 \
62 |     --logging_first_step True \
63 |     --lora_rank ${lora_rank} \
64 |     --lora_alpha ${lora_alpha} \
65 |     --trainable ${lora_trainable} \
66 |     --lora_dropout ${lora_dropout} \
67 |     --torch_dtype float16 \
68 |     --load_in_kbits ${load_in_kbits} \
69 |     --gradient_checkpointing \
70 |     --ddp_find_unused_parameters False \
71 |     --load_best_model_at_end True \
72 |     --report_to none 
73 | 


--------------------------------------------------------------------------------
/instruction-tuning/run_llama-7b.sh:
--------------------------------------------------------------------------------
 1 | llm='llama-2-7b-chat-hf'
 2 | dataset='GraphQuestions/CoT'
 3 | train_batch_size=1
 4 | eval_batch_size=1
 5 | accumulation_steps=64
 6 | node=2
 7 | max_length=4096
 8 | 
 9 | lr=1e-4
10 | lora_rank=64
11 | lora_alpha=128
12 | lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
13 | lora_dropout=0.05
14 | pretrained_model=../../pretrain/${llm}
15 | chinese_tokenizer_path=../../pretrain/${llm}
16 | per_device_train_batch_size=${train_batch_size}
17 | per_device_eval_batch_size=${eval_batch_size}
18 | gradient_accumulation_steps=${accumulation_steps}
19 | dataset_dir=${dataset}/train/
20 | output_dir=output-${dataset}/${llm}
21 | validation_file=${dataset}/dev.json
22 | modules_to_save="embed_tokens,lm_head"
23 | 
24 | deepspeed_config_file=ds_zero2_no_offload.json
25 | torchrun --master_port 27140 --nnodes 1 --nproc_per_node ${node} run_clm_sft_with_peft-7b.py \
26 |     --deepspeed ${deepspeed_config_file} \
27 |     --model_name_or_path ${pretrained_model} \
28 |     --tokenizer_name_or_path ${chinese_tokenizer_path} \
29 |     --dataset_dir ${dataset_dir} \
30 |     --validation_split_percentage 0.001 \
31 |     --per_device_train_batch_size ${per_device_train_batch_size} \
32 |     --per_device_eval_batch_size ${per_device_eval_batch_size} \
33 |     --do_train \
34 |     --do_eval \
35 |     --seed $RANDOM \
36 |     --fp16 \
37 |     --num_train_epochs 10 \
38 |     --lr_scheduler_type cosine \
39 |     --learning_rate ${lr} \
40 |     --warmup_ratio 0.03 \
41 |     --weight_decay 0 \
42 |     --logging_strategy steps \
43 |     --logging_steps 10 \
44 |     --save_strategy epoch \
45 |     --save_total_limit 2 \
46 |     --evaluation_strategy epoch \
47 |     --gradient_accumulation_steps ${gradient_accumulation_steps} \
48 |     --preprocessing_num_workers 8 \
49 |     --max_seq_length ${max_length} \
50 |     --output_dir ${output_dir} \
51 |     --overwrite_output_dir \
52 |     --ddp_timeout 30000 \
53 |     --logging_first_step True \
54 |     --lora_rank ${lora_rank} \
55 |     --lora_alpha ${lora_alpha} \
56 |     --trainable ${lora_trainable} \
57 |     --modules_to_save ${modules_to_save} \
58 |     --lora_dropout ${lora_dropout} \
59 |     --torch_dtype float16 \
60 |     --validation_file ${validation_file} \
61 |     --gradient_checkpointing \
62 |     --ddp_find_unused_parameters False \
63 |     --load_best_model_at_end True \
64 |     --report_to none
65 | 


--------------------------------------------------------------------------------
/instruction-tuning/run_llama-8b.sh:
--------------------------------------------------------------------------------
 1 | llm='Meta-Llama-3-8B-Instruct'
 2 | dataset='grailqa/kg-to-text'
 3 | train_batch_size=1
 4 | eval_batch_size=1
 5 | accumulation_steps=64
 6 | node=2
 7 | max_length=4096
 8 | 
 9 | lr=1e-4
10 | lora_rank=64
11 | lora_alpha=128
12 | lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
13 | lora_dropout=0.05
14 | pretrained_model=../../pretrain/${llm}
15 | chinese_tokenizer_path=../../pretrain/${llm}
16 | per_device_train_batch_size=${train_batch_size}
17 | per_device_eval_batch_size=${eval_batch_size}
18 | gradient_accumulation_steps=${accumulation_steps}
19 | dataset_dir=${dataset}/train/
20 | output_dir=output-${dataset}/${llm}
21 | validation_file=${dataset}/dev.json
22 | modules_to_save="embed_tokens,lm_head"
23 | 
24 | deepspeed_config_file=ds_zero2_no_offload.json
25 | 
26 | torchrun --master_port 29548 --nnodes 1 --nproc_per_node ${node} run_clm_sft_with_peft-8b.py \
27 |     --deepspeed ${deepspeed_config_file} \
28 |     --model_name_or_path ${pretrained_model} \
29 |     --tokenizer_name_or_path ${chinese_tokenizer_path} \
30 |     --dataset_dir ${dataset_dir} \
31 |     --validation_split_percentage 0.001 \
32 |     --per_device_train_batch_size ${per_device_train_batch_size} \
33 |     --per_device_eval_batch_size ${per_device_eval_batch_size} \
34 |     --do_train \
35 |     --do_eval \
36 |     --seed $RANDOM \
37 |     --fp16 \
38 |     --num_train_epochs 10 \
39 |     --lr_scheduler_type cosine \
40 |     --learning_rate ${lr} \
41 |     --warmup_ratio 0.03 \
42 |     --weight_decay 0 \
43 |     --logging_strategy steps \
44 |     --logging_steps 10 \
45 |     --save_strategy epoch \
46 |     --save_total_limit 2 \
47 |     --evaluation_strategy epoch \
48 |     --gradient_accumulation_steps ${gradient_accumulation_steps} \
49 |     --preprocessing_num_workers 8 \
50 |     --max_seq_length ${max_length} \
51 |     --output_dir ${output_dir} \
52 |     --overwrite_output_dir \
53 |     --ddp_timeout 30000 \
54 |     --logging_first_step True \
55 |     --lora_rank ${lora_rank} \
56 |     --lora_alpha ${lora_alpha} \
57 |     --trainable ${lora_trainable} \
58 |     --lora_dropout ${lora_dropout} \
59 |     --torch_dtype float16 \
60 |     --validation_file ${validation_file} \
61 |     --gradient_checkpointing \
62 |     --ddp_find_unused_parameters False \
63 |     --load_best_model_at_end True \
64 |     --report_to none 
65 | 


--------------------------------------------------------------------------------
/requirement1.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.30.0
 2 | datasets==2.19.1
 3 | deepspeed==0.10.0
 4 | numpy==1.26.4
 5 | openai==1.27.0
 6 | pandas==2.2.2
 7 | peft==0.10.0
 8 | safetensors==0.4.3
 9 | sentence-transformers==2.2.2
10 | sentencepiece==0.2.0
11 | tensorboard==2.15.1
12 | torch==2.3.0
13 | transformers==4.40.2


--------------------------------------------------------------------------------
/requirement2.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.30.0
 2 | datasets==2.19.1
 3 | deepspeed==0.14.2
 4 | numpy==1.26.4
 5 | openai==1.27.0
 6 | pandas==2.2.2
 7 | peft==0.6.2
 8 | safetensors==0.4.3
 9 | sentence-transformers==2.2.2
10 | sentencepiece==0.2.0
11 | tensorboard==2.15.1
12 | torch==2.3.0
13 | transformers==4.40.2
14 | trl==0.8.6


--------------------------------------------------------------------------------
/subgraph/GraphQuestions/graph_query.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | from sparql_utils.sparql_executor import execute_query, execute_query_allvar, get_friendly_name
  4 | from query_interface import get_1hop
  5 | from tqdm import tqdm
  6 | import random
  7 | import copy
  8 | 
  9 | EXNUM=10
 10 | GRAPHNUM=10
 11 | 
 12 | # change sparql to query all variables
 13 | def update_sparql_query(query_string):
 14 |     # find all ver
 15 |     var_pattern = r'\?[xy]\d+'
 16 |     variables = set(re.findall(var_pattern, query_string))
 17 |     # sort based on num
 18 |     variables = sorted(variables, key=lambda x: int(x[2:]))
 19 |     # extract triples with y for it may not be used in main query
 20 |     query_lines = query_string.split('\n')
 21 |     query_y=[]
 22 |     for line in query_lines:
 23 |         if line.startswith("?") and len(line.split(' ')) == 5 and '?y' in line:
 24 |             query_y.append(line)
 25 |     query_lines=query_lines[:3]+query_y+query_lines[3:]
 26 |     # modify select distinct
 27 |     if query_lines[1].startswith('SELECT '):
 28 |         # remove SELECT (?x0 AS ?value) WHERE { and last }
 29 |         query_lines = query_lines[:1] + query_lines[2:-1]
 30 |         if query_lines[1].startswith('SELECT DISTINCT'):
 31 |             select_parts = query_lines[1].split(' ')
 32 |             select_parts[2] = ' '.join(variables)
 33 |             query_lines[1] = ' '.join(select_parts)
 34 |     return '\n'.join(query_lines), list(variables)
 35 |     
 36 | # parse sparql to subgraph
 37 | def sparql_to_graph(query):
 38 |     # input: sparql query
 39 |     # return: str(triples), 
 40 |     lines = query.split('\n')
 41 |     graph_lines = []
 42 |     values = {}
 43 |     # extract all intermediate entity mid
 44 |     for line in lines:
 45 |         if line.startswith("VALUE"):
 46 |             k = None
 47 |             v = None
 48 |             for item in line.split(' '):
 49 |                 if item.startswith("?"):
 50 |                     k = item
 51 |                 if item.startswith(":") or "<http://www.w3.org/2001/XMLSchema#" in item:
 52 |                     v = item
 53 |             assert k != None and v != None
 54 |             values[k] = v
 55 |     
 56 |     # extract triples from sparql query
 57 |     for line in lines:
 58 |         if line.startswith("SELECT DISTINCT"): # make sure Answer always is ?x0
 59 |             assert line == 'SELECT DISTINCT ?x0  WHERE { '
 60 |         # only process triple lines
 61 |         if line.startswith("?") and len(line.split(' ')) == 5:
 62 |             graph_lines.append(line)
 63 |     graph = '\n'.join(graph_lines)
 64 |     return graph
 65 |     
 66 | def query(file_type):
 67 |     file_name = 'data/' + file_type +'.json'
 68 |     
 69 |     data = json.load(open(file_name, 'r'))
 70 |     mid_dict = {}
 71 |     graphdata=[]
 72 |     
 73 |     for one_example in tqdm(data):
 74 |         # one sample
 75 |         sample=dict()
 76 |         # cvt dict, for one sample
 77 |         cvt_dict=dict()
 78 |         # cvt index
 79 |         cvt=1
 80 |         # sparql query
 81 |         sparql_query = one_example['sparql_query']
 82 |         sparql_query, variables= update_sparql_query(sparql_query)
 83 |         res = execute_query_allvar(sparql_query)
 84 |         # name query
 85 |         name=[]
 86 |         if res != None:
 87 |             for one_combo in res:
 88 |                 # one name
 89 |                 one_name=dict()
 90 |                 # iterate query results
 91 |                 for k,v in one_combo.items():
 92 |                     value = 'null'
 93 |                     one_combo[k] = {'mid': v['value'], 'value': v['value']}
 94 |                     if v['value'].startswith('http://rdf.freebase.com/ns/'):
 95 |                         mid = v['value'].replace('http://rdf.freebase.com/ns/', '').replace('-08:00','')
 96 |                         if mid in mid_dict:
 97 |                             value = mid_dict[mid]
 98 |                         else:
 99 |                             try:
100 |                                 value = get_friendly_name(mid)
101 |                                 if value!='null':
102 |                                     mid_dict[mid] = value
103 |                             except:
104 |                                 print(mid)
105 |                     else:
106 |                         mid=v['value'].replace('-08:00','')
107 |                         value=v['value'].replace('-08:00','')
108 |                     if value!='null': 
109 |                         one_name[k] = {'mid': mid, 'value': value}
110 |                     else:
111 |                         if cvt_dict.get(mid):
112 |                             one_name[k] = {'mid': mid, 'value': cvt_dict[mid]}
113 |                         else:
114 |                             one_name[k] = {'mid': mid, 'value': 'CVT'+str(cvt)}
115 |                             cvt_dict[mid]='CVT'+str(cvt)
116 |                             cvt+=1    
117 |                 name.append(one_name)
118 |         
119 |         # extract entity mid from graph
120 |         midlist=[]
121 |         # graph name exchange
122 |         graph=[]
123 |         graphstr=sparql_to_graph(one_example['sparql_query'])
124 |         graphstr=graphstr.replace(' . ','').split('\n')
125 |         for n in name:
126 |             midset=set()
127 |             # one subgraph for ques
128 |             one_graph=[]
129 |             # mid to name
130 |             for i in graphstr:
131 |                 triple=[]
132 |                 j=i.split(' ')
133 |                 for k in j:
134 |                     if k.startswith('?'):
135 |                         triple.append(n[k[1:]]['value'])
136 |                         continue
137 |                     if k.startswith(':'):
138 |                         triple.append(k[1:])
139 |                         continue
140 |                     triple.append(k)
141 |                 # add mid
142 |                 # j[0]
143 |                 # make sure j[0] is an entity
144 |                 if j[0].startswith('?') and len(n[j[0][1:]]['mid'])>1 and n[j[0][1:]]['mid'][0:2] in ['m.','n.','g.']:
145 |                     midset.add(n[j[0][1:]]['mid'])
146 |                 # j[2]
147 |                 # make sure j[2] is an entity
148 |                 if j[2].startswith('?') and len(n[j[2][1:]]['mid'])>1 and n[j[2][1:]]['mid'][0:2] in ['m.','n.','g.']:
149 |                     midset.add(n[j[2][1:]]['mid'])
150 |                 # skip type relation               
151 |                 if triple[1]!='type.object.type':
152 |                     one_graph.append(triple)
153 |             midlist.append(midset)
154 |             random.shuffle(one_graph)
155 |             graph.append(one_graph)
156 |         
157 |         # graph extend
158 |         ex_graph=[]
159 |         for index,g in enumerate(graph[:GRAPHNUM]):
160 |             # copy g to g1
161 |             g1=copy.deepcopy(g)
162 |             # iteratively extend triple
163 |             ex_triple=[]
164 |             # collect mid triple
165 |             mid_triple=[]
166 |             for j in midlist[index]:
167 |                 for k in get_1hop(j)[:EXNUM]:
168 |                     if k not in mid_triple:
169 |                         mid_triple.append(k)
170 |             # avoid redundant triple
171 |             unique_triples = set(tuple(triple) for triple in mid_triple)
172 |             mid_triple = [list(triple) for triple in unique_triples]
173 |             random.shuffle(mid_triple)
174 |             # mid to name
175 |             for k in mid_triple:
176 |                 extend=[]
177 |                 # k[0]
178 |                 temp=''
179 |                 # k[0] is in mid_dict
180 |                 if mid_dict.get(k[0]): 
181 |                     temp=mid_dict[k[0]]
182 |                 # k[0] is not entity
183 |                 if len(temp)==0 and (len(k[0])==1 or k[0][0:2] not in ['m.','n.','g.']):
184 |                     temp=k[0].replace('-08:00','')
185 |                 # k[0] is entity                    
186 |                 if len(temp)==0:
187 |                     temp=get_friendly_name(k[0])
188 |                     if temp=='null':
189 |                         if cvt_dict.get(k[0]):
190 |                             temp=cvt_dict[k[0]]
191 |                         else:
192 |                             temp='CVT'+str(cvt)
193 |                             cvt_dict[k[0]]=temp
194 |                             cvt+=1
195 |                     else:
196 |                         temp=temp.replace('-08:00','')
197 |                 extend.append(temp)
198 |                 # k[1]
199 |                 extend.append(k[1])
200 |                 # k[2]                    
201 |                 temp='' 
202 |                 # k[2] is in mid_dict
203 |                 if mid_dict.get(k[2]):
204 |                     temp=mid_dict[k[2]]
205 |                 # k[2] is not entity
206 |                 if len(temp)==0 and (len(k[2])==1 or k[2][0:2] not in ['m.','n.','g.']):
207 |                     temp=k[2].replace('-08:00','') 
208 |                 # k[2] is entity                   
209 |                 if len(temp)==0:
210 |                     temp=get_friendly_name(k[2])
211 |                     if temp=='null':
212 |                         if cvt_dict.get(k[2]):
213 |                             temp=cvt_dict[k[2]]
214 |                         else:
215 |                             temp='CVT'+str(cvt)
216 |                             cvt_dict[k[2]]=temp
217 |                             cvt+=1
218 |                     else:
219 |                         temp=temp.replace('-08:00','')
220 |                 extend.append(temp)              
221 |                 #if extend not in g1:
222 |                 #    g1.append(extend)
223 |                 if extend not in ex_triple:
224 |                     ex_triple.append(extend)     
225 |             # add ex_triple to g1
226 |             random.shuffle(ex_triple)
227 |             g1.extend(ex_triple)   
228 |             random.shuffle(g1)
229 |             ex_graph.append(g1)
230 |      
231 |         sample['qid']=one_example['qid']
232 |         sample['question']=one_example['question']
233 |         sample['answer']=one_example['answer']
234 |         sample['sparql_query']=one_example['sparql_query']
235 |         sample['s_expression']=one_example['s_expression']
236 |         sample['graph']=graph
237 |         sample['restrict_graph']=graph[:GRAPHNUM]
238 |         sample['ex_graph']=ex_graph
239 |         graphdata.append(sample)
240 |             
241 |     json.dump(graphdata,open('graph/'+file_type+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
242 | 
243 | query('train')
244 | query('test')


--------------------------------------------------------------------------------
/subgraph/GraphQuestions/sparql_utils/misc.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict, Counter, deque
  2 | import torch
  3 | import json
  4 | import pickle
  5 | import numpy as np
  6 | import torch.nn as nn
  7 | import random
  8 | import os
  9 | import time
 10 | ######################################################
 11 | ##################### used in SRN ####################
 12 | START_RELATION = 'START_RELATION'
 13 | NO_OP_RELATION = 'NO_OP_RELATION'
 14 | NO_OP_ENTITY = 'NO_OP_ENTITY'
 15 | DUMMY_RELATION = 'DUMMY_RELATION'
 16 | DUMMY_ENTITY = 'DUMMY_ENTITY'
 17 | 
 18 | DUMMY_RELATION_ID = 0
 19 | START_RELATION_ID = 1
 20 | NO_OP_RELATION_ID = 2
 21 | DUMMY_ENTITY_ID = 0
 22 | NO_OP_ENTITY_ID = 1
 23 | 
 24 | EPSILON = float(np.finfo(float).eps)
 25 | HUGE_INT = 1e31
 26 | 
 27 | def format_path(path_trace, id2entity, id2relation):
 28 |     def get_most_recent_relation(j):
 29 |         relation_id = int(path_trace[j][0])
 30 |         if relation_id == NO_OP_RELATION_ID:
 31 |             return '<null>'
 32 |         else:
 33 |             return id2relation[relation_id]
 34 | 
 35 |     def get_most_recent_entity(j):
 36 |         return id2entity[int(path_trace[j][1])]
 37 | 
 38 |     path_str = get_most_recent_entity(0)
 39 |     for j in range(1, len(path_trace)):
 40 |         rel = get_most_recent_relation(j)
 41 |         if not rel.endswith('_inv'):
 42 |             path_str += ' -{}-> '.format(rel)
 43 |         else:
 44 |             path_str += ' <-{}- '.format(rel[:-4])
 45 |         path_str += get_most_recent_entity(j)
 46 |     return path_str
 47 | 
 48 | def pad_and_cat(a, padding_value, padding_dim=1):
 49 |     max_dim_size = max([x.size()[padding_dim] for x in a])
 50 |     padded_a = []
 51 |     for x in a:
 52 |         if x.size()[padding_dim] < max_dim_size:
 53 |             res_len = max_dim_size - x.size()[1]
 54 |             pad = nn.ConstantPad1d((0, res_len), padding_value)
 55 |             padded_a.append(pad(x))
 56 |         else:
 57 |             padded_a.append(x)
 58 |     return torch.cat(padded_a, dim=0)
 59 | 
 60 | def safe_log(x):
 61 |     return torch.log(x + EPSILON)
 62 | 
 63 | def entropy(p):
 64 |     return torch.sum(- p * safe_log(p), 1)
 65 | 
 66 | def init_word2id():
 67 |     return {
 68 |         '<PAD>': 0,
 69 |         '<UNK>': 1,
 70 |         'E_S': 2,
 71 |     }
 72 | def init_entity2id():
 73 |     return {
 74 |         DUMMY_ENTITY: DUMMY_ENTITY_ID,
 75 |         NO_OP_ENTITY: NO_OP_ENTITY_ID
 76 |     }
 77 | def init_relation2id():
 78 |     return {
 79 |         DUMMY_RELATION: DUMMY_RELATION_ID,
 80 |         START_RELATION: START_RELATION_ID,
 81 |         NO_OP_RELATION: NO_OP_RELATION_ID
 82 |     }
 83 | 
 84 | def add_item_to_x2id(item, x2id):
 85 |     if not item in x2id:
 86 |         x2id[item] = len(x2id)
 87 | 
 88 | def tile_along_beam(v, beam_size, dim=0):
 89 |     """
 90 |     Tile a tensor along a specified dimension for the specified beam size.
 91 |     :param v: Input tensor.
 92 |     :param beam_size: Beam size.
 93 |     """
 94 |     if dim == -1:
 95 |         dim = len(v.size()) - 1
 96 |     v = v.unsqueeze(dim + 1)
 97 |     v = torch.cat([v] * beam_size, dim=dim+1)
 98 |     new_size = []
 99 |     for i, d in enumerate(v.size()):
100 |         if i == dim + 1:
101 |             new_size[-1] *= d
102 |         else:
103 |             new_size.append(d)
104 |     return v.view(new_size)
105 | ##################### used in SRN ####################
106 | ######################################################
107 | 
108 | 
109 |         
110 | def init_vocab():
111 |     return {
112 |         '<PAD>': 0,
113 |         '<UNK>': 1,
114 |         '<START>': 2,
115 |         '<END>': 3
116 |     }
117 | 
118 | def invert_dict(d):
119 |     return {v: k for k, v in d.items()}
120 | 
121 | def load_glove(glove_pt, idx_to_token):
122 |     glove = pickle.load(open(glove_pt, 'rb'))
123 |     dim = len(glove['the'])
124 |     matrix = []
125 |     for i in range(len(idx_to_token)):
126 |         token = idx_to_token[i]
127 |         tokens = token.split()
128 |         if len(tokens) > 1:
129 |             v = np.zeros((dim,))
130 |             for token in tokens:
131 |                 v = v + glove.get(token, glove['the'])
132 |             v = v / len(tokens)
133 |         else:
134 |             v = glove.get(token, glove['the'])
135 |         matrix.append(v)
136 |     matrix = np.asarray(matrix)
137 |     return matrix
138 | 
139 | 
140 | class SmoothedValue(object):
141 |     """Track a series of values and provide access to smoothed values over a
142 |     window or the global series average.
143 |     """
144 | 
145 |     def __init__(self, window_size=20):
146 |         self.deque = deque(maxlen=window_size)
147 |         self.series = []
148 |         self.total = 0.0
149 |         self.count = 0
150 | 
151 |     def update(self, value):
152 |         self.deque.append(value)
153 |         self.series.append(value)
154 |         self.count += 1
155 |         self.total += value
156 | 
157 |     @property
158 |     def median(self):
159 |         d = torch.tensor(list(self.deque))
160 |         return d.median().item()
161 | 
162 |     @property
163 |     def avg(self):
164 |         d = torch.tensor(list(self.deque))
165 |         return d.mean().item()
166 | 
167 |     @property
168 |     def global_avg(self):
169 |         return self.total / self.count
170 | 
171 | 
172 | class MetricLogger(object):
173 |     def __init__(self, delimiter="\t"):
174 |         self.meters = defaultdict(SmoothedValue)
175 |         self.delimiter = delimiter
176 | 
177 |     def update(self, **kwargs):
178 |         for k, v in kwargs.items():
179 |             if isinstance(v, torch.Tensor):
180 |                 v = v.item()
181 |             assert isinstance(v, (float, int))
182 |             self.meters[k].update(v)
183 | 
184 |     def __getattr__(self, attr):
185 |         if attr in self.meters:
186 |             return self.meters[attr]
187 |         if attr in self.__dict__:
188 |             return self.__dict__[attr]
189 |         raise AttributeError("'{}' object has no attribute '{}'".format(
190 |                     type(self).__name__, attr))
191 | 
192 |     def __str__(self):
193 |         loss_str = []
194 |         for name, meter in self.meters.items():
195 |             loss_str.append(
196 |                 "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg)
197 |             )
198 |         return self.delimiter.join(loss_str)
199 | 
200 | 
201 | def seed_everything(seed=1029):
202 |     '''
203 |     设置整个开发环境的seed
204 |     :param seed:
205 |     :param device:
206 |     :return:
207 |     '''
208 |     random.seed(seed)
209 |     os.environ['PYTHONHASHSEED'] = str(seed)
210 |     np.random.seed(seed)
211 |     torch.manual_seed(seed)
212 |     torch.cuda.manual_seed(seed)
213 |     torch.cuda.manual_seed_all(seed)
214 |     # some cudnn methods can be random even after fixing the seed
215 |     # unless you tell it to be deterministic
216 |     torch.backends.cudnn.deterministic = True
217 | 
218 | 
219 | class ProgressBar(object):
220 |     '''
221 |     custom progress bar
222 |     Example:
223 |         >>> pbar = ProgressBar(n_total=30,desc='training')
224 |         >>> step = 2
225 |         >>> pbar(step=step)
226 |     '''
227 |     def __init__(self, n_total,width=30,desc = 'Training'):
228 |         self.width = width
229 |         self.n_total = n_total
230 |         self.start_time = time.time()
231 |         self.desc = desc
232 | 
233 |     def __call__(self, step, info={}):
234 |         now = time.time()
235 |         current = step + 1
236 |         recv_per = current / self.n_total
237 |         bar = f'[{self.desc}] {current}/{self.n_total} ['
238 |         if recv_per >= 1:
239 |             recv_per = 1
240 |         prog_width = int(self.width * recv_per)
241 |         if prog_width > 0:
242 |             bar += '=' * (prog_width - 1)
243 |             if current< self.n_total:
244 |                 bar += ">"
245 |             else:
246 |                 bar += '='
247 |         bar += '.' * (self.width - prog_width)
248 |         bar += ']'
249 |         show_bar = f"\r{bar}"
250 |         time_per_unit = (now - self.start_time) / current
251 |         if current < self.n_total:
252 |             eta = time_per_unit * (self.n_total - current)
253 |             if eta > 3600:
254 |                 eta_format = ('%d:%02d:%02d' %
255 |                               (eta // 3600, (eta % 3600) // 60, eta % 60))
256 |             elif eta > 60:
257 |                 eta_format = '%d:%02d' % (eta // 60, eta % 60)
258 |             else:
259 |                 eta_format = '%ds' % eta
260 |             time_info = f' - ETA: {eta_format}'
261 |         else:
262 |             if time_per_unit >= 1:
263 |                 time_info = f' {time_per_unit:.1f}s/step'
264 |             elif time_per_unit >= 1e-3:
265 |                 time_info = f' {time_per_unit * 1e3:.1f}ms/step'
266 |             else:
267 |                 time_info = f' {time_per_unit * 1e6:.1f}us/step'
268 | 
269 |         show_bar += time_info
270 |         if len(info) != 0:
271 |             show_info = f'{show_bar} ' + \
272 |                         "-".join([f' {key}: {value:.4f} ' for key, value in info.items()])
273 |             print(show_info, end='')
274 |         else:
275 |             print(show_bar, end='')


--------------------------------------------------------------------------------
/subgraph/GraphQuestions/sparql_utils/value_class.py:
--------------------------------------------------------------------------------
  1 | def comp(a, b, op):
  2 |     """
  3 |     Args:
  4 |         - a (ValueClass): attribute value of a certain entity
  5 |         - b (ValueClass): comparison target
  6 |         - op: =/>/</!=
  7 |     Example:
  8 |         a is someone's birthday, 1960-02-01, b is 1960, op is '=', then return True
  9 |     """
 10 |     if b.isTime():
 11 |         # Note: for time, 'a=b' actually means a in b, 'a!=b' means a not in b
 12 |         if op == '=':
 13 |             return b.contains(a)
 14 |         elif op == '!=':
 15 |             return not b.contains(a)
 16 |     if op == '=':
 17 |         return a == b
 18 |     elif op == '<':
 19 |         return a < b
 20 |     elif op == '>':
 21 |         return a > b
 22 |     elif op == '!=':
 23 |         return a != b
 24 | 
 25 | class ValueClass():
 26 |     def __init__(self, type, value, unit=None):
 27 |         """
 28 |         When type is
 29 |             - string, value is a str
 30 |             - quantity, value is a number and unit is required
 31 |             - year, value is a int
 32 |             - date, value is a date object
 33 |         """
 34 |         self.type = type
 35 |         self.value = value
 36 |         self.unit = unit
 37 | 
 38 |     def isTime(self):
 39 |         return self.type in {'year', 'date'}
 40 | 
 41 |     def can_compare(self, other):
 42 |         if self.type == 'string':
 43 |             return other.type == 'string'
 44 |         elif self.type == 'quantity':
 45 |             # NOTE: for two quantity, they can compare only when they have the same unit
 46 |             return other.type == 'quantity' and other.unit == self.unit
 47 |         else:
 48 |             # year can compare with date
 49 |             return other.type == 'year' or other.type == 'date'
 50 | 
 51 |     def contains(self, other):
 52 |         """
 53 |         check whether self contains other, which is different from __eq__ and the result is asymmetric
 54 |         used for conditions like whether 2001-01-01 in 2001, or whether 2001 in 2001-01-01
 55 |         """
 56 |         if self.type == 'year': # year can contain year and date
 57 |             other_value = other.value if other.type == 'year' else other.value.year
 58 |             return self.value == other_value
 59 |         elif self.type == 'date': # date can only contain date
 60 |             return other.type == 'date' and self.value == other.value
 61 |         else:
 62 |             raise Exception('not supported type: %s' % self.type)
 63 | 
 64 | 
 65 |     def __eq__(self, other):
 66 |         """
 67 |         2001 and 2001-01-01 is not equal
 68 |         """
 69 |         assert self.can_compare(other)
 70 |         return self.type == other.type and self.value == other.value
 71 | 
 72 |     def __lt__(self, other):
 73 |         """
 74 |         Comparison between a year and a date will convert them both to year
 75 |         """
 76 |         assert self.can_compare(other)
 77 |         if self.type == 'string':
 78 |             raise Exception('try to compare two string')
 79 |         elif self.type == 'quantity':
 80 |             return self.value < other.value
 81 |         elif self.type == 'year':
 82 |             other_value = other.value if other.type == 'year' else other.value.year
 83 |             return self.value < other_value
 84 |         elif self.type == 'date':
 85 |             if other.type == 'year':
 86 |                 return self.value.year < other.value
 87 |             else:
 88 |                 return self.value < other.value
 89 | 
 90 |     def __gt__(self, other):
 91 |         assert self.can_compare(other)
 92 |         if self.type == 'string':
 93 |             raise Exception('try to compare two string')
 94 |         elif self.type == 'quantity':
 95 |             return self.value > other.value
 96 |         elif self.type == 'year':
 97 |             other_value = other.value if other.type == 'year' else other.value.year
 98 |             return self.value > other_value
 99 |         elif self.type == 'date':
100 |             if other.type == 'year':
101 |                 return self.value.year > other.value
102 |             else:
103 |                 return self.value > other.value
104 | 
105 |     def __str__(self):
106 |         if self.type == 'string':
107 |             return self.value
108 |         elif self.type == 'quantity':
109 |             if self.value - int(self.value) < 1e-5:
110 |                 v = int(self.value)
111 |             else:
112 |                 v = self.value
113 |             return '{} {}'.format(v, self.unit) if self.unit != '1' else str(v)
114 |         elif self.type == 'year':
115 |             return str(self.value)
116 |         elif self.type == 'date':
117 |             return self.value.isoformat()
118 | 


--------------------------------------------------------------------------------
/subgraph/gold_graph.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import math
 4 | import random
 5 | from tqdm import tqdm
 6 | 
 7 | # dataset: grailqa, GraphQuestions
 8 | DATA='grailqa'
 9 | 
10 | # result for subgraph
11 | result=DATA+'/gold/test.json'
12 | 
13 | # load data
14 | data=json.load(open(DATA+'/graph/test.json','r',encoding='utf-8'))
15 | 
16 | num_dict = {
17 |     '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
18 |     '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
19 |     '10': 'ten', '11': 'eleven', '12': 'twelve', '13': 'thirteen', 
20 |     '14': 'fourteen', '15': 'fifteen', '16': 'sixteen', '17': 'seventeen',
21 |     '18': 'eighteen', '19': 'nineteen', '20': 'twenty'
22 | }
23 | 
24 | MAX_NUM=10
25 | 
26 | samplelist=[]
27 | for sample in tqdm(data):
28 |     # graph sample
29 |     graphset=set()
30 |     for i in sample['graph'][:MAX_NUM]:
31 |         for j in i:
32 |             graphset.add('('+j[0]+', '+j[1]+', '+j[2]+')')
33 |     # avoid too many triples
34 |     graphlist=list(graphset)
35 |     
36 | 
37 |     # gold answer extraction
38 |     if DATA=='WebQSP':
39 |         gold=sample["answer"]
40 |     else:
41 |         gold=[]
42 |         for i in sample["answer"]:
43 |             if i.get("entity_name"):
44 |                 gold.append(i["entity_name"])
45 |             else:
46 |                 gold.append(i["answer_argument"])
47 | 
48 |    
49 |     # save
50 |     temp=dict()
51 |     temp['question']=sample['question']
52 |     temp["triples"]=' '.join(graphlist)
53 |     temp['answer']=gold
54 |     samplelist.append(temp)
55 | 
56 | json.dump(samplelist,open(result,'w',encoding='utf-8'),indent=2,ensure_ascii=False)
57 |     


--------------------------------------------------------------------------------
/subgraph/grailqa/graph_query.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | from sparql_utils.sparql_executor import execute_query, execute_query_allvar, get_friendly_name
  4 | from query_interface import get_1hop
  5 | from tqdm import tqdm
  6 | import random
  7 | import copy
  8 | 
  9 | # max extend triple number for each entity in gold graph
 10 | EXNUM=10
 11 | # max graph number for extend
 12 | GRAPHNUM=10
 13 | 
 14 | # change sparql to query all variables
 15 | def update_sparql_query(query_string):
 16 |     # find all ver
 17 |     var_pattern = r'\?[xy]\d+'
 18 |     variables = set(re.findall(var_pattern, query_string))
 19 |     # sort based on num
 20 |     variables = sorted(variables, key=lambda x: int(x[2:]))
 21 |     # extract triples with y for it may not be used in main query
 22 |     query_lines = query_string.split('\n')
 23 |     query_y=[]
 24 |     for line in query_lines:
 25 |         if line.startswith("?") and len(line.split(' ')) == 5 and '?y' in line:
 26 |             query_y.append(line)
 27 |     query_lines=query_lines[:3]+query_y+query_lines[3:]
 28 |     # modify select distinct
 29 |     if query_lines[1].startswith('SELECT '):
 30 |         # remove SELECT (?x0 AS ?value) WHERE { and last }
 31 |         query_lines = query_lines[:1] + query_lines[2:-1]
 32 |         if query_lines[1].startswith('SELECT DISTINCT'):
 33 |             select_parts = query_lines[1].split(' ')
 34 |             select_parts[2] = ' '.join(variables)
 35 |             query_lines[1] = ' '.join(select_parts)
 36 |     return '\n'.join(query_lines), list(variables)
 37 |     
 38 | # parse sparql to subgraph
 39 | def sparql_to_graph(query):
 40 |     # input: sparql query
 41 |     # return: str(triples), 
 42 |     lines = query.split('\n')
 43 |     graph_lines = []
 44 |     values = {}
 45 |     # extract all intermediate entity mid
 46 |     for line in lines:
 47 |         if line.startswith("VALUE"):
 48 |             k = None
 49 |             v = None
 50 |             for item in line.split(' '):
 51 |                 if item.startswith("?"):
 52 |                     k = item
 53 |                 if item.startswith(":") or "<http://www.w3.org/2001/XMLSchema#" in item:
 54 |                     v = item
 55 |             assert k != None and v != None
 56 |             values[k] = v
 57 |     
 58 |     # extract triples from sparql query
 59 |     for line in lines:
 60 |         if line.startswith("SELECT DISTINCT"): # make sure Answer always is ?x0
 61 |             assert line == 'SELECT DISTINCT ?x0  WHERE { '
 62 |         # only process triple lines
 63 |         if line.startswith("?") and len(line.split(' ')) == 5:
 64 |             graph_lines.append(line)
 65 |     graph = '\n'.join(graph_lines)
 66 |     return graph
 67 |     
 68 | def query(file_type):
 69 |     file_name = 'data/' + file_type +'.json'
 70 |     
 71 |     data = json.load(open(file_name, 'r'))
 72 |     mid_dict = {}
 73 |     graphdata=[]
 74 |     
 75 |     for one_example in tqdm(data):
 76 |         # one sample
 77 |         sample=dict()
 78 |         # cvt dict, for one sample
 79 |         cvt_dict=dict()
 80 |         # cvt index
 81 |         cvt=1
 82 |         # sparql query
 83 |         sparql_query = one_example['sparql_query']
 84 |         sparql_query, variables= update_sparql_query(sparql_query)
 85 |         res = execute_query_allvar(sparql_query)
 86 |         # name query
 87 |         name=[]
 88 |         if res != None:
 89 |             for one_combo in res:
 90 |                 # one name
 91 |                 one_name=dict()
 92 |                 # iterate query results
 93 |                 for k,v in one_combo.items():
 94 |                     value = 'null'
 95 |                     one_combo[k] = {'mid': v['value'], 'value': v['value']}
 96 |                     if v['value'].startswith('http://rdf.freebase.com/ns/'):
 97 |                         mid = v['value'].replace('http://rdf.freebase.com/ns/', '').replace('-08:00','')
 98 |                         if mid in mid_dict:
 99 |                             value = mid_dict[mid]
100 |                         else:
101 |                             try:
102 |                                 value = get_friendly_name(mid)
103 |                                 if value!='null':
104 |                                     mid_dict[mid] = value
105 |                             except:
106 |                                 print(mid)
107 |                     else:
108 |                         mid=v['value'].replace('-08:00','')
109 |                         value=v['value'].replace('-08:00','')
110 |                     if value!='null': 
111 |                         one_name[k] = {'mid': mid, 'value': value}
112 |                     else:
113 |                         if cvt_dict.get(mid):
114 |                             one_name[k] = {'mid': mid, 'value': cvt_dict[mid]}
115 |                         else:
116 |                             one_name[k] = {'mid': mid, 'value': 'CVT'+str(cvt)}
117 |                             cvt_dict[mid]='CVT'+str(cvt)
118 |                             cvt+=1    
119 |                 name.append(one_name)
120 |         
121 |         # extract entity mid from graph
122 |         midlist=[]
123 |         # graph name exchange
124 |         graph=[]
125 |         graphstr=sparql_to_graph(one_example['sparql_query'])
126 |         graphstr=graphstr.replace(' . ','').split('\n')
127 |         for n in name:
128 |             midset=set()
129 |             # one subgraph for ques
130 |             one_graph=[]
131 |             # mid to name
132 |             for i in graphstr:
133 |                 triple=[]
134 |                 j=i.split(' ')
135 |                 for k in j:
136 |                     if k.startswith('?'):
137 |                         triple.append(n[k[1:]]['value'])
138 |                         continue
139 |                     if k.startswith(':'):
140 |                         triple.append(k[1:])
141 |                         continue
142 |                     triple.append(k)
143 |                 # add mid
144 |                 # j[0]
145 |                 # make sure j[0] is an entity
146 |                 if j[0].startswith('?') and len(n[j[0][1:]]['mid'])>1 and n[j[0][1:]]['mid'][0:2] in ['m.','n.','g.']:
147 |                     midset.add(n[j[0][1:]]['mid'])
148 |                 # j[2]
149 |                 # make sure j[2] is an entity
150 |                 if j[2].startswith('?') and len(n[j[2][1:]]['mid'])>1 and n[j[2][1:]]['mid'][0:2] in ['m.','n.','g.']:
151 |                     midset.add(n[j[2][1:]]['mid'])
152 |                 # skip type relation               
153 |                 if triple[1]!='type.object.type':
154 |                     one_graph.append(triple)
155 |             midlist.append(midset)
156 |             random.shuffle(one_graph)
157 |             graph.append(one_graph)
158 |         
159 |         # graph extend
160 |         ex_graph=[]
161 |         for index,g in enumerate(graph[:GRAPHNUM]):
162 |             # copy g to g1
163 |             g1=copy.deepcopy(g)
164 |             # iteratively extend triple
165 |             ex_triple=[]
166 |             # collect mid triple
167 |             mid_triple=[]
168 |             for j in midlist[index]:
169 |                 for k in get_1hop(j)[:EXNUM]:
170 |                     if k not in mid_triple:
171 |                         mid_triple.append(k)
172 |             # avoid redundant triple
173 |             unique_triples = set(tuple(triple) for triple in mid_triple)
174 |             mid_triple = [list(triple) for triple in unique_triples]
175 |             random.shuffle(mid_triple)
176 |             # mid to name
177 |             for k in mid_triple:
178 |                 extend=[]
179 |                 # k[0]
180 |                 temp=''
181 |                 # k[0] is in mid_dict
182 |                 if mid_dict.get(k[0]): 
183 |                     temp=mid_dict[k[0]]
184 |                 # k[0] is not entity
185 |                 if len(temp)==0 and (len(k[0])==1 or k[0][0:2] not in ['m.','n.','g.']):
186 |                     temp=k[0].replace('-08:00','')
187 |                 # k[0] is entity                    
188 |                 if len(temp)==0:
189 |                     temp=get_friendly_name(k[0])
190 |                     if temp=='null':
191 |                         if cvt_dict.get(k[0]):
192 |                             temp=cvt_dict[k[0]]
193 |                         else:
194 |                             temp='CVT'+str(cvt)
195 |                             cvt_dict[k[0]]=temp
196 |                             cvt+=1
197 |                     else:
198 |                         temp=temp.replace('-08:00','')
199 |                 extend.append(temp)
200 |                 # k[1]
201 |                 extend.append(k[1])
202 |                 # k[2]                    
203 |                 temp='' 
204 |                 # k[2] is in mid_dict
205 |                 if mid_dict.get(k[2]):
206 |                     temp=mid_dict[k[2]]
207 |                 # k[2] is not entity
208 |                 if len(temp)==0 and (len(k[2])==1 or k[2][0:2] not in ['m.','n.','g.']):
209 |                     temp=k[2].replace('-08:00','') 
210 |                 # k[2] is entity                   
211 |                 if len(temp)==0:
212 |                     temp=get_friendly_name(k[2])
213 |                     if temp=='null':
214 |                         if cvt_dict.get(k[2]):
215 |                             temp=cvt_dict[k[2]]
216 |                         else:
217 |                             temp='CVT'+str(cvt)
218 |                             cvt_dict[k[2]]=temp
219 |                             cvt+=1
220 |                     else:
221 |                         temp=temp.replace('-08:00','')
222 |                 extend.append(temp)                           
223 |                 #if extend not in g1:
224 |                 #    g1.append(extend)
225 |                 if extend not in ex_triple:
226 |                     ex_triple.append(extend)     
227 |             # add ex_triple to g1
228 |             random.shuffle(ex_triple)
229 |             g1.extend(ex_triple)   
230 |             random.shuffle(g1)
231 |             ex_graph.append(g1)
232 |             
233 |         sample['qid']=one_example['qid']
234 |         sample['question']=one_example['question']
235 |         sample['answer']=one_example['answer']
236 |         sample['sparql_query']=one_example['sparql_query']
237 |         sample['s_expression']=one_example['s_expression']
238 |         sample['graph']=graph
239 |         sample['restrict_graph']=graph[:GRAPHNUM]
240 |         sample['ex_graph']=ex_graph
241 |         graphdata.append(sample)
242 |             
243 |     json.dump(graphdata,open('graph/'+file_type+'.json','w',encoding='utf-8'),indent=2,ensure_ascii=False)
244 | 
245 | query('train')
246 | query('dev')


--------------------------------------------------------------------------------
/subgraph/grailqa/sparql_utils/misc.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict, Counter, deque
  2 | import torch
  3 | import json
  4 | import pickle
  5 | import numpy as np
  6 | import torch.nn as nn
  7 | import random
  8 | import os
  9 | import time
 10 | ######################################################
 11 | ##################### used in SRN ####################
 12 | START_RELATION = 'START_RELATION'
 13 | NO_OP_RELATION = 'NO_OP_RELATION'
 14 | NO_OP_ENTITY = 'NO_OP_ENTITY'
 15 | DUMMY_RELATION = 'DUMMY_RELATION'
 16 | DUMMY_ENTITY = 'DUMMY_ENTITY'
 17 | 
 18 | DUMMY_RELATION_ID = 0
 19 | START_RELATION_ID = 1
 20 | NO_OP_RELATION_ID = 2
 21 | DUMMY_ENTITY_ID = 0
 22 | NO_OP_ENTITY_ID = 1
 23 | 
 24 | EPSILON = float(np.finfo(float).eps)
 25 | HUGE_INT = 1e31
 26 | 
 27 | def format_path(path_trace, id2entity, id2relation):
 28 |     def get_most_recent_relation(j):
 29 |         relation_id = int(path_trace[j][0])
 30 |         if relation_id == NO_OP_RELATION_ID:
 31 |             return '<null>'
 32 |         else:
 33 |             return id2relation[relation_id]
 34 | 
 35 |     def get_most_recent_entity(j):
 36 |         return id2entity[int(path_trace[j][1])]
 37 | 
 38 |     path_str = get_most_recent_entity(0)
 39 |     for j in range(1, len(path_trace)):
 40 |         rel = get_most_recent_relation(j)
 41 |         if not rel.endswith('_inv'):
 42 |             path_str += ' -{}-> '.format(rel)
 43 |         else:
 44 |             path_str += ' <-{}- '.format(rel[:-4])
 45 |         path_str += get_most_recent_entity(j)
 46 |     return path_str
 47 | 
 48 | def pad_and_cat(a, padding_value, padding_dim=1):
 49 |     max_dim_size = max([x.size()[padding_dim] for x in a])
 50 |     padded_a = []
 51 |     for x in a:
 52 |         if x.size()[padding_dim] < max_dim_size:
 53 |             res_len = max_dim_size - x.size()[1]
 54 |             pad = nn.ConstantPad1d((0, res_len), padding_value)
 55 |             padded_a.append(pad(x))
 56 |         else:
 57 |             padded_a.append(x)
 58 |     return torch.cat(padded_a, dim=0)
 59 | 
 60 | def safe_log(x):
 61 |     return torch.log(x + EPSILON)
 62 | 
 63 | def entropy(p):
 64 |     return torch.sum(- p * safe_log(p), 1)
 65 | 
 66 | def init_word2id():
 67 |     return {
 68 |         '<PAD>': 0,
 69 |         '<UNK>': 1,
 70 |         'E_S': 2,
 71 |     }
 72 | def init_entity2id():
 73 |     return {
 74 |         DUMMY_ENTITY: DUMMY_ENTITY_ID,
 75 |         NO_OP_ENTITY: NO_OP_ENTITY_ID
 76 |     }
 77 | def init_relation2id():
 78 |     return {
 79 |         DUMMY_RELATION: DUMMY_RELATION_ID,
 80 |         START_RELATION: START_RELATION_ID,
 81 |         NO_OP_RELATION: NO_OP_RELATION_ID
 82 |     }
 83 | 
 84 | def add_item_to_x2id(item, x2id):
 85 |     if not item in x2id:
 86 |         x2id[item] = len(x2id)
 87 | 
 88 | def tile_along_beam(v, beam_size, dim=0):
 89 |     """
 90 |     Tile a tensor along a specified dimension for the specified beam size.
 91 |     :param v: Input tensor.
 92 |     :param beam_size: Beam size.
 93 |     """
 94 |     if dim == -1:
 95 |         dim = len(v.size()) - 1
 96 |     v = v.unsqueeze(dim + 1)
 97 |     v = torch.cat([v] * beam_size, dim=dim+1)
 98 |     new_size = []
 99 |     for i, d in enumerate(v.size()):
100 |         if i == dim + 1:
101 |             new_size[-1] *= d
102 |         else:
103 |             new_size.append(d)
104 |     return v.view(new_size)
105 | ##################### used in SRN ####################
106 | ######################################################
107 | 
108 | 
109 |         
110 | def init_vocab():
111 |     return {
112 |         '<PAD>': 0,
113 |         '<UNK>': 1,
114 |         '<START>': 2,
115 |         '<END>': 3
116 |     }
117 | 
118 | def invert_dict(d):
119 |     return {v: k for k, v in d.items()}
120 | 
121 | def load_glove(glove_pt, idx_to_token):
122 |     glove = pickle.load(open(glove_pt, 'rb'))
123 |     dim = len(glove['the'])
124 |     matrix = []
125 |     for i in range(len(idx_to_token)):
126 |         token = idx_to_token[i]
127 |         tokens = token.split()
128 |         if len(tokens) > 1:
129 |             v = np.zeros((dim,))
130 |             for token in tokens:
131 |                 v = v + glove.get(token, glove['the'])
132 |             v = v / len(tokens)
133 |         else:
134 |             v = glove.get(token, glove['the'])
135 |         matrix.append(v)
136 |     matrix = np.asarray(matrix)
137 |     return matrix
138 | 
139 | 
140 | class SmoothedValue(object):
141 |     """Track a series of values and provide access to smoothed values over a
142 |     window or the global series average.
143 |     """
144 | 
145 |     def __init__(self, window_size=20):
146 |         self.deque = deque(maxlen=window_size)
147 |         self.series = []
148 |         self.total = 0.0
149 |         self.count = 0
150 | 
151 |     def update(self, value):
152 |         self.deque.append(value)
153 |         self.series.append(value)
154 |         self.count += 1
155 |         self.total += value
156 | 
157 |     @property
158 |     def median(self):
159 |         d = torch.tensor(list(self.deque))
160 |         return d.median().item()
161 | 
162 |     @property
163 |     def avg(self):
164 |         d = torch.tensor(list(self.deque))
165 |         return d.mean().item()
166 | 
167 |     @property
168 |     def global_avg(self):
169 |         return self.total / self.count
170 | 
171 | 
172 | class MetricLogger(object):
173 |     def __init__(self, delimiter="\t"):
174 |         self.meters = defaultdict(SmoothedValue)
175 |         self.delimiter = delimiter
176 | 
177 |     def update(self, **kwargs):
178 |         for k, v in kwargs.items():
179 |             if isinstance(v, torch.Tensor):
180 |                 v = v.item()
181 |             assert isinstance(v, (float, int))
182 |             self.meters[k].update(v)
183 | 
184 |     def __getattr__(self, attr):
185 |         if attr in self.meters:
186 |             return self.meters[attr]
187 |         if attr in self.__dict__:
188 |             return self.__dict__[attr]
189 |         raise AttributeError("'{}' object has no attribute '{}'".format(
190 |                     type(self).__name__, attr))
191 | 
192 |     def __str__(self):
193 |         loss_str = []
194 |         for name, meter in self.meters.items():
195 |             loss_str.append(
196 |                 "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg)
197 |             )
198 |         return self.delimiter.join(loss_str)
199 | 
200 | 
201 | def seed_everything(seed=1029):
202 |     '''
203 |     设置整个开发环境的seed
204 |     :param seed:
205 |     :param device:
206 |     :return:
207 |     '''
208 |     random.seed(seed)
209 |     os.environ['PYTHONHASHSEED'] = str(seed)
210 |     np.random.seed(seed)
211 |     torch.manual_seed(seed)
212 |     torch.cuda.manual_seed(seed)
213 |     torch.cuda.manual_seed_all(seed)
214 |     # some cudnn methods can be random even after fixing the seed
215 |     # unless you tell it to be deterministic
216 |     torch.backends.cudnn.deterministic = True
217 | 
218 | 
219 | class ProgressBar(object):
220 |     '''
221 |     custom progress bar
222 |     Example:
223 |         >>> pbar = ProgressBar(n_total=30,desc='training')
224 |         >>> step = 2
225 |         >>> pbar(step=step)
226 |     '''
227 |     def __init__(self, n_total,width=30,desc = 'Training'):
228 |         self.width = width
229 |         self.n_total = n_total
230 |         self.start_time = time.time()
231 |         self.desc = desc
232 | 
233 |     def __call__(self, step, info={}):
234 |         now = time.time()
235 |         current = step + 1
236 |         recv_per = current / self.n_total
237 |         bar = f'[{self.desc}] {current}/{self.n_total} ['
238 |         if recv_per >= 1:
239 |             recv_per = 1
240 |         prog_width = int(self.width * recv_per)
241 |         if prog_width > 0:
242 |             bar += '=' * (prog_width - 1)
243 |             if current< self.n_total:
244 |                 bar += ">"
245 |             else:
246 |                 bar += '='
247 |         bar += '.' * (self.width - prog_width)
248 |         bar += ']'
249 |         show_bar = f"\r{bar}"
250 |         time_per_unit = (now - self.start_time) / current
251 |         if current < self.n_total:
252 |             eta = time_per_unit * (self.n_total - current)
253 |             if eta > 3600:
254 |                 eta_format = ('%d:%02d:%02d' %
255 |                               (eta // 3600, (eta % 3600) // 60, eta % 60))
256 |             elif eta > 60:
257 |                 eta_format = '%d:%02d' % (eta // 60, eta % 60)
258 |             else:
259 |                 eta_format = '%ds' % eta
260 |             time_info = f' - ETA: {eta_format}'
261 |         else:
262 |             if time_per_unit >= 1:
263 |                 time_info = f' {time_per_unit:.1f}s/step'
264 |             elif time_per_unit >= 1e-3:
265 |                 time_info = f' {time_per_unit * 1e3:.1f}ms/step'
266 |             else:
267 |                 time_info = f' {time_per_unit * 1e6:.1f}us/step'
268 | 
269 |         show_bar += time_info
270 |         if len(info) != 0:
271 |             show_info = f'{show_bar} ' + \
272 |                         "-".join([f' {key}: {value:.4f} ' for key, value in info.items()])
273 |             print(show_info, end='')
274 |         else:
275 |             print(show_bar, end='')


--------------------------------------------------------------------------------
/subgraph/grailqa/sparql_utils/sparql_engine.py:
--------------------------------------------------------------------------------
  1 | import rdflib
  2 | from rdflib import URIRef, BNode, Literal, XSD
  3 | from rdflib.plugins.stores import sparqlstore
  4 | from itertools import chain
  5 | from tqdm import tqdm
  6 | import argparse
  7 | 
  8 | import sys
  9 | from sparql_utils.load_kb import DataForSPARQL
 10 | from sparql_utils.value_class import ValueClass
 11 | 
 12 | 
 13 | virtuoso_address = "http://10.201.190.172:8890/sparql"
 14 | # virtuoso_graph_uri = 'KQApro'
 15 | virtuoso_graph_uri = 'freebase'
 16 | 
 17 | 
 18 | 
 19 | def legal(s):
 20 |     # convert predicate and attribute keys to legal format
 21 |     return s.replace(' ', '_')
 22 | 
 23 | def esc_escape(s):
 24 |     '''
 25 |     Why we need this:
 26 |     If there is an escape in Literal, such as '\EUR', the query string will be something like '?pv <pred:value> "\\EUR"'.
 27 |     However, in virtuoso engine, \\ is connected with E, and \\E forms a bad escape sequence.
 28 |     So we must repeat \\, and virtuoso will consider "\\\\EUR" as "\EUR".
 29 | 
 30 |     Note this must be applied before esc_quot, as esc_quot will introduce extra escapes.
 31 |     '''
 32 |     return s.replace('\\', '\\\\')
 33 | 
 34 | def esc_quot(s):
 35 |     '''
 36 |     Why we need this:
 37 |     We use "<value>" to represent a literal value in the sparql query.
 38 |     If the <value> has a double quotation mark itself, we must escape it to make sure the query is valid for the virtuoso engine.
 39 |     '''
 40 |     return s.replace('"', '\\"')
 41 | 
 42 | class SparqlEngine():
 43 |     gs1 = None
 44 |     PRED_INSTANCE = 'pred:instance_of'
 45 |     PRED_NAME = 'pred:name'
 46 | 
 47 |     PRED_VALUE = 'pred:value'       # link packed value node to its literal value
 48 |     PRED_UNIT = 'pred:unit'         # link packed value node to its unit
 49 | 
 50 |     PRED_YEAR = 'pred:year'         # link packed value node to its year value, which is an integer
 51 |     PRED_DATE = 'pred:date'         # link packed value node to its date value, which is a date
 52 | 
 53 |     PRED_FACT_H = 'pred:fact_h'     # link qualifier node to its head
 54 |     PRED_FACT_R = 'pred:fact_r'
 55 |     PRED_FACT_T = 'pred:fact_t'
 56 | 
 57 |     SPECIAL_PREDICATES = (PRED_INSTANCE, PRED_NAME, PRED_VALUE, PRED_UNIT, PRED_YEAR, PRED_DATE, PRED_FACT_H, PRED_FACT_R, PRED_FACT_T)
 58 |     def __init__(self, data, ttl_file=''):
 59 |         self.nodes = nodes = {}
 60 |         for i in chain(data.concepts, data.entities):
 61 |             nodes[i] = URIRef(i)
 62 |         for p in chain(data.predicates, data.attribute_keys, SparqlEngine.SPECIAL_PREDICATES):
 63 |             nodes[p] = URIRef(legal(p))
 64 |         
 65 |         self.graph = graph = rdflib.Graph()
 66 | 
 67 |         for i in chain(data.concepts, data.entities):
 68 |             name = data.get_name(i)
 69 |             graph.add((nodes[i], nodes[SparqlEngine.PRED_NAME], Literal(name)))
 70 | 
 71 |         for ent_id in tqdm(data.entities, desc='Establishing rdf graph'):
 72 |             for con_id in data.get_all_concepts(ent_id):
 73 |                 graph.add((nodes[ent_id], nodes[SparqlEngine.PRED_INSTANCE], nodes[con_id]))
 74 |             for (k, v, qualifiers) in data.get_attribute_facts(ent_id):
 75 |                 h, r = nodes[ent_id], nodes[k]
 76 |                 t = self._get_value_node(v)
 77 |                 graph.add((h, r, t))
 78 |                 fact_node = self._new_fact_node(h, r, t)
 79 | 
 80 |                 for qk, qvs in qualifiers.items():
 81 |                     for qv in qvs:
 82 |                         h, r = fact_node, nodes[qk]
 83 |                         t = self._get_value_node(qv)
 84 |                         if len(list(graph[t])) == 0:
 85 |                             print(t)
 86 |                         graph.add((h, r, t))
 87 | 
 88 |             for (pred, obj_id, direction, qualifiers) in data.get_relation_facts(ent_id):
 89 |                 if direction == 'backward':
 90 |                     if data.is_concept(obj_id):
 91 |                         h, r, t = nodes[obj_id], nodes[pred], nodes[ent_id]
 92 |                     else:
 93 |                         continue
 94 |                 else:
 95 |                     h, r, t = nodes[ent_id], nodes[pred], nodes[obj_id]
 96 |                 graph.add((h, r, t))
 97 |                 fact_node = self._new_fact_node(h, r, t)
 98 |                 for qk, qvs in qualifiers.items():
 99 |                     for qv in qvs:
100 |                         h, r = fact_node, nodes[qk]
101 |                         t = self._get_value_node(qv)
102 |                         graph.add((h, r, t))
103 | 
104 |         if ttl_file:
105 |             print('Save graph to {}'.format(ttl_file))
106 |             graph.serialize(ttl_file, format='turtle')
107 | 
108 | 
109 |     def _get_value_node(self, v):
110 |         # we use a URIRef node, because we need its reference in query results, which is not supported by BNode
111 |         if v.type == 'string':
112 |             node = BNode()
113 |             self.graph.add((node, self.nodes[SparqlEngine.PRED_VALUE], Literal(v.value)))
114 |             return node
115 |         elif v.type == 'quantity': 
116 |             # we use a node to pack value and unit
117 |             node = BNode()
118 |             self.graph.add((node, self.nodes[SparqlEngine.PRED_VALUE], Literal(v.value, datatype=XSD.double)))
119 |             self.graph.add((node, self.nodes[SparqlEngine.PRED_UNIT], Literal(v.unit)))
120 |             return node
121 |         elif v.type == 'year':
122 |             node = BNode()
123 |             self.graph.add((node, self.nodes[SparqlEngine.PRED_YEAR], Literal(v.value)))
124 |             return node
125 |         elif v.type == 'date':
126 |             # use a node to pack year and date
127 |             node = BNode()
128 |             self.graph.add((node, self.nodes[SparqlEngine.PRED_YEAR], Literal(v.value.year)))
129 |             self.graph.add((node, self.nodes[SparqlEngine.PRED_DATE], Literal(v.value, datatype=XSD.date)))
130 |             return node
131 | 
132 |     def _new_fact_node(self, h, r, t):
133 |         node = BNode()
134 |         self.graph.add((node, self.nodes[SparqlEngine.PRED_FACT_H], h))
135 |         self.graph.add((node, self.nodes[SparqlEngine.PRED_FACT_R], r))
136 |         self.graph.add((node, self.nodes[SparqlEngine.PRED_FACT_T], t))
137 |         return node
138 | 
139 | 
140 | def query_virtuoso(q):
141 |     endpoint = virtuoso_address
142 |     store=sparqlstore.SPARQLUpdateStore(endpoint)
143 |     gs = rdflib.ConjunctiveGraph(store)
144 |     gs.open((endpoint, endpoint))
145 |     gs1 = gs.get_context(rdflib.URIRef(virtuoso_graph_uri))
146 |     res = gs1.query(q)
147 |     return res
148 | 
149 | 
150 | 
151 | def get_sparql_answer(sparql, data):
152 |     """
153 |     data: DataForSPARQL object, we need the key_type
154 |     """
155 |     try:
156 |         # infer the parse_type based on sparql
157 |         if sparql.startswith('SELECT DISTINCT ?e') or sparql.startswith('SELECT ?e'):
158 |             parse_type = 'name'
159 |         elif sparql.startswith('SELECT (COUNT(DISTINCT ?e)'):
160 |             parse_type = 'count'
161 |         elif sparql.startswith('SELECT DISTINCT ?p '):
162 |             parse_type = 'pred'
163 |         elif sparql.startswith('ASK'):
164 |             parse_type = 'bool'
165 |         else:
166 |             tokens = sparql.split()
167 |             tgt = tokens[2]
168 |             for i in range(len(tokens)-1, 1, -1):
169 |                 if tokens[i]=='.' and tokens[i-1]==tgt:
170 |                     key = tokens[i-2]
171 |                     break
172 |             key = key[1:-1].replace('_', ' ')
173 |             t = data.key_type[key]
174 |             parse_type = 'attr_{}'.format(t)
175 | 
176 |         parsed_answer = None
177 |         res = query_virtuoso(sparql)
178 |         if res.vars:
179 |             res = [[binding[v] for v in res.vars] for binding in res.bindings]
180 |             if len(res) != 1:
181 |                 return None
182 |         else:
183 |             res = res.askAnswer
184 |             assert parse_type == 'bool'
185 |         
186 |         if parse_type == 'name':
187 |             node = res[0][0]
188 |             sp = 'SELECT DISTINCT ?v WHERE {{ <{}> <{}> ?v .  }}'.format(node, SparqlEngine.PRED_NAME)
189 |             res = query_virtuoso(sp)
190 |             res = [[binding[v] for v in res.vars] for binding in res.bindings]
191 |             name = res[0][0].value
192 |             parsed_answer = name
193 |         elif parse_type == 'count':
194 |             count = res[0][0].value
195 |             parsed_answer = str(count)
196 |         elif parse_type.startswith('attr_'):
197 |             node = res[0][0]
198 |             v_type = parse_type.split('_')[1]
199 |             unit = None
200 |             if v_type == 'string':
201 |                 sp = 'SELECT DISTINCT ?v WHERE {{ <{}> <{}> ?v .  }}'.format(node, SparqlEngine.PRED_VALUE)
202 |             elif v_type == 'quantity':
203 |                 # Note: For those large number, ?v is truncated by virtuoso (e.g., 14756087 to 1.47561e+07)
204 |                 # To obtain the accurate ?v, we need to cast it to str
205 |                 sp = 'SELECT DISTINCT ?v,?u,(str(?v) as ?sv) WHERE {{ <{}> <{}> ?v ; <{}> ?u .  }}'.format(node, SparqlEngine.PRED_VALUE, SparqlEngine.PRED_UNIT)
206 |             elif v_type == 'year':
207 |                 sp = 'SELECT DISTINCT ?v WHERE {{ <{}> <{}> ?v .  }}'.format(node, SparqlEngine.PRED_YEAR)
208 |             elif v_type == 'date':
209 |                 sp = 'SELECT DISTINCT ?v WHERE {{ <{}> <{}> ?v .  }}'.format(node, SparqlEngine.PRED_DATE)
210 |             else:
211 |                 raise Exception('unsupported parse type')
212 |             res = query_virtuoso(sp)
213 |             res = [[binding[v] for v in res.vars] for binding in res.bindings]
214 |             # if there is no specific date, then convert the type to year
215 |             if len(res)==0 and v_type == 'date':
216 |                 v_type = 'year'
217 |                 sp = 'SELECT DISTINCT ?v WHERE {{ <{}> <{}> ?v .  }}'.format(node, SparqlEngine.PRED_YEAR)
218 |                 res = query_virtuoso(sp)
219 |                 res = [[binding[v] for v in res.vars] for binding in res.bindings]
220 |             if v_type == 'quantity':
221 |                 value = float(res[0][2].value)
222 |                 unit = res[0][1].value
223 |             else:
224 |                 value = res[0][0].value
225 |             value = ValueClass(v_type, value, unit)
226 |             parsed_answer = str(value)
227 |         elif parse_type == 'bool':
228 |             parsed_answer = 'yes' if res else 'no'
229 |         elif parse_type == 'pred':
230 |             parsed_answer = str(res[0][0])
231 |             parsed_answer = parsed_answer.replace('_', ' ')
232 |         return parsed_answer
233 |     except Exception:
234 |         return None
235 | 
236 | 
237 | if __name__ == '__main__':
238 |     parser = argparse.ArgumentParser()
239 |     # input and output
240 |     parser.add_argument('--kb_path', required=True)
241 |     parser.add_argument('--ttl_path', required=True)
242 |     args = parser.parse_args()
243 | 
244 |     data = DataForSPARQL(args.kb_path)
245 |     engine = SparqlEngine(data, args.ttl_path)
246 | 


--------------------------------------------------------------------------------
/subgraph/grailqa/sparql_utils/value_class.py:
--------------------------------------------------------------------------------
  1 | def comp(a, b, op):
  2 |     """
  3 |     Args:
  4 |         - a (ValueClass): attribute value of a certain entity
  5 |         - b (ValueClass): comparison target
  6 |         - op: =/>/</!=
  7 |     Example:
  8 |         a is someone's birthday, 1960-02-01, b is 1960, op is '=', then return True
  9 |     """
 10 |     if b.isTime():
 11 |         # Note: for time, 'a=b' actually means a in b, 'a!=b' means a not in b
 12 |         if op == '=':
 13 |             return b.contains(a)
 14 |         elif op == '!=':
 15 |             return not b.contains(a)
 16 |     if op == '=':
 17 |         return a == b
 18 |     elif op == '<':
 19 |         return a < b
 20 |     elif op == '>':
 21 |         return a > b
 22 |     elif op == '!=':
 23 |         return a != b
 24 | 
 25 | class ValueClass():
 26 |     def __init__(self, type, value, unit=None):
 27 |         """
 28 |         When type is
 29 |             - string, value is a str
 30 |             - quantity, value is a number and unit is required
 31 |             - year, value is a int
 32 |             - date, value is a date object
 33 |         """
 34 |         self.type = type
 35 |         self.value = value
 36 |         self.unit = unit
 37 | 
 38 |     def isTime(self):
 39 |         return self.type in {'year', 'date'}
 40 | 
 41 |     def can_compare(self, other):
 42 |         if self.type == 'string':
 43 |             return other.type == 'string'
 44 |         elif self.type == 'quantity':
 45 |             # NOTE: for two quantity, they can compare only when they have the same unit
 46 |             return other.type == 'quantity' and other.unit == self.unit
 47 |         else:
 48 |             # year can compare with date
 49 |             return other.type == 'year' or other.type == 'date'
 50 | 
 51 |     def contains(self, other):
 52 |         """
 53 |         check whether self contains other, which is different from __eq__ and the result is asymmetric
 54 |         used for conditions like whether 2001-01-01 in 2001, or whether 2001 in 2001-01-01
 55 |         """
 56 |         if self.type == 'year': # year can contain year and date
 57 |             other_value = other.value if other.type == 'year' else other.value.year
 58 |             return self.value == other_value
 59 |         elif self.type == 'date': # date can only contain date
 60 |             return other.type == 'date' and self.value == other.value
 61 |         else:
 62 |             raise Exception('not supported type: %s' % self.type)
 63 | 
 64 | 
 65 |     def __eq__(self, other):
 66 |         """
 67 |         2001 and 2001-01-01 is not equal
 68 |         """
 69 |         assert self.can_compare(other)
 70 |         return self.type == other.type and self.value == other.value
 71 | 
 72 |     def __lt__(self, other):
 73 |         """
 74 |         Comparison between a year and a date will convert them both to year
 75 |         """
 76 |         assert self.can_compare(other)
 77 |         if self.type == 'string':
 78 |             raise Exception('try to compare two string')
 79 |         elif self.type == 'quantity':
 80 |             return self.value < other.value
 81 |         elif self.type == 'year':
 82 |             other_value = other.value if other.type == 'year' else other.value.year
 83 |             return self.value < other_value
 84 |         elif self.type == 'date':
 85 |             if other.type == 'year':
 86 |                 return self.value.year < other.value
 87 |             else:
 88 |                 return self.value < other.value
 89 | 
 90 |     def __gt__(self, other):
 91 |         assert self.can_compare(other)
 92 |         if self.type == 'string':
 93 |             raise Exception('try to compare two string')
 94 |         elif self.type == 'quantity':
 95 |             return self.value > other.value
 96 |         elif self.type == 'year':
 97 |             other_value = other.value if other.type == 'year' else other.value.year
 98 |             return self.value > other_value
 99 |         elif self.type == 'date':
100 |             if other.type == 'year':
101 |                 return self.value.year > other.value
102 |             else:
103 |                 return self.value > other.value
104 | 
105 |     def __str__(self):
106 |         if self.type == 'string':
107 |             return self.value
108 |         elif self.type == 'quantity':
109 |             if self.value - int(self.value) < 1e-5:
110 |                 v = int(self.value)
111 |             else:
112 |                 v = self.value
113 |             return '{} {}'.format(v, self.unit) if self.unit != '1' else str(v)
114 |         elif self.type == 'year':
115 |             return str(self.value)
116 |         elif self.type == 'date':
117 |             return self.value.isoformat()
118 | 


--------------------------------------------------------------------------------