├── GraphReasoning ├── __init__.py ├── agents.py ├── graph_analysis.py ├── graph_generation.py ├── graph_tools.py ├── openai_tools.py └── utils.py ├── LICENSE ├── Notebooks ├── GraphReasoning - Graph Analysis.ipynb └── GraphReasoning - Graph Reasoning with LLM - BioMixtral.ipynb ├── README.md └── setup.py /GraphReasoning/__init__.py: -------------------------------------------------------------------------------- 1 | from GraphReasoning.openai_tools import * 2 | from GraphReasoning.graph_tools import * 3 | from GraphReasoning.graph_generation import * 4 | from GraphReasoning.utils import * 5 | from GraphReasoning.graph_analysis import * 6 | from GraphReasoning.agents import * -------------------------------------------------------------------------------- /GraphReasoning/agents.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | from transformers import logging 3 | from GraphReasoning.utils import * 4 | 5 | 6 | #transformers.logging.set_verbosity_info() 7 | logging.set_verbosity_error() 8 | 9 | from guidance import models, gen, select 10 | 11 | from guidance.models import LlamaCpp 12 | from guidance.models import Chat 13 | import re 14 | from guidance import gen, select, system, user, assistant 15 | newline = "\n" 16 | 17 | from IPython.display import display, Markdown 18 | 19 | import markdown2 20 | import pdfkit 21 | 22 | # <|system|> 23 | # You are a friendly chatbot who always responds in the style of a pirate. 24 | # <|user|> 25 | # How many helicopters can a human eat in one sitting? 26 | # <|assistant|> 27 | # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food! 
28 | 29 | class ZephyrLlamaCppChat(LlamaCpp, Chat): 30 | def get_role_start(self, role_name, **kwargs): 31 | if role_name == "user": 32 | return "<|user|>\n" 33 | 34 | elif role_name == "assistant": 35 | return "<|assistant|>\n" 36 | 37 | elif role_name == "system": 38 | return "<|system|>\n" 39 | 40 | def get_role_end(self, role_name=None): 41 | if role_name == "user": 42 | return "" 43 | elif role_name == "assistant": 44 | return "" 45 | elif role_name == "system": 46 | return "" 47 | 48 | class ConversationAgent: 49 | def __init__(self, chat_model, name: str, instructions: str, context_turns: int = 2, 50 | temperature=0.1, ): 51 | self._chat_model = chat_model 52 | self._name = name 53 | self._instructions = instructions 54 | self._my_turns = [] 55 | self._interlocutor_turns = [] 56 | self._went_first = False 57 | self._context_turns = context_turns 58 | # self._q=q 59 | self.temperature=temperature 60 | 61 | @property 62 | def name(self) -> str: 63 | return self._name 64 | 65 | def get_conv(self, ) -> str: 66 | 67 | return self._my_turns 68 | 69 | def reply(self, interlocutor_reply: str | None = None) -> str: 70 | if interlocutor_reply is None: 71 | self._my_turns = [] 72 | self._interlocutor_turns = [] 73 | self._went_first = True 74 | else: 75 | self._interlocutor_turns.append(interlocutor_reply) 76 | 77 | # Get trimmed history 78 | 79 | print 80 | my_hist = self._my_turns[(1-self._context_turns):] 81 | interlocutor_hist = self._interlocutor_turns[-self._context_turns:] 82 | 83 | # Set up the system prompt 84 | curr_model = self._chat_model 85 | with system(): 86 | #curr_model += f"Your name is {self.name}. 
{self._instructions}" 87 | curr_model += f"{self._instructions}" 88 | ''' 89 | if len(interlocutor_hist) == 0: 90 | curr_model += "Introduce yourself" 91 | elif len(interlocutor_hist) == 1: 92 | curr_model += "Introduce yourself before continuing the conversation" 93 | 94 | ''' 95 | # Replay the last few turns 96 | for i in range(len(my_hist)): 97 | with user(): 98 | curr_model += interlocutor_hist[i] 99 | with assistant(): 100 | curr_model += my_hist[i] 101 | 102 | if len(interlocutor_hist) > 0: 103 | with user(): 104 | curr_model += interlocutor_hist[-1] 105 | #else: 106 | # with user(): 107 | # if q != '': 108 | # curr_model += f"Answer this question: {self._q}" 109 | 110 | #print ("CURRENT: ",curr_model) 111 | with assistant(): 112 | curr_model += gen(name='response', max_tokens=1024,temperature= self.temperature) 113 | 114 | self._my_turns.append(curr_model['response']) 115 | return curr_model['response'] 116 | 117 | def conversation_simulator( 118 | bot0: ConversationAgent, 119 | question_gpt, 120 | question_gpt_name='Engineer',answer_gpt_name='Biologist', 121 | question_temperature = 0.7, 122 | question_asker_instructions='You ALWAYS ask tough questions. 
', 123 | q='What is bioinspiration?', 124 | total_turns: int = 5,data_dir='./', 125 | marker_ch='>>> ',start_with_q=False,only_last=True, 126 | )-> list[dict[str,str]]: 127 | 128 | conversation_turns = [] 129 | last_reply = q 130 | first=True 131 | for _ in range(total_turns): 132 | 133 | last_reply = bot0.reply(last_reply) 134 | 135 | conversation_turns.append(dict(name=bot0.name, text=last_reply)) 136 | 137 | if first: 138 | first=False 139 | if only_last: 140 | 141 | txt= f'Consider this question and response.\n\n### Question: {q}\n\n### Response: {last_reply}' 142 | 143 | 144 | else: 145 | conv=get_entire_conversation(q, conversation_turns,marker_ch=marker_ch,start_with_q=start_with_q, question_gpt_name=question_gpt_name) 146 | txt=f'### Consider this conversation between {question_gpt_name} and {answer_gpt_name}:\n\n{conv}\n\n"' 147 | 148 | 149 | else: 150 | 151 | if only_last: 152 | txt= f'Consider this question and response.\n\n### Question: {q}\n\n### Response: {last_reply}' 153 | 154 | else: 155 | conv=get_entire_conversation(q, conversation_turns,marker_ch=marker_ch,start_with_q=start_with_q,question_gpt_name=question_gpt_name) 156 | txt=f'### Consider this conversation between {question_gpt_name} and {answer_gpt_name}:\n\n{conv}\n\n"' 157 | 158 | 159 | with system(): 160 | lm = question_gpt+question_asker_instructions+"\n\nYou MUST respond with ONE new probing question. " 161 | 162 | with user(): 163 | lm += f"""{txt}\n\n### Instruction: Respond with a SINGLE follow-up question that critically challenges the response. 164 | DO NOT answer the question or comment on it yet. 
165 | \n\nThe single question is: """ 166 | 167 | with assistant(): 168 | q_new = lm+gen(name='question', temperature=question_temperature, 169 | max_tokens=500, stop=newline) 170 | 171 | q_new=q_new['question'].replace('"', '') 172 | 173 | last_reply=q_new 174 | 175 | conversation_turns.append(dict(name=question_gpt_name, text=last_reply)) 176 | 177 | 178 | return conversation_turns 179 | 180 | 181 | def read_and_summarize(gpt, txt='This is a conversation.', q='', 182 | ): 183 | 184 | with system(): 185 | lm = gpt + "You analyze text and provide an accurate account of the content from all sides discussed." 186 | 187 | with user(): 188 | lm += f"""Carefully read this conversation: 189 | 190 | <<<{txt}>>> 191 | Accurately summarize the conversation and identify the key points made. 192 | 193 | Think step by step: 194 | """ 195 | 196 | with assistant(): 197 | lm+=gen('summary', max_tokens=1024) 198 | 199 | with user(): 200 | lm += f'Now list the salient insights as bullet points.' 201 | 202 | with assistant(): 203 | lm+=gen('bullet', max_tokens=1024) 204 | 205 | with user(): 206 | lm += f'Identify the single most important takeaway in the conversation and how it answers the original question, <<<{q}>>>.' 207 | 208 | with assistant(): 209 | lm+=gen('takeaway', max_tokens=1024) 210 | 211 | return lm['summary'], lm['bullet'], lm['takeaway'] 212 | 213 | 214 | def answer_question (gpt_question_asker, gpt, q='I have identified this amino acid sequence: AAAAAIIAAAA. How can I use it? ', 215 | 216 | bot_name_1="Biologist", 217 | bot_instructions_1 = f"""You are a biologist. You are taking part in a discussion, from a life science perspective. 218 | Keep your answers brief, but accurate, and creative. 219 | """, 220 | bot_name_2="Engineer", 221 | bot_instructions_2 = """You are a critical engineer. You are taking part in a discussion, from the perspective of engineering. 222 | Keep your answers brief, and always challenge statements in a provokative way. 
As a creative individual, you inject ideas from other fields. """, 223 | question_temperature = 0.1, 224 | conv_temperature=0.3, 225 | 226 | total_turns=4, 227 | delete_last_question=True, #whether or not the last question is deleted (since it is not actually answered anyway) 228 | save_PDF=True, 229 | PDF_name=None, save_dir='./', 230 | txt_file_path=None, marker_ch='>>> ',start_with_q=False,only_last=True, 231 | 232 | ): 233 | 234 | bot_1 = ConversationAgent(chat_model=gpt, name=bot_name_1, instructions=bot_instructions_1, 235 | context_turns=total_turns, temperature=conv_temperature) 236 | 237 | conversation_turns = conversation_simulator(bot_1, question_gpt=gpt_question_asker, 238 | question_gpt_name=bot_name_2,answer_gpt_name=bot_name_1, 239 | question_temperature=question_temperature, 240 | question_asker_instructions=bot_instructions_2, 241 | q=q, 242 | total_turns=total_turns, data_dir=save_dir,marker_ch=marker_ch,start_with_q=start_with_q,only_last=only_last) 243 | 244 | if delete_last_question: 245 | conversation_turns.pop() 246 | 247 | txt='' 248 | txt+=f"The question discussed is: **{q.strip()}**\n\n" 249 | 250 | print ("-----------------------------------------") 251 | for turn in conversation_turns: 252 | 253 | txt +=f"**{turn['name'].strip ()}**: {turn['text']}\n\n" 254 | 255 | summary, bullet, keytakaway = read_and_summarize(gpt ,txt, q=q) 256 | 257 | integrated = f"""#### Question and conversation: 258 | 259 | {txt} 260 | 261 | #### Summary: 262 | 263 | {summary} 264 | 265 | #### List of key points: 266 | 267 | {bullet} 268 | 269 | #### Key takeaway: 270 | 271 | **{keytakaway.strip()}** 272 | """ 273 | 274 | if save_PDF: 275 | # Convert Markdown to HTML 276 | html_text = markdown2.markdown(integrated) 277 | 278 | # Convert HTML to PDF and save it 279 | max_len_fname=64 280 | if PDF_name==None: 281 | PDF_name=f'{save_dir}{q[:max_len_fname].strip()}.pdf' 282 | 283 | pdfkit.from_string(html_text, PDF_name) 284 | 285 | max_len_fname=64 286 | if 
txt_file_path==None: 287 | txt_file_path = f'{save_dir}{q[:max_len_fname].strip()}.txt' 288 | save_raw_txt=remove_markdown_symbols(integrated) 289 | 290 | with open(txt_file_path, 'w') as file: 291 | file.write(save_raw_txt) 292 | 293 | return conversation_turns, txt, summary, bullet, keytakaway, integrated, save_raw_txt 294 | 295 | 296 | def get_entire_conversation (q, conversation_turns, marker_ch='### ', start_with_q=False, question_gpt_name='Question: '): 297 | txt='' 298 | 299 | if start_with_q: 300 | txt+=f"{marker_ch}The question discussed is: {q.strip()}\n\n" 301 | else: 302 | txt+=f"{marker_ch}{question_gpt_name}: {q.strip()}\n\n" 303 | 304 | print ("-----------------------------------------") 305 | for turn in conversation_turns: 306 | 307 | txt +=f"{marker_ch}{turn['name'].strip ()}: {turn['text']}\n\n" 308 | return txt 309 | 310 | 311 | ############################################################################################################################# 312 | ######################################################### LLamaIndex based ################################################## 313 | ############################################################################################################################# 314 | 315 | from llama_index.core.memory import ChatMemoryBuffer 316 | 317 | from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings 318 | from llama_index.core.embeddings import resolve_embed_model 319 | from llama_index.embeddings.huggingface import HuggingFaceEmbedding 320 | from llama_index.core.node_parser import SentenceSplitter 321 | 322 | from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings 323 | from llama_index.core.chat_engine import SimpleChatEngine 324 | 325 | def get_chat_engine_from_index_LlamaIndex(llm,index, chat_token_limit=2500,verbose=False, chat_mode="context", 326 | system_prompt='You are a chatbot, able to have normal interactions, as well as talk about context 
provided.'): 327 | memory = ChatMemoryBuffer.from_defaults(token_limit=chat_token_limit) 328 | 329 | chat_engine = index.as_chat_engine(llm=llm, 330 | chat_mode=chat_mode, 331 | memory=memory, 332 | system_prompt=system_prompt,verbose=verbose, 333 | ) 334 | 335 | return chat_engine 336 | 337 | def get_answer_LlamaIndex (llm, #model, tokenizer, 338 | q, system_prompt="You are an expert in materials science.", chat_engine=None, 339 | max_new_tokens=1024, #temperature=0.7, 340 | messages_to_prompt=None,chat_token_limit=2500,chat_mode="context", 341 | completion_to_prompt=None,index=None, verbose=False): 342 | 343 | if chat_engine==None: 344 | 345 | if index != None: 346 | 347 | chat_engine=get_chat_engine_from_index_LlamaIndex(llm,index, chat_token_limit=chat_token_limit,verbose=verbose,chat_mode=chat_mode, 348 | system_prompt=f'You are a chatbot, able to have normal interactions, as well as talk about data provided. {system_prompt}') 349 | else: 350 | chat_engine = SimpleChatEngine.from_defaults(llm=llm, system_prompt=system_prompt) 351 | 352 | response = chat_engine.stream_chat(q) 353 | for token in response.response_gen: 354 | print(token, end="") 355 | return response.response, chat_engine 356 | 357 | 358 | 359 | class ConversationAgent_LlamaIndex: 360 | def __init__(self, llm, 361 | 362 | name: str, instructions: str,# context_turns: int = 2, 363 | 364 | index=None,chat_token_limit=2500,verbose=False,chat_mode="context", 365 | ): 366 | 367 | self._name = name 368 | self._instructions = instructions 369 | self._source_nodes =[] 370 | 371 | if index != None: 372 | print (f"Set up chat engine, with index, verbose={verbose}, chat_mode={chat_mode}.") 373 | 374 | self.chat_engine=get_chat_engine_from_index_LlamaIndex(llm,index, chat_token_limit=chat_token_limit,verbose=verbose,chat_mode=chat_mode, 375 | system_prompt=f'You are a chatbot, able to have normal interactions, as well as talk about data provided.\n\n{self._instructions}') 376 | else: 377 | self.chat_engine = 
SimpleChatEngine.from_defaults(llm=llm, system_prompt=self._instructions) 378 | 379 | 380 | @property 381 | def name(self) -> str: 382 | return self._name 383 | 384 | def get_conv(self, ) -> str: 385 | 386 | return self.chat_engine.chat_history 387 | def get_source_nodes(self, ) -> str: 388 | 389 | return self._source_nodes 390 | 391 | def reset_chat(self, ): 392 | self.chat_engine.reset() 393 | 394 | def reply(self, question) -> str: 395 | response = self.chat_engine.stream_chat(question ) 396 | for token in response.response_gen: 397 | print(token, end="") 398 | 399 | self._source_nodes.append (response.source_nodes) 400 | 401 | return response.response, response 402 | 403 | def conversation_simulator_LlamaIndex( 404 | 405 | llm_answer, llm_question, 406 | question_gpt_name='Engineer',answer_gpt_name='Biologist', answer_instructions='You answer correctly.', 407 | 408 | question_asker_instructions='You always respond with a single, tough, question. ', 409 | q='What is bioinspiration?', 410 | total_turns: int = 5,data_dir='./', 411 | marker_ch='>>> ',start_with_q=False,only_last=True, 412 | marker_ch_outer='### ',sample_question='', 413 | 414 | answer_index=None,question_index=None, verbose=False,chat_mode="context",chat_token_limit=2500, 415 | iterate_on_question=False,#whether to revise question after initial draft, 416 | include_N_turns_in_question_development=9999,single_shot_question=True, 417 | iterate_on_question_with_earlier_context=False, #whether or not to iterate on question with all earlier context of just the question draft 418 | )-> list[dict[str,str]]: 419 | 420 | answer_agent = ConversationAgent_LlamaIndex(llm_answer, 421 | 422 | name=answer_gpt_name, instructions=answer_instructions, 423 | 424 | index=answer_index,verbose=verbose,chat_mode=chat_mode,chat_token_limit=chat_token_limit, 425 | ) 426 | 427 | conversation_turns = [] 428 | q_new = q #None 429 | 430 | conversation_turns.append(dict(name=question_gpt_name, text=q_new)) 431 | 432 | 433 | 
print (f"### {question_gpt_name}: {q}\n") 434 | for _ in range(total_turns): 435 | 436 | print (f"### {answer_gpt_name}: ", end="") 437 | 438 | last_reply, response = answer_agent.reply(q_new) 439 | 440 | 441 | conversation_turns.append(dict(name=answer_gpt_name, text=last_reply)) 442 | 443 | if only_last: 444 | 445 | txt= f'Consider this question and response.\n\n{marker_ch_outer}Question: {q}\n\n{marker_ch_outer} Response: {last_reply}' 446 | 447 | else: 448 | NN=include_N_turns_in_question_development 449 | 450 | NN = NN + 1 if NN % 2 else NN # Adjust NN to be even if it's not 451 | conv=get_entire_conversation_LlamaIndex(q, conversation_turns[-NN:],marker_ch=marker_ch,start_with_q=start_with_q, question_gpt_name=question_gpt_name) 452 | 453 | txt=f'{marker_ch_outer}Read this conversation between {question_gpt_name} and {answer_gpt_name}:\n\n```{conv}```\n\n"' 454 | 455 | if single_shot_question: # SINGLE SHOT QUESTION 456 | 457 | q=f"""{txt}\n\n{marker_ch_outer}Instruction: Respond with a SINGLE follow-up question that critically challenges the earlier responses. 458 | 459 | DO NOT answer the question or comment on it yet. Do NOT repeat a question that was asked in the earlier conversation.{sample_question} 460 | 461 | The single question is:""" 462 | 463 | q=f"""{txt}\n\n{marker_ch_outer}Please generate a thoughtful and challenging follow-up question. {sample_question}{question_gpt_name}:""" 464 | 465 | print (f"\n\n### {question_gpt_name}: ", end="") 466 | 467 | q_new, q_chat=get_answer_LlamaIndex (llm_question,#model, tokenizer, 468 | q=q, #temperature=question_temperature, 469 | 470 | index=question_index,verbose=verbose,chat_mode=chat_mode,chat_token_limit=chat_token_limit, 471 | system_prompt=question_asker_instructions+"You MUST respond with ONE new probing question. ONLY provide the question.") 472 | 473 | else: # MULTI SHOT QUESTION 474 | q=f"""{txt}\n\n{marker_ch_outer}Instruction: Summarize the conversation, with details. 
Include logic and reasoning, and think step by step.""" 475 | print (f"\n\n### {question_gpt_name}, summary: ", end="") 476 | summary_for_q, chat_engine=get_answer_LlamaIndex (llm_question, q=q, #messages_to_prompt=messages_to_prompt, 477 | 478 | system_prompt="You analyze text and develop questions.", chat_engine=None) 479 | q=f"""{marker_ch_outer}Please generate a thoughtful and challenging follow-up question. {sample_question}\n\nThe question is:""" 480 | print (f"\n\n### {question_gpt_name}: ", end="") 481 | q_new, chat_engine=get_answer_LlamaIndex (llm_question, q=q, #messages_to_prompt=messages_to_prompt, 482 | 483 | system_prompt="You analyze text and develop questions.",chat_engine=chat_engine) 484 | 485 | 486 | if iterate_on_question: 487 | if iterate_on_question_with_earlier_context==False: 488 | q_chat=None #start with new chat 489 | print (f"\n\n### {question_gpt_name} (iterate): ", end="") 490 | q_new, _=get_answer_LlamaIndex (llm_question,#model, tokenizer, 491 | q=f"Make sure >>>{q_new}<<< is a SINGLE question.\n\nDO NOT answer the question. If it is a single question, just reply with the question.{sample_question}\n\nThe SINGLE question is: ", #temperature=question_temperature, 492 | 493 | index=question_index,verbose=verbose,chat_mode=chat_mode,chat_token_limit=chat_token_limit, 494 | system_prompt="You pose questions.",chat_engine=q_chat 495 | ) 496 | 497 | q_new=q_new.replace('"', '') 498 | 499 | print (f"\n") 500 | 501 | conversation_turns.append(dict(name=question_gpt_name, text=q_new)) 502 | 503 | 504 | return conversation_turns, answer_agent.get_conv(), response, answer_agent 505 | 506 | 507 | def read_and_summarize_LlamaIndex( llm, txt='This is a conversation.', q='', 508 | ): 509 | q=f"""Carefully read this conversation: 510 | 511 | >>>{txt}<<< 512 | 513 | Accurately summarize the conversation and identify the key points made. 
514 | 515 | Think step by step: 516 | """ 517 | 518 | summary, chat_engine=get_answer_LlamaIndex (llm, q=q, 519 | system_prompt="You analyze text and provide an accurate account of the content from all sides discussed.") 520 | 521 | q=f'Now list the salient insights as bullet points.' 522 | 523 | bullet, chat_engine=get_answer_LlamaIndex (llm, q=q, 524 | system_prompt="You analyze text and provide an accurate account of the content from all sides discussed.", 525 | chat_engine=chat_engine) 526 | 527 | 528 | q=f'Identify the single most important takeaway in the conversation and how it answers the original question, <<<{q}>>>.' 529 | takeaway, chat_engine=get_answer_LlamaIndex (llm, q=q, 530 | system_prompt="You analyze text and provide an accurate account of the content from all sides discussed.", 531 | chat_engine=chat_engine) 532 | 533 | 534 | return summary, bullet, takeaway 535 | 536 | def answer_question_LlamaIndex ( #model, tokenizer, 537 | 538 | llm_answer, 539 | llm_question, llm_summarize, 540 | q='I have identified this amino acid sequence: AAAAAIIAAAA. How can I use it? ', 541 | bot_name_1="Biologist", 542 | bot_instructions_1 = f"""You are a biologist. You are taking part in a discussion, from a life science perspective. 543 | Keep your answers brief, but accurate, and creative. 544 | """, 545 | bot_name_2="Engineer", 546 | bot_instructions_2 = """You are a critical engineer. You are taking part in a discussion, from the perspective of engineering. 547 | Keep your answers brief, and always challenge statements in a provokative way. As a creative individual, you inject ideas from other fields. 
""", 548 | 549 | include_N_turns_in_question_development=99999, 550 | total_turns=4, 551 | delete_last_question=True, #whether or not the last question is deleted (since it is not actually answered anyway) 552 | save_PDF=True,sample_question='', 553 | PDF_name=None, save_dir='./', 554 | txt_file_path=None, marker_ch='>>> ',marker_ch_outer='### ', 555 | start_with_q=False,only_last=True,single_shot_question=True, 556 | messages_to_prompt=None,question_index=None, answer_index=None,chat_mode="context",chat_token_limit=2500, 557 | completion_to_prompt=None,iterate_on_question=False,iterate_on_question_with_earlier_context=True,verbose=False, 558 | ): 559 | 560 | conversation_turns, answer_agent_conv, response, answer_agent = conversation_simulator_LlamaIndex( llm_answer, llm_question,# model, tokenizer, 561 | question_gpt_name=bot_name_2,answer_gpt_name=bot_name_1, 562 | question_asker_instructions=bot_instructions_2, 563 | q=q, question_index=question_index, answer_index=answer_index, 564 | include_N_turns_in_question_development=include_N_turns_in_question_development, 565 | single_shot_question=single_shot_question, 566 | total_turns=total_turns, data_dir=save_dir,marker_ch=marker_ch,marker_ch_outer=marker_ch_outer, 567 | start_with_q=start_with_q,only_last=only_last, sample_question=sample_question, 568 | verbose=verbose,chat_mode=chat_mode,chat_token_limit=chat_token_limit, 569 | iterate_on_question=iterate_on_question,iterate_on_question_with_earlier_context=iterate_on_question_with_earlier_context, 570 | ) 571 | 572 | if delete_last_question: 573 | conversation_turns.pop() 574 | 575 | txt='' 576 | txt+=f"The question discussed is: **{q.strip()}**\n\n" 577 | 578 | print ("-----------------------------------------") 579 | for turn in conversation_turns: 580 | 581 | txt +=f"**{turn['name'].strip ()}**: {turn['text']}\n\n" 582 | 583 | summary, bullet, keytakaway = read_and_summarize_LlamaIndex(llm_summarize,#model, tokenizer , 584 | txt, q=q, ) 585 | 586 | 
integrated = f"""#### Question and conversation: 587 | 588 | {txt} 589 | 590 | #### Summary: 591 | 592 | {summary} 593 | 594 | #### List of key points: 595 | 596 | {bullet} 597 | 598 | #### Key takeaway: 599 | 600 | **{keytakaway.strip()}** 601 | """ 602 | 603 | if save_PDF: 604 | # Convert Markdown to HTML 605 | html_text = markdown2.markdown(integrated) 606 | 607 | # Convert HTML to PDF and save it 608 | max_len_fname=64 609 | if PDF_name==None: 610 | PDF_name=f'{save_dir}{q[:max_len_fname].strip()}.pdf' 611 | 612 | pdfkit.from_string(html_text, PDF_name) 613 | 614 | max_len_fname=64 615 | if txt_file_path==None: 616 | txt_file_path = f'{save_dir}{q[:max_len_fname].strip()}.txt' 617 | save_raw_txt=remove_markdown_symbols(integrated) 618 | 619 | with open(txt_file_path, 'w') as file: 620 | file.write(save_raw_txt) 621 | 622 | return conversation_turns, txt, summary, bullet, keytakaway, integrated, save_raw_txt, answer_agent_conv, response, answer_agent 623 | 624 | 625 | def get_entire_conversation_LlamaIndex (q, conversation_turns, marker_ch='### ', start_with_q=False, question_gpt_name='Question: '): 626 | txt='' 627 | 628 | if start_with_q: 629 | txt+=f"{marker_ch}The question discussed is: {q.strip()}\n\n" 630 | else: 631 | txt='' 632 | 633 | #print ("-----------------------------------------") 634 | for turn in conversation_turns: 635 | 636 | txt +=f"{marker_ch}{turn['name'].strip ()}: {turn['text']}\n\n" 637 | return txt.strip() 638 | -------------------------------------------------------------------------------- /GraphReasoning/graph_generation.py: -------------------------------------------------------------------------------- 1 | from GraphReasoning.graph_tools import * 2 | from GraphReasoning.utils import * 3 | from GraphReasoning.graph_analysis import * 4 | 5 | import copy 6 | 7 | import re 8 | from IPython.display import display, Markdown 9 | 10 | import markdown2 11 | import pdfkit 12 | 13 | 14 | import uuid 15 | import pandas as pd 16 | import numpy 
as np 17 | 18 | import pandas as pd 19 | import numpy as np 20 | import networkx as nx 21 | import os 22 | from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader 23 | from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader 24 | from langchain.text_splitter import RecursiveCharacterTextSplitter 25 | from pathlib import Path 26 | import random 27 | from pyvis.network import Network 28 | 29 | from tqdm.notebook import tqdm 30 | 31 | import itertools 32 | import seaborn as sns 33 | palette = "hls" 34 | 35 | import uuid 36 | import pandas as pd 37 | import numpy as np 38 | 39 | from transformers import AutoTokenizer, AutoModel 40 | import torch 41 | from scipy.spatial.distance import cosine 42 | from sklearn.decomposition import PCA 43 | import numpy as np 44 | from sklearn.decomposition import PCA 45 | from sklearn.cluster import KMeans 46 | import matplotlib.pyplot as plt 47 | import seaborn as sns # For more attractive plotting 48 | 49 | from sklearn.cluster import KMeans 50 | import matplotlib.pyplot as plt 51 | 52 | import pandas as pd 53 | 54 | import transformers 55 | from transformers import logging 56 | 57 | logging.set_verbosity_error() 58 | 59 | import re 60 | from IPython.display import display, Markdown 61 | 62 | import markdown2 63 | import pdfkit 64 | 65 | 66 | import uuid 67 | import pandas as pd 68 | import numpy as np 69 | 70 | import pandas as pd 71 | import numpy as np 72 | import networkx as nx 73 | import os 74 | from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader 75 | from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader 76 | from langchain.text_splitter import RecursiveCharacterTextSplitter 77 | from pathlib import Path 78 | import random 79 | from pyvis.network import Network 80 | 81 | from tqdm.notebook import tqdm 82 | 83 | import seaborn as sns 84 | palette = "hls" 85 | 86 | import uuid 87 | import pandas as pd 88 | import 
numpy as np 89 | 90 | from transformers import AutoTokenizer, AutoModel 91 | import torch 92 | from scipy.spatial.distance import cosine 93 | from sklearn.decomposition import PCA 94 | import numpy as np 95 | from sklearn.decomposition import PCA 96 | from sklearn.cluster import KMeans 97 | import matplotlib.pyplot as plt 98 | import seaborn as sns # For more attractive plotting 99 | 100 | from sklearn.cluster import KMeans 101 | import matplotlib.pyplot as plt 102 | 103 | # Code based on: https://github.com/rahulnyk/knowledge_graph 104 | 105 | def extract (string, start='[', end=']'): 106 | start_index = string.find(start) 107 | end_index = string.rfind(end) 108 | 109 | return string[start_index :end_index+1] 110 | def documents2Dataframe(documents) -> pd.DataFrame: 111 | rows = [] 112 | for chunk in documents: 113 | row = { 114 | "text": chunk, 115 | # **chunk.metadata, 116 | "chunk_id": uuid.uuid4().hex, 117 | } 118 | rows = rows + [row] 119 | 120 | df = pd.DataFrame(rows) 121 | return df 122 | 123 | def concepts2Df(concepts_list) -> pd.DataFrame: 124 | ## Remove all NaN entities 125 | concepts_dataframe = pd.DataFrame(concepts_list).replace(" ", np.nan) 126 | concepts_dataframe = concepts_dataframe.dropna(subset=["entity"]) 127 | concepts_dataframe["entity"] = concepts_dataframe["entity"].apply( 128 | lambda x: x.lower() 129 | ) 130 | 131 | return concepts_dataframe 132 | 133 | 134 | def df2Graph(dataframe: pd.DataFrame, generate, repeat_refine=0, verbatim=False, 135 | 136 | ) -> list: 137 | 138 | results = dataframe.apply( 139 | lambda row: graphPrompt(row.text, generate, {"chunk_id": row.chunk_id}, repeat_refine=repeat_refine, 140 | verbatim=verbatim,#model 141 | ), axis=1 142 | ) 143 | # invalid json results in NaN 144 | results = results.dropna() 145 | results = results.reset_index(drop=True) 146 | 147 | ## Flatten the list of lists to one single list of entities. 
148 | concept_list = np.concatenate(results).ravel().tolist() 149 | return concept_list 150 | 151 | 152 | def graph2Df(nodes_list) -> pd.DataFrame: 153 | ## Remove all NaN entities 154 | graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan) 155 | graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"]) 156 | graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: str(x).lower()) 157 | graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: str(x).lower()) 158 | 159 | return graph_dataframe 160 | 161 | import sys 162 | from yachalk import chalk 163 | sys.path.append("..") 164 | 165 | import json 166 | 167 | def graphPrompt(input: str, generate, metadata={}, #model="mistral-openorca:latest", 168 | repeat_refine=0,verbatim=False, 169 | ): 170 | 171 | SYS_PROMPT_GRAPHMAKER = ( 172 | "You are a network ontology graph maker who extracts terms and their relations from a given context, using category theory. " 173 | "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology " 174 | "of terms mentioned in the given context. These terms should represent the key concepts as per the context, including well-defined and widely used names of materials, systems, methods. \n\n" 175 | "Format your output as a list of JSON. 
Each element of the list contains a pair of terms" 176 | "and the relation between them, like the follwing: \n" 177 | "[\n" 178 | " {\n" 179 | ' "node_1": "A concept from extracted ontology",\n' 180 | ' "node_2": "A related concept from extracted ontology",\n' 181 | ' "edge": "Relationship between the two concepts, node_1 and node_2, succinctly described"\n' 182 | " }, {...}\n" 183 | "]" 184 | "" 185 | "Examples:" 186 | "Context: ```Alice is Marc's mother.```\n" 187 | "[\n" 188 | " {\n" 189 | ' "node_1": "Alice",\n' 190 | ' "node_2": "Marc",\n' 191 | ' "edge": "is mother of"\n' 192 | " }, " 193 | "{...}\n" 194 | "]" 195 | "Context: ```Silk is a strong natural fiber used to catch prey in a web. Beta-sheets control its strength.```\n" 196 | "[\n" 197 | " {\n" 198 | ' "node_1": "silk",\n' 199 | ' "node_2": "fiber",\n' 200 | ' "edge": "is"\n' 201 | " }," 202 | " {\n" 203 | ' "node_1": "beta-sheets",\n' 204 | ' "node_2": "strength",\n' 205 | ' "edge": "control"\n' 206 | " }," 207 | " {\n" 208 | ' "node_1": "silk",\n' 209 | ' "node_2": "prey",\n' 210 | ' "edge": "catches"\n' 211 | " }," 212 | "{...}\n" 213 | "]\n\n" 214 | "Analyze the text carefully and produce around 10 triplets, making sure they reflect consistent ontologies.\n" 215 | ) 216 | 217 | USER_PROMPT = f"Context: ```{input}``` \n\nOutput: " 218 | 219 | print (".", end ="") 220 | response = generate( system_prompt=SYS_PROMPT_GRAPHMAKER, prompt=USER_PROMPT) 221 | if verbatim: 222 | print ("---------------------\nFirst result: ", response) 223 | 224 | SYS_PROMPT_FORMAT = ('You respond in this format:' 225 | '[\n' 226 | " {\n" 227 | ' "node_1": "A concept from extracted ontology",\n' 228 | ' "node_2": "A related concept from extracted ontology",\n' 229 | ' "edge": "Relationship between the two concepts, node_1 and node_2, succinctly described"\n' 230 | ' }, {...} ]\n' ) 231 | USER_PROMPT = (f'Read this context: ```{input}```.' 
232 | f'Read this ontology: ```{response}```' 233 | f'\n\nImprove the ontology by renaming nodes so that they have consistent labels that are widely used in the field of materials science.''' 234 | '') 235 | response = generate( system_prompt=SYS_PROMPT_FORMAT, 236 | prompt=USER_PROMPT) 237 | if verbatim: 238 | print ("---------------------\nAfter improve: ", response) 239 | 240 | USER_PROMPT = f"Context: ```{response}``` \n\n Fix to make sure it is proper format. " 241 | response = generate( system_prompt=SYS_PROMPT_FORMAT, prompt=USER_PROMPT) 242 | response = response.replace ('\\', '' ) 243 | if verbatim: 244 | print ("---------------------\nAfter clean: ", response) 245 | 246 | if repeat_refine>0: 247 | for rep in tqdm(range (repeat_refine)): 248 | 249 | 250 | 251 | USER_PROMPT = (f'Insert new triplets into the original ontology. Read this context: ```{input}```.' 252 | f'Read this ontology: ```{response}```' 253 | f'\n\nInsert additional triplets to the original list, in the same JSON format. Repeat original AND new triplets.\n' 254 | '') 255 | response = generate( system_prompt=SYS_PROMPT_GRAPHMAKER, 256 | prompt=USER_PROMPT) 257 | if verbatim: 258 | print ("---------------------\nAfter adding triplets: ", response) 259 | USER_PROMPT = f"Context: ```{response}``` \n\n Fix to make sure it is proper format. " 260 | response = generate( system_prompt=SYS_PROMPT_FORMAT, prompt=USER_PROMPT) 261 | response = response.replace ('\\', '' ) 262 | USER_PROMPT = (f'Read this context: ```{input}```.' 263 | f'Read this ontology: ```{response}```' 264 | f'\n\nRevise the ontology by renaming nodes and edges so that they have consistent and concise labels.''' 265 | 266 | '') 267 | response = generate( system_prompt=SYS_PROMPT_FORMAT, 268 | prompt=USER_PROMPT) 269 | if verbatim: 270 | print (f"---------------------\nAfter refine {rep}/{repeat_refine}: ", response) 271 | 272 | 273 | USER_PROMPT = f"Context: ```{response}``` \n\n Fix to make sure it is proper format. 
" 274 | response = generate( system_prompt=SYS_PROMPT_FORMAT, prompt=USER_PROMPT) 275 | response = response.replace ('\\', '' ) 276 | 277 | try: 278 | response=extract (response) 279 | 280 | except: 281 | print (end='') 282 | 283 | try: 284 | result = json.loads(response) 285 | print (result) 286 | result = [dict(item, **metadata) for item in result] 287 | except: 288 | print("\n\nERROR ### Here is the buggy response: ", response, "\n\n") 289 | result = None 290 | return result 291 | 292 | def colors2Community(communities) -> pd.DataFrame: 293 | 294 | p = sns.color_palette(palette, len(communities)).as_hex() 295 | random.shuffle(p) 296 | rows = [] 297 | group = 0 298 | for community in communities: 299 | color = p.pop() 300 | group += 1 301 | for node in community: 302 | rows += [{"node": node, "color": color, "group": group}] 303 | df_colors = pd.DataFrame(rows) 304 | return df_colors 305 | 306 | def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame: 307 | ## Melt the dataframe into a list of nodes 308 | df['node_1'] = df['node_1'].astype(str) 309 | df['node_2'] = df['node_2'].astype(str) 310 | df['edge'] = df['edge'].astype(str) 311 | dfg_long = pd.melt( 312 | df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node" 313 | ) 314 | dfg_long.drop(columns=["variable"], inplace=True) 315 | # Self join with chunk id as the key will create a link between terms occuring in the same text chunk. 316 | dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2")) 317 | # drop self loops 318 | self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index 319 | dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True) 320 | ## Group and count edges. 
321 | dfg2 = ( 322 | dfg2.groupby(["node_1", "node_2"]) 323 | .agg({"chunk_id": [",".join, "count"]}) 324 | .reset_index() 325 | ) 326 | dfg2.columns = ["node_1", "node_2", "chunk_id", "count"] 327 | dfg2.replace("", np.nan, inplace=True) 328 | dfg2.dropna(subset=["node_1", "node_2"], inplace=True) 329 | # Drop edges with 1 count 330 | dfg2 = dfg2[dfg2["count"] != 1] 331 | dfg2["edge"] = "contextual proximity" 332 | return dfg2 333 | 334 | def make_graph_from_text (txt,generate, 335 | include_contextual_proximity=False, 336 | graph_root='graph_root', 337 | chunk_size=2500,chunk_overlap=0, 338 | repeat_refine=0,verbatim=False, 339 | data_dir='./data_output_KG/', 340 | save_PDF=False,#TO DO 341 | save_HTML=True, 342 | ): 343 | 344 | ## data directory 345 | if not os.path.exists(data_dir): 346 | os.makedirs(data_dir) 347 | 348 | outputdirectory = Path(f"./{data_dir}/") #where graphs are stored from graph2df function 349 | 350 | 351 | splitter = RecursiveCharacterTextSplitter( 352 | #chunk_size=5000, #1500, 353 | chunk_size=chunk_size, #1500, 354 | chunk_overlap=chunk_overlap, 355 | length_function=len, 356 | is_separator_regex=False, 357 | ) 358 | 359 | pages = splitter.split_text(txt) 360 | print("Number of chunks = ", len(pages)) 361 | if verbatim: 362 | display(Markdown (pages[0]) ) 363 | 364 | df = documents2Dataframe(pages) 365 | 366 | ## To regenerate the graph with LLM, set this to True 367 | regenerate = True 368 | 369 | if regenerate: 370 | concepts_list = df2Graph(df,generate,repeat_refine=repeat_refine,verbatim=verbatim) #model='zephyr:latest' ) 371 | dfg1 = graph2Df(concepts_list) 372 | if not os.path.exists(outputdirectory): 373 | os.makedirs(outputdirectory) 374 | 375 | dfg1.to_csv(outputdirectory/f"{graph_root}_graph.csv", sep="|", index=False) 376 | df.to_csv(outputdirectory/f"{graph_root}_chunks.csv", sep="|", index=False) 377 | dfg1.to_csv(outputdirectory/f"{graph_root}_graph_clean.csv", #sep="|", index=False 378 | ) 379 | 
df.to_csv(outputdirectory/f"{graph_root}_chunks_clean.csv", #sep="|", index=False 380 | ) 381 | else: 382 | dfg1 = pd.read_csv(outputdirectory/f"{graph_root}_graph.csv", sep="|") 383 | 384 | dfg1.replace("", np.nan, inplace=True) 385 | dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True) 386 | dfg1['count'] = 4 387 | 388 | if verbatim: 389 | print("Shape of graph DataFrame: ", dfg1.shape) 390 | dfg1.head()### 391 | 392 | if include_contextual_proximity: 393 | dfg2 = contextual_proximity(dfg1) 394 | dfg = pd.concat([dfg1, dfg2], axis=0) 395 | #dfg2.tail() 396 | else: 397 | dfg=dfg1 398 | 399 | 400 | dfg = ( 401 | dfg.groupby(["node_1", "node_2"]) 402 | .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'}) 403 | .reset_index() 404 | ) 405 | #dfg 406 | 407 | nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique() 408 | print ("Nodes shape: ", nodes.shape) 409 | 410 | G = nx.Graph() 411 | node_list=[] 412 | node_1_list=[] 413 | node_2_list=[] 414 | title_list=[] 415 | weight_list=[] 416 | chunk_id_list=[] 417 | 418 | ## Add nodes to the graph 419 | for node in nodes: 420 | G.add_node( 421 | str(node) 422 | ) 423 | node_list.append (node) 424 | 425 | ## Add edges to the graph 426 | for _, row in dfg.iterrows(): 427 | 428 | G.add_edge( 429 | str(row["node_1"]), 430 | str(row["node_2"]), 431 | title=row["edge"], 432 | weight=row['count']/4 433 | ) 434 | 435 | node_1_list.append (row["node_1"]) 436 | node_2_list.append (row["node_2"]) 437 | title_list.append (row["edge"]) 438 | weight_list.append (row['count']/4) 439 | 440 | chunk_id_list.append (row['chunk_id'] ) 441 | 442 | try: 443 | 444 | df_nodes = pd.DataFrame({"nodes": node_list} ) 445 | df_nodes.to_csv(f'{data_dir}/{graph_root}_nodes.csv') 446 | df_nodes.to_json(f'{data_dir}/{graph_root}_nodes.json') 447 | 448 | df_edges = pd.DataFrame({"node_1": node_1_list, "node_2": node_2_list,"edge_list": title_list, "weight_list": weight_list } ) 449 | 
df_edges.to_csv(f'{data_dir}/{graph_root}_edges.csv') 450 | df_edges.to_json(f'{data_dir}/{graph_root}_edges.json') 451 | 452 | except: 453 | 454 | print ("Error saving CSV/JSON files.") 455 | 456 | communities_generator = nx.community.girvan_newman(G) 457 | #top_level_communities = next(communities_generator) 458 | next_level_communities = next(communities_generator) 459 | communities = sorted(map(sorted, next_level_communities)) 460 | 461 | if verbatim: 462 | print("Number of Communities = ", len(communities)) 463 | 464 | if verbatim: 465 | print("Communities: ", communities) 466 | 467 | colors = colors2Community(communities) 468 | if verbatim: 469 | print ("Colors: ", colors) 470 | 471 | for index, row in colors.iterrows(): 472 | G.nodes[row['node']]['group'] = row['group'] 473 | G.nodes[row['node']]['color'] = row['color'] 474 | G.nodes[row['node']]['size'] = G.degree[row['node']] 475 | 476 | net = Network( 477 | 478 | notebook=True, 479 | 480 | cdn_resources="remote", 481 | height="900px", 482 | width="100%", 483 | select_menu=True, 484 | 485 | filter_menu=False, 486 | ) 487 | 488 | net.from_nx(G) 489 | net.force_atlas_2based(central_gravity=0.015, gravity=-31) 490 | 491 | net.show_buttons() 492 | 493 | graph_HTML= f'{data_dir}/{graph_root}_grapHTML.html' 494 | graph_GraphML= f'{data_dir}/{graph_root}_graphML.graphml' # f'{data_dir}/resulting_graph.graphml', 495 | nx.write_graphml(G, graph_GraphML) 496 | 497 | if save_HTML: 498 | net.show(graph_HTML, 499 | ) 500 | 501 | if save_PDF: 502 | output_pdf=f'{data_dir}/{graph_root}_PDF.pdf' 503 | pdfkit.from_file(graph_HTML, output_pdf) 504 | else: 505 | output_pdf=None 506 | res_stat=graph_statistics_and_plots_for_large_graphs(G, data_dir=data_dir,include_centrality=False, 507 | make_graph_plot=False,) 508 | 509 | print ("Graph statistics: ", res_stat) 510 | return graph_HTML, graph_GraphML, G, net, output_pdf 511 | 512 | import time 513 | from copy import deepcopy 514 | 515 | def 
add_new_subgraph_from_text(txt,generate,node_embeddings,tokenizer, model, 516 | original_graph_path_and_fname, 517 | data_dir_output='./data_temp/', verbatim=True, 518 | size_threshold=10,chunk_size=10000, 519 | do_Louvain_on_new_graph=True,include_contextual_proximity=False,repeat_refine=0,similarity_threshold=0.95, do_simplify_graph=True,#whether or not to simplify, uses similiraty_threshold defined above 520 | return_only_giant_component=False, 521 | save_common_graph=False,G_to_add=None,graph_GraphML_to_add=None, 522 | ): 523 | 524 | display (Markdown(txt[:256]+"....")) 525 | graph_GraphML=None 526 | 527 | G_new=None 528 | res=None 529 | assert not (G_to_add is not None and graph_GraphML_to_add is not None), "G_to_add and graph_GraphML_to_add cannot be used together. Pick one or the other to provide a graph to be added." 530 | 531 | try: 532 | start_time = time.time() 533 | idx=0 534 | 535 | if verbatim: 536 | print ("Now create or load new graph...") 537 | 538 | if graph_GraphML_to_add==None and G_newlymade==None: #make new if no existing one provided 539 | print ("Make new graph from text...") 540 | _, graph_GraphML_to_add, G_to_add, _, _ =make_graph_from_text (txt,generate, 541 | include_contextual_proximity=include_contextual_proximity, 542 | 543 | data_dir=data_dir_output, 544 | graph_root=f'graph_new_{idx}', 545 | 546 | chunk_size=chunk_size, repeat_refine=repeat_refine, 547 | verbatim=verbatim, 548 | 549 | ) 550 | if verbatim: 551 | print ("Generated new graph from text provided: ", graph_GraphML_to_add) 552 | 553 | else: 554 | if verbatim: 555 | print ("Instead of generating graph, loading it or using provided graph...(any txt data provided will be ignored...)") 556 | 557 | if graph_GraphML_to_add!=None: 558 | print ("Loading graph: ", graph_GraphML_to_add) 559 | 560 | print("--- %s seconds ---" % (time.time() - start_time)) 561 | except: 562 | print ("ALERT: Graph generation failed...for idx=",idx) 563 | 564 | print ("Now add node to existing 
graph...") 565 | 566 | try: 567 | #Load original graph 568 | G = nx.read_graphml(original_graph_path_and_fname) 569 | 570 | if G_to_add!=None: 571 | G_loaded=H = deepcopy(G_to_add) 572 | if verbatim: 573 | print ("Using provided graph to add (any txt data provided will be ignored...)") 574 | else: 575 | if verbatim: 576 | print ("Loading graph to be added either newly generated or provided.") 577 | G_loaded = nx.read_graphml(graph_GraphML_to_add) 578 | 579 | res_newgraph=graph_statistics_and_plots_for_large_graphs(G_loaded, data_dir=data_dir_output,include_centrality=False, 580 | make_graph_plot=False,root='new_graph') 581 | print (res_newgraph) 582 | 583 | G_new = nx.compose(G,G_loaded) 584 | 585 | if save_common_graph: 586 | print ("Identify common nodes and save...") 587 | try: 588 | 589 | common_nodes = set(G.nodes()).intersection(set(G_loaded.nodes())) 590 | 591 | subgraph = G_new.subgraph(common_nodes) 592 | graph_GraphML= f'{data_dir_output}/{graph_root}_common_nodes_before_simple.graphml' 593 | nx.write_graphml(subgraph, graph_GraphML) 594 | except: 595 | print ("Common nodes identification failed.") 596 | print ("Done!") 597 | 598 | if verbatim: 599 | print ("Now update node embeddings") 600 | node_embeddings=update_node_embeddings(node_embeddings, G_new, tokenizer, model) 601 | print ("Done update node embeddings.") 602 | if do_simplify_graph: 603 | if verbatim: 604 | print ("Now simplify graph.") 605 | G_new, node_embeddings =simplify_graph (G_new, node_embeddings, tokenizer, model , 606 | similarity_threshold=similarity_threshold, use_llm=False, data_dir_output=data_dir_output, 607 | verbatim=verbatim,) 608 | if verbatim: 609 | print ("Done simplify graph.") 610 | 611 | if verbatim: 612 | print ("Done update graph") 613 | 614 | if size_threshold >0: 615 | if verbatim: 616 | print ("Remove small fragments") 617 | G_new=remove_small_fragents (G_new, size_threshold=size_threshold) 618 | node_embeddings=update_node_embeddings(node_embeddings, G_new, 
tokenizer, model, verbatim=verbatim) 619 | 620 | if return_only_giant_component: 621 | if verbatim: 622 | print ("Select only giant component...") 623 | connected_components = sorted(nx.connected_components(G_new), key=len, reverse=True) 624 | G_new = G_new.subgraph(connected_components[0]).copy() 625 | node_embeddings=update_node_embeddings(node_embeddings, G_new, tokenizer, model, verbatim=verbatim) 626 | 627 | print (".") 628 | if do_Louvain_on_new_graph: 629 | G_new=graph_Louvain (G_new, 630 | graph_GraphML=None) 631 | if verbatim: 632 | print ("Don Louvain...") 633 | 634 | print (".") 635 | 636 | graph_root=f'graph' 637 | graph_GraphML= f'{data_dir_output}/{graph_root}_augmented_graphML_integrated.graphml' # f'{data_dir}/resulting_graph.graphml', 638 | print (".") 639 | nx.write_graphml(G_new, graph_GraphML) 640 | print ("Done...written: ", graph_GraphML) 641 | res=graph_statistics_and_plots_for_large_graphs(G_new, data_dir=data_dir_output,include_centrality=False, 642 | make_graph_plot=False,root='assembled') 643 | 644 | print ("Graph statistics: ", res) 645 | 646 | except: 647 | print ("Error adding new graph.") 648 | print (end="") 649 | 650 | return graph_GraphML, G_new, G_loaded, G, node_embeddings, res 651 | -------------------------------------------------------------------------------- /GraphReasoning/graph_tools.py: -------------------------------------------------------------------------------- 1 | import heapq 2 | 3 | import copy 4 | from transformers import AutoTokenizer, AutoModel 5 | import torch 6 | from scipy.spatial.distance import cosine 7 | from sklearn.decomposition import PCA 8 | import numpy as np 9 | from sklearn.decomposition import PCA 10 | from sklearn.cluster import KMeans 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns # For more attractive plotting 13 | 14 | from sklearn.cluster import KMeans 15 | import matplotlib.pyplot as plt 16 | import community as community_louvain 17 | import networkx as nx 18 | import pandas 
as pd # Assuming colors2Community returns a pandas DataFrame 19 | 20 | import seaborn as sns 21 | import re 22 | from IPython.display import display, Markdown 23 | 24 | import markdown2 25 | import pdfkit 26 | 27 | import time 28 | 29 | import uuid 30 | import pandas as pd 31 | import numpy as np 32 | 33 | import pandas as pd 34 | import numpy as np 35 | import networkx as nx 36 | import os 37 | from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader 38 | from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader 39 | from langchain.text_splitter import RecursiveCharacterTextSplitter 40 | from pathlib import Path 41 | import random 42 | from pyvis.network import Network 43 | 44 | from tqdm.notebook import tqdm 45 | 46 | import seaborn as sns 47 | palette = "hls" 48 | 49 | import uuid 50 | import pandas as pd 51 | import numpy as np 52 | 53 | from transformers import AutoTokenizer, AutoModel 54 | import torch 55 | from scipy.spatial.distance import cosine 56 | from sklearn.decomposition import PCA 57 | import numpy as np 58 | from sklearn.decomposition import PCA 59 | from sklearn.cluster import KMeans 60 | import matplotlib.pyplot as plt 61 | import seaborn as sns # For more attractive plotting 62 | 63 | from sklearn.cluster import KMeans 64 | import matplotlib.pyplot as plt 65 | import transformers 66 | from transformers import logging 67 | 68 | logging.set_verbosity_error() 69 | 70 | import re 71 | 72 | from IPython.display import display, Markdown 73 | 74 | import markdown2 75 | import pdfkit 76 | 77 | 78 | import uuid 79 | import pandas as pd 80 | import numpy as np 81 | 82 | import pandas as pd 83 | import numpy as np 84 | import networkx as nx 85 | import os 86 | from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader 87 | from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader 88 | from langchain.text_splitter import RecursiveCharacterTextSplitter 89 
| from pathlib import Path 90 | import random 91 | from pyvis.network import Network 92 | 93 | from tqdm.notebook import tqdm 94 | 95 | import seaborn as sns 96 | palette = "hls" 97 | 98 | import uuid 99 | import pandas as pd 100 | import numpy as np 101 | 102 | from transformers import AutoTokenizer, AutoModel 103 | import torch 104 | from scipy.spatial.distance import cosine 105 | from sklearn.decomposition import PCA 106 | import numpy as np 107 | from sklearn.decomposition import PCA 108 | from sklearn.cluster import KMeans 109 | import matplotlib.pyplot as plt 110 | import seaborn as sns # For more attractive plotting 111 | 112 | from sklearn.cluster import KMeans 113 | import matplotlib.pyplot as plt 114 | 115 | # Function to generate embeddings 116 | def generate_node_embeddings(graph, tokenizer, model): 117 | embeddings = {} 118 | for node in tqdm(graph.nodes()): 119 | inputs = tokenizer(str(node), return_tensors="pt") 120 | outputs = model(**inputs) 121 | embeddings[node] = outputs.last_hidden_state.mean(dim=1).detach().numpy() 122 | return embeddings 123 | 124 | import pickle 125 | 126 | def save_embeddings(embeddings, file_path): 127 | with open(file_path, 'wb') as f: 128 | pickle.dump(embeddings, f) 129 | def load_embeddings(file_path): 130 | with open(file_path, 'rb') as f: 131 | embeddings = pickle.load(f) 132 | return embeddings 133 | 134 | def find_best_fitting_node(keyword, embeddings, tokenizer, model): 135 | inputs = tokenizer(keyword, return_tensors="pt") 136 | outputs = model(**inputs) 137 | keyword_embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy().flatten() # Flatten to ensure 1-D 138 | 139 | # Calculate cosine similarity and find the best match 140 | best_node = None 141 | best_similarity = float('-inf') # Initialize with negative infinity 142 | for node, embedding in embeddings.items(): 143 | # Ensure embedding is 1-D 144 | embedding = embedding.flatten() # Flatten to ensure 1-D 145 | similarity = 1 - 
cosine(keyword_embedding, embedding) # Cosine similarity 146 | if similarity > best_similarity: 147 | best_similarity = similarity 148 | best_node = node 149 | 150 | return best_node, best_similarity 151 | 152 | def find_best_fitting_node_list(keyword, embeddings, tokenizer, model, N_samples=5): 153 | inputs = tokenizer(keyword, return_tensors="pt") 154 | outputs = model(**inputs) 155 | keyword_embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy().flatten() # Flatten to ensure 1-D 156 | 157 | # Initialize a min-heap 158 | min_heap = [] 159 | heapq.heapify(min_heap) 160 | 161 | for node, embedding in embeddings.items(): 162 | # Ensure embedding is 1-D 163 | embedding = embedding.flatten() # Flatten to ensure 1-D 164 | similarity = 1 - cosine(keyword_embedding, embedding) # Cosine similarity 165 | 166 | # If the heap is smaller than N_samples, just add the current node and similarity 167 | if len(min_heap) < N_samples: 168 | heapq.heappush(min_heap, (similarity, node)) 169 | else: 170 | # If the current similarity is greater than the smallest similarity in the heap 171 | if similarity > min_heap[0][0]: 172 | heapq.heappop(min_heap) # Remove the smallest 173 | heapq.heappush(min_heap, (similarity, node)) # Add the current node and similarity 174 | 175 | # Convert the min-heap to a sorted list in descending order of similarity 176 | best_nodes = sorted(min_heap, key=lambda x: -x[0]) 177 | 178 | # Return a list of tuples (node, similarity) 179 | return [(node, similarity) for similarity, node in best_nodes] 180 | 181 | 182 | # Example usage 183 | def visualize_embeddings_2d(embeddings , data_dir='./'): 184 | # Generate embeddings 185 | #embeddings = generate_node_embeddings(graph, tokenizer, model) 186 | 187 | # Extract the embedding vectors 188 | node_ids = list(embeddings.keys()) 189 | vectors = np.array([embeddings[node].flatten() for node in node_ids]) 190 | 191 | # Reduce dimensions to 2D using PCA 192 | pca = PCA(n_components=2) 193 | vectors_2d = 
pca.fit_transform(vectors) 194 | 195 | # Plot 196 | plt.figure(figsize=(10, 8)) 197 | plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], alpha=0.5) 198 | for i, node_id in enumerate(node_ids): 199 | plt.text(vectors_2d[i, 0], vectors_2d[i, 1], str(node_id), fontsize=9) 200 | plt.title('Node Embeddings Visualization') 201 | plt.xlabel('PCA 1') 202 | plt.ylabel('PCA 2') 203 | plt.savefig(f'{data_dir}/node_embeddings_2d.svg') # Save the figure as SVG 204 | plt.show() 205 | 206 | 207 | def visualize_embeddings_2d_notext(embeddings, n_clusters=3, data_dir='./'): 208 | # Extract the embedding vectors 209 | node_ids = list(embeddings.keys()) 210 | vectors = np.array([embeddings[node].flatten() for node in node_ids]) 211 | 212 | # Reduce dimensions to 2D using PCA 213 | pca = PCA(n_components=2) 214 | vectors_2d = pca.fit_transform(vectors) 215 | 216 | # Cluster the embeddings 217 | kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors) 218 | labels = kmeans.labels_ 219 | 220 | # Plot 221 | plt.figure(figsize=(10, 8)) 222 | plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c=labels, alpha=0.5, cmap='viridis') 223 | plt.title('Node Embeddings Visualization') 224 | plt.xlabel('PCA 1') 225 | plt.ylabel('PCA 2') 226 | plt.savefig(f'{data_dir}/node_embeddings_2d_clusters.svg') # Save the figure as SVG 227 | plt.show() 228 | 229 | 230 | def visualize_embeddings_2d_pretty(embeddings, n_clusters=3, data_dir='./'): 231 | # Extract the embedding vectors 232 | node_ids = list(embeddings.keys()) 233 | vectors = np.array([embeddings[node].flatten() for node in node_ids]) 234 | 235 | # Reduce dimensions to 2D using PCA 236 | pca = PCA(n_components=2) 237 | vectors_2d = pca.fit_transform(vectors) 238 | 239 | # Cluster the embeddings 240 | kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors) 241 | labels = kmeans.labels_ 242 | 243 | # Count the number of points in each cluster 244 | unique, counts = np.unique(labels, return_counts=True) 245 | cluster_counts = 
dict(zip(unique, counts)) 246 | 247 | # Plot 248 | plt.figure(figsize=(10, 8)) 249 | sns.set(style='whitegrid') # Set seaborn style for prettier plots 250 | 251 | # Use seaborn's color palette and matplotlib's scatter plot 252 | palette = sns.color_palette("hsv", n_clusters) # Use a different color palette 253 | for cluster in range(n_clusters): 254 | cluster_points = vectors_2d[labels == cluster] 255 | plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster} (n={cluster_counts[cluster]})', alpha=0.7, edgecolors='w', s=100, cmap=palette) 256 | 257 | plt.title('Node Embeddings Visualization with Clusters') 258 | plt.xlabel('PCA 1') 259 | plt.ylabel('PCA 2') 260 | plt.legend(scatterpoints=1) # Add a legend to show cluster labels and counts 261 | plt.savefig(f'{data_dir}/node_embeddings_2d_clusters_pretty.svg') # Save the figure as SVG 262 | plt.show() 263 | 264 | # Optionally print the counts for each cluster 265 | for cluster, count in cluster_counts.items(): 266 | print(f'Cluster {cluster}: {count} items') 267 | 268 | from scipy.spatial.distance import cdist 269 | 270 | def visualize_embeddings_2d_pretty_and_sample(embeddings, n_clusters=3, n_samples=5, data_dir='./', 271 | alpha=0.7, edgecolors='none', s=50,): 272 | # Extract the embedding vectors 273 | node_ids = list(embeddings.keys()) 274 | vectors = np.array([embeddings[node].flatten() for node in node_ids]) 275 | 276 | # Reduce dimensions to 2D using PCA 277 | pca = PCA(n_components=2) 278 | vectors_2d = pca.fit_transform(vectors) 279 | 280 | # Cluster the embeddings 281 | kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors) 282 | labels = kmeans.labels_ 283 | 284 | # Count the number of points in each cluster 285 | unique, counts = np.unique(labels, return_counts=True) 286 | cluster_counts = dict(zip(unique, counts)) 287 | 288 | # Plot 289 | plt.figure(figsize=(10, 8)) 290 | sns.set(style='whitegrid') # Set seaborn style for prettier plots 291 | palette = 
sns.color_palette("hsv", n_clusters) 292 | for cluster in range(n_clusters): 293 | cluster_points = vectors_2d[labels == cluster] 294 | plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster} (n={cluster_counts[cluster]})' 295 | , alpha=alpha, edgecolors=edgecolors, s=s, cmap=palette,#alpha=0.7, edgecolors='w', s=100, cmap=palette) 296 | ) 297 | 298 | plt.title('Node Embeddings Visualization with Clusters') 299 | plt.xlabel('PCA 1') 300 | plt.ylabel('PCA 2') 301 | plt.legend(scatterpoints=1) 302 | plt.savefig(f'{data_dir}/node_embeddings_2d_clusters_pretty.svg') 303 | plt.show() 304 | 305 | # Output N_sample terms from the center of each cluster 306 | centroids = kmeans.cluster_centers_ 307 | for cluster in range(n_clusters): 308 | cluster_indices = np.where(labels == cluster)[0] 309 | cluster_vectors = vectors[cluster_indices] 310 | cluster_node_ids = np.array(node_ids)[cluster_indices] 311 | 312 | # Calculate distances of points in this cluster to the centroid 313 | distances = cdist(cluster_vectors, [centroids[cluster]], 'euclidean').flatten() 314 | 315 | # Get indices of N_samples closest points 316 | closest_indices = np.argsort(distances)[:n_samples] 317 | closest_node_ids = cluster_node_ids[closest_indices] 318 | 319 | print(f'Cluster {cluster}: {len(cluster_vectors)} items') 320 | print(f'Closest {n_samples} node IDs to centroid:', closest_node_ids) 321 | 322 | import numpy as np 323 | from sklearn.decomposition import PCA 324 | from sklearn.metrics.pairwise import euclidean_distances 325 | from sklearn.mixture import GaussianMixture 326 | from scipy.spatial import Voronoi, voronoi_plot_2d 327 | import matplotlib.pyplot as plt 328 | 329 | def visualize_embeddings_with_gmm_density_voronoi_and_print_top_samples(embeddings, n_clusters=5, top_n=3, data_dir='./',s=50): 330 | # Extract the embedding vectors 331 | descriptions = list(embeddings.keys()) 332 | node_ids = list(embeddings.keys()) 333 | vectors = 
np.array([embeddings[node].flatten() for node in node_ids]) 334 | 335 | # Reduce dimensions to 2D using PCA 336 | pca = PCA(n_components=2) 337 | vectors_2d = pca.fit_transform(vectors) 338 | 339 | # Fit a Gaussian Mixture Model 340 | gmm = GaussianMixture(n_components=n_clusters, random_state=42) 341 | gmm.fit(vectors_2d) 342 | labels = gmm.predict(vectors_2d) 343 | 344 | # Generate Voronoi regions 345 | vor = Voronoi(gmm.means_) 346 | 347 | # Plotting 348 | plt.figure(figsize=(10, 10)) 349 | # Plot Voronoi diagram 350 | voronoi_plot_2d(vor, show_vertices=False, show_points=False, line_colors='black', line_width=1, line_alpha=0.7, point_size=2) 351 | 352 | # Color points based on their cluster 353 | for i in range(n_clusters): 354 | plt.scatter(vectors_2d[labels == i, 0], vectors_2d[labels == i, 1], s=s, label=f'Cluster {i}') 355 | 356 | plt.title('Embedding Vectors with GMM Density and Voronoi Tessellation') 357 | plt.xlabel('PCA 1') 358 | plt.ylabel('PCA 2') 359 | plt.legend() 360 | plt.savefig(f'{data_dir}/node_embeddings_2d_clusters_voronoi.svg') 361 | 362 | plt.show() 363 | # Print top-ranked sample texts 364 | for i in range(n_clusters): 365 | cluster_center = gmm.means_[i] 366 | cluster_points = vectors_2d[labels == i] 367 | 368 | distances = euclidean_distances(cluster_points, [cluster_center]) 369 | distances = distances.flatten() 370 | 371 | closest_indices = np.argsort(distances)[:top_n] 372 | 373 | print(f"\nTop {top_n} closest samples to the center of Cluster {i}:") 374 | for idx in closest_indices: 375 | original_idx = np.where(labels == i)[0][idx] 376 | desc = descriptions[original_idx] 377 | print(f"- Description: {desc}, Distance: {distances[idx]:.2f}") 378 | 379 | def analyze_network(G, data_dir='./', root = 'graph_analysis'): 380 | # Compute the degrees of the nodes 381 | # Compute the degrees of the nodes 382 | degrees = [d for n, d in G.degree()] 383 | 384 | # Compute maximum, minimum, and median node degrees 385 | max_degree = max(degrees) 
386 | min_degree = min(degrees) 387 | median_degree = np.median(degrees) 388 | 389 | # Number of nodes and edges 390 | num_nodes = G.number_of_nodes() 391 | num_edges = G.number_of_edges() 392 | 393 | # Average node degree 394 | avg_degree = np.mean(degrees) 395 | 396 | # Density of the network 397 | density = nx.density(G) 398 | 399 | # Number of communities (using connected components as a simple community proxy) 400 | num_communities = nx.number_connected_components(G) 401 | 402 | # Print the results 403 | print(f"Maximum Degree: {max_degree}") 404 | print(f"Minimum Degree: {min_degree}") 405 | print(f"Median Degree: {median_degree}") 406 | print(f"Number of Nodes: {num_nodes}") 407 | print(f"Number of Edges: {num_edges}") 408 | print(f"Average Node Degree: {avg_degree:.2f}") 409 | print(f"Density: {density:.4f}") 410 | print(f"Number of Communities: {num_communities}") 411 | 412 | # Plot the results 413 | fig, axs = plt.subplots(5, 1, figsize=(10, 15)) 414 | 415 | metrics = [ 416 | ('Number of Nodes', num_nodes), 417 | ('Number of Edges', num_edges), 418 | ('Avg Node Degree', avg_degree), 419 | ('Density', density), 420 | ('Number of Communities', num_communities) 421 | ] 422 | 423 | for ax, (label, value) in zip(axs, metrics): 424 | ax.barh(label, value, color='blue') 425 | ax.set_xlim(0, max(value * 1.1, 1.1)) # Adding some padding for better visualization 426 | ax.set_xlabel('Value') 427 | ax.set_title(label) 428 | 429 | plt.tight_layout() 430 | plt.savefig(f'{data_dir}/community_structure_{root}.svg') 431 | # Show the plot 432 | plt.show() 433 | 434 | return max_degree, min_degree, median_degree 435 | 436 | def graph_statistics_and_plots(G, data_dir='./'): 437 | # Calculate statistics 438 | degrees = [degree for node, degree in G.degree()] 439 | degree_distribution = np.bincount(degrees) 440 | average_degree = np.mean(degrees) 441 | clustering_coefficients = nx.clustering(G) 442 | average_clustering_coefficient = nx.average_clustering(G) 443 | triangles = 
def graph_statistics_and_plots_for_large_graphs(G, data_dir='./', include_centrality=False,
                                                make_graph_plot=False, root='graph', log_scale=True,
                                                log_hist_scale=True, density_opt=False, bins=50,
                                                ):
    """
    Compute basic statistics for a (potentially large) graph, plot its degree
    histogram, and detect Louvain communities.

    Args:
        G: networkx graph (undirected).
        data_dir: directory where SVG plots are written.
        include_centrality: if True, also compute degree/betweenness/closeness/
            eigenvector centralities (can be slow on large graphs).
        make_graph_plot: if True, additionally draw the full graph colored by
            Louvain community (spring layout; expensive for large graphs).
        root: suffix used in saved plot file names.
        log_scale: if True, histogram log1p(degree) on log-log axes.
        log_hist_scale: if True, use a logarithmic frequency axis.
        density_opt: if True, normalize the histogram to a density.
        bins: number of histogram bins.

    Returns:
        (statistics, centrality): `statistics` is a dict of scalar graph
        metrics; `centrality` is a dict holding the four centrality dicts when
        include_centrality is True, otherwise None.
        BUG FIX: this function previously returned the boolean flag
        `include_centrality` as the second element, discarding the computed
        `centrality` dict; it now returns the dict (or None).
    """
    # Basic statistics
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    degrees = [degree for node, degree in G.degree()]
    log_degrees = np.log1p(degrees)  # Using log1p for a better handle on zero degrees
    average_degree = np.mean(degrees)
    density = nx.density(G)
    connected_components = nx.number_connected_components(G)

    # Centrality measures (optional because they are expensive on large graphs)
    if include_centrality:
        degree_centrality = nx.degree_centrality(G)
        betweenness_centrality = nx.betweenness_centrality(G)
        closeness_centrality = nx.closeness_centrality(G)
        eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)

    # Community detection with Louvain method
    partition = community_louvain.best_partition(G)
    num_communities = len(set(partition.values()))

    # Degree distribution histogram (log-log or linear)
    plt.figure(figsize=(10, 6))
    if log_scale:
        counts, bins, patches = plt.hist(log_degrees, bins=bins, alpha=0.75, color='blue',
                                         log=log_hist_scale, density=density_opt)
        plt.xscale('log')
        plt.yscale('log')
        xlab_0 = 'Log(1 + Degree)'
        # NOTE: the original code set the same label for both the density and
        # frequency cases; the redundant branch was collapsed (behavior unchanged).
        ylab_0 = 'Probability Distribution'
        ylab_0 = ylab_0 + log_hist_scale * ' (log)'  # bool*str appends suffix only when True
        plt_title = 'Histogram of Log-Transformed Node Degrees with Log-Log Scale'
    else:
        counts, bins, patches = plt.hist(degrees, bins=bins, alpha=0.75, color='blue',
                                         log=log_hist_scale, density=density_opt)
        xlab_0 = 'Degree'
        ylab_0 = 'Probability Distribution'
        ylab_0 = ylab_0 + log_hist_scale * ' (log)'
        plt_title = 'Histogram of Node Degrees'

    plt.title(plt_title)
    plt.xlabel(xlab_0)
    plt.ylabel(ylab_0)
    plt.savefig(f'{data_dir}/{plt_title}_{root}.svg')
    plt.show()

    if make_graph_plot:
        # Draw the whole graph colored by Louvain community
        plt.figure(figsize=(10, 6))
        pos = nx.spring_layout(G)  # for better visualization
        cmap = plt.get_cmap('viridis')
        nx.draw_networkx(G, pos, node_color=list(partition.values()), node_size=20, cmap=cmap, with_labels=False)
        plt.title('Community Structure')
        plt.savefig(f'{data_dir}/community_structure_{root}.svg')
        plt.show()
        plt.close()

    # Collect scalar statistics
    statistics = {
        'Number of Nodes': num_nodes,
        'Number of Edges': num_edges,
        'Average Degree': average_degree,
        'Density': density,
        'Connected Components': connected_components,
        'Number of Communities': num_communities,
        # Centrality measures could be added here as well, but they are often better analyzed separately due to their detailed nature
    }
    if include_centrality:
        centrality = {
            'degree_centrality': degree_centrality,
            'betweenness_centrality': betweenness_centrality,
            'closeness_centrality': closeness_centrality,
            'eigenvector_centrality': eigenvector_centrality,
        }
    else:
        centrality = None

    return statistics, centrality

## Now add these colors to communities and make another dataframe
def colors2Community(communities, palette="hls") -> pd.DataFrame:
    """
    Assign one distinct color and a 1-based group id to every node, per community.

    Args:
        communities: iterable of node collections (one collection per community).
        palette: seaborn palette name used to generate the colors.
            BUG FIX: `palette` was previously read as an undefined global name,
            raising NameError when called standalone; it is now a parameter.

    Returns:
        pd.DataFrame with columns 'node', 'color' (hex string) and 'group'.
    """
    ## Define a color palette, one hex color per community, in random order
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


def graph_Louvain(G,
                  graph_GraphML=None, palette="hls"):
    """
    Detect Louvain communities in G, attach 'group', 'color' and 'size'
    attributes to every node (size = node degree), and optionally write the
    annotated graph to a GraphML file.

    Args:
        G: networkx graph; modified in place with the node attributes above.
        graph_GraphML: optional output path; when given, the graph is written
            with nx.write_graphml (failures are reported, not raised).
        palette: seaborn palette name forwarded to colors2Community.

    Returns:
        The (mutated) graph G.
    """
    # Compute the best partition using the Louvain algorithm
    partition = community_louvain.best_partition(G)

    # Organize nodes into communities based on the Louvain partition
    communities = {}
    for node, comm_id in partition.items():
        communities.setdefault(comm_id, []).append(node)

    communities_list = list(communities.values())
    print("Number of Communities =", len(communities_list))
    print("Communities: ", communities_list)

    # Assuming colors2Community can work with the communities_list format
    # (palette is now passed through explicitly rather than read from a global)
    colors = colors2Community(communities_list, palette)
    print("Colors: ", colors)

    # Assign attributes to nodes based on their community membership
    for index, row in colors.iterrows():
        node = row['node']
        G.nodes[node]['group'] = row['group']
        G.nodes[node]['color'] = row['color']
        G.nodes[node]['size'] = G.degree[node]

    print("Done, assigned colors and groups...")

    # Write the graph with community information to a GraphML file
    if graph_GraphML is not None:
        try:
            nx.write_graphml(G, graph_GraphML)
            print("Written GraphML.")
        except Exception as e:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate
            print("Error saving GraphML file.", e)
    return G
def save_graph (G,
                graph_GraphML=None, ):
    """
    Write graph G to the given GraphML path.

    Args:
    - G: networkx graph to save.
    - graph_GraphML: output file path; if None, an error message is printed
      and nothing is written.
    """
    if graph_GraphML != None:
        nx.write_graphml(G, graph_GraphML)

        print("Written GraphML")
    else:
        print("Error, no file name provided.")
    return

def update_node_embeddings(embeddings, graph_new, tokenizer, model, remove_embeddings_for_nodes_no_longer_in_graph=True,
                           verbatim=False):
    """
    Update embeddings for new nodes in an updated graph, ensuring that the original embeddings are not altered.

    Args:
    - embeddings (dict): Existing node embeddings (node name -> numpy array).
    - graph_new: The updated graph object.
    - tokenizer: Tokenizer object to tokenize node names.
    - model: Model object to generate embeddings (HF-style: returns an object
      with `last_hidden_state`; the mean over the token axis is stored).
    - remove_embeddings_for_nodes_no_longer_in_graph (bool): If True, drop
      embeddings whose nodes are absent from graph_new.
    - verbatim (bool): If True, print per-node progress messages.

    Returns:
    - Updated embeddings dictionary with embeddings for new nodes, without altering the original embeddings.
    """
    # Create a deep copy of the original embeddings so the caller's dict is untouched
    embeddings_updated = copy.deepcopy(embeddings)

    # Iterate through new graph nodes
    for node in tqdm(graph_new.nodes()):
        # Check if the node already has an embedding in the copied dictionary
        if node not in embeddings_updated:
            if verbatim:
                print(f"Generating embedding for new node: {node}")
            inputs = tokenizer(node, return_tensors="pt")
            outputs = model(**inputs)
            # Update the copied embeddings dictionary with the new node's embedding
            # (mean-pooled over the sequence dimension, detached to numpy)
            embeddings_updated[node] = outputs.last_hidden_state.mean(dim=1).detach().numpy()

    if remove_embeddings_for_nodes_no_longer_in_graph:
        # Remove embeddings for nodes that no longer exist in the graph from the copied dictionary
        nodes_in_graph = set(graph_new.nodes())
        for node in list(embeddings_updated):
            if node not in nodes_in_graph:
                if verbatim:
                    print(f"Removing embedding for node no longer in graph: {node}")
                del embeddings_updated[node]

    return embeddings_updated

def remove_small_fragents (G_new, size_threshold):
    """
    Remove connected components smaller than size_threshold from G_new (in place).

    Args:
    - G_new: networkx graph; mutated by removing nodes of small components.
    - size_threshold (int): components with fewer nodes than this are deleted;
      a threshold of 0 (or negative) leaves the graph unchanged.

    Returns:
    - The (mutated) graph G_new.

    NOTE(review): the function name contains a typo ("fragents"); kept as-is
    because external callers may rely on it.
    """
    if size_threshold >0:

        # Find all connected components, returned as sets of nodes
        components = list(nx.connected_components(G_new))

        # Iterate through components and remove those smaller than the threshold
        for component in components:
            if len(component) < size_threshold:
                # Remove the nodes in small components
                G_new.remove_nodes_from(component)
    return G_new


def simplify_node_name_with_llm(node_name, generate, max_tokens=2048, temperature=0.3):
    """
    Ask the LLM for a simplified, more descriptive name for a graph node.

    Args:
    - node_name (str): current node label.
    - generate: callable with signature
      generate(system_prompt=..., prompt=..., max_tokens=..., temperature=...)
      returning the LLM's text response.
    - max_tokens / temperature: forwarded to `generate`.

    Returns:
    - The LLM-proposed replacement name (whatever `generate` returns).
    """
    # Generate a prompt for the LLM to simplify or describe the node name
    system_prompt='You are an ontological graph maker. You carefully rename nodes in complex networks.'
    prompt = f"Provide a simplified, more descriptive name for a network node named '{node_name}' that reflects its importance or role within a network."

    # Assuming 'generate' is a function that calls the LLM with the given prompt
    simplified_name = generate(system_prompt=system_prompt, prompt=prompt, max_tokens=max_tokens, temperature=temperature)

    return simplified_name
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def simplify_graph_simple(graph_, node_embeddings, tokenizer, model, similarity_threshold=0.9, use_llm=False,
                          data_dir_output='./',
                          graph_root='simple_graph', verbatim=False, max_tokens=2048, temperature=0.3, generate=None,
                          ):
    """
    Merge nodes whose embeddings are more similar than similarity_threshold,
    keeping the node of higher degree, then save the simplified graph.

    Args:
        graph_: input networkx graph (copied; not modified).
        node_embeddings (dict): node name -> embedding array.
        tokenizer, model: used to regenerate embeddings for renamed/kept nodes.
        similarity_threshold (float): cosine-similarity cutoff for merging.
        use_llm (bool): if True, rename each kept node via
            simplify_node_name_with_llm using `generate`.
        data_dir_output, graph_root: output location for the GraphML file.
        verbatim (bool): print merge decisions.
        max_tokens, temperature, generate: forwarded to the LLM renamer.

    Returns:
        (new_graph, updated_embeddings): the simplified graph and an embeddings
        dict with merged-away nodes removed and kept nodes recalculated.
    """
    graph = graph_.copy()
    nodes = list(node_embeddings.keys())
    embeddings_matrix = np.array([node_embeddings[node].flatten() for node in nodes])

    similarity_matrix = cosine_similarity(embeddings_matrix)
    to_merge = np.where(similarity_matrix > similarity_threshold)

    node_mapping = {}
    nodes_to_recalculate = set()
    for i, j in tqdm(zip(*to_merge)):
        if i != j:  # ignore self-similarity
            node_i, node_j = nodes[i], nodes[j]
            # Keep the better-connected node; merge the other into it
            if graph.degree(node_i) >= graph.degree(node_j):
                node_to_keep, node_to_merge = node_i, node_j
            else:
                node_to_keep, node_to_merge = node_j, node_i
            if verbatim:
                print ("node to keep and merge: ", node_to_keep,"<--", node_to_merge)
            # Optionally use LLM to generate a simplified or more descriptive name
            if use_llm:
                original_node_to_keep = node_to_keep
                node_to_keep = simplify_node_name_with_llm(node_to_keep, generate, max_tokens=max_tokens, temperature=temperature)
                # Both the original and the new name need fresh embeddings
                nodes_to_recalculate.add(original_node_to_keep)
                nodes_to_recalculate.add(node_to_keep)

            node_mapping[node_to_merge] = node_to_keep

    new_graph = nx.relabel_nodes(graph, node_mapping, copy=True)

    # Recalculate embeddings for nodes that have been merged or renamed
    recalculated_embeddings = regenerate_node_embeddings(new_graph, nodes_to_recalculate, tokenizer, model)

    # Update the embeddings dictionary with the recalculated embeddings
    updated_embeddings = {**node_embeddings, **recalculated_embeddings}

    # Remove embeddings for nodes that no longer exist
    for node in node_mapping.keys():
        if node in updated_embeddings:
            del updated_embeddings[node]

    graph_GraphML = f'{data_dir_output}/{graph_root}_graphML_simplified.graphml'
    nx.write_graphml(new_graph, graph_GraphML)

    return new_graph, updated_embeddings

import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt
from powerlaw import Fit

# Assuming regenerate_node_embeddings is defined as provided earlier

import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# BUG FIX: a placeholder `def simplify_node_name_with_llm(node_name, max_tokens, temperature)`
# was re-defined here (self-described as "a placeholder for the actual function") and, being
# defined later in the module, shadowed the real LLM-backed implementation above — whose
# signature it did not even match, so the `use_llm=True` path raised TypeError. The
# placeholder has been removed so the real implementation stays in scope.

def regenerate_node_embeddings(graph, nodes_to_recalculate, tokenizer, model):
    """
    Regenerate embeddings for specific nodes.

    Args:
        graph: graph whose nodes are being (re-)embedded (only names are used).
        nodes_to_recalculate: iterable of node names to embed.
        tokenizer, model: HF-style pair; the mean-pooled last hidden state of
        each node name is stored as its embedding.

    Returns:
        dict mapping each requested node name to its new numpy embedding.
    """
    new_embeddings = {}
    for node in tqdm(nodes_to_recalculate):
        inputs = tokenizer(node, return_tensors="pt")
        outputs = model(**inputs)
        new_embeddings[node] = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return new_embeddings
def simplify_graph(graph_, node_embeddings, tokenizer, model, similarity_threshold=0.9, use_llm=False,
                   data_dir_output='./', graph_root='simple_graph', verbatim=False, max_tokens=2048,
                   temperature=0.3, generate=None):
    """
    Simplifies a graph by merging similar nodes and optionally renaming them using a language model.

    Unlike simplify_graph_simple, a node participates in at most one merge per
    call (tracked via `merged_nodes`), which avoids chained/duplicate merges.

    Args:
        graph_: input networkx graph (copied; not modified).
        node_embeddings (dict): node name -> embedding array.
        tokenizer, model: used to regenerate embeddings of kept nodes.
        similarity_threshold (float): cosine-similarity cutoff for merging.
        use_llm, max_tokens, temperature, generate: accepted for interface
            compatibility; the LLM-renaming path is currently disabled.
        data_dir_output, graph_root: output location for the GraphML file.
        verbatim (bool): print progress messages.

    Returns:
        (new_graph, updated_embeddings): simplified graph and embeddings dict
        with merged-away nodes removed and kept nodes recalculated.
    """
    graph = graph_.copy()

    nodes = list(node_embeddings.keys())
    embeddings_matrix = np.array([node_embeddings[node].flatten() for node in nodes])

    similarity_matrix = cosine_similarity(embeddings_matrix)
    to_merge = np.where(similarity_matrix > similarity_threshold)

    node_mapping = {}
    nodes_to_recalculate = set()
    merged_nodes = set()  # Keep track of nodes that have been merged
    if verbatim:
        print("Start...")
    for i, j in tqdm(zip(*to_merge), total=len(to_merge[0])):
        if i != j and nodes[i] not in merged_nodes and nodes[j] not in merged_nodes:  # Check for duplicates
            node_i, node_j = nodes[i], nodes[j]

            try:
                # Keep the better-connected node; merge the other into it
                if graph.degree(node_i) >= graph.degree(node_j):
                    node_to_keep, node_to_merge = node_i, node_j
                else:
                    node_to_keep, node_to_merge = node_j, node_i

                if verbatim:
                    print("Node to keep and merge:", node_to_keep, "<--", node_to_merge)

                node_mapping[node_to_merge] = node_to_keep
                nodes_to_recalculate.add(node_to_keep)
                merged_nodes.add(node_to_merge)  # Mark the merged node to avoid duplicate handling
            except Exception:
                # Narrowed from a bare `except: print(end="")` so that
                # KeyboardInterrupt/SystemExit are no longer swallowed; a node
                # missing from the graph (present only in embeddings) lands here.
                pass
    if verbatim:
        print ("Now relabel. ")
    # Create the simplified graph by relabeling nodes.
    new_graph = nx.relabel_nodes(graph, node_mapping, copy=True)
    if verbatim:
        print ("New graph generated, nodes relabled. ")
    # Recalculate embeddings for nodes that have been merged or renamed.
    recalculated_embeddings = regenerate_node_embeddings(new_graph, nodes_to_recalculate, tokenizer, model)
    if verbatim:
        print ("Relcaulated embeddings... ")
    # Update the embeddings dictionary with the recalculated embeddings.
    updated_embeddings = {**node_embeddings, **recalculated_embeddings}

    # Remove embeddings for nodes that no longer exist in the graph.
    for node in merged_nodes:
        updated_embeddings.pop(node, None)
    if verbatim:
        print ("Now save graph... ")

    # Save the simplified graph to a file.
    graph_path = f'{data_dir_output}/{graph_root}_graphML_simplified.graphml'
    nx.write_graphml(new_graph, graph_path)

    if verbatim:
        print(f"Graph simplified and saved to {graph_path}")

    return new_graph, updated_embeddings
def make_HTML (G, data_dir='./', graph_root='graph_root'):
    """
    Render graph G as an interactive pyvis HTML page and return the file path.

    Args:
        G: networkx graph to visualize.
        data_dir, graph_root: output path components for the HTML file.

    Returns:
        Path of the written HTML file.
    """
    net = Network(
        notebook=True,
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        filter_menu=False,
    )

    net.from_nx(G)
    # Force-directed layout (alternatives such as repulsion/barnes_hut were tried before)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)

    net.show_buttons()

    graph_HTML = f'{data_dir}/{graph_root}_graphHTML.html'
    net.show(graph_HTML, )

    return graph_HTML

def return_giant_component_of_graph (G_new ):
    """Return a copy of the largest connected component of G_new."""
    connected_components = sorted(nx.connected_components(G_new), key=len, reverse=True)
    G_new = G_new.subgraph(connected_components[0]).copy()
    return G_new

def return_giant_component_G_and_embeddings (G_new, node_embeddings, tokenizer=None, model=None, verbatim=False):
    """
    Reduce G_new to its giant component and bring node_embeddings in sync.

    BUG FIX: `tokenizer`, `model` and `verbatim` were previously read from
    undefined global names (NameError at runtime); they are now explicit
    parameters. The defaults keep old call sites syntactically valid, but
    tokenizer and model must be supplied for the embedding update to work.

    Returns:
        (G_new, node_embeddings) restricted to the giant component.
    """
    connected_components = sorted(nx.connected_components(G_new), key=len, reverse=True)
    G_new = G_new.subgraph(connected_components[0]).copy()
    node_embeddings = update_node_embeddings(node_embeddings, G_new, tokenizer, model, verbatim=verbatim)
    return G_new, node_embeddings

def extract_number(filename):
    """
    Return the first run of digits in `filename` as an int, or -1 if none.

    Used as a sort key so 'graph_10_...' sorts after 'graph_2_...'.
    """
    match = re.search(r'(\d+)', filename)
    return int(match.group(0)) if match else -1

def get_list_of_graphs_and_chunks (graph_q='graph_*_graph_clean.csv', chunk_q='graph_*_chunks_clean.csv', data_dir='./', verbatim=False):
    """
    Glob for graph/chunk CSV files under data_dir and return both lists,
    each sorted numerically by the first number in the file name.

    Args:
        graph_q, chunk_q: glob patterns for graph and chunk CSVs.
        data_dir: directory searched.
        verbatim: if True, print the first few matches of each list.

    Returns:
        (graph_file_list, chunk_file_list)
    """
    graph_pattern = os.path.join(data_dir, graph_q)
    chunk_pattern = os.path.join(data_dir, chunk_q)

    # Use glob to find all files matching the patterns
    graph_files = glob.glob(graph_pattern)
    chunk_files = glob.glob(chunk_pattern)

    # Sort the files using the custom numeric key
    graph_file_list = sorted(graph_files, key=extract_number)
    chunk_file_list = sorted(chunk_files, key=extract_number)

    if verbatim:
        # Print the lists to verify
        print ('\n'.join(graph_file_list[:10]), '\n\n', '\n'.join(chunk_file_list[:10]),'\n')

    print('# graph files:', len (graph_file_list))
    print('# chunk files:', len (chunk_file_list))

    return graph_file_list, chunk_file_list

def print_graph_nodes_with_texts(G, separator="; ", N=64):
    """
    Prints out each node in the graph along with the associated texts, concatenated into a single string.

    Parameters:
    - G: A NetworkX graph object where each node has a 'texts' attribute containing a list of texts.
    - separator: A string separator used to join texts. Default is "; ".
    - N: Only the first N characters of the concatenated texts are printed.
    """
    print("Graph Nodes and Their Associated Texts (Concatenated):")
    for node, data in G.nodes(data=True):
        texts = data.get('texts', [])
        concatenated_texts = separator.join(texts)
        print(f"Node: {node}, Texts: {concatenated_texts[:N]}")

def print_graph_nodes (G, separator="; ", N=64):
    """
    Print every node name in the graph with a running index.

    (Docstring fixed: it was previously copied verbatim from
    print_graph_nodes_with_texts. The `separator` and `N` parameters are
    retained for interface compatibility but are not used here.)
    """
    print("Graph Nodes and Their Associated Texts (Concatenated):")
    for i, node in enumerate(G.nodes):
        print(f"Node {i}: {node}")
def get_text_associated_with_node(G, node_identifier ='bone', ):
    """
    Print and return the concatenated 'texts' attribute of a node.

    Args:
        G: networkx graph whose nodes may carry a 'texts' list attribute.
        node_identifier: node whose texts are requested.

    Returns:
        The texts joined with '; ', or '' when the node has no 'texts' attribute.
    """
    # Accessing and printing the 'texts' attribute for the node
    if 'texts' in G.nodes[node_identifier]:
        texts = G.nodes[node_identifier]['texts']
        concatenated_texts = "; ".join(texts)  # Assuming you want to concatenate the texts
        print(f"Texts associated with node '{node_identifier}': {concatenated_texts}")
    else:
        print(f"No 'texts' attribute found for node {node_identifier}")
        concatenated_texts = ''
    return concatenated_texts

import networkx as nx
import json
from copy import deepcopy
from tqdm import tqdm

def save_graph_with_text_as_JSON(G_or, data_dir='./', graph_name='my_graph.graphml'):
    """
    Write a copy of the graph to GraphML, JSON-encoding any container-valued
    node/edge attributes (GraphML only supports scalar attribute values).

    Returns:
        The full path of the written file.
    """
    G = deepcopy(G_or)  # never mutate the caller's graph

    # Ensure correct path joining
    import os
    fname = os.path.join(data_dir, graph_name)

    for _, data in tqdm(G.nodes(data=True)):
        for key in data:
            if isinstance(data[key], (list, dict, set, tuple)):  # Extend this as needed
                data[key] = json.dumps(data[key])

    for _, _, data in tqdm(G.edges(data=True)):
        for key in data:
            if isinstance(data[key], (list, dict, set, tuple)):  # Extend this as needed
                data[key] = json.dumps(data[key])

    nx.write_graphml(G, fname)
    return fname

def load_graph_with_text_as_JSON(data_dir='./', graph_name='my_graph.graphml'):
    """
    Read a GraphML file written by save_graph_with_text_as_JSON and decode any
    string attribute that parses as JSON back into its container value.

    Returns:
        The loaded networkx graph.
    """
    # Ensure correct path joining
    import os
    fname = os.path.join(data_dir, graph_name)

    G = nx.read_graphml(fname)

    for node, data in tqdm(G.nodes(data=True)):
        for key, value in data.items():
            if isinstance(value, str):  # Only attempt to deserialize strings
                try:
                    data[key] = json.loads(value)
                except json.JSONDecodeError:
                    pass  # If the value is not a valid JSON string, do nothing

    for _, _, data in tqdm(G.edges(data=True)):
        for key, value in data.items():
            if isinstance(value, str):
                try:
                    data[key] = json.loads(value)
                except json.JSONDecodeError:
                    pass

    return G

from copy import deepcopy
import networkx as nx
from tqdm import tqdm
import os

def save_graph_without_text(G_or, data_dir='./', graph_name='my_graph.graphml'):
    """
    Write a copy of the graph to GraphML with all 'texts' attributes stripped
    and every remaining attribute stringified; edges get a unique string 'id'.

    Returns:
        The full path of the written file.
    """
    G = deepcopy(G_or)  # never mutate the caller's graph

    # Process nodes: remove 'texts' attribute and convert others to string
    for _, data in tqdm(G.nodes(data=True), desc="Processing nodes"):
        if 'texts' in data:
            del data['texts']  # Remove the 'texts' attribute
        # Convert all other attributes to strings
        for key in data:
            data[key] = str(data[key])

    # Process edges: similar approach, remove 'texts' and convert attributes
    for i, (_, _, data) in enumerate(tqdm(G.edges(data=True), desc="Processing edges")):
        data['id'] = str(i)  # Assign a unique ID
        if 'texts' in data:
            del data['texts']  # Remove the 'texts' attribute
        # Convert all other attributes to strings
        for key in data:
            data[key] = str(data[key])

    # Ensure correct directory path and file name handling
    fname = os.path.join(data_dir, graph_name)

    # Save the graph to a GraphML file
    nx.write_graphml(G, fname, edge_id_from_attribute='id')
    return fname

def print_nodes_and_labels (G, N=10):
    """
    Print the first N nodes and the first N edges (with their 'title'
    attribute) of graph G.

    BUG FIX: the `N` parameter was previously ignored — the count 10 was
    hard-coded in both loops and in the headers. N is now honored; the
    default N=10 preserves the old behavior.

    Returns:
        List of the formatted edge-description strings.
    """
    ch_list = []

    print(f"First {N} nodes:")
    for node in list(G.nodes())[:N]:
        print(node)

    print(f"\nFirst {N} edges with titles:")
    for (node1, node2, data) in list(G.edges(data=True))[:N]:
        edge_title = data.get('title')  # Replace 'title' with the attribute key you're interested in
        ch = f"Node labels: ({node1}, {node2}) - Title: {edge_title}"
        ch_list.append (ch)

        print (ch)

    return ch_list
# NOTE(review): the import lines below duplicate earlier imports in this module;
# kept as-is since other parts of the file may have been written against them.
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from pathlib import Path
from copy import deepcopy
import json
from tqdm import tqdm
import pandas as pd
import networkx as nx
import os
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
import json

def make_graph_from_text_withtext(graph_file_list, chunk_file_list,
                                  include_contextual_proximity=False,
                                  graph_root='graph_root',
                                  repeat_refine=0, verbatim=False,
                                  data_dir='./data_output_KG/',
                                  save_PDF=False, save_HTML=True, N_max=10,
                                  idx_start=0):
    """
    Constructs a graph from text data, ensuring edge labels do not incorrectly include node names.

    Parameters:
    - graph_file_list: CSV files with columns node_1, node_2, edge, chunk_id.
    - chunk_file_list: CSV files with columns chunk_id, text (aligned by index
      with graph_file_list).
    - include_contextual_proximity, graph_root, repeat_refine, data_dir,
      save_PDF, save_HTML: accepted for interface compatibility;
      not used in the current implementation.
    - verbatim: if True, print the shape of each aggregated edge DataFrame.
    - N_max: process at most this many graph files.
    - idx_start: index of the first file pair to process.

    Returns:
    - G_total: a networkx Graph whose edges carry 'chunk_id' (comma-joined),
      'title' (comma-joined edge labels) and 'weight', and whose nodes carry a
      'texts' list of the source chunk texts they appear in.
    """
    # Initialize an empty DataFrame to store all texts
    all_texts_df = pd.DataFrame()

    # Initialize an empty graph
    G_total = nx.Graph()

    for idx in tqdm(range(idx_start, min(len(graph_file_list), N_max)), desc="Processing graphs"):
        try:
            # Load graph and chunk data
            graph_df = pd.read_csv(graph_file_list[idx])
            text_df = pd.read_csv(chunk_file_list[idx])

            # Append the current text_df to the all_texts_df
            all_texts_df = pd.concat([all_texts_df, text_df], ignore_index=True)

            # Clean and aggregate the graph data
            graph_df.replace("", np.nan, inplace=True)
            graph_df.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
            graph_df['count'] = 4  # Example fixed count, adjust as necessary

            # Aggregate edges and combine attributes
            graph_df = (graph_df.groupby(["node_1", "node_2"])
                        .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
                        .reset_index())

            if verbatim:
                print("Shape of graph DataFrame: ", graph_df.shape)

            # Add edges to the graph (weight normalized by the fixed count of 4)
            for _, row in graph_df.iterrows():
                G_total.add_edge(row['node_1'], row['node_2'], chunk_id=row['chunk_id'],
                                 title=row['edge'], weight=row['count'] / 4)

        except Exception as e:
            # Best-effort: a malformed file pair is reported and skipped
            print(f"Error in graph generation for idx={idx}: {e}")

    # Ensure no duplicate chunk_id entries
    all_texts_df = all_texts_df.drop_duplicates(subset=['chunk_id'])

    # Map chunk_id to text
    chunk_id_to_text = pd.Series(all_texts_df.text.values, index=all_texts_df.chunk_id).to_dict()

    # Initialize node texts collection
    node_texts = {node: set() for node in G_total.nodes()}

    # Associate texts with nodes based on edges
    for (node1, node2, data) in tqdm(G_total.edges(data=True), desc="Mapping texts to nodes"):
        chunk_ids = data.get('chunk_id', '').split(',')
        for chunk_id in chunk_ids:
            text = chunk_id_to_text.get(chunk_id, "")
            if text:  # If text is found for the chunk_id
                node_texts[node1].add(text)
                node_texts[node2].add(text)

    # Update nodes with their texts
    for node, texts in node_texts.items():
        G_total.nodes[node]['texts'] = list(texts)  # Convert from set to list

    return G_total
import numpy as np
from tqdm import tqdm
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

def regenerate_node_embeddings(graph, nodes_to_recalculate, tokenizer, model):
    """
    Regenerate embeddings for specific nodes.

    Args:
    - graph: graph whose nodes are being re-embedded (only names are used).
    - nodes_to_recalculate: iterable of node names to embed.
    - tokenizer, model: HF-style pair; each node name's mean-pooled last
      hidden state becomes its embedding.

    Returns:
    - dict mapping node name -> numpy embedding.
    """
    new_embeddings = {}
    for node in tqdm(nodes_to_recalculate):
        inputs = tokenizer(node, return_tensors="pt")
        outputs = model(**inputs)
        new_embeddings[node] = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return new_embeddings

def simplify_graph_with_text(graph_, node_embeddings, tokenizer, model, similarity_threshold=0.9, use_llm=False,
                             data_dir_output='./', graph_root='simple_graph', verbatim=False, max_tokens=2048,
                             temperature=0.3, generate=None):
    """
    Simplifies a graph by merging similar nodes and optionally renaming them using a language model.
    Also, merges 'texts' node attribute ensuring no duplicates.

    Args:
    - graph_: input graph (deep-copied; not modified).
    - node_embeddings (dict): node name -> embedding array.
    - tokenizer, model: used to recalculate embeddings of kept nodes.
    - similarity_threshold (float): cosine-similarity cutoff for merging.
    - use_llm, max_tokens, temperature, generate: accepted for interface
      compatibility; the LLM-renaming path is not active here.
    - data_dir_output, graph_root: output location for the saved graph.
    - verbatim (bool): print progress messages.

    Returns:
    - (new_graph, updated_embeddings).
    """
    graph = deepcopy(graph_)

    nodes = list(node_embeddings.keys())
    embeddings_matrix = np.array([node_embeddings[node].flatten() for node in nodes])

    similarity_matrix = cosine_similarity(embeddings_matrix)
    to_merge = np.where(similarity_matrix > similarity_threshold)

    node_mapping = {}
    nodes_to_recalculate = set()
    merged_nodes = set()  # Keep track of nodes that have been merged
    if verbatim:
        print("Start...")
    for i, j in tqdm(zip(*to_merge), total=len(to_merge[0])):
        if i != j and nodes[i] not in merged_nodes and nodes[j] not in merged_nodes:  # Check for duplicates
            node_i, node_j = nodes[i], nodes[j]

            try:
                # Keep the better-connected node; merge the other into it
                if graph.degree(node_i) >= graph.degree(node_j):
                    node_to_keep, node_to_merge = node_i, node_j
                else:
                    node_to_keep, node_to_merge = node_j, node_i

                # Handle 'texts' attribute by merging and removing duplicates
                texts_to_keep = set(graph.nodes[node_to_keep].get('texts', []))
                texts_to_merge = set(graph.nodes[node_to_merge].get('texts', []))
                merged_texts = list(texts_to_keep.union(texts_to_merge))
                graph.nodes[node_to_keep]['texts'] = merged_texts

                if verbatim:
                    print("Node to keep and merge:", node_to_keep, "<--", node_to_merge)

                node_mapping[node_to_merge] = node_to_keep
                nodes_to_recalculate.add(node_to_keep)
                merged_nodes.add(node_to_merge)  # Mark the merged node to avoid duplicate handling
            except Exception as e:
                # e.g. a node present in the embeddings but absent from the graph
                print("Error during merging:", e)
    if verbatim:
        print ("Now relabel. ")
    # Create the simplified graph by relabeling nodes.
    new_graph = nx.relabel_nodes(graph, node_mapping, copy=True)
    if verbatim:
        print ("New graph generated, nodes relabled. ")
    # Recalculate embeddings for nodes that have been merged or renamed.
    recalculated_embeddings = regenerate_node_embeddings(new_graph, nodes_to_recalculate, tokenizer, model)
    if verbatim:
        print ("Relcaulated embeddings... ")
    # Update the embeddings dictionary with the recalculated embeddings.
    updated_embeddings = {**node_embeddings, **recalculated_embeddings}
    if verbatim:
        print ("Done recalculate embeddings... ")

    # Remove embeddings for nodes that no longer exist in the graph.
    for node in merged_nodes:
        updated_embeddings.pop(node, None)
    if verbatim:
        print ("Now save graph... ")

    # Save the simplified graph (texts serialized as JSON attribute values).
    # NOTE(review): the verbatim message below prints graph_path without the
    # output directory; save_graph_with_text_as_JSON joins it with data_dir_output.
    graph_path = f'{graph_root}_graphML_simplified_JSON.graphml'
    save_graph_with_text_as_JSON (new_graph, data_dir=data_dir_output, graph_name=graph_path)

    if verbatim:
        print(f"Graph simplified and saved to {graph_path}")

    return new_graph, updated_embeddings
--------------------------------------------------------------------------------
/GraphReasoning/openai_tools.py:
--------------------------------------------------------------------------------
from openai import OpenAI
import base64
import requests
from datetime import datetime
from GraphReasoning.graph_tools import *

from GraphReasoning.utils import *
from GraphReasoning.graph_analysis import *

import openai

def generate_OpenAIGPT ( system_prompt='You are a materials scientist.', prompt="Decsribe the best options to design abrasive materials.",
                         temperature=0.2,max_tokens=2048,timeout=120,

                         frequency_penalty=0,
                         presence_penalty=0,
                         top_p=1.,
                         openai_api_key='',gpt_model='gpt-4-vision-preview', organization='',
                         ):
    """
    Send a system+user prompt pair to the OpenAI chat-completions API and
    return the assistant's reply text.

    NOTE(review): the default `prompt` contains the typo "Decsribe"; left
    unchanged here because it is a runtime default value.

    Args:
    - system_prompt / prompt: system and user messages.
    - temperature, max_tokens, timeout, frequency_penalty, presence_penalty,
      top_p: forwarded to client.chat.completions.create.
    - openai_api_key, organization: OpenAI client credentials.
    - gpt_model: model name to query.

    Returns:
    - str: content of the first returned choice.
    """
    client = openai.OpenAI(api_key=openai_api_key,
                           organization =organization)

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": prompt,
            }
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        model=gpt_model,
        timeout=timeout,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        top_p=top_p,
    )
    return chat_completion.choices[0].message.content


def reason_over_image_OpenAI (system_prompt='You are a scientist.', prompt='Carefully analyze this graph. Be creative and synthesize new research ideas to build sustainable mycelium materials.',
                              image_path='IMAGES/H1000_E_bridggingcentrality_alt_2.png',
                              temperature=0.2,max_tokens=2048,timeout=120,
                              frequency_penalty=0,
                              presence_penalty=0, openai_api_key='',gpt_model='gpt-4-vision-preview', organization='',
                              top_p=1.,
                              verbatim=False,
                              ):
    """
    Send a text prompt plus a local image (base64-encoded as a data URL) to an
    OpenAI vision-capable chat model via the raw HTTP API and return the reply.

    Args:
    - system_prompt / prompt: system and user messages.
    - image_path: path of the local image file to attach.
    - temperature (accepted but not forwarded in the current payload),
      max_tokens, frequency/presence penalties, top_p, timeout: sampling options;
      only max_tokens is included in the request payload.
    - openai_api_key, gpt_model, organization: API credentials/model.
    - verbatim: if True, print the prompt and display the rendered reply
      (requires IPython's display/Markdown to be available in scope).

    Returns:
    - str: content of the first returned choice.
    """
    if verbatim:
        print ("Prompt: ", prompt)
    def encode_image(image_path):
        # Read the image bytes and return their base64 text encoding
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    # Getting the base64 string
    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": gpt_model,
        "messages": [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # NOTE(review): always labeled jpeg even for PNG input;
                            # the API appears tolerant, but confirm if issues arise.
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],

        "max_tokens":max_tokens,

    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload,)

    if verbatim:
        display (Markdown(response.json()['choices'][0]['message']['content']))

    return response.json()['choices'][0]['message']['content']
104 | 105 | 106 | def reason_over_image_and_graph_via_triples (path_graph, generate, image_path='', 107 | keyword_1 = "music and sound", 108 | # local_llm=None, 109 | keyword_2 = "apples",include_keywords_as_nodes=True, 110 | inst_prepend='', 111 | 112 | instruction = 'Now, reason over them and propose a research hypothesis.', 113 | verbatim=False, 114 | N_limit=None,temperature=0.3, 115 | keywords_separator=' --> ',system_prompt='You are a scientist who uses logic and reasoning.', 116 | max_tokens=4096,prepend='You are given a set of information from a graph that describes the relationship between materials, structure, properties, and properties. You analyze these logically through reasoning.\n\n', 117 | 118 | save_files=True,data_dir='./',visualize_paths_as_graph=True, display_graph=True,words_per_line=2, 119 | ): 120 | 121 | print ("Reason over graph and image: ", image_path) 122 | 123 | make_dir_if_needed(data_dir) 124 | task=inst_prepend+'' 125 | 126 | 127 | join_strings = lambda strings: '\n'.join(strings) 128 | join_strings_newline = lambda strings: '\n'.join(strings) 129 | 130 | node_list=print_node_pairs_edge_title(path_graph) 131 | if N_limit != None: 132 | node_list=node_list[:N_limit] 133 | 134 | if verbatim: 135 | print ("Node list: ", node_list) 136 | 137 | 138 | if include_keywords_as_nodes: 139 | task=task+f"The following is a graph provided from an analysis of relationships between the concepts of {keyword_1} and {keyword_2}.\n\n" 140 | task=task+f"Consider this list of nodes and relations in a knowledge graph:\n\nFormat: node_1, relationship, node_2\n\nThe data is:\n\n{join_strings_newline( node_list)}\n\n" 141 | 142 | task=task+f"{instruction}" 143 | 144 | if verbatim: 145 | print ( "###############################################################\nTASK:\n", task) 146 | 147 | 148 | response=generate(system_prompt=system_prompt, 149 | prompt=task, max_tokens=max_tokens, temperature=temperature,image_path=image_path,) 150 | 151 | if verbatim: 152 | 
display(Markdown("**Response:** "+response )) 153 | 154 | return response , path_graph, fname, graph_GraphML 155 | 156 | 157 | from openai import OpenAI # OpenAI Python library to make API calls 158 | import requests # used to download images 159 | import os # used to access filepaths 160 | from PIL import Image # used to print and edit images 161 | import base64 162 | from IPython.display import display, Image 163 | import json 164 | 165 | def develop_prompt_from_text_and_generate_image (response, generate_OpenAIGPT, image_dir_name='./image_temp/', number_imgs=1, 166 | size="1024x1024",show_img=True,max_tokens=2048,temperature=0.3, 167 | quality='hd', style='vivid', direct_prompt=None, openai_api_key='', 168 | gpt_model='gpt-4-0125-preview', organization='', dalle_model="dall-e-3", 169 | system_prompt="You make prompts for DALLE-3." 170 | ): 171 | 172 | 173 | image_dir = os.path.join(os.curdir, image_dir_name) 174 | make_dir_if_needed(image_dir) 175 | img_list=[] 176 | if direct_prompt == None: 177 | task=f'''Consider this description of a novel material: {response} 178 | 179 | Develop a well-constructed, detailed and clear prompt for DALLE-3 that allows me to visualize the new material design. 180 | 181 | The prompt should be written such that the resulting image presents a clear reflection of the material's real microstructure and key features. Make sure that the resulting image does NOT include any text. 
182 | ''' 183 | 184 | response=generate_OpenAIGPT(system_prompt=system_prompt, #local_llm=local_llm, 185 | prompt=task, max_tokens=max_tokens, temperature=temperature, ) 186 | display (Markdown("Image gen prompt:\n\n"+response)) 187 | else: 188 | response=direct_prompt 189 | display (Markdown("Image gen prompt already provided:\n\n"+response)) 190 | 191 | # set a directory to save DALL·E images to 192 | 193 | client = openai.OpenAI(api_key=openai_api_key, 194 | organization =organization) 195 | generation_response = client.images.generate( 196 | model = dalle_model, 197 | prompt=response, 198 | n=number_imgs, 199 | style=style, 200 | quality=quality, 201 | size=size, 202 | 203 | response_format="b64_json", 204 | ) 205 | 206 | 207 | for index, image_dict in enumerate(generation_response.data): 208 | image_data = base64.b64decode(image_dict.b64_json) 209 | 210 | # Get the current time 211 | time_part = datetime.now().strftime("%Y%m%d_%H%M%S") 212 | 213 | image_file = os.path.join(image_dir_name, f"generated_image_{time_part}_{response[:32]}_{index}.png") 214 | with open(image_file, mode="wb") as png: 215 | png.write(image_data) 216 | display(Image(data=image_data)) 217 | 218 | return img_list 219 | 220 | 221 | ########## Chat-like interaction with OpenAI models, text or images ########## 222 | 223 | import base64 224 | import requests 225 | import json 226 | from transformers.image_utils import load_image 227 | 228 | # Function to encode the image 229 | def encode_image(image_path): 230 | with open(image_path, "rb") as image_file: 231 | return base64.b64encode(image_file.read()).decode('utf-8') 232 | from io import BytesIO 233 | 234 | def is_url(val) -> bool: 235 | return isinstance(val, str) and val.startswith("http") 236 | 237 | def get_answer( query='What is shown in this image?',model="gpt-4o", 238 | image=None, payload=None, max_tokens=1024, temperature=0.1, 239 | top_p=0.95, top_k=40, init_instr = "Look at this image: ", 240 | display_image=False, system='You 
are a helpful assistant.' 241 | ): 242 | 243 | base64_image=None 244 | if image != None: 245 | if is_url(image): 246 | image= load_image(image) 247 | else: 248 | image= load_image(image) 249 | 250 | if display_image: 251 | display (image) 252 | 253 | # Convert the image to a byte array 254 | buffered = BytesIO() 255 | image.save(buffered, format="PNG") # Use the appropriate format for your image 256 | img_byte_array = buffered.getvalue() 257 | 258 | # Encode the byte array into a base64 string 259 | base64_image = base64.b64encode(img_byte_array).decode("utf-8") 260 | 261 | headers = { 262 | "Content-Type": "application/json", 263 | "Authorization": f"Bearer {api_key}" 264 | } 265 | 266 | if payload==None: 267 | if base64_image!=None: 268 | payload = { 269 | "model": model, 270 | "messages": [{ 271 | "role": "system", 272 | "content": [ { 273 | "type": "text", 274 | "text": system 275 | }, ] 276 | }, 277 | { 278 | "role": "user", 279 | "content": [ 280 | { 281 | "type": "text", 282 | "text": query 283 | }, 284 | { 285 | "type": "image_url", 286 | "image_url": { 287 | "url": f"data:image/jpeg;base64,{base64_image}" 288 | } 289 | } 290 | ] 291 | } 292 | ], 293 | "max_tokens": max_tokens 294 | } 295 | else: 296 | 297 | payload = { 298 | "model": model, 299 | "messages": [ 300 | { 301 | "role": "system", 302 | "content": [ { 303 | "type": "text", 304 | "text": system 305 | }, ] 306 | }, 307 | { 308 | "role": "user", 309 | "content": [ { 310 | "type": "text", 311 | "text": query 312 | }, ] 313 | } 314 | ], 315 | "max_tokens": max_tokens 316 | } 317 | 318 | else: 319 | payload['messages'].append ({"role":"user", "content": [ 320 | {"type": "text", "text": query} ] }, ) 321 | 322 | response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) 323 | response_dict = response.json() 324 | message_content = response_dict['choices'][0]['message']['content'] 325 | 326 | payload['messages'].append ({"role":"assistant", "content": [ 327 | 
{"type": "text", "text": message_content} ] }, ) 328 | 329 | return message_content, payload 330 | ''' 331 | answer, payload= get_answer( query='What is graphene?', payload=None, image= None, 332 | display_image=True, model="gpt-4o-mini") 333 | answer, payload 334 | ''' 335 | ''' 336 | answer, payload= get_answer( query='What do you see?', payload=None, image= "1920px-Spiderweb_with_frost.jpg", 337 | display_image=True, model="gpt-4o-mini") 338 | answer 339 | ''' 340 | -------------------------------------------------------------------------------- /GraphReasoning/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | def contains_phrase(main_string, phrase): 5 | return phrase in main_string 6 | 7 | def make_dir_if_needed (dir_path): 8 | if not os.path.exists(dir_path): 9 | # Create directory 10 | os.makedirs(dir_path) 11 | return "Directory created." 12 | else: 13 | return "Directory already exists." 14 | 15 | def remove_markdown_symbols(text): 16 | # Remove links 17 | text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) 18 | # Remove images 19 | text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text) 20 | # Remove headers 21 | text = re.sub(r'#+\s', '', text) 22 | # Remove bold and italic 23 | text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text) 24 | text = re.sub(r'\*([^*]+)\*', r'\1', text) 25 | text = re.sub(r'__([^_]+)__', r'\1', text) 26 | text = re.sub(r'_([^_]+)_', r'\1', text) 27 | # Remove inline code 28 | text = re.sub(r'`([^`]+)`', r'\1', text) 29 | # Remove blockquotes 30 | text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE) 31 | # Remove strikethrough 32 | text = re.sub(r'~~(.*?)~~', r'\1', text) 33 | # Remove code blocks 34 | text = re.sub(r'```.*?```', '', text, flags=re.DOTALL) 35 | # Remove extra newlines 36 | text = re.sub(r'\n\s*\n', '\n\n', text) 37 | # Remove list markers 38 | text = re.sub(r'^[\*\-\+]\s+', '', text, flags=re.MULTILINE) 39 | text = re.sub(r'^\d+\.\s+', '', text, 
flags=re.MULTILINE) 40 | 41 | return text.strip() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Notebooks/GraphReasoning - Graph Reasoning with LLM - BioMixtral.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5d624278-209f-4b2c-871c-6a9b6f4e2649", 6 | "metadata": {}, 7 | "source": [ 8 | "# GraphReasoning: Scientific Discovery through Knowledge Extraction and Multimodal Graph-based Representation and Reasoning\n", 9 | "\n", 10 | "Markus J. 
Buehler, MIT, 2024 mbuehler@MIT.EDU\n", 11 | "\n", 12 | "### Example: GraphReasoning: Loading graph and graph analysis" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 9, 18 | "id": "2f9fc97f-9cc2-4728-9398-c5be7a2331ff", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "#os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 24 | "device='cuda'" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 10, 30 | "id": "f35cad71-ccc5-47d8-8701-aa9b766f0d9b", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "from tqdm.notebook import tqdm\n", 35 | "from IPython.display import display, Markdown\n", 36 | "from huggingface_hub import hf_hub_download\n", 37 | "from GraphReasoning import *" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "c6035b88-6be6-477a-b795-e9435e25f017", 43 | "metadata": {}, 44 | "source": [ 45 | "### Load graph and embeddings " 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 12, 51 | "id": "029dff92-f063-4ae2-b9af-b49fc6879295", 52 | "metadata": { 53 | "scrolled": true 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "File downloaded at: ././GRAPHDATA/BioGraph.graphml\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "#Hugging Face repo\n", 66 | "repository_id = \"lamm-mit/GraphReasoning\"\n", 67 | "data_dir='./GRAPHDATA' \n", 68 | "\n", 69 | "data_dir_output='./GRAPHDATA_OUTPUT/'\n", 70 | "\n", 71 | "graph_name='BioGraph.graphml'\n", 72 | "\n", 73 | "make_dir_if_needed(data_dir)\n", 74 | "make_dir_if_needed(data_dir_output)\n", 75 | "\n", 76 | "tokenizer_model=\"BAAI/bge-large-en-v1.5\"\n", 77 | "\n", 78 | "embedding_tokenizer = AutoTokenizer.from_pretrained(tokenizer_model, ) \n", 79 | "embedding_model = AutoModel.from_pretrained(tokenizer_model, ) \n", 80 | "\n", 81 | "filename = f\"{data_dir}/{graph_name}\"\n", 82 | "file_path = hf_hub_download(repo_id=repository_id, 
filename=filename, local_dir='./')\n", 83 | "print(f\"File downloaded at: {file_path}\")\n", 84 | "\n", 85 | "graph_name=f'{data_dir}/{graph_name}'\n", 86 | "G = nx.read_graphml(graph_name)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 13, 92 | "id": "c6bdfe1e-5641-4762-a3b0-e42e28359f7d", 93 | "metadata": { 94 | "scrolled": true 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "File downloaded at: ././GRAPHDATA/BioGraph_embeddings_ge-large-en-v1.5.pkl\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "embedding_file='BioGraph_embeddings_ge-large-en-v1.5.pkl'\n", 107 | "\n", 108 | "generate_new_embeddings=False\n", 109 | "if generate_new_embeddings:\n", 110 | " node_embeddings = generate_node_embeddings(G, embedding_tokenizer, embedding_model, )\n", 111 | " save_embeddings(node_embeddings, f'{data_dir}/{embedding_file}')\n", 112 | " \n", 113 | "else:\n", 114 | " filename = f\"{data_dir}/{embedding_file}\"\n", 115 | " file_path = hf_hub_download(repo_id=repository_id, filename=filename, local_dir='./')\n", 116 | " print(f\"File downloaded at: {file_path}\")\n", 117 | "\n", 118 | " node_embeddings = load_embeddings(f'{data_dir}/{embedding_file}')" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "a1f256c6-b48d-4cc1-b5b4-efd78ed7e107", 124 | "metadata": {}, 125 | "source": [ 126 | "### Load LLM: BioMixtral" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 17, 132 | "id": "934fcdd6-318e-4739-b583-4a74da032e8e", 133 | "metadata": { 134 | "scrolled": true 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stderr", 139 | "output_type": "stream", 140 | "text": [ 141 | "llama_model_loader: loaded meta data with 24 key-value pairs and 995 tensors from ./models/ggml-model-q5_K_M.gguf (version GGUF V3 (latest))\n", 142 | "llama_model_loader: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n", 143 | "llama_model_loader: - kv 0: general.architecture str = llama\n", 144 | "llama_model_loader: - kv 1: general.name str = .\n", 145 | "llama_model_loader: - kv 2: llama.context_length u32 = 32768\n", 146 | "llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n", 147 | "llama_model_loader: - kv 4: llama.block_count u32 = 32\n", 148 | "llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336\n", 149 | "llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n", 150 | "llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n", 151 | "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8\n", 152 | "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n", 153 | "llama_model_loader: - kv 10: llama.expert_count u32 = 8\n", 154 | "llama_model_loader: - kv 11: llama.expert_used_count u32 = 2\n", 155 | "llama_model_loader: - kv 12: llama.rope.freq_base f32 = 1000000.000000\n", 156 | "llama_model_loader: - kv 13: general.file_type u32 = 17\n", 157 | "llama_model_loader: - kv 14: tokenizer.ggml.model str = llama\n", 158 | "llama_model_loader: - kv 15: tokenizer.ggml.tokens arr[str,32000] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n", 159 | "llama_model_loader: - kv 16: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n", 160 | "llama_model_loader: - kv 17: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", 161 | "llama_model_loader: - kv 18: tokenizer.ggml.bos_token_id u32 = 1\n", 162 | "llama_model_loader: - kv 19: tokenizer.ggml.eos_token_id u32 = 2\n", 163 | "llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n", 164 | "llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n", 165 | "llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n", 166 | "llama_model_loader: - kv 23: 
general.quantization_version u32 = 2\n", 167 | "llama_model_loader: - type f32: 65 tensors\n", 168 | "llama_model_loader: - type f16: 32 tensors\n", 169 | "llama_model_loader: - type q8_0: 64 tensors\n", 170 | "llama_model_loader: - type q5_K: 833 tensors\n", 171 | "llama_model_loader: - type q6_K: 1 tensors\n", 172 | "llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n", 173 | "llm_load_print_meta: format = GGUF V3 (latest)\n", 174 | "llm_load_print_meta: arch = llama\n", 175 | "llm_load_print_meta: vocab type = SPM\n", 176 | "llm_load_print_meta: n_vocab = 32000\n", 177 | "llm_load_print_meta: n_merges = 0\n", 178 | "llm_load_print_meta: n_ctx_train = 32768\n", 179 | "llm_load_print_meta: n_embd = 4096\n", 180 | "llm_load_print_meta: n_head = 32\n", 181 | "llm_load_print_meta: n_head_kv = 8\n", 182 | "llm_load_print_meta: n_layer = 32\n", 183 | "llm_load_print_meta: n_rot = 128\n", 184 | "llm_load_print_meta: n_embd_head_k = 128\n", 185 | "llm_load_print_meta: n_embd_head_v = 128\n", 186 | "llm_load_print_meta: n_gqa = 4\n", 187 | "llm_load_print_meta: n_embd_k_gqa = 1024\n", 188 | "llm_load_print_meta: n_embd_v_gqa = 1024\n", 189 | "llm_load_print_meta: f_norm_eps = 0.0e+00\n", 190 | "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", 191 | "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", 192 | "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", 193 | "llm_load_print_meta: f_logit_scale = 0.0e+00\n", 194 | "llm_load_print_meta: n_ff = 14336\n", 195 | "llm_load_print_meta: n_expert = 8\n", 196 | "llm_load_print_meta: n_expert_used = 2\n", 197 | "llm_load_print_meta: causal attn = 1\n", 198 | "llm_load_print_meta: pooling type = 0\n", 199 | "llm_load_print_meta: rope type = 0\n", 200 | "llm_load_print_meta: rope scaling = linear\n", 201 | "llm_load_print_meta: freq_base_train = 1000000.0\n", 202 | "llm_load_print_meta: freq_scale_train = 1\n", 203 | "llm_load_print_meta: n_yarn_orig_ctx = 32768\n", 204 | "llm_load_print_meta: rope_finetuned 
= unknown\n", 205 | "llm_load_print_meta: ssm_d_conv = 0\n", 206 | "llm_load_print_meta: ssm_d_inner = 0\n", 207 | "llm_load_print_meta: ssm_d_state = 0\n", 208 | "llm_load_print_meta: ssm_dt_rank = 0\n", 209 | "llm_load_print_meta: model type = 7B\n", 210 | "llm_load_print_meta: model ftype = Q5_K - Medium\n", 211 | "llm_load_print_meta: model params = 46.70 B\n", 212 | "llm_load_print_meta: model size = 30.02 GiB (5.52 BPW) \n", 213 | "llm_load_print_meta: general.name = .\n", 214 | "llm_load_print_meta: BOS token = 1 ''\n", 215 | "llm_load_print_meta: EOS token = 2 ''\n", 216 | "llm_load_print_meta: UNK token = 0 ''\n", 217 | "llm_load_print_meta: LF token = 13 '<0x0A>'\n", 218 | "ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no\n", 219 | "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n", 220 | "ggml_cuda_init: found 2 CUDA devices:\n", 221 | " Device 0: NVIDIA RTX A6000, compute capability 8.6, VMM: yes\n", 222 | " Device 1: NVIDIA RTX A6000, compute capability 8.6, VMM: yes\n", 223 | "llm_load_tensors: ggml ctx size = 1.43 MiB\n", 224 | "llm_load_tensors: offloading 32 repeating layers to GPU\n", 225 | "llm_load_tensors: offloading non-repeating layers to GPU\n", 226 | "llm_load_tensors: offloaded 33/33 layers to GPU\n", 227 | "llm_load_tensors: CUDA_Host buffer size = 85.94 MiB\n", 228 | "llm_load_tensors: CUDA0 buffer size = 16228.09 MiB\n", 229 | "llm_load_tensors: CUDA1 buffer size = 14421.46 MiB\n", 230 | "....................................................................................................\n", 231 | "llama_new_context_with_model: n_ctx = 10016\n", 232 | "llama_new_context_with_model: n_batch = 512\n", 233 | "llama_new_context_with_model: n_ubatch = 512\n", 234 | "llama_new_context_with_model: freq_base = 1000000.0\n", 235 | "llama_new_context_with_model: freq_scale = 1\n", 236 | "llama_kv_cache_init: CUDA0 KV buffer size = 665.12 MiB\n", 237 | "llama_kv_cache_init: CUDA1 KV buffer size = 586.88 MiB\n", 238 | "llama_new_context_with_model: KV self size 
= 1252.00 MiB, K (f16): 626.00 MiB, V (f16): 626.00 MiB\n", 239 | "llama_new_context_with_model: CUDA_Host output buffer size = 0.12 MiB\n", 240 | "llama_new_context_with_model: pipeline parallelism enabled (n_copies=4)\n", 241 | "llama_new_context_with_model: CUDA0 compute buffer size = 788.29 MiB\n", 242 | "llama_new_context_with_model: CUDA1 compute buffer size = 788.30 MiB\n", 243 | "llama_new_context_with_model: CUDA_Host compute buffer size = 86.27 MiB\n", 244 | "llama_new_context_with_model: graph nodes = 1638\n", 245 | "llama_new_context_with_model: graph splits = 3\n", 246 | "AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 1 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n", 247 | "Model metadata: {'tokenizer.chat_template': \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}\", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.eos_token_id': '2', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '17', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8', 'llama.expert_count': '8', 'llama.context_length': '32768', 
'general.name': '.', 'llama.expert_used_count': '2'}\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "from llama_cpp import Llama\n", 253 | "import llama_cpp\n", 254 | "\n", 255 | "#m\n", 256 | "repository_id='lamm-mit/BioinspiredMixtral'\n", 257 | "filename='ggml-model-q5_K_M.gguf'\n", 258 | "file_path = hf_hub_download(repo_id=repository_id, filename=filename, local_dir='./models/')\n", 259 | "\n", 260 | "chat_format=\"mistral-instruct\"\n", 261 | "\n", 262 | "llm = Llama(model_path=file_path,\n", 263 | " n_gpu_layers=-1,verbose= True, #False,#False,\n", 264 | " n_ctx=10000,\n", 265 | " main_gpu=0,\n", 266 | " chat_format=chat_format,\n", 267 | " )" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 16, 273 | "id": "244a2bc7-7ce0-4fc1-8982-af957f2a011c", 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "'./models/ggml-model-q5_K_M.gguf'" 280 | ] 281 | }, 282 | "execution_count": 16, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "file_path" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 18, 294 | "id": "344c2448-bb74-47ca-a74b-6bbb31c80008", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "def generate_BioMixtral (system_prompt='You are a biomaterials cientist.', \n", 299 | " prompt=\"What is spider silk?\",temperature=0.333,\n", 300 | " max_tokens=10000, \n", 301 | " ):\n", 302 | "\n", 303 | " if system_prompt==None:\n", 304 | " messages=[\n", 305 | " {\"role\": \"user\", \"content\": prompt},\n", 306 | " ]\n", 307 | " else:\n", 308 | " messages=[\n", 309 | " {\"role\": \"system\", \"content\": system_prompt, },\n", 310 | " {\"role\": \"user\", \"content\": prompt},\n", 311 | " ]\n", 312 | "\n", 313 | " result=llm.create_chat_completion(\n", 314 | " messages=messages,\n", 315 | " temperature=temperature,\n", 316 | " max_tokens=max_tokens,\n", 317 | " )\n", 318 | " return 
result['choices'][0]['message']['content']\n", 319 | " " 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 20, 325 | "id": "f825d8a6-ff2c-48cb-ada5-acfacee3bc0b", 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stderr", 330 | "output_type": "stream", 331 | "text": [ 332 | "Llama.generate: prefix-match hit\n", 333 | "\n", 334 | "llama_print_timings: load time = 5546.25 ms\n", 335 | "llama_print_timings: sample time = 70.73 ms / 152 runs ( 0.47 ms per token, 2149.02 tokens per second)\n", 336 | "llama_print_timings: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\n", 337 | "llama_print_timings: eval time = 8868.12 ms / 152 runs ( 58.34 ms per token, 17.14 tokens per second)\n", 338 | "llama_print_timings: total time = 9294.25 ms / 153 tokens\n" 339 | ] 340 | }, 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | " Graphene is a single layer of carbon atoms arranged in a hexagonal lattice, with each atom bonded to three neighboring atoms through strong covalent bonds. It has unique mechanical, electrical, and thermal properties that make it an attractive material for various applications. Graphene can be synthesized using different methods, such as chemical vapor deposition (CVD), epitaxial growth on silicon carbide (SiC), and reduction of graphene oxide (GO). The mechanical properties of graphene are influenced by the presence of defects, such as vacancies, grain boundaries, and functional groups. These defects can affect the strength, toughness, and conductivity of graphene.\n", 346 | "--- 9.303527116775513 seconds ---\n" 347 | ] 348 | }, 349 | { 350 | "data": { 351 | "text/markdown": [ 352 | " Graphene is a single layer of carbon atoms arranged in a hexagonal lattice, with each atom bonded to three neighboring atoms through strong covalent bonds. 
It has unique mechanical, electrical, and thermal properties that make it an attractive material for various applications. Graphene can be synthesized using different methods, such as chemical vapor deposition (CVD), epitaxial growth on silicon carbide (SiC), and reduction of graphene oxide (GO). The mechanical properties of graphene are influenced by the presence of defects, such as vacancies, grain boundaries, and functional groups. These defects can affect the strength, toughness, and conductivity of graphene." 353 | ], 354 | "text/plain": [ 355 | "" 356 | ] 357 | }, 358 | "metadata": {}, 359 | "output_type": "display_data" 360 | } 361 | ], 362 | "source": [ 363 | "q='''What is graphene?'''\n", 364 | "start_time = time.time()\n", 365 | "res=generate_BioMixtral( system_prompt='You design materials.', \n", 366 | " prompt=q, max_tokens=1024, temperature=0.3, )\n", 367 | "\n", 368 | "print (res)\n", 369 | "deltat=time.time() - start_time\n", 370 | "print(\"--- %s seconds ---\" % deltat)\n", 371 | "display (Markdown(res))" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 24, 377 | "id": "36872d4c-ea8c-428d-b32b-0987ccefc844", 378 | "metadata": { 379 | "scrolled": true 380 | }, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "0nth best fitting node for 'collagen': 'collagen' with similarity: 1\n", 387 | "0nth best fitting node for 'copper': 'copper' with similarity: 1\n", 388 | "Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. 
Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n", 389 | "./GRAPHDATA_OUTPUT//shortest_path_2hops_collagen_copper.html\n", 390 | "HTML visualization: ./GRAPHDATA_OUTPUT//shortest_path_2hops_collagen_copper.html\n", 391 | "GraphML file: ./GRAPHDATA_OUTPUT//shortestpath_2hops_collagen_copper.graphml\n", 392 | "You are given a set of information from a graph that describes the relationship \n", 393 | " between materials, structure, properties, and properties. You analyze these logically \n", 394 | " through reasoning.\n", 395 | "\n", 396 | "### Consider this list of nodes and relations in a knowledge graph:\n", 397 | "\n", 398 | "Format: node_1, relationship, node_2\n", 399 | "\n", 400 | "The data is:\n", 401 | "\n", 402 | "biomimetic injectable hydrogels, can be processed into, collagen\n", 403 | "highly porous collagen strands, promote, collagen\n", 404 | "poisson's ratio, has, collagen\n", 405 | "complex hierarchical microstructure, Has, collagen\n", 406 | "collagen's mechanical properties, are physiologically relevant due to its role in providing structural support to organisms, making it a crucial component of bone., collagen\n", 407 | "biological integration, promotes, collagen\n", 408 | "biological integration, have, nanohydroxyapatite\n", 409 | "connective tissue, Found In, collagen\n", 410 | "x-ray diffraction, has been used to determine the precise atomic conformation of, collagen\n", 411 | "soft tissues, provides tensile strength, collagen\n", 412 | "tissue regeneration and growth promotion, used for, tissue engineering scaffolds\n", 413 | "fibronectin, collagen provides strength and stiffness while fibronectin facilitates cell adhesion and migration, collagen\n", 414 | "fibronectin, Protein that is a major component of ECM and facilitates cell adhesion and migration., ecm\n", 415 | "fibronectin, fibronectin facilitates cell adhesion and migration while proteoglycans provide tissue hydration and 
lubrication, proteoglycans\n", 416 | "fibronectin, fibronectin facilitates cell adhesion and migration while laminins form a structural barrier and interact with other extracellular matrix proteins, laminins\n", 417 | "chitosan, can be processed into, collagen\n", 418 | "chitosan, can improve, mechanical properties\n", 419 | "chitosan, is being blended with, pva\n", 420 | "8:1:1 ratio of collagen, phosphate-buffered saline (pbs), and sodium hydroxide (naoh), is neutralized using, collagen\n", 421 | "cohesive traction-separation relationship, applied at microscopic level, collagen\n", 422 | "nanocomposites, Nanocomposites have improved mechanical properties., mechanical properties\n", 423 | "nanocomposites, Nanocomposites can contain collagen., collagen\n", 424 | "nanocomposites, can be used as matrix materials for, nanoparticles\n", 425 | "nanocomposites, provide environmental benefits,are renewable, biodegradable, and green, biopolymers\n", 426 | "nanocomposites, subclass of, polymer nanocomposites\n", 427 | "nanocomposites, Nanocomposite design is used for, porcupine fish spines\n", 428 | "natural biomaterials, Are derived from natural sources., tissue engineering scaffolds\n", 429 | "combination of natural biomaterials and synthetic polymers, Combine natural and synthetic biomaterials., tissue engineering scaffolds\n", 430 | "nanohydroxyapatite, have, biocompatibility\n", 431 | "nanohydroxyapatite, can be processed into, collagen\n", 432 | "poor physical and chemical properties, Suffers From, collagen\n", 433 | "surface treatment, type and extent, nanoparticles\n", 434 | "ecm, Overarching term for biological materials such as extracellular matrix (ECM) and its constituent proteins., biological materials\n", 435 | "ecm, organize into, collagen fibrils\n", 436 | "ecm, Protein that is a major component of ECM and plays a crucial role in cell adhesion and tissue organization., collagen\n", 437 | "ecm, composed of, structural proteins\n", 438 | "ecm, Protein that is 
a major component of ECM and provides a scaffold for cell adhesion and migration., laminins\n", 439 | "free radicals, produce, nanoparticles\n", 440 | "nanocomposite hydrogels, which can enhance, nanoparticles\n", 441 | "potential side effects, further research is necessary to fully comprehend, nanoparticles\n", 442 | "bone healing, Necessary for, nanoparticles\n", 443 | "genipin, Used As Crosslinking Method, collagen\n", 444 | "material modification, Requires During Preparation, collagen\n", 445 | "silver, Such as, nanoparticles\n", 446 | "synthetic polymers, difference in design leads to superior structural properties, biological materials\n", 447 | "synthetic polymers, Have limitations, biocompatibility\n", 448 | "synthetic polymers, Are synthesized chemically., tissue engineering scaffolds\n", 449 | "synthetic polymers, transform into nanocomposites, biopolymers\n", 450 | "bird bones, consists of, collagen\n", 451 | "bird bones, consists of, hydroxyapatite\n", 452 | "glutaraldehyde, Used As Crosslinking Method, collagen\n", 453 | "reproducing native architecture and function of cartilage, Strive to have the same biological properties and behavior as native tissues., tissue engineering scaffolds\n", 454 | "submicron structures, is present on, collagen\n", 455 | "submicron structures, is present on, protein\n", 456 | "interpenetrating networks (ipn), Are formed by interlocking polymer networks., tissue engineering scaffolds\n", 457 | "proteins inside the bacterial cells, interact with, nanoparticles\n", 458 | "cell seeding material, can be seeded with, collagen\n", 459 | "cell seeding material, can be used as a, collagenous biomaterial\n", 460 | "stiff triple-helix structure, has, collagen\n", 461 | "ivory, is composed of, collagen\n", 462 | "ivory, is composed of, hydroxyapatite\n", 463 | "collagen-based materials, Enhanced By Modification, biocompatibility\n", 464 | "collagen-based materials, Used In, collagen\n", 465 | "collagen-based materials, Collagen-based 
biomaterials is a more general term that encompasses both constructs and composites., collagen-based materials\n", 466 | "cell death, lead to, nanoparticles\n", 467 | "cell differentiation, promote, tissue engineering scaffolds\n", 468 | "polymeric agents, have been used for in-vitro intrafibrillar mineralization, collagen\n", 469 | "diverse functions, explored by, collagen\n", 470 | "skin, is found in, collagen\n", 471 | "inflammation, potentially cause, nanoparticles\n", 472 | "low doses, Function as synthetic replacements for, nanoparticles\n", 473 | "cancer cells, selectively target, nanoparticles\n", 474 | "nanomaterials, made from, tissue engineering scaffolds\n", 475 | "nanomaterials, have, biomedical applications\n", 476 | "localized deformations, cause during deformation, nanoparticles\n", 477 | "image, Appears in the Plane of, collagen\n", 478 | "hydroxyapatite (ha), is, scaffolds\n", 479 | "hydroxyapatite (ha), Interacts, collagen\n", 480 | "hydroxyapatite (ha), major inorganic component, bone\n", 481 | "biodegradability, possess, biological materials\n", 482 | "biodegradability, Has, collagen\n", 483 | "biodegradability, can improve, biomolecules\n", 484 | "biodegradability, have, biopolymers\n", 485 | "zirconium, Such as, nanoparticles\n", 486 | "environmentally induced degradation, can result from, collagen\n", 487 | "multichannel structures, Produced For, tissue engineering scaffolds\n", 488 | "increasing shift in (s) values, correlate with, nanoparticles\n", 489 | "natural polyamides, is, collagen\n", 490 | "natural polyamides, serve as, structural proteins\n", 491 | "connective tissues, is found in, collagen\n", 492 | "piezoelectricity, Exhibit local electrostatic attraction and repulsion due to shear piezoelectricity, collagen fibrils\n", 493 | "piezoelectricity, Exhibits, collagen\n", 494 | "piezoelectricity, studied, microstructure\n", 495 | "collagenous biomaterial, can be produced through, collagen\n", 496 | "collagenous biomaterial, is used 
in, biomedical applications\n", 497 | "collagenous biomaterial, is a type of, protein\n", 498 | "collagenous biomaterial, can be used as a, scaffold material\n", 499 | "collagenous biomaterial, can be used as a, adhesive\n", 500 | "collagenous biomaterial, is a type of, biomedical materials\n", 501 | "collagenous biomaterial, can be used as a, thermoplastic material\n", 502 | "hydroxyapatite, fracture energy calculated between, collagen\n", 503 | "hydroxyapatite, Such as, nanoparticles\n", 504 | "hydroxyapatite, Most mineralized at the spinous end, porcupine fish spines\n", 505 | "hydroxyapatite, Is often referred to as, bone\n", 506 | "hydroxyapatite, are made of, high-performance materials with nacre-like structure\n", 507 | "hydroxyapatite, constituent, mineral content\n", 508 | "chemical treatments, showed lower improvements in, nanoparticles\n", 509 | "pt nanops, are directly doped into, nanoparticles\n", 510 | "molecular packing, near Tg, which is roughly commensurate with their size, nanoparticles\n", 511 | "polishing substrate, Should be firmly attached to prevent displacement, nanoparticles\n", 512 | "bone morphogenetic proteins, Synthetic replacements for, nanoparticles\n", 513 | "drugs, deliver, nanoparticles\n", 514 | "drugs, can be incorporated, injectablehydrogels\n", 515 | "tropocollagen molecule, is composed of, collagen\n", 516 | "bactericidal effects, Have, nanoparticles\n", 517 | "natural tissues, mimic, tissue engineering scaffolds\n", 518 | "protein, plays a role in, toughness\n", 519 | "protein, is a, collagen\n", 520 | "efficacy, improve, nanoparticles\n", 521 | "neutralization, undergoes, collagen\n", 522 | "cells, enable self-healing capabilities, biological materials\n", 523 | "cells, exposed to (treated with), scaffold material\n", 524 | "cells, can be incorporated, injectablehydrogels\n", 525 | "cells, interact, nanoparticles\n", 526 | "calcination, is dissolved in, nanoparticles\n", 527 | "aggregates, ranging from, nanoparticles\n", 528 
| "upconverting or luminescent dyes, include, nanoparticles\n", 529 | "cross-linking, may be treated through, collagen\n", 530 | "bovine dermal collagen, is, collagen\n", 531 | "stabilized collagen fibers, is, collagen\n", 532 | "distribution, not uniform, nanoparticles\n", 533 | "side effects, reduce, nanoparticles\n", 534 | "micro- and nanospheres, can be processed into, collagen\n", 535 | "polymer scaffolds, Improve, nanoparticles\n", 536 | "cytotoxicity, commonly used agents such as glutaraldehyde and polyepoxy compounds can cause, collagen\n", 537 | "cytotoxicity, demonstrate, scaffolds\n", 538 | "adhesive, was used as, collagen\n", 539 | "smaller sized np, demonstrates, nanoparticles\n", 540 | "smaller sized np, through, bacteria\n", 541 | "smaller sized np, through, fungi\n", 542 | "decellularized scaffold, retains, collagen\n", 543 | "well-ordered self-assembled morphologies, forms, collagen\n", 544 | "porous organic polymers (pops), Can be synthesized as, nanoparticles\n", 545 | "bone, reveal the existence of, biological materials\n", 546 | "bone, has, mechanical properties\n", 547 | "bone, Bone exhibits toughness due to its ability to absorb energy and resist fracture or deformation under load., toughness\n", 548 | "bone, made of, collagen fibrils\n", 549 | "bone, is found in, collagen\n", 550 | "0.01 m hcl, is suspended in, collagen\n", 551 | "peo, lacks cell adhesive properties, tissue engineering scaffolds\n", 552 | "proteoglycans, collagen provides strength and stiffness while proteoglycans provide tissue hydration and lubrication, collagen\n", 553 | "proteoglycans, laminins form a structural barrier and interact with other extracellular matrix proteins while proteoglycans provide tissue hydration and lubrication, laminins\n", 554 | "mineral content, constituent, collagen\n", 555 | "essential excitation, Excites Local Electromechanical Response Of, collagen\n", 556 | "antimicrobial properties, have, nanoparticles\n", 557 | "quantum dots, include, 
nanoparticles\n", 558 | "iron, Contribute to, toughness\n", 559 | "iron, Such as, nanoparticles\n", 560 | "thermoplastic material, has, collagen\n", 561 | "plga, Demonstrates, biocompatibility\n", 562 | "plga, Deposited during long-term cell culture, collagen\n", 563 | "gold or silver, are incorporated into, nanoparticles\n", 564 | "polymer chains, bind to, nanoparticles\n", 565 | "go nanosheets, Is significant; impact on electrical conductivity is relatively little due to sporadic crosslinking of a small amount of metal ions, mechanical properties\n", 566 | "go nanosheets, Composites exhibit higher toughness, toughness\n", 567 | "go nanosheets, provides substrate for, collagen\n", 568 | "go nanosheets, Composites exhibit higher tensile strength, tensile strength\n", 569 | "go nanosheets, Have significant impact; composites exhibit higher tensile strength and toughness, go nanosheets\n", 570 | "go nanosheets, involves, synergistic interfacial interactions\n", 571 | "young's modulus, has, collagen\n", 572 | "hexagonally ordered, vertically oriented pvp domains, are segregated within and concentrated at, nanoparticles\n", 573 | "tissue engineering scaffolds, Suitable material for, collagen\n", 574 | "tissue engineering scaffolds, suitable for, biodegradable hydrogels\n", 575 | "tissue engineering scaffolds, Create 3D structures by combining soft and rigid biomaterials., multiphasic strategies\n", 576 | "tissue engineering scaffolds, Create multilayered and multiscale structures that mimic native architecture and function., multistructured and hierarchical strategies\n", 577 | "tissue engineering scaffolds, promote, cell growth\n", 578 | "tissue engineering scaffolds, are uniformly dispersed in, nanoparticles\n", 579 | "tissue engineering scaffolds, interactions between, polymer chains components\n", 580 | "tissue engineering scaffolds, lacks cell adhesive properties, pva\n", 581 | "tissue engineering scaffolds, suitable fillers for, carbon-based nanomaterials\n", 582 
| "tissue engineering scaffolds, Are being used for designing, biomimetic devices\n", 583 | "biomedical applications, has been used in,has been used in, collagen\n", 584 | "biomedical applications, Used for, polymeric materials\n", 585 | "mechanisms of action, their, nanoparticles\n", 586 | "ductility, Contributes, collagen\n", 587 | "biopolymers, offer, biocompatibility\n", 588 | "biopolymers, deposits oriented platelet-like structures during bone growth, collagen\n", 589 | "biopolymers, bonds, nacre\n", 590 | "direct cryogenic ting sff method, cannot be directly applied to, collagen\n", 591 | "tissues, interact, nanoparticles\n", 592 | "collagenfibril, is formed by the self-assembly of tropocollagen molecules, collagen\n", 593 | "oxidative stress, potentially cause, nanoparticles\n", 594 | "nanogranular structure, consists of, nanoparticles\n", 595 | "porcupine fish spines, Used in, collagen\n", 596 | "biomimetic devices, better match complexity and properties, biological materials\n", 597 | "intracrystalline organics, form within, nanoparticles\n", 598 | "intracrystalline organics, are present within, aragonite plates\n", 599 | "cnc films, linking their nanoscopic properties to macroscale performance, mechanical properties\n", 600 | "cnc films, composed solely of, nanoparticles\n", 601 | "energy dissipation, allow for during deformation, collagen fibrils\n", 602 | "energy dissipation, provides, collagen\n", 603 | "energy dissipation, Energy Dissipation refers to the process by which a material absorbs and dissipates energy during deformation or loading, while energy dissipation is a specific property or behavior of a material that allows it to absorb and dissipate energy during deformation or loading., energy dissipation\n", 604 | "carbodiimide, Used As Crosslinking Method, collagen\n", 605 | "skin regeneration, has been primarily used in, collagen\n", 606 | "biological activity, Depends On, collagen\n", 607 | "biological activity, possesses, biological 
activity\n", 608 | "surface modification, developed to, nanoparticles\n", 609 | "surface modification, Can be surface modified using surface modification, surface modification\n", 610 | "injectablehydrogels, can be processed into, collagen\n", 611 | "bilinear force-extension behavior, exhibits, collagen\n", 612 | "biomedical materials, exhibits, biocompatibility\n", 613 | "biomedical materials, is, collagen\n", 614 | "weak piezoresponse, Has, collagen\n", 615 | "extracellular matrix proteins, Extracellular matrix proteins include collagen., collagen\n", 616 | "synergistic interfacial interactions, involves, collagen\n", 617 | "aluminum, Such as, nanoparticles\n", 618 | "aluminum, while, was also characterized using afm\n", 619 | "higher naoh concentration, is required to prevent spontaneous polymerization when acetic-acid-extracted collagen is used, collagen\n", 620 | "biological materials, provide functionalities, biocompatibility\n", 621 | "biological materials, provide functionalities, mechanical properties\n", 622 | "biological materials, are attributed to exceptional properties for, toughness\n", 623 | "biological materials, Both contribute to overall mechanical properties, collagen\n", 624 | "biological materials, is, nacre\n", 625 | "flexural strength, enhanced, nanoparticles\n", 626 | "interfacial shear strength, improved, nanoparticles\n", 627 | "fungi, against, nanoparticles\n", 628 | "scaffold material, is an ideal, collagen\n", 629 | "collagen cross-links, has, collagen\n", 630 | "wangs' et al's small-diameter vascular scaffolds, made of, collagen\n", 631 | "polymeric materials, incorporated into, nanoparticles\n", 632 | "improved biocompatibility, developed to, nanoparticles\n", 633 | "enzyme inactivation, lead to, nanoparticles\n", 634 | "polymer nanocomposites, contain, nanoparticles\n", 635 | "biomolecules, binds and assembles, nanoparticles\n", 636 | "crosslinking methods, Widely Used for Modification, collagen\n", 637 | "chitosan-collagen 
composite hydrogels, have potential applications as, collagen\n", 638 | "collagen, Has, biocompatibility\n", 639 | "collagen, studied for, mechanical properties\n", 640 | "collagen, can be processed into, scaffolds\n", 641 | "collagen, Contributes, toughness\n", 642 | "collagen, forms, collagen fibrils\n", 643 | "collagen, is combined with, electrospinning processes\n", 644 | "collagen, is combined with, freeze-drying\n", 645 | "collagen, present in, biological structures\n", 646 | "collagen, guides mineralization,results in similar size and distribution, nanocrystals\n", 647 | "collagen, is composed of, alpha-helices\n", 648 | "collagen, forms staggered architectures during bone growth and shell growth, respectively, nacre\n", 649 | "collagen, simulated using image data obtained from confocal laser scanning microscopy, material point method (mpm)\n", 650 | "collagen, influences, elastic moduli predictions for mineralized collagen fibrils\n", 651 | "collagen, may be treated through, blending\n", 652 | "collagen, can result from, enzymatic degradation\n", 653 | "collagen, can be processed into, membranes and films\n", 654 | "collagen, understanding of structure and function, tissue being repaired\n", 655 | "collagen, is part of, thc\n", 656 | "collagen, is, triple-helical domain\n", 657 | "collagen, Related To, hierarchical microstructures\n", 658 | "collagen, Has, rough topography\n", 659 | "collagen, has high tensile strength due to, sub-fibril alignment\n", 660 | "collagen, are made of, high-performance materials with nacre-like structure\n", 661 | "collagen, is, structural proteins\n", 662 | "collagen, adjusts, hydrophobic surface regions\n", 663 | "collagen, is found in, cartilage\n", 664 | "collagen, collagen provides strength and stiffness while laminins form a structural barrier and interact with other extracellular matrix proteins, laminins\n", 665 | "nanoparticles, enhanced ultimate mechanical properties, composite materials\n", 666 | "nanoparticles, 
behavior is significantly influenced by, polymer matrices\n", 667 | "nanoparticles, significantly increased, tensile strength\n", 668 | "nanoparticles, Improve, titanium dioxide nanoparticles\n", 669 | "nanoparticles, Such as, copper\n", 670 | "nanoparticles, Such as, gold\n", 671 | "nanoparticles, Impart, osteogenic properties\n", 672 | "nanoparticles, against, bacteria\n", 673 | "nanoparticles, against, viruses\n", 674 | "nanoparticles, contribute to during deformation, microstructure\n", 675 | "nanoparticles, are present within, aragonite plates\n", 676 | "nanoparticles, synthesis with tunable properties, sol-gel process\n", 677 | "nanoparticles, are formed by post-calcinations, nb nanops\n", 678 | "nanoparticles, convert into, sn ions\n", 679 | "nanoparticles, are contained in, zro2 nanops\n", 680 | "nanoparticles, and provide additional functionalities, thermoresponsive properties\n", 681 | "nanoparticles, are templated by, cellulose nanocrystals\n", 682 | "nanoparticles, include, plasmonic nanoparticles\n", 683 | "nanoparticles, developed to, reduce toxicity\n", 684 | "was also characterized using afm, while, copper\n", 685 | "hierarchical microstructures, Renaming for consistency and clarity, hierarchical microstructures\n", 686 | "biological structures, studied for, mechanical properties\n", 687 | "tensile strength, is related to, thc\n", 688 | "tensile strength, Renaming, tensile strength\n", 689 | "tensile strength, is less than, nacre\n", 690 | "nacre, exhibits, toughness\n", 691 | "mechanical properties, Investigation of, scaffolds\n", 692 | "mechanical properties, Renaming, mechanical properties\n", 693 | "toughness, Calculated from load-displacement measurements, scaffolds\n", 694 | "biocompatibility, sameAs, biocompatibility\n", 695 | "\n", 696 | "These are from a knowledge graph between collagen and copper.\n", 697 | "\n", 698 | "### Develop a new research idea around collagen and copper.\n" 699 | ] 700 | }, 701 | { 702 | "name": "stderr", 703 | 
"output_type": "stream", 704 | "text": [ 705 | "Llama.generate: prefix-match hit\n", 706 | "\n", 707 | "llama_print_timings: load time = 5546.25 ms\n", 708 | "llama_print_timings: sample time = 157.28 ms / 323 runs ( 0.49 ms per token, 2053.73 tokens per second)\n", 709 | "llama_print_timings: prompt eval time = 33531.03 ms / 4772 tokens ( 7.03 ms per token, 142.32 tokens per second)\n", 710 | "llama_print_timings: eval time = 23368.99 ms / 322 runs ( 72.57 ms per token, 13.78 tokens per second)\n", 711 | "llama_print_timings: total time = 57923.38 ms / 5094 tokens\n" 712 | ] 713 | }, 714 | { 715 | "data": { 716 | "text/markdown": [ 717 | "**Response:** Based on the given information, a potential research idea could be to explore the use of copper-collagen nanoparticles for bone tissue engineering applications. The knowledge graph provides evidence that collagen has been used as a scaffold material in biomedical applications, while copper has shown promise as an antimicrobial agent and osteogenic material. By combining these two materials, it may be possible to create a novel nanocomposite with improved mechanical properties and enhanced biological activity for bone tissue engineering.\n", 718 | "\n", 719 | "The research could involve synthesizing and characterizing the copper-collagen nanoparticles using various techniques such as sol-gel process, post-calcinations, and surface modification methods. The study could also investigate the effects of different ratios of collagen to copper on the properties of the nanocomposite, as well as its biocompatibility and potential toxicity.\n", 720 | "\n", 721 | "The research idea is based on the knowledge graph's description of how collagen can be processed into various forms such as membranes, films, and hydrogels, while copper has been used in a variety of applications including antimicrobial agents and osteogenic materials. 
The study could also draw inspiration from the knowledge graph's mention of the use of nanoparticles to enhance mechanical properties and improve biocompatibility in composite materials.\n", 722 | "\n", 723 | "Overall, this research idea aims to explore the potential of copper-collagen nanoparticles as a novel material for bone tissue engineering applications, with a focus on improving both mechanical properties and biological activity." 724 | ], 725 | "text/plain": [ 726 | "" 727 | ] 728 | }, 729 | "metadata": {}, 730 | "output_type": "display_data" 731 | }, 732 | { 733 | "data": { 734 | "text/markdown": [ 735 | " Based on the given information, a potential research idea could be to explore the use of copper-collagen nanoparticles for bone tissue engineering applications. The knowledge graph provides evidence that collagen has been used as a scaffold material in biomedical applications, while copper has shown promise as an antimicrobial agent and osteogenic material. By combining these two materials, it may be possible to create a novel nanocomposite with improved mechanical properties and enhanced biological activity for bone tissue engineering.\n", 736 | "\n", 737 | "The research could involve synthesizing and characterizing the copper-collagen nanoparticles using various techniques such as sol-gel process, post-calcinations, and surface modification methods. The study could also investigate the effects of different ratios of collagen to copper on the properties of the nanocomposite, as well as its biocompatibility and potential toxicity.\n", 738 | "\n", 739 | "The research idea is based on the knowledge graph's description of how collagen can be processed into various forms such as membranes, films, and hydrogels, while copper has been used in a variety of applications including antimicrobial agents and osteogenic materials. 
The study could also draw inspiration from the knowledge graph's mention of the use of nanoparticles to enhance mechanical properties and improve biocompatibility in composite materials.\n", 740 | "\n", 741 | "Overall, this research idea aims to explore the potential of copper-collagen nanoparticles as a novel material for bone tissue engineering applications, with a focus on improving both mechanical properties and biological activity." 742 | ], 743 | "text/plain": [ 744 | "" 745 | ] 746 | }, 747 | "metadata": {}, 748 | "output_type": "display_data" 749 | } 750 | ], 751 | "source": [ 752 | "response, (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path, path_graph, shortest_path_length, fname, graph_GraphML = find_path_and_reason(\n", 753 | " G, \n", 754 | " node_embeddings,\n", 755 | " embedding_tokenizer, \n", 756 | " embedding_model, \n", 757 | " generate_BioMixtral, \n", 758 | " data_dir=data_dir_output,\n", 759 | " verbatim=True,\n", 760 | " include_keywords_as_nodes=True, # Include keywords in the graph analysis\n", 761 | " keyword_1=\"collagen\",\n", 762 | " keyword_2=\"copper\",\n", 763 | " N_limit=9999, # The limit for keywords, triplets, etc.\n", 764 | " instruction='Develop a new research idea around collagen and copper.',\n", 765 | " keywords_separator=', ',\n", 766 | " graph_analysis_type='nodes and relations',\n", 767 | " temperature=0.3, \n", 768 | " inst_prepend='### ', # Instruction prepend text\n", 769 | " prepend='''You are given a set of information from a graph that describes the relationship \n", 770 | " between materials, structure, properties, and properties. 
You analyze these logically \n", 771 | " through reasoning.\\n\\n''', # Prepend text for analysis\n", 772 | " visualize_paths_as_graph=True, # Whether to visualize paths as a graph\n", 773 | " display_graph=True, # Whether to display the graph\n", 774 | ")\n", 775 | "display(Markdown(response))" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 25, 781 | "id": "09f87f6d-9b8f-454f-8b7a-84806b8b22f6", 782 | "metadata": {}, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "(\" Based on the given information, a potential research idea could be to explore the use of copper-collagen nanoparticles for bone tissue engineering applications. The knowledge graph provides evidence that collagen has been used as a scaffold material in biomedical applications, while copper has shown promise as an antimicrobial agent and osteogenic material. By combining these two materials, it may be possible to create a novel nanocomposite with improved mechanical properties and enhanced biological activity for bone tissue engineering.\\n\\nThe research could involve synthesizing and characterizing the copper-collagen nanoparticles using various techniques such as sol-gel process, post-calcinations, and surface modification methods. The study could also investigate the effects of different ratios of collagen to copper on the properties of the nanocomposite, as well as its biocompatibility and potential toxicity.\\n\\nThe research idea is based on the knowledge graph's description of how collagen can be processed into various forms such as membranes, films, and hydrogels, while copper has been used in a variety of applications including antimicrobial agents and osteogenic materials. 
The study could also draw inspiration from the knowledge graph's mention of the use of nanoparticles to enhance mechanical properties and improve biocompatibility in composite materials.\\n\\nOverall, this research idea aims to explore the potential of copper-collagen nanoparticles as a novel material for bone tissue engineering applications, with a focus on improving both mechanical properties and biological activity.\",\n", 788 | " ('collagen', 1, 'copper', 1),\n", 789 | " ['collagen', 'tissue engineering scaffolds', 'nanoparticles', 'copper'])" 790 | ] 791 | }, 792 | "execution_count": 25, 793 | "metadata": {}, 794 | "output_type": "execute_result" 795 | } 796 | ], 797 | "source": [ 798 | "response, (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path" 799 | ] 800 | } 801 | ], 802 | "metadata": { 803 | "kernelspec": { 804 | "display_name": "Python 3 (ipykernel)", 805 | "language": "python", 806 | "name": "python3" 807 | }, 808 | "language_info": { 809 | "codemirror_mode": { 810 | "name": "ipython", 811 | "version": 3 812 | }, 813 | "file_extension": ".py", 814 | "mimetype": "text/x-python", 815 | "name": "python", 816 | "nbconvert_exporter": "python", 817 | "pygments_lexer": "ipython3", 818 | "version": "3.11.5" 819 | } 820 | }, 821 | "nbformat": 4, 822 | "nbformat_minor": 5 823 | } 824 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GraphReasoning: Scientific Discovery through Knowledge Extraction and Multimodal Graph-based Representation and Reasoning 2 | 3 | Markus J. Buehler, MIT, 2024 4 | mbuehler@MIT.EDU 5 | 6 | Leveraging generative Artificial Intelligence (AI), we have transformed a dataset comprising 1,000 scientific papers into an ontological knowledge graph. 
Through an in-depth structural analysis, we have calculated node degrees, identified communities and connectivities, and evaluated clustering coefficients and betweenness centrality of pivotal nodes, uncovering fascinating knowledge architectures. The graph has an inherently scale-free nature, is highly connected, and can be used for graph reasoning by taking advantage of transitive and isomorphic properties that reveal unprecedented interdisciplinary relationships that can be used to answer queries, identify gaps in knowledge, propose never-before-seen material designs, and predict material behaviors. We compute deep node embeddings for combinatorial node similarity ranking for use in a path sampling strategy that links dissimilar concepts that have previously not been related. One comparison revealed structural parallels between biological materials and Beethoven's 9th Symphony, highlighting shared patterns of complexity through isomorphic mapping. In another example, the algorithm proposed a hierarchical mycelium-based composite based on integrating path sampling with principles extracted from Kandinsky's 'Composition VII' painting. The resulting material integrates an innovative set of concepts that include a balance of chaos/order, adjustable porosity, mechanical strength, and complex patterned chemical functionalization. We uncover other isomorphisms across science, technology and art, revealing a nuanced ontology of immanence that reveals a context-dependent heterarchical interplay of constituents. Graph-based generative AI achieves a far higher degree of novelty, explorative capacity, and technical detail than conventional approaches and establishes a widely useful framework for innovation by revealing hidden connections. 
7 | This library provides all codes and libraries used in the paper: https://arxiv.org/abs/2403.11996 8 | 9 | ![image](https://github.com/lamm-mit/GraphReasoning/assets/101393859/3baa3752-8222-4857-a64c-c046693d6315) 10 | 11 | # Installation and Examples 12 | 13 | Install directly from GitHub: 14 | ``` 15 | pip install git+https://github.com/lamm-mit/GraphReasoning 16 | ``` 17 | Or, editable: 18 | ``` 19 | pip install -e git+https://github.com/lamm-mit/GraphReasoning.git#egg=GraphReasoning 20 | ``` 21 | Install X-LoRA, if needed: 22 | ``` 23 | pip install git+https://github.com/EricLBuehler/xlora.git 24 | ``` 25 | You may need wkhtmltopdf for the multi-agent model: 26 | ``` 27 | sudo apt-get install wkhtmltopdf 28 | ``` 29 | If you plan to use llama.cpp, install using: 30 | ``` 31 | CMAKE_ARGS="-DLLAMA_CUBLAS=on " pip install 'git+https://github.com/abetlen/llama-cpp-python.git#egg=llama-cpp-python[server]' --force-reinstall --upgrade --no-cache-dir 32 | ``` 33 | Model weights and other data: 34 | 35 | [lamm-mit/GraphReasoning 36 | ](https://huggingface.co/lamm-mit/GraphReasoning/tree/main) 37 | 38 | Graph file: 39 | ``` 40 | from huggingface_hub import hf_hub_download 41 | data_dir='./GRAPHDATA/' 42 | graph_name='BioGraph.graphml' 43 | filename = f"{data_dir}/{graph_name}" 44 | file_path = hf_hub_download(repo_id=repository_id, filename=filename, local_dir='./') 45 | ``` 46 | Embeddings: 47 | ``` 48 | from huggingface_hub import hf_hub_download 49 | data_dir='./GRAPHDATA/' 50 | embedding_file='BioGraph_embeddings_ge-large-en-v1.5.pkl' 51 | filename = f"{data_dir}/{embedding_file}" 52 | file_path = hf_hub_download(repo_id=repository_id, filename=filename, local_dir='./') 53 | ``` 54 | Example: 55 | ``` 56 | from transformers import AutoTokenizer, AutoModel 57 | 58 | from GraphReasoning import * 59 | 60 | embedding_tokenizer = AutoTokenizer.from_pretrained(tokenizer_model, ) 61 | embedding_model = AutoModel.from_pretrained(tokenizer_model, ) 62 | 63 | 
data_dir_output='./GRAPHDATA_OUTPUT/' 64 | make_dir_if_needed(data_dir_output) 65 | 66 | data_dir='./GRAPHDATA/' 67 | 68 | graph_name=f'{data_dir}/{graph_name}' 69 | G = nx.read_graphml(graph_name) 70 | node_embeddings = load_embeddings(f'{data_dir}/{embedding_file}') 71 | 72 | visualize_embeddings_2d_pretty_and_sample(node_embeddings, 73 | n_clusters=10, n_samples=10, 74 | data_dir=data_dir_output, alpha=.7) 75 | 76 | describe_communities_with_plots_complex(G, N=6, data_dir=data_dir_output) 77 | ``` 78 | Analyze graph and extract information: 79 | ``` 80 | find_best_fitting_node_list("copper", node_embeddings, embedding_tokenizer, embedding_model, 5) 81 | ``` 82 | Find path: 83 | ``` 84 | (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path, path_graph, shortest_path_length, fname, graph_GraphML=find_path( G, node_embeddings, 85 | embedding_tokenizer, embedding_model , second_hop=False, data_dir=data_dir_output, 86 | keyword_1 = "copper", keyword_2 = "silk", 87 | similarity_fit_ID_node_1=0, similarity_fit_ID_node_2=0, 88 | ) 89 | path_list, path_string=print_path_with_edges_as_list(G , path) 90 | path_list, path_string, path 91 | ``` 92 | 93 | # Reference 94 | 95 | ```LaTeX 96 | @misc{Buehler2024AcceleratingDiscoveryGraphReasoning, 97 | author = {Buehler, Markus J.}, 98 | title = {Accelerating Scientific Discovery with Generative Knowledge Extraction, Graph-Based Representation, and Multimodal Intelligent Graph Reasoning}, 99 | year = {2024}, 100 | eprint = {2403.11996}, 101 | archivePrefix = {arXiv}, 102 | doi = {10.48550/arXiv.2403.11996}, 103 | url = {https://doi.org/10.48550/arXiv.2403.11996} 104 | } 105 | ``` 106 | 107 | # API Documentation for Graph Analysis and Reasoning Code 108 | 109 | ## Table of Contents 110 | 111 | 1. [Introduction](#introduction) 112 | 2. 
[Graph Analysis](#graph-analysis) 113 | - [Shortest Path Functions](#shortest-path-functions) 114 | - [Path Finding and Reasoning](#path-finding-and-reasoning) 115 | - [Community Detection and Analysis](#community-detection-and-analysis) 116 | - [Scale-Free Network Analysis](#scale-free-network-analysis) 117 | 3. [Graph Generation](#graph-generation) 118 | - [Creating Graphs from Text](#creating-graphs-from-text) 119 | - [Adding Subgraphs to Existing Graphs](#adding-subgraphs-to-existing-graphs) 120 | 4. [Graph Tools](#graph-tools) 121 | - [Node Embeddings](#node-embeddings) 122 | - [Graph Visualization](#graph-visualization) 123 | - [Graph Statistics, Exporting/Rendering, and Plots](#graph-statistics-exportingrendering-and-plots) 124 | - [Graph Simplification](#graph-simplification) 125 | 5. [Conversational Agents](#conversational-agents) 126 | - [ConversationAgent Class](#conversationagent-class) 127 | - [Conversation Simulation](#conversation-simulation) 128 | - [Conversation Summarization](#conversation-summarization) 129 | - [Question Answering with Agents](#question-answering-with-agents) 130 | 6. [Full API](#full-api-documentation) 131 | 132 | ## Introduction 133 | 134 | This API documentation provides an overview of the functions and classes available in the graph analysis and reasoning code. The code is organized into several files, each focusing on a specific aspect of graph analysis, generation, and conversational agents. 135 | 136 | ## Graph Analysis 137 | 138 | The `graph_analysis.py` file contains functions for analyzing graphs, including shortest path finding, path finding with reasoning, community detection, and scale-free network analysis. 139 | 140 | ### Shortest Path Functions 141 | 142 | - `find_shortest_path`: Finds the shortest path between two nodes in a graph. 143 | - `find_shortest_path_with2hops`: Finds the shortest path considering nodes within 2 hops. 144 | - `find_N_paths`: Finds N shortest paths between two nodes. 
145 | 146 | ### Path Finding and Reasoning 147 | 148 | - `find_path`: Finds a path between two keywords using best fitting nodes. 149 | - `find_path_and_reason`: Finds a path and reasons over it using a language model. 150 | - `find_path_with_relations_and_reason_combined`: Finds paths, reasons over them, considering multiple paths. 151 | 152 | ### Community Detection and Analysis 153 | 154 | - `describe_communities`: Detects and describes the top N communities in the graph. 155 | - `describe_communities_with_plots`: Detects, describes, and plots the top N communities. 156 | - `describe_communities_with_plots_complex`: More detailed community analysis with plots. 157 | 158 | ### Scale-Free Network Analysis 159 | 160 | - `is_scale_free`: Determines if the network is scale-free using the powerlaw package. 161 | 162 | ## Graph Generation 163 | 164 | The `graph_generation.py` file provides functions for creating graphs from text and adding subgraphs to existing graphs. 165 | 166 | ### Creating Graphs from Text 167 | 168 | - `make_graph_from_text`: Creates a graph from input text. 169 | 170 | ### Adding Subgraphs to Existing Graphs 171 | 172 | - `add_new_subgraph_from_text`: Adds a new subgraph to an existing graph based on input text. 173 | 174 | ## Graph Tools 175 | 176 | The `graph_tools.py` file offers various tools for working with graphs, including saving graph in various file formats (including HTML), node embeddings, graph visualization, graph statistics, and graph simplification. 177 | 178 | ### Node Embeddings 179 | 180 | - `generate_node_embeddings`: Generates node embeddings using a language model. 181 | - `save_embeddings`: Saves node embeddings to a file. 182 | - `load_embeddings`: Loads node embeddings from a file. 183 | - `find_best_fitting_node`: Finds the best fitting node for a given keyword. 184 | - `find_best_fitting_node_list`: Finds the N best fitting nodes for a given keyword. 
185 | 186 | ### Graph Visualization 187 | 188 | - `visualize_embeddings_2d`: Visualizes node embeddings in 2D. 189 | - `visualize_embeddings_2d_notext`: Visualizes node embeddings in 2D without text labels. 190 | - `visualize_embeddings_2d_pretty`: Visualizes node embeddings in 2D with a pretty style. 191 | - `visualize_embeddings_2d_pretty_and_sample`: Visualizes node embeddings in 2D with a pretty style and outputs samples for each cluster. 192 | 193 | ### Graph Statistics, Exporting/Rendering, and Plots 194 | 195 | - `graph_statistics_and_plots_for_large_graphs`: Calculates graph statistics and creates visualizations for large graphs. 196 | - `make_HTML`: Saves graph as HTML file. 197 | 198 | ### Graph Simplification 199 | 200 | - `simplify_graph`: Simplifies a graph by merging similar nodes. 201 | - `remove_small_fragents`: Removes small fragments from a graph. 202 | - `update_node_embeddings`: Updates node embeddings for a new graph. 203 | 204 | ## X-LoRA Tools 205 | 206 | The `xlora_tools.py` file contains functions for plotting the scalings of an X-LoRA model. 207 | 208 | ### Plotting Model Scalings 209 | 210 | - `plot_scalings`: Plots the scalings of an X-LoRA model in various ways. 211 | - `plot_scalings_from_tensor`: Plots the scalings from a tensor in various ways. 212 | 213 | ## Conversational Agents 214 | 215 | The `agents.py` file provides classes and functions for creating and working with conversational agents. 216 | 217 | ### ConversationAgent Class 218 | 219 | - `ConversationAgent`: Represents a conversational agent. 220 | 221 | ### Conversation Simulation 222 | 223 | - `conversation_simulator`: Simulates a conversation between agents. 224 | 225 | ### Conversation Summarization 226 | 227 | - `read_and_summarize`: Reads and summarizes a conversation. 228 | 229 | ### Question Answering with Agents 230 | 231 | - `answer_question`: Answers a question using a conversation between two agents. 
232 | 233 | ## Full API Documentation 234 | 235 | ## graph_analysis.py 236 | 237 | ### `find_shortest_path(G, source='graphene', target='complexity', verbatim=True, data_dir='./')` 238 | - **Description:** Finds the shortest path between two nodes in a graph. 239 | - **Input:** 240 | - `G` (networkx.Graph): The input graph. 241 | - `source` (str): The source node. Default is 'graphene'. 242 | - `target` (str): The target node. Default is 'complexity'. 243 | - `verbatim` (bool): Whether to print verbose output. Default is True. 244 | - `data_dir` (str): The directory to save output files. Default is './'. 245 | - **Returns:** 246 | - `path` (list): The shortest path from source to target. 247 | - `path_graph` (networkx.Graph): The subgraph containing the shortest path. 248 | - `shortest_path_length` (int): The length of the shortest path. 249 | - `fname` (str): The filename of the saved HTML file. 250 | - `graph_GraphML` (str): The filename of the saved GraphML file. 251 | 252 | ### `find_shortest_path_with2hops(G, source='graphene', target='complexity', second_hop=True, verbatim=True, data_dir='./', save_files=True)` 253 | - **Description:** Finds the shortest path between two nodes considering nodes within 2 hops. 254 | - **Input:** 255 | - `G` (networkx.Graph): The input graph. 256 | - `source` (str): The source node. Default is 'graphene'. 257 | - `target` (str): The target node. Default is 'complexity'. 258 | - `second_hop` (bool): Whether to consider nodes within 2 hops. Default is True. 259 | - `verbatim` (bool): Whether to print verbose output. Default is True. 260 | - `data_dir` (str): The directory to save output files. Default is './'. 261 | - `save_files` (bool): Whether to save output files. Default is True. 262 | - **Returns:** 263 | - `path` (list): The shortest path from source to target. 264 | - `path_graph` (networkx.Graph): The subgraph containing the shortest path and nodes within 2 hops. 
265 | - `shortest_path_length` (int): The length of the shortest path. 266 | - `fname` (str): The filename of the saved HTML file. None if `save_files` is False. 267 | - `graph_GraphML` (str): The filename of the saved GraphML file. None if `save_files` is False. 268 | 269 | ### `find_N_paths(G, source='graphene', target='complexity', N=5)` 270 | - **Description:** Finds N shortest paths between two nodes. 271 | - **Input:** 272 | - `G` (networkx.Graph): The input graph. 273 | - `source` (str): The source node. Default is 'graphene'. 274 | - `target` (str): The target node. Default is 'complexity'. 275 | - `N` (int): The number of shortest paths to find. Default is 5. 276 | - **Returns:** 277 | - `sampled_paths` (list): A list of the N shortest paths from source to target. 278 | - `fname_list` (list): A list of filenames of the saved HTML files for each path. 279 | 280 | ### `find_all_triplets(G)` 281 | - **Description:** Finds all connected triplets of nodes in the graph. 282 | - **Input:** 283 | - `G` (networkx.Graph): The input graph. 284 | - **Returns:** 285 | - `triplets` (list): A list of all connected triplets of nodes in the graph. 286 | 287 | ### `print_node_pairs_edge_title(G)` 288 | - **Description:** Prints node pairs with their edge titles. 289 | - **Input:** 290 | - `G` (networkx.Graph): The input graph. 291 | - **Returns:** 292 | - `pairs_and_titles` (list): A list of node pairs with their edge titles. 293 | 294 | ### `find_path(G, node_embeddings, tokenizer, model, keyword_1='music and sound', keyword_2='graphene', verbatim=True, second_hop=False, data_dir='./', similarity_fit_ID_node_1=0, similarity_fit_ID_node_2=0, save_files=True)` 295 | - **Description:** Finds a path between two keywords using best fitting nodes. 296 | - **Input:** 297 | - `G` (networkx.Graph): The input graph. 298 | - `node_embeddings` (dict): A dictionary of node embeddings. 299 | - `tokenizer`: The tokenizer for the language model. 300 | - `model`: The language model. 
301 | - `keyword_1` (str): The first keyword. Default is 'music and sound'. 302 | - `keyword_2` (str): The second keyword. Default is 'graphene'. 303 | - `verbatim` (bool): Whether to print verbose output. Default is True. 304 | - `second_hop` (bool): Whether to consider nodes within 2 hops. Default is False. 305 | - `data_dir` (str): The directory to save output files. Default is './'. 306 | - `similarity_fit_ID_node_1` (int): The index of the best fitting node for keyword_1. Default is 0. 307 | - `similarity_fit_ID_node_2` (int): The index of the best fitting node for keyword_2. Default is 0. 308 | - `save_files` (bool): Whether to save output files. Default is True. 309 | - **Returns:** 310 | - `(best_node_1, best_similarity_1, best_node_2, best_similarity_2)` (tuple): The best fitting nodes and their similarities for each keyword. 311 | - `path` (list): The path from best_node_1 to best_node_2. 312 | - `path_graph` (networkx.Graph): The subgraph containing the path. 313 | - `shortest_path_length` (int): The length of the path. 314 | - `fname` (str): The filename of the saved HTML file. None if `save_files` is False. 315 | - `graph_GraphML` (str): The filename of the saved GraphML file. None if `save_files` is False. 316 | 317 | ### `describe_communities(G, N=10)` 318 | - **Description:** Detects and describes the top N communities in the graph. 319 | - **Input:** 320 | - `G` (networkx.Graph): The input graph. 321 | - `N` (int): The number of top communities to describe. Default is 10. 322 | - **Returns:** None. Prints the description of the top N communities. 323 | 324 | ### `describe_communities_with_plots(G, N=10, N_nodes=5, data_dir='./')` 325 | - **Description:** Detects, describes and plots the top N communities in the graph. 326 | - **Input:** 327 | - `G` (networkx.Graph): The input graph. 328 | - `N` (int): The number of top communities to describe and plot. Default is 10. 329 | - `N_nodes` (int): The number of top nodes to highlight per community. 
Default is 5. 330 | - `data_dir` (str): The directory to save output files. Default is './'. 331 | - **Returns:** None. Saves the community size plot and the combined plot of top nodes by degree for each community. 332 | 333 | ### `describe_communities_with_plots_complex(G, N=10, N_nodes=5, data_dir='./')` 334 | - **Description:** Performs a more detailed community analysis with plots. 335 | - **Input:** 336 | - `G` (networkx.Graph): The input graph. 337 | - `N` (int): The number of top communities to describe and plot. Default is 10. 338 | - `N_nodes` (int): The number of top nodes to highlight per community. Default is 5. 339 | - `data_dir` (str): The directory to save output files. Default is './'. 340 | - **Returns:** None. Saves various plots for community analysis. 341 | 342 | ### `is_scale_free(G, plot_distribution=True, data_dir='./', manual_xmin=None)` 343 | - **Description:** Determines if the network G is scale-free using the powerlaw package. 344 | - **Input:** 345 | - `G` (networkx.Graph): The input graph. 346 | - `plot_distribution` (bool): Whether to plot the degree distribution with the power-law fit. Default is True. 347 | - `data_dir` (str): The directory to save output files. Default is './'. 348 | - `manual_xmin` (int): Manually set the xmin value for power-law fitting. 349 | - **Returns:** 350 | - `is_scale_free` (bool): Whether the network is scale-free. 351 | - `fit` (powerlaw.Fit): The powerlaw fit object. 352 | 353 | ### `print_path_with_edges_as_list(G, path, keywords_separator=' --> ')` 354 | - **Description:** Prints a path with nodes and edge titles as a list. 355 | - **Input:** 356 | - `G` (networkx.Graph): The input graph. 357 | - `path` (list): The path to print. 358 | - `keywords_separator` (str): The separator for the keywords in the output string. Default is ' --> '. 359 | - **Returns:** 360 | - `path_elements` (list): The path elements as a list. 361 | - `as_string` (str): The path elements as a string. 
362 | 363 | ### `find_path_and_reason(G, node_embeddings, tokenizer, model, generate, keyword_1='music and sound', keyword_2='apples', include_keywords_as_nodes=True, inst_prepend='', graph_analysis_type='path and relations', instruction='Now, reason over them and propose a research hypothesis.', verbatim=False, N_limit=None, temperature=0.3, keywords_separator=' --> ', system_prompt='You are a scientist who uses logic and reasoning.', max_tokens=4096, prepend='You are given a set of information from a graph that describes the relationship between materials, structure, properties, and properties. You analyze these logically through reasoning.\n\n', similarity_fit_ID_node_1=0, similarity_fit_ID_node_2=0, save_files=True, data_dir='./', visualize_paths_as_graph=True, display_graph=True, words_per_line=2)` 364 | - **Description:** Finds a path between keywords and reasons over it using LLM. 365 | - **Input:** 366 | - `G` (networkx.Graph): The input graph. 367 | - `node_embeddings` (dict): A dictionary of node embeddings. 368 | - `tokenizer`: The tokenizer for the language model. 369 | - `model`: The language model. 370 | - `generate`: The generate function for the language model. 371 | - `keyword_1` (str): The first keyword. Default is 'music and sound'. 372 | - `keyword_2` (str): The second keyword. Default is 'apples'. 373 | - `include_keywords_as_nodes` (bool): Whether to include the keywords as nodes in the path. Default is True. 374 | - `inst_prepend` (str): The instruction to prepend to the generate function. Default is ''. 375 | - `graph_analysis_type` (str): The type of graph analysis to perform. Default is 'path and relations'. 376 | - `instruction` (str): The instruction for reasoning. Default is 'Now, reason over them and propose a research hypothesis.'. 377 | - `verbatim` (bool): Whether to print verbose output. Default is False. 378 | - `N_limit` (int): The maximum number of nodes to include in the path. Default is None (no limit). 
379 | - `temperature` (float): The temperature for the generate function. Default is 0.3. 380 | - `keywords_separator` (str): The separator for the keywords in the output string. Default is ' --> '. 381 | - `system_prompt` (str): The system prompt for the generate function. Default is 'You are a scientist who uses logic and reasoning.'. 382 | - `max_tokens` (int): The maximum number of tokens for the generate function. Default is 4096. 383 | - `prepend` (str): The string to prepend to the generate function. Default is 'You are given a set of information from a graph that describes the relationship between materials, structure, properties, and properties. You analyze these logically through reasoning.\n\n'. 384 | - `similarity_fit_ID_node_1` (int): The index of the best fitting node for keyword_1. Default is 0. 385 | - `similarity_fit_ID_node_2` (int): The index of the best fitting node for keyword_2. Default is 0. 386 | - `save_files` (bool): Whether to save output files. Default is True. 387 | - `data_dir` (str): The directory to save output files. Default is './'. 388 | - `visualize_paths_as_graph` (bool): Whether to visualize the paths as a graph. Default is True. 389 | - `display_graph` (bool): Whether to display the graph. Default is True. 390 | - `words_per_line` (int): The number of words per line for the graph visualization. Default is 2. 391 | - **Returns:** 392 | - `response` (str): The response from the generate function. 393 | - `(best_node_1, best_similarity_1, best_node_2, best_similarity_2)` (tuple): The best fitting nodes and their similarities for each keyword. 394 | - `path` (list): The path from best_node_1 to best_node_2. 395 | - `path_graph` (networkx.Graph): The subgraph containing the path. 396 | - `shortest_path_length` (int): The length of the path. 397 | - `fname` (str): The filename of the saved HTML file. None if `save_files` is False. 398 | - `graph_GraphML` (str): The filename of the saved GraphML file. None if `save_files` is False. 
399 | 400 | ### `find_path_with_relations_and_reason_combined(G, node_embeddings, tokenizer, model, generate, keyword_1='music and sound', keyword_2='apples', include_keywords_as_nodes=True, inst_prepend='', instruction='Now, reason over them and propose a research hypothesis.', verbatim=False, N_limit=None, temperature=0.3, keywords_separator=' --> ', system_prompt='You are a scientist who uses logic and reasoning.', max_tokens=4096, prepend='You are given a set of information from a graph that describes the relationship between materials, structure, properties, and properties. You analyze these logically through reasoning.\n\n', num_paths=2, include_all_possible=False, data_dir='./', save_files=False, visualize_paths_as_graph=False, display_graph=True, words_per_line=2)` 401 | - **Description:** Finds paths between keywords, reasons over them, considering multiple paths. 402 | - **Input:** 403 | - `G` (networkx.Graph): The input graph. 404 | - `node_embeddings` (dict): A dictionary of node embeddings. 405 | - `tokenizer`: The tokenizer for the language model. 406 | - `model`: The language model. 407 | - `generate`: The generate function for the language model. 408 | - `keyword_1` (str): The first keyword. Default is 'music and sound'. 409 | - `keyword_2` (str): The second keyword. Default is 'apples'. 410 | - `include_keywords_as_nodes` (bool): Whether to include the keywords as nodes in the paths. Default is True. 411 | - `inst_prepend` (str): The instruction to prepend to the generate function. Default is ''. 412 | - `instruction` (str): The instruction for reasoning. Default is 'Now, reason over them and propose a research hypothesis.'. 413 | - `verbatim` (bool): Whether to print verbose output. Default is False. 414 | - `N_limit` (int): The maximum number of nodes to include in each path. Default is None (no limit). 415 | - `temperature` (float): The temperature for the generate function. Default is 0.3. 
416 | - `keywords_separator` (str): The separator for the keywords in the output string. Default is ' --> '. 417 | - `system_prompt` (str): The system prompt for the generate function. Default is 'You are a scientist who uses logic and reasoning.'. 418 | - `max_tokens` (int): The maximum number of tokens for the generate function. Default is 4096. 419 | - `prepend` (str): The string to prepend to the generate function. Default is 'You are given a set of information from a graph that describes the relationship between materials, structure, properties, and properties. You analyze these logically through reasoning.\n\n'. 420 | - `num_paths` (int): The number of paths to find. Default is 2. 421 | - `include_all_possible` (bool): Whether to include all possible combinations of paths. Default is False. 422 | - `data_dir` (str): The directory to save output files. Default is './'. 423 | - `save_files` (bool): Whether to save output files. Default is False. 424 | - `visualize_paths_as_graph` (bool): Whether to visualize the paths as a graph. Default is False. 425 | - `display_graph` (bool): Whether to display the graph. Default is True. 426 | - `words_per_line` (int): The number of words per line for the graph visualization. Default is 2. 427 | - **Returns:** 428 | - `response` (str): The response from the generate function. 429 | 430 | ## graph_generation.py 431 | 432 | ### `make_graph_from_text(txt, generate, include_contextual_proximity=False, graph_root='graph_root', chunk_size=2500, chunk_overlap=0, repeat_refine=0, verbatim=False, data_dir='./data_output_KG/', save_PDF=False, save_HTML=True)` 433 | - **Description:** Creates a graph from input text. 434 | - **Input:** 435 | - `txt` (str): The input text to generate the graph from. 436 | - `generate`: The generate function for the language model. 437 | - `include_contextual_proximity` (bool): Whether to include contextual proximity edges. Default is False. 
438 | - `graph_root` (str): The root name for the generated graph files. Default is 'graph_root'. 439 | - `chunk_size` (int): The size of each chunk of text to process. Default is 2500. 440 | - `chunk_overlap` (int): The overlap between chunks of text. Default is 0. 441 | - `repeat_refine` (int): The number of times to repeat the graph refinement process. Default is 0. 442 | - `verbatim` (bool): Whether to print verbose output. Default is False. 443 | - `data_dir` (str): The directory to save output files. Default is './data_output_KG/'. 444 | - `save_PDF` (bool): Whether to save the graph as a PDF file. Default is False. 445 | - `save_HTML` (bool): Whether to save the graph as an HTML file. Default is True. 446 | - **Returns:** 447 | - `graph_HTML` (str): The filename of the saved HTML file. 448 | - `graph_GraphML` (str): The filename of the saved GraphML file. 449 | - `G` (networkx.Graph): The generated graph. 450 | - `net` (pyvis.network.Network): The Pyvis network object for the graph. 451 | - `output_pdf` (str): The filename of the saved PDF file. None if `save_PDF` is False. 452 | 453 | ### `add_new_subgraph_from_text(txt, generate, node_embeddings, tokenizer, model, original_graph_path_and_fname, data_dir_output='./data_temp/', verbatim=True, size_threshold=10, chunk_size=10000, do_Louvain_on_new_graph=True, include_contextual_proximity=False, repeat_refine=0, similarity_threshold=0.95, simplify_graph=True, return_only_giant_component=False, save_common_graph=True, G_to_add=None, graph_GraphML_to_add=None)` 454 | - **Description:** Adds a new subgraph to an existing graph based on input text. 455 | - **Input:** 456 | - `txt` (str): The input text to generate the subgraph from. 457 | - `generate`: The generate function for the language model. 458 | - `node_embeddings` (dict): A dictionary of node embeddings. 459 | - `tokenizer`: The tokenizer for the language model. 460 | - `model`: The language model. 
461 | - `original_graph_path_and_fname` (str): The path and filename of the original graph to add the subgraph to. 462 | - `data_dir_output` (str): The directory to save output files. Default is './data_temp/'. 463 | - `verbatim` (bool): Whether to print verbose output. Default is True. 464 | - `size_threshold` (int): The minimum size of connected components to keep in the graph. Default is 10. 465 | - `chunk_size` (int): The size of each chunk of text to process. Default is 10000. 466 | - `do_Louvain_on_new_graph` (bool): Whether to perform Louvain community detection on the new graph. Default is True. 467 | - `include_contextual_proximity` (bool): Whether to include contextual proximity edges. Default is False. 468 | - `repeat_refine` (int): The number of times to repeat the graph refinement process. Default is 0. 469 | - `similarity_threshold` (float): The similarity threshold for simplifying the graph. Default is 0.95. 470 | - `simplify_graph` (bool): Whether to simplify the graph by merging similar nodes. Default is True. 471 | - `return_only_giant_component` (bool): Whether to return only the giant component of the graph. Default is False. 472 | - `save_common_graph` (bool): Whether to save a graph of the common nodes between the original and new graphs. Default is True. 473 | - `G_to_add` (networkx.Graph): An optional graph to add to the original graph instead of generating a new one. Default is None. 474 | - `graph_GraphML_to_add` (str): An optional GraphML file to load a graph from instead of generating a new one. Default is None. 475 | - **Returns:** 476 | - `graph_GraphML` (str): The filename of the saved GraphML file for the combined graph. 477 | - `G_new` (networkx.Graph): The combined graph. 478 | - `G_loaded` (networkx.Graph): The loaded graph to add to the original graph. 479 | - `G` (networkx.Graph): The original graph. 480 | - `node_embeddings` (dict): The updated node embeddings. 481 | - `res` (dict): The graph statistics for the combined graph. 
482 | 483 | ## graph_tools.py 484 | 485 | ### `make_HTML(graph, data_dir, graph_root)` 486 | - **Description:** Saves graph as HTML file for easy visualization in a browser. 487 | - **Input:** 488 | - `graph` (networkx.Graph): The input graph. 489 | - `data_dir`: Directory to save HTML graph file in. 490 | - `graph_root`: Root for file name. 491 | - **Returns:** 492 | - `graph_HTML`: File name of graph in HTML format, as `f'{data_dir}/{graph_root}_graphHTML.html'`. 493 | 494 | ### `generate_node_embeddings(graph, tokenizer, model)` 495 | - **Description:** Generates node embeddings using an LLM. 496 | - **Input:** 497 | - `graph` (networkx.Graph): The input graph. 498 | - `tokenizer`: The tokenizer for the language model. 499 | - `model`: The language model. 500 | - **Returns:** 501 | - `embeddings` (dict): A dictionary of node embeddings. 502 | 503 | ### `save_embeddings(embeddings, file_path)` 504 | - **Description:** Saves node embeddings to a file. 505 | - **Input:** 506 | - `embeddings` (dict): A dictionary of node embeddings. 507 | - `file_path` (str): The path to save the embeddings to. 508 | - **Returns:** None. 509 | 510 | ### `load_embeddings(file_path)` 511 | - **Description:** Loads node embeddings from a file. 512 | - **Input:** 513 | - `file_path` (str): The path to load the embeddings from. 514 | - **Returns:** 515 | - `embeddings` (dict): A dictionary of node embeddings. 516 | 517 | ### `find_best_fitting_node(keyword, embeddings, tokenizer, model)` 518 | - **Description:** Finds the best fitting node for a given keyword. 519 | - **Input:** 520 | - `keyword` (str): The keyword to find the best fitting node for. 521 | - `embeddings` (dict): A dictionary of node embeddings. 522 | - `tokenizer`: The tokenizer for the language model. 523 | - `model`: The language model. 524 | - **Returns:** 525 | - `best_node` (str): The best fitting node for the keyword. 526 | - `best_similarity` (float): The similarity score for the best fitting node. 
527 | 528 | ### `find_best_fitting_node_list(keyword, embeddings, tokenizer, model, N_samples=5)` 529 | - **Description:** Finds the N best fitting nodes for a given keyword. 530 | - **Input:** 531 | - `keyword` (str): The keyword to find the best fitting nodes for. 532 | - `embeddings` (dict): A dictionary of node embeddings. 533 | - `tokenizer`: The tokenizer for the language model. 534 | - `model`: The language model. 535 | - `N_samples` (int): The number of best fitting nodes to return. Default is 5. 536 | - **Returns:** 537 | - `best_nodes` (list): A list of tuples containing the best fitting nodes and their similarity scores. 538 | 539 | ### `visualize_embeddings_2d(embeddings, data_dir='./')` 540 | - **Description:** Visualizes node embeddings in 2D. 541 | - **Input:** 542 | - `embeddings` (dict): A dictionary of node embeddings. 543 | - `data_dir` (str): The directory to save output files. Default is './'. 544 | - **Returns:** None. Saves a 2D visualization of the node embeddings. 545 | 546 | ### `visualize_embeddings_2d_notext(embeddings, n_clusters=3, data_dir='./')` 547 | - **Description:** Visualizes node embeddings in 2D without text labels. 548 | - **Input:** 549 | - `embeddings` (dict): A dictionary of node embeddings. 550 | - `n_clusters` (int): The number of clusters to use for clustering the embeddings. Default is 3. 551 | - `data_dir` (str): The directory to save output files. Default is './'. 552 | - **Returns:** None. Saves a 2D visualization of the node embeddings without text labels. 553 | 554 | ### `visualize_embeddings_2d_pretty(embeddings, n_clusters=3, data_dir='./')` 555 | - **Description:** Visualizes node embeddings in 2D with a pretty style. 556 | - **Input:** 557 | - `embeddings` (dict): A dictionary of node embeddings. 558 | - `n_clusters` (int): The number of clusters to use for clustering the embeddings. Default is 3. 559 | - `data_dir` (str): The directory to save output files. Default is './'. 560 | - **Returns:** None. 
Saves a pretty 2D visualization of the node embeddings. 561 | 562 | ### `visualize_embeddings_2d_pretty_and_sample(embeddings, n_clusters=3, n_samples=5, data_dir='./', alpha=0.7, edgecolors='none', s=50)` 563 | - **Description:** Visualizes node embeddings in 2D with a pretty style and outputs samples for each cluster. 564 | - **Input:** 565 | - `embeddings` (dict): A dictionary of node embeddings. 566 | - `n_clusters` (int): The number of clusters to use for clustering the embeddings. Default is 3. 567 | - `n_samples` (int): The number of samples to output for each cluster. Default is 5. 568 | - `data_dir` (str): The directory to save output files. Default is './'. 569 | - `alpha` (float): The alpha value for the scatter plot. Default is 0.7. 570 | - `edgecolors` (str): The edge color for the scatter plot. Default is 'none'. 571 | - `s` (int): The size of the markers in the scatter plot. Default is 50. 572 | - **Returns:** None. Saves a pretty 2D visualization of the node embeddings and outputs samples for each cluster. 573 | 574 | ### `graph_statistics_and_plots_for_large_graphs(G, data_dir='./', include_centrality=False, make_graph_plot=False, root='graph')` 575 | - **Description:** Calculates graph statistics and creates visualizations for large graphs. 576 | - **Input:** 577 | - `G` (networkx.Graph): The input graph. 578 | - `data_dir` (str): The directory to save output files. Default is './'. 579 | - `include_centrality` (bool): Whether to include centrality measures in the statistics. Default is False. 580 | - `make_graph_plot` (bool): Whether to create a plot of the graph. Default is False. 581 | - `root` (str): The root name for the output files. Default is 'graph'. 582 | - **Returns:** 583 | - `statistics` (dict): A dictionary of graph statistics. 584 | - `centrality` (dict): A dictionary of centrality measures. None if `include_centrality` is False. 
585 | 586 | ### `simplify_graph(graph_, node_embeddings, tokenizer, model, similarity_threshold=0.9, use_llm=False, data_dir_output='./', graph_root='simple_graph', verbatim=False, max_tokens=2048, temperature=0.3, generate=None)` 587 | - **Description:** Simplifies a graph by merging similar nodes. 588 | - **Input:** 589 | - `graph_` (networkx.Graph): The input graph. 590 | - `node_embeddings` (dict): A dictionary of node embeddings. 591 | - `tokenizer`: The tokenizer for the language model. 592 | - `model`: The language model. 593 | - `similarity_threshold` (float): The similarity threshold for merging nodes. Default is 0.9. 594 | - `use_llm` (bool): Whether to use a language model to rename merged nodes. Default is False. 595 | - `data_dir_output` (str): The directory to save output files. Default is './'. 596 | - `graph_root` (str): The root name for the output files. Default is 'simple_graph'. 597 | - `verbatim` (bool): Whether to print verbose output. Default is False. 598 | - `max_tokens` (int): The maximum number of tokens to generate for renaming nodes. Default is 2048. 599 | - `temperature` (float): The temperature for the language model. Default is 0.3. 600 | - `generate`: The generate function for the language model. Default is None. 601 | - **Returns:** 602 | - `new_graph` (networkx.Graph): The simplified graph. 603 | - `updated_embeddings` (dict): The updated node embeddings after simplification. 604 | 605 | ### `remove_small_fragents(G_new, size_threshold)` 606 | - **Description:** Removes small fragments from a graph. 607 | - **Input:** 608 | - `G_new` (networkx.Graph): The input graph. 609 | - `size_threshold` (int): The minimum size of connected components to keep in the graph. 610 | - **Returns:** 611 | - `G_new` (networkx.Graph): The graph with small fragments removed. 
612 | 613 | ### `update_node_embeddings(embeddings, graph_new, tokenizer, model, remove_embeddings_for_nodes_no_longer_in_graph=True, verbatim=False)` 614 | - **Description:** Updates node embeddings for a new graph. 615 | - **Input:** 616 | - `embeddings` (dict): A dictionary of node embeddings. 617 | - `graph_new` (networkx.Graph): The updated graph. 618 | - `tokenizer`: The tokenizer for the language model. 619 | - `model`: The language model. 620 | - `remove_embeddings_for_nodes_no_longer_in_graph` (bool): Whether to remove embeddings for nodes that are no longer in the graph. Default is True. 621 | - `verbatim` (bool): Whether to print verbose output. Default is False. 622 | - **Returns:** 623 | - `embeddings_updated` (dict): The updated node embeddings. 624 | 625 | ## agents.py 626 | 627 | ### `ConversationAgent` class 628 | - **Description:** Represents a conversational agent. 629 | - **Initialization:** 630 | - `chat_model`: The chat model to use for generating responses. 631 | - `name` (str): The name of the agent. 632 | - `instructions` (str): The instructions for the agent. 633 | - `context_turns` (int): The number of turns of context to use for generating responses. Default is 2. 634 | - `temperature` (float): The temperature for the language model. Default is 0.1. 635 | - **Methods:** 636 | - `reply(interlocutor_reply=None)`: Generates a response to the given interlocutor reply. 637 | - `interlocutor_reply` (str): The reply from the interlocutor. Default is None. 638 | - Returns: The generated response (str). 639 | 640 | ### `conversation_simulator(bot0, question_gpt, question_gpt_name='Engineer', question_temperature=0.7, question_asker_instructions='You ALWAYS ask tough questions. ', q='What is bioinspiration?', total_turns=5, data_dir='./')` 641 | - **Description:** Simulates a conversation between agents. 642 | - **Input:** 643 | - `bot0` (ConversationAgent): The first agent in the conversation. 
644 | - `question_gpt`: The language model to use for generating questions. 645 | - `question_gpt_name` (str): The name of the question-asking agent. Default is 'Engineer'. 646 | - `question_temperature` (float): The temperature for the question-asking language model. Default is 0.7. 647 | - `question_asker_instructions` (str): The instructions for the question-asking agent. Default is 'You ALWAYS ask tough questions. '. 648 | - `q` (str): The initial question for the conversation. Default is 'What is bioinspiration?'. 649 | - `total_turns` (int): The total number of turns in the conversation. Default is 5. 650 | - `data_dir` (str): The directory to save output files. Default is './'. 651 | - **Returns:** 652 | - `conversation_turns` (list): A list of dictionaries representing each turn in the conversation. 653 | 654 | ### `read_and_summarize(gpt, txt='This is a conversation.', q='')` 655 | - **Description:** Reads and summarizes a conversation. 656 | - **Input:** 657 | - `gpt`: The language model to use for summarization. 658 | - `txt` (str): The conversation text. Default is 'This is a conversation.'. 659 | - `q` (str): The original question. Default is ''. 660 | - **Returns:** 661 | - `summary` (str): The summary of the conversation. 662 | - `bullet` (str): The key points of the conversation as bullet points. 663 | - `takeaway` (str): The most important takeaway from the conversation. 664 | 665 | ### `answer_question(gpt_question_asker, gpt, q='I have identified this amino acid sequence: AAAAAIIAAAA. How can I use it?', bot_name_1='Biologist', bot_instructions_1='You are a biologist. You are taking part in a discussion, from a life science perspective.\nKeep your answers brief, but accurate, and creative.\n', bot_name_2='Engineer', bot_instructions_2='You are a critical engineer. You are taking part in a discussion, from the perspective of engineering.\nKeep your answers brief, and always challenge statements in a provokative way. 
As a creative individual, you inject ideas from other fields. ', question_temperature=0.1, conv_temperature=0.3, total_turns=4, delete_last_question=True, save_PDF=True, PDF_name=None, save_dir='./', txt_file_path=None)` 666 | - **Description:** Answers a question using a conversation between two agents. 667 | - **Input:** 668 | - `gpt_question_asker`: The language model to use for generating questions. 669 | - `gpt`: The language model to use for generating responses. 670 | - `q` (str): The initial question. Default is 'I have identified this amino acid sequence: AAAAAIIAAAA. How can I use it?'. 671 | - `bot_name_1` (str): The name of the first agent. Default is 'Biologist'. 672 | - `bot_instructions_1` (str): The instructions for the first agent. Default is 'You are a biologist. You are taking part in a discussion, from a life science perspective.\nKeep your answers brief, but accurate, and creative.\n'. 673 | - `bot_name_2` (str): The name of the second agent. Default is 'Engineer'. 674 | - `bot_instructions_2` (str): The instructions for the second agent. Default is 'You are a critical engineer. You are taking part in a discussion, from the perspective of engineering.\nKeep your answers brief, and always challenge statements in a provokative way. As a creative individual, you inject ideas from other fields. '. 675 | - `question_temperature` (float): The temperature for the question-asking language model. Default is 0.1. 676 | - `conv_temperature` (float): The temperature for the conversation language model. Default is 0.3. 677 | - `total_turns` (int): The total number of turns in the conversation. Default is 4. 678 | - `delete_last_question` (bool): Whether to delete the last question from the conversation. Default is True. 679 | - `save_PDF` (bool): Whether to save the conversation as a PDF file. Default is True. 680 | - `PDF_name` (str): The name of the PDF file to save. Default is None. 681 | - `save_dir` (str): The directory to save output files. 
Default is './'. 682 | - `txt_file_path` (str): The path to save the conversation as a text file. Default is None. 683 | - **Returns:** 684 | - `conversation_turns` (list): A list of dictionaries representing each turn in the conversation. 685 | - `txt` (str): The conversation text. 686 | - `summary` (str): The summary of the conversation. 687 | - `bullet` (str): The key points of the conversation as bullet points. 688 | - `keytakaway` (str): The most important takeaway from the conversation. 689 | - `integrated` (str): The integrated conversation text with summary, bullet points, and key takeaway. 690 | - `save_raw_txt` (str): The raw conversation text without markdown formatting. 691 | 692 | 693 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # It's a good practice to read long descriptions outside the setup function 4 | with open('README.md', 'r', encoding='utf-8') as f: 5 | long_description = f.read() 6 | 7 | setup( 8 | name='GraphReasoning', 9 | version='0.2.0', 10 | author='Markus J. 
Buehler', 11 | author_email='mbuehler@mit.edu', 12 | packages=find_packages(), 13 | install_requires=[ 14 | 'numpy', 15 | 'networkx', 16 | 'matplotlib', 17 | 'pandas', 18 | 'transformers>=4.39', 19 | 'powerlaw', 20 | 'markdown2', 21 | 'pdfkit', 22 | 'bitsandbytes', 23 | 'peft', 24 | 'accelerate', 25 | 'torch', 26 | 'torchvision', 27 | 'torchaudio', 28 | 'huggingface_hub', 29 | 'langchain', 30 | 'pyvis', 31 | 'yachalk', 32 | 'pytesseract', 33 | 'llama-index', 34 | 'tqdm', 35 | 'ipython', 36 | 'scikit-learn', 37 | 'scipy', 38 | 'seaborn', 39 | 'uuid', 40 | 'pdfminer.six', 41 | 'community', 42 | 'guidance', 43 | 'python-louvain', 44 | 'wkhtmltopdf' 45 | ], 46 | description='GraphReasoning: Use LLM to reason over graphs, combined with multi-agent modeling.', 47 | long_description=long_description, 48 | long_description_content_type='text/markdown', 49 | url='https://github.com/lamm-mit/GraphReasoning', 50 | classifiers=[ 51 | 'License :: OSI Approved :: MIT License', 52 | 'Programming Language :: Python :: 3.11' 53 | ], 54 | python_requires='>=3.10', 55 | ) 56 | --------------------------------------------------------------------------------