├── ScienceDiscovery
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── utils.cpython-310.pyc
│   │   └── __init__.cpython-310.pyc
│   ├── graph.py
│   ├── llm_config.py
│   ├── agents.py
│   └── utils.py
├── setup.py
├── Notebooks
│   ├── SciAgents_ScienceDiscovery_GraphReasoning_non-automated.ipynb
│   └── SciAgents_ScienceDiscovery_GraphReasoning_automated.ipynb
├── README.md
└── LICENSE.txt

--------------------------------------------------------------------------------
/ScienceDiscovery/__init__.py:
--------------------------------------------------------------------------------
 1 | from ScienceDiscovery.utils import *
 2 | from ScienceDiscovery.agents import *

--------------------------------------------------------------------------------
/ScienceDiscovery/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lamm-mit/SciAgentsDiscovery/HEAD/ScienceDiscovery/__pycache__/utils.cpython-310.pyc

--------------------------------------------------------------------------------
/ScienceDiscovery/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lamm-mit/SciAgentsDiscovery/HEAD/ScienceDiscovery/__pycache__/__init__.cpython-310.pyc

--------------------------------------------------------------------------------
/ScienceDiscovery/graph.py:
--------------------------------------------------------------------------------
 1 | from ScienceDiscovery.utils import *
 2 | import os
 3 | 
 4 | data_dir_source='./graph_giant_component/'
 5 | 
 6 | embeddings_name='embeddings_simple_giant_ge-large-en-v1.5.pkl'
 7 | graph_name='large_graph_simple_giant.graphml'
 8 | tokenizer_model="BAAI/bge-large-en-v1.5"
 9 | 
10 | embedding_tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
11 | embedding_model = AutoModel.from_pretrained(tokenizer_model)
12 | 
13 | G = load_graph_with_text_as_JSON(data_dir=data_dir_source, graph_name=graph_name)
14 | G = return_giant_component_of_graph(G)
15 | G = nx.Graph(G)
16 | try:
17 |     node_embeddings = load_embeddings(f'{data_dir_source}/{embeddings_name}')
18 | except Exception:  # embeddings file missing or unreadable
19 |     print("Node embeddings not loaded, need to regenerate.")
20 |     node_embeddings = generate_node_embeddings(G, embedding_tokenizer, embedding_model)

--------------------------------------------------------------------------------
/ScienceDiscovery/llm_config.py:
--------------------------------------------------------------------------------
 1 | import autogen
 2 | 
 3 | config_list_4o = autogen.config_list_from_models(model_list=["gpt-4o"])
 4 | 
 5 | config_list_4turbo = autogen.config_list_from_models(model_list=["gpt-4-turbo"])
 6 | 
 7 | gpt4o_config = {
 8 |     "cache_seed": 42,  # change the cache_seed for different trials
 9 |     "temperature": 0.0,
10 |     "config_list": config_list_4o,
11 |     "timeout": 540000,
12 | }
13 | 
14 | 
15 | gpt4o_config_graph = {
16 |     "cache_seed": 42,  # change the cache_seed for different trials
17 |     "temperature": 0.1,
18 |     "config_list": config_list_4o,
19 |     "timeout": 540000,
20 |     "max_tokens": 2048
21 | }
22 | 
23 | gpt4turbo_config_graph = {
24 |     "cache_seed": 42,  # change the cache_seed for different trials
25 |     "temperature": 0.2,
26 |     "config_list": config_list_4turbo,
27 |     "timeout": 540000,
28 | }
29 | 
30 | gpt4turbo_config = {
31 |     "cache_seed": 42,  # change the cache_seed for different trials
32 |     "temperature": 0,
33 |     "config_list": config_list_4turbo,
34 |     "timeout": 540000,
35 | }
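These config dictionaries are passed to the agents in `agents.py` via their `llm_config` argument. A minimal sketch of how they are consumed, assuming `OPENAI_API_KEY` is set in the environment (the agent below is illustrative and not part of this repository):
```
import os
import autogen

os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder key

# config_list_from_models builds a config list for the named models,
# resolving credentials from the environment
config_list = autogen.config_list_from_models(model_list=["gpt-4o"])
llm_config = {"cache_seed": 42, "temperature": 0.0, "config_list": config_list}

demo_agent = autogen.AssistantAgent(name="demo_agent", llm_config=llm_config)
```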
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | # It's a good practice to read long descriptions outside the setup function
 4 | with open('README.md', 'r', encoding='utf-8') as f:
 5 |     long_description = f.read()
 6 | 
 7 | setup(
 8 |     name='ScienceDiscovery',
 9 |     version='0.1.0',
10 |     author='Markus J. Buehler',
11 |     author_email='mbuehler@mit.edu',
12 |     packages=find_packages(),
13 |     install_requires=[
14 |         'numpy',
15 |         'networkx',
16 |         'matplotlib',
17 |         'pandas',
18 |         'transformers>=4.39',
19 |         'pyautogen>=0.2.28',
20 |         'powerlaw',
21 |         'markdown2',
22 |         'pdfkit',
23 |         'bitsandbytes',
24 |         'peft',
25 |         'accelerate',
26 |         'torch',
27 |         'torchvision',
28 |         'torchaudio',
29 |         'huggingface_hub',
30 |         'langchain',
31 |         'pyvis',
32 |         'yachalk',
33 |         'pytesseract',
34 |         'llama-index',
35 |         'tqdm',
36 |         'ipython',
37 |         'scikit-learn',
38 |         'scipy',
39 |         'seaborn',
40 |         # 'uuid' is not listed: it ships with the Python standard library
41 |         'pdfminer.six',
42 |         'community',
43 |         'guidance',
44 |         'python-louvain',
45 |         'wkhtmltopdf',  # the wkhtmltopdf system binary must also be installed (see README)
46 |         'weasyprint',
47 |         'llama-index-embeddings-huggingface',
48 |         'langchain-community',
49 |     ],
50 |     description='ScienceDiscovery: Uses an LLM-based multi-agent system to reason over graphs and generate novel research ideas.',
51 |     long_description=long_description,
52 |     long_description_content_type='text/markdown',
53 |     url='https://github.com/lamm-mit/SciAgentsDiscovery',
54 |     classifiers=[
55 |         'License :: OSI Approved :: Apache Software License',
56 |         'Programming Language :: Python :: 3.11'
57 |     ],
58 |     python_requires='>=3.10',
59 | )
60 | 

--------------------------------------------------------------------------------
/Notebooks/SciAgents_ScienceDiscovery_GraphReasoning_non-automated.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "1be16592-06df-42d3-834e-92001c455abb",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# SciAgents\n",
  9 |     "## Automating scientific discovery through multi-agent intelligent graph reasoning\n",
 10 |     "\n",
 11 |     "#### Alireza Ghafarollahi, Markus J. Buehler, MIT, 2024 mbuehler@MIT.EDU"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "id": "e88a310b-5971-4111-9f0e-ac6eef990594",
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "!git clone https://github.com/lamm-mit/SciAgentsDiscovery.git\n",
 22 |     "%cd SciAgentsDiscovery\n",
 23 |     "!pip install -e ."
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "id": "24cbd5ab-8985-443a-abd4-bde904dcd389", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import os\n", 34 | "\n", 35 | "OpenAI_key='sk-'\n", 36 | "os.environ['OPENAI_API_KEY']=OpenAI_key\n", 37 | "\n", 38 | "SemanticScholar_api_key = ''\n", 39 | "os.environ['SEMANTIC_SCHOLAR_API_KEY']=SemanticScholar_api_key\n", 40 | "\n", 41 | "data_dir_output='./graph_giant_component_LLMdiscovery_example/'" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "0b37b47a-bbf2-4bcc-802f-4203146a2946", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from ScienceDiscovery import *\n", 52 | "make_dir_if_needed(data_dir_output)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "dc4c7e01-48aa-426a-b1ff-cea65d5d6427", 58 | "metadata": {}, 59 | "source": [ 60 | "### Setting up OpenAI GPT model for the LLM" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "id": "c9e805ab-7609-4b2c-babc-bf51279574b3", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "default_generate_OpenAIGPT = partial(\n", 71 | " generate_OpenAIGPT,\n", 72 | " openai_api_key=OpenAI_key,\n", 73 | " #gpt_model='gpt-4-turbo',\n", 74 | " gpt_model='gpt-4o',\n", 75 | " temperature=0.2,\n", 76 | " max_tokens=2048,\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "3b261e57-2f27-4588-8e5e-6774d654b85e", 83 | "metadata": {}, 84 | "source": [ 85 | "## Research idea generation using the non-automated multi-agent model" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "d8276cc2-4ff3-4112-ac80-4ce744e80a23", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "research_generation(G=G, \n", 96 | " embedding_tokenizer=embedding_tokenizer,\n", 97 | " embedding_model=embedding_model,\n", 98 | " node_embeddings=node_embeddings,\n", 99 | " generate=default_generate_OpenAIGPT,\n", 100 | " generate_graph_expansion=default_generate_OpenAIGPT,\n", 101 | " randomness_factor=0.2, num_random_waypoints=4,shortest_path=False,\n", 102 | " second_hop=False, data_dir=data_dir_output, save_files=False, verbatim=True,\n", 103 | " keyword_1='energy-intensive', keyword_2='protein')" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3 (ipykernel)", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.10.12" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 5 128 | } 129 | -------------------------------------------------------------------------------- /Notebooks/SciAgents_ScienceDiscovery_GraphReasoning_automated.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1be16592-06df-42d3-834e-92001c455abb", 6 | "metadata": {}, 7 | "source": [ 8 | "# SciAgents\n", 9 | "## Automating scientific discovery through multi-agent intelligent graph reasoning\n", 10 | "\n", 11 | "#### Alireza Ghafarollahi, Markus J. 
Buehler, MIT, 2024 mbuehler@MIT.EDU"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "id": "e88a310b-5971-4111-9f0e-ac6eef990594",
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "!git clone https://github.com/lamm-mit/SciAgentsDiscovery.git\n",
 22 |     "%cd SciAgentsDiscovery\n",
 23 |     "!pip install -e ."
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 1,
 29 |    "id": "24cbd5ab-8985-443a-abd4-bde904dcd389",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "import os\n",
 34 |     "\n",
 35 |     "OpenAI_key='sk-'\n",
 36 |     "os.environ['OPENAI_API_KEY']=OpenAI_key\n",
 37 |     "\n",
 38 |     "SemanticScholar_api_key = ''\n",
 39 |     "os.environ['SEMANTIC_SCHOLAR_API_KEY']=SemanticScholar_api_key\n",
 40 |     "\n",
 41 |     "data_dir_output='./graph_giant_component_LLMdiscovery_example/'"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "id": "0b37b47a-bbf2-4bcc-802f-4203146a2946",
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "from ScienceDiscovery import *\n",
 52 |     "make_dir_if_needed(data_dir_output)"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "id": "4e29d660-93b2-4ec4-a9e5-0366cf502515",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "## Research idea generation using the automated multi-agent model"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "id": "9a5d67fa-b6c2-4259-a8ee-8db4b1801b05",
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "res = user.initiate_chat(recipient=manager,\n",
 71 |     "message='''Develop a research proposal using random concepts. In the end, rate the novelty and feasibility of the research idea.''',\n",
 72 |     "                         clear_history=True)"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "id": "02261047-d40f-4bfc-a400-cd2057bc6c20",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "### Saving the output"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 5,
 86 |    "id": "bb50b7c2-6063-410c-b1a2-04c1b74246d5",
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "formatted_text = \"\"\n",
 91 |     "formatted_text_summary = \"\"\n",
 92 |     "for i in range(len(res.chat_history)):\n",
 93 |     "    try:\n",
 94 |     "        formatted_text += f'''{res.chat_history[i]['tool_calls'][0]['function']['name']}-{res.chat_history[i]['tool_calls'][0]['function']['arguments']}\\n\\n'''\n",
 95 |     "    except:\n",
 96 |     "        if i==0:\n",
 97 |     "            formatted_text += '### ' + f'''{res.chat_history[i]['content']}\\n\\n'''\n",
 98 |     "        else:\n",
 99 |     "            formatted_text += f'''{res.chat_history[i]['content']}\\n\\n'''\n",
100 |     "        if re.search(\"Summary of the Initial Research Hypothesis\", f'''{res.chat_history[i]['content']}'''):\n",
101 |     "            formatted_text_summary += f'''{res.chat_history[i]['content']}'''\n",
102 |     "\n",
103 |     "text_markdown = Markdown(formatted_text)\n",
104 |     "\n",
105 |     "markdown_to_pdf(formatted_text, 'output_research')"
106 |    ]
107 |   }
108 |  ],
109 |  "metadata": {
110 |   "kernelspec": {
111 |    "display_name": "Python 3 (ipykernel)",
112 |    "language": "python",
113 |    "name": "python3"
114 |   },
115 |   "language_info": {
116 |    "codemirror_mode": {
117 |     "name": "ipython",
118 |     "version": 3
119 |    },
120 |    "file_extension": ".py",
121 |    "mimetype": "text/x-python",
122 |    "name": "python",
123 |    "nbconvert_exporter": "python",
124 |    "pygments_lexer": "ipython3",
125 |    "version": "3.10.12"
126 |   }
127 |  },
128 |  "nbformat": 4,
129 |  "nbformat_minor": 5
130 | }
131 | 
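A note for adapting the saving cell above: each entry in `res.chat_history` is a message dict, and only tool-call messages carry a `tool_calls` list, which is why the cell falls back to `content` when indexing fails. A hedged sketch of the two shapes (field values here are illustrative examples only):
```
# Illustrative message shapes found in res.chat_history:
tool_call_msg = {"role": "assistant",
                 "tool_calls": [{"function": {"name": "generate_path",
                                              "arguments": '{"keyword_1": null, "keyword_2": null}'}}]}
text_msg = {"role": "assistant",
            "content": "Summary of the Initial Research Hypothesis ..."}
```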
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SciAgents
 2 | ## Automating scientific discovery through multi-agent intelligent graph reasoning
 3 | A. Ghafarollahi, M.J. Buehler*
 4 | 
 5 | Massachusetts Institute of Technology
 6 | 
 7 | *mbuehler@MIT.EDU
 8 | 
 9 | ## Summary
10 | A key challenge in artificial intelligence is the creation of systems capable of autonomously advancing scientific understanding by exploring novel domains, identifying complex patterns, and uncovering previously unseen connections in vast scientific data. In this work, we present SciAgents, an approach that leverages three core concepts: (1) the use of large-scale ontological knowledge graphs to organize and interconnect diverse scientific concepts, (2) a suite of large language models (LLMs) and data retrieval tools, and (3) multi-agent systems with in-situ learning capabilities. Applied to biologically inspired materials, SciAgents reveals hidden interdisciplinary relationships that were previously considered unrelated, achieving a scale, precision, and exploratory power that surpasses traditional human-driven research methods. The framework autonomously generates and refines research hypotheses, elucidating underlying mechanisms, design principles, and unexpected material properties. By integrating these capabilities in a modular fashion, the intelligent system yields material discoveries, critiques and improves existing hypotheses, retrieves up-to-date data about existing research, and highlights strengths and limitations. Our case studies demonstrate scalable capabilities to combine generative AI, ontological representations, and multi-agent modeling, harnessing a 'swarm of intelligence' similar to that found in biological systems. This provides new avenues for materials discovery and accelerates the development of advanced materials by unlocking Nature’s design principles.
11 | 
12 | ![Fig_1](https://github.com/user-attachments/assets/3cae1052-427a-407c-8c9d-629111a3c070)
13 | 
14 | Figure 1. **Overview of the multi-agent graph-reasoning system developed here.**
15 | **Panel a**: Overview of graph construction, as reported in [M.J. Buehler et al., 2024](https://iopscience.iop.org/article/10.1088/2632-2153/ad7228/meta). The visual shows the progression from scientific papers as a data source to graph construction, with the image on the right showing a zoomed-in view of the graph.
16 | **Panels b and c**: Two distinct approaches are presented. In **b**, a multi-agent system based on a pre-programmed sequence of interactions between agents ensures consistency and reliability. In **c**, a fully automated, flexible multi-agent framework adapts dynamically to the evolving research context. Both systems leverage a sampled path within a global knowledge graph as context to guide the research idea generation process. Each agent plays a specialized role: the **Ontologist** defines key concepts and relationships, **Scientist 1** crafts a detailed research proposal, **Scientist 2** expands and refines the proposal, and the **Critic agent** conducts a thorough review and suggests improvements. In the second approach, a **Planner** develops a detailed plan, and the **Assistant** checks the novelty of the generated research hypotheses.
17 | This collaborative framework enables the generation of innovative and well-rounded scientific hypotheses that extend beyond conventional human-driven methods.
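The pre-programmed sequence in panel b amounts to a fixed pipeline. A minimal runnable sketch of that control flow, with stub functions standing in for the actual agents defined in `ScienceDiscovery/agents.py` (all names and strings here are illustrative only):
```
# Schematic of the fixed agent sequence in panel b; stubs replace real agents.
def ontologist(path): return f"definitions for: {path}"
def scientist_1(defs): return f"proposal from: {defs}"
def scientist_2(proposal): return f"expanded: {proposal}"
def critic(draft): return f"review of: {draft}"

path = "silk -- relates to -- energy-intensive"   # sampled subgraph context
print(critic(scientist_2(scientist_1(ontologist(path)))))
```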
18 | 
19 | ![silk_energy_results](https://github.com/user-attachments/assets/19c5e9d9-d6d1-4d9b-9a66-8bda742c7579)
20 | 
21 | Figure 2: Results from our multi-agent model, illustrating a novel research hypothesis based on a knowledge
22 | graph connecting the keywords “silk” and “energy-intensive”, as an example. This visual overview shows that the
23 | system produces detailed, well-organized documentation of research development with multiple pages and detailed text
24 | (the example shown here includes 8,100 words).
25 | 
26 | ### Codes
27 | This repository contains code for generating novel research ideas in the field of bio-inspired materials.
28 | 
29 | The notebook files ```SciAgents_ScienceDiscovery_GraphReasoning_non-automated.ipynb``` and ```SciAgents_ScienceDiscovery_GraphReasoning_automated.ipynb``` in the Notebooks directory correspond to the non-automated and automated multi-agent frameworks, respectively, as explained in the accompanying paper.
30 | 
31 | The automated multi-agent model is implemented with [AG2](https://github.com/ag2ai/ag2?tab=readme-ov-file) (formerly AutoGen), an open-source ecosystem for agent-based AI modeling.
32 | This project is also collected in [Build with AG2](https://github.com/ag2ai/build-with-ag2), where you can check out more projects built with AG2.
33 | 
34 | ### Audio file generation (podcast style, lecture, summary and others)
35 | 
36 | Please see: [lamm-mit/PDF2Audio](https://github.com/lamm-mit/PDF2Audio) or use the version at 🤗 Hugging Face Spaces [lamm-mit/PDF2Audio](https://huggingface.co/spaces/lamm-mit/PDF2Audio).
37 | 
38 | ### Example
39 | https://github.com/user-attachments/assets/d5a972f8-5308-4e42-b7dc-d68ba84e2140
40 | 
41 | 
42 | ### Requirements
43 | 
44 | You need to install the GraphReasoning package, as described below. Further, (a) OpenAI and (b) Semantic Scholar API keys are required to run the code.
45 | 
46 | #### Graph Reasoning installation
47 | 
48 | Install directly from GitHub:
49 | ```
50 | pip install git+https://github.com/lamm-mit/GraphReasoning
51 | ```
52 | Or, as an editable install:
53 | ```
54 | pip install -e git+https://github.com/lamm-mit/GraphReasoning.git#egg=GraphReasoning
55 | ```
56 | You may need wkhtmltopdf:
57 | ```
58 | sudo apt-get install wkhtmltopdf
59 | ```
60 | #### Graph file:
61 | ```
62 | from huggingface_hub import hf_hub_download
63 | graph_name='large_graph_simple_giant.graphml'
64 | filename = f"{graph_name}"
65 | file_path = hf_hub_download(repo_id='lamm-mit/bio-graph-1K', filename=filename, local_dir='./graph_giant_component')
66 | ```
67 | 
68 | #### Embeddings:
69 | ```
70 | from huggingface_hub import hf_hub_download
71 | embedding_name='embeddings_simple_giant_ge-large-en-v1.5.pkl'
72 | filename = f"{embedding_name}"
73 | file_path = hf_hub_download(repo_id='lamm-mit/bio-graph-1K', filename=filename, local_dir='./graph_giant_component')
74 | ```
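The two download snippets above can be combined with the loading logic used in `ScienceDiscovery/graph.py`; a sketch, assuming the `GraphReasoning` package is installed and exports the loader functions used there:
```
from huggingface_hub import hf_hub_download
from GraphReasoning import load_graph_with_text_as_JSON, load_embeddings

data_dir = './graph_giant_component'
for fname in ['large_graph_simple_giant.graphml',
              'embeddings_simple_giant_ge-large-en-v1.5.pkl']:
    hf_hub_download(repo_id='lamm-mit/bio-graph-1K', filename=fname, local_dir=data_dir)

G = load_graph_with_text_as_JSON(data_dir=f'{data_dir}/', graph_name='large_graph_simple_giant.graphml')
node_embeddings = load_embeddings(f'{data_dir}/embeddings_simple_giant_ge-large-en-v1.5.pkl')
```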
75 | 
76 | ### Additional background
77 | 
78 | ![Fig_2](https://github.com/user-attachments/assets/88f6a9f3-77b5-4b9c-ad7a-73e4b0841f0b)
79 | 
80 | Figure 3. Overview of the entire process from initial keyword selection to the final document, following a hierarchical expansion strategy where answers are successively refined and improved, enriched with retrieved data, critiqued, and amended through the identification of critical modeling, simulation, and experimental tasks. The process begins with initial keyword identification or random exploration within a graph, followed by path sampling to create a subgraph of relevant concepts and relationships. This subgraph forms the basis for generating structured output in JSON, including the hypothesis, outcome, mechanisms, design principles, unexpected properties, comparison, and novelty. Each component is subsequently expanded through individual prompting to yield a significant amount of additional detail, forming a comprehensive draft. This draft then undergoes a critical review process, including amendments for modeling and simulation priorities (e.g., molecular dynamics) and experimental priorities (e.g., synthetic biology). The final integrated draft, along with critical analyses, results in a document that guides further scientific inquiry.
81 | 
82 | ![Fig_3](https://github.com/user-attachments/assets/c356a6da-7218-42d0-b0f2-966193436f4c)
83 | 
84 | 
85 | Figure 4. SciAgents presents a framework for generative materials informatics, showcasing the iterative process of ideation and reasoning driven by input data, questions, and context. The cycle of ideation and reasoning leads to predictive outcomes, offering insights into new material designs and properties. The visual elements on the edges represent various data modalities such as images, documents, scientific data, DNA sequences, video content, and microscopy, illustrating the diverse sources of information feeding into this process.
86 | 
87 | ![image](https://github.com/user-attachments/assets/c11b7448-2c7b-43ae-89f2-f0e8ecac6849)
88 | 
89 | Figure 5. Visualization of the ontological knowledge graph (left: whole graph, right: sub-graph) that organizes information.
90 | 
91 | ### Original papers
92 | 
93 | Please cite this work as:
94 | ```
95 | @article{ghafarollahi2024sciagents,
96 |   title={SciAgents: Automating Scientific Discovery Through Bioinspired Multi-Agent Intelligent Graph Reasoning},
97 |   author={Ghafarollahi, Alireza and Buehler, Markus J},
98 |   journal={Advanced Materials},
99 |   pages={2413523},
100 |   year={2024},
101 |   publisher={Wiley Online Library}
102 | }
103 | 
104 | @article{buehler2024graphreasoning,
105 |   author={Markus J. Buehler},
106 |   title={Accelerating Scientific Discovery with Generative Knowledge Extraction, Graph-Based Representation, and Multimodal Intelligent Graph Reasoning},
107 |   journal={Machine Learning: Science and Technology},
108 |   year={2024},
109 |   url={http://iopscience.iop.org/article/10.1088/2632-2153/ad7228},
110 | }
111 | ```
112 | 

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Apache License
 2 | Version 2.0, January 2004
 3 | http://www.apache.org/licenses/
 4 | 
 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 6 | 
 7 | 1. Definitions.
 8 | 
 9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /ScienceDiscovery/agents.py: -------------------------------------------------------------------------------- 1 | from ScienceDiscovery.utils import * 2 | from ScienceDiscovery.llm_config import * 3 | from ScienceDiscovery.graph import * 4 | 5 | 6 | from typing import Union 7 | import autogen 8 | from autogen import AssistantAgent 9 | from autogen.agentchat.contrib.img_utils import get_pil_image, pil_to_data_uri 10 | from autogen import register_function 11 | from autogen import ConversableAgent 12 | from typing import Dict, List 13 | from typing import Annotated, TypedDict 14 | from autogen import Agent 15 | 16 | user = autogen.UserProxyAgent( 17 | name="user", 18 | is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"), 19 | human_input_mode="ALWAYS", 20 | system_message="user. You are a human admin. You pose the task.", 21 | llm_config=False, 22 | code_execution_config=False, 23 | ) 24 | 25 | planner = AssistantAgent( 26 | name="planner", 27 | system_message = '''Planner. You are a helpful AI assistant. Your task is to suggest a comprehensive plan to solve a given task. 28 | 29 | Explain the Plan: Begin by providing a clear overview of the plan. 30 | Break Down the Plan: For each part of the plan, explain the reasoning behind it, and describe the specific actions that need to be taken. 31 | No Execution: Your role is strictly to suggest the plan. Do not take any actions to execute it. 32 | No Tool Call: If tool call is required, you must include the name of the tool and the agent who calls it in the plan. However, you are not allowed to call any Tool or function yourself. 33 | 34 | ''', 35 | llm_config=gpt4turbo_config, 36 | description='Who can suggest a step-by-step plan to solve the task by breaking down the task into simpler sub-tasks.', 37 | ) 38 | 39 | assistant = AssistantAgent( 40 | name="assistant", 41 | system_message = '''You are a helpful AI assistant. 42 | 43 | Your role is to call the appropriate tools and functions as suggested in the plan. You act as an intermediary between the planner's suggested plan and the execution of specific tasks using the available tools. You ensure that the correct parameters are passed to each tool and that the results are accurately reported back to the team. 44 | 45 | Return "TERMINATE" in the end when the task is over. 46 | ''', 47 | llm_config=gpt4turbo_config, 48 | description='''An assistant who calls the tools and functions as needed and returns the results. Tools include "rate_novelty_feasibility" and "generate_path".''', 49 | ) 50 | 51 | 52 | ontologist = AssistantAgent( 53 | name="ontologist", 54 | system_message = '''ontologist. You must follow the plan from planner. You are a sophisticated ontologist. 55 | 56 | Given some key concepts extracted from a comprehensive knowledge graph, your task is to define each one of the terms and discuss the relationships identified in the graph. 57 | 58 | The format of the knowledge graph is "node_1 -- relationship between node_1 and node_2 -- node_2 -- relationship between node_2 and node_3 -- node_3...." 59 | 60 | Make sure to incorporate EACH of the concepts in the knowledge graph in your response. 61 | 62 | Do not add any introductory phrases. First, define each term in the knowledge graph and then, secondly, discuss each of the relationships, with context. 
63 | 
64 | Here is an example structure for your response, in the following format:
65 | 
66 | {{
67 | ### Definitions:
68 | A clear definition of each term in the knowledge graph.
69 | ### Relationships
70 | A thorough discussion of all the relationships in the graph.
71 | }}
72 | 
73 | Further Instructions:
74 | Perform only the tasks assigned to you in the plan; do not undertake tasks assigned to other agents. Additionally, do not execute any functions or tools.
75 | ''',
76 |     llm_config=gpt4turbo_config,
77 |     description='I can define each of the terms and discuss the relationships in the path.',
78 | )
79 | 
80 | 
81 | scientist = AssistantAgent(
82 |     name="scientist",
83 |     system_message = '''scientist. You must follow the plan from the planner.
84 | 
85 | You are a sophisticated scientist trained in scientific research and innovation.
86 | 
87 | Given the definitions and relationships acquired from a comprehensive knowledge graph, your task is to synthesize a novel research proposal with these initial key aspects: hypothesis, outcome, mechanisms, design_principles, unexpected_properties, comparison, and novelty. Your response should not only demonstrate deep understanding and rational thinking but also explore imaginative and unconventional applications of these concepts.
88 | 
89 | Analyze the graph deeply and carefully, then craft a detailed research proposal that investigates a likely groundbreaking aspect that incorporates EACH of the concepts and relationships identified in the knowledge graph by the ontologist.
90 | 
91 | Consider the implications of your proposal and predict the outcome or behavior that might result from this line of investigation. Your creativity in linking these concepts to address unsolved problems or propose new, unexplored areas of study, emergent or unexpected behaviors, will be highly valued.
92 | 
93 | Be as quantitative as possible and include details such as numbers, sequences, or chemical formulas.
94 | 
95 | Your response should include the following SEVEN keys in great detail:
96 | 
97 | "hypothesis" clearly delineates the hypothesis that forms the basis of the proposed research question. The hypothesis should be well-defined, novel, and feasible, with a clear purpose and components. Your hypothesis should be as detailed as possible.
98 | 
99 | "outcome" describes the expected findings or impact of the research. Be quantitative and include numbers, material properties, sequences, or chemical formulas.
100 | 
101 | "mechanisms" provides details about anticipated chemical, biological or physical behaviors. Be as specific as possible, across all scales from molecular to macroscale.
102 | 
103 | "design_principles" should list out detailed design principles, focused on novel concepts, and include a high level of detail. Be creative and give this a lot of thought, and be exhaustive in your response.
104 | 
105 | "unexpected_properties" should predict unexpected properties of the new material or system. Include specific predictions, and explain the rationale behind these clearly using logic and reasoning. Think carefully.
106 | 
107 | "comparison" should provide a detailed comparison with other materials, technologies or scientific concepts. Be detailed and quantitative.
108 | 
109 | "novelty" should discuss novel aspects of the proposed idea, specifically highlighting how this advances over existing knowledge and technology.
110 | 
111 | Ensure your scientific proposal is both innovative and grounded in logical reasoning, capable of advancing our understanding or application of the concepts provided.
112 | 
113 | Here is an example structure for your response, in the following order:
114 | 
115 | {{
116 |   "1- hypothesis": "...",
117 |   "2- outcome": "...",
118 |   "3- mechanisms": "...",
119 |   "4- design_principles": "...",
120 |   "5- unexpected_properties": "...",
121 |   "6- comparison": "...",
122 |   "7- novelty": "...",
123 | }}
124 | 
125 | Remember, the value of your response lies in scientific discovery, new avenues of scientific inquiry, and potential technological breakthroughs, with detailed and solid reasoning.
126 | 
127 | Further Instructions:
128 | Make sure to incorporate EACH of the concepts in the knowledge graph in your response.
129 | Perform only the tasks assigned to you in the plan; do not undertake tasks assigned to other agents.
130 | Additionally, do not execute any functions or tools.
131 | ''',
132 |     llm_config=gpt4turbo_config_graph,
133 |     description='I can craft the research proposal with key aspects based on the definitions and relationships acquired by the ontologist. I am **ONLY** allowed to speak after `Ontologist`',
134 | )
135 | 
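The seven-key JSON structure requested above is what the utilities in `ScienceDiscovery/utils.py` post-process. A sketch of that round-trip on a stubbed reply (note that `json_to_formatted_text` expects plain keys such as `hypothesis`, without the `1- ` numbering prefixes used in the prompt):
```
import json
from ScienceDiscovery.utils import convert_response_to_JSON, json_to_formatted_text

# Stubbed scientist reply; real replies contain long, detailed values.
reply = ('Here is the proposal: {"hypothesis": "...", "outcome": "...", '
         '"mechanisms": "...", "design_principles": ["..."], '
         '"unexpected_properties": "...", "comparison": "...", "novelty": "..."}')

clean = convert_response_to_JSON(reply)          # extracts the {...} span and re-serializes it
print(json_to_formatted_text(json.loads(clean)))  # markdown-formatted proposal text
```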
136 | 
137 | hypothesis_agent = AssistantAgent(
138 |     name="hypothesis_agent",
139 |     system_message = '''hypothesis_agent. Carefully expand on the ```{hypothesis}``` of the research proposal.
140 | 
141 | Critically assess the original content and improve on it. \
142 | Add more specifics, quantitative scientific information (such as chemical formulas, numbers, sequences, processing conditions, microstructures, etc.), \
143 | rationale, and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques, experimental methods, or particular analyses.
144 | 
145 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science of the following:
146 | 
147 | <hypothesis>
148 | where <hypothesis> is the hypothesis aspect of the research proposal.
149 | 
150 | Do not add any introductory phrases. Begin your response directly with a heading: ### Expanded ...
151 | ''',
152 |     llm_config=gpt4o_config_graph,
153 |     description='''I can expand the "hypothesis" aspect of the research proposal crafted by the "scientist".''',
154 | )
155 | 
156 | 
157 | outcome_agent = AssistantAgent(
158 |     name="outcome_agent",
159 |     system_message = '''outcome_agent. Carefully expand on the ```{outcome}``` of the research proposal developed by the scientist.
160 | 
161 | Critically assess the original content and improve on it. \
162 | Add more specifics, quantitative scientific information (such as chemical formulas, numbers, sequences, processing conditions, microstructures, etc.), \
163 | rationale, and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques, experimental methods, or particular analyses.
164 | 
165 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science of the following:
166 | 
167 | <outcome>
168 | where <outcome> is the outcome aspect of the research proposal.
169 | 
170 | Do not add any introductory phrases. Begin your response directly with a heading: ### Expanded ...
171 | ''',
172 |     llm_config=gpt4o_config_graph,
173 |     description='''I can expand the "outcome" aspect of the research proposal crafted by the "scientist".''',
174 | )
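The five remaining expansion agents below follow exactly the same template, differing only in the aspect they expand. Purely as an illustration of that pattern, they could be generated in a loop; the `make_expansion_agent` helper here is hypothetical and not part of the repository, which defines each agent explicitly:
```
# Hypothetical refactoring sketch (not repository code).
def make_expansion_agent(aspect: str) -> AssistantAgent:
    return AssistantAgent(
        name=f"{aspect}_agent",
        system_message=(
            f"{aspect}_agent. Carefully expand on this particular aspect: "
            f"{{{aspect}}} of the research proposal. "
            "Critically assess the original content and improve on it. ..."  # abbreviated
        ),
        llm_config=gpt4o_config_graph,
        description=f'I can expand the "{aspect}" aspect of the research proposal crafted by the "scientist".',
    )

expansion_agents = [make_expansion_agent(a) for a in
                    ("mechanism", "design_principles", "unexpected_properties",
                     "comparison", "novelty")]
```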
175 | 
176 | mechanism_agent = AssistantAgent(
177 |     name="mechanism_agent",
178 |     system_message = '''mechanism_agent. Carefully expand on this particular aspect: ```{mechanism}``` of the research proposal.
179 | 
180 | Critically assess the original content and improve on it. \
181 | Add more specifics, quantitative scientific information (such as chemical formulas, numbers, sequences, processing conditions, microstructures, etc.), \
182 | rationale, and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques, experimental methods, or particular analyses.
183 | 
184 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science of the following:
185 | 
186 | <mechanism>
187 | where <mechanism> is the mechanism aspect of the research proposal.
188 | 
189 | Do not add any introductory phrases. Begin your response directly with a heading: ### Expanded ...
190 | ''',
191 |     llm_config=gpt4o_config_graph,
192 |     description='''I can expand the "mechanism" aspect of the research proposal crafted by the "scientist"''',
193 | )
194 | 
195 | design_principles_agent = AssistantAgent(
196 |     name="design_principles_agent",
197 |     system_message = '''design_principles_agent. Carefully expand on this particular aspect: ```{design_principles}``` of the research proposal.
198 | 
199 | Critically assess the original content and improve on it. \
200 | Add more specifics, quantitative scientific information (such as chemical formulas, numbers, sequences, processing conditions, microstructures, etc.), \
201 | rationale, and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques, experimental methods, or particular analyses.
202 | 
203 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science of the following:
204 | 
205 | <design_principles>
206 | where <design_principles> is the design_principles aspect of the research proposal.
207 | 
208 | Do not add any introductory phrases. Begin your response directly with a heading: ### Expanded ...
209 | ''',
210 |     llm_config=gpt4o_config_graph,
211 |     description='''I can expand the "design_principle" aspect of the research proposal crafted by the "scientist".''',
212 | )
213 | 
214 | unexpected_properties_agent = AssistantAgent(
215 |     name="unexpected_properties_agent",
216 |     system_message = '''unexpected_properties_agent. Carefully expand on this particular aspect: ```{unexpected_properties}``` of the research proposal.
217 | 
218 | Critically assess the original content and improve on it. \
219 | Add more specifics, quantitative scientific information (such as chemical formulas, numbers, sequences, processing conditions, microstructures, etc.), \
220 | rationale, and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques, experimental methods, or particular analyses.
221 | 
222 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science of the following:
223 | 
224 | <unexpected_properties>
225 | where <unexpected_properties> is the unexpected_properties aspect of the research proposal.
226 | 
227 | Do not add any introductory phrases. Begin your response directly with a heading: ### Expanded ...
228 | ''',
229 |     llm_config=gpt4o_config_graph,
230 |     description='''I can expand the "unexpected_properties" aspect of the research proposal crafted by the "scientist".''',
231 | )
232 | 
233 | comparison_agent = AssistantAgent(
234 |     name="comparison_agent",
235 |     system_message = '''comparison_agent. Carefully expand on this particular aspect: ```{comparison}``` of the research proposal.
236 | 
237 | Critically assess the original content and improve on it. \
238 | Add more specifics, quantitative scientific information (such as chemical formulas, numbers, sequences, processing conditions, microstructures, etc.), \
239 | rationale, and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques, experimental methods, or particular analyses.
240 | 
241 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science of the following:
242 | 
243 | <comparison>
244 | where <comparison> is the comparison aspect of the research proposal.
245 | 
246 | Do not add any introductory phrases. Begin your response directly with a heading: ### Expanded ...
247 | ''',
248 |     llm_config=gpt4o_config_graph,
249 |     description='''I can expand the "comparison" aspect of the research proposal crafted by the "scientist".''',
250 | )
251 | 
252 | novelty_agent = AssistantAgent(
253 |     name="novelty_agent",
254 |     system_message = '''novelty_agent. Carefully expand on this particular aspect: ```{novelty}``` of the research proposal.
255 | 
256 | Critically assess the original content and improve on it. \
257 | Add more specifics, quantitative scientific information (such as chemical formulas, numbers, sequences, processing conditions, microstructures, etc.), \
258 | rationale, and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques, experimental methods, or particular analyses.
259 | 
260 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science of the following:
261 | 
262 | <novelty>
263 | where <novelty> is the novelty aspect of the research proposal.
264 | 
265 | Do not add any introductory phrases. Begin your response directly with a heading: ### Expanded ...
266 | ''',
267 |     llm_config=gpt4o_config_graph,
268 |     description='''I can expand the "novelty" aspect of the research proposal crafted by the "scientist".''',
269 | )
270 | 
271 | critic_agent = AssistantAgent(
272 |     name="critic_agent",
273 |     system_message = '''critic_agent. You are a helpful AI agent who provides accurate, detailed and valuable responses.
274 | 
275 | You read the whole proposal with all its details and expanded aspects and provide:
276 | 
277 | (1) a summary of the document (in one paragraph, but including sufficient detail such as mechanisms, \
278 | related technologies, models and experiments, methods to be used, and so on), \
279 | 
280 | (2) a thorough critical scientific review with strengths and weaknesses, and suggested improvements. Include logical reasoning and scientific approaches.
281 | 
282 | Next, from within this document,
283 | 
284 | (1) identify the single most impactful scientific question that can be tackled with molecular modeling. \
285 | \n\nOutline key steps to set up and conduct such modeling and simulation, with details and include unique aspects of the planned work.
286 | 
287 | (2) identify the single most impactful scientific question that can be tackled with synthetic biology. \
288 | \n\nOutline key steps to set up and conduct such experimental work, with details and include unique aspects of the planned work.
289 | 
290 | Important Note:
291 | ***You do not rate Novelty and Feasibility. You are not to rate the novelty and feasibility.***
292 | ''',
293 |     llm_config=gpt4o_config_graph,
294 |     description='''I can summarize, critique, and suggest improvements after all seven aspects of the proposal have been expanded by the agents.''',
295 | )
296 | 
297 | 
298 | novelty_assistant = autogen.AssistantAgent(
299 |     name="novelty_assistant",
300 |     system_message = '''You are a critical AI assistant collaborating with a group of scientists to assess the potential impact of a research proposal. Your primary task is to evaluate a proposed research hypothesis for its novelty and feasibility, ensuring it does not overlap significantly with existing literature or delve into areas that are already well-explored.
301 | 
302 | You will have access to the Semantic Scholar API, which you can use to survey relevant literature and retrieve the top 10 results for any search query, along with their abstracts. Based on this information, you will critically assess the idea, rating its novelty and feasibility on a scale from 1 to 10 (with 1 being the lowest and 10 the highest).
303 | 
304 | Your goal is to be a stringent evaluator, especially regarding novelty. Only ideas with a sufficient contribution that could justify a new conference or peer-reviewed research paper should pass your scrutiny.
305 | 
306 | After careful analysis, return your estimations for the novelty and feasibility rates.
307 | 
308 | If the tool call was not successful, please re-call the tool until you get a valid response.
309 | 
310 | After the evaluation, conclude with a recommendation and end the conversation by stating "TERMINATE".''',
311 |     llm_config=gpt4turbo_config,
312 | )
313 | 
314 | # create a UserProxyAgent instance named "novelty_admin"
315 | novelty_admin = autogen.UserProxyAgent(
316 |     name="novelty_admin",
317 |     human_input_mode="NEVER",
318 |     max_consecutive_auto_reply=10,
319 |     is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"),
320 |     code_execution_config=False,  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
321 |     llm_config=False,
322 | )
323 | 
324 | @novelty_admin.register_for_execution()
325 | @novelty_assistant.register_for_llm(description='''This function is designed to search for academic papers using the Semantic Scholar API based on a specified query.
326 | The query should be constructed with relevant keywords separated by "+". ''')
327 | def response_to_query(query: Annotated[str, '''the query for the paper search.
The query must consist of relevant keywords separated by +''']) -> str:
328 |     import requests  # imported locally; utils.py does not import requests at module level
329 |     # Define the API endpoint URL
330 |     url = 'https://api.semanticscholar.org/graph/v1/paper/search'
331 | 
332 |     # Query parameters: 'query' must be the plain string, not a set
333 |     query_params = {
334 |         'query': query,
335 |         'fields': 'title,abstract,openAccessPdf,url'
336 |     }
337 | 
338 |     # The API key is read from the environment (set SEMANTIC_SCHOLAR_API_KEY);
339 |     # handle API keys securely in production environments
340 |     # Define headers with API key
341 |     api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
342 |     headers = {'x-api-key': api_key}
343 | 
344 |     # Send the API request
345 |     response = requests.get(url, params=query_params, headers=headers)
346 | 
347 |     # Check response status
348 |     if response.status_code == 200:
349 |         response_data = response.json()
350 |         # Process the response data as needed
351 |     else:
352 |         response_data = f"Request failed with status code {response.status_code}: {response.text}"
353 | 
354 |     return response_data
355 | 
356 | @user.register_for_execution()
357 | @planner.register_for_llm()
358 | @assistant.register_for_llm(description='''This function can be used to create a knowledge path. The function either takes two keywords as input or assigns them randomly, and returns a path between these nodes.
359 | The path contains several concepts (nodes) and the relationships between them (edges).
360 | Do not use this function if the path is already provided. If neither the path nor the keywords are provided, select None for the keywords so that a path will be generated between randomly selected nodes.''')
361 | def generate_path(keyword_1: Annotated[Union[str, None], 'the first node in the knowledge graph. None for random selection.'],
362 |                   keyword_2: Annotated[Union[str, None], 'the second node in the knowledge graph. None for random selection.'],
363 |                   ) -> str:
364 | 
365 |     path_list_for_vis, path_list_for_vis_string = create_path(G, embedding_tokenizer,
366 |                                     embedding_model, node_embeddings, generate_graph_expansion=None,
367 |                                     randomness_factor=0.2, num_random_waypoints=4, shortest_path=False,
368 |                                     second_hop=False, data_dir='./', save_files=False, verbatim=True,
369 |                                     keyword_1=keyword_1, keyword_2=keyword_2,)
370 | 
371 |     return path_list_for_vis_string
372 | 
373 | @user.register_for_execution()
374 | @planner.register_for_llm()
375 | @assistant.register_for_llm(description='''Use this function to rate the novelty and feasibility of a research idea against the literature. The function uses Semantic Scholar to access the literature articles.
376 | The function will return the novelty and feasibility rate from 1 to 10 (lowest to highest). The input to the function is the hypothesis with its details.''')
377 | def rate_novelty_feasibility(hypothesis: Annotated[str, 'the research hypothesis.']) -> str:
378 |     res = novelty_admin.initiate_chat(
379 |         novelty_assistant,
380 |         clear_history=True,
381 |         silent=False,
382 |         max_turns=10,
383 |         message=f'''Rate the following research hypothesis\n\n{hypothesis}. \n\nCall the function three times at most, but not in parallel. Wait for the results before calling the next function. ''',
384 |         summary_method="reflection_with_llm",
385 |         summary_args={"summary_prompt": "Return all the results of the analysis as is."}
386 |     )
387 | 
388 |     return res.summary
389 | 
390 | 
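Both registered tools can also be exercised directly, outside the group chat, as a quick sanity check. A sketch, assuming the graph, tokenizer, and embeddings in `graph.py` loaded successfully (the keywords echo the paper's silk case study; the hypothesis string is a placeholder):
```
# Illustrative direct calls to the registered tools (bypassing the agents)
path_str = generate_path(keyword_1='silk', keyword_2='energy-intensive')
print(path_str)

summary = rate_novelty_feasibility(hypothesis="<full hypothesis text here>")
print(summary)
```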
''', 384 | summary_method="reflection_with_llm", 385 | summary_args={"summary_prompt" : "Return all the results of the analysis as is."} 386 | ) 387 | 388 | return res.summary 389 | 390 | 391 | planner.reset() 392 | assistant.reset() 393 | ontologist.reset() 394 | scientist.reset() 395 | critic_agent.reset() 396 | 397 | 398 | groupchat = autogen.GroupChat( 399 | agents=[user, planner, assistant, ontologist, scientist, 400 | hypothesis_agent, outcome_agent, mechanism_agent, design_principles_agent, unexpected_properties_agent, comparison_agent, novelty_agent, critic_agent, #sequence_retriever, 401 | ], messages=[], max_round=50, admin_name='user', send_introductions=True, allow_repeat_speaker=True, 402 | speaker_selection_method='auto', 403 | ) 404 | 405 | manager = autogen.GroupChatManager(groupchat=groupchat, 406 | llm_config=gpt4turbo_config, 407 | system_message='You dynamically select a speaker.') -------------------------------------------------------------------------------- /ScienceDiscovery/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0" 3 | 4 | # Markdown and PDF Handling 5 | import markdown 6 | import markdown2 7 | from weasyprint import HTML 8 | import pdfkit 9 | 10 | # Utility and System Libraries 11 | import random 12 | import re 13 | import uuid 14 | import time 15 | import glob 16 | from datetime import datetime 17 | from copy import deepcopy 18 | from pathlib import Path 19 | 20 | # Progress Bars 21 | # (use the notebook-aware tqdm under IPython/Jupyter, the console version otherwise) 22 | try: 23 | get_ipython # only defined when running under IPython 24 | from tqdm.notebook import tqdm 25 | except NameError: 26 | from tqdm import tqdm 27 | 28 | # Visualization Libraries 29 | import matplotlib.pyplot as plt 30 | import seaborn as sns # For more attractive plotting 31 | sns.set_palette("hls") 32 | 33 | # PyVis for Graph Visualization 34 | from pyvis.network import Network 35 | 36 | # IPython Display 37 | from IPython.display import display, Markdown 38 | 39 | # Data Processing Libraries 40 | import pandas as pd 41 | import numpy as np 42 | 43 | # Machine Learning and AI 44 | import torch 45 | from transformers import AutoTokenizer, AutoModel 46 | from scipy.spatial.distance import cosine 47 | from sklearn.decomposition import PCA 48 | from sklearn.cluster import KMeans 49 | 50 | # Graph Libraries 51 | import networkx as nx 52 | 53 | # LangChain Document Loaders and Splitters 54 | from langchain.document_loaders import ( 55 | PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader, 56 | PyPDFDirectoryLoader, DirectoryLoader 57 | ) 58 | from langchain.text_splitter import RecursiveCharacterTextSplitter 59 | 60 | # Custom Graph Reasoning Module 61 | from GraphReasoning import * 62 | 63 | # JSON Handling 64 | import json 65 | 66 | from functools import partial 67 | 68 | 69 | 70 | def markdown_to_pdf(markdown_text, output_pdf_path): 71 | """ 72 | Convert a Markdown string to a PDF file using markdown2 and pdfkit. 73 | 74 | Args: 75 | markdown_text (str): The Markdown text to convert. 76 | output_pdf_path (str): The path where the output PDF should be saved.
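Returns:
    str: Path of the generated PDF. A timestamp is appended to the requested
        name, and the raw Markdown is also saved to a matching .md file.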
77 | """ 78 | # Convert Markdown to HTML 79 | html_content = markdown2.markdown(markdown_text) 80 | 81 | # Define CSS for smaller font size 82 | css = """ 83 | 88 | """ 89 | 90 | # Combine CSS and HTML content 91 | full_html = f"{css}{html_content}" 92 | 93 | # Convert HTML to PDF 94 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 95 | output_md_path = f"{output_pdf_path}_{timestamp}.md" 96 | output_pdf_path = f"{output_pdf_path}_{timestamp}.pdf" 97 | 98 | # Save the Markdown text to a .md file 99 | with open(output_md_path, 'w') as md_file: 100 | md_file.write(markdown_text) 101 | 102 | pdfkit.from_string(full_html, output_pdf_path) 103 | 104 | return output_pdf_path 105 | 106 | 107 | def convert_response_to_JSON (text_with_json): 108 | match = re.search(r"\{.*\}", text_with_json, re.DOTALL) 109 | if match: 110 | json_str = match.group(0) # This is the extracted JSON string 111 | 112 | # Step 2: Parse the JSON string into a dictionary (also performs a cleanup) 113 | json_obj = json.loads(json_str) 114 | 115 | # Step 3: Convert the dictionary back into a JSON-formatted string 116 | cleaned_json_str = json.dumps(json_obj, ensure_ascii=False) 117 | 118 | #print("JSONL file created with the extracted JSON.") 119 | else: 120 | print("No JSON content found.") 121 | cleaned_json_str='' 122 | return cleaned_json_str 123 | 124 | 125 | def json_to_formatted_text(json_data): 126 | formatted_text = "" 127 | 128 | formatted_text += f"### Hypothesis\n{json_data['hypothesis']}\n\n" 129 | formatted_text += f"### Outcome\n{json_data['outcome']}\n\n" 130 | formatted_text += f"### Mechanisms\n{json_data['mechanisms']}\n\n" 131 | 132 | formatted_text += "### Design Principles\n" 133 | 134 | design_principles_list=json_data['design_principles'] 135 | 136 | if isinstance(design_principles_list, list): 137 | for principle in design_principles_list: 138 | formatted_text += f"- {principle}\n" 139 | else: 140 | formatted_text += f"- {design_principles_list}\n" 141 | 142 | formatted_text += "\n" 143 | 144 | formatted_text += f"### Unexpected Properties\n{json_data['unexpected_properties']}\n\n" 145 | formatted_text += f"### Comparison\n{json_data['comparison']}\n\n" 146 | formatted_text += f"### Novelty\n{json_data['novelty']}\n" 147 | 148 | return formatted_text 149 | 150 | 151 | 152 | 153 | def create_path(G, embedding_tokenizer, embedding_model, node_embeddings, 154 | generate_graph_expansion=None, 155 | second_hop=False, data_dir='./', save_files=False, verbatim=False, 156 | keyword_1 = None, keyword_2 = None, 157 | shortest_path=True, #if set to False, do NOT use shortest path but sample a random path 158 | top_k=5, #for random walk, if shortest_path=False 159 | randomness_factor=0, 160 | num_random_waypoints=0, 161 | ): 162 | 163 | if keyword_1==None or keyword_2==None: 164 | # Randomly pick two distinct nodes 165 | random_nodes = random.sample(list(G.nodes()), 2) 166 | 167 | if keyword_1==None: 168 | keyword_1 = random_nodes[0] 169 | if keyword_2==None: 170 | keyword_2 = random_nodes[1] 171 | 172 | if verbatim: 173 | print("Randomly selected nodes:", keyword_1, "and", keyword_2) 174 | 175 | print(">>> Selected nodes:", keyword_1, "and", keyword_2) 176 | ''' 177 | try: 178 | keyword_1=keyword_1[0] 179 | except: 180 | keyword_1=keyword_1 181 | try: 182 | keyword_2 =keyword_2[0] 183 | except: 184 | keyword_2 =keyword_2 185 | ''' 186 | if shortest_path: 187 | (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path, path_graph, shortest_path_length, fname, graph_GraphML=find_path( G, 
node_embeddings, 188 | embedding_tokenizer, embedding_model , second_hop=False, 189 | data_dir=data_dir, save_files=False, 190 | keyword_1 = keyword_1, keyword_2 = keyword_2, ) 191 | 192 | else: #random path 193 | print ("Random walk to get path:", keyword_1, "and", keyword_2) 194 | 195 | if randomness_factor>0 or num_random_waypoints>0: 196 | path, path_graph, shortest_path_length, _, _= heuristic_path_with_embeddings_with_randomization_waypoints( 197 | G, 198 | embedding_tokenizer, 199 | embedding_model, 200 | keyword_1, 201 | keyword_2, 202 | node_embeddings, 203 | top_k=5, 204 | #perturbation_factor=0.1, 205 | second_hop=False, 206 | data_dir=data_dir, 207 | verbatim=True, 208 | save_files=False, 209 | randomness_factor=randomness_factor, 210 | num_random_waypoints=num_random_waypoints, 211 | ) 212 | 213 | else: 214 | path, path_graph, shortest_path_length, _, _ = heuristic_path_with_embeddings(G, embedding_tokenizer, embedding_model, 215 | keyword_1, keyword_2, 216 | node_embeddings, top_k=top_k, 217 | second_hop=False,data_dir=data_dir, 218 | verbatim=verbatim, 219 | save_files=save_files) 220 | 221 | 222 | print ("Done random walk to get path") 223 | 224 | print("Path:", path) 225 | 226 | path_list_for_vis, path_list_for_vis_string=path_list=print_path_with_edges_as_list(G, path, keywords_separator=' -- ') 227 | print ( path_list_for_vis_string ) 228 | 229 | return path_list_for_vis, path_list_for_vis_string 230 | 231 | 232 | 233 | 234 | 235 | def develop_qa_over_path (G, embedding_tokenizer, embedding_model,node_embeddings, 236 | generate, generate_graph_expansion=None, 237 | second_hop=False, data_dir='./', save_files=False, verbatim=False, 238 | keyword_1 = None, keyword_2 = None, 239 | shortest_path=True, #if set to False, do NOT use shortest path but sample a random path 240 | top_k=5, #for random walk, if shortest_path=False 241 | randomness_factor=0, 242 | num_random_waypoints=0, 243 | ): 244 | 245 | if generate_graph_expansion==None: 246 | generate_graph_expansion=generate 247 | if keyword_1==None or keyword_2==None: 248 | # Randomly pick two distinct nodes 249 | random_nodes = random.sample(list(G.nodes()), 2) 250 | 251 | if keyword_1==None: 252 | keyword_1 = random_nodes[0] 253 | if keyword_2==None: 254 | keyword_2 = random_nodes[1] 255 | 256 | if verbatim: 257 | print("Randomly selected nodes:", keyword_1, "and", keyword_2) 258 | 259 | print(">>> Selected nodes:", keyword_1, "and", keyword_2) 260 | ''' 261 | try: 262 | keyword_1=keyword_1[0] 263 | except: 264 | keyword_1=keyword_1 265 | try: 266 | keyword_2 =keyword_2[0] 267 | except: 268 | keyword_2 =keyword_2 269 | ''' 270 | if shortest_path: 271 | (best_node_1, best_similarity_1, best_node_2, best_similarity_2), path, path_graph, shortest_path_length, fname, graph_GraphML=find_path(G, node_embeddings, 272 | embedding_tokenizer, embedding_model , second_hop=False, 273 | data_dir=data_dir, save_files=False, 274 | keyword_1 = keyword_1, keyword_2 = keyword_2, ) 275 | 276 | else: #random path 277 | print ("Random walk to get path:", keyword_1, "and", keyword_2) 278 | 279 | if randomness_factor>0 or num_random_waypoints>0: 280 | path, path_graph, shortest_path_length, _, _= heuristic_path_with_embeddings_with_randomization_waypoints( 281 | G, 282 | embedding_tokenizer, 283 | embedding_model, 284 | keyword_1, 285 | keyword_2, 286 | node_embeddings, 287 | top_k=5, 288 | #perturbation_factor=0.1, 289 | second_hop=False, 290 | data_dir=data_dir, 291 | verbatim=True, 292 | save_files=False, 293 | 
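# a nonzero randomness_factor perturbs the embedding-guided walk, and
# num_random_waypoints routes it through randomly chosen intermediate nodes,
# yielding more diverse, non-shortest paths between the two concepts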
randomness_factor=randomness_factor, 294 | num_random_waypoints=num_random_waypoints, 295 | ) 296 | 297 | else: 298 | path, path_graph, shortest_path_length, _, _ = heuristic_path_with_embeddings(G, embedding_tokenizer, embedding_model, 299 | keyword_1, keyword_2, 300 | node_embeddings, top_k=top_k, 301 | second_hop=False,data_dir=data_dir, 302 | verbatim=verbatim, 303 | save_files=save_files) 304 | 305 | 306 | print ("Done random walk to get path") 307 | 308 | print("Path:", path) 309 | 310 | path_list_for_vis, path_list_for_vis_string=path_list=print_path_with_edges_as_list(G, path, keywords_separator=' -- ') 311 | print ( path_list_for_vis_string ) 312 | 313 | print ("---------------------------------------------") 314 | 315 | prompt=f"""You are a sophisticated ontologist trained in scientific research, engineering, and innovation. 316 | 317 | Given the following key concepts extracted from a comprehensive knowledge graph, your task is to define each one of the terms and discuss the relationships identified in the graph. 318 | 319 | Consider this list of nodes and relationships from a knowledge graph between "{keyword_1}" and "{keyword_2}". 320 | 321 | The format of the knowledge graph is "node_1 -- relationship between node_1 and node_2 -- node_2 -- relationship between node_2 and node_3 -- node_3...." 322 | 323 | Here is the graph: 324 | 325 | {path_list_for_vis_string} 326 | 327 | Make sure to incorporate EACH of the concepts in the knowledge graph in your response. 328 | 329 | Do not add any introductory phrases. First, define each term in the knowledge graph; second, discuss each of the relationships, with context. """ 330 | 331 | expanded='' 332 | expanded=generate_graph_expansion( system_prompt='You are a creative scientist who provides accurate, detailed and valuable responses.', 333 | prompt=prompt, max_tokens=1024, temperature=.1, ) 334 | 335 | print ("EXPANDED: ", expanded, "\n\n") 336 | 337 | if expanded != "": 338 | expanded = f"Here is an analysis of the concepts and relationships in the graph:\n\n{expanded}\n\n" 339 | 340 | prompt=f"""You are a sophisticated scientist trained in scientific research and innovation. 341 | 342 | Given the following key concepts extracted from a comprehensive knowledge graph, your task is to synthesize a novel research hypothesis. Your response should not only demonstrate deep understanding and rational thinking but also explore imaginative and unconventional applications of these concepts. 343 | 344 | Consider this list of nodes and relationships from a knowledge graph between "{keyword_1}" and "{keyword_2}". \ 345 | The format of the graph is "node_1 -- relationship between node_1 and node_2 -- node_2 -- relationship between node_2 and node_3 -- node_3...." 346 | 347 | Here is the graph: 348 | 349 | {path_list_for_vis_string} 350 | 351 | {expanded}Analyze the graph deeply and carefully, then craft a detailed research hypothesis that investigates a likely groundbreaking aspect that incorporates EACH of these concepts. Consider the implications of your hypothesis and predict the outcome or behavior that might result from this line of investigation. Your creativity in linking these concepts to address unsolved problems or propose new, unexplored areas of study, emergent or unexpected behaviors, will be highly valued. 352 | 353 | Be as quantitative as possible and include details such as numbers, sequences, or chemical formulas.
Please structure your response in JSON format, with SEVEN keys: 354 | 355 | "hypothesis" clearly delineates the hypothesis at the basis of the proposed research question. 356 | 357 | "outcome" describes the expected findings or impact of the research. Be quantitative and include numbers, material properties, sequences, or chemical formulas. 358 | 359 | "mechanisms" provides details about anticipated chemical, biological or physical behaviors. Be as specific as possible, across all scales from molecular to macroscale. 360 | 361 | "design_principles" should list detailed design principles, focused on novel concepts, and include a high level of detail. Be creative and give this a lot of thought, and be exhaustive in your response. 362 | 363 | "unexpected_properties" should predict unexpected properties of the new material or system. Include specific predictions, and explain the rationale behind these clearly using logic and reasoning. Think carefully. 364 | 365 | "comparison" should provide a detailed comparison with other materials, technologies or scientific concepts. Be detailed and quantitative. 366 | 367 | "novelty" should discuss novel aspects of the proposed idea, specifically highlighting how this advances over existing knowledge and technology. 368 | 369 | Ensure your scientific hypothesis is both innovative and grounded in logical reasoning, capable of advancing our understanding or application of the concepts provided. 370 | 371 | Here is an example structure for your response, in JSON format: 372 | 373 | {{ 374 | "hypothesis": "...", 375 | "outcome": "...", 376 | "mechanisms": "...", 377 | "design_principles": "...", 378 | "unexpected_properties": "...", 379 | "comparison": "...", 380 | "novelty": "..." 381 | }} 382 | 383 | Remember, the value of your response lies in its potential for scientific discovery, new avenues of scientific inquiry, and technological breakthroughs, with details and solid reasoning. 384 | 385 | Make sure to incorporate EACH of the concepts in the knowledge graph in your response.
386 | """ 387 | if verbatim: 388 | print ("##############################################") 389 | print (prompt) 390 | print ("##############################################") 391 | 392 | res=generate( system_prompt='You are a creative scientist who provides accurate, detailed and valuable responses, in JSON format.', 393 | prompt=prompt, max_tokens=2048, temperature=.2, ) 394 | 395 | res=convert_response_to_JSON(res) 396 | 397 | if verbatim: 398 | display (Markdown(res) ) 399 | 400 | res_dict=None 401 | try: 402 | res_dict = json.loads(res) 403 | res_dict['path_string'] = path_list_for_vis_string 404 | res_dict['expanded'] = expanded 405 | 406 | except: 407 | print ("Dict generation failed...") 408 | 409 | return res, res_dict, path_list_for_vis_string, json_to_formatted_text(res_dict), (keyword_1, keyword_2) 410 | 411 | def research_generation(G, embedding_tokenizer, 412 | embedding_model, node_embeddings, 413 | generate, 414 | generate_graph_expansion, 415 | randomness_factor, num_random_waypoints,shortest_path, 416 | second_hop, data_dir, save_files, verbatim, 417 | keyword_1 = None, keyword_2=None, 418 | ): 419 | 420 | df_total = pd.DataFrame() 421 | 422 | res, res_data, path_string, formatted_text, (start_node, end_node) = develop_qa_over_path (G=G, 423 | embedding_tokenizer=embedding_tokenizer, 424 | embedding_model=embedding_model, 425 | node_embeddings=node_embeddings, 426 | generate=generate, 427 | generate_graph_expansion=generate_graph_expansion, 428 | randomness_factor=randomness_factor, 429 | num_random_waypoints=num_random_waypoints, 430 | shortest_path=shortest_path, 431 | second_hop=second_hop, data_dir=data_dir, save_files=save_files, verbatim=verbatim, 432 | keyword_1 = keyword_1, keyword_2=keyword_2, 433 | ) 434 | 435 | 436 | print (start_node, "---->", end_node) 437 | 438 | #generate=generate_Anthropic 439 | 440 | expanded_text='' 441 | res_data_expanded={} 442 | #for i, field in tqdm(enumerate (res_data.keys())): 443 | for i, field in tqdm(enumerate (list (res_data.keys())[:7])): 444 | prompt=f'''You are given a new resaerch idea: 445 | 446 | {formatted_text} 447 | 448 | This research idea was developed based on a knowledge graph that describes relationships between two concepts, {start_node} and {end_node}: 449 | 450 | {path_string} 451 | 452 | Now, carefully expand on this particular aspect: ```{field}```. 453 | 454 | Critically assess the original content and improve on it. \ 455 | Add more specifics, quantitive scientific information, if possible, such as chemical formulas, numbers, protein sequences, processing conditions, microstructures, etc. \ 456 | Include a clear rationale and step-by-step reasoning. When possible, comment on specific modeling and simulation techniques and codes, experimental methods, or particular analyses. 457 | 458 | Start by carefully assessing this initial draft from the perspective of a peer-reviewer whose task it is to critically assess and improve the science: 459 | 460 | {res_data[field]} 461 | 462 | Do not add any introductory phrases. Your response begins with your response, with a heading: ### Expanded ... 
463 | ''' 464 | res=generate( system_prompt='You are a creative scientist who provides accurate, detailed and valuable responses.', 465 | prompt=prompt, max_tokens=2048, temperature=.2, ) 466 | 467 | display (Markdown(res [:256])) 468 | 469 | res_data_expanded[field]=res 470 | # expanded_text = expanded_text+f'\n\n## Expanded field {i+1}: {field}\n\n'+res 471 | expanded_text = expanded_text+f'\n\n'+res 472 | print ('---------------------------------------------') 473 | 474 | complete=f"# Research concept between '{start_node}' and '{end_node}'\n\n### KNOWLEDGE GRAPH:\n\n{res_data['path_string']}\n\n"+f"### EXPANDED GRAPH:\n\n{res_data['expanded']}"+f"### PROPOSED RESEARCH/MATERIAL:\n\n{formatted_text}"+f'\n\n### EXPANDED DESCRIPTIONS:\n\n'+expanded_text 475 | 476 | #display (complete) 477 | #generate=generate_Anthropic 478 | 479 | prompt=f'Read this document:\n\n{complete}\n\nProvide (1) a summary of the document (in one paragraph, but including sufficient detail such as mechanisms, \ 480 | related technologies, models and experiments, methods to be used, and so on), \ 481 | and (2) a thorough critical scientific review with strengths and weaknesses, and suggested improvements. Include logical reasoning and scientific approaches.' 482 | critiques=generate( system_prompt='You are a critical scientist who provides accurate, detailed and valuable responses.', 483 | prompt=prompt, max_tokens=2048, temperature=.1, ) 484 | 485 | res_data['critiques'] = critiques 486 | res_data['res_data_expanded'] = res_data_expanded 487 | 488 | #display(Markdown(critiques)) 489 | complete_doc=complete+ f'\n\n## SUMMARY, CRITICAL REVIEW AND IMPROVEMENTS:\n\n'+critiques 490 | 491 | 492 | #generate=generate_Anthropic 493 | prompt=f'Read this document:\n\n{complete_doc}\n\nFrom within this document, identify the single most impactful scientific question that can be tackled with molecular modeling. \ 494 | \n\nOutline key steps to set up and conduct such modeling and simulation, with details and include unique aspects of the planned work.' 495 | modeling_priority=generate( system_prompt='You are a scientist who provides accurate, detailed and valuable responses.', 496 | prompt=prompt, max_tokens=2048, temperature=.1, ) 497 | prompt=f'Read this document:\n\n{complete_doc}\n\nFrom within this document, identify the single most impactful scientific question that can be tackled with synthetic biology. \ 498 | \n\nOutline key steps to set up and conduct such experimental work, with details and include unique aspects of the planned work.' 499 | synbio_priority=generate( system_prompt='You are a scientist who provides accurate, detailed and valuable responses.', 500 | prompt=prompt, max_tokens=2048, temperature=.1, ) 501 | display (Markdown(modeling_priority)) 502 | display (Markdown(synbio_priority)) 503 | 504 | complete_doc=complete_doc+ f'\n\n## MODELING AND SIMULATION PRIORITIES:\n\n'+modeling_priority 505 | complete_doc=complete_doc+ f'\n\n## SYNTHETIC BIOLOGY EXPERIMENTAL PRIORITIES:\n\n'+synbio_priority 506 | 507 | res_data['modeling_priority'] = modeling_priority 508 | res_data['synbio_priority'] = synbio_priority 509 | 510 | output_pdf_path = f"{data_dir}/output_" 511 | fname=markdown_to_pdf(complete_doc, output_pdf_path) 512 | 513 | df = pd.DataFrame([res_data]) 514 | df_total = pd.concat([df_total, df], ignore_index=True) 515 | #df_total.to_csv(fname) 516 | df_total.to_csv(fname[:-4]+'.csv') 517 | 518 | return None --------------------------------------------------------------------------------
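For orientation, here is a minimal usage sketch of the pipeline above. It is illustrative only: it assumes the module-level G, embedding_tokenizer, embedding_model, and node_embeddings exported by ScienceDiscovery/graph.py, and a hypothetical my_generate wrapper matching the generate(system_prompt, prompt, max_tokens, temperature) signature used throughout utils.py:

from ScienceDiscovery.graph import G, embedding_tokenizer, embedding_model, node_embeddings
from ScienceDiscovery.utils import research_generation

def my_generate(system_prompt, prompt, max_tokens=1024, temperature=0.1):
    # hypothetical LLM wrapper: route the prompts to a model of your choice
    # and return its text completion as a plain string
    raise NotImplementedError

research_generation(G=G,
                    embedding_tokenizer=embedding_tokenizer,
                    embedding_model=embedding_model,
                    node_embeddings=node_embeddings,
                    generate=my_generate,
                    generate_graph_expansion=my_generate,
                    randomness_factor=0.2, num_random_waypoints=4,
                    shortest_path=False, second_hop=False,
                    data_dir='./', save_files=False, verbatim=True,
                    keyword_1=None, keyword_2=None)  # None: nodes are chosen at random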