├── design.jpeg
├── dirTree.jpg
├── working.jpg
├── requirements.txt
├── .gitignore
├── src
│   ├── data
│   │   ├── ContractOpportunities-20231031-045845.csv
│   │   └── AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv
│   ├── app
│   │   ├── config.yml
│   │   ├── utils.py
│   │   ├── llm.py
│   │   ├── main.py
│   │   └── llm_prompter.py
│   ├── datalayer
│   │   ├── datasources.yml
│   │   ├── datapreprocessor.py
│   │   ├── Neo4jDumper.py
│   │   └── KnowledgeGraph.py
│   ├── services
│   │   ├── schema.yml
│   │   ├── cypher_qa.py
│   │   ├── Identity_retrival_for_csv.py
│   │   └── Identity_retrival_for_html.py
│   ├── UI
│   │   └── ui.py
│   └── components
│       └── base_component.py
├── Dockerfile
├── tree.txt
└── README.md

/design.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/design.jpeg
--------------------------------------------------------------------------------
/dirTree.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/dirTree.jpg
--------------------------------------------------------------------------------
/working.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/working.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/requirements.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
Model
.idea
.env
**__pycache__**
*.pyc
*/data/*
*/logs/*
--------------------------------------------------------------------------------
/src/data/ContractOpportunities-20231031-045845.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/src/data/ContractOpportunities-20231031-045845.csv
--------------------------------------------------------------------------------
/src/data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/src/data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv
--------------------------------------------------------------------------------
/src/app/config.yml:
--------------------------------------------------------------------------------
neo4j:
  uri: neo4j+s://813d1613.databases.neo4j.io
  username: neo4j
  password: 5yRHnFRIdfbz5FRql8GmmmlLj_TLcvFGeE5KzL4QPuc
port: 8000
host: 0.0.0.0
--------------------------------------------------------------------------------
/src/datalayer/datasources.yml:
--------------------------------------------------------------------------------
# add more urls
link:
  - https://www.worldbank.org/en/projects-operations/procurement?srce=both
  - https://projects.worldbank.org/en/projects-operations/projects-list?os=0
csv:
  - data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv
  - data/ContractOpportunities-20231031-045845.csv
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Use shared Python base image
FROM python:3.9

RUN pip install --upgrade pip
RUN pip install --upgrade setuptools wheel

COPY ./requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt

# Copy the project files into the container
COPY ./src /src

# Create the logs directory the application writes to (logging uses logs/logs.txt relative to /src)
RUN mkdir -p /src/logs

# Expose any necessary ports
EXPOSE 8000
EXPOSE 8501

# Set the working directory
WORKDIR /src

# Start the application
CMD ["sh", "-c", "streamlit run UI/ui.py & uvicorn app.main:app --host 0.0.0.0 --port 8000"]
--------------------------------------------------------------------------------
/src/datalayer/datapreprocessor.py:
--------------------------------------------------------------------------------
from app import utils
from components.base_component import BaseComponent

pre_defined_sources = ('link', 'csv')


class DataPreprocessor(BaseComponent):
    def __init__(self, datasource):
        super().__init__('DataPreprocessor')
        self.sources = utils.read_yaml_file(datasource)
        self.csv_sources = self.sources.get('csv', [])
        self.html_sources = self.sources.get('link', [])

    def get_csv_sources(self):
        return self.csv_sources

    def get_html_sources(self):
        return self.html_sources

    def run(self, **kwargs):
        pass
--------------------------------------------------------------------------------
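A minimal usage sketch for the preprocessor above (illustrative, not a file in the repo; it assumes the process runs from the src directory, as the Dockerfile's WORKDIR does, so the manifest path resolves):

```python
from datalayer.datapreprocessor import DataPreprocessor

# Point the preprocessor at the YAML manifest of sources
pre = DataPreprocessor(datasource='datalayer/datasources.yml')
print(pre.get_csv_sources())   # e.g. ['data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv', ...]
print(pre.get_html_sources())  # e.g. ['https://www.worldbank.org/en/projects-operations/procurement?srce=both', ...]
```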
{e}") 23 | return {} 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/services/schema.yml: -------------------------------------------------------------------------------- 1 | properties: 2 | Organizations: 3 | type: string 4 | Project_Name: 5 | type: string 6 | location: 7 | type: string 8 | date: 9 | type: string 10 | financial_Value: 11 | type: string 12 | key_personals/POC: 13 | type: string 14 | Industry_type: 15 | type: string 16 | project_status: 17 | type: string 18 | Sustainability and Environmental Factors: 19 | type: string 20 | Collaboration and Partnerships: 21 | type: string 22 | Risk Factors: 23 | type: string 24 | Vendor and Supplier Information: 25 | type: string 26 | brief: 27 | type: string 28 | required: 29 | - Organizations 30 | - Project_Name 31 | - location 32 | - date 33 | - financial_Value 34 | - project_status -------------------------------------------------------------------------------- /tree.txt: -------------------------------------------------------------------------------- 1 | |-- Dockerfile 2 | |-- README.md 3 | |-- requirements.txt 4 | |-- src 5 | |-- |-- Model 6 | |-- |-- |-- llama-2-7b 7 | |-- |-- |-- checklist.chk 8 | |-- |-- |-- consolidated.00.pth 9 | |-- |-- |-- params.json 10 | |-- |-- UI 11 | |-- |-- |-- ui.py 12 | |-- |-- app 13 | |-- |-- |-- config.yml 14 | |-- |-- |-- llm.py 15 | |-- |-- |-- llm_prompter.py 16 | |-- |-- |-- main.py 17 | |-- |-- |-- utils.py 18 | |-- |-- components 19 | |-- |-- |-- base_component.py 20 | |-- |-- data 21 | |-- |-- |-- AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv 22 | |-- |-- |-- ContractOpportunities-20231031-045845.csv 23 | |-- |-- datalayer 24 | |-- |-- |-- KnowledgeGraph.py 25 | |-- |-- |-- Neo4jDumper.py 26 | |-- |-- |-- datapreprocessor.py 27 | |-- |-- |-- datasources.yml 28 | |-- |-- logs 29 | |-- |-- |-- logs.txt 30 | |-- |-- services 31 | |-- |-- Identity_retrival_for_csv.py 32 | |-- |-- Identity_retrival_for_html.py 33 | |-- |-- cypher_qa.py 34 | |-- |-- schema.yml 35 | -------------------------------------------------------------------------------- /src/services/cypher_qa.py: -------------------------------------------------------------------------------- 1 | from langchain.chains import GraphCypherQAChain 2 | from app.llm import Llm 3 | from components.base_component import BaseComponent 4 | from datalayer.Neo4jDumper import Neo4jDumper 5 | 6 | 7 | class CypherQa(BaseComponent): 8 | def __init__(self, model_name): 9 | super().__init__('cypher_qa') 10 | # instantiating the openai llm model and neo4j connection 11 | self.neo4j_instance = Neo4jDumper(config_path='app/config.yml') 12 | self.open_ai_llm = Llm(model=model_name) 13 | # schema = utils.read_yaml_file('services/schemaN.yml') 14 | # graph_schema = construct_schema(schema,[],[]) 15 | print(self.neo4j_instance.graph.schema) 16 | self.cypher_chain = GraphCypherQAChain.from_llm( 17 | cypher_llm=self.open_ai_llm.llm, 18 | qa_llm=self.open_ai_llm.llm, 19 | graph=self.neo4j_instance.graph, 20 | # validate_cypher=True, # Validate relationship directions 21 | verbose=True, 22 | ) 23 | 24 | def run(self, text): 25 | return self.cypher_chain.run(text) 26 | -------------------------------------------------------------------------------- /src/UI/ui.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import streamlit as st 3 | 4 | st.title("LLM-WebToGraph") 5 | st.text( 6 | 'This project using langchain and OpenAI LLM to transform data 
/src/UI/ui.py:
--------------------------------------------------------------------------------
import requests
import streamlit as st

st.title("LLM-WebToGraph")
st.text(
    'This project uses LangChain and OpenAI LLMs to transform data from different sources (web links/CSV) into a knowledge graph and store it in a Neo4j DB.')
st.write('Press a button below to process the data and generate the knowledge graph.')

if st.button("Process CSV files and generate knowledge graph"):
    # Send the request to FastAPI
    fastapi_url = "http://localhost:8000/generate_tags_from_csv"
    response = requests.get(fastapi_url)
    if response.status_code == 200:
        st.write(f"{response.text}")
    else:
        st.error(f"Error: {response.status_code}")

if st.button("Process HTML links and generate knowledge graph"):
    # Send the request to FastAPI
    fastapi_url = "http://localhost:8000/generate_tags_from_html"
    response = requests.get(fastapi_url)
    if response.status_code == 200:
        st.write(f"{response.text}")
    else:
        st.error(f"Error: {response.status_code}")

user_input = st.text_input("Ask any question about the data")
if st.button('Submit'):
    # Send user_input to FastAPI
    fastapi_url = f"http://localhost:8000/query_graph/{user_input}"
    response = requests.get(fastapi_url)
    if response.status_code == 200:
        st.write(f"{response.text}")
    else:
        st.error(f"Error: {response.status_code}")
--------------------------------------------------------------------------------
/src/components/base_component.py:
--------------------------------------------------------------------------------
import logging
from abc import ABC, abstractmethod
from functools import wraps
from typing import List, Union


def log_errors(run):
    """Route uncaught exceptions from a component method to the component's logger."""

    @wraps(run)
    def wrapper(self, *args, **kwargs):
        try:
            return run(self, *args, **kwargs)
        except Exception as e:
            self.log_error(f"An error occurred: {str(e)}", exception=e)

    return wrapper


class BaseComponent(ABC):
    """Base class for all components, providing logging and a uniform run interface."""

    def __init__(self, logger_name):
        self.logger = self._configure_logger(logger_name)

    @log_errors
    @abstractmethod
    def run(
            self,
            input: Union[str, List[float]],
    ) -> str:
        """Process the given input and return the result."""

    @log_errors
    async def run_async(
            self,
            input: Union[str, List[float]],
    ) -> str:
        """Asynchronous variant of run."""

    @staticmethod
    def _configure_logger(component_name):
        logger = logging.getLogger(component_name)
        logger.setLevel(logging.INFO)

        # Avoid attaching duplicate handlers when the same component is instantiated twice
        if not logger.handlers:
            # Create a file handler for logging to a file
            file_handler = logging.FileHandler('logs/logs.txt')

            # Create a formatter
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            file_handler.setFormatter(formatter)

            # Add the file handler to the logger
            logger.addHandler(file_handler)
        return logger

    def log_error(self, error_message, exception=None):
        # Centralized error handling
        self.logger.error(error_message)
        if exception:
            self.logger.error(f"Exception: {str(exception)}")
        # More error handling logic, such as alerts or notifications, can be added here
--------------------------------------------------------------------------------
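A small illustrative sketch of how a concrete component plugs into this base class (EchoComponent is a hypothetical name, not part of the repo; it assumes a logs/ directory exists, as the Dockerfile creates):

```python
from components.base_component import BaseComponent


class EchoComponent(BaseComponent):
    """Hypothetical component that logs and echoes its input."""

    def __init__(self):
        super().__init__('EchoComponent')

    def run(self, input):
        self.logger.info(f'received: {input}')
        return str(input)
```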
/src/app/llm.py:
--------------------------------------------------------------------------------
import os

import backoff
import openai  # for OpenAI API calls
from dotenv import load_dotenv
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

from app import utils
from components.base_component import BaseComponent

load_dotenv()


def get_schema():
    """
    The get_schema function reads the schema.yml file and returns a dictionary of the schema.

    :return: The schema
    :doc-author: Trelent
    """
    schema = utils.read_yaml_file('services/schema.yml')
    return schema


class Llm(BaseComponent):

    def __init__(self, model: str):
        super().__init__('Llm')
        self.model = model
        # for huggingface hub models
        # self.llm = HuggingFaceHub(repo_id='ValiantLabs/ShiningValiant', task='text-generation',
        #                           huggingfacehub_api_token=os.getenv('HF_AUTH_TOKEN'),
        #                           model_kwargs={"temperature": 0, "max_length": 64})
        self.llm = ChatOpenAI(temperature=0, model_name=model, openai_api_key=os.getenv('OPENAI_API_KEY'))

    @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def run(self, input_text):
        """
        The run function is the main entry point for this component. It extracts entities matching
        the schema from the given text using an extraction chain.

        :param self: Represent the instance of the class
        :param input_text: The text to extract entities from
        :return: The extraction chain's response, keyed by the slot names defined in the schema
        """
        schema = get_schema()
        self.logger.info(f'schema: {schema}')
        chain = create_extraction_chain(schema, self.llm)
        llm_response = chain.run(input_text)
        self.logger.info(f'llm_response: {llm_response}')
        return llm_response
--------------------------------------------------------------------------------
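A hedged sketch of calling this wrapper directly (illustrative, not a file in the repo; the input text is made up, OPENAI_API_KEY must be set in the environment or .env, and services/schema.yml must resolve from the working directory):

```python
from app.llm import Llm

llm = Llm(model='gpt-3.5-turbo')
# Returns entities matching services/schema.yml, e.g. Organizations, Project_Name, location
entities = llm.run(input_text="Acme Corp won a $5M road-rehabilitation contract in Kenya in 2023.")
print(entities)
```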
/src/datalayer/Neo4jDumper.py:
--------------------------------------------------------------------------------
from langchain.graphs import Neo4jGraph
from langchain.graphs.graph_document import GraphDocument
from neo4j import GraphDatabase

from app import utils
from components.base_component import BaseComponent
from datalayer.KnowledgeGraph import map_to_base_node, map_to_base_relationship


class Neo4jDumper(BaseComponent):
    def __init__(self, config_path):
        super().__init__('Neo4jDumper')
        config = utils.read_yaml_file(config_path)
        self.uri = config.get('neo4j').get('uri')
        self.username = config.get('neo4j').get('username')
        self.password = config.get('neo4j').get('password')
        self.graph = Neo4jGraph(
            url=self.uri, username=self.username, password=self.password
        )

    def dump_data(self, tx, data):
        for key, value in data.items():
            # Create a node for each key-value pair
            tx.run(query="CREATE (n:Node {key: $key, value: $value})", key=key, value=value)
            self.logger.info(f"Dumped data for {key}: {value} to neo4j")

    def run(self, data):
        try:
            # The context managers close the session and the driver automatically
            with GraphDatabase.driver(self.uri, auth=(self.username, self.password)) as driver:
                with driver.session() as session:
                    self.dump_data(session, data)
                    self.logger.info("Neo4j database connected and data dumped successfully.")
        except Exception as e:
            self.logger.error(f"Error while connecting to neo4j: {str(e)}")

    # New implementation using graph document
    def run2(self, data, document):
        try:
            graph = Neo4jGraph(
                url=self.uri, username=self.username, password=self.password
            )
            # Construct a graph document
            graph_document = GraphDocument(
                nodes=[map_to_base_node(node) for node in data.nodes],
                relationships=[map_to_base_relationship(rel) for rel in data.rels],
                source=document
            )
            # Store information into a graph
            graph.add_graph_documents([graph_document])

        except Exception as e:
            self.logger.error(f"Error while connecting to neo4j: {str(e)}")
--------------------------------------------------------------------------------
/src/datalayer/KnowledgeGraph.py:
--------------------------------------------------------------------------------
from typing import List, Optional

from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship
)
from langchain.pydantic_v1 import Field, BaseModel


class Property(BaseModel):
    """A single property consisting of key and value"""
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")


class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")


class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )


class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )


def format_property_key(s: str) -> str:
    """Convert a space-separated key like 'financial value' to camelCase ('financialValue')."""
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)


def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties


def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )
--------------------------------------------------------------------------------
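A quick illustrative use of the mapping helpers above (the node values are made up):

```python
from datalayer.KnowledgeGraph import Node, Property, map_to_base_node

node = Node(id="acme corp", type="organization",
            properties=[Property(key="financial value", value="$5M")])
base = map_to_base_node(node)
# base.id == "Acme Corp"; base.properties == {"financialValue": "$5M", "name": "Acme Corp"}
```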
/src/services/Identity_retrival_for_csv.py:
--------------------------------------------------------------------------------
from langchain.document_loaders.csv_loader import CSVLoader

from app import utils
from app.llm import Llm
from components.base_component import BaseComponent
from datalayer.Neo4jDumper import Neo4jDumper


class NameIdentityRetrievalForCsv(BaseComponent):
    def __init__(self, model_name, data_path):
        """
        The __init__ function is called when the class is instantiated.
        It sets up the instance of the class and defines all its attributes.

        :param self: Represent the instance of the class
        :param model_name: Instantiate the OpenAI LLM model
        :param data_path: Path to the YAML file which lists all CSV files
        :return: The instance of the class
        """
        super().__init__('NameIdentityRetrievalForCsv')
        self.sources = utils.read_yaml_file(data_path)
        self.csv_sources = self.sources.get('csv', [])
        # instantiating the openai llm model and neo4j connection
        self.neo4j_instance = Neo4jDumper(config_path='app/config.yml')
        self.open_ai_llm = Llm(model=model_name)

    def run(self, **kwargs):
        """
        The run function is the main function of this module. It takes a list of CSV files and extracts
        a knowledge graph from them using the OpenAI API. The knowledge graph is then dumped into the
        Neo4j database.

        :param self: Represent the instance of the class
        :return: None; results are written to Neo4j
        """
        for csvfile in self.csv_sources:
            # loading the csv using langchain document loader for csv
            loader = CSVLoader(file_path=csvfile)
            data = loader.load()

            # setting up openai model and extracting knowledge graph
            self.logger.info(f'loading model {self.open_ai_llm}')
            # Only the last row of the CSV is sent because the free-tier OpenAI API has a tight token
            # limit. A model with a larger context window (e.g. Anthropic's Claude 2) or a paid
            # OpenAI API key should be used instead.
            # response = self.open_ai_llm.extract_and_store_graph(document=data[-1])
            response = self.open_ai_llm.run(input_text=data[-1])
            # dumping the knowledge graph into Neo4j
            self.neo4j_instance.run(data=response)
            self.logger.info(f'knowledge graph populated successfully for data source: {csvfile}')
--------------------------------------------------------------------------------
/src/services/Identity_retrival_for_html.py:
--------------------------------------------------------------------------------
from typing import Union, List

from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer

from app import utils
from app.llm import Llm
from components.base_component import BaseComponent
from datalayer.Neo4jDumper import Neo4jDumper


class NameIdentityRetrievalForHtml(BaseComponent):
    def __init__(self, model_name, data_path):
        """
        The __init__ function is called when the class is instantiated.
        It sets up the initial values of all attributes for an instance of the class.

        :param self: Represent the instance of the class
        :param model_name: Specify the model name to use for extraction
        :param data_path: Path to the YAML file which lists the links to be scraped
        :return: Nothing
        """
        super().__init__('NameIdentityRetrievalForHtml')
        self.sources = utils.read_yaml_file(data_path)
        self.html_sources = self.sources.get('link', [])
        # instantiating the openai llm model and neo4j connection
        self.neo4j_instance = Neo4jDumper(config_path='app/config.yml')
        self.open_ai_llm = Llm(model=model_name)

    def run_async(self, **kwargs):
        """
        The run_async function runs the pipeline over the HTML sources.
        It takes the list of HTML sources and extracts a knowledge graph from them using the OpenAI API.
        The extracted knowledge graph is then dumped into the Neo4j database.

        :param self: Represent the instance of the object itself
        :param **kwargs: Pass a variable number of keyword arguments to a function
        :return: None; results are written to Neo4j
        """
        for link in self.html_sources:
            loader = AsyncHtmlLoader(link)
            html = loader.load()
            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["table"])
            self.logger.info(docs_transformed[0].page_content[0:500])

            # setting up openai model and extracting knowledge graph
            self.logger.info(f'loading model {self.open_ai_llm}')

            # Only the tail of the scraped content is sent because the free-tier OpenAI API has a tight
            # token limit. A model with a larger context window (e.g. Anthropic's Claude 2) or a paid
            # OpenAI API key should be used instead.
            # response = self.open_ai_llm.extract_and_store_graph(document=docs_transformed[0])
            tokens_cap = len(docs_transformed[0].page_content) - 4
            response = self.open_ai_llm.run(input_text=docs_transformed[0].page_content[tokens_cap:])
            # dumping the knowledge graph into Neo4j
            self.neo4j_instance.run(data=response)
            self.logger.info(f'knowledge graph populated successfully for data source: {link}')

    def run(self, input: Union[str, List[float]]) -> str:
        pass
--------------------------------------------------------------------------------
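A standalone, hedged sketch of the scrape-and-transform step used above (illustrative, not a file in the repo; it assumes network access, and the URL is taken from datasources.yml):

```python
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Fetch the page and keep only the <table> markup, as the service does
docs = AsyncHtmlLoader(["https://www.worldbank.org/en/projects-operations/procurement?srce=both"]).load()
tables = BeautifulSoupTransformer().transform_documents(docs, tags_to_extract=["table"])
print(tables[0].page_content[:500])
```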
/src/app/main.py:
--------------------------------------------------------------------------------
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse

from app import utils
from services.Identity_retrival_for_csv import NameIdentityRetrievalForCsv
from services.Identity_retrival_for_html import NameIdentityRetrievalForHtml
from services.cypher_qa import CypherQa

app = FastAPI(
    title="LLM-WebToGraph",
    description="""This project uses LangChain and OpenAI LLMs to transform data from different sources
    (web links/CSV) into a knowledge graph and stores it in a Neo4j DB.""",
    version="0.1.0",
)


@app.get("/query_graph/{question}")
def query_graph(question: str):
    """
    The query_graph function takes a question as input and returns the answer to that question.
    It uses the CypherQa class from services.cypher_qa, which wraps LangChain's GraphCypherQAChain:
    the LLM (gpt-3.5-turbo) translates the question into a Cypher query, runs it against the graph,
    and phrases the result.

    :param question: str: Pass the question to the function
    :return: An HTMLResponse object containing the answer
    """
    graph_cypher_qachain = CypherQa(model_name='gpt-3.5-turbo')
    response = graph_cypher_qachain.run(question)
    return HTMLResponse(content=response, status_code=200)


@app.get("/generate_tags_from_html")
async def generate_tags_from_html():
    """
    The generate_tags_from_html function is a ReST endpoint that generates the tags for the HTML data sources.
    It can be called by an external service, such as Jenkins or Travis CI, to ensure that the tags are up to date.
    The function returns a 200 status code if successful and 500 otherwise.

    :return: An HTMLResponse object with the content 'Successfully generated the knowledge from the data sources!!!' and status_code 200
    """
    ner = NameIdentityRetrievalForHtml(model_name='gpt-3.5-turbo', data_path='datalayer/datasources.yml')
    ner.run_async()  # asynchronous call since html pages can take time to load and scrape
    return HTMLResponse(content='Successfully generated the knowledge from the data sources!!!', status_code=200)


@app.get("/generate_tags_from_csv")
def generate_tags_from_csv():
    """
    The generate_tags_from_csv function is a ReST endpoint that generates the tags for each of the CSV data sources.
    It uses the NameIdentityRetrievalForCsv class to accomplish this task.
    The model_name and data_path are passed as parameters to that class.

    :return: An HTMLResponse object with the status code 200
    """
    ner = NameIdentityRetrievalForCsv(model_name='gpt-3.5-turbo', data_path='datalayer/datasources.yml')
    ner.run()
    return HTMLResponse(content='Successfully generated the knowledge from the data sources!!!', status_code=200)


# health check route
@app.get("/health")
def health_check():
    return {"status": "healthy"}


if __name__ == '__main__':
    app_config = utils.read_yaml_file('app/config.yml')
    load_dotenv()
    uvicorn.run(app, port=app_config.get('port'), host=app_config.get('host'))
--------------------------------------------------------------------------------
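A few illustrative calls against these endpoints, mirroring what the Streamlit UI does (assuming the API is running locally on port 8000; the question text is made up):

```python
import requests

BASE = "http://localhost:8000"
print(requests.get(f"{BASE}/health").json())                # {'status': 'healthy'}
print(requests.get(f"{BASE}/generate_tags_from_csv").text)  # populates the graph from the CSVs
print(requests.get(f"{BASE}/query_graph/Which organizations are listed?").text)
```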
/README.md:
--------------------------------------------------------------------------------
# LLM-WebToGraph

LLM-WebToGraph is a powerful project that harnesses the capabilities of LangChain and OpenAI's Language Models (LLMs) to scrape data from various sources on the web, transforming it into a structured knowledge graph. This knowledge graph is then populated into a Neo4j Aura Database, providing an efficient way to store, query, and retrieve information using Cypher queries and LLMs. With the synergy of LangChain, OpenAI LLMs, and Neo4j, this project offers a robust solution for knowledge management and retrieval.

## Architecture
![design](https://github.com/prvnsingh/LLM-WebToGraph/blob/main/design.jpeg?raw=true)


## Overview

The LLM-WebToGraph project combines several key components to achieve its goal:

1. **LangChain:** A framework for building applications around language models, powering the core of the project.

2. **OpenAI's Language Models (LLMs):** These models are used to extract and process data from various sources, converting unstructured data into structured knowledge.

3. **Neo4j Aura Database:** The project stores the structured knowledge graph in a Neo4j Aura Database, allowing for efficient storage and retrieval.

4. **FastAPI:** To expose an API for interacting with the project and to check its health status.

5. **Streamlit:** For building a user-friendly interface to query and visualize the knowledge graph.

## Features

- Web scraping from various sources, such as web links and CSV files.
- Data transformation and extraction using an OpenAI LLM (gpt-3.5-turbo).
- Population of a structured knowledge graph in a Neo4j Aura Database.
- FastAPI-based health check API to monitor the application's status.
- Streamlit web application for querying and visualizing the knowledge graph.

## Getting Started
1. Configure the data sources
   - Update the .csv data files in the data directory.
   - Update the HTML links in datasources.yml.
2. Set up environment variables
   - Add credentials such as the OpenAI API key and the Neo4j DB password to a .env file, or set them as environment variables.
3. Configure schema.yml for identities and relationships
   - Modify schema.yml to specify the identities to be recognized.
4. Run the Streamlit UI and FastAPI app
   - Build the Docker image and run it with the env file:
~~~sh
sudo docker build -t image_name .
sudo docker run --env-file .env -p 8501:8501 -p 8000:8000 image_name
~~~
To access the application
~~~html
http://localhost:8501/
~~~

To check backend APIs, access the swagger at
```html
http://localhost:8000/docs
```
## Working directory
![Directory Tree](https://github.com/prvnsingh/LLM-WebToGraph/blob/main/dirTree.jpg?raw=true)

## Demo snapshot
![Demo snapshot](https://github.com/prvnsingh/LLM-WebToGraph/blob/main/working.jpg?raw=true)

## Contributing

Contributions to the LLM-WebToGraph project are welcome! If you'd like to contribute, please follow these guidelines:

- Fork the repository.
- Create a new branch for your feature or bug fix.
- Make your changes and ensure tests pass.
- Submit a pull request.

## Future Scope
In the future, the project can be extended with a microservices architecture, including:

- A separate data service responsible for ingesting data from S3.
- Utilization of a Selenium bot to scrape the web and download CSV files.
- Integration with more data sources for enhanced knowledge graph creation.

## References
- [Langchain Graph Transformer Documentation](https://python.langchain.com/docs/use_cases/graph/diffbot_graphtransformer)
- [Langchain Cypher Query Documentation](https://python.langchain.com/docs/use_cases/graph/graph_cypher_qa)
- [Blog Post: Constructing Knowledge Graphs from Text](https://blog.langchain.dev/constructing-knowledge-graphs-from-text-using-openai-functions/)

## License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

## Contact

For questions or support, feel free to contact us at [prvns1997@gmail.com](mailto:prvns1997@gmail.com).
--------------------------------------------------------------------------------
/src/app/llm_prompter.py:
--------------------------------------------------------------------------------
import os
from typing import List, Optional

import backoff
import openai  # for OpenAI API calls
from dotenv import load_dotenv
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

from app import utils
from components.base_component import BaseComponent
from datalayer.KnowledgeGraph import KnowledgeGraph

load_dotenv()


def get_schema():
    """
    The get_schema function reads the schema.yml file and returns a dictionary of the schema.

    :return: The schema
    :doc-author: Trelent
    """
    schema = utils.read_yaml_file('services/schema.yml')
    return schema


class LlmPrompter(BaseComponent):

    def __init__(self, model: str):
        super().__init__('LlmPrompter')

        self.model = model
        # for huggingface hub models
        # self.llm = HuggingFaceHub(repo_id='ValiantLabs/ShiningValiant', task='text-generation',
        #                           huggingfacehub_api_token=os.getenv('HF_AUTH_TOKEN'),
        #                           model_kwargs={"temperature": 0, "max_length": 64})
        self.llm = ChatOpenAI(temperature=0, model_name=model, openai_api_key=os.getenv('OPENAI_API_KEY'))

    @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def run(self, document: Document,
            nodes: Optional[List[str]] = None,
            rels: Optional[List[str]] = None):
        return self.extract_and_store_graph(document, nodes, rels)

    def get_extraction_chain(self,
                             allowed_nodes: Optional[List[str]] = None,
                             allowed_rels: Optional[List[str]] = None
                             ):
        prompt = ChatPromptTemplate.from_messages(
            [(
                "system",
                f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to nodes for the largest infrastructure projects.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
"""),
                ("human", "Use the given format to extract information from the following input: {input}"),
                ("human", "Tip: Make sure to answer in the correct format"),
            ])
        return create_structured_output_chain(KnowledgeGraph, self.llm, prompt, verbose=False)

    def extract_and_store_graph(self,
                                document: Document,
                                nodes: Optional[List[str]] = None,
                                rels: Optional[List[str]] = None):
        # Extract graph data using OpenAI functions
        extract_chain = self.get_extraction_chain(nodes, rels)
        data = extract_chain.run(document.page_content)
        return data
--------------------------------------------------------------------------------
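Finally, a hedged sketch of driving the prompter end to end (illustrative, not a file in the repo; the document text is made up and OPENAI_API_KEY is assumed to be set):

```python
from langchain.schema import Document

from app.llm_prompter import LlmPrompter

prompter = LlmPrompter(model='gpt-3.5-turbo')
# Returns a KnowledgeGraph (nodes + rels), which Neo4jDumper.run2 can then persist
graph = prompter.run(Document(page_content="Acme Corp is building a $2B dam in Laos with HydroBuild Ltd."))
print(graph.nodes, graph.rels)
```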