├── design.jpeg
├── dirTree.jpg
├── working.jpg
├── requirements.txt
├── .gitignore
├── src
│   ├── data
│   │   ├── ContractOpportunities-20231031-045845.csv
│   │   └── AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv
│   ├── app
│   │   ├── config.yml
│   │   ├── utils.py
│   │   ├── llm.py
│   │   ├── main.py
│   │   └── llm_prompter.py
│   ├── datalayer
│   │   ├── datasources.yml
│   │   ├── datapreprocessor.py
│   │   ├── Neo4jDumper.py
│   │   └── KnowledgeGraph.py
│   ├── services
│   │   ├── schema.yml
│   │   ├── cypher_qa.py
│   │   ├── Identity_retrival_for_csv.py
│   │   └── Identity_retrival_for_html.py
│   ├── UI
│   │   └── ui.py
│   └── components
│       └── base_component.py
├── Dockerfile
├── tree.txt
└── README.md

/design.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/design.jpeg
--------------------------------------------------------------------------------
/dirTree.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/dirTree.jpg
--------------------------------------------------------------------------------
/working.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/working.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/requirements.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
Model
.idea
.env
**__pycache__**
*.pyc
*/data/*
*/logs/*
--------------------------------------------------------------------------------
/src/data/ContractOpportunities-20231031-045845.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/src/data/ContractOpportunities-20231031-045845.csv
--------------------------------------------------------------------------------
/src/data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prvnsingh/LLM-WebToGraph/HEAD/src/data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv
--------------------------------------------------------------------------------
/src/app/config.yml:
--------------------------------------------------------------------------------
neo4j:
  uri: neo4j+s://813d1613.databases.neo4j.io
  username: neo4j
  password: 5yRHnFRIdfbz5FRql8GmmmlLj_TLcvFGeE5KzL4QPuc
port: 8000
host: 0.0.0.0
--------------------------------------------------------------------------------
/src/datalayer/datasources.yml:
--------------------------------------------------------------------------------
# add more urls
link:
  - https://www.worldbank.org/en/projects-operations/procurement?srce=both
  - https://projects.worldbank.org/en/projects-operations/projects-list?os=0
csv:
  - data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv
  - data/ContractOpportunities-20231031-045845.csv
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Use shared Python base image
FROM python:3.9

RUN pip install --upgrade pip
RUN pip install --upgrade setuptools wheel

COPY ./requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt

# Copy the project files into the container
COPY ./src /src

# Create the logs directory the application writes to (logging uses logs/logs.txt relative to /src)
RUN mkdir -p /src/logs

# Expose any necessary ports
EXPOSE 8000
EXPOSE 8501

# Set the working directory
WORKDIR /src

# Start the application
CMD ["sh", "-c", "streamlit run UI/ui.py & uvicorn app.main:app --host 0.0.0.0 --port 8000"]
--------------------------------------------------------------------------------
/src/datalayer/datapreprocessor.py:
--------------------------------------------------------------------------------
from app import utils
from components.base_component import BaseComponent

pre_defined_sources = ('link', 'csv')


class DataPreprocessor(BaseComponent):
    def __init__(self, datasource):
        super().__init__('DataPreprocessor')
        self.sources = utils.read_yaml_file(datasource)
        self.csv_sources = self.sources.get('csv', [])
        self.html_sources = self.sources.get('link', [])

    def get_csv_sources(self):
        return self.csv_sources

    def get_html_sources(self):
        return self.html_sources

    def run(self, **kwargs):
        pass
--------------------------------------------------------------------------------
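A minimal usage sketch for the preprocessor above (illustrative, not a file in the repo; it assumes the process runs from the src directory, as the Dockerfile's WORKDIR does, so the manifest path resolves):

```python
from datalayer.datapreprocessor import DataPreprocessor

# Point the preprocessor at the YAML manifest of sources
pre = DataPreprocessor(datasource='datalayer/datasources.yml')
print(pre.get_csv_sources())   # e.g. ['data/AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv', ...]
print(pre.get_html_sources())  # e.g. ['https://www.worldbank.org/en/projects-operations/procurement?srce=both', ...]
```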
{e}") 23 | return {} 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/services/schema.yml: -------------------------------------------------------------------------------- 1 | properties: 2 | Organizations: 3 | type: string 4 | Project_Name: 5 | type: string 6 | location: 7 | type: string 8 | date: 9 | type: string 10 | financial_Value: 11 | type: string 12 | key_personals/POC: 13 | type: string 14 | Industry_type: 15 | type: string 16 | project_status: 17 | type: string 18 | Sustainability and Environmental Factors: 19 | type: string 20 | Collaboration and Partnerships: 21 | type: string 22 | Risk Factors: 23 | type: string 24 | Vendor and Supplier Information: 25 | type: string 26 | brief: 27 | type: string 28 | required: 29 | - Organizations 30 | - Project_Name 31 | - location 32 | - date 33 | - financial_Value 34 | - project_status -------------------------------------------------------------------------------- /tree.txt: -------------------------------------------------------------------------------- 1 | |-- Dockerfile 2 | |-- README.md 3 | |-- requirements.txt 4 | |-- src 5 | |-- |-- Model 6 | |-- |-- |-- llama-2-7b 7 | |-- |-- |-- checklist.chk 8 | |-- |-- |-- consolidated.00.pth 9 | |-- |-- |-- params.json 10 | |-- |-- UI 11 | |-- |-- |-- ui.py 12 | |-- |-- app 13 | |-- |-- |-- config.yml 14 | |-- |-- |-- llm.py 15 | |-- |-- |-- llm_prompter.py 16 | |-- |-- |-- main.py 17 | |-- |-- |-- utils.py 18 | |-- |-- components 19 | |-- |-- |-- base_component.py 20 | |-- |-- data 21 | |-- |-- |-- AssistanceListings_DataGov_PUBLIC_WEEKLY_20200321.csv 22 | |-- |-- |-- ContractOpportunities-20231031-045845.csv 23 | |-- |-- datalayer 24 | |-- |-- |-- KnowledgeGraph.py 25 | |-- |-- |-- Neo4jDumper.py 26 | |-- |-- |-- datapreprocessor.py 27 | |-- |-- |-- datasources.yml 28 | |-- |-- logs 29 | |-- |-- |-- logs.txt 30 | |-- |-- services 31 | |-- |-- Identity_retrival_for_csv.py 32 | |-- |-- Identity_retrival_for_html.py 33 | |-- |-- cypher_qa.py 34 | |-- |-- schema.yml 35 | -------------------------------------------------------------------------------- /src/services/cypher_qa.py: -------------------------------------------------------------------------------- 1 | from langchain.chains import GraphCypherQAChain 2 | from app.llm import Llm 3 | from components.base_component import BaseComponent 4 | from datalayer.Neo4jDumper import Neo4jDumper 5 | 6 | 7 | class CypherQa(BaseComponent): 8 | def __init__(self, model_name): 9 | super().__init__('cypher_qa') 10 | # instantiating the openai llm model and neo4j connection 11 | self.neo4j_instance = Neo4jDumper(config_path='app/config.yml') 12 | self.open_ai_llm = Llm(model=model_name) 13 | # schema = utils.read_yaml_file('services/schemaN.yml') 14 | # graph_schema = construct_schema(schema,[],[]) 15 | print(self.neo4j_instance.graph.schema) 16 | self.cypher_chain = GraphCypherQAChain.from_llm( 17 | cypher_llm=self.open_ai_llm.llm, 18 | qa_llm=self.open_ai_llm.llm, 19 | graph=self.neo4j_instance.graph, 20 | # validate_cypher=True, # Validate relationship directions 21 | verbose=True, 22 | ) 23 | 24 | def run(self, text): 25 | return self.cypher_chain.run(text) 26 | -------------------------------------------------------------------------------- /src/UI/ui.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import streamlit as st 3 | 4 | st.title("LLM-WebToGraph") 5 | st.text( 6 | 'This project using langchain and OpenAI LLM to transform data 
/src/UI/ui.py:
--------------------------------------------------------------------------------
import requests
import streamlit as st

st.title("LLM-WebToGraph")
st.text(
    'This project uses LangChain and OpenAI LLMs to transform data from different sources (web links/CSV) into a knowledge graph and store it in a Neo4j DB.')
st.write('Press a button below to process the data and generate the knowledge graph.')

if st.button("Process CSV files and generate knowledge graph"):
    # Send the request to FastAPI
    fastapi_url = "http://localhost:8000/generate_tags_from_csv"
    response = requests.get(fastapi_url)
    if response.status_code == 200:
        st.write(f"{response.text}")
    else:
        st.error(f"Error: {response.status_code}")

if st.button("Process HTML links and generate knowledge graph"):
    # Send the request to FastAPI
    fastapi_url = "http://localhost:8000/generate_tags_from_html"
    response = requests.get(fastapi_url)
    if response.status_code == 200:
        st.write(f"{response.text}")
    else:
        st.error(f"Error: {response.status_code}")

user_input = st.text_input("Ask any question about the data")
if st.button('Submit'):
    # Send user_input to FastAPI
    fastapi_url = f"http://localhost:8000/query_graph/{user_input}"
    response = requests.get(fastapi_url)
    if response.status_code == 200:
        st.write(f"{response.text}")
    else:
        st.error(f"Error: {response.status_code}")
--------------------------------------------------------------------------------
/src/components/base_component.py:
--------------------------------------------------------------------------------
import logging
from abc import ABC, abstractmethod
from functools import wraps
from typing import List, Union


def log_errors(run):
    """Route uncaught exceptions from a component method to the component's logger."""

    @wraps(run)
    def wrapper(self, *args, **kwargs):
        try:
            return run(self, *args, **kwargs)
        except Exception as e:
            self.log_error(f"An error occurred: {str(e)}", exception=e)

    return wrapper


class BaseComponent(ABC):
    """Base class for all components, providing logging and a uniform run interface."""

    def __init__(self, logger_name):
        self.logger = self._configure_logger(logger_name)

    @log_errors
    @abstractmethod
    def run(
            self,
            input: Union[str, List[float]],
    ) -> str:
        """Process the given input and return the result."""

    @log_errors
    async def run_async(
            self,
            input: Union[str, List[float]],
    ) -> str:
        """Asynchronous variant of run."""

    @staticmethod
    def _configure_logger(component_name):
        logger = logging.getLogger(component_name)
        logger.setLevel(logging.INFO)

        # Avoid attaching duplicate handlers when the same component is instantiated twice
        if not logger.handlers:
            # Create a file handler for logging to a file
            file_handler = logging.FileHandler('logs/logs.txt')

            # Create a formatter
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            file_handler.setFormatter(formatter)

            # Add the file handler to the logger
            logger.addHandler(file_handler)
        return logger

    def log_error(self, error_message, exception=None):
        # Centralized error handling
        self.logger.error(error_message)
        if exception:
            self.logger.error(f"Exception: {str(exception)}")
        # More error handling logic, such as alerts or notifications, can be added here
--------------------------------------------------------------------------------
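A small illustrative sketch of how a concrete component plugs into this base class (EchoComponent is a hypothetical name, not part of the repo; it assumes a logs/ directory exists, as the Dockerfile creates):

```python
from components.base_component import BaseComponent


class EchoComponent(BaseComponent):
    """Hypothetical component that logs and echoes its input."""

    def __init__(self):
        super().__init__('EchoComponent')

    def run(self, input):
        self.logger.info(f'received: {input}')
        return str(input)
```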
/src/app/llm.py:
--------------------------------------------------------------------------------
import os

import backoff
import openai  # for OpenAI API calls
from dotenv import load_dotenv
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

from app import utils
from components.base_component import BaseComponent

load_dotenv()


def get_schema():
    """
    The get_schema function reads the schema.yml file and returns a dictionary of the schema.

    :return: The schema
    :doc-author: Trelent
    """
    schema = utils.read_yaml_file('services/schema.yml')
    return schema


class Llm(BaseComponent):

    def __init__(self, model: str):
        super().__init__('Llm')
        self.model = model
        # for huggingface hub models
        # self.llm = HuggingFaceHub(repo_id='ValiantLabs/ShiningValiant', task='text-generation',
        #                           huggingfacehub_api_token=os.getenv('HF_AUTH_TOKEN'),
        #                           model_kwargs={"temperature": 0, "max_length": 64})
        self.llm = ChatOpenAI(temperature=0, model_name=model, openai_api_key=os.getenv('OPENAI_API_KEY'))

    @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def run(self, input_text):
        """
        The run function is the main entry point for this component. It extracts entities matching
        the schema from the given text using an extraction chain.

        :param self: Represent the instance of the class
        :param input_text: The text to extract entities from
        :return: The extraction chain's response, keyed by the slot names defined in the schema
        """
        schema = get_schema()
        self.logger.info(f'schema: {schema}')
        chain = create_extraction_chain(schema, self.llm)
        llm_response = chain.run(input_text)
        self.logger.info(f'llm_response: {llm_response}')
        return llm_response
--------------------------------------------------------------------------------
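A hedged sketch of calling this wrapper directly (illustrative, not a file in the repo; the input text is made up, OPENAI_API_KEY must be set in the environment or .env, and services/schema.yml must resolve from the working directory):

```python
from app.llm import Llm

llm = Llm(model='gpt-3.5-turbo')
# Returns entities matching services/schema.yml, e.g. Organizations, Project_Name, location
entities = llm.run(input_text="Acme Corp won a $5M road-rehabilitation contract in Kenya in 2023.")
print(entities)
```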
/src/datalayer/Neo4jDumper.py:
--------------------------------------------------------------------------------
from langchain.graphs import Neo4jGraph
from langchain.graphs.graph_document import GraphDocument
from neo4j import GraphDatabase

from app import utils
from components.base_component import BaseComponent
from datalayer.KnowledgeGraph import map_to_base_node, map_to_base_relationship


class Neo4jDumper(BaseComponent):
    def __init__(self, config_path):
        super().__init__('Neo4jDumper')
        config = utils.read_yaml_file(config_path)
        self.uri = config.get('neo4j').get('uri')
        self.username = config.get('neo4j').get('username')
        self.password = config.get('neo4j').get('password')
        self.graph = Neo4jGraph(
            url=self.uri, username=self.username, password=self.password
        )

    def dump_data(self, tx, data):
        for key, value in data.items():
            # Create a node for each key-value pair
            tx.run(query="CREATE (n:Node {key: $key, value: $value})", key=key, value=value)
            self.logger.info(f"Dumped data for {key}: {value} to neo4j")

    def run(self, data):
        try:
            # The context managers close the session and the driver automatically
            with GraphDatabase.driver(self.uri, auth=(self.username, self.password)) as driver:
                with driver.session() as session:
                    self.dump_data(session, data)
                    self.logger.info("Neo4j database connected and data dumped successfully.")
        except Exception as e:
            self.logger.error(f"Error while connecting to neo4j: {str(e)}")

    # New implementation using graph document
    def run2(self, data, document):
        try:
            graph = Neo4jGraph(
                url=self.uri, username=self.username, password=self.password
            )
            # Construct a graph document
            graph_document = GraphDocument(
                nodes=[map_to_base_node(node) for node in data.nodes],
                relationships=[map_to_base_relationship(rel) for rel in data.rels],
                source=document
            )
            # Store information into a graph
            graph.add_graph_documents([graph_document])

        except Exception as e:
            self.logger.error(f"Error while connecting to neo4j: {str(e)}")
--------------------------------------------------------------------------------
/src/datalayer/KnowledgeGraph.py:
--------------------------------------------------------------------------------
from typing import List, Optional

from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship
)
from langchain.pydantic_v1 import Field, BaseModel


class Property(BaseModel):
    """A single property consisting of key and value"""
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")


class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")


class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )


class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )


def format_property_key(s: str) -> str:
    """Convert a space-separated key like 'financial value' to camelCase ('financialValue')."""
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)


def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties


def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )
--------------------------------------------------------------------------------
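A quick illustrative use of the mapping helpers above (the node values are made up):

```python
from datalayer.KnowledgeGraph import Node, Property, map_to_base_node

node = Node(id="acme corp", type="organization",
            properties=[Property(key="financial value", value="$5M")])
base = map_to_base_node(node)
# base.id == "Acme Corp"; base.properties == {"financialValue": "$5M", "name": "Acme Corp"}
```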
/src/services/Identity_retrival_for_csv.py:
--------------------------------------------------------------------------------
from langchain.document_loaders.csv_loader import CSVLoader

from app import utils
from app.llm import Llm
from components.base_component import BaseComponent
from datalayer.Neo4jDumper import Neo4jDumper


class NameIdentityRetrievalForCsv(BaseComponent):
    def __init__(self, model_name, data_path):
        """
        The __init__ function is called when the class is instantiated.
        It sets up the instance of the class and defines all its attributes.

        :param self: Represent the instance of the class
        :param model_name: Instantiate the OpenAI LLM model
        :param data_path: Path to the YAML file which lists all CSV files
        :return: The instance of the class
        """
        super().__init__('NameIdentityRetrievalForCsv')
        self.sources = utils.read_yaml_file(data_path)
        self.csv_sources = self.sources.get('csv', [])
        # instantiating the openai llm model and neo4j connection
        self.neo4j_instance = Neo4jDumper(config_path='app/config.yml')
        self.open_ai_llm = Llm(model=model_name)

    def run(self, **kwargs):
        """
        The run function is the main function of this module. It takes a list of CSV files and extracts
        a knowledge graph from them using the OpenAI API. The knowledge graph is then dumped into the
        Neo4j database.

        :param self: Represent the instance of the class
        :return: None; results are written to Neo4j
        """
        for csvfile in self.csv_sources:
            # loading the csv using langchain document loader for csv
            loader = CSVLoader(file_path=csvfile)
            data = loader.load()

            # setting up openai model and extracting knowledge graph
            self.logger.info(f'loading model {self.open_ai_llm}')
            # Only the last row of the CSV is sent because the free-tier OpenAI API has a tight token
            # limit. A model with a larger context window (e.g. Anthropic's Claude 2) or a paid
            # OpenAI API key should be used instead.
            # response = self.open_ai_llm.extract_and_store_graph(document=data[-1])
            response = self.open_ai_llm.run(input_text=data[-1])
            # dumping the knowledge graph into Neo4j
            self.neo4j_instance.run(data=response)
            self.logger.info(f'knowledge graph populated successfully for data source: {csvfile}')
--------------------------------------------------------------------------------
/src/services/Identity_retrival_for_html.py:
--------------------------------------------------------------------------------
from typing import Union, List

from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer

from app import utils
from app.llm import Llm
from components.base_component import BaseComponent
from datalayer.Neo4jDumper import Neo4jDumper


class NameIdentityRetrievalForHtml(BaseComponent):
    def __init__(self, model_name, data_path):
        """
        The __init__ function is called when the class is instantiated.
        It sets up the initial values of all attributes for an instance of the class.

        :param self: Represent the instance of the class
        :param model_name: Specify the model name to use for extraction
        :param data_path: Path to the YAML file which lists the links to be scraped
        :return: Nothing
        """
        super().__init__('NameIdentityRetrievalForHtml')
        self.sources = utils.read_yaml_file(data_path)
        self.html_sources = self.sources.get('link', [])
        # instantiating the openai llm model and neo4j connection
        self.neo4j_instance = Neo4jDumper(config_path='app/config.yml')
        self.open_ai_llm = Llm(model=model_name)

    def run_async(self, **kwargs):
        """
        The run_async function runs the pipeline over the HTML sources.
        It takes the list of HTML sources and extracts a knowledge graph from them using the OpenAI API.
        The extracted knowledge graph is then dumped into the Neo4j database.

        :param self: Represent the instance of the object itself
        :param **kwargs: Pass a variable number of keyword arguments to a function
        :return: None; results are written to Neo4j
        """
        for link in self.html_sources:
            loader = AsyncHtmlLoader(link)
            html = loader.load()
            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["table"])
            self.logger.info(docs_transformed[0].page_content[0:500])

            # setting up openai model and extracting knowledge graph
            self.logger.info(f'loading model {self.open_ai_llm}')

            # Only the tail of the scraped content is sent because the free-tier OpenAI API has a tight
            # token limit. A model with a larger context window (e.g. Anthropic's Claude 2) or a paid
            # OpenAI API key should be used instead.
            # response = self.open_ai_llm.extract_and_store_graph(document=docs_transformed[0])
            tokens_cap = len(docs_transformed[0].page_content) - 4
            response = self.open_ai_llm.run(input_text=docs_transformed[0].page_content[tokens_cap:])
            # dumping the knowledge graph into Neo4j
            self.neo4j_instance.run(data=response)
            self.logger.info(f'knowledge graph populated successfully for data source: {link}')

    def run(self, input: Union[str, List[float]]) -> str:
        pass
--------------------------------------------------------------------------------
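A standalone, hedged sketch of the scrape-and-transform step used above (illustrative, not a file in the repo; it assumes network access, and the URL is taken from datasources.yml):

```python
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Fetch the page and keep only the <table> markup, as the service does
docs = AsyncHtmlLoader(["https://www.worldbank.org/en/projects-operations/procurement?srce=both"]).load()
tables = BeautifulSoupTransformer().transform_documents(docs, tags_to_extract=["table"])
print(tables[0].page_content[:500])
```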
/src/app/main.py:
--------------------------------------------------------------------------------
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse

from app import utils
from services.Identity_retrival_for_csv import NameIdentityRetrievalForCsv
from services.Identity_retrival_for_html import NameIdentityRetrievalForHtml
from services.cypher_qa import CypherQa

app = FastAPI(
    title="LLM-WebToGraph",
    description="""This project uses LangChain and OpenAI LLMs to transform data from different sources
    (web links/CSV) into a knowledge graph and stores it in a Neo4j DB.""",
    version="0.1.0",
)


@app.get("/query_graph/{question}")
def query_graph(question: str):
    """
    The query_graph function takes a question as input and returns the answer to that question.
    It uses the CypherQa class from services.cypher_qa, which wraps LangChain's GraphCypherQAChain:
    the LLM (gpt-3.5-turbo) translates the question into a Cypher query, runs it against the graph,
    and phrases the result.

    :param question: str: Pass the question to the function
    :return: An HTMLResponse object containing the answer
    """
    graph_cypher_qachain = CypherQa(model_name='gpt-3.5-turbo')
    response = graph_cypher_qachain.run(question)
    return HTMLResponse(content=response, status_code=200)


@app.get("/generate_tags_from_html")
async def generate_tags_from_html():
    """
    The generate_tags_from_html function is a ReST endpoint that generates the tags for the HTML data sources.
    It can be called by an external service, such as Jenkins or Travis CI, to ensure that the tags are up to date.
    The function returns a 200 status code if successful and 500 otherwise.

    :return: An HTMLResponse object with the content 'Successfully generated the knowledge from the data sources!!!' and status_code 200
    """
    ner = NameIdentityRetrievalForHtml(model_name='gpt-3.5-turbo', data_path='datalayer/datasources.yml')
    ner.run_async()  # asynchronous call since html pages can take time to load and scrape
    return HTMLResponse(content='Successfully generated the knowledge from the data sources!!!', status_code=200)


@app.get("/generate_tags_from_csv")
def generate_tags_from_csv():
    """
    The generate_tags_from_csv function is a ReST endpoint that generates the tags for each of the CSV data sources.
    It uses the NameIdentityRetrievalForCsv class to accomplish this task.
    The model_name and data_path are passed as parameters to that class.

    :return: An HTMLResponse object with the status code 200
    """
    ner = NameIdentityRetrievalForCsv(model_name='gpt-3.5-turbo', data_path='datalayer/datasources.yml')
    ner.run()
    return HTMLResponse(content='Successfully generated the knowledge from the data sources!!!', status_code=200)


# health check route
@app.get("/health")
def health_check():
    return {"status": "healthy"}


if __name__ == '__main__':
    app_config = utils.read_yaml_file('app/config.yml')
    load_dotenv()
    uvicorn.run(app, port=app_config.get('port'), host=app_config.get('host'))
--------------------------------------------------------------------------------
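A few illustrative calls against these endpoints, mirroring what the Streamlit UI does (assuming the API is running locally on port 8000; the question text is made up):

```python
import requests

BASE = "http://localhost:8000"
print(requests.get(f"{BASE}/health").json())                # {'status': 'healthy'}
print(requests.get(f"{BASE}/generate_tags_from_csv").text)  # populates the graph from the CSVs
print(requests.get(f"{BASE}/query_graph/Which organizations are listed?").text)
```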
/README.md:
--------------------------------------------------------------------------------
# LLM-WebToGraph

LLM-WebToGraph is a powerful project that harnesses the capabilities of LangChain and OpenAI's Language Models (LLMs) to scrape data from various sources on the web, transforming it into a structured knowledge graph. This knowledge graph is then populated into a Neo4j Aura Database, providing an efficient way to store, query, and retrieve information using Cypher queries and LLMs. With the synergy of LangChain, OpenAI LLMs, and Neo4j, this project offers a robust solution for knowledge management and retrieval.

## Architecture
![design](https://github.com/prvnsingh/LLM-WebToGraph/blob/main/design.jpeg?raw=true)


## Overview

The LLM-WebToGraph project combines several key components to achieve its goal:

1. **LangChain:** A framework for building applications around language models, powering the core of the project.

2. **OpenAI's Language Models (LLMs):** These models are used to extract and process data from various sources, converting unstructured data into structured knowledge.

3. **Neo4j Aura Database:** The project stores the structured knowledge graph in a Neo4j Aura Database, allowing for efficient storage and retrieval.

4. **FastAPI:** To expose an API for interacting with the project and to check its health status.

5. **Streamlit:** For building a user-friendly interface to query and visualize the knowledge graph.

## Features

- Web scraping from various sources, such as web links and CSV files.
- Data transformation and extraction using an OpenAI LLM (gpt-3.5-turbo).
- Population of a structured knowledge graph in a Neo4j Aura Database.
- FastAPI-based health check API to monitor the application's status.
- Streamlit web application for querying and visualizing the knowledge graph.

## Getting Started
1. Configure the data sources
   - Update the .csv data files in the data directory.
   - Update the HTML links in datasources.yml.
2. Set up environment variables
   - Add credentials such as the OpenAI API key and the Neo4j DB password to a .env file, or set them as environment variables.
3. Configure schema.yml for identities and relationships
   - Modify schema.yml to specify the identities to be recognized.
4. Run the Streamlit UI and FastAPI app
   - Build the Docker image and run it with the env file:
~~~sh
sudo docker build -t image_name .
sudo docker run --env-file .env -p 8501:8501 -p 8000:8000 image_name
~~~
To access the application
~~~html
http://localhost:8501/
~~~

To check backend APIs, access the swagger at
```html
http://localhost:8000/docs
```
## Working directory
![Directory Tree](https://github.com/prvnsingh/LLM-WebToGraph/blob/main/dirTree.jpg?raw=true)

## Demo snapshot
![Demo snapshot](https://github.com/prvnsingh/LLM-WebToGraph/blob/main/working.jpg?raw=true)

## Contributing

Contributions to the LLM-WebToGraph project are welcome! If you'd like to contribute, please follow these guidelines:

- Fork the repository.
- Create a new branch for your feature or bug fix.
- Make your changes and ensure tests pass.
- Submit a pull request.

## Future Scope
In the future, the project can be extended with a microservices architecture, including:

- A separate data service responsible for ingesting data from S3.
- Utilization of a Selenium bot to scrape the web and download CSV files.
- Integration with more data sources for enhanced knowledge graph creation.

## References
- [Langchain Graph Transformer Documentation](https://python.langchain.com/docs/use_cases/graph/diffbot_graphtransformer)
- [Langchain Cypher Query Documentation](https://python.langchain.com/docs/use_cases/graph/graph_cypher_qa)
- [Blog Post: Constructing Knowledge Graphs from Text](https://blog.langchain.dev/constructing-knowledge-graphs-from-text-using-openai-functions/)

## License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

## Contact

For questions or support, feel free to contact us at [prvns1997@gmail.com](mailto:prvns1997@gmail.com).
--------------------------------------------------------------------------------
/src/app/llm_prompter.py:
--------------------------------------------------------------------------------
import os
from typing import List, Optional

import backoff
import openai  # for OpenAI API calls
from dotenv import load_dotenv
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

from app import utils
from components.base_component import BaseComponent
from datalayer.KnowledgeGraph import KnowledgeGraph

load_dotenv()


def get_schema():
    """
    The get_schema function reads the schema.yml file and returns a dictionary of the schema.

    :return: The schema
    :doc-author: Trelent
    """
    schema = utils.read_yaml_file('services/schema.yml')
    return schema


class LlmPrompter(BaseComponent):

    def __init__(self, model: str):
        super().__init__('LlmPrompter')

        self.model = model
        # for huggingface hub models
        # self.llm = HuggingFaceHub(repo_id='ValiantLabs/ShiningValiant', task='text-generation',
        #                           huggingfacehub_api_token=os.getenv('HF_AUTH_TOKEN'),
        #                           model_kwargs={"temperature": 0, "max_length": 64})
        self.llm = ChatOpenAI(temperature=0, model_name=model, openai_api_key=os.getenv('OPENAI_API_KEY'))

    @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def run(self, document: Document,
            nodes: Optional[List[str]] = None,
            rels: Optional[List[str]] = None):
        return self.extract_and_store_graph(document, nodes, rels)

    def get_extraction_chain(self,
                             allowed_nodes: Optional[List[str]] = None,
                             allowed_rels: Optional[List[str]] = None
                             ):
        prompt = ChatPromptTemplate.from_messages(
            [(
                "system",
                f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to nodes for the largest infrastructure projects.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
"""),
                ("human", "Use the given format to extract information from the following input: {input}"),
                ("human", "Tip: Make sure to answer in the correct format"),
            ])
        return create_structured_output_chain(KnowledgeGraph, self.llm, prompt, verbose=False)

    def extract_and_store_graph(self,
                                document: Document,
                                nodes: Optional[List[str]] = None,
                                rels: Optional[List[str]] = None):
        # Extract graph data using OpenAI functions
        extract_chain = self.get_extraction_chain(nodes, rels)
        data = extract_chain.run(document.page_content)
        return data
--------------------------------------------------------------------------------
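Finally, a hedged sketch of driving the prompter end to end (illustrative, not a file in the repo; the document text is made up and OPENAI_API_KEY is assumed to be set):

```python
from langchain.schema import Document

from app.llm_prompter import LlmPrompter

prompter = LlmPrompter(model='gpt-3.5-turbo')
# Returns a KnowledgeGraph (nodes + rels), which Neo4jDumper.run2 can then persist
graph = prompter.run(Document(page_content="Acme Corp is building a $2B dam in Laos with HydroBuild Ltd."))
print(graph.nodes, graph.rels)
```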