├── __pycache__ ├── chain.cpython-311.pyc └── retrievers.cpython-311.pyc ├── zeroshot.cfg ├── __init__.py ├── README.md ├── utils.py ├── .devcontainer └── devcontainer.json ├── requirements.txt ├── chain.py ├── retrievers.py ├── functions.py ├── ingest.py ├── App.py └── openaiKG.ipynb /__pycache__/chain.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leannchen86/openai-knowledge-graph-streamlit-app/HEAD/__pycache__/chain.cpython-311.pyc -------------------------------------------------------------------------------- /__pycache__/retrievers.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leannchen86/openai-knowledge-graph-streamlit-app/HEAD/__pycache__/retrievers.cpython-311.pyc -------------------------------------------------------------------------------- /zeroshot.cfg: -------------------------------------------------------------------------------- 1 | [paths] 2 | examples = null 3 | 4 | [nlp] 5 | lang = "en" 6 | pipeline = ["ner", "llm_rel"] 7 | 8 | [components] 9 | 10 | [components.ner] 11 | source = "en_core_web_md" 12 | 13 | [components.llm_rel] 14 | factory = "llm" 15 | 16 | [components.llm_rel.task] 17 | @llm_tasks = "spacy.REL.v1" 18 | 19 | [components.llm_rel.model] 20 | @llm_models = "spacy.GPT-4.v3" 21 | name = "gpt-4-0125-preview" 22 | config = {"temperature": 0.0} 23 | 24 | [initialize] 25 | vectors = "en_core_web_md" -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced RAG Streamlit Chatbot - README 2 | 3 | This AI chatbot integrates a Spacy-LLM constructed knowledge graph with an advanced RAG (Retrieval Augmented Generation), offering context-aware interactions. Built using Streamlit, it features an intuitive UI and a neo4j-like graph interface for exploring entity relationships. This chatbot is fed with data related to the OpenAI Saga. 4 | 5 | Highlights 6 | 7 | 1. Advanced RAG System: 8 | Utilizes OpenAI's language models for relevant, informed responses. 9 | 10 | 2. Knowledge Graph Integration: 11 | Leverages Neo4j's comprehensive network of entity relationships for deeper insights. 12 | 13 | 3. Streamlit Interface: 14 | Offers a user-friendly, easy-to-navigate experience. 15 | 16 | Note: retrievers.py and chain.py are from langchain's neo4j_advanced_rag template: 17 | 18 | https://github.com/langchain-ai/langchain/tree/master/templates/neo4j-advanced-rag?ref=blog.langchain.dev 19 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
def show_code(demo):
    """Render the source of *demo* in the app while the sidebar box is ticked.

    A "Show code" checkbox (ticked by default) is added to the sidebar. While
    it is ticked, a "Code" heading is written, followed by the source lines of
    ``demo`` with the first line dropped and the common indentation removed.
    """
    if not st.sidebar.checkbox("Show code", True):
        return

    st.markdown("## Code")
    # Drop the first source line so only what follows it is displayed.
    body_lines = inspect.getsourcelines(demo)[0][1:]
    st.code(textwrap.dedent("".join(body_lines)))
def process_graph_result(context):
    """Convert query records into graph-visualisation nodes and edges.

    Every record is read through the keys ``'p.name'``, ``'o.name'`` and
    ``'type(r)'``. One ``Node`` is created per distinct name, in order of first
    appearance, and one ``Edge`` is created per record.

    Returns:
        A ``(nodes, edges)`` tuple of lists.
    """
    nodes, edges = [], []
    seen_names = set()  # names that already have a Node

    for record in context:
        source, target = record['p.name'], record['o.name']

        # Register each endpoint once, subject first.
        for name in (source, target):
            if name not in seen_names:
                nodes.append(Node(id=name, label=name, size=5, shape="circle"))
                seen_names.add(name)

        edges.append(Edge(source=source, target=target, label=record['type(r)']))

    return nodes, edges
"""Ingest Wikipedia text into Neo4j as parent/child/question/summary nodes.

Running this module loads the source documents, splits them into parent and
child chunks, embeds every chunk, generates hypothetical questions and a
summary per parent chunk, and creates one vector index per node label.
"""
from pathlib import Path
from typing import List
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.graphs import Neo4jGraph
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
from neo4j.exceptions import ClientError
import os

graph = Neo4jGraph()

# Load Wikipedia Data
all_data = WikipediaLoader(query="Removal_of_Sam_Altman_from_OpenAI").load()

# Embeddings & LLM models
embeddings = OpenAIEmbeddings()
embedding_dimension = 1536
llm = ChatOpenAI(temperature=0)


def _create_vector_index(index_name, label):
    """Create a cosine vector index on ``label.embedding``; ignore ClientError."""
    try:
        graph.query(
            "CALL db.index.vector.createNodeIndex($index_name, "
            "$label, 'embedding', $dimension, 'cosine')",
            {
                "index_name": index_name,
                "label": label,
                "dimension": embedding_dimension,
            },
        )
    except ClientError:  # already exists
        pass


# Process All Data
parent_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
child_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=24)

# Split every loaded document in ONE pass. Splitting per document and
# numbering the chunks inside that loop restarts the ids at 0 for each
# document (so MERGE overwrites earlier parents) and leaves only the last
# document's chunks available to the question and summary passes below.
parent_documents = parent_splitter.split_documents(all_data)

# Ingest Parent-Child node pairs
for i, parent in enumerate(parent_documents):
    child_documents = child_splitter.split_documents([parent])
    params = {
        "parent_text": parent.page_content,
        "parent_id": i,
        "parent_embedding": embeddings.embed_query(parent.page_content),
        "children": [
            {
                "text": c.page_content,
                "id": f"{i}-{ic}",
                "embedding": embeddings.embed_query(c.page_content),
            }
            for ic, c in enumerate(child_documents)
        ],
    }
    # Ingest data
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    SET p.text = $parent_text
    WITH p
    CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
    YIELD node
    WITH p
    UNWIND $children AS child
    MERGE (c:Child {id: child.id})
    SET c.text = child.text
    MERGE (c)<-[:HAS_CHILD]-(p)
    WITH c, child
    CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )

# Create vector index for child
_create_vector_index("parent_document", "Child")
# Create vector index for parents
_create_vector_index("typical_rag", "Parent")

# Ingest hypothetical questions


class Questions(BaseModel):
    """Generating hypothetical questions about text."""

    questions: List[str] = Field(
        ...,
        description=(
            "Generated hypothetical questions based on " "the information from the text"
        ),
    )


questions_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                "You are generating hypothetical questions based on the information "
                "found in the text. Make sure to provide full context in the generated "
                "questions."
            ),
        ),
        (
            "human",
            (
                "Use the given format to generate hypothetical questions from the "
                "following input: {input}"
            ),
        ),
    ]
)

question_chain = create_structured_output_chain(Questions, llm, questions_prompt)

for i, parent in enumerate(parent_documents):
    questions = question_chain.run(parent.page_content).questions
    params = {
        "parent_id": i,
        "questions": [
            {"text": q, "id": f"{i}-{iq}", "embedding": embeddings.embed_query(q)}
            for iq, q in enumerate(questions)
            if q
        ],
    }
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    WITH p
    UNWIND $questions AS question
    CREATE (q:Question {id: question.id})
    SET q.text = question.text
    MERGE (q)<-[:HAS_QUESTION]-(p)
    WITH q, question
    CALL db.create.setVectorProperty(q, 'embedding', question.embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )

# Create vector index
_create_vector_index("hypothetical_questions", "Question")

# Ingest summaries

summary_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            (
                "You are generating concise and accurate summaries based on the "
                "information found in the text."
            ),
        ),
        (
            "human",
            ("Generate a summary of the following input: {question}\n" "Summary:"),
        ),
    ]
)

summary_chain = summary_prompt | llm

for i, parent in enumerate(parent_documents):
    summary = summary_chain.invoke({"question": parent.page_content}).content
    params = {
        "parent_id": i,
        "summary": summary,
        "embedding": embeddings.embed_query(summary),
    }
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    MERGE (p)-[:HAS_SUMMARY]->(s:Summary)
    SET s.text = $summary
    WITH s
    CALL db.create.setVectorProperty(s, 'embedding', $embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )

# Create vector index
_create_vector_index("summary", "Summary")
# Function to fetch data from Neo4j
def fetch_graph_data(nodesType=None, relType=None, direct_cypher_query=None, intermediate_steps=None):
    """Return ``(nodes, edges)`` for the graph view.

    Args:
        nodesType: Node labels selected in the sidebar, or None.
        relType: Relationship types selected in the sidebar, or None.
        direct_cypher_query: When truthy, nothing is queried here; the records
            are taken from ``intermediate_steps[1]['context']`` instead.
        intermediate_steps: Steps produced alongside ``direct_cypher_query``;
            only read when that argument is truthy.

    Returns:
        A tuple of two lists. Both are empty when there is neither a direct
        query nor any selected filter (previously this raised
        ``UnboundLocalError``).
    """
    # Use the context already fetched for the direct Cypher query if provided
    if direct_cypher_query:
        context = intermediate_steps[1]['context']
        nodes, edges = process_graph_result(context)
        return nodes, edges

    # Nothing selected: there is nothing to draw.
    if not (nodesType or relType):
        return [], []

    # Construct the Cypher query based on selected filters
    cypher_query = construct_cypher_query(nodesType, relType)
    credentials = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    # Close the driver as well as the session; a driver is opened per call.
    with GraphDatabase.driver(os.environ["NEO4J_URI"], auth=credentials) as driver:
        with driver.session() as session:
            # Records must be consumed while the session is still open.
            nodes, edges = process_graph_result_select(session.run(cypher_query))
    return nodes, edges


# Function to construct the Cypher query based on selected filters
def construct_cypher_query(node_types, rel_types):
    """Build a Cypher query returning ``p, r, n`` for the selected filters.

    Args:
        node_types: Labels allowed on the start node ``p``. None or empty
            means no label restriction.
        rel_types: Allowed relationship types for ``r``. None or empty means
            no type restriction.

    Returns:
        A single ``MATCH (p)-[r]->(n) [WHERE ...] RETURN p, r, n`` string.

    The values are interpolated verbatim, so they must come from a trusted,
    fixed list of names.
    """
    labels = list(node_types or [])
    types = list(rel_types or [])

    conditions = []
    if labels:
        # OR the label predicates; OR-ing whole patterns after MATCH is not
        # valid Cypher.
        conditions.append("(" + " OR ".join(f"p:{label}" for label in labels) + ")")
    if types:
        quoted = ", ".join(f"'{rel_type}'" for rel_type in types)
        conditions.append(f"type(r) IN [{quoted}]")

    query = "MATCH (p)-[r]->(n)"
    if conditions:
        query += " WHERE " + " AND ".join(conditions)
    return query + " RETURN p, r, n"
def process_graph_result_select(result):
    """Convert ``p, r, n`` records into graph-visualisation nodes and edges.

    For every record the ``'name'`` entry of ``record['p']`` and of
    ``record['n']`` identifies the two endpoints. One ``Node`` is created per
    distinct name, in order of first appearance, and one ``Edge`` per record.
    The edge label is ``r.type``, with the relationship's ``'date'`` entry
    appended in parentheses when ``r`` contains one.

    Returns:
        A ``(nodes, edges)`` tuple of lists.
    """
    nodes, edges = [], []
    seen_names = set()  # names that already have a Node

    for record in result:
        start, end = record['p'], record['n']
        start_name, end_name = start['name'], end['name']

        # Register each endpoint once, start node first.
        for name in (start_name, end_name):
            if name not in seen_names:
                nodes.append(Node(id=name, label=name, size=5, shape="circle"))
                seen_names.add(name)

        # Process edges, include the date in the label if it exists
        rel = record['r']
        label = rel.type
        if 'date' in rel:
            label = f"{rel.type} ({rel['date']})"
        edges.append(Edge(source=start_name, target=end_name, label=label))

    return nodes, edges
'CEO_OF', 'FORMER_MEMBER_OF', 'CURRENT_MEMBER_OF','REMAIN_MEMBER_OF', 'SCHEDULES_CALL_WITH', 132 | 'QUESTIONED_FIRING_SAM', 'FOUNDED_BY', 'INVESTED_IN', 'CONSIDERS_BOARD_SEAT', 'FORMER_CTO_OF', 'INFORMED_OF_FIRING', 'FIRED_AS_CEO', 133 | 'ALL_HANDS_MEETING', 'RESIGNS_FROM', 'APPOINTED_INTERIM_CEO', 'JOINS_MICROSOFT', 'THREATEN_TO_RESIGN', 'CONSIDERS_MERGER_WITH', 134 | 'IN_TALKS_WITH_BOARD', 'RETURNS_AS_CEO', 'RETURNS_TO', 'CONSIDERS_BOARD_SEAT', 'AIMS_TO_DEVELOP_AGI_WITH', 'QUESTIONED_FIRING_SAM', 135 | 'FOUNDED_BY', 'INVESTED_IN', 'DEMOTED_FROM', 'RELEASES_HIRING_STATEMENT', 'HIRED_BY', 'REGRETS_FIRING','MENTIONS', 'EXPLAINS_DECISIONS', 'DESCRIBES', 'FORMER_PRESIDENT'] 136 | 137 | st.sidebar.header('Filters') 138 | selected_node_types = st.sidebar.multiselect('Node Types', node_types, default=node_types) 139 | selected_relationship_types = st.sidebar.multiselect('Relationship Types', relationship_types, default=relationship_types) 140 | 141 | # Initialize state variables and check for changes in selections 142 | if 'prev_node_types' not in st.session_state: 143 | st.session_state.prev_node_types = selected_node_types 144 | if 'prev_relationship_types' not in st.session_state: 145 | st.session_state.prev_relationship_types = selected_relationship_types 146 | 147 | # Update graph if selections change 148 | if (selected_node_types != st.session_state.prev_node_types or 149 | selected_relationship_types != st.session_state.prev_relationship_types): 150 | st.session_state.prev_node_types = selected_node_types 151 | st.session_state.prev_relationship_types = selected_relationship_types 152 | # Construct and fetch new graph data 153 | cypher_query = construct_cypher_query(selected_node_types, selected_relationship_types) 154 | nodes, edges = fetch_graph_data(nodesType=selected_node_types, relType=selected_relationship_types) 155 | # Define the configuration for the graph visualization 156 | config = Config(height=600, width=800, directed=True, 
def combine_contexts(structured, unstructured, client):
    """Merge the graph answer and the document answer into one reply.

    Args:
        structured: Answer text obtained from the knowledge-graph query.
        unstructured: Answer text obtained from the document retriever.
        client: Object exposing ``chat.completions.create(...)``.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    # Adjacent literals keep the prompt free of the indentation runs that a
    # backslash continuation inside a string literal would embed.
    system_prompt = (
        "You are an assistant of an advanced retrieval augmented system, "
        "who prioritizes accuracy and is very context-aware. "
        "Please summarize text from the following and generate "
        "a comprehensive, logical and context_aware answer."
    )
    # Keep the two contexts apart so the last sentence of one does not run
    # into the first sentence of the other; skip a missing/empty context.
    user_content = "\n\n".join(
        str(part) for part in (structured, unstructured) if part
    )

    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_content},
    ]
    completion = client.chat.completions.create(model="gpt-4",
                                                messages=messages,
                                                temperature=0)
    return completion.choices[0].message.content
API key to continue.") 198 | else: 199 | # Display response 200 | # Initialize the GraphCypherQAChain from chain.py 201 | from langchain.chains import GraphCypherQAChain 202 | cypher_chain = GraphCypherQAChain.from_llm( 203 | cypher_llm=ChatOpenAI(temperature=0, model_name='gpt-4', api_key=openai_api_key), 204 | qa_llm=ChatOpenAI(temperature=0, api_key=openai_api_key), 205 | graph=graph, 206 | verbose=True, 207 | return_intermediate_steps=True 208 | ) 209 | # Update session state with new message 210 | st.session_state.messages.append({"role": "user", "content": prompt}) 211 | st.chat_message("user").write(prompt) 212 | response_structured, nodes, edges= process_query(prompt) 213 | response_nonstructured = chain_txt.invoke( 214 | {"question": prompt}, 215 | {"configurable": {"strategy": "parent_strategy"}}, 216 | ) 217 | config = Config(height=600, width=800, directed=True, nodeHighlightBehavior=True, highlightColor="#F7A7A6") 218 | final_ans = combine_contexts(response_structured, response_nonstructured, client) 219 | st.session_state.messages.append({"role": "assistant", "content": final_ans}) 220 | st.chat_message("assistant").write(final_ans) 221 | agraph(nodes=nodes, edges=edges, config=config) 222 | 223 | -------------------------------------------------------------------------------- /openaiKG.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "57ea0bce-75b1-4e35-a17c-53ae7b1ccff6", 6 | "metadata": {}, 7 | "source": [ 8 | "# Get Wiki info of OpenAI key stakeholders" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "2cc4fb2b-08e4-43a0-a37e-6cdd62f5dc83", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# !pip install langchain\n", 19 | "# !pip install wikipedia" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 8, 25 | "id": "33a04ef2-16aa-41dc-9ee1-a53703275d41", 26 | "metadata": 
{}, 27 | "outputs": [], 28 | "source": [ 29 | "# Load the OpenAI Wikipedia page\n", 30 | "from langchain.document_loaders import WikipediaLoader\n", 31 | "from langchain.text_splitter import CharacterTextSplitter\n", 32 | "raw_documents = WikipediaLoader(query=\"OpenAI\").load()\n", 33 | "\n", 34 | "# Define chunking strategy\n", 35 | "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n", 36 | " chunk_size=1000, chunk_overlap=20\n", 37 | ")\n", 38 | "# Chunk the document\n", 39 | "documents = text_splitter.split_documents(raw_documents)\n", 40 | "for d in documents:\n", 41 | " del d.metadata[\"summary\"]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 11, 47 | "id": "9da3f01d-ae49-4d18-8b27-86932b67a5f3", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "https://en.wikipedia.org/wiki/OpenAI\n", 55 | "https://en.wikipedia.org/wiki/Removal_of_Sam_Altman_from_OpenAI\n", 56 | "https://en.wikipedia.org/wiki/OpenAI_Five\n", 57 | "https://en.wikipedia.org/wiki/Greg_Brockman\n", 58 | "https://en.wikipedia.org/wiki/Ilya_Sutskever\n", 59 | "https://en.wikipedia.org/wiki/Emmett_Shear\n", 60 | "https://en.wikipedia.org/wiki/Artificial_general_intelligence\n", 61 | "https://en.wikipedia.org/wiki/Mira_Murati\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "for doc in documents:\n", 67 | " print(doc.metadata['source'])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 10, 73 | "id": "5c5d5a34-d079-4444-88f9-511da29fe523", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "documents.remove(documents[2])\n", 78 | "documents.remove(documents[3])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "97cd6970-d1c4-444b-81bf-f2594c2b2045", 84 | "metadata": {}, 85 | "source": [ 86 | "# Enable Neo4j database" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "64657236-720e-4953-b598-8931eec0244d", 93 
| "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# !pip install pypdf" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "1c9550f5-cc18-46c2-9469-1f43991f3f1c", 102 | "metadata": {}, 103 | "source": [ 104 | "# News Articles" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 8, 110 | "id": "aba1b836-437b-40ca-be38-3f1d11a42cc7", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "from langchain.document_loaders import PyPDFLoader\n", 115 | "from langchain.docstore.document import Document\n", 116 | "import os\n", 117 | "\n", 118 | "# Directory containing your PDF files\n", 119 | "directory_path = '/Users/leannchen/Documents/tcnews'\n", 120 | "\n", 121 | "# Initialize PyPDFLoader for each PDF in the directory\n", 122 | "loaders = [PyPDFLoader(os.path.join(directory_path, f)) for f in os.listdir(directory_path) if f.endswith('.pdf')]\n", 123 | "\n", 124 | "# Load documents from PDFs\n", 125 | "news_docs = []\n", 126 | "for loader in loaders:\n", 127 | " news_docs.extend(loader.load())\n", 128 | "\n", 129 | "# Prepare the content and metadata for each news article as Document objects\n", 130 | "news_articles_data = [\n", 131 | " Document(\n", 132 | " page_content=doc.page_content, # Assuming this is how you access the page content of the document\n", 133 | " metadata={\n", 134 | " \"source\": doc.metadata['source'].removeprefix('/Users/leannchen/Documents/tcnews/'), # Assuming this is the metadata format\n", 135 | " # Include any other metadata items here\n", 136 | " }\n", 137 | " )\n", 138 | " for doc in news_docs # Assuming news_docs is a list of objects with page_content and metadata\n", 139 | "]\n", 140 | "\n", 141 | "# Later, when you are ready to add them to the database:\n", 142 | "# Call add_documents and construct Document objects inline\n", 143 | "# Assuming news_articles_data is already a list of Document objects\n", 144 | "# neo4j_db.add_documents(\n", 145 | "# news_articles_data,\n", 146 
# (tail of the commented-out block that starts in the previous lines)
# ids=[f"news_article_{i}" for i in range(len(news_articles_data))]

# )

# %%
# One-time environment setup (notebook shell commands, kept disabled).
# !pip install spacy-llm
# !pip install --upgrade jupyter ipywidgets

# %%
all_data = documents + news_articles_data

# %%
# Notebook display of the combined document list.
all_data

# %% [markdown]
# # Perform Article Summaries as Relationship Extraction Database

# %%
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
# CharacterTextSplitter is used further down in this cell; importing it here
# keeps the cell runnable on a fresh kernel instead of relying on another cell.
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Character-based splitter. It is not used by the summarization chain below;
# NOTE(review): presumably kept for other cells -- confirm before removing.
rtext_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Chat model shared by the map and the reduce step (temperature 0).
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

# Map step: prompt applied to every chunk of the split documents.
map_template = """The following is a set of documents
{all_data}
Based on this list of docs, please perform concise summaries while extracting essential relationships for relationships analysis later, please do include dates of actions or events, which are very important for timeline analysis later. Example: "Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, Friday)", which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Documents to summarize. This rebinds `all_data` with the news articles first.
all_data = news_articles_data + documents

# Reduce step: prompt that merges the per-chunk summaries.
reduce_template = """The following is set of summaries:
{all_data}
Take these and distill it into concise summaries of the articles while containing important relationships and events (including the timeline). Example: "Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, Friday)", which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Stuffs a list of summaries into the single `{all_data}` slot of reduce_prompt.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    # Must match the variable name used in reduce_prompt.
    document_variable_name="all_data",
)

# Combines and iteratively reduces the mapped documents.
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # Used if the documents exceed the context for `StuffDocumentsChain`.
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Maps `map_chain` over every chunk, then combines the results.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="all_data",
    # Do not return the results of the map steps in the output
    return_intermediate_steps=False,
)

# Token-based chunking (tiktoken) of the documents fed to the map step.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(all_data)

# Run the MapReduce chain; the result is the combined summary text.
summarization_results = map_reduce_chain.run(split_docs)
Microsoft's leadership, particularly CEO Satya Nadella, has been projecting a 'business as usual' message during these upheavals at OpenAI.\\n\\n6. Microsoft successfully navigated through U.K. and EU competition authorities to merge with Activision by restructuring the deal and agreeing to certain conditions. However, its market power in cloud computing and potential influence over OpenAI is raising concerns among competition regulators.\\n\\n7. Emmett Shear, the ex-CEO of Twitch, was appointed as the interim CEO of OpenAI after Sam Altman was fired and replaced by CTO Mira Murati. Shear plans to hire an independent investigator to look into the events leading up to his appointment and reform the management and leadership team at OpenAI.\\n\\n8. OpenAI, an American AI research organization, was founded in December 2015. Microsoft invested $1 billion in OpenAI Global LLC in 2019 and $10 billion in 2023. On November 17, 2023, Sam Altman was removed as CEO and Greg Brockman was removed as chairman of OpenAI. Both returned four days later after negotiations with the board.\\n\\n9. Mira Murati, born in Albania in 1988, is a technology executive who has worked at Tesla, Leap Motion, and OpenAI. She served as the CTO of OpenAI from 2018, leading projects like ChatGPT, Dall-E, and Codex. She briefly served as interim CEO of OpenAI in November 2023 after Sam Altman was removed, but was replaced by Emmett Shear after three days.\\n\\n10. Sam Altman, born in 1985, is a technology entrepreneur who co-founded Loopt and Hydrazine Capital, and served as a partner and president at Y Combinator. He was the CEO of OpenAI from 2019 until his removal in November 2023, but was reinstated five days later. 
# %%
# Notebook display of the combined summary produced by the map-reduce chain.
summarization_results

# %%
# Store summarization_results to a text file for future use.
# The timeline will be added to the summaries later.
# The encoding is explicit because the summary contains non-ASCII characters
# (typographic apostrophes), so the platform default encoding is not reliable.
with open('summary.txt', 'w', encoding='utf-8') as file:
    file.write(str(summarization_results))

# %%
# Equivalent of the notebook shell magic `!pip install spacy-llm`.
get_ipython().system('pip install spacy-llm')

# %% [markdown]
# # Entity and Relationship
"('Altman', 'PERSON'), ('Greg Brockman', 'PERSON')]\n", 346 | "Relations:\n", 347 | " - Greg Brockman [co-founder_of] Altman\n", 348 | "Text: Despite initial talks of reinstatement, the board later confirmed that\n", 349 | "Altman would not be returning.\n", 350 | "Entities: [('Altman', 'ORG')]\n", 351 | "Relations:\n", 352 | "Text: Altman and Brockman subsequently joined Microsoft to head a new advanced\n", 353 | "AI research unit.\n", 354 | "Entities: [('Altman', 'ORG'), ('Brockman', 'ORG'), ('Microsoft', 'ORG'), ('AI',\n", 355 | "'ORG')]\n", 356 | "Relations:\n", 357 | " - Altman [joined] Microsoft\n", 358 | " - Brockman [joined] Microsoft\n", 359 | " - Altman [head] AI\n", 360 | " - Brockman [head] AI\n", 361 | "Text: 2.\n", 362 | "Entities: [('2', 'CARDINAL')]\n", 363 | "Relations:\n", 364 | "Text: Following Altman's departure, the OpenAI board underwent a reshuffle, with\n", 365 | "Mira Murati appointed as interim CEO, only to be later replaced by Emmett Shear,\n", 366 | "the former CEO of Twitch.\n", 367 | "Entities: [('Altman', 'PERSON'), ('OpenAI', 'ORG'), ('Mira Murati', 'PERSON'),\n", 368 | "('Emmett Shear', 'PERSON'), ('Twitch', 'ORG')]\n", 369 | "Relations:\n", 370 | " - Altman [departure_from] OpenAI\n", 371 | " - Mira Murati [interim_CEO_of] OpenAI\n", 372 | " - Emmett Shear [CEO_of] OpenAI\n", 373 | " - Emmett Shear [former_CEO_of] Twitch\n", 374 | "Text: The board's actions were criticized for lack of transparency and\n", 375 | "communication, with the exact reasons for Altman's removal remaining unclear.\n", 376 | "Entities: [('Altman', 'ORG')]\n", 377 | "Relations:\n", 378 | "Text: 3.\n", 379 | "Entities: [('3', 'CARDINAL')]\n", 380 | "Relations:\n", 381 | "Text: The OpenAI board, composed of Ilya Sutskever, Adam D’Angelo, Helen Toner,\n", 382 | "and Tasha McCauley, faced calls for resignation.\n", 383 | "Entities: [('Ilya Sutskever', 'PERSON'), ('Adam D’Angelo', 'PERSON'), ('Helen\n", 384 | "Toner', 'PERSON'), ('Tasha McCauley', 'PERSON')]\n", 
385 | "Relations:\n", 386 | "Text: Potential replacements included Bret Taylor and Will Hurd.\n", 387 | "Entities: [('Bret Taylor', 'PERSON'), ('Will Hurd', 'PERSON')]\n", 388 | "Relations:\n", 389 | " - Bret Taylor [potential_replacements] Will Hurd\n", 390 | "Text: The board's composition was criticized for lack of diversity and deep\n", 391 | "knowledge about responsible use of AI.\n", 392 | "Entities: [('AI', 'ORG')]\n", 393 | "Relations:\n", 394 | "Text: 4.\n", 395 | "Entities: [('4', 'CARDINAL')]\n", 396 | "Relations:\n", 397 | "Text: Altman announced on November 20, 2023, that he would not be returning as\n", 398 | "CEO of OpenAI and would instead join Microsoft to lead a new AI research team.\n", 399 | "Entities: [('Altman', 'ORG'), ('November 20, 2023', 'DATE'), ('OpenAI', 'ORG'),\n", 400 | "('Microsoft', 'ORG'), ('AI', 'ORG')]\n", 401 | "Relations:\n", 402 | " - Altman [lead] AI\n", 403 | "Text: Nearly 500 of OpenAI’s roughly 770 employees threatened to resign unless\n", 404 | "the startup’s board resigned and reappointed Altman.\n", 405 | "Entities: [('Nearly 500', 'CARDINAL'), ('roughly 770', 'CARDINAL'), ('Altman',\n", 406 | "'PERSON')]\n", 407 | "Relations:\n", 408 | " - Nearly 500 [part_of] roughly 770\n", 409 | " - Nearly 500 [threatened_by] Altman\n", 410 | " - roughly 770 [threatened_by] Altman\n", 411 | "Text: 5.\n", 412 | "Entities: [('5', 'CARDINAL')]\n", 413 | "Relations:\n", 414 | "Text: Microsoft has been acquiring top executives and AI engineering talent from\n", 415 | "OpenAI, a generative AI company in which Microsoft holds a minority stake worth\n", 416 | "several billion dollars.\n", 417 | "Entities: [('Microsoft', 'ORG'), ('AI', 'ORG'), ('OpenAI', 'ORG'), ('Microsoft',\n", 418 | "'ORG'), ('several billion dollars', 'MONEY')]\n", 419 | "Relations:\n", 420 | " - Microsoft [has_stake] several billion dollars\n", 421 | "Text: Microsoft's leadership, particularly CEO Satya Nadella, has been\n", 422 | "projecting a 'business as usual' 
message during these upheavals at OpenAI. 6.\n", 423 | "Entities: [('Microsoft', 'ORG'), ('Satya Nadella', 'PERSON'), ('6', 'CARDINAL')]\n", 424 | "Relations:\n", 425 | " - Satya Nadella [works for] Microsoft\n", 426 | "Text: Microsoft successfully navigated through U.K. and EU competition\n", 427 | "authorities to merge with Activision by restructuring the deal and agreeing to\n", 428 | "certain conditions.\n", 429 | "Entities: [('Microsoft', 'ORG'), ('U.K.', 'GPE'), ('EU', 'ORG'), ('Activision',\n", 430 | "'ORG')]\n", 431 | "Relations:\n", 432 | " - Microsoft [merge_with] Activision\n", 433 | "Text: However, its market power in cloud computing and potential influence over\n", 434 | "OpenAI is raising concerns among competition regulators.\n", 435 | "Entities: [('OpenAI', 'ORG')]\n", 436 | "Relations:\n", 437 | "Text: 7. Emmett Shear, the ex-CEO of Twitch, was appointed as the interim CEO of\n", 438 | "OpenAI after Sam Altman was fired and replaced by CTO Mira Murati.\n", 439 | "Entities: [('7', 'CARDINAL'), ('Emmett Shear', 'PERSON'), ('Twitch', 'ORG'),\n", 440 | "('Sam Altman', 'PERSON'), ('Mira Murati', 'PERSON')]\n", 441 | "Relations:\n", 442 | " - Emmett Shear [ex-CEO_of] Twitch\n", 443 | " - Emmett Shear [ordinal] 7\n", 444 | "Text: Shear plans to hire an independent investigator to look into the events\n", 445 | "leading up to his appointment and reform the management and leadership team at\n", 446 | "OpenAI.\n", 447 | "Entities: [('Shear', 'ORG')]\n", 448 | "Relations:\n", 449 | "Text: 8.\n", 450 | "Entities: [('8', 'CARDINAL')]\n", 451 | "Relations:\n", 452 | "Text: OpenAI, an American AI research organization, was founded in December\n", 453 | "2015.\n", 454 | "Entities: [('American', 'NORP'), ('December 2015', 'DATE')]\n", 455 | "Relations:\n", 456 | " - American [founded_on] December 2015\n", 457 | "Text: Microsoft invested $1 billion in OpenAI Global LLC in 2019 and $10 billion\n", 458 | "in 2023.\n", 459 | "Entities: [('Microsoft', 'ORG'), ('$1 
billion', 'MONEY'), ('OpenAI Global LLC',\n", 460 | "'ORG'), ('2019', 'DATE'), ('$10 billion', 'MONEY'), ('2023', 'DATE')]\n", 461 | "Relations:\n", 462 | " - Microsoft [invested_in] $1 billion\n", 463 | " - Microsoft [invested_in] OpenAI Global LLC\n", 464 | " - Microsoft [invested_on] 2019\n", 465 | " - Microsoft [invested_in] $10 billion\n", 466 | " - Microsoft [invested_on] 2023\n", 467 | "Text: On November 17, 2023, Sam Altman was removed as CEO and Greg Brockman was\n", 468 | "removed as chairman of OpenAI.\n", 469 | "Entities: [('November 17, 2023', 'DATE'), ('Sam Altman', 'PERSON'), ('Greg\n", 470 | "Brockman', 'PERSON'), ('OpenAI', 'ORG')]\n", 471 | "Relations:\n", 472 | " - Greg Brockman [role_in] OpenAI\n", 473 | "Text: Both returned four days later after negotiations with the board.\n", 474 | "Entities: [('four days later', 'DATE')]\n", 475 | "Relations:\n", 476 | "Text: 9.\n", 477 | "Entities: [('9', 'CARDINAL')]\n", 478 | "Relations:\n", 479 | "Text: Mira Murati, born in Albania in 1988, is a technology executive who has\n", 480 | "worked at Tesla, Leap Motion, and OpenAI.\n", 481 | "Entities: [('Mira Murati', 'PERSON'), ('Albania', 'GPE'), ('1988', 'DATE'),\n", 482 | "('Tesla', 'ORG'), ('Leap Motion', 'ORG')]\n", 483 | "Relations:\n", 484 | " - Mira Murati [born_in] Albania\n", 485 | " - Mira Murati [born_on] 1988\n", 486 | " - Mira Murati [worked_at] Tesla\n", 487 | " - Mira Murati [worked_at] Leap Motion\n", 488 | "Text: She served as the CTO of OpenAI from 2018, leading projects like ChatGPT,\n", 489 | "Dall-E, and Codex.\n", 490 | "Entities: [('OpenAI', 'ORG'), ('2018', 'DATE'), ('Dall', 'PERSON'), ('Codex',\n", 491 | "'ORG')]\n", 492 | "Relations:\n", 493 | " - OpenAI [time] 2018\n", 494 | " - Dall [affiliation] OpenAI\n", 495 | " - Dall [time] 2018\n", 496 | "Text: She briefly served as interim CEO of OpenAI in November 2023 after Sam\n", 497 | "Altman was removed, but was replaced by Emmett Shear after three days.\n", 498 | "Entities: 
[('OpenAI', 'ORG'), ('November 2023', 'DATE'), ('Sam Altman',\n", 499 | "'PERSON'), ('Emmett Shear', 'PERSON'), ('three days', 'DATE')]\n", 500 | "Relations:\n", 501 | " - OpenAI [time] November 2023\n", 502 | " - Sam Altman [role] OpenAI\n", 503 | " - Emmett Shear [role] OpenAI\n", 504 | " - Emmett Shear [time] three days\n", 505 | "Text: 10.\n", 506 | "Entities: [('10', 'CARDINAL')]\n", 507 | "Relations:\n", 508 | "Text: Sam Altman, born in 1985, is a technology entrepreneur who co-founded\n", 509 | "Loopt and Hydrazine Capital, and served as a partner and president at Y\n", 510 | "Combinator.\n", 511 | "Entities: [('Sam Altman', 'PERSON'), ('1985', 'DATE'), ('Loopt and Hydrazine\n", 512 | "Capital', 'ORG'), ('Y Combinator', 'ORG')]\n", 513 | "Relations:\n", 514 | " - Sam Altman [born_on] 1985\n", 515 | " - Sam Altman [co-founded] Loopt and Hydrazine Capital\n", 516 | " - Sam Altman [worked_at] Y Combinator\n", 517 | "Text: He was the CEO of OpenAI from 2019 until his removal in November 2023, but\n", 518 | "was reinstated five days later.\n", 519 | "Entities: [('OpenAI', 'ORG'), ('2019', 'DATE'), ('November 2023', 'DATE'),\n", 520 | "('five days later', 'DATE')]\n", 521 | "Relations:\n", 522 | " - OpenAI [start_date] 2019\n", 523 | " - OpenAI [end_date] November 2023\n", 524 | " - OpenAI [reinstatement_date] five days later\n", 525 | "Text: He also co-founded Tools For Humanity and raised $1 billion for OpenAI\n", 526 | "from Microsoft.\n", 527 | "Entities: [('Tools For Humanity', 'ORG'), ('$1 billion', 'MONEY'), ('Microsoft',\n", 528 | "'ORG')]\n", 529 | "Relations:\n", 530 | " - Tools For Humanity [raised_money_for] $1 billion\n", 531 | " - $1 billion [invested_by] Microsoft\n", 532 | "Entity counts: Counter({'ORG': 39, 'PERSON': 24, 'DATE': 15, 'CARDINAL': 13,\n", 533 | "'MONEY': 4, 'GPE': 2, 'NORP': 1})\n", 534 | "Relation counts: Counter({'time': 4, 'invested_in': 3, 'worked_at': 3, 'joined':\n", 535 | "2, 'head': 2, 'threatened_by': 2, 'invested_on': 2, 
# %%
import os
import json
import spacy
from collections import Counter
from pathlib import Path
from wasabi import msg
from spacy_llm.util import assemble


def split_document_sent(text):
    """Split *text* into sentences with the classic spaCy pipeline.

    Loads the ``en_core_web_sm`` model, runs it over *text* and returns the
    stripped text of every sentence spaCy detects, in document order.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]


def process_text(nlp, text, verbose=False):
    """Run the spacy-llm pipeline *nlp* over *text* and return the ``Doc``.

    With ``verbose=True`` the text, its entities and the extracted relations
    are printed through ``wasabi``. Each relation in ``doc._.rel`` refers to
    its two entities by index into ``doc.ents`` (``dep`` and ``dest``).
    """
    doc = nlp(text)
    if verbose:
        msg.text(f"Text: {doc.text}")
        msg.text(f"Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")
        msg.text("Relations:")
        for r in doc._.rel:
            msg.text(f" - {doc.ents[r.dep]} [{r.relation}] {doc.ents[r.dest]}")
    return doc


def run_pipeline(config_path, examples_path=None, verbose=False, text=None):
    """Extract entities and relations sentence by sentence and export them.

    Args:
        config_path: Path of the spacy-llm config handed to ``assemble``.
        examples_path: Optional few-shot examples file; when given it
            overrides ``paths.examples`` in the config.
        verbose: Print every processed sentence with its entities/relations.
        text: Text to process. When omitted, the notebook-level
            ``summarization_results`` is used, as before.

    Returns:
        The list of per-sentence records (``text``, ``entities``,
        ``relations``) that is also written to ``processed_data.json``.

    Exits with status 1 (via ``msg.fail``) when ``OPENAI_API_KEY`` is unset.
    """
    if not os.getenv("OPENAI_API_KEY"):
        msg.fail("OPENAI_API_KEY env variable was not found. Set it and try again.", exits=1)

    overrides = {} if examples_path is None else {"paths.examples": str(examples_path)}
    nlp = assemble(config_path, overrides=overrides)

    if text is None:
        # Backward-compatible default: the summary built earlier in the notebook.
        text = summarization_results

    processed_data = []
    entity_counts = Counter()
    relation_counts = Counter()

    for sent in split_document_sent(text):
        doc = process_text(nlp, sent, verbose)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        relations = [
            (doc.ents[r.dep].text, r.relation, doc.ents[r.dest].text)
            for r in doc._.rel
        ]

        processed_data.append({'text': doc.text, 'entities': entities, 'relations': relations})

        # Count entity labels and relation names for the final summary.
        entity_counts.update(label for _, label in entities)
        relation_counts.update(relation for _, relation, _ in relations)

    # Export to JSON
    with open('processed_data.json', 'w', encoding='utf-8') as f:
        json.dump(processed_data, f)

    # Display summary
    msg.text(f"Entity counts: {entity_counts}")
    msg.text(f"Relation counts: {relation_counts}")

    # Hand the records back so the caller's binding holds the data, not None.
    return processed_data


# Set your configuration paths and flags
config_path = Path("zeroshot.cfg")
examples_path = None  # set to a file path to enable few-shot examples
verbose = True

# Run the pipeline
file = run_pipeline(config_path, examples_path, verbose)
"file_extension": ".py", 628 | "mimetype": "text/x-python", 629 | "name": "python", 630 | "nbconvert_exporter": "python", 631 | "pygments_lexer": "ipython3", 632 | "version": "3.10.12" 633 | } 634 | }, 635 | "nbformat": 4, 636 | "nbformat_minor": 5 637 | } 638 | --------------------------------------------------------------------------------