├── gemini-multimodal ├── README.md ├── resources │ ├── rca001.png │ ├── movie-er.jpg │ └── laptop-wireless-network-issue.gif └── gemini-vision-er-graph-demo.ipynb ├── openai+llmsherpa ├── genai-stack │ ├── images │ │ ├── readme.md │ │ └── qna-logo.png │ ├── readme.md │ ├── cs_bot_papers.py │ └── chains.py ├── requirements.txt ├── readme.md ├── KGEmbedding_Populate.ipynb └── LayoutPDFReader_KGLoader.ipynb ├── unstructured-io ├── exampple-docs │ ├── readme.md │ └── layout-parser-paper.pdf ├── readme.md └── Unstructured-IO_PDF_KGLoader.ipynb ├── README.md ├── openai+llamaparse ├── readme.md ├── table_node_property.png ├── text_node_property.png ├── document_graph_schema.png ├── query_document_visualization.png ├── InjuredWorkerGuidebookCalifornia.pdf └── demo_neo4j_vectordb.ipynb └── local-genai └── nvidia-tensorrt └── readme.md /gemini-multimodal/README.md: -------------------------------------------------------------------------------- 1 | # Google Gemini Multi-Modal Model 2 | -------------------------------------------------------------------------------- /openai+llmsherpa/genai-stack/images/readme.md: -------------------------------------------------------------------------------- 1 | ## Application icon 2 | -------------------------------------------------------------------------------- /unstructured-io/exampple-docs/readme.md: -------------------------------------------------------------------------------- 1 | ## Sample documents used in testing 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # graph-rag 2 | Graph based retrieval + GenAI = Better RAG in production 3 | -------------------------------------------------------------------------------- /openai+llmsherpa/requirements.txt: -------------------------------------------------------------------------------- 1 | llmsherpa==0.1.3 2 | openai==1.8.0 3 | neo4j==5.16.0 -------------------------------------------------------------------------------- /unstructured-io/readme.md: -------------------------------------------------------------------------------- 1 | ## Code for building ingestion pipeline using unstructured-io 2 | 3 | -------------------------------------------------------------------------------- /openai+llamaparse/readme.md: -------------------------------------------------------------------------------- 1 | ## Experiments on Llama Parse for PDF document processing with OpenAI APIs 2 | 3 | -------------------------------------------------------------------------------- /openai+llmsherpa/genai-stack/readme.md: -------------------------------------------------------------------------------- 1 | ## GenAI-Stack by Neo4j 2 | 3 | Files to update / add to the default project. 
4 | -------------------------------------------------------------------------------- /gemini-multimodal/resources/rca001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/gemini-multimodal/resources/rca001.png -------------------------------------------------------------------------------- /gemini-multimodal/resources/movie-er.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/gemini-multimodal/resources/movie-er.jpg -------------------------------------------------------------------------------- /openai+llamaparse/table_node_property.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/openai+llamaparse/table_node_property.png -------------------------------------------------------------------------------- /openai+llamaparse/text_node_property.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/openai+llamaparse/text_node_property.png -------------------------------------------------------------------------------- /local-genai/nvidia-tensorrt/readme.md: -------------------------------------------------------------------------------- 1 | ## Running GenAI applications on Windows desktop with NVIDIA graphical card powered by TensorRT-LLM 2 | -------------------------------------------------------------------------------- /openai+llamaparse/document_graph_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/openai+llamaparse/document_graph_schema.png -------------------------------------------------------------------------------- /openai+llmsherpa/genai-stack/images/qna-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/openai+llmsherpa/genai-stack/images/qna-logo.png -------------------------------------------------------------------------------- /openai+llamaparse/query_document_visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/openai+llamaparse/query_document_visualization.png -------------------------------------------------------------------------------- /openai+llamaparse/InjuredWorkerGuidebookCalifornia.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/openai+llamaparse/InjuredWorkerGuidebookCalifornia.pdf -------------------------------------------------------------------------------- /unstructured-io/exampple-docs/layout-parser-paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/unstructured-io/exampple-docs/layout-parser-paper.pdf -------------------------------------------------------------------------------- /gemini-multimodal/resources/laptop-wireless-network-issue.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/gemini-multimodal/resources/laptop-wireless-network-issue.gif 
-------------------------------------------------------------------------------- /openai+llmsherpa/readme.md: -------------------------------------------------------------------------------- 1 | ## RAG from PDF documents 2 | 3 | LLM: OpenAI 4 | PDF Parser: LLMSherpa 5 | GenAI framework: genai-stack, including LangChain 6 | Knowledge storage: Neo4j AuraDB 7 | 8 | -------------------------------------------------------------------------------- /openai+llmsherpa/KGEmbedding_Populate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from neo4j import GraphDatabase\n", 10 | "#from openai.embeddings_utils import get_embedding\n", 11 | "from openai import OpenAI\n", 12 | "\n", 13 | "\"\"\"\n", 14 | "LoadEmbedding: call OpenAI embedding API to generate embeddings for each property of node in Neo4j\n", 15 | "Version: 1.1\n", 16 | "\"\"\"\n", 17 | "OPENAI_KEY = \"OPENAI_API_KEY\"\n", 18 | "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", 19 | "NEO4J_URL = \"bolt://localhost:7687\"\n", 20 | "NEO4J_USER = \"neo4j\"\n", 21 | "NEO4J_PASSWORD = \"NEO4J_PASSWORD\"\n", 22 | "NEO4J_DATABASE = \"NEO4J_DATABASE\"\n", 23 | "\n", 24 | "def get_embedding(client, text, model):\n", 25 | " response = client.embeddings.create(\n", 26 | " input=text,\n", 27 | " model=model,\n", 28 | " )\n", 29 | " return response.data[0].embedding\n", 30 | "\n", 31 | "def LoadEmbedding(label, property):\n", 32 | " driver = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD), database=NEO4J_DATABASE)\n", 33 | " openai_client = OpenAI (api_key = OPENAI_KEY)\n", 34 | "\n", 35 | " with driver.session() as session:\n", 36 | " # get chunks in document, together with their section titles\n", 37 | " result = session.run(f\"MATCH (ch:{label}) -[:HAS_PARENT]-> (s:Section) RETURN id(ch) AS id, s.title + ' >> ' + ch.{property} AS text\")\n", 38 | " # call OpenAI embedding API to generate embeddings for each proporty of node\n", 39 | " # for each node, update the embedding property\n", 40 | " count = 0\n", 41 | " for record in result:\n", 42 | " id = record[\"id\"]\n", 43 | " text = record[\"text\"]\n", 44 | " \n", 45 | " # For better performance, text can be batched\n", 46 | " embedding = get_embedding(openai_client, text, EMBEDDING_MODEL)\n", 47 | " \n", 48 | " # key property of Embedding node differentiates different embeddings\n", 49 | " cypher = \"CREATE (e:Embedding) SET e.key=$key, e.value=$embedding\"\n", 50 | " cypher = cypher + \" WITH e MATCH (n) WHERE id(n) = $id CREATE (n) -[:HAS_EMBEDDING]-> (e)\"\n", 51 | " session.run(cypher,key=property, embedding=embedding, id=id )\n", 52 | " count = count + 1\n", 53 | "\n", 54 | " session.close()\n", 55 | " \n", 56 | " print(\"Processed \" + str(count) + \" \" + label + \" nodes for property @\" + property + \".\")\n", 57 | " return count\n", 58 | "\n", 59 | "\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "\n", 69 | "LoadEmbedding(\"Chunk\", \"sentences\")\n", 70 | "\n", 71 | "LoadEmbedding(\"Table\", \"name\")\n", 72 | "\n" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | 
"mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.11.6" 93 | }, 94 | "orig_nbformat": 4 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 2 98 | } 99 | -------------------------------------------------------------------------------- /openai+llmsherpa/genai-stack/cs_bot_papers.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import streamlit as st 4 | from streamlit.logger import get_logger 5 | from langchain.callbacks.base import BaseCallbackHandler 6 | from langchain.prompts.chat import ( 7 | ChatPromptTemplate, 8 | SystemMessagePromptTemplate, 9 | HumanMessagePromptTemplate, 10 | ) 11 | from langchain.graphs import Neo4jGraph 12 | from dotenv import load_dotenv 13 | from utils import ( 14 | extract_title_and_question, 15 | create_vector_index, 16 | ) 17 | from chains import ( 18 | load_embedding_model, 19 | load_llm, 20 | configure_llm_only_chain, 21 | configure_qa_rag_chain, 22 | configure_qa_structure_rag_chain, 23 | ) 24 | 25 | 26 | # >>>> initialise - environemnt <<<< 27 | 28 | load_dotenv(".env") 29 | 30 | url = os.getenv("NEO4J_URI") 31 | username = os.getenv("NEO4J_USERNAME") 32 | password = os.getenv("NEO4J_PASSWORD") 33 | database = os.getenv("NEO4J_DATABASE") 34 | ollama_base_url = os.getenv("OLLAMA_BASE_URL") 35 | embedding_model_name = os.getenv("EMBEDDING_MODEL") 36 | llm_name = os.getenv("LLM") 37 | # Remapping for Langchain Neo4j integration 38 | # os.environ["NEO4J_URL"] = url 39 | 40 | 41 | # >>>> initialise - services <<<< 42 | 43 | logger = get_logger(__name__) 44 | 45 | neo4j_graph = Neo4jGraph(url=url, username=username, password=password, database=database) 46 | 47 | embeddings, dimension = load_embedding_model( 48 | embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger 49 | ) 50 | 51 | llm = load_llm(llm_name, logger=logger, config={"ollama_base_url": ollama_base_url}) 52 | 53 | 54 | # llm_chain: LLM only response 55 | llm_chain = configure_llm_only_chain(llm) 56 | 57 | # rag_chain: KG augmented response 58 | rag_chain = configure_qa_structure_rag_chain( 59 | llm, embeddings, embeddings_store_url=url, username=username, password=password 60 | ) 61 | 62 | # SKIPPED: create_vector_index(neo4j_graph, dimension) 63 | 64 | # >>>> Class definition - StreamHander <<<< 65 | 66 | class StreamHandler(BaseCallbackHandler): 67 | def __init__(self, container, initial_text=""): 68 | self.container = container 69 | self.text = initial_text 70 | 71 | def on_llm_new_token(self, token: str, **kwargs) -> None: 72 | self.text += token 73 | self.container.markdown(self.text) 74 | 75 | # >>>> Streamlit UI <<<< 76 | 77 | styl = f""" 78 | 95 | """ 96 | st.markdown(styl, unsafe_allow_html=True) 97 | st.image("images/qna-logo.png", width=160) 98 | 99 | # >>>> UI interations <<<< 100 | 101 | def chat_input(): 102 | user_input = st.chat_input("What service questions can I help you resolve today?") 103 | 104 | if user_input: 105 | with st.chat_message("user"): 106 | st.write(user_input) 107 | with st.chat_message("assistant"): 108 | st.caption(f"RAG: {name}") 109 | stream_handler = StreamHandler(st.empty()) 110 | 111 | # Call chain to generate answers 112 | result = output_function( 113 | {"question": user_input, "chat_history": []}, callbacks=[stream_handler] 114 | )["answer"] 115 | 116 | output = result 117 | 118 | st.session_state[f"user_input"].append(user_input) 119 | st.session_state[f"generated"].append(output) 
120 | st.session_state[f"rag_mode"].append(name) 121 | 122 | 123 | def display_chat(): 124 | # Session state 125 | if "generated" not in st.session_state: 126 | st.session_state[f"generated"] = [] 127 | 128 | if "user_input" not in st.session_state: 129 | st.session_state[f"user_input"] = [] 130 | 131 | if "rag_mode" not in st.session_state: 132 | st.session_state[f"rag_mode"] = [] 133 | 134 | if st.session_state[f"generated"]: 135 | size = len(st.session_state[f"generated"]) 136 | # Display only the last three exchanges 137 | for i in range(max(size - 3, 0), size): 138 | with st.chat_message("user"): 139 | st.write(st.session_state[f"user_input"][i]) 140 | 141 | with st.chat_message("assistant"): 142 | st.caption(f"RAG: {st.session_state[f'rag_mode'][i]}") 143 | st.write(st.session_state[f"generated"][i]) 144 | 145 | with st.expander("Not finding what you're looking for?"): 146 | st.write( 147 | "Automatically generate a draft for an internal ticket to our support team." 148 | ) 149 | st.button( 150 | "Generate ticket", 151 | type="primary", 152 | key="show_ticket", 153 | on_click=open_sidebar, 154 | ) 155 | with st.container(): 156 | st.write(" ") 157 | 158 | 159 | def mode_select() -> str: 160 | options = ["Disabled", "Enabled"] 161 | return st.radio("Select RAG mode", options, horizontal=True) 162 | 163 | # >>>>> switch on/off RAG mode 164 | 165 | name = mode_select() 166 | if name == "LLM only" or name == "Disabled": 167 | output_function = llm_chain 168 | elif name == "Vector + Graph" or name == "Enabled": 169 | output_function = rag_chain 170 | 171 | 172 | def generate_ticket(): 173 | # Get high ranked questions 174 | records = neo4j_graph.query( 175 | "MATCH (q:Question) RETURN q.title AS title, q.body AS body ORDER BY q.score DESC LIMIT 3" 176 | ) 177 | questions = [] 178 | for i, question in enumerate(records, start=1): 179 | questions.append((question["title"], question["body"])) 180 | # Ask LLM to generate new question in the same style 181 | questions_prompt = "" 182 | for i, question in enumerate(questions, start=1): 183 | questions_prompt += f"{i}. {question[0]}\n" 184 | questions_prompt += f"{question[1]}\n\n" 185 | questions_prompt += "----\n\n" 186 | 187 | gen_system_template = f""" 188 | You're an expert in formulating high quality questions. 189 | Can you formulate a question in the same style, detail and tone as the following example questions? 190 | {questions_prompt} 191 | --- 192 | 193 | Don't make anything up, only use information in the following question. 194 | Return a title for the question, and the question post itself. 195 | 196 | Return example: 197 | --- 198 | Title: How do I use the Neo4j Python driver? 199 | Question: I'm trying to connect to Neo4j using the Python driver, but I'm getting an error. 200 | --- 201 | """ 202 | # we need jinja2 since the questions themselves contain curly braces 203 | system_prompt = SystemMessagePromptTemplate.from_template( 204 | gen_system_template, template_format="jinja2" 205 | ) 206 | q_prompt = st.session_state[f"user_input"][-1] 207 | chat_prompt = ChatPromptTemplate.from_messages( 208 | [ 209 | system_prompt, 210 | SystemMessagePromptTemplate.from_template( 211 | """ 212 | Respond in the following format or you will be unplugged. 
213 | --- 214 | Title: New title 215 | Question: New question 216 | --- 217 | """ 218 | ), 219 | HumanMessagePromptTemplate.from_template("{text}"), 220 | ] 221 | ) 222 | llm_response = llm_chain( 223 | f"Here's the question to rewrite in the expected format: ```{q_prompt}```", 224 | [], 225 | chat_prompt, 226 | ) 227 | new_title, new_question = extract_title_and_question(llm_response["answer"]) 228 | return (new_title, new_question) 229 | 230 | 231 | def open_sidebar(): 232 | st.session_state.open_sidebar = True 233 | 234 | 235 | def close_sidebar(): 236 | st.session_state.open_sidebar = False 237 | 238 | 239 | if not "open_sidebar" in st.session_state: 240 | st.session_state.open_sidebar = False 241 | if st.session_state.open_sidebar: 242 | new_title, new_question = generate_ticket() 243 | with st.sidebar: 244 | st.title("Ticket draft") 245 | st.write("Auto generated draft ticket") 246 | st.text_input("Title", new_title) 247 | st.text_area("Description", new_question) 248 | st.button( 249 | "Submit to support team", 250 | type="primary", 251 | key="submit_ticket", 252 | on_click=close_sidebar, 253 | ) 254 | 255 | # >>>> UI: show chat <<<< 256 | display_chat() 257 | chat_input() 258 | 259 | 260 | -------------------------------------------------------------------------------- /openai+llmsherpa/genai-stack/chains.py: -------------------------------------------------------------------------------- 1 | from langchain.embeddings.openai import OpenAIEmbeddings 2 | from langchain.embeddings import OllamaEmbeddings, SentenceTransformerEmbeddings 3 | from langchain.chat_models import ChatOpenAI, ChatOllama 4 | from langchain.vectorstores.neo4j_vector import Neo4jVector 5 | from langchain.chains import RetrievalQAWithSourcesChain 6 | from langchain.chains.qa_with_sources import load_qa_with_sources_chain 7 | from langchain.prompts.chat import ( 8 | ChatPromptTemplate, 9 | SystemMessagePromptTemplate, 10 | HumanMessagePromptTemplate, 11 | ) 12 | from typing import List, Any 13 | from utils import BaseLogger 14 | from langchain.chains import GraphCypherQAChain 15 | 16 | def load_embedding_model(embedding_model_name: str, logger=BaseLogger(), config={}): 17 | if embedding_model_name == "ollama": 18 | embeddings = OllamaEmbeddings( 19 | base_url=config["ollama_base_url"], model="llama2" 20 | ) 21 | dimension = 4096 22 | logger.info("Embedding: Using Ollama") 23 | elif embedding_model_name == "openai": 24 | embeddings = OpenAIEmbeddings() 25 | dimension = 1536 26 | logger.info("Embedding: Using OpenAI") 27 | else: 28 | embeddings = SentenceTransformerEmbeddings( 29 | model_name="all-MiniLM-L6-v2", cache_folder="/embedding_model" 30 | ) 31 | dimension = 384 32 | logger.info("Embedding: Using SentenceTransformer") 33 | return embeddings, dimension 34 | 35 | 36 | def load_llm(llm_name: str, logger=BaseLogger(), config={}): 37 | if llm_name == "gpt-4": 38 | logger.info("LLM: Using GPT-4") 39 | return ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True) 40 | elif llm_name == "gpt-3.5": 41 | logger.info("LLM: Using GPT-3.5") 42 | return ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True) 43 | elif len(llm_name): 44 | logger.info(f"LLM: Using Ollama: {llm_name}") 45 | return ChatOllama( 46 | temperature=0, 47 | base_url=config["ollama_base_url"], 48 | model=llm_name, 49 | streaming=True, 50 | top_k=10, # A higher value (100) will give more diverse answers, while a lower value (10) will be more conservative. 
51 | top_p=0.3, # Higher value (0.95) will lead to more diverse text, while a lower value (0.5) will generate more focused text. 52 | num_ctx=3072, # Sets the size of the context window used to generate the next token. 53 | ) 54 | logger.info("LLM: Using GPT-3.5") 55 | return ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True) 56 | 57 | 58 | def configure_llm_only_chain(llm): 59 | # LLM only response 60 | template = """ 61 | You are a helpful assistant that helps with answering general questions. 62 | If you don't know the answer, just say that you don't know, don't try to make up an answer. 63 | """ 64 | system_message_prompt = SystemMessagePromptTemplate.from_template(template) 65 | human_template = "{text}" 66 | human_message_prompt = HumanMessagePromptTemplate.from_template(human_template) 67 | chat_prompt = ChatPromptTemplate.from_messages( 68 | [system_message_prompt, human_message_prompt] 69 | ) 70 | 71 | def generate_llm_output( 72 | user_input: str, callbacks: List[Any], prompt=chat_prompt 73 | ) -> str: 74 | answer = llm( 75 | prompt.format_prompt( 76 | text=user_input, 77 | ).to_messages(), 78 | callbacks=callbacks, 79 | ).content 80 | return {"answer": answer} 81 | 82 | return generate_llm_output 83 | 84 | 85 | def configure_qa_rag_chain(llm, embeddings, embeddings_store_url, username, password): 86 | # RAG response 87 | general_system_template = """ 88 | Use the following pieces of context to answer the question at the end. 89 | The context contains question-answer pairs and their links from Stackoverflow. 90 | You should prefer information from accepted or more upvoted answers. 91 | Make sure to rely on information from the answers and not on questions to provide accuate responses. 92 | When you find particular answer in the context useful, make sure to cite it in the answer using the link. 93 | If you don't know the answer, just say that you don't know, don't try to make up an answer. 94 | ---- 95 | {summaries} 96 | ---- 97 | Each answer you generate should contain a section at the end of links to 98 | Stackoverflow questions and answers you found useful, which are described under Source value. 99 | You can only use links to StackOverflow questions that are present in the context and always 100 | add links to the end of the answer in the style of citations. 101 | Generate concise answers with references sources section of links to 102 | relevant StackOverflow questions only at the end of the answer. 
103 | """ 104 | general_user_template = "Question:```{question}```" 105 | messages = [ 106 | SystemMessagePromptTemplate.from_template(general_system_template), 107 | HumanMessagePromptTemplate.from_template(general_user_template), 108 | ] 109 | qa_prompt = ChatPromptTemplate.from_messages(messages) 110 | 111 | qa_chain = load_qa_with_sources_chain( 112 | llm, 113 | chain_type="stuff", 114 | prompt=qa_prompt, 115 | ) 116 | 117 | # Vector + Knowledge Graph response 118 | kg = Neo4jVector.from_existing_index( 119 | embedding=embeddings, 120 | url=embeddings_store_url, 121 | username=username, 122 | password=password, 123 | database='neo4j', # neo4j by default 124 | index_name="stackoverflow", # vector by default 125 | text_node_property="body", # text by default 126 | retrieval_query=""" 127 | WITH node AS question, score AS similarity 128 | CALL { with question 129 | MATCH (question)<-[:ANSWERS]-(answer) 130 | WITH answer 131 | ORDER BY answer.is_accepted DESC, answer.score DESC 132 | WITH collect(answer)[..2] as answers 133 | RETURN reduce(str='', answer IN answers | str + 134 | '\n### Answer (Accepted: '+ answer.is_accepted + 135 | ' Score: ' + answer.score+ '): '+ answer.body + '\n') as answerTexts 136 | } 137 | RETURN '##Question: ' + question.title + '\n' + question.body + '\n' 138 | + answerTexts AS text, similarity as score, {source: question.link} AS metadata 139 | ORDER BY similarity ASC // so that best answers are the last 140 | """, 141 | ) 142 | 143 | kg_qa = RetrievalQAWithSourcesChain( 144 | combine_documents_chain=qa_chain, 145 | retriever=kg.as_retriever(search_kwargs={"k": 2}), 146 | reduce_k_below_max_tokens=False, 147 | max_tokens_limit=3375, 148 | ) 149 | return kg_qa 150 | 151 | # ADDED 152 | # >>>> Extended to support vector search over strucutured chunking 153 | 154 | def configure_qa_structure_rag_chain(llm, embeddings, embeddings_store_url, username, password): 155 | # RAG response based on vector search and retrieval of structured chunks 156 | 157 | sample_query = """ 158 | // 0 - prepare question and its embedding 159 | MATCH (ch:Chunk) -[:HAS_EMBEDDING]-> (chemb) 160 | WHERE ch.block_idx = 19 161 | WITH ch.sentences AS question, chemb.value AS qemb 162 | // 1 - search chunk vectors 163 | CALL db.index.vector.queryNodes($index_name, $k, qemb) YIELD node, score 164 | // 2 - retrieve connectd chunks, sections and documents 165 | WITH node AS answerEmb, score 166 | MATCH (answerEmb) <-[:HAS_EMBEDDING]- (answer) -[:HAS_PARENT*]-> (s:Section) 167 | WITH s, score LIMIT 1 168 | MATCH (d:Document) <-[*]- (s) <-[:HAS_PARENT*]- (chunk:Chunk) 169 | WITH d, s, chunk, score ORDER BY chunk.block_idx ASC 170 | // 3 - prepare results 171 | WITH d, collect(chunk) AS chunks, score 172 | RETURN {source: d.url, page: chunks[0].page_idx} AS metadata, 173 | reduce(text = "", x IN chunks | text + x.sentences + '.') AS text, score; 174 | """ 175 | 176 | general_system_template = """ 177 | You are a customer service agent that helps a customer with answering questions about a service. 178 | Use the following context to answer the question at the end. 179 | Make sure not to make any changes to the context if possible when prepare answers so as to provide accuate responses. 180 | If you don't know the answer, just say that you don't know, don't try to make up an answer. 181 | ---- 182 | {summaries} 183 | ---- 184 | At the end of each answer you should contain metadata for relevant document in the form of (source, page). 
185 | For example, if context has `metadata`:(source:'docu_url', page:1), you should display ('doc_url', 1). 186 | """ 187 | general_user_template = "Question:```{question}```" 188 | messages = [ 189 | SystemMessagePromptTemplate.from_template(general_system_template), 190 | HumanMessagePromptTemplate.from_template(general_user_template), 191 | ] 192 | qa_prompt = ChatPromptTemplate.from_messages(messages) 193 | 194 | qa_chain = load_qa_with_sources_chain( 195 | llm, 196 | chain_type="stuff", 197 | prompt=qa_prompt, 198 | ) 199 | 200 | # Vector + Knowledge Graph response 201 | kg = Neo4jVector.from_existing_index( 202 | embedding=embeddings, 203 | url=embeddings_store_url, 204 | username=username, 205 | password=password, 206 | database='bravo', # neo4j by default 207 | index_name="chunkVectorIndex", # vector by default 208 | node_label="Embedding", # embedding node label 209 | embedding_node_property="value", # embedding value property 210 | text_node_property="sentences", # text by default 211 | retrieval_query=""" 212 | WITH node AS answerEmb, score 213 | ORDER BY score DESC LIMIT 10 214 | MATCH (answerEmb) <-[:HAS_EMBEDDING]- (answer) -[:HAS_PARENT*]-> (s:Section) 215 | WITH s, answer, score 216 | MATCH (d:Document) <-[*]- (s) <-[:HAS_PARENT*]- (chunk:Chunk) 217 | WITH d, s, answer, chunk, score ORDER BY d.url_hash, s.title, chunk.block_idx ASC 218 | // 3 - prepare results 219 | WITH d, s, collect(answer) AS answers, collect(chunk) AS chunks, max(score) AS maxScore 220 | RETURN {source: d.url, page: chunks[0].page_idx+1, matched_chunk_id: id(answers[0])} AS metadata, 221 | reduce(text = "", x IN chunks | text + x.sentences + '.') AS text, maxScore AS score LIMIT 3; 222 | """, 223 | ) 224 | 225 | kg_qa = RetrievalQAWithSourcesChain( 226 | combine_documents_chain=qa_chain, 227 | retriever=kg.as_retriever(search_kwargs={"k": 25}), 228 | reduce_k_below_max_tokens=False, 229 | max_tokens_limit=7000, # gpt-4 230 | ) 231 | return kg_qa 232 | -------------------------------------------------------------------------------- /openai+llmsherpa/LayoutPDFReader_KGLoader.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#\n", 10 | "# This is the demo of:\n", 11 | "# - using LayoutPDFReader to read PDF files\n", 12 | "# - mapping PDF elements into a property graph\n", 13 | "# - saving PDF elements into Neo4j\n", 14 | "#\n", 15 | "\n", 16 | "from llmsherpa.readers import LayoutPDFReader\n", 17 | "\n", 18 | "llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n", 19 | "\n", 20 | "file_location = '/Users/Shared/resources/papers'\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from neo4j import GraphDatabase\n", 30 | "import uuid\n", 31 | "import hashlib\n", 32 | "\n", 33 | "# Please change the following variables to your own Neo4j instance\n", 34 | "NEO4J_URL = \"neo4j+s://.databases.neo4j.io\"\n", 35 | "NEO4J_USER = \"neo4j\"\n", 36 | "NEO4J_PASSWORD = \"\"\n", 37 | "NEO4J_DATABASE = \"neo4j\"\n", 38 | "\n", 39 | "\n", 40 | "def initialiseNeo4j():\n", 41 | " cypher_schema = [\n", 42 | " \"CREATE CONSTRAINT sectionKey IF NOT EXISTS FOR (c:Section) REQUIRE (c.key) IS UNIQUE;\",\n", 43 | " \"CREATE CONSTRAINT chunkKey IF NOT EXISTS FOR (c:Chunk) REQUIRE (c.key) IS UNIQUE;\",\n", 44 | " 
\"CREATE CONSTRAINT documentKey IF NOT EXISTS FOR (c:Document) REQUIRE (c.url_hash) IS UNIQUE;\",\n", 45 | " \"CREATE CONSTRAINT tableKey IF NOT EXISTS FOR (c:Table) REQUIRE (c.key) IS UNIQUE;\",\n", 46 | " \"CALL db.index.vector.createNodeIndex('chunkVectorIndex', 'Embedding', 'value', 1536, 'COSINE');\"\n", 47 | " ]\n", 48 | "\n", 49 | " driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))\n", 50 | "\n", 51 | " with driver.session() as session:\n", 52 | " for cypher in cypher_schema:\n", 53 | " session.run(cypher)\n", 54 | " driver.close()\n", 55 | " \n", 56 | "\n", 57 | "def ingestDocumentNeo4j(doc, doc_location):\n", 58 | "\n", 59 | "\n", 60 | " cypher_pool = [\n", 61 | " # 0 - Document\n", 62 | " \"MERGE (d:Document {url_hash: $doc_url_hash_val}) ON CREATE SET d.url = $doc_url_val RETURN d;\", \n", 63 | " # 1 - Section\n", 64 | " \"MERGE (p:Section {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$title_hash_val}) ON CREATE SET p.page_idx = $page_idx_val, p.title_hash = $title_hash_val, p.block_idx = $block_idx_val, p.title = $title_val, p.tag = $tag_val, p.level = $level_val RETURN p;\",\n", 65 | " # 2 - Link Section with the Document\n", 66 | " \"MATCH (d:Document {url_hash: $doc_url_hash_val}) MATCH (s:Section {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$title_hash_val}) MERGE (d)<-[:HAS_DOCUMENT]-(s);\",\n", 67 | " # 3 - Link Section with a parent section\n", 68 | " \"MATCH (s1:Section {key: $doc_url_hash_val+'|'+$parent_block_idx_val+'|'+$parent_title_hash_val}) MATCH (s2:Section {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$title_hash_val}) MERGE (s1)<-[:UNDER_SECTION]-(s2);\",\n", 69 | " # 4 - Chunk\n", 70 | " \"MERGE (c:Chunk {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$sentences_hash_val}) ON CREATE SET c.sentences = $sentences_val, c.sentences_hash = $sentences_hash_val, c.block_idx = $block_idx_val, c.page_idx = $page_idx_val, c.tag = $tag_val, c.level = $level_val RETURN c;\",\n", 71 | " # 5 - Link Chunk to Section\n", 72 | " \"MATCH (c:Chunk {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$sentences_hash_val}) MATCH (s:Section {key:$doc_url_hash_val+'|'+$parent_block_idx_val+'|'+$parent_hash_val}) MERGE (s)<-[:HAS_PARENT]-(c);\",\n", 73 | " # 6 - Table\n", 74 | " \"MERGE (t:Table {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$name_val}) ON CREATE SET t.name = $name_val, t.doc_url_hash = $doc_url_hash_val, t.block_idx = $block_idx_val, t.page_idx = $page_idx_val, t.html = $html_val, t.rows = $rows_val RETURN t;\",\n", 75 | " # 7 - Link Table to Section\n", 76 | " \"MATCH (t:Table {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$name_val}) MATCH (s:Section {key: $doc_url_hash_val+'|'+$parent_block_idx_val+'|'+$parent_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);\",\n", 77 | " # 8 - Link Table to Document if no parent section\n", 78 | " \"MATCH (t:Table {key: $doc_url_hash_val+'|'+$block_idx_val+'|'+$name_val}) MATCH (s:Document {url_hash: $doc_url_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);\"\n", 79 | " ]\n", 80 | "\n", 81 | " driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))\n", 82 | "\n", 83 | " with driver.session() as session:\n", 84 | " cypher = \"\"\n", 85 | "\n", 86 | " # 1 - Create Document node\n", 87 | " doc_url_val = doc_location\n", 88 | " doc_url_hash_val = hashlib.md5(doc_url_val.encode(\"utf-8\")).hexdigest()\n", 89 | "\n", 90 | " cypher = cypher_pool[0]\n", 91 | " session.run(cypher, doc_url_hash_val=doc_url_hash_val, doc_url_val=doc_url_val)\n", 92 | "\n", 93 | " 
# 2 - Create Section nodes\n", 94 | " \n", 95 | " countSection = 0\n", 96 | " for sec in doc.sections():\n", 97 | " sec_title_val = sec.title\n", 98 | " sec_title_hash_val = hashlib.md5(sec_title_val.encode(\"utf-8\")).hexdigest()\n", 99 | " sec_tag_val = sec.tag\n", 100 | " sec_level_val = sec.level\n", 101 | " sec_page_idx_val = sec.page_idx\n", 102 | " sec_block_idx_val = sec.block_idx\n", 103 | "\n", 104 | " # MERGE section node\n", 105 | " if not sec_tag_val == 'table':\n", 106 | " cypher = cypher_pool[1]\n", 107 | " session.run(cypher, page_idx_val=sec_page_idx_val\n", 108 | " , title_hash_val=sec_title_hash_val\n", 109 | " , title_val=sec_title_val\n", 110 | " , tag_val=sec_tag_val\n", 111 | " , level_val=sec_level_val\n", 112 | " , block_idx_val=sec_block_idx_val\n", 113 | " , doc_url_hash_val=doc_url_hash_val\n", 114 | " )\n", 115 | "\n", 116 | " # Link Section with a parent section or Document\n", 117 | "\n", 118 | " sec_parent_val = str(sec.parent.to_text())\n", 119 | "\n", 120 | " if sec_parent_val == \"None\": # use Document as parent\n", 121 | "\n", 122 | " cypher = cypher_pool[2]\n", 123 | " session.run(cypher, page_idx_val=sec_page_idx_val\n", 124 | " , title_hash_val=sec_title_hash_val\n", 125 | " , doc_url_hash_val=doc_url_hash_val\n", 126 | " , block_idx_val=sec_block_idx_val\n", 127 | " )\n", 128 | "\n", 129 | " else: # use parent section\n", 130 | " sec_parent_title_hash_val = hashlib.md5(sec_parent_val.encode(\"utf-8\")).hexdigest()\n", 131 | " sec_parent_page_idx_val = sec.parent.page_idx\n", 132 | " sec_parent_block_idx_val = sec.parent.block_idx\n", 133 | "\n", 134 | " cypher = cypher_pool[3]\n", 135 | " session.run(cypher, page_idx_val=sec_page_idx_val\n", 136 | " , title_hash_val=sec_title_hash_val\n", 137 | " , block_idx_val=sec_block_idx_val\n", 138 | " , parent_page_idx_val=sec_parent_page_idx_val\n", 139 | " , parent_title_hash_val=sec_parent_title_hash_val\n", 140 | " , parent_block_idx_val=sec_parent_block_idx_val\n", 141 | " , doc_url_hash_val=doc_url_hash_val\n", 142 | " )\n", 143 | " # **** if sec_parent_val == \"None\": \n", 144 | "\n", 145 | " countSection += 1\n", 146 | " # **** for sec in doc.sections():\n", 147 | "\n", 148 | " \n", 149 | " # ------- Continue within the blocks -------\n", 150 | " # 3 - Create Chunk nodes from chunks\n", 151 | " \n", 152 | " countChunk = 0\n", 153 | " for chk in doc.chunks():\n", 154 | "\n", 155 | " chunk_block_idx_val = chk.block_idx\n", 156 | " chunk_page_idx_val = chk.page_idx\n", 157 | " chunk_tag_val = chk.tag\n", 158 | " chunk_level_val = chk.level\n", 159 | " chunk_sentences = \"\\n\".join(chk.sentences)\n", 160 | "\n", 161 | " # MERGE Chunk node\n", 162 | " if not chunk_tag_val == 'table':\n", 163 | " chunk_sentences_hash_val = hashlib.md5(chunk_sentences.encode(\"utf-8\")).hexdigest()\n", 164 | "\n", 165 | " # MERGE chunk node\n", 166 | " cypher = cypher_pool[4]\n", 167 | " session.run(cypher, sentences_hash_val=chunk_sentences_hash_val\n", 168 | " , sentences_val=chunk_sentences\n", 169 | " , block_idx_val=chunk_block_idx_val\n", 170 | " , page_idx_val=chunk_page_idx_val\n", 171 | " , tag_val=chunk_tag_val\n", 172 | " , level_val=chunk_level_val\n", 173 | " , doc_url_hash_val=doc_url_hash_val\n", 174 | " )\n", 175 | " \n", 176 | " # Link chunk with a section\n", 177 | " # Chunk always has a parent section \n", 178 | "\n", 179 | " chk_parent_val = str(chk.parent.to_text())\n", 180 | " \n", 181 | " if not chk_parent_val == \"None\":\n", 182 | " chk_parent_hash_val = 
hashlib.md5(chk_parent_val.encode(\"utf-8\")).hexdigest()\n", 183 | " chk_parent_page_idx_val = chk.parent.page_idx\n", 184 | " chk_parent_block_idx_val = chk.parent.block_idx\n", 185 | "\n", 186 | " cypher = cypher_pool[5]\n", 187 | " session.run(cypher, sentences_hash_val=chunk_sentences_hash_val\n", 188 | " , block_idx_val=chunk_block_idx_val\n", 189 | " , parent_hash_val=chk_parent_hash_val\n", 190 | " , parent_block_idx_val=chk_parent_block_idx_val\n", 191 | " , doc_url_hash_val=doc_url_hash_val\n", 192 | " )\n", 193 | " \n", 194 | " # Link sentence \n", 195 | " # >> TO DO for smaller token length\n", 196 | "\n", 197 | " countChunk += 1\n", 198 | " # **** for chk in doc.chunks(): \n", 199 | "\n", 200 | " # 4 - Create Table nodes\n", 201 | "\n", 202 | " countTable = 0\n", 203 | " for tb in doc.tables():\n", 204 | " page_idx_val = tb.page_idx\n", 205 | " block_idx_val = tb.block_idx\n", 206 | " name_val = 'block#' + str(block_idx_val) + '_' + tb.name\n", 207 | " html_val = tb.to_html()\n", 208 | " rows_val = len(tb.rows)\n", 209 | "\n", 210 | " # MERGE table node\n", 211 | "\n", 212 | " cypher = cypher_pool[6]\n", 213 | " session.run(cypher, block_idx_val=block_idx_val\n", 214 | " , page_idx_val=page_idx_val\n", 215 | " , name_val=name_val\n", 216 | " , html_val=html_val\n", 217 | " , rows_val=rows_val\n", 218 | " , doc_url_hash_val=doc_url_hash_val\n", 219 | " )\n", 220 | " \n", 221 | " # Link table with a section\n", 222 | " # Table always has a parent section \n", 223 | "\n", 224 | " table_parent_val = str(tb.parent.to_text())\n", 225 | " \n", 226 | " if not table_parent_val == \"None\":\n", 227 | " table_parent_hash_val = hashlib.md5(table_parent_val.encode(\"utf-8\")).hexdigest()\n", 228 | " table_parent_page_idx_val = tb.parent.page_idx\n", 229 | " table_parent_block_idx_val = tb.parent.block_idx\n", 230 | "\n", 231 | " cypher = cypher_pool[7]\n", 232 | " session.run(cypher, name_val=name_val\n", 233 | " , block_idx_val=block_idx_val\n", 234 | " , parent_page_idx_val=table_parent_page_idx_val\n", 235 | " , parent_hash_val=table_parent_hash_val\n", 236 | " , parent_block_idx_val=table_parent_block_idx_val\n", 237 | " , doc_url_hash_val=doc_url_hash_val\n", 238 | " )\n", 239 | "\n", 240 | " else: # link table to Document\n", 241 | " cypher = cypher_pool[8]\n", 242 | " session.run(cypher, name_val=name_val\n", 243 | " , block_idx_val=block_idx_val\n", 244 | " , doc_url_hash_val=doc_url_hash_val\n", 245 | " )\n", 246 | " countTable += 1\n", 247 | "\n", 248 | " # **** for tb in doc.tables():\n", 249 | " \n", 250 | " print(f'\\'{doc_url_val}\\' Done! Summary: ')\n", 251 | " print('#Sections: ' + str(countSection))\n", 252 | " print('#Chunks: ' + str(countChunk))\n", 253 | " print('#Tables: ' + str(countTable))\n", 254 | "\n", 255 | " driver.close()\n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 3, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# create constraints and indexes\n", 265 | "\n", 266 | "initialiseNeo4j()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 3, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "#PDF files found: 1!\n", 279 | "'/Users/Shared/resources/papers/Natural Language is All a Graph Needs.pdf' Done! 
Summary: \n", 280 | "#Sections: 63\n", 281 | "#Chunks: 170\n", 282 | "#Tables: 6\n", 283 | "Total time: 0:10:04.419306\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "\n", 289 | "# get all documents under the folder\n", 290 | "import os\n", 291 | "import glob\n", 292 | "from datetime import datetime\n", 293 | "import time\n", 294 | "\n", 295 | "pdf_files = glob.glob(file_location + '/*.pdf')\n", 296 | "\n", 297 | "print(f'#PDF files found: {len(pdf_files)}!')\n", 298 | "pdf_reader = LayoutPDFReader(llmsherpa_api_url)\n", 299 | "\n", 300 | "# parse documents and create graph\n", 301 | "startTime = datetime.now()\n", 302 | "\n", 303 | "for pdf_file in pdf_files:\n", 304 | " doc = pdf_reader.read_pdf(pdf_file)\n", 305 | "\n", 306 | " # find the first / in pdf_file from right\n", 307 | " idx = pdf_file.rfind('/')\n", 308 | " pdf_file_name = pdf_file[idx+1:]\n", 309 | "\n", 310 | " # open a local file to write the JSON\n", 311 | " with open(pdf_file_name + '.json', 'w') as f:\n", 312 | " # convert doc.json from a list to string\n", 313 | " f.write(str(doc.json))\n", 314 | "\n", 315 | " ingestDocumentNeo4j(doc, pdf_file)\n", 316 | "\n", 317 | "print(f'Total time: {datetime.now() - startTime}')\n", 318 | "\n", 319 | "# DONE\n", 320 | "\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.8.9" 348 | }, 349 | "orig_nbformat": 4 350 | }, 351 | "nbformat": 4, 352 | "nbformat_minor": 2 353 | } 354 | -------------------------------------------------------------------------------- /openai+llamaparse/demo_neo4j_vectordb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PDF parsing using LlamaParse for knowledge graph creation in Neo4j \n", 8 | "\n", 9 | "\"Open\n", 10 | "\n", 11 | "This Python notebook offers a comprehensive guide on leveraging LlamaParse to extract information from PDF documents and subsequently store this extracted content into a Neo4j graph database. Designed with practicality in mind, this tutorial caters to developers, data scientists, and tech enthusiasts interested in document processing, information extraction, and graph database technologies.\n", 12 | "\n", 13 | "Key Features of the Notebook:\n", 14 | "\n", 15 | "1. Setting Up the Environment: Step-by-step instructions on setting up your Python environment, including the installation of necessary libraries and tools such as LlamaParse and the Neo4j database driver.\n", 16 | "\n", 17 | "2. PDF Document Processing: Demonstrates how to use LlamaParse to read PDF documents, extract relevant information (such as text, tables, and images), and transform this information into a structured format suitable for database insertion.\n", 18 | "\n", 19 | "3. The Graph Model for docoment: Guidance on designing an effective graph model that represents the relationships and entities extracted from your PDF documents, ensuring optimal structure for querying and analysis. 
\n", 20 | "\n", 21 | "4. Storing Extracted Data in Neo4j: Detailed code examples showing how to connect to a Neo4j database from Python, create nodes and relationships based on the extracted data, and execute Cypher queries to populate the database.\n", 22 | "\n", 23 | "5. Generating and Storing Text Embeddings: Using a program created earlier to generate text embeddings via the OpenAI API and store each embedding as a vector in Neo4j. \n", 24 | "\n", 25 | "6. Querying and Analyzing Data: Examples of Cypher queries to retrieve and analyze the stored data, illustrating how Neo4j can uncover insights and relationships hidden within your PDF content.\n", 26 | "\n", 27 | "7. Conclusions: Tips on best practices for processing PDFs, designing graph schemas, and optimizing Neo4j queries, along with common troubleshooting advice for potential issues encountered during the process.\n", 28 | "\n", 29 | "For a quick introduction to LlamaParse, please check this article.\n", 30 | "\n", 31 | "Note: this example requires `llama_index >= 0.10.4`. If `pip install --upgrade ` does not work, you may use `pip uninstall ` and then install the required package again." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "
1. Setting up the environment
" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "!pip3 install llama-index\n", 48 | "!pip3 install llama-index-core\n", 49 | "!pip3 install llama-index-embeddings-openai\n", 50 | "!pip3 install llama-parse\n", 51 | "!pip3 install neo4j" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!wget 'https://raw.githubusercontent.com/Joshua-Yu/graph-rag/main/openai%2Bllamaparse/InjuredWorkerGuidebookCalifornia.pdf' -O './insurance.pdf'" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "
2. PDF document processing
\n", 68 | "\n", 69 | "We need OpenAI and LlamaParse API keys to run the project. \n", 70 | "\n", 71 | "For more information on how to get an OpenAI API key, please visit here.\n", 72 | "\n", 73 | "For more information on how to get a LlamaParse API key, please visit here." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 1, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# llama-parse is async-first; running the async code in a notebook requires the use of nest_asyncio\n", 83 | "import nest_asyncio\n", 84 | "nest_asyncio.apply()\n", 85 | "\n", 86 | "import os\n", 87 | "# API access to llama-cloud\n", 88 | "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"LLAMACLOUD-API-KEY\"\n", 89 | "\n", 90 | "# Using OpenAI API for embeddings/llms\n", 91 | "os.environ[\"OPENAI_API_KEY\"] = \"OPENAI-API-KEY\"" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 112, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from llama_index.llms.openai import OpenAI\n", 101 | "from llama_index.embeddings.openai import OpenAIEmbedding\n", 102 | "from llama_index.core import VectorStoreIndex\n", 103 | "from llama_index.core import Settings\n", 104 | "\n", 105 | "EMBEDDING_MODEL = \"text-embedding-3-small\"\n", 106 | "GENERATION_MODEL = \"gpt-4\"\n", 107 | "\n", 108 | "llm = OpenAI(model=GENERATION_MODEL)\n", 109 | "\n", 110 | "Settings.llm = llm\n", 111 | "\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "
Using the brand-new `LlamaParse` PDF reader for PDF Parsing
\n", 119 | "\n", 120 | "we also compare two different retrieval/query engine strategies:\n", 121 | "1. Using raw Markdown text as nodes for building index and apply simple query engine for generating the results;\n", 122 | "2. Using `MarkdownElementNodeParser` for parsing the `LlamaParse` output Markdown results and building recursive retriever query engine for generation." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from llama_parse import LlamaParse\n", 132 | "\n", 133 | "pdf_file_name = './insurance.pdf'\n", 134 | "\n", 135 | "documents = LlamaParse(result_type=\"markdown\").load_data(pdf_file_name)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Check loaded documents\n", 145 | "\n", 146 | "print(f\"Number of documents: {len(documents)}\")\n", 147 | "\n", 148 | "for doc in documents:\n", 149 | " print(doc.doc_id)\n", 150 | " print(doc.text[:500] + '...')\n", 151 | " " 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "# Parse the documents using MarkdownElementNodeParser\n", 161 | "\n", 162 | "from llama_index.core.node_parser import MarkdownElementNodeParser\n", 163 | "\n", 164 | "node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)\n", 165 | "\n", 166 | "nodes = node_parser.get_nodes_from_documents(documents)\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 7, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Convert nodes into objects\n", 176 | "\n", 177 | "base_nodes, objects = node_parser.get_nodes_and_objects(nodes)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "import json\n", 187 | "\n", 188 | "\n", 189 | "# Check parsed node objects \n", 190 | "\n", 191 | "print(f\"Number of nodes: {len(base_nodes)}\")\n", 192 | "\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "\n", 202 | "TABLE_REF_SUFFIX = '_table_ref'\n", 203 | "TABLE_ID_SUFFIX = '_table'\n", 204 | "\n", 205 | "# Check parsed objects \n", 206 | "\n", 207 | "print(f\"Number of objects: {len(objects)}\")\n", 208 | "\n", 209 | "for node in objects: \n", 210 | " print(f\"id:{node.node_id}\")\n", 211 | " print(f\"hash:{node.hash}\")\n", 212 | " print(f\"parent:{node.parent_node}\")\n", 213 | " print(f\"prev:{node.prev_node}\")\n", 214 | " print(f\"next:{node.next_node}\")\n", 215 | "\n", 216 | " # Object is a Table\n", 217 | " if node.node_id[-1 * len(TABLE_REF_SUFFIX):] == TABLE_REF_SUFFIX:\n", 218 | "\n", 219 | " if node.next_node is not None:\n", 220 | " next_node = node.next_node\n", 221 | " \n", 222 | " print(f\"next_node metadata:{next_node.metadata}\")\n", 223 | " print(f\"next_next_node:{next_next_nod_id}\")\n", 224 | "\n", 225 | " obj_metadata = json.loads(str(next_node.json()))\n", 226 | "\n", 227 | " print(str(obj_metadata))\n", 228 | "\n", 229 | " print(f\"def:{obj_metadata['metadata']['table_df']}\")\n", 230 | " print(f\"summary:{obj_metadata['metadata']['table_summary']}\")\n", 231 | "\n", 232 | "\n", 233 | " print(f\"next:{node.next_node}\")\n", 234 | " print(f\"type:{node.get_type()}\")\n", 235 | " print(f\"class:{node.class_name()}\")\n", 
236 | " print(f\"content:{node.get_content()[:200]}\")\n", 237 | " print(f\"metadata:{node.metadata}\")\n", 238 | " print(f\"extra:{node.extra_info}\")\n", 239 | " \n", 240 | " node_json = json.loads(node.json())\n", 241 | "\n", 242 | " print(f\"start_idx:{node_json.get('start_char_idx')}\")\n", 243 | " print(f\"end_idx:{node_json['end_char_idx']}\")\n", 244 | "\n", 245 | " if 'table_summary' in node_json: \n", 246 | " print(f\"summary:{node_json['table_summary']}\")\n", 247 | "\n", 248 | " print(\"=====================================\") " 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "
3. Graph model for parsed document
\n", 256 | "\n", 257 | "Regardless of which PDF parsing tool is used, the graph schema for saving results into Neo4j as a knowledge graph is in fact quite consistent. There are samples using other document parsing tools to create a document knowledge graph, the links of which can be found here: \n", 258 | " - Building A Graph+LLM Powered RAG Application from PDF Documents (using LLMSherpa) (link)\n", 259 | " - Integrating unstructured.io with Neo4j AuraDB to Build Document Knowledge Graph (link)\n", 260 | "\n", 261 | "\n", 262 | "(image: document graph schema) \n", 263 | "\n", 264 | "In this project, a similar graph model will be used. Let's start with the graph database schema definition: \n", 265 | " - Uniqueness constraint on key property\n", 266 | " - Vector index on embeddings\n", 267 | "\n", 268 | "You can use a local Neo4j instance, or get one from AuraDB for FREE from here." 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 77, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "\n", 278 | "\n", 279 | "from neo4j import GraphDatabase\n", 280 | "\n", 281 | "# Local Neo4j instance\n", 282 | "# NEO4J_URL = \"bolt://localhost:7687\"\n", 283 | "# Remote Neo4j instance on AuraDB\n", 284 | "NEO4J_URL = \"neo4j+s://.databases.neo4j.io\"\n", 285 | "NEO4J_USER = \"neo4j\"\n", 286 | "NEO4J_PASSWORD = \"\"\n", 287 | "NEO4J_DATABASE = \"neo4j\"\n", 288 | "\n", 289 | "def initialiseNeo4jSchema():\n", 290 | "    cypher_schema = [\n", 291 | "        \"CREATE CONSTRAINT sectionKey IF NOT EXISTS FOR (c:Section) REQUIRE (c.key) IS UNIQUE;\",\n", 292 | "        \"CREATE CONSTRAINT chunkKey IF NOT EXISTS FOR (c:Chunk) REQUIRE (c.key) IS UNIQUE;\",\n", 293 | "        \"CREATE CONSTRAINT documentKey IF NOT EXISTS FOR (c:Document) REQUIRE (c.url_hash) IS UNIQUE;\",\n", 294 | "        \"CREATE VECTOR INDEX `chunkVectorIndex` IF NOT EXISTS FOR (e:Embedding) ON (e.value) OPTIONS { indexConfig: {`vector.dimensions`: 1536, `vector.similarity_function`: 'cosine'}};\"\n", 295 | "    ]\n", 296 | "\n", 297 | "    driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))\n", 298 | "\n", 299 | "    with driver.session() as session:\n", 300 | "        for cypher in cypher_schema:\n", 301 | "            session.run(cypher)\n", 302 | "    driver.close()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "\n", 312 | "# create constraints and indexes\n", 313 | "\n", 314 | "initialiseNeo4jSchema()\n" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "
4. Storing Extracted Data in Neo4j
" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "\n", 331 | "\n", 332 | "driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))\n", 333 | "\n", 334 | "# ================================================\n", 335 | "# 1) Save documents\n", 336 | "\n", 337 | "print(\"Start saving documents to Neo4j...\")\n", 338 | "i = 0\n", 339 | "with driver.session() as session:\n", 340 | " for doc in documents:\n", 341 | " cypher = \"MERGE (d:Document {url_hash: $doc_id}) ON CREATE SET d.url=$url;\"\n", 342 | " session.run(cypher, doc_id=doc.doc_id, url=doc.doc_id)\n", 343 | " i = i + 1\n", 344 | " session.close()\n", 345 | "\n", 346 | "print(f\"{i} documents saved.\")\n", 347 | "\n", 348 | "# ================================================\n", 349 | "# 2) Save nodes\n", 350 | "\n", 351 | "print(\"Start saving nodes to Neo4j...\")\n", 352 | "\n", 353 | "i = 0\n", 354 | "with driver.session() as session:\n", 355 | " for node in base_nodes: \n", 356 | "\n", 357 | " # >>1 Create Section node\n", 358 | " cypher = \"MERGE (c:Section {key: $node_id})\\n\"\n", 359 | " cypher += \" FOREACH (ignoreMe IN CASE WHEN c.type IS NULL THEN [1] ELSE [] END |\\n\"\n", 360 | " cypher += \" SET c.hash = $hash, c.text=$content, c.type=$type, c.class=$class_name, c.start_idx=$start_idx, c.end_idx=$end_idx )\\n\"\n", 361 | " cypher += \" WITH c\\n\"\n", 362 | " cypher += \" MATCH (d:Document {url_hash: $doc_id})\\n\"\n", 363 | " cypher += \" MERGE (d)<-[:HAS_DOCUMENT]-(c);\"\n", 364 | "\n", 365 | " node_json = json.loads(node.json())\n", 366 | "\n", 367 | " session.run(cypher, node_id=node.node_id, hash=node.hash, content=node.get_content(), type='TEXT', class_name=node.class_name()\n", 368 | " , start_idx=node_json['start_char_idx'], end_idx=node_json['end_char_idx'], doc_id=node.ref_doc_id)\n", 369 | "\n", 370 | " # >>2 Link node using NEXT relationship\n", 371 | "\n", 372 | " if node.next_node is not None: # and node.next_node.node_id[-1*len(TABLE_REF_SUFFIX):] != TABLE_REF_SUFFIX:\n", 373 | " cypher = \"MATCH (c:Section {key: $node_id})\\n\" # current node should exist\n", 374 | " cypher += \"MERGE (p:Section {key: $next_id})\\n\" # previous node may not exist\n", 375 | " cypher += \"MERGE (p)<-[:NEXT]-(c);\"\n", 376 | "\n", 377 | " session.run(cypher, node_id=node.node_id, next_id=node.next_node.node_id)\n", 378 | "\n", 379 | " if node.prev_node is not None: # Because tables are in objects list, so we need to link from the opposite direction\n", 380 | " cypher = \"MATCH (c:Section {key: $node_id})\\n\" # current node should exist\n", 381 | " cypher += \"MERGE (p:Section {key: $prev_id})\\n\" # previous node may not exist\n", 382 | " cypher += \"MERGE (p)-[:NEXT]->(c);\"\n", 383 | "\n", 384 | " if node.prev_node.node_id[-1 * len(TABLE_ID_SUFFIX):] == TABLE_ID_SUFFIX:\n", 385 | " prev_id = node.prev_node.node_id + '_ref'\n", 386 | " else:\n", 387 | " prev_id = node.prev_node.node_id\n", 388 | "\n", 389 | " session.run(cypher, node_id=node.node_id, prev_id=prev_id)\n", 390 | "\n", 391 | " i = i + 1\n", 392 | " session.close()\n", 393 | "\n", 394 | "print(f\"{i} nodes saved.\")\n", 395 | "\n", 396 | "# ================================================\n", 397 | "# 3) Save objects\n", 398 | "\n", 399 | "print(\"Start saving objects to Neo4j...\")\n", 400 | "\n", 401 | "i = 0\n", 402 | "with driver.session() as session:\n", 403 | " for node in objects: \n", 404 | " node_json = 
json.loads(node.json())\n", 405 | "\n", 406 | " # Object is a Table, then the ????_ref_table object is created as a Section, and the table object is Chunk\n", 407 | " if node.node_id[-1 * len(TABLE_REF_SUFFIX):] == TABLE_REF_SUFFIX:\n", 408 | " if node.next_node is not None: # here is where actual table object is loaded\n", 409 | " next_node = node.next_node\n", 410 | "\n", 411 | " obj_metadata = json.loads(str(next_node.json()))\n", 412 | "\n", 413 | " cypher = \"MERGE (s:Section {key: $node_id})\\n\"\n", 414 | " cypher += \"WITH s MERGE (c:Chunk {key: $table_id})\\n\"\n", 415 | " cypher += \" FOREACH (ignoreMe IN CASE WHEN c.type IS NULL THEN [1] ELSE [] END |\\n\"\n", 416 | " cypher += \" SET c.hash = $hash, c.definition=$content, c.text=$table_summary, c.type=$type, c.start_idx=$start_idx, c.end_idx=$end_idx )\\n\"\n", 417 | " cypher += \" WITH s, c\\n\"\n", 418 | " cypher += \" MERGE (s) <-[:UNDER_SECTION]- (c)\\n\"\n", 419 | " cypher += \" WITH s MATCH (d:Document {url_hash: $doc_id})\\n\"\n", 420 | " cypher += \" MERGE (d)<-[:HAS_DOCUMENT]-(s);\"\n", 421 | "\n", 422 | " session.run(cypher, node_id=node.node_id, hash=next_node.hash, content=obj_metadata['metadata']['table_df'], type='TABLE'\n", 423 | " , start_idx=node_json['start_char_idx'], end_idx=node_json['end_char_idx']\n", 424 | " , doc_id=node.ref_doc_id, table_summary=obj_metadata['metadata']['table_summary'], table_id=next_node.node_id)\n", 425 | " \n", 426 | " if node.prev_node is not None:\n", 427 | " cypher = \"MATCH (c:Section {key: $node_id})\\n\" # current node should exist\n", 428 | " cypher += \"MERGE (p:Section {key: $prev_id})\\n\" # previous node may not exist\n", 429 | " cypher += \"MERGE (p)-[:NEXT]->(c);\"\n", 430 | "\n", 431 | " if node.prev_node.node_id[-1 * len(TABLE_ID_SUFFIX):] == TABLE_ID_SUFFIX:\n", 432 | " prev_id = node.prev_node.node_id + '_ref'\n", 433 | " else:\n", 434 | " prev_id = node.prev_node.node_id\n", 435 | " \n", 436 | " session.run(cypher, node_id=node.node_id, prev_id=prev_id)\n", 437 | " \n", 438 | " i = i + 1\n", 439 | " session.close()\n", 440 | "\n", 441 | "# ================================================\n", 442 | "# 4) Create Chunks for each Section object of type TEXT\n", 443 | "# If there are changes to the content of TEXT section, the Section node needs to be recreated\n", 444 | "\n", 445 | "print(\"Start creating chunks for each TEXT Section...\")\n", 446 | "\n", 447 | "with driver.session() as session:\n", 448 | "\n", 449 | " cypher = \"MATCH (s:Section) WHERE s.type='TEXT' \\n\"\n", 450 | " cypher += \"WITH s CALL {\\n\"\n", 451 | " cypher += \"WITH s WITH s, split(s.text, '\\n') AS para\\n\"\n", 452 | " cypher += \"WITH s, para, range(0, size(para)-1) AS iterator\\n\"\n", 453 | " cypher += \"UNWIND iterator AS i WITH s, trim(para[i]) AS chunk, i WHERE size(chunk) > 0\\n\"\n", 454 | " cypher += \"CREATE (c:Chunk {key: s.key + '_' + i}) SET c.type='TEXT', c.text = chunk, c.seq = i \\n\"\n", 455 | " cypher += \"CREATE (s) <-[:UNDER_SECTION]-(c) } IN TRANSACTIONS OF 500 ROWS ;\"\n", 456 | " \n", 457 | " session.run(cypher)\n", 458 | " \n", 459 | " session.close()\n", 460 | "\n", 461 | "\n", 462 | "print(f\"{i} objects saved.\")\n", 463 | "\n", 464 | "print(\"=================DONE====================\")\n", 465 | "\n", 466 | "driver.close()\n", 467 | "\n", 468 | "\n" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "
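Before moving on to embeddings, it can help to sanity-check what the cell above actually wrote. Below is a minimal sketch, not part of the original notebook, that reuses the same `NEO4J_*` settings and node labels and simply counts the Document, Section and Chunk nodes created by the steps above; adjust the labels or relationship types if your schema differs.

```python
# Minimal sanity check of the document graph built above (assumes the same
# NEO4J_* variables and the Document / Section / Chunk labels used earlier).
from neo4j import GraphDatabase

driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))

check_cypher = """
MATCH (d:Document)
OPTIONAL MATCH (d)<-[:HAS_DOCUMENT]-(s:Section)
OPTIONAL MATCH (s)<-[:UNDER_SECTION]-(c:Chunk)
RETURN count(DISTINCT d) AS documents,
       count(DISTINCT s) AS sections,
       count(DISTINCT c) AS chunks
"""

with driver.session() as session:
    record = session.run(check_cypher).single()
    print(f"Documents: {record['documents']}, Sections: {record['sections']}, Chunks: {record['chunks']}")

driver.close()
```

If the counts look wrong, it is usually easier to spot the missing relationship here than after the embeddings have been generated.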

5. Generating and Storing Text Embeddings

" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 113, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "\n", 485 | "from openai import OpenAI\n", 486 | "\n", 487 | "\n", 488 | "def get_embedding(client, text, model):\n", 489 | " response = client.embeddings.create(\n", 490 | " input=text,\n", 491 | " model=model,\n", 492 | " )\n", 493 | " return response.data[0].embedding\n", 494 | "\n", 495 | "def LoadEmbedding(label, property):\n", 496 | " driver = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD), database=NEO4J_DATABASE)\n", 497 | " openai_client = OpenAI (api_key = os.environ[\"OPENAI_API_KEY\"])\n", 498 | "\n", 499 | " with driver.session() as session:\n", 500 | " # get chunks in document, together with their section titles\n", 501 | " result = session.run(f\"MATCH (ch:{label}) RETURN id(ch) AS id, ch.{property} AS text\")\n", 502 | " # call OpenAI embedding API to generate embeddings for each proporty of node\n", 503 | " # for each node, update the embedding property\n", 504 | " count = 0\n", 505 | " for record in result:\n", 506 | " id = record[\"id\"]\n", 507 | " text = record[\"text\"]\n", 508 | " \n", 509 | " # For better performance, text can be batched\n", 510 | " embedding = get_embedding(openai_client, text, EMBEDDING_MODEL)\n", 511 | " \n", 512 | " # key property of Embedding node differentiates different embeddings\n", 513 | " cypher = \"CREATE (e:Embedding) SET e.key=$key, e.value=$embedding, e.model=$model\"\n", 514 | " cypher = cypher + \" WITH e MATCH (n) WHERE id(n) = $id CREATE (n) -[:HAS_EMBEDDING]-> (e)\"\n", 515 | " session.run(cypher,key=property, embedding=embedding, id=id, model=EMBEDDING_MODEL) \n", 516 | " count = count + 1\n", 517 | "\n", 518 | " session.close()\n", 519 | " \n", 520 | " print(\"Processed \" + str(count) + \" \" + label + \" nodes for property @\" + property + \".\")\n", 521 | " return count\n", 522 | "\n" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "# For smaller amount (<2000) of text data to embed\n", 532 | "LoadEmbedding(\"Chunk\", \"text\")" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "

6. Querying Document Knowledge Graph

\n", 540 | "\n", 541 | "\n", 542 | "Let's open Neo4j Browser to check the loaded document graph. \n", 543 | "\n", 544 | "Type `MATCH (n:Section) RETURN n` in text box and run it, and we will see a chain of sections of the document. By clicking and expanding a Section node, we can see its connected Chunk nodes. \n", 545 | "\n", 546 | "\n", 547 | "\n", 548 | "\n", 549 | "If a Section node has type `TEXT`, it has a group of Chunk nodes and each stores a paragraph in the text property. \n", 550 | "\n", 551 | "\n", 552 | "\n", 553 | "If a Section node has type `TABLE`, it only has one Chunk node, with text property storing the summary of table contents, and definition property storing contents of the table. \n", 554 | "\n", 555 | "\n", 556 | "\n" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "\n", 564 | "Every Chunk node is connected with an Embedding node which stores the embedding of the text content of the Chunk node. In the beginning of this project, a vector index has been defined to allow us to perform similarity search more efficiently.\n", 565 | "\n", 566 | "Because Section nodes have text content that may exceed the token length limit (8k, ~ 5k words) enforced by the embedding model, by splitting content into paragraph can help remediate this limitation, and embedd text that are more relevant as they appear in the same paragraph.\n", 567 | "\n" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "

7. Conclusions

\n", 575 | "\n", 576 | "LlamaParse stands out as a highly capable tool for parsing PDF documents, adept at navigating the complexities of both structured and unstructured data with remarkable efficiency. Its advanced algorithms and intuitive API facilitate the seamless extraction of text, tables, images, and metadata from PDFs, transforming what is often a challenging task into a streamlined process. \n", 577 | "\n", 578 | "Storing the extracted data as a graph in Neo4j further amplifies the benefits. By representing data entities and their relationships in a graph database, users can uncover patterns and connections that would be difficult, if not impossible, to detect using traditional relational databases. Neo4j's graph model offers a natural and intuitive way to visualize complex relationships, enhancing the ability to conduct sophisticated analyses and derive actionable insights.\n", 579 | "\n", 580 | "A consistent document knowlwdge graph schema makes it much easier to integrate with other tools for downstream tasks, e.g. to build Retrieval Augmented Generation using GenAI Stack (LangChain and Streamlit).\n", 581 | "\n", 582 | "The combination of LlamaParse's extraction capabilities and Neo4j's graph-based storage and analysis opens up new possibilities for data-driven decision-making. It allows for more nuanced understanding of data relationships, efficient data querying, and the ability to scale with the growing size and complexity of datasets. This synergy not only accelerates the extraction and analysis processes but also contributes to a more informed and strategic approach to data management.\n" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [] 591 | } 592 | ], 593 | "metadata": { 594 | "kernelspec": { 595 | "display_name": "Python 3 (ipykernel)", 596 | "language": "python", 597 | "name": "python3" 598 | }, 599 | "language_info": { 600 | "codemirror_mode": { 601 | "name": "ipython", 602 | "version": 3 603 | }, 604 | "file_extension": ".py", 605 | "mimetype": "text/x-python", 606 | "name": "python", 607 | "nbconvert_exporter": "python", 608 | "pygments_lexer": "ipython3", 609 | "version": "3.11.6" 610 | } 611 | }, 612 | "nbformat": 4, 613 | "nbformat_minor": 4 614 | } 615 | -------------------------------------------------------------------------------- /gemini-multimodal/gemini-vision-er-graph-demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "QDU0XJ1xRDlL" 7 | }, 8 | "source": [ 9 | "## Use Gemini Vision Model to Convert an ER Diagram into a Graph Model\n", 10 | "\n", 11 | "\n", 12 | "Version 0.1\n", 13 | "Last updated: 2024-02-19\n", 14 | "\n", 15 | "Contact: Fanghua Yu / https://www.linkedin.com/in/joshuayu/\n", 16 | "\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "N5afkyDMSBW5" 23 | }, 24 | "source": [ 25 | "### Install Vertex AI SDK for Python\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "kc4WxYmLSBW5", 33 | "tags": [] 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "! 
pip3 install --upgrade --user google-cloud-aiplatform" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "id": "R5Xep4W9lq-Z" 44 | }, 45 | "source": [ 46 | "### Restart current runtime\n", 47 | "\n", 48 | "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "colab": { 56 | "base_uri": "https://localhost:8080/" 57 | }, 58 | "id": "XRvKdaPDTznN", 59 | "outputId": "154a71b5-f302-4f53-ed2f-b3e5fef9195b" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "# Restart kernel after installs so that your environment can access the new packages\n", 64 | "import IPython\n", 65 | "import time\n", 66 | "\n", 67 | "app = IPython.Application.instance()\n", 68 | "app.kernel.do_shutdown(True)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "6Fom0ZkMSBW6" 75 | }, 76 | "source": [ 77 | "### Authenticate your notebook environment (Colab only)\n", 78 | "\n", 79 | "If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 1, 85 | "metadata": { 86 | "id": "LCaCx6PLSBW6", 87 | "tags": [] 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "import sys\n", 92 | "\n", 93 | "# Additional authentication is required for Google Colab\n", 94 | "if \"google.colab\" in sys.modules:\n", 95 | " # Authenticate user to Google Cloud\n", 96 | " from google.colab import auth\n", 97 | "\n", 98 | " auth.authenticate_user()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "id": "QGB8Txa_e4V0" 105 | }, 106 | "source": [ 107 | "### Define Google Cloud project information (Colab only)\n", 108 | "\n", 109 | "If you are running this notebook on Google Colab, specify the Google Cloud project information to use. In the following cell, you specify your project information, import the Vertex AI package, and initialize the package. 
This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 2, 115 | "metadata": { 116 | "id": "JGOJHtgDe5-r", 117 | "tags": [] 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "if \"google.colab\" in sys.modules:\n", 122 | " # Define project information\n", 123 | " PROJECT_ID = \"YOUR-PROJECT-ID\" # @param {type:\"string\"}\n", 124 | " LOCATION = \"us-central1\" # @param {type:\"string\"}\n", 125 | "\n", 126 | " # Initialize Vertex AI\n", 127 | " import vertexai\n", 128 | "\n", 129 | " vertexai.init(project=PROJECT_ID, location=LOCATION)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "id": "BuQwwRiniVFG" 136 | }, 137 | "source": [ 138 | "### Import libraries\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "metadata": { 145 | "id": "JTk488WDPBtQ", 146 | "tags": [] 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "from vertexai.generative_models import (\n", 151 | " GenerationConfig,\n", 152 | " GenerativeModel,\n", 153 | " Image,\n", 154 | " Part,\n", 155 | ")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "id": "N7rZuTClfNs0" 162 | }, 163 | "source": [ 164 | "## Use the Gemini 1.0 Pro Vision model\n", 165 | "\n", 166 | "Gemini 1.0 Pro Vision (`gemini-1.0-pro-vision`) is a multimodal model that supports multimodal prompts. You can include text, image(s), and video in your prompt requests and get text or code responses.\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 4, 172 | "metadata": { 173 | "id": "2998506fe6d1", 174 | "tags": [] 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "# Specify the Gemini model to use\n", 179 | "\n", 180 | "multimodal_model = GenerativeModel(\"gemini-1.0-pro-vision\")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": { 186 | "id": "MpL3OkSCfIAR" 187 | }, 188 | "source": [ 189 | "### Define helper functions\n" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": { 196 | "id": "S7QMAHXse339", 197 | "tags": [] 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# The follow code was orginally created as part of the Google Vertex AI sample project.\n", 202 | "\n", 203 | "import http.client\n", 204 | "import typing\n", 205 | "import urllib.request\n", 206 | "\n", 207 | "import IPython.display\n", 208 | "from PIL import Image as PIL_Image\n", 209 | "from PIL import ImageOps as PIL_ImageOps\n", 210 | "\n", 211 | "\n", 212 | "def get_image_bytes_from_url(image_url: str) -> bytes:\n", 213 | " with urllib.request.urlopen(image_url) as response:\n", 214 | " response = typing.cast(http.client.HTTPResponse, response)\n", 215 | " image_bytes = response.read()\n", 216 | " return image_bytes\n", 217 | "\n", 218 | "\n", 219 | "def load_image_from_url(image_url: str) -> Image:\n", 220 | " image_bytes = get_image_bytes_from_url(image_url)\n", 221 | " return Image.from_bytes(image_bytes)\n", 222 | "\n", 223 | "\n", 224 | "def display_content_as_image(content: str | Image | Part) -> bool:\n", 225 | " if not isinstance(content, Image):\n", 226 | " return False\n", 227 | " display_images([content])\n", 228 | " return True\n", 229 | "\n", 230 | "\n", 231 | "\n", 232 | "def print_multimodal_prompt(contents: list[str | Image | Part]):\n", 233 | " \"\"\"\n", 234 | " Given contents that would be sent to Gemini,\n", 235 | " 
output the full multimodal prompt for ease of readability.\n", 236 | " \"\"\"\n", 237 | " for content in contents:\n", 238 | " if display_content_as_image(content):\n", 239 | " continue\n", 240 | " if display_content_as_video(content):\n", 241 | " continue\n", 242 | " print(content)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "id": "4437b7608c8e" 249 | }, 250 | "source": [ 251 | "## Understanding Components in an E-R Diagram\n", 252 | "\n", 253 | "\n", 254 | "Gemini Vision Pro model has the capabilities to understand diagrams and take actionable steps, such as optimization or code generation. Here we will use the movie rating example to demonstrate how Gemini can decipher an entity relationship (ER) diagram, understand the relationships between tables, and generate schema descriptions in JSON layout.\n", 255 | "\n", 256 | "\n", 257 | "![E-R Diagram for Movie Rating](https://github.com/Joshua-Yu/graph-rag/raw/main/gemini-multimodal/resources/movie-er.jpg) \n", 258 | "\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 6, 264 | "metadata": { 265 | "colab": { 266 | "base_uri": "https://localhost:8080/" 267 | }, 268 | "id": "klY4yBEiKmET", 269 | "outputId": "82320aea-57c4-4558-e267-9b76dcb13273", 270 | "tags": [] 271 | }, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "\n", 278 | "-------Response--------\n", 279 | " ```json\n", 280 | "{\n", 281 | " \"entities\": [\n", 282 | " {\n", 283 | " \"name\": \"Actor\",\n", 284 | " \"fields\": [\n", 285 | " {\n", 286 | "\n", 287 | " \"name\": \"ActorID\",\n", 288 | " \"type\": \"Integer\",\n", 289 | " \"primary_key\": true,\n", 290 | " \"not_null\":\n", 291 | " true\n", 292 | " },\n", 293 | " {\n", 294 | " \"name\": \"FirstName\",\n", 295 | " \"type\": \"Varchar(50)\",\n", 296 | " \"not_\n", 297 | "null\": true\n", 298 | " },\n", 299 | " {\n", 300 | " \"name\": \"LastName\",\n", 301 | " \"type\": \"Varchar(30)\",\n", 302 | " \"\n", 303 | "not_null\": true\n", 304 | " },\n", 305 | " {\n", 306 | " \"name\": \"Nationality\",\n", 307 | " \"type\": \"Varchar(40)\",\n", 308 | " \"not_null\": true\n", 309 | " },\n", 310 | " {\n", 311 | " \"name\": \"BirthDate\",\n", 312 | " \"type\": \"Date\",\n", 313 | " \"\n", 314 | "not_null\": true\n", 315 | " }\n", 316 | " ]\n", 317 | " },\n", 318 | " {\n", 319 | " \"name\": \"Award\",\n", 320 | " \"fields\": [\n", 321 | " {\n", 322 | " \"name\": \"AwardID\",\n", 323 | " \"type\": \"Integer\",\n", 324 | " \"primary_key\": true,\n", 325 | " \"not\n", 326 | "_null\": true\n", 327 | " },\n", 328 | " {\n", 329 | " \"name\": \"AwardName\",\n", 330 | " \"type\": \"Varchar(50)\",\n", 331 | " \"not_null\": true\n", 332 | " }\n", 333 | " ]\n", 334 | " },\n", 335 | " {\n", 336 | " \"name\": \"Genre\",\n", 337 | " \"fields\":\n", 338 | " [\n", 339 | " {\n", 340 | " \"name\": \"GenreID\",\n", 341 | " \"type\": \"Integer\",\n", 342 | " \"primary_key\": true,\n", 343 | " \"not_null\": true\n", 344 | " },\n", 345 | " {\n", 346 | " \"name\": \"GenreName\",\n", 347 | " \"type\": \"Varchar(40\n", 348 | ")\",\n", 349 | " \"not_null\": true\n", 350 | " }\n", 351 | " ]\n", 352 | " },\n", 353 | " {\n", 354 | " \"name\": \"Language\",\n", 355 | " \"fields\": [\n", 356 | " {\n", 357 | " \"name\": \"LanguageID\",\n", 358 | " \"type\": \"Integer\",\n", 359 | " \"primary_key\": true,\n", 360 | "\n", 361 | " \"not_null\": true\n", 362 | " },\n", 363 | " {\n", 364 | " \"name\": \"Language\",\n", 365 | " \"type\": 
\"Varchar(50)\",\n", 366 | " \"not_null\": true\n", 367 | " }\n", 368 | " ]\n", 369 | " },\n", 370 | " {\n", 371 | " \"name\": \"Movie\",\n", 372 | " \n", 373 | "\"fields\": [\n", 374 | " {\n", 375 | " \"name\": \"MovieID\",\n", 376 | " \"type\": \"Integer\",\n", 377 | " \"primary_key\": true,\n", 378 | " \"not_null\": true\n", 379 | " },\n", 380 | " {\n", 381 | " \"name\": \"DirectorID\",\n", 382 | " \"type\": \"Integer\",\n", 383 | "\n", 384 | " \"not_null\": true\n", 385 | " },\n", 386 | " {\n", 387 | " \"name\": \"GenreID\",\n", 388 | " \"type\": \"Integer\",\n", 389 | " \"not_null\": true\n", 390 | " },\n", 391 | " {\n", 392 | " \"name\": \"Title\",\n", 393 | " \"type\": \"Varchar(50\n", 394 | ")\",\n", 395 | " \"not_null\": true\n", 396 | " },\n", 397 | " {\n", 398 | " \"name\": \"ReleaseYear\",\n", 399 | " \"type\": \"Integer\",\n", 400 | " \"not_null\": true\n", 401 | " },\n", 402 | " {\n", 403 | " \"name\": \"Rating\",\n", 404 | " \"type\": \"Number(4,\n", 405 | " 2)\",\n", 406 | " \"not_null\": true\n", 407 | " },\n", 408 | " {\n", 409 | " \"name\": \"Plot\",\n", 410 | " \"type\": \"Text(300)\",\n", 411 | " \"not_null\": true\n", 412 | " },\n", 413 | " {\n", 414 | " \"name\": \"MovieLength\",\n", 415 | " \"type\n", 416 | "\": \"Integer\",\n", 417 | " \"not_null\": true\n", 418 | " }\n", 419 | " ]\n", 420 | " },\n", 421 | " {\n", 422 | " \"name\": \"Director\",\n", 423 | " \"fields\": [\n", 424 | " {\n", 425 | " \"name\": \"DirectorID\",\n", 426 | " \"type\": \"Integer\",\n", 427 | " \"primary_key\n", 428 | "\": true,\n", 429 | " \"not_null\": true\n", 430 | " },\n", 431 | " {\n", 432 | " \"name\": \"FirstName\",\n", 433 | " \"type\": \"Varchar(50)\",\n", 434 | " \"not_null\": true\n", 435 | " },\n", 436 | " {\n", 437 | " \"name\": \"LastName\",\n", 438 | " \"type\":\n", 439 | " \"Varchar(40)\",\n", 440 | " \"not_null\": true\n", 441 | " },\n", 442 | " {\n", 443 | " \"name\": \"Nationality\",\n", 444 | " \"type\": \"Varchar(40)\",\n", 445 | " \"not_null\": true\n", 446 | " },\n", 447 | " {\n", 448 | " \"name\": \"BirthDate\",\n", 449 | "\n", 450 | " \"type\": \"Date\",\n", 451 | " \"not_null\": true\n", 452 | " }\n", 453 | " ]\n", 454 | " },\n", 455 | " {\n", 456 | " \"name\": \"Ratings\",\n", 457 | " \"fields\": [\n", 458 | " {\n", 459 | " \"name\": \"RatingID\",\n", 460 | " \"type\": \"Integer\",\n", 461 | " \"\n", 462 | "primary_key\": true,\n", 463 | " \"not_null\": true\n", 464 | " },\n", 465 | " {\n", 466 | " \"name\": \"Rating\",\n", 467 | " \"type\": \"Decimal(4, 2)\",\n", 468 | " \"not_null\": true\n", 469 | " },\n", 470 | " {\n", 471 | " \"name\": \"Source\",\n", 472 | "\n", 473 | " \"type\": \"Varchar(75)\",\n", 474 | " \"not_null\": true\n", 475 | " }\n", 476 | " ]\n", 477 | " },\n", 478 | " {\n", 479 | " \"name\": \"Cinema\",\n", 480 | " \"fields\": [\n", 481 | " {\n", 482 | " \"name\": \"CinemaID\",\n", 483 | " \"type\": \"Integer\n", 484 | "\",\n", 485 | " \"primary_key\": true,\n", 486 | " \"not_null\": true\n", 487 | " },\n", 488 | " {\n", 489 | " \"name\": \"CinemaName\",\n", 490 | " \"type\": \"Varchar(100)\",\n", 491 | " \"not_null\": true\n", 492 | " },\n", 493 | " {\n", 494 | " \"name\n", 495 | "\": \"Country\",\n", 496 | " \"type\": \"Varchar(100)\",\n", 497 | " \"not_null\": true\n", 498 | " },\n", 499 | " {\n", 500 | " \"name\": \"State\",\n", 501 | " \"type\": \"Varchar(100)\",\n", 502 | " \"not_null\": true\n", 503 | " },\n", 504 | "\n", 505 | " {\n", 506 | " \"name\": \"City\",\n", 507 | " \"type\": \"Varchar(100)\",\n", 508 | " \"not_null\": true\n", 
509 | " },\n", 510 | " {\n", 511 | " \"name\": \"StreetName\",\n", 512 | " \"type\": \"Varchar(100)\",\n", 513 | " \"not_\n", 514 | "null\": true\n", 515 | " },\n", 516 | " {\n", 517 | " \"name\": \"StreetID\",\n", 518 | " \"type\": \"Varchar(20)\",\n", 519 | " \"not_null\": true\n", 520 | " }\n", 521 | " ]\n", 522 | " }\n", 523 | " ],\n", 524 | " \"relationships\": [\n", 525 | " {\n", 526 | " \"name\":\n", 527 | " \"Actor_Award\",\n", 528 | " \"type\": \"many-to-many\",\n", 529 | " \"source_entity\": \"Actor\",\n", 530 | " \"source_field\": \"ActorID\",\n", 531 | " \"target_entity\": \"Award\",\n", 532 | " \"target_field\": \"AwardID\"\n", 533 | " },\n", 534 | " \n", 535 | "{\n", 536 | " \"name\": \"Movie_Genre\",\n", 537 | " \"type\": \"many-to-one\",\n", 538 | " \"source_entity\": \"Movie\",\n", 539 | " \"source_field\": \"GenreID\",\n", 540 | " \"target_entity\": \"Genre\",\n", 541 | " \"target_field\": \"GenreID\n", 542 | "\"\n", 543 | " },\n", 544 | " {\n", 545 | " \"name\": \"Movie_Language\",\n", 546 | " \"type\": \"many-to-one\",\n", 547 | " \"source_entity\": \"Movie\",\n", 548 | " \"source_field\": \"LanguageID\",\n", 549 | " \"target_entity\": \"Language\",\n", 550 | " \"target\n", 551 | "_field\": \"LanguageID\"\n", 552 | " },\n", 553 | " {\n", 554 | " \"name\": \"Movie_Director\",\n", 555 | " \"type\": \"many-to-one\",\n", 556 | " \"source_entity\": \"Movie\",\n", 557 | " \"source_field\": \"DirectorID\",\n", 558 | " \"target_entity\": \"\n", 559 | "Director\",\n", 560 | " \"target_field\": \"DirectorID\"\n", 561 | " },\n", 562 | " {\n", 563 | " \"name\": \"Movie_Ratings\",\n", 564 | " \"type\": \"one-to-many\",\n", 565 | " \"source_entity\": \"Movie\",\n", 566 | " \"source_field\": \"MovieID\",\n", 567 | " \n", 568 | "\"target_entity\": \"Ratings\",\n", 569 | " \"target_field\": \"RatingID\"\n", 570 | " },\n", 571 | " {\n", 572 | " \"name\": \"Movie_Cinema\",\n", 573 | " \"type\": \"many-to-one\",\n", 574 | " \"source_entity\": \"Movie\",\n", 575 | " \"source_field\":\n", 576 | " \"CinemaID\",\n", 577 | " \"target_entity\": \"Cinema\",\n", 578 | " \"target_field\": \"CinemaID\"\n", 579 | " }\n", 580 | " ]\n", 581 | "}\n", 582 | "```\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "\n", 588 | "image_er_url = \"https://github.com/Joshua-Yu/graph-rag/raw/main/gemini-multimodal/resources/movie-er.jpg\"\n", 589 | "image_er = load_image_from_url(image_er_url) \n", 590 | "\n", 591 | "prompt = \"Document the entities and relationships in this ER diagram and structure your response in JSON format for entity, relationship and their fields.\"\n", 592 | "\n", 593 | "contents = [prompt, image_er]\n", 594 | "\n", 595 | "# Use a more deterministic configuration with a low temperature\n", 596 | "generation_config = GenerationConfig(\n", 597 | " temperature=0.1,\n", 598 | " top_p=0.8,\n", 599 | " top_k=40,\n", 600 | " candidate_count=1,\n", 601 | " max_output_tokens=2048,\n", 602 | ")\n", 603 | "\n", 604 | "responses = multimodal_model.generate_content(\n", 605 | " contents,\n", 606 | " generation_config=generation_config,\n", 607 | " stream=True,\n", 608 | ")\n", 609 | "\n", 610 | "# print(\"-------Prompt--------\")\n", 611 | "# print_multimodal_prompt(contents)\n", 612 | "\n", 613 | "finalResponse = \"\"\n", 614 | "\n", 615 | "print(\"\\n-------Response--------\")\n", 616 | "for response in responses:\n", 617 | " # Because streaming mode is enabled, we need to collect all pieces of generated text\n", 618 | " finalResponse += response.text\n", 619 | " print(response.text)\n", 620 | 
"\n", 621 | " " 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 8, 627 | "metadata": { 628 | "tags": [] 629 | }, 630 | "outputs": [], 631 | "source": [ 632 | "import json\n", 633 | "\n", 634 | "#finalResponse = finalResponseCache\n", 635 | "\n", 636 | "finalResponse = finalResponse.replace(\"true\", \"\\\"true\\\"\").replace(\"false\", \"\\\"false\\\"\").replace(\"_\",\"-\").replace(\"```json\", \"\").replace(\"```\",\"\").strip()\n", 637 | "\n", 638 | "responseJson = json.loads(finalResponse)\n", 639 | "\n", 640 | "#print(responseJson)\n", 641 | "\n", 642 | "\n", 643 | "\n" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 9, 649 | "metadata": { 650 | "tags": [] 651 | }, 652 | "outputs": [ 653 | { 654 | "name": "stdout", 655 | "output_type": "stream", 656 | "text": [ 657 | "CREATE CONSTRAINT Actor_ActorID_unique FOR (n:Actor) REQUIRE n.ActorID IS UNIQUE;\n", 658 | "CREATE CONSTRAINT Actor_FirstName_notnull FOR (n:Actor) REQUIRE author.FirstName IS NOT NULL;\n", 659 | "CREATE CONSTRAINT Actor_LastName_notnull FOR (n:Actor) REQUIRE author.LastName IS NOT NULL;\n", 660 | "CREATE CONSTRAINT Actor_Nationality_notnull FOR (n:Actor) REQUIRE author.Nationality IS NOT NULL;\n", 661 | "CREATE CONSTRAINT Actor_BirthDate_notnull FOR (n:Actor) REQUIRE author.BirthDate IS NOT NULL;\n", 662 | "CREATE CONSTRAINT Award_AwardID_unique FOR (n:Award) REQUIRE n.AwardID IS UNIQUE;\n", 663 | "CREATE CONSTRAINT Award_AwardName_notnull FOR (n:Award) REQUIRE author.AwardName IS NOT NULL;\n", 664 | "CREATE CONSTRAINT Genre_GenreID_unique FOR (n:Genre) REQUIRE n.GenreID IS UNIQUE;\n", 665 | "CREATE CONSTRAINT Genre_GenreName_notnull FOR (n:Genre) REQUIRE author.GenreName IS NOT NULL;\n", 666 | "CREATE CONSTRAINT Language_LanguageID_unique FOR (n:Language) REQUIRE n.LanguageID IS UNIQUE;\n", 667 | "CREATE CONSTRAINT Language_Language_notnull FOR (n:Language) REQUIRE author.Language IS NOT NULL;\n", 668 | "CREATE CONSTRAINT Movie_MovieID_unique FOR (n:Movie) REQUIRE n.MovieID IS UNIQUE;\n", 669 | "CREATE CONSTRAINT Movie_DirectorID_notnull FOR (n:Movie) REQUIRE author.DirectorID IS NOT NULL;\n", 670 | "CREATE CONSTRAINT Movie_GenreID_notnull FOR (n:Movie) REQUIRE author.GenreID IS NOT NULL;\n", 671 | "CREATE CONSTRAINT Movie_Title_notnull FOR (n:Movie) REQUIRE author.Title IS NOT NULL;\n", 672 | "CREATE CONSTRAINT Movie_ReleaseYear_notnull FOR (n:Movie) REQUIRE author.ReleaseYear IS NOT NULL;\n", 673 | "CREATE CONSTRAINT Movie_Rating_notnull FOR (n:Movie) REQUIRE author.Rating IS NOT NULL;\n", 674 | "CREATE CONSTRAINT Movie_Plot_notnull FOR (n:Movie) REQUIRE author.Plot IS NOT NULL;\n", 675 | "CREATE CONSTRAINT Movie_MovieLength_notnull FOR (n:Movie) REQUIRE author.MovieLength IS NOT NULL;\n", 676 | "CREATE CONSTRAINT Director_DirectorID_unique FOR (n:Director) REQUIRE n.DirectorID IS UNIQUE;\n", 677 | "CREATE CONSTRAINT Director_FirstName_notnull FOR (n:Director) REQUIRE author.FirstName IS NOT NULL;\n", 678 | "CREATE CONSTRAINT Director_LastName_notnull FOR (n:Director) REQUIRE author.LastName IS NOT NULL;\n", 679 | "CREATE CONSTRAINT Director_Nationality_notnull FOR (n:Director) REQUIRE author.Nationality IS NOT NULL;\n", 680 | "CREATE CONSTRAINT Director_BirthDate_notnull FOR (n:Director) REQUIRE author.BirthDate IS NOT NULL;\n", 681 | "CREATE CONSTRAINT Ratings_RatingID_unique FOR (n:Ratings) REQUIRE n.RatingID IS UNIQUE;\n", 682 | "CREATE CONSTRAINT Ratings_Rating_notnull FOR (n:Ratings) REQUIRE author.Rating IS NOT NULL;\n", 683 | "CREATE CONSTRAINT 
Ratings_Source_notnull FOR (n:Ratings) REQUIRE author.Source IS NOT NULL;\n", 684 | "CREATE CONSTRAINT Cinema_CinemaID_unique FOR (n:Cinema) REQUIRE n.CinemaID IS UNIQUE;\n", 685 | "CREATE CONSTRAINT Cinema_CinemaName_notnull FOR (n:Cinema) REQUIRE author.CinemaName IS NOT NULL;\n", 686 | "CREATE CONSTRAINT Cinema_Country_notnull FOR (n:Cinema) REQUIRE author.Country IS NOT NULL;\n", 687 | "CREATE CONSTRAINT Cinema_State_notnull FOR (n:Cinema) REQUIRE author.State IS NOT NULL;\n", 688 | "CREATE CONSTRAINT Cinema_City_notnull FOR (n:Cinema) REQUIRE author.City IS NOT NULL;\n", 689 | "CREATE CONSTRAINT Cinema_StreetName_notnull FOR (n:Cinema) REQUIRE author.StreetName IS NOT NULL;\n", 690 | "CREATE CONSTRAINT Cinema_StreetID_notnull FOR (n:Cinema) REQUIRE author.StreetID IS NOT NULL;\n", 691 | "\n" 692 | ] 693 | } 694 | ], 695 | "source": [ 696 | "\n", 697 | "cypher_constraints = \"\"\n", 698 | "cypher_load_nodes = \"\"\n", 699 | "cypher_build_relationships = \"\"\n", 700 | "\n", 701 | "# a.1 Iterate through entities and create UNIQUENESS and/or EXISTENCE constraints\n", 702 | "for entity in responseJson.get(\"entities\"): \n", 703 | " entity_name = entity.get(\"name\")\n", 704 | " \n", 705 | " for field in entity.get(\"fields\"):\n", 706 | " field_name = field.get(\"name\")\n", 707 | " \n", 708 | " if field.get(\"primary-key\") == \"true\": \n", 709 | " cypher = f\"CREATE CONSTRAINT {entity_name}_{field_name}_unique FOR (n:{entity_name}) REQUIRE n.{field_name} IS UNIQUE;\\n\"\n", 710 | " cypher_constraints += cypher\n", 711 | " \n", 712 | " if field.get(\"not-null\") == \"true\" and not field.get(\"primary-key\") == \"true\": # primary-key must be unique\n", 713 | " cypher = f\"CREATE CONSTRAINT {entity_name}_{field_name}_notnull FOR (n:{entity_name}) REQUIRE n.{field_name} IS NOT NULL;\\n\"\n", 714 | " cypher_constraints += cypher\n", 715 | "\n", 716 | " \n", 717 | "print(cypher_constraints)\n", 718 | "\n", 719 | "\n", 720 | "\n", 721 | " \n", 722 | "# TO DO: \n", 723 | "# - composite key\n", 724 | "# - check and override property value\n", 725 | "# - assume every table has a primary key\n", 726 | "# - entity file, relationship file\n", 727 | " " 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 10, 733 | "metadata": { 734 | "tags": [] 735 | }, 736 | "outputs": [ 737 | { 738 | "name": "stdout", 739 | "output_type": "stream", 740 | "text": [ 741 | "// ----------- LOAD CSV for nodes of Actor -----------\n", 742 | ":auto\n", 743 | "WITH $filename\n", 744 | "LOAD CSV FROM $filename AS line\n", 745 | "CALL {\n", 746 | "MERGE (n:Actor {ActorID:line.ActorID})\n", 747 | "SET n.FirstName = line.FirstName,n.LastName = line.LastName,n.Nationality = line.Nationality,n.BirthDate = line.BirthDate\n", 748 | "} IN TRANSACTIONS 2000;\n", 749 | "// ----------- LOAD CSV for nodes of Award -----------\n", 750 | ":auto\n", 751 | "WITH $filename\n", 752 | "LOAD CSV FROM $filename AS line\n", 753 | "CALL {\n", 754 | "MERGE (n:Award {AwardID:line.AwardID})\n", 755 | "SET n.AwardName = line.AwardName\n", 756 | "} IN TRANSACTIONS 2000;\n", 757 | "// ----------- LOAD CSV for nodes of Genre -----------\n", 758 | ":auto\n", 759 | "WITH $filename\n", 760 | "LOAD CSV FROM $filename AS line\n", 761 | "CALL {\n", 762 | "MERGE (n:Genre {GenreID:line.GenreID})\n", 763 | "SET n.GenreName = line.GenreName\n", 764 | "} IN TRANSACTIONS 2000;\n", 765 | "// ----------- LOAD CSV for nodes of Language -----------\n", 766 | ":auto\n", 767 | "WITH $filename\n", 768 | "LOAD CSV FROM $filename AS 
line\n", 769 | "CALL {\n", 770 | "MERGE (n:Language {LanguageID:line.LanguageID})\n", 771 | "SET n.Language = line.Language\n", 772 | "} IN TRANSACTIONS 2000;\n", 773 | "// ----------- LOAD CSV for nodes of Movie -----------\n", 774 | ":auto\n", 775 | "WITH $filename\n", 776 | "LOAD CSV FROM $filename AS line\n", 777 | "CALL {\n", 778 | "MERGE (n:Movie {MovieID:line.MovieID})\n", 779 | "SET n.DirectorID = line.DirectorID,n.GenreID = line.GenreID,n.Title = line.Title,n.ReleaseYear = line.ReleaseYear,n.Rating = line.Rating,n.Plot = line.Plot,n.MovieLength = line.MovieLength\n", 780 | "} IN TRANSACTIONS 2000;\n", 781 | "// ----------- LOAD CSV for nodes of Director -----------\n", 782 | ":auto\n", 783 | "WITH $filename\n", 784 | "LOAD CSV FROM $filename AS line\n", 785 | "CALL {\n", 786 | "MERGE (n:Director {DirectorID:line.DirectorID})\n", 787 | "SET n.FirstName = line.FirstName,n.LastName = line.LastName,n.Nationality = line.Nationality,n.BirthDate = line.BirthDate\n", 788 | "} IN TRANSACTIONS 2000;\n", 789 | "// ----------- LOAD CSV for nodes of Ratings -----------\n", 790 | ":auto\n", 791 | "WITH $filename\n", 792 | "LOAD CSV FROM $filename AS line\n", 793 | "CALL {\n", 794 | "MERGE (n:Ratings {RatingID:line.RatingID})\n", 795 | "SET n.Rating = line.Rating,n.Source = line.Source\n", 796 | "} IN TRANSACTIONS 2000;\n", 797 | "// ----------- LOAD CSV for nodes of Cinema -----------\n", 798 | ":auto\n", 799 | "WITH $filename\n", 800 | "LOAD CSV FROM $filename AS line\n", 801 | "CALL {\n", 802 | "MERGE (n:Cinema {CinemaID:line.CinemaID})\n", 803 | "SET n.CinemaName = line.CinemaName,n.Country = line.Country,n.State = line.State,n.City = line.City,n.StreetName = line.StreetName,n.StreetID = line.StreetID\n", 804 | "} IN TRANSACTIONS 2000;\n" 805 | ] 806 | } 807 | ], 808 | "source": [ 809 | "\n", 810 | "\n", 811 | "# a.2 Iterate through entities and generate LOAD CSV statement based on entity schema\n", 812 | "#\n", 813 | "# Description: \n", 814 | "# - load data from a CSV having column headers\n", 815 | "# - bulk commit\n", 816 | "# - using MERGE for UPSERT\n", 817 | "\n", 818 | "\n", 819 | "for entity in responseJson.get(\"entities\"): \n", 820 | " entity_name = entity.get(\"name\")\n", 821 | " \n", 822 | " cypher_load_nodes = f\"// ----------- LOAD CSV for nodes of {entity_name} -----------\\n\"\n", 823 | " cypher_load_nodes += f\":auto\\nWITH $filename\\nLOAD CSV FROM $filename AS line\\nCALL \" + \"{\" + f\"\\nMERGE (n:{entity_name} \"\n", 824 | " set_statement = \"SET \"\n", 825 | " \n", 826 | " for field in entity.get(\"fields\"):\n", 827 | " field_name = field.get(\"name\")\n", 828 | " \n", 829 | " if field.get(\"primary-key\") == \"true\": \n", 830 | " cypher_load_nodes += \"{\" + f\"{field_name}:line.{field_name}\" + \"}\" + \")\\n\"\n", 831 | " else: \n", 832 | " set_statement = set_statement + f\"n.{field_name} = line.{field_name},\"\n", 833 | " \n", 834 | " cypher_load_nodes += set_statement[:len(set_statement)-1]\n", 835 | " cypher_load_nodes += \"\\n} IN TRANSACTIONS 2000;\"\n", 836 | " \n", 837 | " print(cypher_load_nodes)\n", 838 | " " 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 11, 844 | "metadata": { 845 | "tags": [] 846 | }, 847 | "outputs": [ 848 | { 849 | "name": "stdout", 850 | "output_type": "stream", 851 | "text": [ 852 | "// ----------- LOAD CSV for relationships of Actor-Award -----------\n", 853 | ":auto\n", 854 | "WITH $filename\n", 855 | "LOAD CSV FROM $filename AS line\n", 856 | "CALL {\n", 857 | "MATCH 
(e1:Actor{ActorID:line.ActorID})\n", 858 | "MATCH (e2:Award{AwardID:line.AwardID})\n", 859 | "MERGE (e1) -[:HAS_AWARD]-> (e2)\n", 860 | "} IN TRANSACTIONS 2000;\n", 861 | "\n", 862 | "// ----------- LOAD CSV for relationships of Movie-Genre -----------\n", 863 | ":auto\n", 864 | "WITH $filename\n", 865 | "LOAD CSV FROM $filename AS line\n", 866 | "CALL {\n", 867 | "MATCH (e1:Movie{GenreID:line.GenreID})\n", 868 | "MATCH (e2:Genre{GenreID:line.GenreID})\n", 869 | "MERGE (e1) -[:HAS_GENRE]-> (e2)\n", 870 | "} IN TRANSACTIONS 2000;\n", 871 | "\n", 872 | "// ----------- LOAD CSV for relationships of Movie-Language -----------\n", 873 | ":auto\n", 874 | "WITH $filename\n", 875 | "LOAD CSV FROM $filename AS line\n", 876 | "CALL {\n", 877 | "MATCH (e1:Movie{LanguageID:line.LanguageID})\n", 878 | "MATCH (e2:Language{LanguageID:line.LanguageID})\n", 879 | "MERGE (e1) -[:HAS_LANGUAGE]-> (e2)\n", 880 | "} IN TRANSACTIONS 2000;\n", 881 | "\n", 882 | "// ----------- LOAD CSV for relationships of Movie-Director -----------\n", 883 | ":auto\n", 884 | "WITH $filename\n", 885 | "LOAD CSV FROM $filename AS line\n", 886 | "CALL {\n", 887 | "MATCH (e1:Movie{DirectorID:line.DirectorID})\n", 888 | "MATCH (e2:Director{DirectorID:line.DirectorID})\n", 889 | "MERGE (e1) -[:HAS_DIRECTOR]-> (e2)\n", 890 | "} IN TRANSACTIONS 2000;\n", 891 | "\n", 892 | "// ----------- LOAD CSV for relationships of Movie-Ratings -----------\n", 893 | ":auto\n", 894 | "WITH $filename\n", 895 | "LOAD CSV FROM $filename AS line\n", 896 | "CALL {\n", 897 | "MATCH (e1:Movie{MovieID:line.MovieID})\n", 898 | "MATCH (e2:Ratings{RatingID:line.RatingID})\n", 899 | "MERGE (e1) -[:HAS_RATINGS]-> (e2)\n", 900 | "} IN TRANSACTIONS 2000;\n", 901 | "\n", 902 | "// ----------- LOAD CSV for relationships of Movie-Cinema -----------\n", 903 | ":auto\n", 904 | "WITH $filename\n", 905 | "LOAD CSV FROM $filename AS line\n", 906 | "CALL {\n", 907 | "MATCH (e1:Movie{CinemaID:line.CinemaID})\n", 908 | "MATCH (e2:Cinema{CinemaID:line.CinemaID})\n", 909 | "MERGE (e1) -[:HAS_CINEMA]-> (e2)\n", 910 | "} IN TRANSACTIONS 2000;\n", 911 | "\n" 912 | ] 913 | } 914 | ], 915 | "source": [ 916 | "\n", 917 | "# a.3 Iterate Relationships to create relationship between nodes\n", 918 | "#\n", 919 | "# Naming conventions: \n", 920 | "# - for source_node:SourceLabel -> target_node:TargetLabel, the name of relationship is HAS_ in big cases\n", 921 | "\n", 922 | "for relationship in responseJson.get(\"relationships\"):\n", 923 | " source_entity = relationship.get(\"source-entity\")\n", 924 | " source_field = relationship.get(\"source-field\")\n", 925 | " target_entity = relationship.get(\"target-entity\")\n", 926 | " target_field = relationship.get(\"target-field\")\n", 927 | " relation_name = relationship.get(\"name\")\n", 928 | " \n", 929 | " \n", 930 | " cypher_create_relationships = f\"// ----------- LOAD CSV for relationships of {relation_name} -----------\\n\"\n", 931 | " cypher_create_relationships += f\":auto\\nWITH $filename\\nLOAD CSV FROM $filename AS line\\nCALL \" + \"{\\n\"\n", 932 | " cypher_create_relationships += f\"MATCH (e1:{source_entity}\" + \"{\" + f\"{source_field}:line.{source_field}\" + \"})\\n\"\n", 933 | " cypher_create_relationships += f\"MATCH (e2:{target_entity}\" + \"{\" + f\"{target_field}:line.{target_field}\" + \"})\\n\"\n", 934 | " cypher_create_relationships += f\"MERGE (e1) -[:HAS_{target_entity.upper()}]-> (e2)\\n\" + \"} IN TRANSACTIONS 2000;\\n\"\n", 935 | " \n", 936 | " print(cypher_create_relationships)\n", 937 | "\n" 938 | ] 939 | }, 
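The three cells above only print the generated Cypher. As a possible next step, the constraint statements can be applied directly through the Python driver; the `LOAD CSV` scripts are easier to run from Neo4j Browser or cypher-shell, since they use the `:auto` prefix and a `$filename` parameter. The snippet below is a sketch rather than part of the original notebook: the connection details are placeholders, and it assumes the statements collected in `cypher_constraints` are valid Cypher for your Neo4j version.

```python
# Hypothetical follow-up: apply the generated constraint statements to Neo4j.
from neo4j import GraphDatabase

NEO4J_URL = "neo4j+s://<your-instance>.databases.neo4j.io"  # placeholder connection details
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "<password>"

driver = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD))

with driver.session() as session:
    # cypher_constraints holds one statement per line, each terminated by ';'
    for statement in cypher_constraints.split(";"):
        statement = statement.strip()
        if statement:
            session.run(statement)

driver.close()
print("Constraints applied.")
```

Re-running this will fail if constraints with the same names already exist; adding `IF NOT EXISTS` to the generated statements makes the script idempotent.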
940 | { 941 | "cell_type": "code", 942 | "execution_count": null, 943 | "metadata": {}, 944 | "outputs": [], 945 | "source": [] 946 | } 947 | ], 948 | "metadata": { 949 | "colab": { 950 | "provenance": [], 951 | "toc_visible": true 952 | }, 953 | "environment": { 954 | "kernel": "python3", 955 | "name": "common-cpu.m113", 956 | "type": "gcloud", 957 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m113" 958 | }, 959 | "kernelspec": { 960 | "display_name": "Python (Local)", 961 | "language": "python", 962 | "name": "base" 963 | }, 964 | "language_info": { 965 | "codemirror_mode": { 966 | "name": "ipython", 967 | "version": 3 968 | }, 969 | "file_extension": ".py", 970 | "mimetype": "text/x-python", 971 | "name": "python", 972 | "nbconvert_exporter": "python", 973 | "pygments_lexer": "ipython3", 974 | "version": "3.10.13" 975 | } 976 | }, 977 | "nbformat": 4, 978 | "nbformat_minor": 4 979 | } 980 | -------------------------------------------------------------------------------- /unstructured-io/Unstructured-IO_PDF_KGLoader.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "pj4wr-FiikUP" 7 | }, 8 | "source": [ 9 | "## Install dependent packages. For unstructured.io, we only install PDF related functions for now." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": "Je-m03WaZInU" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "\n", 21 | "!pip install neo4j\n", 22 | "!pip install unstructured\n", 23 | "!pip install \"unstructured[pdf]\"\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "id": "LKEM-Leji3XR" 30 | }, 31 | "source": [ 32 | "# Install common language packages from NLTK." 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "id": "7qoubIkZaOsK" 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "!pip install --user -U nltk\n", 44 | "!python -m nltk.downloader popular" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "J52SmCoGi-R4" 51 | }, 52 | "source": [ 53 | "# Install **poppler** and **tesseract** which are required by pdf2image. The following lines are for linux and macos only. 
For Windows and other platforms, please refer to :\n", 54 | "- https://pdf2image.readthedocs.io/en/latest/installation.html\n", 55 | "- https://tesseract-ocr.github.io/tessdoc/Installation.html" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "id": "F3-HFx2ugf-C" 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "!sudo apt-get install poppler-utils" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "id": "8XC3wMFzhYUS" 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "!sudo apt-get install tesseract-ocr\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "id": "FF6mMzNJanMT" 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "\n", 89 | "!pip install tesseract" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "id": "D0jggqqrYPVP" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from neo4j import GraphDatabase\n", 101 | "import uuid\n", 102 | "import hashlib\n", 103 | "\n", 104 | "# Neo4j graph database can be self-hosted or local.\n", 105 | "# Or go to https://neo4j.com/cloud/platform/aura-graph-database/ to create a FREE AuraDB instance.\n", 106 | "# Fill conection details below\n", 107 | "NEO4J_URL = \"neo4j+s://.databases.neo4j.io\"\n", 108 | "NEO4J_USER = \"neo4j\"\n", 109 | "NEO4J_PASSWORD = \"\"\n", 110 | "NEO4J_DATABASE = \"neo4j\"" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "id": "QCa_EZRVYPVP" 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "\n", 122 | "def initialiseNeo4j():\n", 123 | " cypher_schema = [\n", 124 | " \"CREATE CONSTRAINT sectionKey IF NOT EXISTS FOR (c:Section) REQUIRE (c.key) IS UNIQUE;\",\n", 125 | " \"CREATE CONSTRAINT chunkKey IF NOT EXISTS FOR (c:Chunk) REQUIRE (c.key) IS UNIQUE;\",\n", 126 | " \"CREATE CONSTRAINT documentKey IF NOT EXISTS FOR (c:Document) REQUIRE (c.url_hash) IS UNIQUE;\",\n", 127 | " \"CREATE CONSTRAINT tableKey IF NOT EXISTS FOR (c:Table) REQUIRE (c.key) IS UNIQUE;\",\n", 128 | " \"CREATE CONSTRAINT elementKey IF NOT EXISTS FOR (c:Element) REQUIRE (c.key) IS UNIQUE;\",\n", 129 | " \"CALL db.index.vector.createNodeIndex('chunkVectorIndex', 'Embedding', 'value', 1536, 'COSINE');\"\n", 130 | " ]\n", 131 | "\n", 132 | " driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))\n", 133 | "\n", 134 | " with driver.session() as session:\n", 135 | " for cypher in cypher_schema:\n", 136 | " session.run(cypher)\n", 137 | " driver.close()\n", 138 | "\n", 139 | "def ingestDocumentNeo4j(elements, doc_location):\n", 140 | "\n", 141 | " cypher_pool = [\n", 142 | " # 0 - Document\n", 143 | " \"MERGE (d:Document {url_hash: $doc_url_hash_val}) ON CREATE SET d.url = $doc_url_val, d.last_modified = $doc_last_modified_val RETURN d;\",\n", 144 | " # 1 - Section\n", 145 | " \"MERGE (p:Section {key: $element_id_val}) ON CREATE SET p:Element, p.page_idx = $page_idx_val, p.title_hash = $title_hash_val, p.block_idx = $block_idx_val, p.title = $title_val, p.tag = $tag_val RETURN p;\",\n", 146 | " # 2 - Link Section with the Document\n", 147 | " \"MATCH (d:Document {url_hash: $doc_url_hash_val}) MATCH (s:Section {key: $element_id_val}) MERGE (d)<-[:HAS_DOCUMENT]-(s);\",\n", 148 | " # 3 - Link Section with a parent Element\n", 149 | " \"MATCH (s1:Section {key: $element_id_val}) MATCH (s2:Element {key: $sec_parent_element_id_val}) MERGE 
(s2)<-[:UNDER_SECTION]-(s1);\",\n", 150 | " # 4 - Chunk\n", 151 | " \"MERGE (c:Chunk {key: $element_id_val}) ON CREATE SET c:Element, c.sentences = $sentences_val, c.sentences_hash = $sentences_hash_val, c.block_idx = $block_idx_val, c.page_idx = $page_idx_val, c.tag = $tag_val RETURN c;\",\n", 152 | " # 5 - Link Chunk to another element\n", 153 | " \"MATCH (c:Chunk {key: $element_id_val}) MATCH (s:Element {key:$chk_parent_element_id_val}) MERGE (s)<-[:HAS_PARENT]-(c);\",\n", 154 | " # 6 - Table\n", 155 | " \"MERGE (t:Table {key: $element_id_val}) ON CREATE SET t:Element, t.name = $name_val, t.doc_url_hash = $doc_url_hash_val, t.block_idx = $block_idx_val, t.page_idx = $page_idx_val, t.html = $html_val, t.rows = $rows_val RETURN t;\",\n", 156 | " # 7 - Link Table to Section\n", 157 | " \"MATCH (t:Table {key: $element_id_val}) MATCH (s:Section {key: $tb_parent_element_id_val}) MERGE (s)<-[:HAS_PARENT]-(t);\",\n", 158 | " # 8 - Link Table to Document\n", 159 | " \"MATCH (t:Table {key: $element_id_val}) MATCH (s:Document {url_hash: $doc_url_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);\",\n", 160 | " # 9 - Image\n", 161 | " \"MERGE (t:Image {key: $element_id_val}) ON CREATE SET t:Element, t.name = $name_val, t.doc_url_hash = $doc_url_hash_val, t.block_idx = $block_idx_val, t.page_idx = $page_idx_val RETURN t;\",\n", 162 | " # 10 - Link Image to Document\n", 163 | " \"MATCH (t:Image {key: $element_id_val}) MATCH (s:Document {url_hash: $doc_url_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);\",\n", 164 | " # 11 - Link top Chunk to Document\n", 165 | " \"MATCH (t:Chunk {key: $element_id_val}) MATCH (s:Document {url_hash: $doc_url_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);\"\n", 166 | " ]\n", 167 | "\n", 168 | " driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))\n", 169 | "\n", 170 | " with driver.session() as session:\n", 171 | " cypher = \"\"\n", 172 | "\n", 173 | " # 1 - Create Document node\n", 174 | " doc_url_val = doc_location\n", 175 | " doc_url_hash_val = hashlib.md5(doc_url_val.encode(\"utf-8\")).hexdigest()\n", 176 | " doc_last_modified_val = elements[0].metadata.last_modified\n", 177 | "\n", 178 | " cypher = cypher_pool[0]\n", 179 | " session.run(cypher, doc_url_hash_val=doc_url_hash_val, doc_url_val=doc_url_val, doc_last_modified_val=doc_last_modified_val)\n", 180 | "\n", 181 | " # 2 - Create Section nodes if element.category = 'Title'\n", 182 | "\n", 183 | " countSection = 0\n", 184 | " countChunk = 0\n", 185 | " countTable = 0\n", 186 | " countImage = 0\n", 187 | "\n", 188 | " # iterate all items in list elements and keep an index i\n", 189 | " for i, sec in enumerate(elements) :\n", 190 | "\n", 191 | " tag_val = sec.category\n", 192 | " page_idx_val = sec.metadata.page_number\n", 193 | " block_idx_val = i\n", 194 | " element_id_val = sec.id\n", 195 | " text_val = sec.text\n", 196 | " text_hash_val = hashlib.md5(text_val.encode(\"utf-8\")).hexdigest()\n", 197 | " parent_id_val = str(sec.metadata.parent_id)\n", 198 | "\n", 199 | " if sec.category == 'Title':\n", 200 | "\n", 201 | " # MERGE section node\n", 202 | " cypher = cypher_pool[1]\n", 203 | " session.run(cypher, page_idx_val=page_idx_val\n", 204 | " , title_hash_val=text_hash_val\n", 205 | " , title_val=text_val\n", 206 | " , tag_val=tag_val\n", 207 | " , block_idx_val=block_idx_val\n", 208 | " , doc_url_hash_val=doc_url_hash_val\n", 209 | " , element_id_val=element_id_val\n", 210 | " )\n", 211 | "\n", 212 | " # Link Section with a parent section or Document\n", 213 | "\n", 214 | " if 
parent_id_val == \"None\": # use Document as parent\n", 215 | " cypher = cypher_pool[2]\n", 216 | " session.run(cypher\n", 217 | " , doc_url_hash_val=doc_url_hash_val\n", 218 | " , element_id_val=element_id_val\n", 219 | " )\n", 220 | "\n", 221 | " else: # use parent section\n", 222 | " cypher = cypher_pool[3]\n", 223 | " session.run(cypher\n", 224 | " , sec_parent_element_id_val=parent_id_val\n", 225 | " , doc_url_hash_val=doc_url_hash_val\n", 226 | " , element_id_val=element_id_val\n", 227 | " )\n", 228 | " # **** if sec_parent_val == \"None\":\n", 229 | "\n", 230 | " countSection += 1\n", 231 | " continue\n", 232 | " # **** for sec in elements: category = 'Title'\n", 233 | "\n", 234 | "\n", 235 | " # ------- Continue within the session block -------\n", 236 | " # 3 - Create Chunk nodes from chunks\n", 237 | "\n", 238 | " if sec.category == 'NarrativeText' or sec.category == 'List' or sec.category == 'ListItem' \\\n", 239 | " or sec.category == 'UncategorizedText' or sec.category == 'Header':\n", 240 | "\n", 241 | "\n", 242 | " # MERGE chunk node\n", 243 | " cypher = cypher_pool[4]\n", 244 | " session.run(cypher, sentences_hash_val=text_hash_val\n", 245 | " , sentences_val=text_val\n", 246 | " , block_idx_val=block_idx_val\n", 247 | " , page_idx_val=page_idx_val\n", 248 | " , tag_val=tag_val\n", 249 | " , doc_url_hash_val=doc_url_hash_val\n", 250 | " , element_id_val=element_id_val\n", 251 | " )\n", 252 | "\n", 253 | " # Link chunk with a parent Element. If none, link it to Document\n", 254 | "\n", 255 | " if not parent_id_val == \"None\":\n", 256 | "\n", 257 | " cypher = cypher_pool[5]\n", 258 | " session.run(cypher\n", 259 | " , doc_url_hash_val=doc_url_hash_val\n", 260 | " , chk_parent_element_id_val=parent_id_val\n", 261 | " , element_id_val=element_id_val\n", 262 | " )\n", 263 | " else: # link chunk to Document\n", 264 | " cypher = cypher_pool[11]\n", 265 | " session.run(cypher\n", 266 | " , doc_url_hash_val=doc_url_hash_val\n", 267 | " , element_id_val=element_id_val\n", 268 | " )\n", 269 | "\n", 270 | " countChunk += 1\n", 271 | " continue\n", 272 | " # **** for sec in elements: Chunk\n", 273 | "\n", 274 | " # 4 - Create Table nodes\n", 275 | "\n", 276 | " if sec.category == 'Table':\n", 277 | "\n", 278 | " html_val = sec.metadata.text_as_html\n", 279 | " # count in html\n", 280 | " rows_val = len(html_val.split(''))\n", 281 | "\n", 282 | " # MERGE table node\n", 283 | "\n", 284 | " cypher = cypher_pool[6]\n", 285 | " session.run(cypher, block_idx_val=block_idx_val\n", 286 | " , page_idx_val=page_idx_val\n", 287 | " , name_val=text_val\n", 288 | " , html_val=html_val\n", 289 | " , rows_val=rows_val\n", 290 | " , doc_url_hash_val=doc_url_hash_val\n", 291 | " , element_id_val=element_id_val\n", 292 | " )\n", 293 | "\n", 294 | " # Link table with a section\n", 295 | " # Table always has a parent section\n", 296 | "\n", 297 | " if not parent_id_val == \"None\":\n", 298 | " cypher = cypher_pool[7]\n", 299 | " session.run(cypher\n", 300 | " , tb_parent_element_id_val=parent_id_val\n", 301 | " , element_id_val=element_id_val\n", 302 | " )\n", 303 | "\n", 304 | " else: # link table to Document\n", 305 | " cypher = cypher_pool[8]\n", 306 | " session.run(cypher\n", 307 | " , doc_url_hash_val=doc_url_hash_val\n", 308 | " , element_id_val=element_id_val\n", 309 | " )\n", 310 | " countTable += 1\n", 311 | " continue\n", 312 | " # **** for sec in elements: category = 'Table'\n", 313 | "\n", 314 | "\n", 315 | " # 5 - Create Image nodes\n", 316 | "\n", 317 | " if sec.category == 'Image':\n", 318 
| "\n", 319 | " # MERGE Image node\n", 320 | "\n", 321 | " cypher = cypher_pool[9]\n", 322 | " session.run(cypher, block_idx_val=block_idx_val\n", 323 | " , page_idx_val=page_idx_val\n", 324 | " , name_val=text_val\n", 325 | " , doc_url_hash_val=doc_url_hash_val\n", 326 | " , element_id_val=element_id_val\n", 327 | " )\n", 328 | "\n", 329 | " # Link image with a section\n", 330 | " # Image always linkes to Document\n", 331 | "\n", 332 | " cypher = cypher_pool[10]\n", 333 | " session.run(cypher\n", 334 | " , image_parent_element_id_val=doc_url_hash_val\n", 335 | " , element_id_val=element_id_val\n", 336 | " , doc_url_hash_val=doc_url_hash_val\n", 337 | " )\n", 338 | "\n", 339 | " countImage += 1\n", 340 | " continue\n", 341 | " # **** for sec in elements: category = 'Image'\n", 342 | " # *** for i, sec in enumerate(elements) :\n", 343 | "\n", 344 | " print(f'\\'{doc_url_val}\\' Done! Summary: ')\n", 345 | " print('#Sections: ' + str(countSection))\n", 346 | " print('#Chunks: ' + str(countChunk))\n", 347 | " print('#Tables: ' + str(countTable))\n", 348 | " print('#Images: ' + str(countImage))\n", 349 | "\n", 350 | " # *** with driver.session() as session:\n", 351 | "\n", 352 | " driver.close()\n", 353 | "\n", 354 | "\n", 355 | "# *** def ingestDocumentNeo4j(elements, doc_location):\n" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "id": "BeAnqWKoYPVQ" 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "\n", 367 | "# create constraints and indexes. only need to execute once.\n", 368 | "\n", 369 | "initialiseNeo4j()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "colab": { 377 | "base_uri": "https://localhost:8080/", 378 | "height": 289, 379 | "referenced_widgets": [ 380 | "1fe6310a1ad0424b82329945262a7e2f", 381 | "a9eaef02d6924c5b83db2ee973b73db5", 382 | "7e7ab500393845698c648b8a83b13018", 383 | "11031b0baf484220808b8b8d0992be3a", 384 | "0fddcf4296e24ead8a69664c2ab905a4", 385 | "2db5b641d66746fa83aec5f5f889c2a6", 386 | "082c5bed11a945448297a20eb5aaa8f7", 387 | "8b7cd68e87bc448d95bba88fe4fdfb60", 388 | "84b62c98c94a47b48be1e8a49edbf8c0", 389 | "1cce7d2834b54f18b3ec1f75daaa66d7", 390 | "f4d54c7600764958a410be1032cf5e61", 391 | "21315a6c975b4e0f8d8a48eb0d507292", 392 | "61da3171628b46c8849ca5dd391dabc4", 393 | "c75e65306fca40bcac323abc321edcf7", 394 | "f358143d63a5430a92f5b5b4dc394c43", 395 | "6c940c35fa2b48b29476865eacbf41cc", 396 | "9bbfa722dc914b41861bcf6f17f75c4b", 397 | "9d315fcf38d14359840a104ca03bbe50", 398 | "90b0568e50aa4da498b3f96c6eddc010", 399 | "be33409a46e84e4f81386c95612febff", 400 | "ebd4aa37f59d44b1b37c585d5d25fb48", 401 | "ee5e2033a3cb4f54bbf79ca371f7c198", 402 | "aa788ed8fa314d63a8167a583ea9571b", 403 | "7ddbaab9d0bc4eeea411563290e832e1", 404 | "67d1d034d1e24f7486e10f423de3c8ae", 405 | "566f884b121645a48aa03e480310308a", 406 | "90ef1d683b254127943e241109ea695c", 407 | "3eba18715eaa4d249a96e6736c32685e", 408 | "948890114b7e4a399412068ec59fe468", 409 | "a0a85fbdefe744f38b18b3cef7efebb8", 410 | "d9eae77de3ab4207b4a8cc5a38e61089", 411 | "2918b40af2574c3a8c9cad8845732016", 412 | "72de4e1fc84e444ebb4004ea409ee425" 413 | ] 414 | }, 415 | "id": "-74otV8nYPVR", 416 | "outputId": "85a13d6a-64dc-4b79-be28-39d2144fb196" 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "from unstructured.partition.pdf import partition_pdf\n", 421 | "from unstructured.staging.base import convert_to_dict\n", 422 | "from unstructured.staging.base import elements_to_json\n", 423 
| "\n", 424 | "\n", 425 | "doc_location = \"example-docs\" # replace this to your document location\n", 426 | "doc_file_name = \"layout-parser-paper.pdf\" # replace this to your document file name\n", 427 | "doc_url = doc_location + \"/\" + doc_file_name\n", 428 | "\n", 429 | "# partition the pdf into elements\n", 430 | "\n", 431 | "elements = partition_pdf(filename=doc_location+\"/\"+doc_file_name,\n", 432 | " infer_table_structure=True\n", 433 | " )\n", 434 | "\n", 435 | "ingestDocumentNeo4j(elements, doc_url)\n", 436 | "\n", 437 | "# DONE\n" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "id": "-tRSOBs3YPVR" 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "\n", 449 | "\n", 450 | "# save the elements as a json file\n", 451 | "\n", 452 | "convert_to_dict(elements)\n", 453 | "\n", 454 | "filename = doc_location+\"/\"+doc_file_name+\".json\"\n", 455 | "elements_to_json(elements, filename=filename)\n" 456 | ] 457 | } 458 | ], 459 | "metadata": { 460 | "colab": { 461 | "provenance": [] 462 | }, 463 | "kernelspec": { 464 | "display_name": "unstructured", 465 | "language": "python", 466 | "name": "python3" 467 | }, 468 | "language_info": { 469 | "codemirror_mode": { 470 | "name": "ipython", 471 | "version": 3 472 | }, 473 | "file_extension": ".py", 474 | "mimetype": "text/x-python", 475 | "name": "python", 476 | "nbconvert_exporter": "python", 477 | "pygments_lexer": "ipython3", 478 | "version": "3.11.6" 479 | }, 480 | "widgets": { 481 | "application/vnd.jupyter.widget-state+json": { 482 | "082c5bed11a945448297a20eb5aaa8f7": { 483 | "model_module": "@jupyter-widgets/controls", 484 | "model_module_version": "1.5.0", 485 | "model_name": "DescriptionStyleModel", 486 | "state": { 487 | "_model_module": "@jupyter-widgets/controls", 488 | "_model_module_version": "1.5.0", 489 | "_model_name": "DescriptionStyleModel", 490 | "_view_count": null, 491 | "_view_module": "@jupyter-widgets/base", 492 | "_view_module_version": "1.2.0", 493 | "_view_name": "StyleView", 494 | "description_width": "" 495 | } 496 | }, 497 | "0fddcf4296e24ead8a69664c2ab905a4": { 498 | "model_module": "@jupyter-widgets/base", 499 | "model_module_version": "1.2.0", 500 | "model_name": "LayoutModel", 501 | "state": { 502 | "_model_module": "@jupyter-widgets/base", 503 | "_model_module_version": "1.2.0", 504 | "_model_name": "LayoutModel", 505 | "_view_count": null, 506 | "_view_module": "@jupyter-widgets/base", 507 | "_view_module_version": "1.2.0", 508 | "_view_name": "LayoutView", 509 | "align_content": null, 510 | "align_items": null, 511 | "align_self": null, 512 | "border": null, 513 | "bottom": null, 514 | "display": null, 515 | "flex": null, 516 | "flex_flow": null, 517 | "grid_area": null, 518 | "grid_auto_columns": null, 519 | "grid_auto_flow": null, 520 | "grid_auto_rows": null, 521 | "grid_column": null, 522 | "grid_gap": null, 523 | "grid_row": null, 524 | "grid_template_areas": null, 525 | "grid_template_columns": null, 526 | "grid_template_rows": null, 527 | "height": null, 528 | "justify_content": null, 529 | "justify_items": null, 530 | "left": null, 531 | "margin": null, 532 | "max_height": null, 533 | "max_width": null, 534 | "min_height": null, 535 | "min_width": null, 536 | "object_fit": null, 537 | "object_position": null, 538 | "order": null, 539 | "overflow": null, 540 | "overflow_x": null, 541 | "overflow_y": null, 542 | "padding": null, 543 | "right": null, 544 | "top": null, 545 | "visibility": null, 546 | "width": null 547 | } 548 | }, 549 
| "11031b0baf484220808b8b8d0992be3a": { 550 | "model_module": "@jupyter-widgets/controls", 551 | "model_module_version": "1.5.0", 552 | "model_name": "HTMLModel", 553 | "state": { 554 | "_dom_classes": [], 555 | "_model_module": "@jupyter-widgets/controls", 556 | "_model_module_version": "1.5.0", 557 | "_model_name": "HTMLModel", 558 | "_view_count": null, 559 | "_view_module": "@jupyter-widgets/controls", 560 | "_view_module_version": "1.5.0", 561 | "_view_name": "HTMLView", 562 | "description": "", 563 | "description_tooltip": null, 564 | "layout": "IPY_MODEL_1cce7d2834b54f18b3ec1f75daaa66d7", 565 | "placeholder": "​", 566 | "style": "IPY_MODEL_f4d54c7600764958a410be1032cf5e61", 567 | "value": " 1.47k/1.47k [00:00<00:00, 44.6kB/s]" 568 | } 569 | }, 570 | "1cce7d2834b54f18b3ec1f75daaa66d7": { 571 | "model_module": "@jupyter-widgets/base", 572 | "model_module_version": "1.2.0", 573 | "model_name": "LayoutModel", 574 | "state": { 575 | "_model_module": "@jupyter-widgets/base", 576 | "_model_module_version": "1.2.0", 577 | "_model_name": "LayoutModel", 578 | "_view_count": null, 579 | "_view_module": "@jupyter-widgets/base", 580 | "_view_module_version": "1.2.0", 581 | "_view_name": "LayoutView", 582 | "align_content": null, 583 | "align_items": null, 584 | "align_self": null, 585 | "border": null, 586 | "bottom": null, 587 | "display": null, 588 | "flex": null, 589 | "flex_flow": null, 590 | "grid_area": null, 591 | "grid_auto_columns": null, 592 | "grid_auto_flow": null, 593 | "grid_auto_rows": null, 594 | "grid_column": null, 595 | "grid_gap": null, 596 | "grid_row": null, 597 | "grid_template_areas": null, 598 | "grid_template_columns": null, 599 | "grid_template_rows": null, 600 | "height": null, 601 | "justify_content": null, 602 | "justify_items": null, 603 | "left": null, 604 | "margin": null, 605 | "max_height": null, 606 | "max_width": null, 607 | "min_height": null, 608 | "min_width": null, 609 | "object_fit": null, 610 | "object_position": null, 611 | "order": null, 612 | "overflow": null, 613 | "overflow_x": null, 614 | "overflow_y": null, 615 | "padding": null, 616 | "right": null, 617 | "top": null, 618 | "visibility": null, 619 | "width": null 620 | } 621 | }, 622 | "1fe6310a1ad0424b82329945262a7e2f": { 623 | "model_module": "@jupyter-widgets/controls", 624 | "model_module_version": "1.5.0", 625 | "model_name": "HBoxModel", 626 | "state": { 627 | "_dom_classes": [], 628 | "_model_module": "@jupyter-widgets/controls", 629 | "_model_module_version": "1.5.0", 630 | "_model_name": "HBoxModel", 631 | "_view_count": null, 632 | "_view_module": "@jupyter-widgets/controls", 633 | "_view_module_version": "1.5.0", 634 | "_view_name": "HBoxView", 635 | "box_style": "", 636 | "children": [ 637 | "IPY_MODEL_a9eaef02d6924c5b83db2ee973b73db5", 638 | "IPY_MODEL_7e7ab500393845698c648b8a83b13018", 639 | "IPY_MODEL_11031b0baf484220808b8b8d0992be3a" 640 | ], 641 | "layout": "IPY_MODEL_0fddcf4296e24ead8a69664c2ab905a4" 642 | } 643 | }, 644 | "21315a6c975b4e0f8d8a48eb0d507292": { 645 | "model_module": "@jupyter-widgets/controls", 646 | "model_module_version": "1.5.0", 647 | "model_name": "HBoxModel", 648 | "state": { 649 | "_dom_classes": [], 650 | "_model_module": "@jupyter-widgets/controls", 651 | "_model_module_version": "1.5.0", 652 | "_model_name": "HBoxModel", 653 | "_view_count": null, 654 | "_view_module": "@jupyter-widgets/controls", 655 | "_view_module_version": "1.5.0", 656 | "_view_name": "HBoxView", 657 | "box_style": "", 658 | "children": [ 659 | 
"IPY_MODEL_61da3171628b46c8849ca5dd391dabc4", 660 | "IPY_MODEL_c75e65306fca40bcac323abc321edcf7", 661 | "IPY_MODEL_f358143d63a5430a92f5b5b4dc394c43" 662 | ], 663 | "layout": "IPY_MODEL_6c940c35fa2b48b29476865eacbf41cc" 664 | } 665 | }, 666 | "2918b40af2574c3a8c9cad8845732016": { 667 | "model_module": "@jupyter-widgets/base", 668 | "model_module_version": "1.2.0", 669 | "model_name": "LayoutModel", 670 | "state": { 671 | "_model_module": "@jupyter-widgets/base", 672 | "_model_module_version": "1.2.0", 673 | "_model_name": "LayoutModel", 674 | "_view_count": null, 675 | "_view_module": "@jupyter-widgets/base", 676 | "_view_module_version": "1.2.0", 677 | "_view_name": "LayoutView", 678 | "align_content": null, 679 | "align_items": null, 680 | "align_self": null, 681 | "border": null, 682 | "bottom": null, 683 | "display": null, 684 | "flex": null, 685 | "flex_flow": null, 686 | "grid_area": null, 687 | "grid_auto_columns": null, 688 | "grid_auto_flow": null, 689 | "grid_auto_rows": null, 690 | "grid_column": null, 691 | "grid_gap": null, 692 | "grid_row": null, 693 | "grid_template_areas": null, 694 | "grid_template_columns": null, 695 | "grid_template_rows": null, 696 | "height": null, 697 | "justify_content": null, 698 | "justify_items": null, 699 | "left": null, 700 | "margin": null, 701 | "max_height": null, 702 | "max_width": null, 703 | "min_height": null, 704 | "min_width": null, 705 | "object_fit": null, 706 | "object_position": null, 707 | "order": null, 708 | "overflow": null, 709 | "overflow_x": null, 710 | "overflow_y": null, 711 | "padding": null, 712 | "right": null, 713 | "top": null, 714 | "visibility": null, 715 | "width": null 716 | } 717 | }, 718 | "2db5b641d66746fa83aec5f5f889c2a6": { 719 | "model_module": "@jupyter-widgets/base", 720 | "model_module_version": "1.2.0", 721 | "model_name": "LayoutModel", 722 | "state": { 723 | "_model_module": "@jupyter-widgets/base", 724 | "_model_module_version": "1.2.0", 725 | "_model_name": "LayoutModel", 726 | "_view_count": null, 727 | "_view_module": "@jupyter-widgets/base", 728 | "_view_module_version": "1.2.0", 729 | "_view_name": "LayoutView", 730 | "align_content": null, 731 | "align_items": null, 732 | "align_self": null, 733 | "border": null, 734 | "bottom": null, 735 | "display": null, 736 | "flex": null, 737 | "flex_flow": null, 738 | "grid_area": null, 739 | "grid_auto_columns": null, 740 | "grid_auto_flow": null, 741 | "grid_auto_rows": null, 742 | "grid_column": null, 743 | "grid_gap": null, 744 | "grid_row": null, 745 | "grid_template_areas": null, 746 | "grid_template_columns": null, 747 | "grid_template_rows": null, 748 | "height": null, 749 | "justify_content": null, 750 | "justify_items": null, 751 | "left": null, 752 | "margin": null, 753 | "max_height": null, 754 | "max_width": null, 755 | "min_height": null, 756 | "min_width": null, 757 | "object_fit": null, 758 | "object_position": null, 759 | "order": null, 760 | "overflow": null, 761 | "overflow_x": null, 762 | "overflow_y": null, 763 | "padding": null, 764 | "right": null, 765 | "top": null, 766 | "visibility": null, 767 | "width": null 768 | } 769 | }, 770 | "3eba18715eaa4d249a96e6736c32685e": { 771 | "model_module": "@jupyter-widgets/base", 772 | "model_module_version": "1.2.0", 773 | "model_name": "LayoutModel", 774 | "state": { 775 | "_model_module": "@jupyter-widgets/base", 776 | "_model_module_version": "1.2.0", 777 | "_model_name": "LayoutModel", 778 | "_view_count": null, 779 | "_view_module": "@jupyter-widgets/base", 780 | "_view_module_version": 
"1.2.0", 781 | "_view_name": "LayoutView", 782 | "align_content": null, 783 | "align_items": null, 784 | "align_self": null, 785 | "border": null, 786 | "bottom": null, 787 | "display": null, 788 | "flex": null, 789 | "flex_flow": null, 790 | "grid_area": null, 791 | "grid_auto_columns": null, 792 | "grid_auto_flow": null, 793 | "grid_auto_rows": null, 794 | "grid_column": null, 795 | "grid_gap": null, 796 | "grid_row": null, 797 | "grid_template_areas": null, 798 | "grid_template_columns": null, 799 | "grid_template_rows": null, 800 | "height": null, 801 | "justify_content": null, 802 | "justify_items": null, 803 | "left": null, 804 | "margin": null, 805 | "max_height": null, 806 | "max_width": null, 807 | "min_height": null, 808 | "min_width": null, 809 | "object_fit": null, 810 | "object_position": null, 811 | "order": null, 812 | "overflow": null, 813 | "overflow_x": null, 814 | "overflow_y": null, 815 | "padding": null, 816 | "right": null, 817 | "top": null, 818 | "visibility": null, 819 | "width": null 820 | } 821 | }, 822 | "566f884b121645a48aa03e480310308a": { 823 | "model_module": "@jupyter-widgets/controls", 824 | "model_module_version": "1.5.0", 825 | "model_name": "HTMLModel", 826 | "state": { 827 | "_dom_classes": [], 828 | "_model_module": "@jupyter-widgets/controls", 829 | "_model_module_version": "1.5.0", 830 | "_model_name": "HTMLModel", 831 | "_view_count": null, 832 | "_view_module": "@jupyter-widgets/controls", 833 | "_view_module_version": "1.5.0", 834 | "_view_name": "HTMLView", 835 | "description": "", 836 | "description_tooltip": null, 837 | "layout": "IPY_MODEL_2918b40af2574c3a8c9cad8845732016", 838 | "placeholder": "​", 839 | "style": "IPY_MODEL_72de4e1fc84e444ebb4004ea409ee425", 840 | "value": " 46.8M/46.8M [00:00<00:00, 103MB/s]" 841 | } 842 | }, 843 | "61da3171628b46c8849ca5dd391dabc4": { 844 | "model_module": "@jupyter-widgets/controls", 845 | "model_module_version": "1.5.0", 846 | "model_name": "HTMLModel", 847 | "state": { 848 | "_dom_classes": [], 849 | "_model_module": "@jupyter-widgets/controls", 850 | "_model_module_version": "1.5.0", 851 | "_model_name": "HTMLModel", 852 | "_view_count": null, 853 | "_view_module": "@jupyter-widgets/controls", 854 | "_view_module_version": "1.5.0", 855 | "_view_name": "HTMLView", 856 | "description": "", 857 | "description_tooltip": null, 858 | "layout": "IPY_MODEL_9bbfa722dc914b41861bcf6f17f75c4b", 859 | "placeholder": "​", 860 | "style": "IPY_MODEL_9d315fcf38d14359840a104ca03bbe50", 861 | "value": "model.safetensors: 100%" 862 | } 863 | }, 864 | "67d1d034d1e24f7486e10f423de3c8ae": { 865 | "model_module": "@jupyter-widgets/controls", 866 | "model_module_version": "1.5.0", 867 | "model_name": "FloatProgressModel", 868 | "state": { 869 | "_dom_classes": [], 870 | "_model_module": "@jupyter-widgets/controls", 871 | "_model_module_version": "1.5.0", 872 | "_model_name": "FloatProgressModel", 873 | "_view_count": null, 874 | "_view_module": "@jupyter-widgets/controls", 875 | "_view_module_version": "1.5.0", 876 | "_view_name": "ProgressView", 877 | "bar_style": "success", 878 | "description": "", 879 | "description_tooltip": null, 880 | "layout": "IPY_MODEL_a0a85fbdefe744f38b18b3cef7efebb8", 881 | "max": 46807446, 882 | "min": 0, 883 | "orientation": "horizontal", 884 | "style": "IPY_MODEL_d9eae77de3ab4207b4a8cc5a38e61089", 885 | "value": 46807446 886 | } 887 | }, 888 | "6c940c35fa2b48b29476865eacbf41cc": { 889 | "model_module": "@jupyter-widgets/base", 890 | "model_module_version": "1.2.0", 891 | "model_name": 
"LayoutModel", 892 | "state": { 893 | "_model_module": "@jupyter-widgets/base", 894 | "_model_module_version": "1.2.0", 895 | "_model_name": "LayoutModel", 896 | "_view_count": null, 897 | "_view_module": "@jupyter-widgets/base", 898 | "_view_module_version": "1.2.0", 899 | "_view_name": "LayoutView", 900 | "align_content": null, 901 | "align_items": null, 902 | "align_self": null, 903 | "border": null, 904 | "bottom": null, 905 | "display": null, 906 | "flex": null, 907 | "flex_flow": null, 908 | "grid_area": null, 909 | "grid_auto_columns": null, 910 | "grid_auto_flow": null, 911 | "grid_auto_rows": null, 912 | "grid_column": null, 913 | "grid_gap": null, 914 | "grid_row": null, 915 | "grid_template_areas": null, 916 | "grid_template_columns": null, 917 | "grid_template_rows": null, 918 | "height": null, 919 | "justify_content": null, 920 | "justify_items": null, 921 | "left": null, 922 | "margin": null, 923 | "max_height": null, 924 | "max_width": null, 925 | "min_height": null, 926 | "min_width": null, 927 | "object_fit": null, 928 | "object_position": null, 929 | "order": null, 930 | "overflow": null, 931 | "overflow_x": null, 932 | "overflow_y": null, 933 | "padding": null, 934 | "right": null, 935 | "top": null, 936 | "visibility": null, 937 | "width": null 938 | } 939 | }, 940 | "72de4e1fc84e444ebb4004ea409ee425": { 941 | "model_module": "@jupyter-widgets/controls", 942 | "model_module_version": "1.5.0", 943 | "model_name": "DescriptionStyleModel", 944 | "state": { 945 | "_model_module": "@jupyter-widgets/controls", 946 | "_model_module_version": "1.5.0", 947 | "_model_name": "DescriptionStyleModel", 948 | "_view_count": null, 949 | "_view_module": "@jupyter-widgets/base", 950 | "_view_module_version": "1.2.0", 951 | "_view_name": "StyleView", 952 | "description_width": "" 953 | } 954 | }, 955 | "7ddbaab9d0bc4eeea411563290e832e1": { 956 | "model_module": "@jupyter-widgets/controls", 957 | "model_module_version": "1.5.0", 958 | "model_name": "HTMLModel", 959 | "state": { 960 | "_dom_classes": [], 961 | "_model_module": "@jupyter-widgets/controls", 962 | "_model_module_version": "1.5.0", 963 | "_model_name": "HTMLModel", 964 | "_view_count": null, 965 | "_view_module": "@jupyter-widgets/controls", 966 | "_view_module_version": "1.5.0", 967 | "_view_name": "HTMLView", 968 | "description": "", 969 | "description_tooltip": null, 970 | "layout": "IPY_MODEL_3eba18715eaa4d249a96e6736c32685e", 971 | "placeholder": "​", 972 | "style": "IPY_MODEL_948890114b7e4a399412068ec59fe468", 973 | "value": "model.safetensors: 100%" 974 | } 975 | }, 976 | "7e7ab500393845698c648b8a83b13018": { 977 | "model_module": "@jupyter-widgets/controls", 978 | "model_module_version": "1.5.0", 979 | "model_name": "FloatProgressModel", 980 | "state": { 981 | "_dom_classes": [], 982 | "_model_module": "@jupyter-widgets/controls", 983 | "_model_module_version": "1.5.0", 984 | "_model_name": "FloatProgressModel", 985 | "_view_count": null, 986 | "_view_module": "@jupyter-widgets/controls", 987 | "_view_module_version": "1.5.0", 988 | "_view_name": "ProgressView", 989 | "bar_style": "success", 990 | "description": "", 991 | "description_tooltip": null, 992 | "layout": "IPY_MODEL_8b7cd68e87bc448d95bba88fe4fdfb60", 993 | "max": 1469, 994 | "min": 0, 995 | "orientation": "horizontal", 996 | "style": "IPY_MODEL_84b62c98c94a47b48be1e8a49edbf8c0", 997 | "value": 1469 998 | } 999 | }, 1000 | "84b62c98c94a47b48be1e8a49edbf8c0": { 1001 | "model_module": "@jupyter-widgets/controls", 1002 | "model_module_version": "1.5.0", 1003 | 
"model_name": "ProgressStyleModel", 1004 | "state": { 1005 | "_model_module": "@jupyter-widgets/controls", 1006 | "_model_module_version": "1.5.0", 1007 | "_model_name": "ProgressStyleModel", 1008 | "_view_count": null, 1009 | "_view_module": "@jupyter-widgets/base", 1010 | "_view_module_version": "1.2.0", 1011 | "_view_name": "StyleView", 1012 | "bar_color": null, 1013 | "description_width": "" 1014 | } 1015 | }, 1016 | "8b7cd68e87bc448d95bba88fe4fdfb60": { 1017 | "model_module": "@jupyter-widgets/base", 1018 | "model_module_version": "1.2.0", 1019 | "model_name": "LayoutModel", 1020 | "state": { 1021 | "_model_module": "@jupyter-widgets/base", 1022 | "_model_module_version": "1.2.0", 1023 | "_model_name": "LayoutModel", 1024 | "_view_count": null, 1025 | "_view_module": "@jupyter-widgets/base", 1026 | "_view_module_version": "1.2.0", 1027 | "_view_name": "LayoutView", 1028 | "align_content": null, 1029 | "align_items": null, 1030 | "align_self": null, 1031 | "border": null, 1032 | "bottom": null, 1033 | "display": null, 1034 | "flex": null, 1035 | "flex_flow": null, 1036 | "grid_area": null, 1037 | "grid_auto_columns": null, 1038 | "grid_auto_flow": null, 1039 | "grid_auto_rows": null, 1040 | "grid_column": null, 1041 | "grid_gap": null, 1042 | "grid_row": null, 1043 | "grid_template_areas": null, 1044 | "grid_template_columns": null, 1045 | "grid_template_rows": null, 1046 | "height": null, 1047 | "justify_content": null, 1048 | "justify_items": null, 1049 | "left": null, 1050 | "margin": null, 1051 | "max_height": null, 1052 | "max_width": null, 1053 | "min_height": null, 1054 | "min_width": null, 1055 | "object_fit": null, 1056 | "object_position": null, 1057 | "order": null, 1058 | "overflow": null, 1059 | "overflow_x": null, 1060 | "overflow_y": null, 1061 | "padding": null, 1062 | "right": null, 1063 | "top": null, 1064 | "visibility": null, 1065 | "width": null 1066 | } 1067 | }, 1068 | "90b0568e50aa4da498b3f96c6eddc010": { 1069 | "model_module": "@jupyter-widgets/base", 1070 | "model_module_version": "1.2.0", 1071 | "model_name": "LayoutModel", 1072 | "state": { 1073 | "_model_module": "@jupyter-widgets/base", 1074 | "_model_module_version": "1.2.0", 1075 | "_model_name": "LayoutModel", 1076 | "_view_count": null, 1077 | "_view_module": "@jupyter-widgets/base", 1078 | "_view_module_version": "1.2.0", 1079 | "_view_name": "LayoutView", 1080 | "align_content": null, 1081 | "align_items": null, 1082 | "align_self": null, 1083 | "border": null, 1084 | "bottom": null, 1085 | "display": null, 1086 | "flex": null, 1087 | "flex_flow": null, 1088 | "grid_area": null, 1089 | "grid_auto_columns": null, 1090 | "grid_auto_flow": null, 1091 | "grid_auto_rows": null, 1092 | "grid_column": null, 1093 | "grid_gap": null, 1094 | "grid_row": null, 1095 | "grid_template_areas": null, 1096 | "grid_template_columns": null, 1097 | "grid_template_rows": null, 1098 | "height": null, 1099 | "justify_content": null, 1100 | "justify_items": null, 1101 | "left": null, 1102 | "margin": null, 1103 | "max_height": null, 1104 | "max_width": null, 1105 | "min_height": null, 1106 | "min_width": null, 1107 | "object_fit": null, 1108 | "object_position": null, 1109 | "order": null, 1110 | "overflow": null, 1111 | "overflow_x": null, 1112 | "overflow_y": null, 1113 | "padding": null, 1114 | "right": null, 1115 | "top": null, 1116 | "visibility": null, 1117 | "width": null 1118 | } 1119 | }, 1120 | "90ef1d683b254127943e241109ea695c": { 1121 | "model_module": "@jupyter-widgets/base", 1122 | "model_module_version": 
"1.2.0", 1123 | "model_name": "LayoutModel", 1124 | "state": { 1125 | "_model_module": "@jupyter-widgets/base", 1126 | "_model_module_version": "1.2.0", 1127 | "_model_name": "LayoutModel", 1128 | "_view_count": null, 1129 | "_view_module": "@jupyter-widgets/base", 1130 | "_view_module_version": "1.2.0", 1131 | "_view_name": "LayoutView", 1132 | "align_content": null, 1133 | "align_items": null, 1134 | "align_self": null, 1135 | "border": null, 1136 | "bottom": null, 1137 | "display": null, 1138 | "flex": null, 1139 | "flex_flow": null, 1140 | "grid_area": null, 1141 | "grid_auto_columns": null, 1142 | "grid_auto_flow": null, 1143 | "grid_auto_rows": null, 1144 | "grid_column": null, 1145 | "grid_gap": null, 1146 | "grid_row": null, 1147 | "grid_template_areas": null, 1148 | "grid_template_columns": null, 1149 | "grid_template_rows": null, 1150 | "height": null, 1151 | "justify_content": null, 1152 | "justify_items": null, 1153 | "left": null, 1154 | "margin": null, 1155 | "max_height": null, 1156 | "max_width": null, 1157 | "min_height": null, 1158 | "min_width": null, 1159 | "object_fit": null, 1160 | "object_position": null, 1161 | "order": null, 1162 | "overflow": null, 1163 | "overflow_x": null, 1164 | "overflow_y": null, 1165 | "padding": null, 1166 | "right": null, 1167 | "top": null, 1168 | "visibility": null, 1169 | "width": null 1170 | } 1171 | }, 1172 | "948890114b7e4a399412068ec59fe468": { 1173 | "model_module": "@jupyter-widgets/controls", 1174 | "model_module_version": "1.5.0", 1175 | "model_name": "DescriptionStyleModel", 1176 | "state": { 1177 | "_model_module": "@jupyter-widgets/controls", 1178 | "_model_module_version": "1.5.0", 1179 | "_model_name": "DescriptionStyleModel", 1180 | "_view_count": null, 1181 | "_view_module": "@jupyter-widgets/base", 1182 | "_view_module_version": "1.2.0", 1183 | "_view_name": "StyleView", 1184 | "description_width": "" 1185 | } 1186 | }, 1187 | "9bbfa722dc914b41861bcf6f17f75c4b": { 1188 | "model_module": "@jupyter-widgets/base", 1189 | "model_module_version": "1.2.0", 1190 | "model_name": "LayoutModel", 1191 | "state": { 1192 | "_model_module": "@jupyter-widgets/base", 1193 | "_model_module_version": "1.2.0", 1194 | "_model_name": "LayoutModel", 1195 | "_view_count": null, 1196 | "_view_module": "@jupyter-widgets/base", 1197 | "_view_module_version": "1.2.0", 1198 | "_view_name": "LayoutView", 1199 | "align_content": null, 1200 | "align_items": null, 1201 | "align_self": null, 1202 | "border": null, 1203 | "bottom": null, 1204 | "display": null, 1205 | "flex": null, 1206 | "flex_flow": null, 1207 | "grid_area": null, 1208 | "grid_auto_columns": null, 1209 | "grid_auto_flow": null, 1210 | "grid_auto_rows": null, 1211 | "grid_column": null, 1212 | "grid_gap": null, 1213 | "grid_row": null, 1214 | "grid_template_areas": null, 1215 | "grid_template_columns": null, 1216 | "grid_template_rows": null, 1217 | "height": null, 1218 | "justify_content": null, 1219 | "justify_items": null, 1220 | "left": null, 1221 | "margin": null, 1222 | "max_height": null, 1223 | "max_width": null, 1224 | "min_height": null, 1225 | "min_width": null, 1226 | "object_fit": null, 1227 | "object_position": null, 1228 | "order": null, 1229 | "overflow": null, 1230 | "overflow_x": null, 1231 | "overflow_y": null, 1232 | "padding": null, 1233 | "right": null, 1234 | "top": null, 1235 | "visibility": null, 1236 | "width": null 1237 | } 1238 | }, 1239 | "9d315fcf38d14359840a104ca03bbe50": { 1240 | "model_module": "@jupyter-widgets/controls", 1241 | "model_module_version": 
"1.5.0", 1242 | "model_name": "DescriptionStyleModel", 1243 | "state": { 1244 | "_model_module": "@jupyter-widgets/controls", 1245 | "_model_module_version": "1.5.0", 1246 | "_model_name": "DescriptionStyleModel", 1247 | "_view_count": null, 1248 | "_view_module": "@jupyter-widgets/base", 1249 | "_view_module_version": "1.2.0", 1250 | "_view_name": "StyleView", 1251 | "description_width": "" 1252 | } 1253 | }, 1254 | "a0a85fbdefe744f38b18b3cef7efebb8": { 1255 | "model_module": "@jupyter-widgets/base", 1256 | "model_module_version": "1.2.0", 1257 | "model_name": "LayoutModel", 1258 | "state": { 1259 | "_model_module": "@jupyter-widgets/base", 1260 | "_model_module_version": "1.2.0", 1261 | "_model_name": "LayoutModel", 1262 | "_view_count": null, 1263 | "_view_module": "@jupyter-widgets/base", 1264 | "_view_module_version": "1.2.0", 1265 | "_view_name": "LayoutView", 1266 | "align_content": null, 1267 | "align_items": null, 1268 | "align_self": null, 1269 | "border": null, 1270 | "bottom": null, 1271 | "display": null, 1272 | "flex": null, 1273 | "flex_flow": null, 1274 | "grid_area": null, 1275 | "grid_auto_columns": null, 1276 | "grid_auto_flow": null, 1277 | "grid_auto_rows": null, 1278 | "grid_column": null, 1279 | "grid_gap": null, 1280 | "grid_row": null, 1281 | "grid_template_areas": null, 1282 | "grid_template_columns": null, 1283 | "grid_template_rows": null, 1284 | "height": null, 1285 | "justify_content": null, 1286 | "justify_items": null, 1287 | "left": null, 1288 | "margin": null, 1289 | "max_height": null, 1290 | "max_width": null, 1291 | "min_height": null, 1292 | "min_width": null, 1293 | "object_fit": null, 1294 | "object_position": null, 1295 | "order": null, 1296 | "overflow": null, 1297 | "overflow_x": null, 1298 | "overflow_y": null, 1299 | "padding": null, 1300 | "right": null, 1301 | "top": null, 1302 | "visibility": null, 1303 | "width": null 1304 | } 1305 | }, 1306 | "a9eaef02d6924c5b83db2ee973b73db5": { 1307 | "model_module": "@jupyter-widgets/controls", 1308 | "model_module_version": "1.5.0", 1309 | "model_name": "HTMLModel", 1310 | "state": { 1311 | "_dom_classes": [], 1312 | "_model_module": "@jupyter-widgets/controls", 1313 | "_model_module_version": "1.5.0", 1314 | "_model_name": "HTMLModel", 1315 | "_view_count": null, 1316 | "_view_module": "@jupyter-widgets/controls", 1317 | "_view_module_version": "1.5.0", 1318 | "_view_name": "HTMLView", 1319 | "description": "", 1320 | "description_tooltip": null, 1321 | "layout": "IPY_MODEL_2db5b641d66746fa83aec5f5f889c2a6", 1322 | "placeholder": "​", 1323 | "style": "IPY_MODEL_082c5bed11a945448297a20eb5aaa8f7", 1324 | "value": "config.json: 100%" 1325 | } 1326 | }, 1327 | "aa788ed8fa314d63a8167a583ea9571b": { 1328 | "model_module": "@jupyter-widgets/controls", 1329 | "model_module_version": "1.5.0", 1330 | "model_name": "HBoxModel", 1331 | "state": { 1332 | "_dom_classes": [], 1333 | "_model_module": "@jupyter-widgets/controls", 1334 | "_model_module_version": "1.5.0", 1335 | "_model_name": "HBoxModel", 1336 | "_view_count": null, 1337 | "_view_module": "@jupyter-widgets/controls", 1338 | "_view_module_version": "1.5.0", 1339 | "_view_name": "HBoxView", 1340 | "box_style": "", 1341 | "children": [ 1342 | "IPY_MODEL_7ddbaab9d0bc4eeea411563290e832e1", 1343 | "IPY_MODEL_67d1d034d1e24f7486e10f423de3c8ae", 1344 | "IPY_MODEL_566f884b121645a48aa03e480310308a" 1345 | ], 1346 | "layout": "IPY_MODEL_90ef1d683b254127943e241109ea695c" 1347 | } 1348 | }, 1349 | "be33409a46e84e4f81386c95612febff": { 1350 | "model_module": 
"@jupyter-widgets/controls", 1351 | "model_module_version": "1.5.0", 1352 | "model_name": "ProgressStyleModel", 1353 | "state": { 1354 | "_model_module": "@jupyter-widgets/controls", 1355 | "_model_module_version": "1.5.0", 1356 | "_model_name": "ProgressStyleModel", 1357 | "_view_count": null, 1358 | "_view_module": "@jupyter-widgets/base", 1359 | "_view_module_version": "1.2.0", 1360 | "_view_name": "StyleView", 1361 | "bar_color": null, 1362 | "description_width": "" 1363 | } 1364 | }, 1365 | "c75e65306fca40bcac323abc321edcf7": { 1366 | "model_module": "@jupyter-widgets/controls", 1367 | "model_module_version": "1.5.0", 1368 | "model_name": "FloatProgressModel", 1369 | "state": { 1370 | "_dom_classes": [], 1371 | "_model_module": "@jupyter-widgets/controls", 1372 | "_model_module_version": "1.5.0", 1373 | "_model_name": "FloatProgressModel", 1374 | "_view_count": null, 1375 | "_view_module": "@jupyter-widgets/controls", 1376 | "_view_module_version": "1.5.0", 1377 | "_view_name": "ProgressView", 1378 | "bar_style": "success", 1379 | "description": "", 1380 | "description_tooltip": null, 1381 | "layout": "IPY_MODEL_90b0568e50aa4da498b3f96c6eddc010", 1382 | "max": 115434268, 1383 | "min": 0, 1384 | "orientation": "horizontal", 1385 | "style": "IPY_MODEL_be33409a46e84e4f81386c95612febff", 1386 | "value": 115434268 1387 | } 1388 | }, 1389 | "d9eae77de3ab4207b4a8cc5a38e61089": { 1390 | "model_module": "@jupyter-widgets/controls", 1391 | "model_module_version": "1.5.0", 1392 | "model_name": "ProgressStyleModel", 1393 | "state": { 1394 | "_model_module": "@jupyter-widgets/controls", 1395 | "_model_module_version": "1.5.0", 1396 | "_model_name": "ProgressStyleModel", 1397 | "_view_count": null, 1398 | "_view_module": "@jupyter-widgets/base", 1399 | "_view_module_version": "1.2.0", 1400 | "_view_name": "StyleView", 1401 | "bar_color": null, 1402 | "description_width": "" 1403 | } 1404 | }, 1405 | "ebd4aa37f59d44b1b37c585d5d25fb48": { 1406 | "model_module": "@jupyter-widgets/base", 1407 | "model_module_version": "1.2.0", 1408 | "model_name": "LayoutModel", 1409 | "state": { 1410 | "_model_module": "@jupyter-widgets/base", 1411 | "_model_module_version": "1.2.0", 1412 | "_model_name": "LayoutModel", 1413 | "_view_count": null, 1414 | "_view_module": "@jupyter-widgets/base", 1415 | "_view_module_version": "1.2.0", 1416 | "_view_name": "LayoutView", 1417 | "align_content": null, 1418 | "align_items": null, 1419 | "align_self": null, 1420 | "border": null, 1421 | "bottom": null, 1422 | "display": null, 1423 | "flex": null, 1424 | "flex_flow": null, 1425 | "grid_area": null, 1426 | "grid_auto_columns": null, 1427 | "grid_auto_flow": null, 1428 | "grid_auto_rows": null, 1429 | "grid_column": null, 1430 | "grid_gap": null, 1431 | "grid_row": null, 1432 | "grid_template_areas": null, 1433 | "grid_template_columns": null, 1434 | "grid_template_rows": null, 1435 | "height": null, 1436 | "justify_content": null, 1437 | "justify_items": null, 1438 | "left": null, 1439 | "margin": null, 1440 | "max_height": null, 1441 | "max_width": null, 1442 | "min_height": null, 1443 | "min_width": null, 1444 | "object_fit": null, 1445 | "object_position": null, 1446 | "order": null, 1447 | "overflow": null, 1448 | "overflow_x": null, 1449 | "overflow_y": null, 1450 | "padding": null, 1451 | "right": null, 1452 | "top": null, 1453 | "visibility": null, 1454 | "width": null 1455 | } 1456 | }, 1457 | "ee5e2033a3cb4f54bbf79ca371f7c198": { 1458 | "model_module": "@jupyter-widgets/controls", 1459 | "model_module_version": 
"1.5.0", 1460 | "model_name": "DescriptionStyleModel", 1461 | "state": { 1462 | "_model_module": "@jupyter-widgets/controls", 1463 | "_model_module_version": "1.5.0", 1464 | "_model_name": "DescriptionStyleModel", 1465 | "_view_count": null, 1466 | "_view_module": "@jupyter-widgets/base", 1467 | "_view_module_version": "1.2.0", 1468 | "_view_name": "StyleView", 1469 | "description_width": "" 1470 | } 1471 | }, 1472 | "f358143d63a5430a92f5b5b4dc394c43": { 1473 | "model_module": "@jupyter-widgets/controls", 1474 | "model_module_version": "1.5.0", 1475 | "model_name": "HTMLModel", 1476 | "state": { 1477 | "_dom_classes": [], 1478 | "_model_module": "@jupyter-widgets/controls", 1479 | "_model_module_version": "1.5.0", 1480 | "_model_name": "HTMLModel", 1481 | "_view_count": null, 1482 | "_view_module": "@jupyter-widgets/controls", 1483 | "_view_module_version": "1.5.0", 1484 | "_view_name": "HTMLView", 1485 | "description": "", 1486 | "description_tooltip": null, 1487 | "layout": "IPY_MODEL_ebd4aa37f59d44b1b37c585d5d25fb48", 1488 | "placeholder": "​", 1489 | "style": "IPY_MODEL_ee5e2033a3cb4f54bbf79ca371f7c198", 1490 | "value": " 115M/115M [00:01<00:00, 97.3MB/s]" 1491 | } 1492 | }, 1493 | "f4d54c7600764958a410be1032cf5e61": { 1494 | "model_module": "@jupyter-widgets/controls", 1495 | "model_module_version": "1.5.0", 1496 | "model_name": "DescriptionStyleModel", 1497 | "state": { 1498 | "_model_module": "@jupyter-widgets/controls", 1499 | "_model_module_version": "1.5.0", 1500 | "_model_name": "DescriptionStyleModel", 1501 | "_view_count": null, 1502 | "_view_module": "@jupyter-widgets/base", 1503 | "_view_module_version": "1.2.0", 1504 | "_view_name": "StyleView", 1505 | "description_width": "" 1506 | } 1507 | } 1508 | } 1509 | } 1510 | }, 1511 | "nbformat": 4, 1512 | "nbformat_minor": 0 1513 | } 1514 | --------------------------------------------------------------------------------