62 |
63 | );
64 | }
65 |
--------------------------------------------------------------------------------
/tests/tools/test_fuzzy_matcher.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import MagicMock
4 |
5 | from eschergraph.tools.fuzzy_matcher import FuzzyMatcher
6 |
7 |
8 | def test_find_matches_with_mock() -> None:
9 | """Test the _find_matches method of the FuzzyMatcher class using a mock."""
10 | mock_fuzzy_matcher = MagicMock(spec=FuzzyMatcher)
11 |
12 | # Define the mock return value for the _find_matches method
13 | mock_find_matches_return = ("apple", ["Apple", "apple inc."])
14 | mock_fuzzy_matcher._find_matches.return_value = mock_find_matches_return
15 |
16 | result = mock_fuzzy_matcher._find_matches(
17 | "apple", ["apple", "Apple", "apple inc.", "banana"]
18 | )
19 |
20 | assert (
21 | result == mock_find_matches_return
22 | ), f"Expected {mock_find_matches_return}, got {result}"
23 |
24 |
25 | def test_match_nodes_with_mock() -> None:
26 | """Test the _match_nodes method of the FuzzyMatcher class using a mock."""
27 | mock_fuzzy_matcher = MagicMock(spec=FuzzyMatcher)
28 |
29 | mock_match_nodes_return = {
30 | "apple": ["Apple", "apple inc."],
31 | "Apple": ["apple", "apple inc."],
32 | "apple inc.": ["apple", "Apple"],
33 | "banana": [],
34 | }
35 | mock_fuzzy_matcher._match_nodes.return_value = mock_match_nodes_return
36 |
37 | result = mock_fuzzy_matcher._match_nodes(["apple", "Apple", "apple inc.", "banana"])
38 |
39 | assert (
40 | result == mock_match_nodes_return
41 | ), f"Expected {mock_match_nodes_return}, got {result}"
42 |
43 |
44 | def test_get_match_sets_with_mock() -> None:
45 | """Test the get_match_sets method of the FuzzyMatcher class using a mock."""
46 | mock_fuzzy_matcher = MagicMock(spec=FuzzyMatcher)
47 |
48 | mock_get_match_sets_return = [{"apple", "Apple", "apple inc."}, {"banana"}]
49 | mock_fuzzy_matcher.get_match_sets.return_value = mock_get_match_sets_return
50 |
51 | result = mock_fuzzy_matcher.get_match_sets(["apple", "Apple", "apple inc.", "banana"])
52 |
53 | assert (
54 | result == mock_get_match_sets_return
55 | ), f"Expected {mock_get_match_sets_return}, got {result}"
56 |
--------------------------------------------------------------------------------
/tests/persistence/adapters/simple_repository/help.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import cast
4 |
5 | from attrs import asdict
6 |
7 | from eschergraph.graph import Edge
8 | from eschergraph.graph import Node
9 | from eschergraph.graph import Property
10 | from eschergraph.persistence.adapters.simple_repository.models import EdgeModel
11 | from eschergraph.persistence.adapters.simple_repository.models import (
12 | MetadataModel,
13 | )
14 | from eschergraph.persistence.adapters.simple_repository.models import NodeModel
15 | from eschergraph.persistence.adapters.simple_repository.models import (
16 | PropertyModel,
17 | )
18 |
19 |
20 | def compare_node_to_node_model(node: Node, node_model: NodeModel) -> bool:
21 | # Check equality for a node being in a community
22 | if node.community.node and not node_model["community"]:
23 | return False
24 | elif not node.community.node and node_model["community"]:
25 | return False
26 | elif node.community.node and node_model["community"]:
27 | if node.community.node.id != node_model["community"]:
28 | return False
29 |
30 | return (
31 | node.name == node_model["name"]
32 | and node.description == node_model["description"]
33 | and node.level == node_model["level"]
34 | and {edge.id for edge in node.edges} == node_model["edges"]
35 | and [property.id for property in node.properties] == node_model["properties"]
36 | and [cast(MetadataModel, asdict(md)) for md in node.metadata]
37 | == node_model["metadata"]
38 | )
39 |
40 |
41 | def compare_edge_to_edge_model(edge: Edge, edge_model: EdgeModel) -> bool:
42 | return (
43 | edge.frm.id == edge_model["frm"]
44 | and edge.to.id == edge_model["to"]
45 | and edge.description == edge_model["description"]
46 | and [cast(MetadataModel, asdict(md)) for md in edge.metadata]
47 | == edge_model["metadata"]
48 | )
49 |
50 |
51 | def compare_property_to_property_model(
52 | property: Property, property_model: PropertyModel
53 | ) -> bool:
54 | return (
55 | property.node.id == property_model["node"]
56 | and property.description == property_model["description"]
57 | )
58 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_build.jinja:
--------------------------------------------------------------------------------
1 | -Goal-
2 | Extract all relevant information from the provided text into a graph representation containing entities and relations.
3 | The most important part is that you try to represent all the information in the provided text in a structured format!
4 |
5 | -Steps-
6 | 1. Identify all named entities in singular form. For people please include the entire name. Entities can also be technologies.
7 | For each identified entity, extract the following information:
8 | - entity_name: Name of the entity
9 | - entity_description: Comprehensive description of the entity's attributes and activities
10 |
11 | Format each entity output as a JSON entry with the following format:
12 |
13 | {"name": , "description": }
14 |
15 | 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
16 | For each pair of related entities, extract the following information:
17 | - source_entity: name of the source entity, as identified in step 1
18 | - target_entity: name of the target entity, as identified in step 1
19 | - relationship: explanation as to why you think the source entity and the target entity are related to each other
20 |
21 | Format each relationship as a JSON entry with the following format:
22 |
23 | {"source": , "target": , "relationship": }
24 |
25 | 3. Return output in English as a single list of all JSON entities and relationships identified in steps 1 and 2.
26 | Return the JSON like this:
27 |
28 | {
29 | 'entities': [{"name": , "description": }, {"name": , "description": }],
30 | 'relationships':[{"source": , "target": , "relationship": }, and more]
31 | }
32 |
33 | However, only extract entities that are specific, so avoid extracting generic entities like CEO or employee;
34 | instead, extract only named entities and technologies.
35 |
36 | -Real Data-
37 | ######################
38 | text: {{ input_text }}
39 | ######################
40 | output:
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_token_type_labels/label.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from lxml.etree import ElementBase
4 | from pydantic import BaseModel
5 |
6 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.rectangle import (
7 | Rectangle,
8 | )
9 |
10 |
11 | class Label(BaseModel):
12 | top: int
13 | left: int
14 | width: int
15 | height: int
16 | label_type: int
17 | metadata: str = ""
18 |
19 | def intersection_percentage(self, token_bounding_box: Rectangle):
20 | label_bounding_box = Rectangle(
21 | left=self.left,
22 | top=self.top,
23 | right=self.left + self.width,
24 | bottom=self.top + self.height,
25 | )
26 | return label_bounding_box.get_intersection_percentage(token_bounding_box)
27 |
28 | def get_location_discrepancy(self, token_bounding_box: Rectangle):
29 | coordinates_discrepancy: int = abs(self.left - token_bounding_box.left) + abs(
30 | self.top - token_bounding_box.top
31 | )
32 | size_discrepancy: int = abs(self.height - token_bounding_box.height) + abs(
33 | self.width - token_bounding_box.width
34 | )
35 | return coordinates_discrepancy + size_discrepancy
36 |
37 | def area(self):
38 | return self.width * self.height
39 |
40 | @staticmethod
41 | def from_rectangle(rectangle: Rectangle, token_type: int):
42 | return Label(
43 | top=rectangle.top,
44 | left=rectangle.left,
45 | width=rectangle.width,
46 | height=rectangle.height,
47 | label_type=token_type,
48 | )
49 |
50 | @staticmethod
51 | def from_text_elements(text_elements: list[ElementBase]):
52 | top = min([int(x.attrib["top"]) for x in text_elements])
53 | left = min([int(x.attrib["left"]) for x in text_elements])
54 | bottom = max([
55 | int(x.attrib["top"]) + int(x.attrib["height"]) for x in text_elements
56 | ])
57 | right = max([int(x.attrib["left"]) + int(x.attrib["width"]) for x in text_elements])
58 |
59 | return Label(
60 | top=top,
61 | left=left,
62 | width=int(right - left),
63 | height=int(bottom - top),
64 | label_type=0,
65 | )
66 |
--------------------------------------------------------------------------------
/.github/workflows/docs-release.yml:
--------------------------------------------------------------------------------
1 | name: Docs Release
2 | on:
3 | push:
4 | branches: [main]
5 |
6 | concurrency:
7 | group: docs
8 | cancel-in-progress: true
9 |
10 | permissions:
11 | id-token: write
12 | contents: read
13 |
14 | env:
15 | aws_region: "us-east-1"
16 | s3_bucket: "eschergraph.docs.pinkdot.ai"
17 |
18 | jobs:
19 | build:
20 | environment: docs
21 | runs-on: ubuntu-latest
22 | defaults:
23 | run:
24 | working-directory: ./docs
25 | steps:
26 | - name: Checkout Code
27 | uses: actions/checkout@v4
28 |
29 | - name: Use Node.js
30 | uses: actions/setup-node@v3
31 | with:
32 | node-version: '18'
33 | cache: 'npm'
34 | cache-dependency-path: './docs/package-lock.json'
35 |
36 | - name: Install dependencies
37 | run: npm ci
38 |
39 | - name: Build
40 | run: npm run build --if-present
41 |
42 | - name: Upload build files as artifact
43 | uses: actions/upload-artifact@v4
44 | with:
45 | name: vite-build-docs
46 | path: ./docs/build # Still to check
47 |
48 | deploy:
49 | environment: docs
50 | needs: build
51 | runs-on: ubuntu-latest
52 | steps:
53 | - name: Configure AWS Credentials
54 | uses: aws-actions/configure-aws-credentials@v4
55 | with:
56 | role-to-assume: ${{ secrets.AWS_CD_ROLE }}
57 | aws-region: ${{ env.aws_region }}
58 |
59 | - name: Download Artifacts
60 | uses: actions/download-artifact@v4
61 | with:
62 | name: vite-build-docs
63 | path: build
64 |
65 | - name: Upload the files to S3
66 | run: |
67 | aws s3 sync build s3://${{ env.s3_bucket }} --delete
68 |
69 | - name: Invalidate the CloudFront distribution
70 | env:
71 | ROOT_DISTRIBUTION_ID: ${{ secrets.CLOUDFRONT_ROOT_DISTRIBUTION_ID }}
72 | WWW_DISTRIBUTION_ID: ${{ secrets.CLOUDFRONT_WWW_DISTRIBUTION_ID }}
73 | run: |
74 | aws cloudfront create-invalidation --distribution-id $ROOT_DISTRIBUTION_ID --paths "/*"
75 | aws cloudfront create-invalidation --distribution-id $WWW_DISTRIBUTION_ID --paths "/*"
76 |
--------------------------------------------------------------------------------
/tests/persistence/adapters/simple_repository/test_load.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from uuid import UUID
5 |
6 | from eschergraph.graph import Edge
7 | from eschergraph.graph import Node
8 | from eschergraph.graph.loading import LoadState
9 | from eschergraph.persistence.adapters.simple_repository import SimpleRepository
10 | from tests.graph.help import create_simple_extracted_graph
11 | from tests.persistence.adapters.simple_repository.help import (
12 | compare_edge_to_edge_model,
13 | )
14 | from tests.persistence.adapters.simple_repository.help import (
15 | compare_node_to_node_model,
16 | )
17 |
18 |
19 | def test_full_graph_loading(saved_graph_dir: Path) -> None:
20 | repository: SimpleRepository = SimpleRepository(
21 | save_location=saved_graph_dir.as_posix()
22 | )
23 | _, nodes, edges = create_simple_extracted_graph(repository=repository)
24 | node_ids_repository: set[UUID] = set(repository.nodes.keys())
25 | document_id: UUID = next(iter(nodes[0].metadata)).document_id
26 |
27 | assert node_ids_repository == {node.id for node in nodes}
28 | assert set(repository.edges.keys()) == {edge.id for edge in edges}
29 | assert set(repository.doc_node_name_index.keys()) == {document_id}
30 | assert node_ids_repository == set(
31 | repository.doc_node_name_index[document_id].values()
32 | )
33 |
34 | repository.save()
35 | del repository
36 |
37 | new_repository: SimpleRepository = SimpleRepository(
38 | save_location=saved_graph_dir.as_posix()
39 | )
40 |
41 | for node in nodes:
42 | new_node: Node = Node(id=node.id, repository=new_repository)
43 | assert new_node.loadstate == LoadState.REFERENCE
44 | assert compare_node_to_node_model(
45 | node=new_node,
46 | node_model=new_repository.nodes[node.id],
47 | )
48 | assert new_node.loadstate == LoadState.FULL # type: ignore
49 |
50 | for edge in edges:
51 | new_edge: Edge = Edge(
52 | id=edge.id,
53 | frm=Node(edge.frm.id, repository=new_repository),
54 | to=Node(edge.to.id, repository=new_repository),
55 | repository=new_repository,
56 | )
57 | assert new_edge.loadstate == LoadState.REFERENCE
58 | assert compare_edge_to_edge_model(
59 | edge=new_edge, edge_model=new_repository.edges[edge.id]
60 | )
61 | assert new_edge.loadstate == LoadState.CORE # type: ignore
62 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_features/rectangle.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from lxml.etree import ElementBase
4 |
5 |
6 | class Rectangle:
7 | def __init__(self, left: int, top: int, right: int, bottom: int):
8 | self.left = left
9 | self.top = top
10 | self.right = right
11 | self.bottom = bottom
12 | self.fix_wrong_areas()
13 | self.width = self.right - self.left
14 | self.height = self.bottom - self.top
15 |
16 | @staticmethod
17 | def from_poppler_tag_etree(tag: ElementBase) -> "Rectangle":
18 | x_min = int(tag.attrib["left"])
19 | y_min = int(tag.attrib["top"])
20 | x_max = x_min + int(tag.attrib["width"])
21 | y_max = y_min + int(tag.attrib["height"])
22 |
23 | return Rectangle(x_min, y_min, x_max, y_max)
24 |
25 | def fix_wrong_areas(self):
26 | if self.right == self.left:
27 | self.left -= 1
28 | self.right += 1
29 |
30 | if self.top == self.bottom:
31 | self.top -= 1
32 | self.bottom += 1
33 |
34 | if self.right < self.left:
35 | self.right, self.left = self.left, self.right
36 |
37 | if self.bottom < self.top:
38 | self.top, self.bottom = self.bottom, self.top
39 |
40 | def get_intersection_percentage(self, rectangle: "Rectangle") -> float:
41 | x1 = max(self.left, rectangle.left)
42 | y1 = max(self.top, rectangle.top)
43 | x2 = min(self.right, rectangle.right)
44 | y2 = min(self.bottom, rectangle.bottom)
45 |
46 | if x2 <= x1 or y2 <= y1:
47 | return 0.0
48 |
49 | return 100 * (x2 - x1) * (y2 - y1) / self.area()
50 |
51 | def area(self):
52 | return self.width * self.height
53 |
54 | def to_dict(self):
55 | return {
56 | "top": self.top,
57 | "left": self.left,
58 | "right": self.right,
59 | "bottom": self.bottom,
60 | }
61 |
62 | @staticmethod
63 | def merge_rectangles(rectangles: list["Rectangle"]) -> "Rectangle":
64 | left = min([rectangle.left for rectangle in rectangles])
65 | top = min([rectangle.top for rectangle in rectangles])
66 | right = max([rectangle.right for rectangle in rectangles])
67 | bottom = max([rectangle.bottom for rectangle in rectangles])
68 |
69 | return Rectangle(left, top, right, bottom)
70 |
71 | @staticmethod
72 | def from_width_height(left: int, top: int, width: int, height: int):
73 | return Rectangle(left, top, left + width, top + height)
74 |
--------------------------------------------------------------------------------
/eschergraph/exceptions.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | class BaseEscherGraphException(Exception):
5 | """The base class for all EscherGraph exceptions."""
6 |
7 |
8 | class PromptFormattingException(BaseEscherGraphException):
9 | """When some jinja prompt variables have not been formatted.
10 |
11 | Used to check if the prompt has been sent to the LLM / agent as intended.
12 | """
13 |
14 |
15 | class IllogicalActionException(BaseEscherGraphException):
16 | """When something unlogical happens, like searching before building a graph."""
17 |
18 |
19 | class ExternalProviderException(BaseEscherGraphException):
20 | """When something unexpected occurs during an interation with an external service."""
21 |
22 |
23 | class DataLoadingException(BaseEscherGraphException):
24 | """Raised when some data on the EscherGraph objects has not been loaded as expected."""
25 |
26 |
27 | class NodeDoesNotExistException(BaseEscherGraphException):
28 | """Node has not been found."""
29 |
30 |
31 | class EdgeCreationException(BaseEscherGraphException):
32 | """Edge is created between a node and itself."""
33 |
34 |
35 | class NodeCreationException(BaseEscherGraphException):
36 | """Something went wrong creating a node."""
37 |
38 |
39 | class CredentialException(BaseEscherGraphException):
40 | """Missing credential for external provider."""
41 |
42 |
43 | class FileTypeNotProcessableException(BaseEscherGraphException):
44 | """File is not processable due to its type."""
45 |
46 |
47 | class EdgeDoesNotExistException(BaseEscherGraphException):
48 | """The specified edge could not be found."""
49 |
50 |
51 | class RepositoryException(BaseEscherGraphException):
52 | """Something unexpected happens with the repository."""
53 |
54 |
55 | class ExternalDependencyException(BaseEscherGraphException):
56 | """External dependency (outside of Python) is missing."""
57 |
58 |
59 | class DocumentDoesNotExistException(BaseEscherGraphException):
60 | """The specified document does not exist in the graph."""
61 |
62 |
63 | class DocumentAlreadyExistsException(BaseEscherGraphException):
64 | """The graph attempts to build for a document that already exists."""
65 |
66 |
67 | class FileException(BaseEscherGraphException):
68 | """Provided filepath is not a file or the file does not exist."""
69 |
70 |
71 | class ImageProcessingException(BaseEscherGraphException):
72 | """Exception that occurs when processing an image."""
73 |
--------------------------------------------------------------------------------
/eschergraph/persistence/vector_db/vector_db.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC
4 | from abc import abstractmethod
5 | from typing import Optional
6 | from uuid import UUID
7 |
8 | from eschergraph.persistence.vector_db.vector_search_result import VectorSearchResult
9 |
10 |
11 | class VectorDB(ABC):
12 | """This is the abstract base class for all vector DB implementations.
13 |
14 | It is important to note that an embedding model is included in the abstract vector database class.
15 | """
16 |
17 | required_credentials: list[str]
18 |
19 | @abstractmethod
20 | def connect(self) -> None:
21 | """Possible connection method."""
22 | raise NotImplementedError
23 |
24 | @abstractmethod
25 | def insert(
26 | self,
27 | documents: list[str],
28 | ids: list[UUID],
29 | metadata: list[dict[str, str | int]],
30 | collection_name: str,
31 | ) -> None:
32 | """Store documents with their embeddings, ids, and metadata.
33 |
34 | Args:
35 | embeddings (List[List[float]]): List of embeddings for the documents.
36 | documents (List[str]): List of document texts.
37 | ids (List[int]): List of document IDs.
38 | metadata (List[dict[str, Any]]): List of metadata dictionaries.
39 | collection_name (str): The name of the collection.
40 | """
41 | raise NotImplementedError
42 |
43 | @abstractmethod
44 | def search(
45 | self,
46 | query: str,
47 | top_n: int,
48 | collection_name: str,
49 | metadata: Optional[dict[str, str | int]] = None,
50 | ) -> list[VectorSearchResult]:
51 | """Search for the top_n documents that are most similar to the given query.
52 |
53 | Args:
54 | query (str): The query to search for.
55 | top_n (int): Number of top search results to retrieve.
56 | collection_name (str): The name of the collection.
57 | metadata (Optional[dict[str, str | int]]): Metadata to filter the search results.
58 |
59 | Returns:
60 | A list of vector search results.
61 | """
62 | raise NotImplementedError
63 |
64 | @abstractmethod
65 | def delete_by_ids(
66 | self,
67 | ids: list[UUID],
68 | collection_name: str,
69 | ) -> None:
70 | """Delete records from collection by their ids.
71 |
72 | Args:
73 | ids (list[UUID]): The list of ids that need to be removed.
74 | collection_name (str): The name of the collection.
75 | """
76 | raise NotImplementedError
77 |
--------------------------------------------------------------------------------
/eschergraph/agents/jinja_helper.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import re
4 | from pathlib import Path
5 | from typing import Any
6 |
7 | from jinja2 import BaseLoader
8 | from jinja2 import Environment
9 | from jinja2 import FileSystemLoader
10 | from jinja2 import select_autoescape
11 | from jinja2 import Template
12 |
13 | from eschergraph.exceptions import PromptFormattingException
14 |
15 |
16 | def process_template(template_file: str, data: dict[str, str]) -> str:
17 | """Process the jinja template into a string.
18 |
19 | Function has been inspired by: https://github.com/ArjanCodes/examples/blob/main/2024/tuesday_tips/jinja2/jinja_helper.py
20 |
21 | Args:
22 | template_file (str): The name of the jinja prompt template.
23 | data (dict): The parameters and their values to insert into the prompt.
24 |
25 | Returns:
26 | The formatted prompt as a string.
27 | """
28 | parent_path: str = Path(__file__).parent.absolute().as_posix()
29 | jinja_env: Environment = Environment(
30 | loader=FileSystemLoader(searchpath=parent_path + "/prompts"),
31 | autoescape=select_autoescape(),
32 | )
33 |
34 | template_variables: list[Any] = extract_variables(template_file, jinja_env)
35 |
36 | # Check if all variables in template have been provided as data
37 | if not set(template_variables) == set(data.keys()):
38 | raise PromptFormattingException(
39 | "Some variables in the prompt have not been formatted."
40 | )
41 |
42 | template: Template = jinja_env.get_template(template_file)
43 |
44 | return template.render(**data)
45 |
46 |
47 | def extract_variables(template_file: str, jinja_env: Environment) -> list[Any]:
48 | """Extract all variables in a Jinja template in string format.
49 |
50 | Args:
51 | template_file (str): the name of the jinja prompt template.
52 | jinja_env (Environment): the jinja Environment.
53 |
54 | Returns:
55 | A list of all the identified variables in the string template.
56 | """
57 | # Check if the loader is None
58 | if not jinja_env.loader:
59 | raise PromptFormattingException(
60 | "Something went wrong formatting the prompt template."
61 | )
62 | else:
63 | loader: BaseLoader = jinja_env.loader
64 |
65 | # Get the template as plain text
66 | plain_template: str = loader.get_source(jinja_env, template_file)[0]
67 |
68 | variable_pattern: str = r"\{\{ *([\w_]+) *\}\}"
69 | return re.findall(variable_pattern, plain_template)
70 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_tokens_type_trainer/pdf_trainer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import lightgbm as lgb
6 | import numpy as np
7 |
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_features import (
9 | PdfFeatures,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_font import (
12 | PdfFont,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token import (
15 | PdfToken,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.rectangle import (
18 | Rectangle,
19 | )
20 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
21 | TokenType,
22 | )
23 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.model_configuration import (
24 | ModelConfiguration,
25 | )
26 |
27 |
28 | class PdfTrainer:
29 | def __init__(
30 | self,
31 | pdfs_features: list[PdfFeatures],
32 | model_configuration: ModelConfiguration | None = None,
33 | ):
34 | self.pdfs_features = pdfs_features
35 | self.model_configuration = (
36 | model_configuration if model_configuration else ModelConfiguration()
37 | )
38 |
39 | def get_model_input(self) -> np.ndarray:
40 | pass  # implemented by the concrete trainer subclasses
41 |
42 | @staticmethod
43 | def features_rows_to_x(features_rows):
44 | if not features_rows:
45 | return np.zeros((0, 0))
46 |
47 | x = np.zeros(((len(features_rows)), len(features_rows[0])))
48 | for i, v in enumerate(features_rows):
49 | x[i] = v
50 | return x
51 |
52 | def loop_tokens(self):
53 | for pdf_features in self.pdfs_features:
54 | for page, token in pdf_features.loop_tokens():
55 | yield token
56 |
57 | @staticmethod
58 | def get_padding_token(segment_number: int, page_number: int):
59 | return PdfToken(
60 | page_number,
61 | "pad_token",
62 | "",
63 | PdfFont("pad_font_id", False, False, 0.0, "#000000"),
64 | segment_number,
65 | Rectangle(0, 0, 0, 0),
66 | TokenType.TEXT,
67 | )
68 |
69 | # Models are already downloaded
70 | def predict(self, model_path: str | Path | None = None):
71 | x = self.get_model_input()
72 |
73 | if not x.any():
74 | return self.pdfs_features
75 |
76 | lightgbm_model = lgb.Booster(model_file=model_path)
77 | return lightgbm_model.predict(x)
78 |
--------------------------------------------------------------------------------
/docs/docs/explained-eschergraph/Graph building.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 2
3 | ---
4 |
5 | # Graph Building Pipeline
6 | ### Eschergraph Pipeline
7 |
8 | Some aspects of this graph building pipeline have been inspired by [GraphRAG](https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/), a Microsoft Research project. Their great work and insights on communities in a graph have been especially inspiring.
9 |
10 | The steps for the EscherGraph building are (a usage sketch that runs the whole pipeline is included at the end of this page):
11 |
12 | 1) Parse document into chunks of about 500 tokens
13 | 2) Extract nodes & edges for each chunk using an LLM
14 | 3) Extract properties for each node using an LLM
15 | 4) Match similar nodes, and merge their edges and properties (more on this [below](#node-matcher))
16 | 5) Persist the graph to a database
17 | 6) Build communities with the [LeidenAlg](https://github.com/vtraag/leidenalg)
18 | 7) Sync all nodes, edges and properties to a vector database, by default [ChromaDB](https://www.trychroma.com/)
19 |
20 |
21 | 
22 |
23 | ### Node Matcher
24 | The node matcher is used to match and merge nodes that refer to the same entity.
25 | It involves two steps.
26 |
27 | 1. Identify potentially matching nodes using the Levenshtein distance; nodes with exactly the same name are matched straight away (a minimal sketch of this step is included at the end of this page).
28 | - For example:
29 | - matching 'p100' to 'p100 gpu'
30 | - matching 'Sam' to 'Sam Altman'
31 |
32 | 2. Decide on potential matches using LLM reasoning and contextual clues. This is done to make the right decision even for edge cases: different entities that are referenced by the same name across different chunks, for example, 'Sam', 'Sam Altman', and 'Sam Bankman-Fried'.
33 | The node matcher utilizes additional context to differentiate between them. This process involves:
34 | - LLM identifies node name ambiguities: given a list of potentially matching entity names, the LLM returns a list of edge cases.
35 |
36 | - Re-ranking potential matches: a re-ranker evaluates the similarity of the nodes based on context, metadata, or additional attributes to accurately determine which specific entity a node is referring to.
37 |
38 | - Contextual clues: the re-ranker leverages additional contextual information from the surrounding data or relationships to classify which node is the correct match. This might include looking at node connections, associated attributes, or other identifiers to make a more informed decision.
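39 | 
40 | As referenced above, here is a minimal usage sketch that runs the whole building pipeline, adapted from the package's integration test (`tests_integration/build_search.py`); the save locations are illustrative, and the defaults for any omitted parameters are assumptions.
41 | 
42 | ```python
43 | from dotenv import load_dotenv
44 | 
45 | from eschergraph import Graph
46 | from eschergraph.agents import OpenAIModel
47 | from eschergraph.agents import OpenAIProvider
48 | from eschergraph.persistence.adapters.simple_repository import SimpleRepository
49 | from eschergraph.persistence.vector_db.adapters.chromadb import ChromaDB
50 | 
51 | load_dotenv()  # Load the OpenAI credentials
52 | 
53 | graph_name = "my_graph"
54 | graph = Graph(
55 |   name=graph_name,
56 |   repository=SimpleRepository(name=graph_name, save_location="./graphs"),
57 |   vector_db=ChromaDB(
58 |     save_name=graph_name,
59 |     embedding_model=OpenAIProvider(model=OpenAIModel.TEXT_EMBEDDING_LARGE),
60 |   ),
61 |   model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI),
62 | )
63 | 
64 | # Runs the steps above: parsing, extraction, matching, communities, syncing
65 | graph.build("./test_files/test_file.pdf")
66 | print(graph.search("Who are the architects?"))
67 | ```
68 | 
69 | And below is a minimal sketch of the fuzzy-matching step of the node matcher, assuming the rapidfuzz library; the function name and the threshold are illustrative and not the package's actual FuzzyMatcher implementation.
70 | 
71 | ```python
72 | from rapidfuzz import fuzz
73 | 
74 | 
75 | def potential_matches(
76 |   name: str, candidates: list[str], threshold: float = 90.0
77 | ) -> list[str]:
78 |   """Return the candidate node names that potentially match the given name."""
79 |   matches: list[str] = []
80 |   for candidate in candidates:
81 |     if candidate.lower() == name.lower():
82 |       # Nodes with exactly the same name are matched straight away
83 |       matches.append(candidate)
84 |     elif fuzz.partial_ratio(name.lower(), candidate.lower()) >= threshold:
85 |       # A Levenshtein-based similarity score decides the remaining candidates
86 |       matches.append(candidate)
87 |   return matches
88 | 
89 | 
90 | print(potential_matches("p100", ["p100 gpu", "banana"]))  # ['p100 gpu']
91 | print(potential_matches("Sam", ["Sam Altman", "Sam Bankman-Fried"]))  # both match
92 | ```
93 | 
94 | Note that 'Sam' matches both 'Sam Altman' and 'Sam Bankman-Fried' in this sketch: exactly the kind of ambiguity that the second, LLM-based step is there to resolve.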
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_property.jinja:
--------------------------------------------------------------------------------
1 | The goal is to add descriptive properties to all entities that occur in the provided text.
2 | It is essential that all information from the provided piece of text is extracted and assigned as a property to an entity.
3 | We need to make sure that the resulting graph contains exactly the same information as the source text; it needs to contain all the information.
4 |
5 | ===============================================================================
6 | EXAMPLE SECTION: The following section includes example output. These examples must be
7 | **excluded from your answer**.
8 |
9 | EXAMPLE 1
10 | Task: Extract all the properties for each entity from the source text.
11 | Text: Nicklas Bendtner is a Danish football legend that played for Arsenal. He was especially
12 | feared for his ability to score a goal at any moment.
13 | JSON RESPONSE:
14 | {"entities": [{"Nicklas Bendtner": ["A Danish football legend", "Played for Arsenal", "Especially feared for his ability to score a goal at any moment"]}]}
15 | END OF EXAMPLE 1
16 |
17 | EXAMPLE 2
18 | Task: Extract all the properties for each entity from the source text.
19 | Text: EscherGraph is a state-of-the-art open-source package that provides a framework for building
20 | a structured graph from any unstructured data source. Although, an entirely novel approach, it can
21 | be seen as the next evolution of knowledge graphs. It was developed at PinkDot AI, a Delft based
22 | AI startup.
23 | JSON RESPONSE:
24 | {"entities": [
25 | {"EscherGraph": ["A state-of-the-art open source package that provides a framework for building a structured graph from any unstructured data source", "an entirely novel approach that can be seen as the next evolution of knowledge graphs"]},
26 | {"PinkDot AI": ["A Delft based AI startup"]}
27 | ]}
28 | END OF EXAMPLE 2
29 | ===============================================================================
30 |
31 | ===============================================================================
32 | REAL DATA: The following section is the real data. You should use only this real data to prepare your answer. Generate entities with properties only.
33 | Task: Extract all the properties for each entity from the source text.
34 |
35 | Make sure all information from the provided piece of text is extracted and assigned as a property to an entity.
36 | Text: {{ input_text }}
37 | JSON response:
38 | {"entities": [{"": ["", "", ""]}]}
39 | Match the properties to existing nodes only!! This is absolutely crucial!!!
40 | Existing nodes: {{ current_nodes }}
41 |
--------------------------------------------------------------------------------
/tests_integration/build_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import time
4 | from pathlib import Path
5 | from tempfile import TemporaryDirectory
6 |
7 | from dotenv import load_dotenv
8 |
9 | from eschergraph import Graph
10 | from eschergraph.agents import OpenAIModel
11 | from eschergraph.agents import OpenAIProvider
12 | from eschergraph.persistence import Repository
13 | from eschergraph.persistence.adapters.simple_repository import SimpleRepository
14 | from eschergraph.persistence.document import Document
15 | from eschergraph.persistence.vector_db import VectorDB
16 | from eschergraph.persistence.vector_db.adapters.chromadb import ChromaDB
17 | from eschergraph.visualization import Visualizer
18 |
19 | TEST_FILE: str = "./test_files/test_file.pdf"
20 |
21 | # Load all the credentials
22 | load_dotenv()
23 |
24 |
25 | def build_graph() -> None:
26 | # The temporary directory (clean run for each test)
27 | temp_dir: TemporaryDirectory = TemporaryDirectory()
28 | temp_path: Path = Path(temp_dir.name)
29 |
30 | # Set up all the graph dependencies
31 | graph_name: str = "test_graph"
32 | repository: Repository = SimpleRepository(
33 | name=graph_name, save_location=temp_path.as_posix()
34 | )
35 | chroma: VectorDB = ChromaDB(
36 | save_name=graph_name,
37 | persistent=False,
38 | embedding_model=OpenAIProvider(model=OpenAIModel.TEXT_EMBEDDING_LARGE),
39 | )
40 | graph: Graph = Graph(
41 | name=graph_name,
42 | repository=repository,
43 | vector_db=chroma,
44 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI),
45 | )
46 |
47 | # Build the graph
48 | graph.build(TEST_FILE)
49 |
50 | Visualizer.visualize_graph(
51 | graph, level=0, save_location=temp_path.as_posix() + "/level_0.html"
52 | )
53 | Visualizer.visualize_graph(
54 | graph, level=1, save_location=temp_path.as_posix() + "/level_1.html"
55 | )
56 | Visualizer.visualize_graph(
57 | graph, level=2, save_location=temp_path.as_posix() + "/level_2.html"
58 | )
59 |
60 | documents: list[Document] = graph.get_all_documents()
61 |
62 | print("The document currently in the graph: ")
63 | print(documents)
64 | doc_name: str = documents[0].name
65 |
66 | answer = graph.search("Who are the architects?", filter_filenames=[doc_name])
67 | print(answer)
68 |
69 | global_answer = graph.global_search(
70 | "What are the key points?", filter_filenames=[doc_name]
71 | )
72 | print(global_answer)
73 |
74 | graph.dashboard()
75 |
76 | # Wait a few seconds before cleaning up to open the visuals
77 | time.sleep(10)
78 |
79 | # Clean up all the persistent data
80 | temp_dir.cleanup()
81 |
82 |
83 | if __name__ == "__main__":
84 | build_graph()
85 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_figure.jinja:
--------------------------------------------------------------------------------
1 | You are given a figure; analyse it well for information about the various entities and their relationships.
2 | The goal is to map all the relations between the figure and the relevant entities, and to extract all relevant information in the figure.
3 | The figure will be regarded as if it is an entity.
4 | Extract the entities and describe the relationships between the figure and these entities in a structured format.
5 |
6 | There might be possible relations between entities mentioned in the figure. Also extract these relations.
7 | Analyse the figure and the caption and give a rich description of the figure.
8 | Give the figure a name based on the caption; make sure to include that it is a figure in the name.
9 |
10 | -Steps-
11 | 1. Identify all named entities in singular form. For people please include the entire name. Entities can also be technologies.
12 | For each identified entity, extract the following information:
13 | - entity_name: Name of the entity
14 | - entity_description: Comprehensive description of the entity's attributes and activities
15 | - main_node: specify whether this entity is the main node for the given figure. There can only be one node with the value True for the figure
16 | Format each entity output as a JSON entry with the following format:
17 |
18 | {"name": , "description": , 'main_node': bool}
19 |
20 | 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
21 | For each pair of related entities, extract the following information:
22 | - source_entity: name of the source entity, as identified in step 1
23 | - target_entity: name of the target entity, as identified in step 1
24 | - relationship: explanation as to why you think the source entity and the target entity are related to each other
25 |
26 | -- Complete JSON Output format:--
27 | {
28 | "entities": [
29 | {"name": "", "description": "", 'main_node': bool},
30 | {"name": "", "description": "", 'main_node': bool}
31 | ],
32 | "relationships": [
33 | {"source": "", "target": "", "relationship": ""},
34 | {"source": "", "target": "", "relationship": ""},
35 | {"source": "", "target": "", "relationship": ""}
36 | ]
37 | }
38 |
39 | -Real Data-
40 | ###############
41 | Figure caption: {{ figure_caption }}
42 | These are the keywords of the document in which the figure appears {{ keywords }}
43 | ###############
44 |
--------------------------------------------------------------------------------
/eschergraph/graph/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from uuid import UUID
5 |
6 | from eschergraph.exceptions import DocumentAlreadyExistsException
7 | from eschergraph.exceptions import DocumentDoesNotExistException
8 | from eschergraph.exceptions import FileException
9 | from eschergraph.persistence import Repository
10 | from eschergraph.persistence.document import Document
11 |
12 |
13 | def duplicate_document_check(file_list: list[str], repository: Repository) -> None:
14 | """Check if the documents already exist in the graph.
15 |
16 | Also, it raises an exception if a provided filepath does not point to
17 | a file.
18 |
19 | Args:
20 | file_list (list[str]): A list of filepaths pointing to files.
21 | repository (Repository): The repository that stores the graph data.
22 |
23 | Raises:
24 | A DocumentAlreadyExistsException as soon as it discovers a document that already
25 | exists.
26 | FileException if one of the provided paths does not point to a file, or if the
27 | file does not exist.
28 | """
29 | for file in file_list:
30 | file_path: Path = Path(file)
31 |
32 | # Check if the filepath points to a file
33 | if not file_path.is_file():
34 | raise FileException(f"Make sure that this is a file that exists: {file_path}")
35 |
36 | filename: str = file_path.name
37 |
38 | if repository.get_document_by_name(filename):
39 | raise DocumentAlreadyExistsException(
40 | f"A file with this name already exists in the graph: {filename}"
41 | )
42 |
43 |
44 | def search_check(repository: Repository) -> bool:
45 | """Check if there are any elements at level 0 in the graph repository.
46 |
47 | Args:
48 | repository (Repository): The repository that stores the graph.
49 |
50 | Returns:
51 | bool: True if there are elements at level 0, otherwise False.
52 | """
53 | return len(repository.get_all_at_level(0)) > 0
54 |
55 |
56 | def get_document_ids_from_filenames(
57 | filenames: list[str], repository: Repository
58 | ) -> list[UUID]:
59 | """Get a document id from a list of filenames.
60 |
61 | Used to get the document id's for the filter in the search.
62 |
63 | Args:
64 | filenames (list[str]): A list of filenames.
65 | repository (Repository): The repository that saves the data.
66 |
67 | Returns:
68 | list[UUID]: A list of document id's.
69 |
70 | Raises:
71 | DocumentDoesNotExistException: If one of the provided filenames does not exist.
72 | """
73 | doc_ids: list[UUID] = []
74 | for name in filenames:
75 | doc: Document | None = repository.get_document_by_name(name)
76 |
77 | if not doc:
78 | raise DocumentDoesNotExistException(f"Document with name: {name}, does not exist")
79 | doc_ids.append(doc.id)
80 |
81 | return doc_ids
82 |
--------------------------------------------------------------------------------
/tests/graph/test_getter_setter.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import Mock
4 | from uuid import uuid4
5 |
6 | import pytest
7 |
8 | from eschergraph.graph.base import EscherBase
9 | from eschergraph.graph.community import Community
10 | from eschergraph.graph.getter_setter import _extract_inner_type
11 | from eschergraph.graph.getter_setter import _extract_property_type
12 | from eschergraph.graph.getter_setter import _parse_future_annotations
13 | from eschergraph.graph.getter_setter import loading_getter_setter
14 | from eschergraph.graph.loading import LoadState
15 | from eschergraph.persistence import Metadata
16 |
17 |
18 | @pytest.fixture(scope="function")
19 | def base_repository(mock_repository: Mock) -> Mock:
20 | # Set the metadata equal to an empty set
21 | def load_side_effect(base: EscherBase, loadstate: LoadState) -> None:
22 | base._metadata = set()
23 |
24 | mock_repository.load.side_effect = load_side_effect
25 |
26 | return mock_repository
27 |
28 |
29 | def test_extract_property_type_string() -> None:
30 | assert _extract_property_type("list[str]") == ""
31 | assert _extract_property_type("Optional[int]") == "int"
32 | assert _extract_property_type("Optional[set[int]]") == "set[int]"
33 | assert _extract_property_type("") == ""
34 |
35 |
36 | def test_extract_inner_type() -> None:
37 | assert _extract_inner_type("") == ""
38 | assert _extract_inner_type("list[str]") == ""
39 | assert _extract_inner_type("Optional[set[int]]") == "set"
40 | assert _extract_inner_type("Optional[Node]") == "Node"
41 |
42 |
43 | def test_parse_future_annotations() -> None:
44 | with pytest.raises(RuntimeError):
45 |   _parse_future_annotations("")
46 | with pytest.raises(RuntimeError):
47 |   _parse_future_annotations("list[str]")
48 | assert _parse_future_annotations("Optional[set[int]]") == set
49 | assert _parse_future_annotations("Optional[list[str]]") == list
50 | assert _parse_future_annotations("Optional[Community]") == Community
51 |
52 |
53 | # Testing whether the class decorator works.
54 | # Note that it cannot be applied to the base class directly as that would
55 | # trigger a circular import.
56 | @loading_getter_setter
57 | class ExtendedBase(EscherBase): ...
58 |
59 |
60 | def test_check_loadstate_metadata(base_repository: Mock) -> None:
61 | base: EscherBase = ExtendedBase(repository=base_repository)
62 |
63 | assert isinstance(base.metadata, set)
64 | assert base.loadstate == LoadState.CORE
65 |
66 |
67 | def test_setting_metadata(base_repository: Mock) -> None:
68 | base: EscherBase = ExtendedBase(repository=base_repository)
69 |
70 | metadata_set: set[Metadata] = {Metadata(document_id=uuid4(), chunk_id=1)}
71 | assert not base._metadata
72 | assert base.loadstate == LoadState.REFERENCE
73 |
74 | base.metadata = metadata_set
75 |
76 | assert base.metadata == metadata_set
77 | assert base.loadstate == LoadState.CORE # type: ignore
78 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/identifying_nodes.jinja:
--------------------------------------------------------------------------------
1 | You are an expert in understanding the difference between named entities.
2 | -- Goal:
3 | You will receive a list of entity names that I potentially want to merge. The goal is to merge them if they are the same entity with a different name,
4 | but if they are inherently different then return them as different entities. I am relying on your judgement on this!
5 |
6 | Give the unique set of nodes appropriate names. If the entities are different versions of a person's name, then always return the full name.
7 |
8 | Always answer in the JSON format below. Give a list of entities with their new name and the names of the merged entities from the input list.
9 | It is possible to assign an input entity to multiple entities in the answer:
10 | {'entities':
11 | [
12 | {'name':, 'merged entities': [, ,...]}
13 | ]
14 | }
15 | Make sure to only put entities from the input list in the merged entities and that all input entities are merged somewhere!
16 |
17 | If it is reasonable to merge an input entity with multiple output entities, because there might be some overlap,
18 | then make sure to add this input entity to the merged entity list of both output entities. Like in this example:
19 |
20 | Entities: Lennart, Lennart Timmermans, Patrick Timmermans, Timmermans
21 | Answer:{'entities':
22 | [
23 | {'name':Lennart Timmermans, 'merged entities': [Lennart Timmermans, Lennart, Timmermans]}
24 | {'name':Patrick Timmermans, 'merged entities': [Patrick Timmermans, Timmermans]}
25 | ]
26 | }
27 | The entity Timmermans is in both output entities
28 |
29 | -- More Examples:
30 | Entities: Manchester United, Manchester, United
31 | Answer:{'entities':
32 | [
33 | {'name':Manchester United, 'merged entities': [Manchester United, United]}
34 | {'name':Manchester, 'merged entities': [Manchester]}
35 |
36 | ]
37 | }
38 | Entities: Bjarne, Bjarne Herben
39 | Answer:{'entities':
40 | [
41 | {'name':Bjarne Herben, 'merged entities': [Bjarne Herben, Bjarne]}
42 | ]
43 | }
44 |
45 | Entities: Nvidia, Nvidia H100 GPU, H100, GPU
46 | Answer:{'entities':
47 | [
48 | {'name':Nvidia, 'merged entities': [Nvidia]}
49 | {'name':Nvidia H100 GPU, 'merged entities': [H100, Nvidia H100 GPU]}
50 | {'name': GPU, 'merged entities': [GPU]}
51 | ]
52 | }
53 | Entities: '1988 world series', 'world series trophy', '2001 world series', 'world series', '2017 world series'
54 | Answer:{'entities': [
55 | {'name':World Series, 'merged entities': [1988 world series, 2001 world series, 2017 world series, world series]}
56 | {'name':world series trophy, 'merged entities': [world series trophy]}
57 | ]}
58 |
59 | ---Now here is the real data. Extract all unique entities from the list, and make sure to double tag if there is an overlap in entities.
60 | input entities: {{entities}}
--------------------------------------------------------------------------------
/eschergraph/graph/property.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Optional
4 | from typing import TYPE_CHECKING
5 |
6 | from attrs import define
7 | from attrs import field
8 |
9 | from eschergraph.graph.base import EscherBase
10 | from eschergraph.graph.getter_setter import loading_getter_setter
11 | from eschergraph.graph.loading import LoadState
12 | from eschergraph.persistence import Metadata
13 |
14 | # Prevent circular import errors
15 | if TYPE_CHECKING:
16 | from eschergraph.graph.node import Node
17 |
18 |
19 | @loading_getter_setter
20 | @define
21 | class Property(EscherBase):
22 | """The property class.
23 |
24 | Conceptually, we consider a property to be the same as en edge.
25 | Therefore, a property can be considered as sort of an edge between the
26 | node and itself.
27 | """
28 |
29 | node: Node = field(kw_only=True)
30 | _description: Optional[str] = field(default=None, metadata={"group": LoadState.CORE})
31 |
32 | # The type annotation for the properties added by the decorator
33 | description: str = field(init=False)
34 |
35 | @classmethod
36 | def create(
37 | cls,
38 | node: Node,
39 | description: str,
40 | metadata: Optional[set[Metadata]] = None,
41 | ) -> Property:
42 | """Create a new property.
43 |
44 | The property that is created is automatically added to the specified node.
45 |
46 | Args:
47 | node (Node): The node to which the property belongs.
48 | description (str): The property's description.
49 | metadata (Optional[set[Metadata]]): The optional set with metadata about the property's extraction.
50 |
51 |
52 | Returns:
53 | The newly created property.
54 | """
55 | # The same repository as the node
56 | property: Property = cls(
57 | node=node,
58 | description=description,
59 | repository=node.repository,
60 | metadata=metadata if metadata else set(),
61 | loadstate=LoadState.FULL,
62 | )
63 |
64 | # Add the property to the node
65 | node.properties.append(property)
66 |
67 | return property
68 |
69 | def __eq__(self, other: object) -> bool:
70 | """The equals method for a property.
71 |
72 | Two property objects are considered equal if they have the same description, and
73 | if they belong to the same node.
74 |
75 | Args:
76 | other (object): The object to compare the property to.
77 |
78 | Returns:
79 | True if equal and False otherwise.
80 | """
81 | if isinstance(other, Property):
82 | return self.description == other.description and self.node.id == other.node.id
83 | return False
84 |
85 | def __hash__(self) -> int:
86 | """The hash function for a property.
87 |
88 | The hash is computed based on the id as this uniquely defines the property.
89 |
90 | Returns:
91 | The computed hash value, which is an integer.
92 | """
93 | return hash(self.id)
94 |
--------------------------------------------------------------------------------
/test_files/txt_file.txt:
--------------------------------------------------------------------------------
1 | The Architects of Tomorrow
2 | In a small, dimly lit room, a team of programmers sat hunched over their keyboards. The hum of servers filled
3 | the air as lines of code streamed across their screens. These weren’t just any programmers—they were the
4 | Architects of Tomorrow.Sarah, the team leader, paused to stretch her fingers. She glanced around the room
5 | at her team—Mark, the algorithm wizard; Aisha, the cybersecurity guru; and Leo, the interface designer who
6 | could make code come alive on screen. They were deep in the final stages of a project they had been
7 | working on for years: a new kind of network, one that would connect not just devices, but ideas, dreams, and
8 | potential.This wasn’t just about faster internet or smarter gadgets. The network they were building could
9 | amplify human creativity, bridging minds across the globe. It could help a scientist in Tokyo collaborate in
10 | real-time with an artist in Paris, or a teacher in Nairobi to instantly share lessons with students in New York. It
11 | was a web of thought, a tapestry of innovation that would redefine what it meant to be connected.“Do you
12 | think they’ll understand it?” Mark asked, breaking the silence. He was staring at a complex algorithm on his
13 | screen, one that would enable the network to learn and grow with its users.Sarah smiled. “They don’t have to
14 | understand it, just like we don’t need to understand every neuron in our brains to think. They’ll feel it. They’ll
15 | see the world changing, and they’ll be a part of it.”Aisha nodded, her eyes flicking to a series of security
16 | protocols she had been fine-tuning. “As long as we protect it. This kind of power... it needs to be
17 | safeguarded. We can’t let it be misused.”“That’s why we’re here,” Leo said, his voice calm but determined.
18 | “We’re building something that transcends what’s been done before. Something that can’t just be reduced to
19 | ones and zeros. It’s about connection—real, human connection.”Hours blurred into days as the team worked
20 | tirelessly, their passion fueling each keystroke. And then, one quiet morning, as the sun’s first light filtered into
21 | their room, it was done.Sarah took a deep breath and hit the final key. The network hummed to life, invisible
22 | but powerful. All across the world, people began to feel a shift—subtle at first, like a whisper of inspiration. An
23 | artist suddenly saw the final stroke of their masterpiece. A doctor discovered the missing link in a cure. A
24 | child, halfway across the world, picked up a book and understood it in a way they never had before.The
25 | programmers sat back, watching the world awaken to their creation. It wasn’t just code anymore. It was hope,
26 | opportunity, and a future they had shaped with their own hands.And as the world began to connect in ways it
27 | had never imagined, the Architects of Tomorrow quietly closed their laptops. Their work was done, but the
28 | future they had built was just beginning.
--------------------------------------------------------------------------------
/eschergraph/agents/llm.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC
4 | from abc import abstractmethod
5 | from typing import Any
6 |
7 | from attrs import define
8 | from attrs import field
9 |
10 |
11 | @define
12 | class FunctionCall:
13 | """The function call as returned by the model.
14 |
15 | The arguments are a JSON representation of the arguments that need to be
16 | supplied to the function. They still need to be validated.
17 | """
18 |
19 | name: str
20 | arguments: dict[str, Any]
21 |
22 |
23 | @define
24 | class TokenUsage:
25 | """Information on the tokens used by the LLM."""
26 |
27 | prompt_tokens: int
28 | completion_tokens: int
29 | total_tokens: int
30 |
31 |
32 | @define
33 | class ModelProvider(ABC):
34 | """The abstract base class for all the LLMs used in the package."""
35 |
36 | required_credentials: list[str] = ["OPENAI_API_KEY"]
37 | tokens: list[TokenUsage] = field(factory=list)
38 | max_threads: int = field(default=10)
39 |
40 | @abstractmethod
41 | def get_model_name(self) -> str:
42 | """Get the model name of the model provider.
43 |
44 | Returns:
45 | The string name of the used llm model
46 | """
47 | raise NotImplementedError
48 |
49 | @abstractmethod
50 | def get_plain_response(self, prompt: str) -> str | None:
51 | """Get a plain text response from an LLM.
52 |
53 | Args:
54 | prompt (str): The prompt to send to the LLM.
55 |
56 | Returns:
57 | The response from the LLM.
58 | """
59 | raise NotImplementedError
60 |
61 | @abstractmethod
62 | def get_multi_modal_response(self, prompt: str, image_path: str) -> Any:
63 | """Get a text response from OpenAI.
64 |
65 | Note that the model that is used is specified when instantiating the class.
66 |
67 | Args:
68 | prompt (str): The user prompt that is sent to the LLM.
69 | image_path (str): The image to be analysed with the text.
70 |
71 | Returns:
72 | The answer given or None.
73 | """
74 | raise NotImplementedError
75 |
76 | @abstractmethod
77 | def get_formatted_response(self, prompt: str, response_format: Any) -> str | None:
78 | """Get a formatted response from an LLM.
79 |
80 | Args:
81 | prompt (str): The user prompt that is sent to the LLM.
82 | response_format (Any): The format that the response should follow.
83 |
84 | Returns:
85 | Formatted answer
86 | """
87 | raise NotImplementedError
88 |
89 | @abstractmethod
90 | def get_json_response(self, prompt: str) -> dict[str, Any]:
91 | """Get a JSON response from an LLM.
92 |
93 | The JSON output is parsed into a Python dictionary before it is returned.
94 |
95 | Args:
96 | prompt (str): The prompt to send to the LLM.
97 |
98 | Returns:
99 | The parsed dictionary object.
100 | """
101 | raise NotImplementedError
102 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_table.jinja:
--------------------------------------------------------------------------------
1 | You are given the markdown of a table. The table may contain information about various entities and their relationships.
2 | The goal is to map all the relations between the table and the relevant entities, and to describe all relevant information in the table.
3 |
4 | The table name will be regarded as if it is an entity.
5 | Extract the entities and describe the relationships between the table and these entities in a structured format. Entities have to be named entities.
6 |
7 | There might be possible relations between entities mentioned in the table. Also extract these relations.
8 |
9 | Analyse the table and the caption and give a rich description for the table.
10 | Give the table a name based on the caption; make sure to include that it is a table in the name.
11 |
12 | --JSON Output format:--
13 | {
14 | "entities": [
15 | {"name": "", "description": ""},
16 | {"name": "", "description": ""}
17 | ],
18 | "relationships": [
19 | {"source": "", "target": "", "relationship": ""},
20 | {"source": "", "target": "", "relationship": ""},
21 | {"source": "", "target": "", "relationship": ""}
22 | ]
23 | }
24 |
25 | --Example:--
26 | For a table describing sales performance across different regions:
27 |
28 | - **Table Name:** Sales Report Q3
29 | - **Entity 1:** North America (NA)
30 | - **Entity 2:** Europe (EU)
31 | - **Entity 3:** Asia Pacific (APAC)
32 | Caption: Sales Report Q3 for hypothetical company
33 | Answer:
34 | {
35 | "entities": [
36 | {"name": "Sales Report Q3 ", "description": "Sales Report Q3 for hypothetical company description the regions North American (NA), Europe (EU) and Asia Pacific (APAC)"},
37 | {"name": "North America (NA) ", "description": ""}
38 | {"name": "Europe (EU)", "description": ""}
39 | {"name": "Asia Pacific (APAC) ", "description": ""}
40 |
41 | ],
42 | "relationships": [
43 | {"source": "Table 1 - Sales Report Q3", "target": "North America (NA)", "relationship": "Has a table sales report in Q3 of x for hypothetical company"},
44 | {"source": "Table 1 - Sales Report Q3", "target": "Europe (EU)", "relationship": "Has a table sales report in Q3 of y for hypothetical company"},
45 | {"source": "Table 1 - Sales Report Q3", "target": "Asia Pacific (APAC)", "relationship": "Has a table sales report in Q3 of z for hypothetical company"}
46 | ]
47 | }
48 |
49 | -Real Data-
50 | ######################
51 |
52 | {{ markdown_table }}
53 | Table caption: {{ table_caption }}
54 | These are the keywords of the document in which the table appears {{ keywords }}
55 |
56 | ######################
57 |
--------------------------------------------------------------------------------
/docs/docs/explained-eschergraph/File Parsing.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 1
3 | ---
4 |
5 | # File Parsing
6 |
7 | ### PDF files
8 |
9 | Parsing PDF files can be challenging due to difficulties in extracting and chunking text accurately. For PDF files, EscherGraph utilizes two open-source document layout models developed by [HURIDOCS](https://github.com/huridocs/pdf-document-layout-analysis). We use their lightweight LightGBM models to make the package easy to set up and run. These models leverage XML data extracted by Poppler for analysis.
10 |
11 | We are actively working on enhancing EscherGraph to be multimodal. The VGT (Vision Grid Transformer) model from HURIDOCS will enable this advancement; however, this model is too large to run on a local device and requires a GPU.
12 |
13 | For paragraph detection and chunking, we use their models within our parser.
14 |
15 | ```python
16 | from eschergraph.builder.reader.reader import Reader, Chunk
17 | file_location = 'test_files/Attention is All You Need.pdf'
18 |
19 | reader = Reader(
20 |     file_location=file_location
21 | )
22 | reader.parse()
23 | chunks: list[Chunk] = reader.chunks
24 | ```
25 |
26 | This is the Chunk object definition.
27 |
28 | ```python
29 | @define
30 | class Chunk:
31 | """The chunk object."""
32 |
33 | text: str
34 | chunk_id: int
35 | doc_id: UUID
36 | page_num: Optional[int]
37 | doc_name: str
38 | ```
39 | ### TXT files
40 | For TXT files, we use the Langchain recursive splitter, with a standard chunk size of 1500 characters and an overlap of 300 characters.
41 | ```python
42 | from eschergraph.builder.reader.reader import Reader
43 | file_location = 'test_files/txt_file.txt'
44 |
45 | reader = Reader(
46 | file_location = file_location,
47 | chunk_size = 1500,
48 | overlap = 300
49 | )
50 | reader.parse()
51 | chunks = reader.chunks
52 | ```
53 |
54 | ## Poppler disclaimer
55 | As mentioned previously, our PDF parser uses Poppler internally to convert PDF into XML. Therefore, you are required to have Poppler installed when building a graph from PDF files with our package. Unfortunately, it can be quite a hassle to install Poppler on Windows. To mitigate this, our package will automatically install Poppler on Windows if it is not already present. We do this by checking whether the required functionality is on the PATH; if not, we download a Poppler binary from [poppler-windows](https://github.com/oschwartz10612/poppler-windows). The zip file is then extracted and placed in the package's source. It is only during runtime that the binary is placed on the PATH and executed. Hence, this will only occur within the process that runs EscherGraph whilst parsing a PDF.
56 |
57 | We wanted to be fully transparent about this, since a package downloading and running binaries on your hardware can also be done with malicious intent. However, we have done this to make it as easy as possible for Windows users to use our package. If interested, the corresponding code can be found in `eschergraph/tools/fast_pdf_parse/parser.py`.
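58 |
59 | As an illustrative sketch (not the actual implementation), the check described above boils down to something like:
60 |
61 | ```python
62 | import shutil
63 |
64 | def poppler_available() -> bool:
65 |     # pdftohtml is the Poppler utility used to convert a PDF into XML
66 |     return shutil.which("pdftohtml") is not None
67 | ```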
--------------------------------------------------------------------------------
/eschergraph/agents/providers/jina.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from typing import Any
5 |
6 | import requests
7 | from attrs import define
8 | from requests import Response
9 |
10 | from eschergraph.agents.reranker import Reranker
11 | from eschergraph.agents.reranker import RerankerResult
12 | from eschergraph.exceptions import CredentialException
13 | from eschergraph.exceptions import ExternalProviderException
14 |
15 |
16 | @define
17 | class JinaReranker(Reranker):
18 | """A reranker that uses Jina's API to rerank a list of documents based on their relevance to a query.
19 |
20 | Methods:
21 |       rerank(query: str, text_list: list[str], top_n: int) -> list[RerankerResult]:
22 |         Sends a request to Jina's API to rerank the provided text list according to the query.
23 |       get_model_name() -> str: Returns the name of the model.
24 | """
25 |
26 | required_credentials: list[str] = ["JINA_API_KEY"]
27 | model: str = "jina-reranker-v2-base-multilingual"
28 |
29 | def get_model_name(self) -> str:
30 | """Returns the name of the model."""
31 | return self.model
32 |
33 | def rerank(
34 | self, query: str, text_list: list[str], top_n: int
35 | ) -> list[RerankerResult]:
36 | """Reranks a list of text documents based on their relevance to the query using Jina's API.
37 |
38 | Args:
39 | query (str): The query string for which documents are being reranked.
40 | text_list (list[str]): The list of documents (texts) to be reranked.
41 | top_n (int): The number of top relevant documents to return.
42 |
43 | Returns:
44 |       list[RerankerResult]: A list of reranked items with their relevance scores and text.
45 |         An ExternalProviderException is raised if the request fails.
46 | """
47 | if not text_list:
48 | return []
49 |
50 |     api_key: str | None = os.getenv("JINA_API_KEY")
51 |
52 | if not api_key:
53 | raise CredentialException("No API key for the Jina Reranker has been set")
54 |
55 | url = "https://api.jina.ai/v1/rerank"
56 | headers = {
57 | "Content-Type": "application/json",
58 | "Authorization": f"Bearer {api_key}",
59 | }
60 | data = {
61 | "model": self.model,
62 | "query": query,
63 | "documents": text_list,
64 | "top_n": top_n,
65 | }
66 |
67 | try:
68 | response: Response = requests.post(url, headers=headers, json=data)
69 | response.raise_for_status()
70 | response_json: Any = response.json()
71 |
72 | return [
73 | RerankerResult(
74 | index=r["index"],
75 | relevance_score=r["relevance_score"],
76 | text=r["document"]["text"],
77 | )
78 | for r in response_json.get("results", [])
79 | ]
80 |
81 | except requests.RequestException as e:
82 | raise ExternalProviderException(f"Request failed: {e}")
83 | except ValueError as e:
84 | raise ExternalProviderException(f"Something went wrong parsing the resulf: {e}")
85 |
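86 | # A minimal usage sketch (assumes the JINA_API_KEY environment variable is set):
87 | #
88 | #   reranker = JinaReranker()
89 | #   results = reranker.rerank(
90 | #     query="What is EscherGraph?",
91 | #     text_list=["EscherGraph builds knowledge graphs.", "An unrelated sentence."],
92 | #     top_n=1,
93 | #   )
94 | #   for r in results:
95 | #     print(r.index, r.relevance_score, r.text)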
--------------------------------------------------------------------------------
/eschergraph/graph/search/global_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Any
4 | from typing import Optional
5 | from typing import TYPE_CHECKING
6 | from uuid import UUID
7 |
8 | from eschergraph.agents.jinja_helper import process_template
9 | from eschergraph.config import GLOBAL_SEARCH_TEMPLATE
10 | from eschergraph.config import MAIN_COLLECTION
11 | from eschergraph.graph.search.attribute_search import AttributeSearch
12 | from eschergraph.graph.search.quick_search import rerank_and_filter_attributes
13 | from eschergraph.persistence.vector_db.vector_search_result import VectorSearchResult
14 |
15 | if TYPE_CHECKING:
16 | from eschergraph.graph import Graph
17 |
18 |
19 | def global_search(
20 | graph: Graph, query: str, doc_filter: Optional[list[UUID]] = None
21 | ) -> str:
22 | """Search a graph globally through its communities.
23 |
24 |   Note that the findings for a community should be sorted; this is the default behavior when building a graph.
25 |
26 | Args:
27 | graph (Graph): The graph object representing the data structure.
28 | query (str): The query string used to search within the graph.
29 |     doc_filter (Optional[list[UUID]]): The optional list of document ids to filter for.
30 |
31 | Returns:
32 |     str: The processed response from the graph model based on the search results.
33 | """
34 | extractions: list[AttributeSearch] = get_relevant_extractions(
35 | graph, query, doc_filter
36 | )
37 | ans_template: str = GLOBAL_SEARCH_TEMPLATE
38 | context: str = "\n".join([a.text for a in extractions])
39 | full_prompt: str = process_template(
40 | ans_template, {"CONTEXT": context, "QUERY": query}
41 | )
42 | response: str | None = graph.model.get_plain_response(full_prompt)
43 | if not response:
44 | return ""
45 |
46 | return response
47 |
48 |
49 | def get_relevant_extractions(
50 | graph: Graph, prompt: str, doc_filter: Optional[list[UUID]] = None
51 | ) -> list[AttributeSearch]:
52 | """Extract relevant attributes from the graph based on the search prompt.
53 |
54 | Args:
55 | graph (Graph): The graph object containing the data to search through.
56 | prompt (str): The query prompt used to perform the attribute search.
57 |     doc_filter (Optional[list[UUID]]): The optional list of document ids to filter for.
58 |
59 | Returns:
60 | list[AttributeSearch]: A list of relevant attributes extracted from the graph, after filtering and reranking.
61 | """
62 | # Perform the search at level 1
63 | search_metadata: dict[str, Any] = {"level": 1}
64 |
65 | if doc_filter:
66 | search_metadata["document_id"] = [str(id) for id in doc_filter]
67 |
68 | attributes_results: list[VectorSearchResult] = graph.vector_db.search(
69 | query=prompt,
70 | top_n=15,
71 | metadata=search_metadata,
72 | collection_name=MAIN_COLLECTION,
73 | )
74 |
75 | results: list[AttributeSearch] = rerank_and_filter_attributes(
76 | graph=graph, query=prompt, attributes_results=attributes_results, threshold=0
77 | )
78 | return results
79 |
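80 | # A minimal usage sketch (assumes an already-built graph; the names are illustrative):
81 | #
82 | #   answer: str = global_search(graph, "What are the key findings across documents?")
83 | #   filtered: str = global_search(graph, "Summarize this report", doc_filter=[doc_id])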
--------------------------------------------------------------------------------
/eschergraph/graph/base.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Optional
4 | from typing import TYPE_CHECKING
5 | from uuid import UUID
6 | from uuid import uuid4
7 |
8 | from attrs import define
9 | from attrs import field
10 | from attrs import fields_dict
11 |
12 | from eschergraph.graph.loading import LoadState
13 | from eschergraph.persistence import Repository
14 |
15 | # To prevent circular import errors
16 | if TYPE_CHECKING:
17 | from eschergraph.persistence import Metadata
18 |
19 |
20 | @define
21 | class EscherBase:
22 | """The base class for objects in the package that need to be persisted."""
23 |
24 | id: UUID = field(factory=uuid4, metadata={"group": LoadState.REFERENCE})
25 | _metadata: Optional[set[Metadata]] = field(
26 | default=None, hash=False, metadata={"group": LoadState.CORE}
27 | )
28 | _loadstate: LoadState = field(default=LoadState.REFERENCE)
29 | """The attribute that keeps track of the loading state of a Node."""
30 | repository: Repository = field(kw_only=True)
31 |
32 | # Type annotation for the dynamically added properties in the child classes
33 | metadata: set[Metadata] = field(init=False)
34 |
35 | def _check_loadstate(self, attr_name: str) -> None:
36 | """Check if the attribute has been loaded by the current loadstate.
37 |
38 | If not enough has been loaded, then load more instance data from the repository.
39 |
40 | Args:
41 | attr_name (str): The name of the attribute that starts with an underscore.
42 | """
43 | required_loadstate: LoadState = fields_dict(self.__class__)[attr_name].metadata[
44 | "group"
45 | ]
46 |
47 | # Load more instance data from the repository if load state is too small
48 | if self.loadstate.value < required_loadstate.value:
49 | self.repository.load(self, loadstate=required_loadstate)
50 | self._loadstate = required_loadstate
51 |
52 | @property
53 | def loadstate(self) -> LoadState:
54 | """The getter for the loadstate of an EscherGraph object.
55 |
56 | Returns:
57 |       The object's loadstate.
59 | """
60 | return self._loadstate
61 |
62 | @loadstate.setter
63 | def loadstate(self, loadstate: LoadState) -> None:
64 | """The setter for the loadstate of the EscherGraph base object.
65 |
66 |     We use a custom setter because we need to make sure that the value of the loadstate
67 |     reflects the attributes that are loaded. In addition, the loadstate cannot yet decrease,
68 |     as we are not yet removing attributes from a class.
69 |
70 | Args:
71 | loadstate (LoadState): The loadstate to set and the state in which the object should
72 | be loaded.
73 | """
74 | # Do nothing if this decreases the loadstate
75 | if loadstate.value <= self._loadstate.value:
76 | return
77 |
78 | self.repository.load(self, loadstate=loadstate)
79 | self._loadstate = loadstate
80 |
81 | def __hash__(self) -> int:
82 | """The hash method for an EscherBase object.
83 |
84 | It only uses the id.
85 |
86 | Returns:
87 | The integer hash value.
88 | """
89 | return hash(self.id)
90 |
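91 | # A sketch of how a child class typically exposes a lazily-loaded attribute via
92 | # _check_loadstate (illustrative; the real getters and setters are added
93 | # dynamically by the @loading_getter_setter decorator in
94 | # eschergraph/graph/getter_setter.py):
95 | #
96 | #   @property
97 | #   def metadata(self) -> set[Metadata]:
98 | #     self._check_loadstate("_metadata")
99 | #     return self._metadata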
--------------------------------------------------------------------------------
/tests_integration/chunk_optimized.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import time
4 | from typing import cast
5 |
6 | from eschergraph.agents.jinja_helper import process_template
7 | from eschergraph.agents.providers.jina import JinaReranker
8 | from eschergraph.agents.providers.openai import OpenAIModel
9 | from eschergraph.agents.providers.openai import OpenAIProvider
10 | from eschergraph.builder.build_log import BuildLog
11 | from eschergraph.builder.build_log import NodeEdgeExt
12 | from eschergraph.builder.build_pipeline import BuildPipeline
13 | from eschergraph.builder.models import Chunk
14 | from eschergraph.builder.reader.reader import Reader
15 | from eschergraph.config import JSON_BUILD
16 | from eschergraph.config import JSON_PROPERTY
17 | from eschergraph.graph.graph import Graph
18 | from eschergraph.persistence.metadata import Metadata
19 |
20 |
21 | def chunk_optimizer():
22 | # The temporary directory (clean run for each test)
23 |
24 | builder = BuildPipeline(
25 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI), reranker=JinaReranker()
26 | )
27 | test_file: str = "test_files/test_file_2.pdf"
28 |
29 | chunks: list[Chunk] = Reader(file_location=test_file, optimal_tokens=400).parse()
30 |
31 | for i in range(2):
32 | chunk = chunks[i]
33 | prompt_formatted: str = process_template(JSON_BUILD, {"input_text": chunk.text})
34 |
35 | answer = builder.model.get_json_response(prompt=prompt_formatted)
36 | json_nodes_edges: NodeEdgeExt = cast(NodeEdgeExt, answer)
37 | metadata: Metadata = Metadata(document_id=chunk.doc_id, chunk_id=chunk.chunk_id)
38 | log = BuildLog(
39 | chunk_text=chunk.text,
40 | metadata=metadata,
41 | nodes=json_nodes_edges["entities"],
42 | edges=json_nodes_edges["relationships"],
43 | )
44 | # node properties
45 | node_names: list[str] = [node["name"] for node in log.nodes]
46 | if not node_names:
47 | return
48 |
49 |     prompt_formatted = process_template(
50 | JSON_PROPERTY,
51 | {
52 | "current_nodes": ", ".join(node_names),
53 | "input_text": log.chunk_text,
54 | },
55 | )
56 | properties: dict[str, list[dict[str, list[str]]]] = builder.model.get_json_response(
57 | prompt=prompt_formatted
58 | )
59 |
60 | print("TEXT")
61 | print(chunk.text)
62 | print("EXTRACT")
63 | print(node_names)
64 | print(properties)
65 | print("EDGES")
66 | for e in log.edges:
67 | print(e)
68 |
69 |
70 | TEST_FILE_2 = "test_files/test_file_2.pdf"
71 |
72 |
73 | def search_check():
74 | # Set up all the graph dependencies
75 | graph_name: str = "eschergraph4"
76 |
77 | graph: Graph = Graph(
78 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI),
79 | name=graph_name,
80 | )
81 | # graph.build(files=TEST_FILE_2)
82 |
83 | query = "Who is Mahmood Sher-Jan?"
84 | r = graph.search(query)
85 | print(r)
86 |
87 |
88 | def test_search_graph() -> None:
89 | """Tests the search functionality of a Graph object."""
90 | t = time.time()
91 | openai_client = OpenAIProvider(model=OpenAIModel.GPT_4o_MINI)
92 | reranker_client = JinaReranker()
93 | graph: Graph = Graph(name="my graph", model=openai_client, reranker=reranker_client)
94 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_tokens_type_trainer/token_type_trainer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | from tqdm import tqdm
7 |
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token import (
9 | PdfToken,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
12 | TokenType,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.pdf_trainer import (
15 | PdfTrainer,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.token_features import (
18 | TokenFeatures,
19 | )
20 |
21 |
22 | class TokenTypeTrainer(PdfTrainer):
23 | def get_model_input(self) -> np.ndarray:
24 | features_rows = []
25 |
26 |     context_size = self.model_configuration.context_size
27 | for token_features, page in self.loop_token_features():
28 | page_tokens = [
29 | self.get_padding_token(segment_number=i - 999999, page_number=page.page_number)
30 |         for i in range(context_size)
31 | ]
32 | page_tokens += page.tokens
33 | page_tokens += [
34 | self.get_padding_token(segment_number=999999 + i, page_number=page.page_number)
35 |         for i in range(context_size)
36 | ]
37 |
38 |       tokens_indexes = range(context_size, len(page_tokens) - context_size)
39 | page_features = [
40 | self.get_context_features(token_features, page_tokens, i)
41 | for i in tokens_indexes
42 | ]
43 | features_rows.extend(page_features)
44 |
45 | return self.features_rows_to_x(features_rows)
46 |
47 | def loop_token_features(self):
48 | for pdf_features in tqdm(self.pdfs_features):
49 | token_features = TokenFeatures(pdf_features)
50 |
51 | for page in pdf_features.pages:
52 | if not page.tokens:
53 | continue
54 |
55 | yield token_features, page
56 |
57 | def get_context_features(
58 | self, token_features: TokenFeatures, page_tokens: list[PdfToken], token_index: int
59 | ):
60 | token_row_features = []
61 | first_token_from_context = token_index - self.model_configuration.context_size
62 | for i in range(self.model_configuration.context_size * 2):
63 | first_token = page_tokens[first_token_from_context + i]
64 | second_token = page_tokens[first_token_from_context + i + 1]
65 | token_row_features.extend(
66 | token_features.get_features(first_token, second_token, page_tokens)
67 | )
68 |
69 | return token_row_features
70 |
71 |   def predict(self, model_path: str | Path | None = None):
72 | predictions = super().predict(model_path)
73 | predictions_assigned = 0
74 | for token_features, page in self.loop_token_features():
75 | for token, prediction in zip(
76 | page.tokens,
77 | predictions[predictions_assigned : predictions_assigned + len(page.tokens)],
78 | ):
79 | token.prediction = int(np.argmax(prediction))
80 |
81 | predictions_assigned += len(page.tokens)
82 |
83 |   def set_token_types(self, model_path: str | Path | None = None):
84 | self.predict(model_path)
85 | for token in self.loop_tokens():
86 | token.token_type = TokenType.from_index(token.prediction)
87 |
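88 | # Sketch of the token layout built in get_model_input for context_size = 2:
89 | #
90 | #   [pad, pad, t0, t1, ..., tN, pad, pad]
91 | #
92 | # Every real token then has context_size neighbours on each side when its
93 | # context features are collected by get_context_features.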
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/community_prompt.jinja:
--------------------------------------------------------------------------------
1 |
2 | You are an AI assistant that helps a human analyst to perform general information discovery. Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.
3 |
4 | # Goal
5 | Write a comprehensive report of a community, given a list of entities that belong to the community as well as their relationships and optional properties. The report will be used to inform decision-makers about information associated with the community and their potential impact. The content of this report includes an overview of the community's key entities, their legal compliance, technical capabilities, reputation, and noteworthy claims.
6 |
7 | # Report Structure
8 |
9 | The report should include the following sections:
10 |
11 | - TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title.
12 | - SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities.
13 | - DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules below. Be comprehensive.
14 |
15 | Return output as a well-formed JSON-formatted string with the following format:
16 | {
17 | "title": ,
18 | "summary": ,
19 | "findings": [
20 | {
21 | "summary":,
22 | "explanation":
23 | },
24 | {
25 | "summary":,
26 | "explanation":
27 | }
28 | ]
29 | }
30 |
31 | # Grounding Rules
32 |
33 | Do not include information with no supporting evidence in the data.
34 |
35 |
36 | # Real Data
37 |
38 | Use the following text for your answer. Do not make anything up in your answer.
39 |
40 | Edges:
41 | {{ relationships }}
42 |
43 | Properties:
44 | {{ properties }}
45 |
46 | The report should include the following sections:
47 |
48 | - TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title.
49 | - SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities.
50 | - DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules above. Be comprehensive.
51 |
52 | Return output as a well-formed, JSON-only formatted string with the following format:
53 | {
54 | "title": ,
55 | "summary": ,
56 | "findings": [
57 | {
58 | "summary":,
59 | "explanation":
60 | },
61 | {
62 | "summary":,
63 | "explanation":
64 | }
65 | ]
66 | }
67 |
68 |
69 | Output:
70 |
--------------------------------------------------------------------------------
/eschergraph/visualization/visualizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING
4 | from uuid import UUID
5 |
6 | import seaborn as sns
7 | from pyvis.network import Network
8 |
9 | from eschergraph.graph import Edge
10 | from eschergraph.graph.community_alg import get_leidenalg_communities
11 |
12 | if TYPE_CHECKING:
13 | from eschergraph.graph import Graph
14 | from eschergraph.graph import Node
15 |
16 |
17 | # TODO: add level and graph name to the visualization
18 | class Visualizer:
19 | """The visualizer for EscherGraphs."""
20 |
21 | @staticmethod
22 | def visualize_graph(
23 | graph: Graph, level: int = 0, save_location: str = "graph_visual.html"
24 | ) -> None:
25 | """Visualize a level of a graph.
26 |
27 | Args:
28 | graph (Graph): The graph to visualize.
29 | level (int): The level of the graph that needs to be visualized.
30 | save_location (str): The location to save the generated visual.
31 | """
32 | nodes: list[Node] = graph.repository.get_all_at_level(level=level)
33 | edges: list[Edge] = [edge for node in nodes for edge in node.edges]
34 | node_ids: list[list[UUID]] = get_leidenalg_communities(nodes).partitions
35 | node_dict: dict[UUID, Node] = {node.id: node for node in nodes}
36 |
37 | # Transform the list of node_ids into a list of nodes
38 | comms: list[list[Node]] = []
39 | for comm in node_ids:
40 | comm_nodes: list[Node] = []
41 | for id in comm:
42 | comm_nodes.append(node_dict[id])
43 |
44 | comms.append(comm_nodes)
45 |
46 | Visualizer.visualize_community_graph(
47 | comms=comms, edges=edges, save_location=save_location
48 | )
49 |
50 | @staticmethod
51 | def visualize_community_graph(
52 | comms: list[list[Node]],
53 | edges: list[Edge],
54 | save_location: str = "community_visual.html",
55 | ) -> None:
56 | """Visualize a graph of communities.
57 |
58 | Communities are provided in a list containing lists of nodes, where each
59 | list of nodes corresponds to a community.
60 |
61 | Args:
62 |       comms (list[list[Node]]): A list of communities.
63 | edges (list[Edge]): The list of edges in the community graph.
64 | save_location (str): The location to save the generated visual.
65 | """
66 | palette: list[str] = sns.color_palette("hls", len(comms)).as_hex()
67 | net = Network(
68 | notebook=False,
69 | cdn_resources="remote",
70 | height="900px",
71 | width="100%",
72 | select_menu=True,
73 | filter_menu=False,
74 | )
75 |
76 | # Map all nodes in comms to their id for lookup
77 | node_id: dict[UUID, Node] = {nd.id: nd for comm in comms for nd in comm}
78 |
79 | for idx, comm in enumerate(comms):
80 | for nd in comm:
81 | net.add_node(
82 | nd.name,
83 | label=nd.name,
84 | title=nd.description,
85 | value=len(nd.edges),
86 | color=palette[idx],
87 | )
88 |
89 | for edge in edges:
90 | net.add_edge(
91 | node_id[edge.frm.id].name, node_id[edge.to.id].name, title=edge.description
92 | )
93 |
94 | net.force_atlas_2based(central_gravity=0.015, gravity=-31)
95 | net.show_buttons(filter_=["physics"])
96 |
97 | net.show(name=save_location, notebook=False)
98 |
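99 | # A minimal usage sketch (assumes an already-built graph):
100 | #
101 | #   Visualizer.visualize_graph(graph, level=0, save_location="graph_visual.html")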
--------------------------------------------------------------------------------
/eschergraph/tools/estimator.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | class Estimator:
5 | """This is a class for estimating the cost and time to build a graph from a document."""
6 |
7 | @staticmethod
8 | def get_cost_indication(total_tokens: int, model: str) -> float:
9 | """Estimates the cost based on the number of tokens and the model used.
10 |
11 | Args:
12 | total_tokens (int): The total number of tokens.
13 | model (str): The model used for estimation ('gpt-4o' or 'gpt-4o-mini').
14 |
15 | Returns:
16 | float: The estimated cost of processing.
17 | """
18 | # Initialize variables
19 | prompt_cost: float = 0.0
20 | completion_cost: float = 0.0
21 |
22 |     # For each chunk, 2 LLM calls are performed, plus an average of 2 LLM calls per page for the node matcher.
23 |     # Some calls are also used for community building. Overall, we estimate roughly 2.5 LLM passes over the tokens.
24 | llm_calls_per_token_estimation = 2.5
25 |
26 |     # Assume that completion tokens amount to roughly a quarter of the prompt tokens
27 | if model == "gpt-4o":
28 | prompt_cost = (total_tokens / 1e6) * 5.00
29 | completion_cost = (total_tokens / 1e6) * 15.00
30 | elif model == "gpt-4o-mini":
31 | prompt_cost = (total_tokens / 1e6) * 0.150
32 | completion_cost = (total_tokens / 1e6) * 0.600
33 | else:
34 | raise ValueError("Invalid model specified.")
35 |
36 | building_cost: float = prompt_cost + (completion_cost / 4)
37 | return round(building_cost * llm_calls_per_token_estimation, 4)
38 |
39 | @staticmethod
40 | def get_time_indication(num_chunks: int, model: str) -> str:
41 | """Estimates the time required to process the document based on the number of chunks and the model used.
42 |
43 | Args:
44 | num_chunks (int): The number of chunks to process.
45 | model (str): The model used for estimation ('gpt-4o' or 'gpt-4o-mini').
46 |
47 | Returns:
48 | str: The estimated time to complete the processing, either in seconds or minutes.
49 | """
50 | # Determine average time per chunk based on model
51 | average_time_per_chunk: int = 4 if model == "gpt-4o" else 2
52 |
53 | max_workers: int = 2 # as used in ThreadPoolExecutor
54 |
55 | # If number of chunks is less than or equal to max_workers,
56 | # the time taken would be approximately the time for one chunk.
57 | if num_chunks <= max_workers:
58 | estimated_time = average_time_per_chunk
59 | else:
60 | # Calculate the time for full batches and any remaining chunks
61 | full_batches = num_chunks // max_workers
62 | remaining_chunks = num_chunks % max_workers
63 |
64 | estimated_time = full_batches * average_time_per_chunk
65 | if remaining_chunks > 0:
66 | estimated_time += average_time_per_chunk
67 |
68 |     node_matcher_delay = num_chunks * average_time_per_chunk
69 | community_building_delay = num_chunks * average_time_per_chunk
70 |
71 |     estimated_time = estimated_time + node_matcher_delay + community_building_delay
72 |
73 | # If the estimated time is more than 60 seconds, return time in minutes
74 | if estimated_time > 60:
75 | minutes = round(estimated_time / 60, 3)
76 | return f"{minutes} minute{'s' if minutes > 1 else ''}"
77 | else:
78 | return f"{round(estimated_time, 3)} seconds"
79 |
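80 | # A worked example of the cost formula above for 100,000 tokens on gpt-4o-mini:
81 | #
82 | #   prompt_cost     = (100_000 / 1e6) * 0.150 = 0.015
83 | #   completion_cost = (100_000 / 1e6) * 0.600 = 0.060
84 | #   building_cost   = 0.015 + 0.060 / 4       = 0.030
85 | #   estimate        = round(0.030 * 2.5, 4)   = 0.075
86 | #
87 | #   Estimator.get_cost_indication(100_000, "gpt-4o-mini")  # -> 0.075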
--------------------------------------------------------------------------------
/tests/agents/test_jinja_prompt_helper.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 | from assertpy import assert_that
5 | from jinja2 import Environment
6 | from jinja2 import FileSystemLoader
7 | from jinja2 import select_autoescape
8 |
9 | from eschergraph.agents.jinja_helper import extract_variables
10 | from eschergraph.agents.jinja_helper import process_template
11 | from eschergraph.exceptions import PromptFormattingException
12 |
13 | json_build_template: str = """-Goal-
14 | Extract all relevant information from the provided text into a graph representation containing entities and relations.
15 | The most important part is that you try to represent all the information in the provided text in a structured format!
16 |
17 | -Steps-
18 | 1. Identify all named entities in singular form. For people please include the entire name. Entities can also be technologies.
19 | For each identified entity, extract the following information:
20 | - entity_name: Name of the entity
21 | - entity_description: Comprehensive description of the entity's attributes and activities
22 |
23 | Format each entity output as a JSON entry with the following format:
24 |
25 | {"name": , "description": }
26 |
27 | 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
28 | For each pair of related entities, extract the following information:
29 | - source_entity: name of the source entity, as identified in step 1
30 | - target_entity: name of the target entity, as identified in step 1
31 | - relationship_description: explanation as to why you think the source entity and the target entity are related to each other
32 |
33 | Format each relationship as a JSON entry with the following format:
34 |
35 | {"source": , "target": , "relationship": }
36 |
37 | 3. Return output in English as a single list of all JSON entities and relationships identified in steps 1 and 2.
38 | return the JSON like this:
39 |
40 | {
41 | 'entities': [{"name": , "description": }, {"name": , "description": }],
42 | 'relationships':[{"source": , "target": , "relationship": }, and more]
43 | }
44 |
45 | However, only extract entities that are specific so avoid extracting entities like CEO or employee, but instead
46 | extract only named entities.
47 |
48 | -Real Data-
49 | ######################
50 | text: This is a test
51 | ######################
52 | output:"""
53 |
54 | input_text: str = "This is a test"
55 |
56 |
57 | def test_templating_function_json_build_empty_data() -> None:
58 | with pytest.raises(PromptFormattingException):
59 | process_template(template_file="json_build.jinja", data={})
60 |
61 |
62 | def test_templating_function_json_property_missing_data() -> None:
63 | with pytest.raises(PromptFormattingException):
64 | process_template(
65 | template_file="json_property.jinja", data={"input_text": input_text}
66 | )
67 |
68 |
69 | def test_extract_variables() -> None:
70 | jinja_env: Environment = Environment(
71 | loader=FileSystemLoader(searchpath="./eschergraph/agents/prompts"),
72 | autoescape=select_autoescape(),
73 | )
74 |
75 | assert_that(
76 | extract_variables("json_property.jinja", jinja_env)
77 | ).does_not_contain_duplicates().contains_only("input_text", "current_nodes")
78 |
--------------------------------------------------------------------------------
/tests/graph/search/test_global_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import patch
4 | from uuid import UUID
5 | from uuid import uuid4
6 |
7 | from eschergraph.config import MAIN_COLLECTION
8 | from eschergraph.graph.graph import Graph
9 | from eschergraph.graph.search.global_search import AttributeSearch
10 | from eschergraph.graph.search.global_search import get_relevant_extractions
11 | from eschergraph.graph.search.global_search import global_search
12 |
13 |
14 | def test_global_search(graph_unit: Graph) -> None:
15 | query = "test query"
16 | context = "Attribute 1\nAttribute 2"
17 | full_prompt = "Processed template with context and query"
18 |
19 | with patch(
20 | "eschergraph.graph.search.global_search.get_relevant_extractions"
21 | ) as mock_get_extractions:
22 | with patch(
23 | "eschergraph.graph.search.global_search.process_template"
24 | ) as mock_process_template:
25 | mock_get_extractions.return_value = [
26 | AttributeSearch(text="Attribute 1", metadata=None, parent_nodes=[""]),
27 | AttributeSearch(text="Attribute 2", metadata=None, parent_nodes=[""]),
28 | ]
29 | mock_process_template.return_value = full_prompt
30 | graph_unit.model.get_plain_response.return_value = "Generated answer"
31 |
32 | result = global_search(graph_unit, query)
33 |
34 | assert result == "Generated answer"
35 | mock_get_extractions.assert_called_once_with(graph_unit, query, None)
36 | mock_process_template.assert_called_once_with(
37 | "search/global_search_context.jinja", {"CONTEXT": context, "QUERY": query}
38 | )
39 | graph_unit.model.get_plain_response.assert_called_once_with(full_prompt)
40 |
41 |
42 | def test_global_search_get_relevant_extractions(graph_unit: Graph) -> None:
43 | prompt = "test prompt"
44 | search_results = [
45 | {"chunk": "Chunk 1", "metadata": {"level": 1}},
46 | {"chunk": "Chunk 2", "metadata": {"level": 1}},
47 | {
48 | "chunk": 123,
49 | "metadata": {"level": 1},
50 | }, # This should be filtered out as it's not a string
51 | ]
52 | reranked_results = [
53 | AttributeSearch(text="Reranked Chunk 1", metadata=None, parent_nodes=[""]),
54 | AttributeSearch(text="Reranked Chunk 2", metadata=None, parent_nodes=[""]),
55 | ]
56 |
57 | with patch(
58 | "eschergraph.graph.search.global_search.rerank_and_filter_attributes",
59 | return_value=reranked_results,
60 | ):
61 | graph_unit.vector_db.search.return_value = search_results
62 |
63 | get_relevant_extractions(graph_unit, prompt)
64 |
65 | graph_unit.vector_db.search.assert_called_once_with(
66 | query=prompt, top_n=15, metadata={"level": 1}, collection_name=MAIN_COLLECTION
67 | )
68 |
69 |
70 | def test_global_search_with_doc_filter(graph_unit: Graph) -> None:
71 | doc_filter: list[UUID] = [uuid4() for _ in range(10)]
72 |
73 | global_search(graph_unit, "test_query", doc_filter=doc_filter)
74 |
75 | graph_unit.vector_db.search.assert_called_once_with(
76 | query="test_query",
77 | top_n=15,
78 | metadata={"level": 1, "document_id": [str(id) for id in doc_filter]},
79 | collection_name=MAIN_COLLECTION,
80 | )
81 |
82 |
83 | def test_global_search_without_doc_filter(graph_unit: Graph) -> None:
84 | global_search(graph_unit, "test_query")
85 |
86 | graph_unit.vector_db.search.assert_called_once_with(
87 | query="test_query", top_n=15, metadata={"level": 1}, collection_name=MAIN_COLLECTION
88 | )
89 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/fast_trainer/paragraph_extractor_trainer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | from eschergraph.builder.reader.pdf_document_layout_analysis.fast_trainer.paragraph import (
6 | Paragraph,
7 | )
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token import (
9 | PdfToken,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_segment import (
12 | PdfSegment,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
15 | TokenType,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.token_features import (
18 | TokenFeatures,
19 | )
20 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.token_type_trainer import (
21 | TokenTypeTrainer,
22 | )
23 |
24 |
25 | class ParagraphExtractorTrainer(TokenTypeTrainer):
26 | def get_context_features(
27 | self, token_features: TokenFeatures, page_tokens: list[PdfToken], token_index: int
28 | ):
29 |     token_row_features = []
30 | first_token_from_context = token_index - self.model_configuration.context_size
31 | for i in range(self.model_configuration.context_size * 2):
32 | first_token = page_tokens[first_token_from_context + i]
33 | second_token = page_tokens[first_token_from_context + i + 1]
34 | features = token_features.get_features(first_token, second_token, page_tokens)
35 | features += self.get_paragraph_extraction_features(first_token, second_token)
36 | token_row_features.extend(features)
37 |
38 | return token_row_features
39 |
40 | @staticmethod
41 | def get_paragraph_extraction_features(
42 | first_token: PdfToken, second_token: PdfToken
43 | ) -> list[int]:
44 | one_hot_token_type_1 = [
45 | 1 if token_type == first_token.token_type else 0 for token_type in TokenType
46 | ]
47 | one_hot_token_type_2 = [
48 | 1 if token_type == second_token.token_type else 0 for token_type in TokenType
49 | ]
50 | return one_hot_token_type_1 + one_hot_token_type_2
51 |
52 | def loop_token_next_token(self):
53 | for pdf_features in self.pdfs_features:
54 | for page in pdf_features.pages:
55 | if not page.tokens:
56 | continue
57 | if len(page.tokens) == 1:
58 | yield page, page.tokens[0], page.tokens[0]
59 | for token, next_token in zip(page.tokens, page.tokens[1:]):
60 | yield page, token, next_token
61 |
62 | def get_pdf_segments(
63 | self, paragraph_extractor_model_path: str | Path
64 | ) -> list[PdfSegment]:
65 | paragraphs = self.get_paragraphs(paragraph_extractor_model_path)
66 | pdf_segments = [
67 | PdfSegment.from_pdf_tokens(paragraph.tokens, paragraph.pdf_name)
68 | for paragraph in paragraphs
69 | ]
70 |
71 | return pdf_segments
72 |
73 |   def get_paragraphs(self, paragraph_extractor_model_path: str | Path) -> list[Paragraph]:
74 | self.predict(paragraph_extractor_model_path)
75 | paragraphs: list[Paragraph] = []
76 | last_page = None
77 | for page, token, next_token in self.loop_token_next_token():
78 | if last_page != page:
79 | last_page = page
80 | paragraphs.append(Paragraph([token], page.pdf_name))
81 | if token == next_token:
82 | continue
83 | if token.prediction:
84 | paragraphs[-1].add_token(next_token)
85 | continue
86 | paragraphs.append(Paragraph([next_token], page.pdf_name))
87 |
88 | return paragraphs
89 |
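90 | # How get_paragraphs groups tokens (derived from the loop above): a token
91 | # prediction of 1 means "the next token belongs to the same paragraph", so a
92 | # new Paragraph is only started when the prediction is 0 or when a new page
93 | # begins.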
--------------------------------------------------------------------------------
/eschergraph/graph/edge.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Optional
4 | from typing import TYPE_CHECKING
5 |
6 | from attrs import define
7 | from attrs import field
8 |
9 | from eschergraph.exceptions import EdgeCreationException
10 | from eschergraph.exceptions import RepositoryException
11 | from eschergraph.graph.base import EscherBase
12 | from eschergraph.graph.getter_setter import loading_getter_setter
13 | from eschergraph.graph.loading import LoadState
14 | from eschergraph.persistence import Metadata
15 |
16 | # Prevent circular import errors
17 | if TYPE_CHECKING:
18 | from eschergraph.graph.node import Node
19 |
20 |
21 | @loading_getter_setter
22 | @define
23 | class Edge(EscherBase):
24 | """The edge in an EscherGraph.
25 |
26 |   Although we specify from and to nodes, edges are actually undirected,
27 | as they are richly descriptive. This is also reflected in the equals method.
28 |
29 | Note that the loadstate for an Edge is directly passed on to the two nodes that are
30 | connected by the edge.
31 | """
32 |
33 | frm: Node = field(kw_only=True)
34 | to: Node = field(kw_only=True)
35 | _description: Optional[str] = field(default=None, metadata={"group": LoadState.CORE})
36 |
37 | # The type annotation for the dynamically added property
38 | description: str = field(init=False)
39 |
40 | @classmethod
41 | def create(
42 | cls,
43 | frm: Node,
44 | to: Node,
45 | description: str,
46 | metadata: Optional[set[Metadata]] = None,
47 | ) -> Edge:
48 | """The method that allows for the creation of a new edge.
49 |
50 |     Note that edges do have a to and a from node, but they
51 |     are undirected. This is also reflected in the equals method.
52 |
53 | Args:
54 | frm (Node): The from node in the edge.
55 | to (Node): The to node in the edge.
56 | description (str): A rich description of the relation.
57 | metadata (Optional[set[Metadata]]): The optional metadata for the edge.
58 |
59 | Returns:
60 | A new edge.
61 | """
62 | if frm.id == to.id:
63 | raise EdgeCreationException(
64 | "An edge should be created between two different nodes."
65 | )
66 |
67 |     if frm.repository is not to.repository:
68 | raise RepositoryException(
69 | "The two nodes that are connected by an edge need to have the same repository."
70 | )
71 |
72 | edge: Edge = cls(
73 | frm=frm,
74 | to=to,
75 | description=description,
76 | repository=frm.repository,
77 | metadata=metadata if metadata else set(),
78 | loadstate=LoadState.FULL,
79 | )
80 |
81 | # Add the edge to the nodes
82 | frm.edges.add(edge)
83 | to.edges.add(edge)
84 |
85 | return edge
86 |
87 | def __eq__(self, other: object) -> bool:
88 | """The equals method for an edge.
89 |
90 | Two edges are equal if they have the same description and run between the same nodes.
91 |
92 | Args:
93 | other (object): The object to compare the edge to.
94 |
95 | Returns:
96 | True if equal and false otherwise.
97 | """
98 | if isinstance(other, Edge):
99 | return {self.frm.id, self.to.id} == {
100 | other.frm.id,
101 | other.to.id,
102 | } and self.description == other.description
103 |
104 | return False
105 |
106 | def __hash__(self) -> int:
107 | """The hash function for an edge.
108 |
109 | Returns:
110 | The integer hash value for an edge.
111 | """
112 | return hash((self.id, self.frm.id, self.to.id, self.description))
113 |
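114 | # Because equality is undirected, an edge created with frm=a, to=b compares
115 | # equal to one created with frm=b, to=a, provided the descriptions match.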
--------------------------------------------------------------------------------
/tests/builder/test_building_tools.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import MagicMock
4 | from uuid import uuid4
5 |
6 | from faker import Faker
7 |
8 | from eschergraph.builder.building_tools import BuildingTools
9 | from eschergraph.builder.models import Chunk
10 | from eschergraph.builder.models import ProcessedFile
11 | from eschergraph.builder.reader.reader import Reader
12 |
13 | faker: Faker = Faker()
14 |
15 |
16 | def test_check_node_ext_pass() -> None:
17 | valid_node_ext = {"name": "Node1", "description": "A sample node"}
18 | assert BuildingTools.check_node_ext(valid_node_ext) == True
19 |
20 |
21 | def test_check_node_ext_fail() -> None:
22 | invalid_node_ext = {
23 | "name": "Node1",
24 | "desc": "A sample node", # Incorrect key
25 | }
26 | assert BuildingTools.check_node_ext(invalid_node_ext) == False
27 |
28 |
29 | def test_check_edge_ext_pass() -> None:
30 | valid_edge_ext = {
31 | "source": "Node1",
32 | "target": "Node2",
33 | "relationship": "connected_to",
34 | }
35 | assert BuildingTools.check_edge_ext(valid_edge_ext) == True
36 |
37 |
38 | def test_check_edge_ext_fail() -> None:
39 | invalid_edge_ext = {
40 | "source": "Node1",
41 | "target": "Node2",
42 | "relation": "connected_to", # Incorrect key
43 | }
44 | assert BuildingTools.check_edge_ext(invalid_edge_ext) == False
45 |
46 |
47 | def test_check_property_ext_pass() -> None:
48 | valid_property_ext = {"entity_name": "Entity1", "properties": ["prop1", "prop2"]}
49 | assert BuildingTools.check_property_ext(valid_property_ext) == True
50 |
51 |
52 | def test_check_property_ext_fail() -> None:
53 | invalid_property_ext = {
54 | "entity_name": "Entity1",
55 | "properties": "prop1, prop2", # Incorrect type
56 | }
57 | assert BuildingTools.check_property_ext(invalid_property_ext) == False
58 |
59 |
60 | def test_check_node_edge_ext_pass() -> None:
61 | valid_node_edge_ext = {
62 | "entities": [{"name": "Node1", "description": "A sample node"}],
63 | "relationships": [
64 | {"source": "Node1", "target": "Node2", "relationship": "connected_to"}
65 | ],
66 | }
67 | assert BuildingTools.check_node_edge_ext(valid_node_edge_ext) == True
68 |
69 |
70 | def test_check_node_edge_ext_fail() -> None:
71 | invalid_node_edge_ext = {
72 | "entities": [
73 | {"name": "Node1", "desc": "A sample node"} # Incorrect key
74 | ],
75 | "relationships": [
76 | {"source": "Node1", "target": "Node2", "rel": "connected_to"} # Incorrect key
77 | ],
78 | }
79 | assert BuildingTools.check_node_edge_ext(invalid_node_edge_ext) == False
80 |
81 |
82 | def test_process_files_empty() -> None:
83 | assert BuildingTools.process_files(files=[], multi_modal=False) == []
84 |
85 |
86 | def test_process_files_single_file() -> None:
87 | reader_mock: MagicMock = MagicMock(spec=Reader)
88 | # Set the mock to return itself for initialization
89 | reader_mock.return_value = reader_mock
90 | reader_mock.chunks = [
91 | Chunk(text=text, chunk_id=idx, doc_id=uuid4(), page_num=idx)
92 | for idx, text in enumerate(faker.texts(nb_texts=15, max_nb_chars=80))
93 | ]
94 | reader_mock.total_tokens = 10000
95 | reader_mock.full_text = faker.text(max_nb_chars=1200)
96 | reader_mock.visual_elements = None
97 |
98 | processed: list[ProcessedFile] = BuildingTools.process_files(
99 | files=["./test_files/test.pdf"], multi_modal=False, reader_impl=reader_mock
100 | )
101 | processed_file: ProcessedFile = processed[0]
102 |
103 | assert len(processed) == 1
104 | assert processed_file.chunks == reader_mock.chunks
105 | assert processed_file.full_text == reader_mock.full_text
106 | assert processed_file.visual_elements is None
107 | assert processed_file.document.token_num == reader_mock.total_tokens
108 | assert processed_file.document.chunk_num == 15
109 |
--------------------------------------------------------------------------------
/eschergraph/tools/prepare_sync_data.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from uuid import UUID
4 |
5 | from eschergraph.graph.edge import Edge
6 | from eschergraph.graph.node import Node
7 | from eschergraph.graph.property import Property
8 | from eschergraph.persistence.change_log import Action
9 | from eschergraph.persistence.change_log import ChangeLog
10 | from eschergraph.persistence.repository import Repository
11 |
12 |
13 | def prepare_sync_data(
14 | repository: Repository,
15 | ) -> tuple[
16 | list[tuple[UUID, str, dict[str, str | int]]],
17 | list[UUID],
18 | ]:
19 | """Prepares data for synchronization with the vector database.
20 |
21 | Args:
22 | repository (Repository): The graph's repository.
23 |
24 | Returns:
25 |     tuple: A tuple containing a list of (id, document string, metadata) tuples to create and a list of ids to delete.
26 | """
27 | change_logs: list[ChangeLog] = repository.get_change_log()
28 |
29 | # Map each object id to its change_logs
30 | objects_logs: dict[UUID, list[ChangeLog]] = {log.id: [] for log in change_logs}
31 | for log in change_logs:
32 | objects_logs[log.id].append(log)
33 |
34 | ids_to_create, ids_to_delete = _get_actions_for_objects(objects_logs)
35 | create_main: list[tuple[UUID, str, dict[str, str | int]]] = []
36 |
37 | for id in ids_to_create:
38 | cur_log: ChangeLog = objects_logs[id][0]
39 | # Prepare metadata based on log type
40 | if cur_log.type == Node:
41 | node: Node | None = repository.get_node_by_id(id)
42 | if not node:
43 | continue
44 |
45 |       # We add the document_id to all the objects
46 | md_node: dict[str, str | int] = {
47 | "level": cur_log.level,
48 | "type": "node",
49 | "document_id": _get_node_document_id(node),
50 | }
51 | node_string = node.name + ", " + node.description
52 | create_main.append((id, node_string, md_node))
53 |
54 | elif cur_log.type == Edge:
55 | edge: Edge | None = repository.get_edge_by_id(id)
56 | if not edge:
57 | continue
58 | md_edge: dict[str, str | int] = {
59 | "level": cur_log.level,
60 | "type": "edge",
61 | "document_id": _get_node_document_id(edge.frm),
62 | }
63 | create_main.append((id, edge.description, md_edge))
64 |
65 | elif cur_log.type == Property:
66 | property: Property | None = repository.get_property_by_id(id)
67 | if not property:
68 | continue
69 | md_prop: dict[str, str | int] = {
70 | "level": cur_log.level,
71 | "type": "property",
72 | "document_id": _get_node_document_id(property.node),
73 | }
74 | property_string = property.node.name + ", " + property.description
75 | create_main.append((id, property_string, md_prop))
76 |
77 | return create_main, ids_to_delete
78 |
79 |
80 | def _get_actions_for_objects(
81 | objects_logs: dict[UUID, list[ChangeLog]],
82 | ) -> tuple[list[UUID], list[UUID]]:
83 | ids_to_delete: list[UUID] = []
84 | ids_to_create: list[UUID] = []
85 | for id, object_logs in objects_logs.items():
86 | # Create a set of actions for the object
87 | actions: set[Action] = {log.action for log in object_logs}
88 | if not Action.CREATE in actions:
89 | ids_to_delete.append(id)
90 | if not Action.DELETE in actions:
91 | ids_to_create.append(id)
92 |
93 | return ids_to_create, ids_to_delete
94 |
95 |
96 | def _get_node_document_id(node: Node) -> str:
97 | """Returns the UUID of the node's document_id in string format.
98 |
99 | Currently, all graph objects do still exclusively belong to a single
100 | document as we have not added inter-document merging or
101 | edge finding. As soon as this is added, this logic will change.
102 |
103 | Args:
104 | node (Node): The node to get the document_id for.
105 |
106 | Returns:
107 | The UUID as a string.
108 | """
109 | cur_level: int = node.level
110 | cur_node: Node = node
111 |
112 | # Get the metadata on a level 0 child node
113 | while cur_level > 0:
114 | cur_node = cur_node.child_nodes[0]
115 | cur_level -= 1
116 |
117 | return str(next(iter(cur_node.metadata)).document_id)
118 |
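119 | # How _get_actions_for_objects maps an object's accumulated change logs to
120 | # sync actions (derived from the set logic above):
121 | #
122 | #   {CREATE}         -> create (new object, embed it)
123 | #   {UPDATE}         -> delete + create (re-embed the updated object)
124 | #   {CREATE, DELETE} -> neither (created and removed before this sync)
125 | #   {DELETE}         -> delete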
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | node_modules
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | # The default graph save location
165 | **/eschergraph_storage/**
166 |
167 | # The visualization files
168 | community_visual.html
169 |
170 | # Downloaded parser models
171 | **/fast_models/**
172 |
173 | # Packaged binaries for the code
174 | **/bins/**
--------------------------------------------------------------------------------
/tests/graph/search/test_quick_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import random
4 | from unittest.mock import MagicMock
5 | from unittest.mock import patch
6 | from uuid import UUID
7 | from uuid import uuid4
8 |
9 | from eschergraph.agents.jinja_helper import process_template
10 | from eschergraph.agents.reranker import RerankerResult
11 | from eschergraph.config import MAIN_COLLECTION
12 | from eschergraph.graph.graph import Graph
13 | from eschergraph.graph.search.quick_search import quick_search
14 | from eschergraph.graph.search.quick_search import RAGAnswer
15 | from eschergraph.graph.search.quick_search import rerank_and_filter_attributes
16 | from eschergraph.persistence.vector_db.vector_search_result import VectorSearchResult
17 | from tests.persistence.vector_db.help import generate_vector_search_results
18 |
19 | RAG_SEARCH = "search/question_with_context.jinja"
20 |
21 |
22 | def test_quick_search_empty_query(graph_unit: Graph) -> None:
23 |   rag_answer: RAGAnswer = quick_search(graph_unit, "")
24 |   assert rag_answer.answer == "please ask a question"
25 |
26 |
27 | def test_quick_search_no_attributes_found(graph_unit: Graph) -> None:
28 | with patch(
29 | "eschergraph.graph.search.quick_search.get_attributes_search", return_value=[]
30 | ):
31 | graph_unit.model.get_plain_response.return_value = "No results found"
32 |     rag_answer: RAGAnswer = quick_search(graph_unit, "test query")
33 |     assert rag_answer.answer == "No results found"
34 | graph_unit.model.get_plain_response.assert_called_with(
35 | process_template(
36 | RAG_SEARCH,
37 | data={
38 | "CONTEXT": "Nothing found in the graph regarding this question!",
39 | "QUERY": "test query",
40 | },
41 | )
42 | )
43 |
44 |
45 | def test_rerank_and_filter_no_attributes(graph_unit: Graph) -> None:
46 | mock_filter_attributes: MagicMock = MagicMock()
47 | with patch(
48 | "eschergraph.graph.search.quick_search.filter_attributes", mock_filter_attributes
49 | ):
50 | graph_unit.reranker.rerank.return_value = []
51 | rerank_and_filter_attributes(graph_unit, "test query", [])
52 | graph_unit.reranker.rerank.assert_called_once_with("test query", [], top_n=0)
53 | mock_filter_attributes.assert_called_once_with(graph_unit, [], {}, 0.2)
54 |
55 |
56 | def test_rerank_and_filter_attributes(graph_unit: Graph) -> None:
57 | attributes_results: list[VectorSearchResult] = generate_vector_search_results(
58 | num_results=2
59 | )
60 | rerank_result: list[RerankerResult] = [
61 | RerankerResult(
62 | index=1, relevance_score=random.uniform(0, 1), text=attributes_results[1].chunk
63 | ),
64 | RerankerResult(
65 | index=0, relevance_score=random.uniform(0, 1), text=attributes_results[0].chunk
66 | ),
67 | ]
68 | graph_unit.reranker.rerank.return_value = rerank_result
69 | mock_filter_attributes: MagicMock = MagicMock()
70 | with patch(
71 | "eschergraph.graph.search.quick_search.filter_attributes", mock_filter_attributes
72 | ):
73 | rerank_and_filter_attributes(
74 | graph_unit, "test query", attributes_results, threshold=0.2
75 | )
76 |
77 | graph_unit.reranker.rerank.assert_called_once_with(
78 | "test query", [attributes_results[0].chunk, attributes_results[1].chunk], top_n=2
79 | )
80 | mock_filter_attributes.assert_called_once_with(
81 | graph_unit, rerank_result, {r.chunk: r for r in attributes_results}, 0.2
82 | )
83 |
84 |
85 | def test_quick_search_with_doc_filter(graph_unit: Graph) -> None:
86 | doc_filter: list[UUID] = [uuid4() for _ in range(10)]
87 |
88 | quick_search(graph_unit, "test_query", doc_filter=doc_filter)
89 |
90 | graph_unit.vector_db.search.assert_called_once_with(
91 | query="test_query",
92 | top_n=40,
93 |     metadata={"level": 0, "document_id": [str(doc_id) for doc_id in doc_filter]},
94 | collection_name=MAIN_COLLECTION,
95 | )
96 |
97 |
98 | def test_quick_search_without_doc_filter(graph_unit: Graph) -> None:
99 | quick_search(graph_unit, "test_query")
100 |
101 | graph_unit.vector_db.search.assert_called_once_with(
102 | query="test_query", top_n=40, metadata={"level": 0}, collection_name=MAIN_COLLECTION
103 | )
104 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_features/pdf_token.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from lxml.etree import ElementBase
4 |
5 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_font import (
6 | PdfFont,
7 | )
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token_context import (
9 | PdfTokenContext,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.rectangle import (
12 | Rectangle,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.label import (
15 | Label,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
18 | TokenType,
19 | )
20 |
21 |
22 | class PdfToken:
23 | def __init__(
24 | self,
25 |     page_number: int,
26 | tag_id: str,
27 | content: str,
28 | pdf_font: PdfFont,
29 | reading_order_no: int,
30 | bounding_box: Rectangle,
31 | token_type: TokenType,
32 | ):
33 | self.page_number = int(page_number)
34 | self.id: str = tag_id
35 | self.content: str = content
36 | self.font: PdfFont = pdf_font
37 | self.reading_order_no: int = reading_order_no
38 | self.bounding_box: Rectangle = bounding_box
39 | self.token_type: TokenType = token_type
40 | self.pdf_token_context: PdfTokenContext = PdfTokenContext()
41 | self.prediction: int = 0
42 |
43 |   def same_line(self, token: "PdfToken") -> bool:
44 | if self.bounding_box.bottom < token.bounding_box.top:
45 | return False
46 |
47 | if token.bounding_box.bottom < self.bounding_box.top:
48 | return False
49 |
50 | return True
51 |
52 | @staticmethod
53 |   def from_poppler_etree(page_number: int, xml_tag: ElementBase, pdf_font: PdfFont) -> "PdfToken":
54 | if "id" in xml_tag.attrib:
55 | tag_id = xml_tag.attrib["id"]
56 | else:
57 | tag_id = "tag"
58 |
59 | content = "".join(xml_tag.itertext()).strip()
60 | reading_order_no = (
61 | int(xml_tag.attrib["reading_order_no"])
62 | if "reading_order_no" in xml_tag.attrib
63 | else -1
64 | )
65 | bounding_box = Rectangle.from_poppler_tag_etree(xml_tag)
66 | token_type = TokenType.TEXT
67 |
68 | return PdfToken(
69 | page_number, tag_id, content, pdf_font, reading_order_no, bounding_box, token_type
70 | )
71 |
72 |   def get_label_intersection_percentage(self, label: Label) -> float:
73 | label_bounding_box = Rectangle.from_width_height(
74 | left=label.left, top=label.top, width=label.width, height=label.height
75 | )
76 |
77 | return self.bounding_box.get_intersection_percentage(label_bounding_box)
78 |
79 |   def get_same_line_tokens(self, page_tokens: list["PdfToken"]) -> list["PdfToken"]:
80 | top, height = self.bounding_box.top, self.bounding_box.height
81 |
82 | same_line_tokens = [
83 | each_token
84 | for each_token in page_tokens
85 | if top <= each_token.bounding_box.top < (top + height)
86 | or top < each_token.bounding_box.bottom <= (top + height)
87 | ]
88 |
89 | return same_line_tokens
90 |
91 |   def get_context(self, page_tokens: list["PdfToken"]) -> None:
92 | left, right = self.bounding_box.left, self.bounding_box.right
93 |
94 | self.pdf_token_context.left_of_tokens_on_the_left = left
95 |
96 | same_line_tokens = self.get_same_line_tokens(page_tokens)
97 |
98 | on_the_left = [
99 | each_token
100 | for each_token in same_line_tokens
101 | if each_token.bounding_box.right < right
102 | ]
103 | on_the_right = [
104 | each_token
105 | for each_token in same_line_tokens
106 | if left < each_token.bounding_box.left
107 | ]
108 |
109 | if on_the_left:
110 | self.pdf_token_context.right_of_token_on_the_left = max([
111 | x.bounding_box.right for x in on_the_left
112 | ])
113 | self.pdf_token_context.left_of_token_on_the_left = min([
114 | x.bounding_box.left for x in on_the_left
115 | ])
116 |
117 | if on_the_right:
118 | self.pdf_token_context.left_of_token_on_the_right = min([
119 | x.bounding_box.left for x in on_the_right
120 | ])
121 | self.pdf_token_context.right_of_token_on_the_right = max([
122 | x.bounding_box.right for x in on_the_right
123 | ])
124 |
--------------------------------------------------------------------------------
/eschergraph/tools/fuzzy_matcher.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from concurrent.futures import as_completed
4 | from concurrent.futures import ThreadPoolExecutor
5 |
6 | from fuzzywuzzy import fuzz
7 |
8 |
9 | class FuzzyMatcher:
10 | """Matching node names based on Levenshtein distance."""
11 |
12 | @staticmethod
13 | def get_match_sets(names: list[str]) -> list[set[str]]:
14 | """Get the sets of matches for the provided names.
15 |
16 | Args:
17 | names (list[str]): The list of names.
18 |
19 | Returns:
20 |       A list of sets, where each set contains names of similar nodes.
21 |
22 | """
23 | matches: dict[str, list[str]] = FuzzyMatcher._match_nodes(names)
24 | return FuzzyMatcher._match_sets(matches)
25 |
26 | @staticmethod
27 | def _match_nodes(node_names: list[str]) -> dict[str, list[str]]:
28 | """Matches nodes in a graph based on similarity to provided node names.
29 |
30 | :param graph: The graph containing nodes to be matched.
31 | :param node_names: A list of node names to be matched against the graph.
32 | :return: A dictionary where keys are node names and values are lists of matching nodes.
33 | """
34 | result: dict[str, list[str]] = dict()
35 |
36 | with ThreadPoolExecutor(max_workers=10) as executor:
37 | futures = [
38 | executor.submit(FuzzyMatcher._find_matches, name, node_names)
39 | for name in node_names
40 | ]
41 | for future in as_completed(futures):
42 | name, match_nodes = future.result()
43 | if match_nodes:
44 | result[name] = match_nodes
45 | return result
46 |
47 | @staticmethod
48 | def _is_similar(name1: str, name2: str) -> bool:
49 | """Checks if two node names are sufficiently similar using fuzzy matching.
50 |
51 | Args:
52 | name1 (str): The first name.
53 | name2 (str): The second name.
54 |
55 | Returns:
56 | True if sufficiently similar, False otherwise.
57 | """
58 | return bool(fuzz.token_set_ratio(name1, name2) >= 95)
59 |
60 | @staticmethod
61 | def _find_matches(query: str, names: list[str]) -> tuple[str, list[str]]:
62 | """Finds matches for a given query string within a list of names.
63 |
64 | Args:
65 | query (str): The query string to find matches for.
66 | names (list[str]): A list of node names to match against.
67 |
68 | Returns:
69 | A tuple where the first element is the query string
70 | and the second is a list of matching node names.
71 | """
72 | matches = []
73 | for name in names:
74 | if FuzzyMatcher._is_similar(query, name) and query != name:
75 | matches.append(name)
76 | return query, matches
77 |
78 | @staticmethod
79 | def _match_sets(matches: dict[str, list[str]]) -> list[set[str]]:
80 | """Group similar nodes into sets based on matching provided.
81 |
82 | Args:
83 | matches (dict[str, list[str]]): A dictionary that contains the list of matches
84 | under each node name.
85 |
86 | Returns:
87 | A list of sets, where each set contains names of similar nodes.
88 | """
89 | nodes_visited: set[str] = set()
90 | merged: list[set[str]] = []
91 |
92 | for key in matches.keys():
93 | if key in nodes_visited:
94 | continue
95 | cluster = FuzzyMatcher._vertical_matching(
96 | nodes_visited=nodes_visited,
97 | cluster={key},
98 | matches={k: set(v) for k, v in matches.items()},
99 | current=key,
100 | )
101 | merged.append(cluster)
102 |
103 | return merged
104 |
105 | @staticmethod
106 | def _vertical_matching(
107 | nodes_visited: set[str],
108 | cluster: set[str],
109 | matches: dict[str, set[str]],
110 | current: str,
111 | ) -> set[str]:
112 | """Recursively matches nodes to form clusters of similar nodes.
113 |
114 | Args:
115 | nodes_visited (set[str]): Set with visited node names.
116 | cluster (set[str]): Set with all (recursively) matched nodes.
117 |       matches (dict[str, set[str]]): All fuzzy matches for each node.
118 | current (str): Name of the current node.
119 |
120 | Returns:
121 | The cluster of similar nodes as a set.
122 | """
123 | nodes_visited.add(current)
124 |
125 | for match in matches[current]:
126 | if match not in nodes_visited:
127 | cluster.add(match)
128 | cluster = FuzzyMatcher._vertical_matching(
129 | nodes_visited, cluster, matches, match
130 | )
131 | return cluster
132 |
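A minimal usage sketch for `FuzzyMatcher` (illustrative names; note that `_match_nodes` only keeps names with at least one match, so unmatched names do not come back as singleton sets):

```python
from eschergraph.tools.fuzzy_matcher import FuzzyMatcher

# Variants of the same entity name, plus one unrelated name.
names = ["OpenAI", "openai", "OpenAI Inc.", "Microsoft"]

# Names with a token_set_ratio of at least 95 are grouped transitively.
clusters = FuzzyMatcher.get_match_sets(names)
# e.g. [{"OpenAI", "openai", "OpenAI Inc."}] ("Microsoft" matches nothing)
```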
--------------------------------------------------------------------------------
/docs/docs/getting_started.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 1
3 | ---
4 |
5 | # Getting started
6 |
7 | Let's learn how to build and RAG-search with **EscherGraph** in under 5 minutes.
8 |
9 | ## Installing
10 | Install the package in your Python environment with the following command.
11 |
12 | ```bash
13 | pip install eschergraph
14 | ```
15 |
16 | To build and search with EscherGraph, you need an LLM, an embedding model, and a reranker. We recommend using OpenAI's GPT-4o and text-embedding-3-large models, and the jina-reranker-v2-base-multilingual from Jina AI. These are also the defaults.
17 | In the upcoming examples, we will assume that these defaults are used.
18 |
19 | If you still need a Jina AI API key, you can get one with 1 million free tokens, without registration, [here](https://jina.ai/).
20 |
21 | ## Credentials
22 | The API keys needed to connect with external APIs can be supplied to the graph in two ways:
23 | 1. via environment variables;
24 | 2. optional keyword arguments when instantiating the graph.
25 |
26 | Below we will consider both ways in which a graph instance can be created. Note that it is also possible to supply the required credentials using a combination of these methods, as long as each key is supplied at least once (see the sketch below).
27 |
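For instance, a minimal sketch of the combined approach, with the OpenAI key loaded from the environment and the Jina key passed as a keyword argument (both methods are shown individually below):

```python
from dotenv import load_dotenv
from eschergraph import Graph

load_dotenv()  # loads OPENAI_API_KEY from the .env file

# The Jina key is supplied directly as a keyword argument instead.
graph = Graph(name="pink_graph", jina_api_key="...")
```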
28 | ## Initialize graph
29 | ### 1. Environment variables
30 | First, put your Jina AI and OpenAI API keys in a `.env` file.
31 |
32 | ```bash
33 | # .env file
34 | OPENAI_API_KEY=...
35 | JINA_API_KEY=...
36 | ```
37 |
38 | Then, when instantiating a graph, make sure the environment variables are loaded.
39 | For example, you can use the `load_dotenv` function from the library `python-dotenv` to load them
40 | from a `.env` file.
41 | ```python
42 | from dotenv import load_dotenv
43 | from eschergraph import Graph
44 |
45 | load_dotenv()
46 |
47 | graph = Graph(name="pink_graph")
48 | ```
49 |
50 | ### 2. Keyword arguments
51 | ```python
52 | from eschergraph import Graph
53 |
54 | graph = Graph(
55 | name="pink_graph",
56 | openai_api_key="...",
57 | jina_api_key="..."
58 | )
59 | ```
60 |
61 | Currently, the supported models are GPT-4o and GPT-4o-mini. We recommend always using GPT-4o for graph building, since GPT-4o-mini introduces too much noise when building a graph. However, GPT-4o-mini is perfectly fine for playing around and testing. To initialize a graph with GPT-4o-mini, do the following.
62 |
63 | ```python
64 | from eschergraph import Graph
65 | from eschergraph.agents import OpenAIProvider
66 | from eschergraph.agents import OpenAIModel
67 |
68 | graph = Graph(
69 | name="pink_graph",
70 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI)
71 | )
72 | ```
73 |
74 | Now that we have a graph instance, you will see that all basic operations are straightforward.
75 |
76 | ## Build graph
77 | ```python
78 | my_file1 = "test_files/Attention Is All You Need.pdf"
79 |
80 | graph.build(files=my_file1)
81 |
82 | # Adding more files to the graph is possible by simply building again:
83 | my_file2 = "test_files/test_file2.txt"
84 | my_file3 = "test_files/test_file3.pdf"
85 |
86 | graph.build(files=[my_file2, my_file3])
87 | ```
88 | Build adds documents to the graph: simply specify the filepaths of the files you want to add, either as a single string or as a list of filepaths.
89 |
90 | ## Search
91 | ### Local RAG search
92 | A local RAG search uses the information stored in the graph to generate an answer using the most relevant information as extracted from the source.
93 | ```python
94 | question = "On which hardware chips were the initial models trained?"
95 |
96 | answer = graph.search(question)
97 | print(answer)
98 | ```
99 | Local search considers all nodes, edges, and properties to select the most relevant context using embedding similarity and reranking, as sketched below.
100 |
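Under the hood, local search follows a retrieve-then-rerank pattern. A simplified sketch of that flow, using the `vector_db` and `reranker` components the graph exposes (illustrative, not the exact internal API):

```python
from eschergraph.config import MAIN_COLLECTION

# 1. Embedding similarity: fetch candidate chunks from the vector database.
results = graph.vector_db.search(
    query=question, top_n=40, metadata={"level": 0}, collection_name=MAIN_COLLECTION
)

# 2. Reranking: reorder the candidates by relevance to the question.
ranked = graph.reranker.rerank(question, [r.chunk for r in results], top_n=len(results))

# 3. Keep only sufficiently relevant chunks as context for the LLM.
context = [r.text for r in ranked if r.relevance_score >= 0.2]
```

The underlying `quick_search` function also accepts a `doc_filter` list of document UUIDs to restrict the search to specific documents.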
101 | ### Global RAG search
102 | ```python
103 | global_question = "What are the conclusions from the paper?"
104 |
105 | answer = graph.global_search(global_question)
106 | print(answer)
107 | ```
108 | A global search considers the higher levels of the graph and is great for answering general, topic-level questions about the files in the graph.
109 | For example, it can be used to draw conclusions and interpret sentiment in a text.
110 |
111 | ## Visualize
112 | ### Dashboard
113 | ```python
114 | graph.dashboard()
115 | ```
116 | Print general info and statistics about the graph using the dashboard.
117 |
118 | ### Interactive plot
119 | An interactive plot of the graph's lowest level and its community level can be generated easily as well.
120 | ```python
121 | graph.visualize()
122 | ```
123 |
124 | 
125 |
126 |
127 |
--------------------------------------------------------------------------------