62 |
63 | );
64 | }
65 |
--------------------------------------------------------------------------------
/tests/tools/test_fuzzy_matcher.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import MagicMock
4 |
5 | from eschergraph.tools.fuzzy_matcher import FuzzyMatcher
6 |
7 |
8 | def test_find_matches_with_mock() -> None:
9 | """Test the _find_matches method of the FuzzyMatcher class using a mock."""
10 | mock_fuzzy_matcher = MagicMock(spec=FuzzyMatcher)
11 |
12 | # Define the mock return value for the _find_matches method
13 | mock_find_matches_return = ("apple", ["Apple", "apple inc."])
14 | mock_fuzzy_matcher._find_matches.return_value = mock_find_matches_return
15 |
16 | result = mock_fuzzy_matcher._find_matches(
17 | "apple", ["apple", "Apple", "apple inc.", "banana"]
18 | )
19 |
20 | assert (
21 | result == mock_find_matches_return
22 | ), f"Expected {mock_find_matches_return}, got {result}"
23 |
24 |
25 | def test_match_nodes_with_mock() -> None:
26 | """Test the _match_nodes method of the FuzzyMatcher class using a mock."""
27 | mock_fuzzy_matcher = MagicMock(spec=FuzzyMatcher)
28 |
29 | mock_match_nodes_return = {
30 | "apple": ["Apple", "apple inc."],
31 | "Apple": ["apple", "apple inc."],
32 | "apple inc.": ["apple", "Apple"],
33 | "banana": [],
34 | }
35 | mock_fuzzy_matcher._match_nodes.return_value = mock_match_nodes_return
36 |
37 | result = mock_fuzzy_matcher._match_nodes(["apple", "Apple", "apple inc.", "banana"])
38 |
39 | assert (
40 | result == mock_match_nodes_return
41 | ), f"Expected {mock_match_nodes_return}, got {result}"
42 |
43 |
44 | def test_get_match_sets_with_mock() -> None:
45 | """Test the get_match_sets method of the FuzzyMatcher class using a mock."""
46 | mock_fuzzy_matcher = MagicMock(spec=FuzzyMatcher)
47 |
48 | mock_get_match_sets_return = [{"apple", "Apple", "apple inc."}, {"banana"}]
49 | mock_fuzzy_matcher.get_match_sets.return_value = mock_get_match_sets_return
50 |
51 | result = mock_fuzzy_matcher.get_match_sets(["apple", "Apple", "apple inc.", "banana"])
52 |
53 | assert (
54 | result == mock_get_match_sets_return
55 | ), f"Expected {mock_get_match_sets_return}, got {result}"
56 |
--------------------------------------------------------------------------------
/tests/persistence/adapters/simple_repository/help.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import cast
4 |
5 | from attrs import asdict
6 |
7 | from eschergraph.graph import Edge
8 | from eschergraph.graph import Node
9 | from eschergraph.graph import Property
10 | from eschergraph.persistence.adapters.simple_repository.models import EdgeModel
11 | from eschergraph.persistence.adapters.simple_repository.models import (
12 | MetadataModel,
13 | )
14 | from eschergraph.persistence.adapters.simple_repository.models import NodeModel
15 | from eschergraph.persistence.adapters.simple_repository.models import (
16 | PropertyModel,
17 | )
18 |
19 |
20 | def compare_node_to_node_model(node: Node, node_model: NodeModel) -> bool:
21 | # Check equality for a node being in a community
22 | if node.community.node and not node_model["community"]:
23 | return False
24 | elif not node.community.node and node_model["community"]:
25 | return False
26 | elif node.community.node and node_model["community"]:
27 | if node.community.node.id != node_model["community"]:
28 | return False
29 |
30 | return (
31 | node.name == node_model["name"]
32 | and node.description == node_model["description"]
33 | and node.level == node_model["level"]
34 | and {edge.id for edge in node.edges} == node_model["edges"]
35 | and [property.id for property in node.properties] == node_model["properties"]
36 | and [cast(MetadataModel, asdict(md)) for md in node.metadata]
37 | == node_model["metadata"]
38 | )
39 |
40 |
41 | def compare_edge_to_edge_model(edge: Edge, edge_model: EdgeModel) -> bool:
42 | return (
43 | edge.frm.id == edge_model["frm"]
44 | and edge.to.id == edge_model["to"]
45 | and edge.description == edge_model["description"]
46 | and [cast(MetadataModel, asdict(md)) for md in edge.metadata]
47 | == edge_model["metadata"]
48 | )
49 |
50 |
51 | def compare_property_to_property_model(
52 | property: Property, property_model: PropertyModel
53 | ) -> bool:
54 | return (
55 | property.node.id == property_model["node"]
56 | and property.description == property_model["description"]
57 | )
58 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_build.jinja:
--------------------------------------------------------------------------------
1 | -Goal-
2 | Extract all relevant information from the provided text into a graph representation containing entities and relations.
3 | The most important part is that you try to represent all the information in the provided text in a structured format!
4 |
5 | -Steps-
6 | 1. Identify all named entities in singular form. For people please include the entire name. Entities can also be technologies.
7 | For each identified entity, extract the following information:
8 | - entity_name: Name of the entity
9 | - entity_description: Comprehensive description of the entity's attributes and activities
10 |
11 | Format each entity output as a JSON entry with the following format:
12 |
13 | {"name": , "description": }
14 |
15 | 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
16 | For each pair of related entities, extract the following information:
17 | - source_entity: name of the source entity, as identified in step 1
18 | - target_entity: name of the target entity, as identified in step 1
19 | - relationship: explanation as to why you think the source entity and the target entity are related to each other
20 |
21 | Format each relationship as a JSON entry with the following format:
22 |
23 | {"source": , "target": , "relationship": }
24 |
25 | 3. Return output in English as a single list of all JSON entities and relationships identified in steps 1 and 2.
26 | Return the JSON like this:
27 |
28 | {
29 | 'entities': [{"name": , "description": }, {"name": , "description": }],
30 | 'relationships':[{"source": , "target": , "relationship": }, and more]
31 | }
32 |
33 | However, only extract entities that are specific, so avoid extracting generic entities like CEO or employee;
34 | instead, extract only named entities and technologies.
35 |
36 | -Real Data-
37 | ######################
38 | text: {{ input_text }}
39 | ######################
40 | output:
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_token_type_labels/label.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from lxml.etree import ElementBase
4 | from pydantic import BaseModel
5 |
6 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.rectangle import (
7 | Rectangle,
8 | )
9 |
10 |
11 | class Label(BaseModel):
12 | top: int
13 | left: int
14 | width: int
15 | height: int
16 | label_type: int
17 | metadata: str = ""
18 |
19 | def intersection_percentage(self, token_bounding_box: Rectangle):
20 | label_bounding_box = Rectangle(
21 | left=self.left,
22 | top=self.top,
23 | right=self.left + self.width,
24 | bottom=self.top + self.height,
25 | )
26 | return label_bounding_box.get_intersection_percentage(token_bounding_box)
27 |
28 | def get_location_discrepancy(self, token_bounding_box: Rectangle):
29 | coordinates_discrepancy: int = abs(self.left - token_bounding_box.left) + abs(
30 | self.top - token_bounding_box.top
31 | )
32 | size_discrepancy: int = abs(self.height - token_bounding_box.height) + abs(
33 | self.width - token_bounding_box.width
34 | )
35 | return coordinates_discrepancy + size_discrepancy
36 |
37 | def area(self):
38 | return self.width * self.height
39 |
40 | @staticmethod
41 | def from_rectangle(rectangle: Rectangle, token_type: int):
42 | return Label(
43 | top=rectangle.top,
44 | left=rectangle.left,
45 | width=rectangle.width,
46 | height=rectangle.height,
47 | label_type=token_type,
48 | )
49 |
50 | @staticmethod
51 | def from_text_elements(text_elements: list[ElementBase]):
52 | top = min([int(x.attrib["top"]) for x in text_elements])
53 | left = min([int(x.attrib["left"]) for x in text_elements])
54 | bottom = max([
55 | int(x.attrib["top"]) + int(x.attrib["height"]) for x in text_elements
56 | ])
57 | right = max([int(x.attrib["left"]) + int(x.attrib["width"]) for x in text_elements])
58 |
59 | return Label(
60 | top=top,
61 | left=left,
62 | width=int(right - left),
63 | height=int(bottom - top),
64 | label_type=0,
65 | )
66 |
--------------------------------------------------------------------------------
/.github/workflows/docs-release.yml:
--------------------------------------------------------------------------------
1 | name: Docs Release
2 | on:
3 | push:
4 | branches: [main]
5 |
6 | concurrency:
7 | group: docs
8 | cancel-in-progress: true
9 |
10 | permissions:
11 | id-token: write
12 | contents: read
13 |
14 | env:
15 | aws_region: "us-east-1"
16 | s3_bucket: "eschergraph.docs.pinkdot.ai"
17 |
18 | jobs:
19 | build:
20 | environment: docs
21 | runs-on: ubuntu-latest
22 | defaults:
23 | run:
24 | working-directory: ./docs
25 | steps:
26 | - name: Checkout Code
27 | uses: actions/checkout@v4
28 |
29 | - name: Use Node.js
30 | uses: actions/setup-node@v3
31 | with:
32 | node-version: '18'
33 | cache: 'npm'
34 | cache-dependency-path: './docs/package-lock.json'
35 |
36 | - name: Install dependencies
37 | run: npm ci
38 |
39 | - name: Build
40 | run: npm run build --if-present
41 |
42 | - name: Upload build files as artifact
43 | uses: actions/upload-artifact@v4
44 | with:
45 | name: vite-build-docs
46 | path: ./docs/build # Still to check
47 |
48 | deploy:
49 | environment: docs
50 | needs: build
51 | runs-on: ubuntu-latest
52 | steps:
53 | - name: Configure AWS Credentials
54 | uses: aws-actions/configure-aws-credentials@v4
55 | with:
56 | role-to-assume: ${{ secrets.AWS_CD_ROLE }}
57 | aws-region: ${{ env.aws_region }}
58 |
59 | - name: Download Artifacts
60 | uses: actions/download-artifact@v4
61 | with:
62 | name: vite-build-docs
63 | path: build
64 |
65 | - name: Upload the files to S3
66 | run: |
67 | aws s3 sync build s3://${{ env.s3_bucket }} --delete
68 |
69 | - name: Invalidate the CloudFront distribution
70 | env:
71 | ROOT_DISTRIBUTION_ID: ${{ secrets.CLOUDFRONT_ROOT_DISTRIBUTION_ID }}
72 | WWW_DISTRIBUTION_ID: ${{ secrets.CLOUDFRONT_WWW_DISTRIBUTION_ID }}
73 | run: |
74 | aws cloudfront create-invalidation --distribution-id $ROOT_DISTRIBUTION_ID --paths "/*"
75 | aws cloudfront create-invalidation --distribution-id $WWW_DISTRIBUTION_ID --paths "/*"
76 |
--------------------------------------------------------------------------------
/tests/persistence/adapters/simple_repository/test_load.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from uuid import UUID
5 |
6 | from eschergraph.graph import Edge
7 | from eschergraph.graph import Node
8 | from eschergraph.graph.loading import LoadState
9 | from eschergraph.persistence.adapters.simple_repository import SimpleRepository
10 | from tests.graph.help import create_simple_extracted_graph
11 | from tests.persistence.adapters.simple_repository.help import (
12 | compare_edge_to_edge_model,
13 | )
14 | from tests.persistence.adapters.simple_repository.help import (
15 | compare_node_to_node_model,
16 | )
17 |
18 |
19 | def test_full_graph_loading(saved_graph_dir: Path) -> None:
20 | repository: SimpleRepository = SimpleRepository(
21 | save_location=saved_graph_dir.as_posix()
22 | )
23 | _, nodes, edges = create_simple_extracted_graph(repository=repository)
24 | node_ids_repository: set[UUID] = set(repository.nodes.keys())
25 | document_id: UUID = next(iter(nodes[0].metadata)).document_id
26 |
27 | assert node_ids_repository == {node.id for node in nodes}
28 | assert set(repository.edges.keys()) == {edge.id for edge in edges}
29 | assert set(repository.doc_node_name_index.keys()) == {document_id}
30 | assert node_ids_repository == set(
31 | repository.doc_node_name_index[document_id].values()
32 | )
33 |
34 | repository.save()
35 | del repository
36 |
37 | new_repository: SimpleRepository = SimpleRepository(
38 | save_location=saved_graph_dir.as_posix()
39 | )
40 |
41 | for node in nodes:
42 | new_node: Node = Node(id=node.id, repository=new_repository)
43 | assert new_node.loadstate == LoadState.REFERENCE
44 | assert compare_node_to_node_model(
45 | node=new_node,
46 | node_model=new_repository.nodes[node.id],
47 | )
48 | assert new_node.loadstate == LoadState.FULL # type: ignore
49 |
50 | for edge in edges:
51 | new_edge: Edge = Edge(
52 | id=edge.id,
53 | frm=Node(edge.frm.id, repository=new_repository),
54 | to=Node(edge.to.id, repository=new_repository),
55 | repository=new_repository,
56 | )
57 | assert new_edge.loadstate == LoadState.REFERENCE
58 | assert compare_edge_to_edge_model(
59 | edge=new_edge, edge_model=new_repository.edges[edge.id]
60 | )
61 | assert new_edge.loadstate == LoadState.CORE # type: ignore
62 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_features/rectangle.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from lxml.etree import ElementBase
4 |
5 |
6 | class Rectangle:
7 | def __init__(self, left: int, top: int, right: int, bottom: int):
8 | self.left = left
9 | self.top = top
10 | self.right = right
11 | self.bottom = bottom
12 | self.fix_wrong_areas()
13 | self.width = self.right - self.left
14 | self.height = self.bottom - self.top
15 |
16 | @staticmethod
17 | def from_poppler_tag_etree(tag: ElementBase) -> "Rectangle":
18 | x_min = int(tag.attrib["left"])
19 | y_min = int(tag.attrib["top"])
20 | x_max = x_min + int(tag.attrib["width"])
21 | y_max = y_min + int(tag.attrib["height"])
22 |
23 | return Rectangle(x_min, y_min, x_max, y_max)
24 |
25 | def fix_wrong_areas(self):
26 | if self.right == self.left:
27 | self.left -= 1
28 | self.right += 1
29 |
30 | if self.top == self.bottom:
31 | self.top -= 1
32 | self.bottom += 1
33 |
34 | if self.right < self.left:
35 | self.right, self.left = self.left, self.right
36 |
37 | if self.bottom < self.top:
38 | self.top, self.bottom = self.bottom, self.top
39 |
40 | def get_intersection_percentage(self, rectangle: "Rectangle") -> float:
41 | x1 = max(self.left, rectangle.left)
42 | y1 = max(self.top, rectangle.top)
43 | x2 = min(self.right, rectangle.right)
44 | y2 = min(self.bottom, rectangle.bottom)
45 |
46 | if x2 <= x1 or y2 <= y1:
47 | return 0.0
48 |
49 | return 100 * (x2 - x1) * (y2 - y1) / self.area()
50 |
51 | def area(self):
52 | return self.width * self.height
53 |
54 | def to_dict(self):
55 | return {
56 | "top": self.top,
57 | "left": self.left,
58 | "right": self.right,
59 | "bottom": self.bottom,
60 | }
61 |
62 | @staticmethod
63 | def merge_rectangles(rectangles: list["Rectangle"]) -> "Rectangle":
64 | left = min([rectangle.left for rectangle in rectangles])
65 | top = min([rectangle.top for rectangle in rectangles])
66 | right = max([rectangle.right for rectangle in rectangles])
67 | bottom = max([rectangle.bottom for rectangle in rectangles])
68 |
69 | return Rectangle(left, top, right, bottom)
70 |
71 | @staticmethod
72 | def from_width_height(left: int, top: int, width: int, height: int):
73 | return Rectangle(left, top, left + width, top + height)
74 |
--------------------------------------------------------------------------------
/eschergraph/exceptions.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | class BaseEscherGraphException(Exception):
5 | """The base class for all EscherGraph exceptions."""
6 |
7 |
8 | class PromptFormattingException(BaseEscherGraphException):
9 | """When some jinja prompt variables have not been formatted.
10 |
11 | Used to check if the prompt has been sent to the LLM / agent as intended.
12 | """
13 |
14 |
15 | class IllogicalActionException(BaseEscherGraphException):
16 | """When something unlogical happens, like searching before building a graph."""
17 |
18 |
19 | class ExternalProviderException(BaseEscherGraphException):
20 | """When something unexpected occurs during an interation with an external service."""
21 |
22 |
23 | class DataLoadingException(BaseEscherGraphException):
24 | """Raised when some data on the EscherGraph objects has not been loaded as expected."""
25 |
26 |
27 | class NodeDoesNotExistException(BaseEscherGraphException):
28 | """Node has not been found."""
29 |
30 |
31 | class EdgeCreationException(BaseEscherGraphException):
32 | """Edge is created between a node and itself."""
33 |
34 |
35 | class NodeCreationException(BaseEscherGraphException):
36 | """Something went wrong creating a node."""
37 |
38 |
39 | class CredentialException(BaseEscherGraphException):
40 | """Missing credential for external provider."""
41 |
42 |
43 | class FileTypeNotProcessableException(BaseEscherGraphException):
44 | """File is not processable due to its type."""
45 |
46 |
47 | class EdgeDoesNotExistException(BaseEscherGraphException):
48 | """The specified edge could not be found."""
49 |
50 |
51 | class RepositoryException(BaseEscherGraphException):
52 | """Something unexpected happens with the repository."""
53 |
54 |
55 | class ExternalDependencyException(BaseEscherGraphException):
56 | """External dependency (outside of Python) is missing."""
57 |
58 |
59 | class DocumentDoesNotExistException(BaseEscherGraphException):
60 | """The specified document does not exist in the graph."""
61 |
62 |
63 | class DocumentAlreadyExistsException(BaseEscherGraphException):
64 | """The graph attempts to build for a document that already exists."""
65 |
66 |
67 | class FileException(BaseEscherGraphException):
68 | """Provided filepath is not a file or the file does not exist."""
69 |
70 |
71 | class ImageProcessingException(BaseEscherGraphException):
72 | """Exception that occurs when processing an image."""
73 |
--------------------------------------------------------------------------------
/eschergraph/persistence/vector_db/vector_db.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC
4 | from abc import abstractmethod
5 | from typing import Optional
6 | from uuid import UUID
7 |
8 | from eschergraph.persistence.vector_db.vector_search_result import VectorSearchResult
9 |
10 |
11 | class VectorDB(ABC):
12 | """This is the abstract base class for all vector DB implementations.
13 |
14 | It is important to note that an embedding model is included in the abstract vector database class.
15 | """
16 |
17 | required_credentials: list[str]
18 |
19 | @abstractmethod
20 | def connect(self) -> None:
21 | """Possible connection method."""
22 | raise NotImplementedError
23 |
24 | @abstractmethod
25 | def insert(
26 | self,
27 | documents: list[str],
28 | ids: list[UUID],
29 | metadata: list[dict[str, str | int]],
30 | collection_name: str,
31 | ) -> None:
32 | """Store documents with their embeddings, ids, and metadata.
33 |
34 | Args:
35 | embeddings (List[List[float]]): List of embeddings for the documents.
36 | documents (List[str]): List of document texts.
37 | ids (List[int]): List of document IDs.
38 | metadata (List[dict[str, Any]]): List of metadata dictionaries.
39 | collection_name (str): The name of the collection.
40 | """
41 | raise NotImplementedError
42 |
43 | @abstractmethod
44 | def search(
45 | self,
46 | query: str,
47 | top_n: int,
48 | collection_name: str,
49 | metadata: Optional[dict[str, str | int]] = None,
50 | ) -> list[VectorSearchResult]:
51 | """Search for the top_n documents that are most similar to the given query.
52 |
53 | Args:
54 | query (str): The query to search for.
55 | top_n (int): Number of top search results to retrieve.
56 | collection_name (str): The name of the collection.
57 | metadata (Optional[dict[str, str | int]]): Metadata to filter the search results.
58 |
59 | Returns:
60 | A list of vector search results.
61 | """
62 | raise NotImplementedError
63 |
64 | @abstractmethod
65 | def delete_by_ids(
66 | self,
67 | ids: list[UUID],
68 | collection_name: str,
69 | ) -> None:
70 | """Delete records from collection by their ids.
71 |
72 | Args:
73 | ids (list[UUID]): The list of ids that need to be removed.
74 | collection_name (str): The name of the collection.
75 | """
76 | raise NotImplementedError
77 |
--------------------------------------------------------------------------------
/eschergraph/agents/jinja_helper.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import re
4 | from pathlib import Path
5 | from typing import Any
6 |
7 | from jinja2 import BaseLoader
8 | from jinja2 import Environment
9 | from jinja2 import FileSystemLoader
10 | from jinja2 import select_autoescape
11 | from jinja2 import Template
12 |
13 | from eschergraph.exceptions import PromptFormattingException
14 |
15 |
16 | def process_template(template_file: str, data: dict[str, str]) -> str:
17 | """Process the jinja template into a string.
18 |
19 | Function has been inspired by: https://github.com/ArjanCodes/examples/blob/main/2024/tuesday_tips/jinja2/jinja_helper.py
20 |
21 | Args:
22 | template_file (str): The name of the jinja prompt template.
23 | data (dict): The parameters and their values to insert into the prompt.
24 |
25 | Returns:
26 | The formatted prompt as a string.
27 | """
28 | parent_path: str = Path(__file__).parent.absolute().as_posix()
29 | jinja_env: Environment = Environment(
30 | loader=FileSystemLoader(searchpath=parent_path + "/prompts"),
31 | autoescape=select_autoescape(),
32 | )
33 |
34 | template_variables: list[Any] = extract_variables(template_file, jinja_env)
35 |
36 | # Check if all variables in template have been provided as data
37 | if not set(template_variables) == set(data.keys()):
38 | raise PromptFormattingException(
39 | "Some variables in the prompt have not been formatted."
40 | )
41 |
42 | template: Template = jinja_env.get_template(template_file)
43 |
44 | return template.render(**data)
45 |
46 |
47 | def extract_variables(template_file: str, jinja_env: Environment) -> list[Any]:
48 | """Extract all variables in a Jinja template in string format.
49 |
50 | Args:
51 | template_file (str): the name of the jinja prompt template.
52 | jinja_env (Environment): the jinja Environment.
53 |
54 | Returns:
55 | A list of all the identified variables in the string template.
56 | """
57 | # Check if the loader is None
58 | if not jinja_env.loader:
59 | raise PromptFormattingException(
60 | "Something went wrong formatting the prompt template."
61 | )
62 | else:
63 | loader: BaseLoader = jinja_env.loader
64 |
65 | # Get the template as plain text
66 | plain_template: str = loader.get_source(jinja_env, template_file)[0]
67 |
68 | variable_pattern: str = r"\{\{ *([\w_]+) *\}\}"
69 | return re.findall(variable_pattern, plain_template)
70 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_tokens_type_trainer/pdf_trainer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import lightgbm as lgb
6 | import numpy as np
7 |
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_features import (
9 | PdfFeatures,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_font import (
12 | PdfFont,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token import (
15 | PdfToken,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.rectangle import (
18 | Rectangle,
19 | )
20 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
21 | TokenType,
22 | )
23 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.model_configuration import (
24 | ModelConfiguration,
25 | )
26 |
27 |
28 | class PdfTrainer:
29 | def __init__(
30 | self,
31 | pdfs_features: list[PdfFeatures],
32 | model_configuration: ModelConfiguration | None = None,
33 | ):
34 | self.pdfs_features = pdfs_features
35 | self.model_configuration = (
36 | model_configuration if model_configuration else ModelConfiguration()
37 | )
38 |
39 | def get_model_input(self) -> np.ndarray:
40 | pass  # implemented by the concrete trainer subclasses
41 |
42 | @staticmethod
43 | def features_rows_to_x(features_rows):
44 | if not features_rows:
45 | return np.zeros((0, 0))
46 |
47 | x = np.zeros(((len(features_rows)), len(features_rows[0])))
48 | for i, v in enumerate(features_rows):
49 | x[i] = v
50 | return x
51 |
52 | def loop_tokens(self):
53 | for pdf_features in self.pdfs_features:
54 | for page, token in pdf_features.loop_tokens():
55 | yield token
56 |
57 | @staticmethod
58 | def get_padding_token(segment_number: int, page_number: int):
59 | return PdfToken(
60 | page_number,
61 | "pad_token",
62 | "",
63 | PdfFont("pad_font_id", False, False, 0.0, "#000000"),
64 | segment_number,
65 | Rectangle(0, 0, 0, 0),
66 | TokenType.TEXT,
67 | )
68 |
69 | # Models are already downloaded
70 | def predict(self, model_path: str | Path | None = None):
71 | x = self.get_model_input()
72 |
73 | if not x.any():
74 | return self.pdfs_features
75 |
76 | lightgbm_model = lgb.Booster(model_file=model_path)
77 | return lightgbm_model.predict(x)
78 |
--------------------------------------------------------------------------------
/docs/docs/explained-eschergraph/Graph building.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 2
3 | ---
4 |
5 | # Graph Building Pipeline
6 | ### Eschergraph Pipeline
7 |
8 | Some aspects of this graph building pipeline have been inspired by [GraphRAG](https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/), a Microsoft Research project. Their great work and insights on communities in a graph have been especially inspiring.
9 |
10 | The steps for the EscherGraph building are (a usage sketch that runs the whole pipeline is included at the end of this page):
11 |
12 | 1) Parse document into chunks of about 500 tokens
13 | 2) Extract nodes & edges for each chunk using an LLM
14 | 3) Extract properties for each node using an LLM
15 | 4) Match similar nodes, and merge their edges and properties (more on this [below](#node-matcher))
16 | 5) Persist the graph to a database
17 | 6) Build communities with the [LeidenAlg](https://github.com/vtraag/leidenalg)
18 | 7) Sync all nodes, edges and properties to a vector database, by default [ChromaDB](https://www.trychroma.com/)
19 |
20 |
21 | 
22 |
23 | ### Node Matcher
24 | The node matcher is used to match and merge nodes that refer to the same entity.
25 | It involves two steps.
26 |
27 | 1. Identify potentially matching nodes using the Levenshtein distance; nodes with exactly the same name are matched straight away (a minimal sketch of this step is included at the end of this page).
28 | - For example:
29 | - matching 'p100' to 'p100 gpu'
30 | - matching 'Sam' to 'Sam Altman'
31 |
32 | 2. Decide on potential matches using LLM reasoning and contextual clues. This is done to make the right decision even for edge cases: different entities that are referenced by the same name across different chunks, for example, 'Sam', 'Sam Altman', and 'Sam Bankman-Fried'.
33 | The node matcher utilizes additional context to differentiate between them. This process involves:
34 | - LLM identifies node name ambiguities: given a list of potentially matching entity names, the LLM returns a list of edge cases.
35 |
36 | - Re-ranking potential matches: a re-ranker evaluates the similarity of the nodes based on context, metadata, or additional attributes to accurately determine which specific entity a node is referring to.
37 |
38 | - Contextual clues: the re-ranker leverages additional contextual information from the surrounding data or relationships to classify which node is the correct match. This might include looking at node connections, associated attributes, or other identifiers to make a more informed decision.
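39 | 
40 | As referenced above, here is a minimal usage sketch that runs the whole building pipeline, adapted from the package's integration test (`tests_integration/build_search.py`); the save locations are illustrative, and the defaults for any omitted parameters are assumptions.
41 | 
42 | ```python
43 | from dotenv import load_dotenv
44 | 
45 | from eschergraph import Graph
46 | from eschergraph.agents import OpenAIModel
47 | from eschergraph.agents import OpenAIProvider
48 | from eschergraph.persistence.adapters.simple_repository import SimpleRepository
49 | from eschergraph.persistence.vector_db.adapters.chromadb import ChromaDB
50 | 
51 | load_dotenv()  # Load the OpenAI credentials
52 | 
53 | graph_name = "my_graph"
54 | graph = Graph(
55 |   name=graph_name,
56 |   repository=SimpleRepository(name=graph_name, save_location="./graphs"),
57 |   vector_db=ChromaDB(
58 |     save_name=graph_name,
59 |     embedding_model=OpenAIProvider(model=OpenAIModel.TEXT_EMBEDDING_LARGE),
60 |   ),
61 |   model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI),
62 | )
63 | 
64 | # Runs the steps above: parsing, extraction, matching, communities, syncing
65 | graph.build("./test_files/test_file.pdf")
66 | print(graph.search("Who are the architects?"))
67 | ```
68 | 
69 | And below is a minimal sketch of the fuzzy-matching step of the node matcher, assuming the rapidfuzz library; the function name and the threshold are illustrative and not the package's actual FuzzyMatcher implementation.
70 | 
71 | ```python
72 | from rapidfuzz import fuzz
73 | 
74 | 
75 | def potential_matches(
76 |   name: str, candidates: list[str], threshold: float = 90.0
77 | ) -> list[str]:
78 |   """Return the candidate node names that potentially match the given name."""
79 |   matches: list[str] = []
80 |   for candidate in candidates:
81 |     if candidate.lower() == name.lower():
82 |       # Nodes with exactly the same name are matched straight away
83 |       matches.append(candidate)
84 |     elif fuzz.partial_ratio(name.lower(), candidate.lower()) >= threshold:
85 |       # A Levenshtein-based similarity score decides the remaining candidates
86 |       matches.append(candidate)
87 |   return matches
88 | 
89 | 
90 | print(potential_matches("p100", ["p100 gpu", "banana"]))  # ['p100 gpu']
91 | print(potential_matches("Sam", ["Sam Altman", "Sam Bankman-Fried"]))  # both match
92 | ```
93 | 
94 | Note that 'Sam' matches both 'Sam Altman' and 'Sam Bankman-Fried' in this sketch: exactly the kind of ambiguity that the second, LLM-based step is there to resolve.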
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_property.jinja:
--------------------------------------------------------------------------------
1 | The goal is to add descriptive properties to all entities that occur in the provided text.
2 | It is essential that all information from the provided piece of text is extracted and assigned as a property to an entity.
3 | We need to make sure that the resulting graph contains exactly the same information as the source text; it needs to contain all the information.
4 |
5 | ===============================================================================
6 | EXAMPLE SECTION: The following section includes example output. These examples must be
7 | **excluded from your answer**.
8 |
9 | EXAMPLE 1
10 | Task: Extract all the properties for each entity from the source text.
11 | Text: Nicklas Bendtner is a Danish football legend that played for Arsenal. He was especially
12 | feared for his ability to score a goal at any moment.
13 | JSON RESPONSE:
14 | {"entities": [{"Nicklas Bendtner": ["A Danish football legend", "Played for Arsenal", "Especially feared for his ability to score a goal at any moment"]}]}
15 | END OF EXAMPLE 1
16 |
17 | EXAMPLE 2
18 | Task: Extract all the properties for each entity from the source text.
19 | Text: EscherGraph is a state-of-the-art open-source package that provides a framework for building
20 | a structured graph from any unstructured data source. Although, an entirely novel approach, it can
21 | be seen as the next evolution of knowledge graphs. It was developed at PinkDot AI, a Delft based
22 | AI startup.
23 | JSON RESPONSE:
24 | {"entities": [
25 | {"EscherGraph": ["A state-of-the-art open source package that provides a framework for building a structured graph from any unstructured data source", "an entirely novel approach that can be seen as the next evolution of knowledge graphs"]},
26 | {"PinkDot AI": ["A Delft based AI startup"]}
27 | ]}
28 | END OF EXAMPLE 2
29 | ===============================================================================
30 |
31 | ===============================================================================
32 | REAL DATA: The following section is the real data. You should use only this real data to prepare your answer. Generate entities with properties only.
33 | Task: Extract all the properties for each entity from the source text.
34 |
35 | Make sure all information from the provided piece of text is extracted and assigned as a property to an entity.
36 | Text: {{ input_text }}
37 | JSON response:
38 | {"entities": [{"": ["", "", ""]}]}
39 | Match the properties to existing nodes only!! This is absolutely crucial!!!
40 | Existing nodes: {{ current_nodes }}
41 |
--------------------------------------------------------------------------------
/tests_integration/build_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import time
4 | from pathlib import Path
5 | from tempfile import TemporaryDirectory
6 |
7 | from dotenv import load_dotenv
8 |
9 | from eschergraph import Graph
10 | from eschergraph.agents import OpenAIModel
11 | from eschergraph.agents import OpenAIProvider
12 | from eschergraph.persistence import Repository
13 | from eschergraph.persistence.adapters.simple_repository import SimpleRepository
14 | from eschergraph.persistence.document import Document
15 | from eschergraph.persistence.vector_db import VectorDB
16 | from eschergraph.persistence.vector_db.adapters.chromadb import ChromaDB
17 | from eschergraph.visualization import Visualizer
18 |
19 | TEST_FILE: str = "./test_files/test_file.pdf"
20 |
21 | # Load all the credentials
22 | load_dotenv()
23 |
24 |
25 | def build_graph() -> None:
26 | # The temporary directory (clean run for each test)
27 | temp_dir: TemporaryDirectory = TemporaryDirectory()
28 | temp_path: Path = Path(temp_dir.name)
29 |
30 | # Set up all the graph dependencies
31 | graph_name: str = "test_graph"
32 | repository: Repository = SimpleRepository(
33 | name=graph_name, save_location=temp_path.as_posix()
34 | )
35 | chroma: VectorDB = ChromaDB(
36 | save_name=graph_name,
37 | persistent=False,
38 | embedding_model=OpenAIProvider(model=OpenAIModel.TEXT_EMBEDDING_LARGE),
39 | )
40 | graph: Graph = Graph(
41 | name=graph_name,
42 | repository=repository,
43 | vector_db=chroma,
44 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI),
45 | )
46 |
47 | # Build the graph
48 | graph.build(TEST_FILE)
49 |
50 | Visualizer.visualize_graph(
51 | graph, level=0, save_location=temp_path.as_posix() + "/level_0.html"
52 | )
53 | Visualizer.visualize_graph(
54 | graph, level=1, save_location=temp_path.as_posix() + "/level_1.html"
55 | )
56 | Visualizer.visualize_graph(
57 | graph, level=2, save_location=temp_path.as_posix() + "/level_2.html"
58 | )
59 |
60 | documents: list[Document] = graph.get_all_documents()
61 |
62 | print("The document currently in the graph: ")
63 | print(documents)
64 | doc_name: str = documents[0].name
65 |
66 | answer = graph.search("Who are the architects?", filter_filenames=[doc_name])
67 | print(answer)
68 |
69 | global_answer = graph.global_search(
70 | "What are the key points?", filter_filenames=[doc_name]
71 | )
72 | print(global_answer)
73 |
74 | graph.dashboard()
75 |
76 | # Wait a few seconds before cleaning up to open the visuals
77 | time.sleep(10)
78 |
79 | # Clean up all the persistent data
80 | temp_dir.cleanup()
81 |
82 |
83 | if __name__ == "__main__":
84 | build_graph()
85 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_figure.jinja:
--------------------------------------------------------------------------------
1 | You are given a figure; analyse it well for information about the various entities and their relationships.
2 | The goal is to map all the relations between the figure and the relevant entities, and to extract all relevant information in the figure.
3 | The figure will be regarded as if it is an entity.
4 | Extract the entities and describe the relationships between the figure and these entities in a structured format.
5 |
6 | There might be possible relations between entities mentioned in the figure. Also extract these relations.
7 | Analyse the figure and the caption and give a rich description of the figure.
8 | Give the figure a name based on the caption; make sure to include that it is a figure in the name.
9 |
10 | -Steps-
11 | 1. Identify all named entities in singular form. For people please include the entire name. Entities can also be technologies.
12 | For each identified entity, extract the following information:
13 | - entity_name: Name of the entity
14 | - entity_description: Comprehensive description of the entity's attributes and activities
15 | - main_node: specify whether this entity is the main node for the given figure. There can only be one node with the value True for the figure
16 | Format each entity output as a JSON entry with the following format:
17 |
18 | {"name": , "description": , 'main_node': bool}
19 |
20 | 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
21 | For each pair of related entities, extract the following information:
22 | - source_entity: name of the source entity, as identified in step 1
23 | - target_entity: name of the target entity, as identified in step 1
24 | - relationship: explanation as to why you think the source entity and the target entity are related to each other
25 |
26 | -- Complete JSON Output format:--
27 | {
28 | "entities": [
29 | {"name": "", "description": "", 'main_node': bool},
30 | {"name": "", "description": "", 'main_node': bool}
31 | ],
32 | "relationships": [
33 | {"source": "", "target": "", "relationship": ""},
34 | {"source": "", "target": "", "relationship": ""},
35 | {"source": "", "target": "", "relationship": ""}
36 | ]
37 | }
38 |
39 | -Real Data-
40 | ###############
41 | Figure caption: {{ figure_caption }}
42 | These are the keywords of the document in which the figure appears {{ keywords }}
43 | ###############
44 |
--------------------------------------------------------------------------------
/eschergraph/graph/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from uuid import UUID
5 |
6 | from eschergraph.exceptions import DocumentAlreadyExistsException
7 | from eschergraph.exceptions import DocumentDoesNotExistException
8 | from eschergraph.exceptions import FileException
9 | from eschergraph.persistence import Repository
10 | from eschergraph.persistence.document import Document
11 |
12 |
13 | def duplicate_document_check(file_list: list[str], repository: Repository) -> None:
14 | """Check if the documents already exist in the graph.
15 |
16 | Also, it raises an exception if a provided filepath does not point to
17 | a file.
18 |
19 | Args:
20 | file_list (list[str]): A list of filepaths pointing to files.
21 | repository (Repository): The repository that stores the graph data.
22 |
23 | Raises:
24 | A DocumentAlreadyExistsException as soon as it discovers a document that already
25 | exists.
26 | FileException if one of the provided paths does not point to a file, or if the
27 | file does not exist.
28 | """
29 | for file in file_list:
30 | file_path: Path = Path(file)
31 |
32 | # Check if the filepath points to a file
33 | if not file_path.is_file():
34 | raise FileException(f"Make sure that this is a file that exists: {file_path}")
35 |
36 | filename: str = file_path.name
37 |
38 | if repository.get_document_by_name(filename):
39 | raise DocumentAlreadyExistsException(
40 | f"A file with this name already exists in the graph: {filename}"
41 | )
42 |
43 |
44 | def search_check(repository: Repository) -> bool:
45 | """Check if there are any elements at level 0 in the graph repository.
46 |
47 | Args:
48 | repository (Repository): The repository that stores the graph.
49 |
50 | Returns:
51 | bool: True if there are elements at level 0, otherwise False.
52 | """
53 | return len(repository.get_all_at_level(0)) > 0
54 |
55 |
56 | def get_document_ids_from_filenames(
57 | filenames: list[str], repository: Repository
58 | ) -> list[UUID]:
59 | """Get a document id from a list of filenames.
60 |
61 | Used to get the document id's for the filter in the search.
62 |
63 | Args:
64 | filenames (list[str]): A list of filenames.
65 | repository (Repository): The repository that saves the data.
66 |
67 | Returns:
68 | list[UUID]: A list of document id's.
69 |
70 | Raises:
71 | DocumentDoesNotExistException: If one of the provided filenames does not exist.
72 | """
73 | doc_ids: list[UUID] = []
74 | for name in filenames:
75 | doc: Document | None = repository.get_document_by_name(name)
76 |
77 | if not doc:
78 | raise DocumentDoesNotExistException(f"Document with name: {name}, does not exist")
79 | doc_ids.append(doc.id)
80 |
81 | return doc_ids
82 |
--------------------------------------------------------------------------------
/tests/graph/test_getter_setter.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import Mock
4 | from uuid import uuid4
5 |
6 | import pytest
7 |
8 | from eschergraph.graph.base import EscherBase
9 | from eschergraph.graph.community import Community
10 | from eschergraph.graph.getter_setter import _extract_inner_type
11 | from eschergraph.graph.getter_setter import _extract_property_type
12 | from eschergraph.graph.getter_setter import _parse_future_annotations
13 | from eschergraph.graph.getter_setter import loading_getter_setter
14 | from eschergraph.graph.loading import LoadState
15 | from eschergraph.persistence import Metadata
16 |
17 |
18 | @pytest.fixture(scope="function")
19 | def base_repository(mock_repository: Mock) -> Mock:
20 | # Set the metadata equal to an empty set
21 | def load_side_effect(base: EscherBase, loadstate: LoadState) -> None:
22 | base._metadata = set()
23 |
24 | mock_repository.load.side_effect = load_side_effect
25 |
26 | return mock_repository
27 |
28 |
29 | def test_extract_property_type_string() -> None:
30 | assert _extract_property_type("list[str]") == ""
31 | assert _extract_property_type("Optional[int]") == "int"
32 | assert _extract_property_type("Optional[set[int]]") == "set[int]"
33 | assert _extract_property_type("") == ""
34 |
35 |
36 | def test_extract_inner_type() -> None:
37 | assert _extract_inner_type("") == ""
38 | assert _extract_inner_type("list[str]") == ""
39 | assert _extract_inner_type("Optional[set[int]]") == "set"
40 | assert _extract_inner_type("Optional[Node]") == "Node"
41 |
42 |
43 | def test_parse_future_annotations() -> None:
44 | with pytest.raises(RuntimeError):
45 |   _parse_future_annotations("")
46 | with pytest.raises(RuntimeError):
47 |   _parse_future_annotations("list[str]")
48 | assert _parse_future_annotations("Optional[set[int]]") == set
49 | assert _parse_future_annotations("Optional[list[str]]") == list
50 | assert _parse_future_annotations("Optional[Community]") == Community
51 |
52 |
53 | # Testing whether the class decorator works.
54 | # Note that it cannot be applied to the base class directly as that would
55 | # trigger a circular import.
56 | @loading_getter_setter
57 | class ExtendedBase(EscherBase): ...
58 |
59 |
60 | def test_check_loadstate_metadata(base_repository: Mock) -> None:
61 | base: EscherBase = ExtendedBase(repository=base_repository)
62 |
63 | assert isinstance(base.metadata, set)
64 | assert base.loadstate == LoadState.CORE
65 |
66 |
67 | def test_setting_metadata(base_repository: Mock) -> None:
68 | base: EscherBase = ExtendedBase(repository=base_repository)
69 |
70 | metadata_set: set[Metadata] = {Metadata(document_id=uuid4(), chunk_id=1)}
71 | assert not base._metadata
72 | assert base.loadstate == LoadState.REFERENCE
73 |
74 | base.metadata = metadata_set
75 |
76 | assert base.metadata == metadata_set
77 | assert base.loadstate == LoadState.CORE # type: ignore
78 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/identifying_nodes.jinja:
--------------------------------------------------------------------------------
1 | You are an expert in understanding the difference between named entities.
2 | -- Goal:
3 | You will receive a list of entity names that I potentially want to merge. The goal is to merge them if they are the same entity with a different name,
4 | but if they are inherently different then return them as different entities. I am relying on your judgement on this!
5 |
6 | Give the unique set of nodes appropriate names. If the entities are different versions of a person's name, then always return the full name.
7 |
8 | Always answer in the JSON format below. Give a list of entities with their new name and the names of the merged entities from the input list.
9 | It is possible to assign an input entity to multiple entities in the answer:
10 | {'entities':
11 | [
12 | {'name':, 'merged entities': [, ,...]}
13 | ]
14 | }
15 | Make sure to only put entities from the input list in the merged entities and that all input entities are merged somewhere!
16 |
17 | If it is reasonable to merge an input entity with multiple output entities, because there might be some overlap,
18 | then make sure to add this input entity to the merged entity list of both output entities. Like in this example:
19 |
20 | Entities: Lennart, Lennart Timmermans, Patrick Timmermans, Timmermans
21 | Answer:{'entities':
22 | [
23 | {'name':Lennart Timmermans, 'merged entities': [Lennart Timmermans, Lennart, Timmermans]}
24 | {'name':Patrick Timmermans, 'merged entities': [Patrick Timmermans, Timmermans]}
25 | ]
26 | }
27 | The entity Timmermans is in both output entities
28 |
29 | -- More Examples:
30 | Entities: Manchester United, Manchester, United
31 | Answer:{'entities':
32 | [
33 | {'name':Manchester United, 'merged entities': [Manchester United, United]}
34 | {'name':Manchester, 'merged entities': [Manchester]}
35 |
36 | ]
37 | }
38 | Entities: Bjarne, Bjarne Herben
39 | Answer:{'entities':
40 | [
41 | {'name':Bjarne Herben, 'merged entities': [Bjarne Herben, Bjarne]}
42 | ]
43 | }
44 |
45 | Entities: Nvidia, Nvidia H100 GPU, H100, GPU
46 | Answer:{'entities':
47 | [
48 | {'name':Nvidia, 'merged entities': [Nvidia]}
49 | {'name':Nvidia H100 GPU, 'merged entities': [H100, Nvidia H100 GPU]}
50 | {'name': GPU, 'merged entities': [GPU]}
51 | ]
52 | }
53 | Entities: '1988 world series', 'world series trophy', '2001 world series', 'world series', '2017 world series'
54 | Answer:{'entities': [
55 | {'name':World Series, 'merged entities': [1988 world series, 2001 world series, 2017 world series, world series]}
56 | {'name':world series trophy, 'merged entities': [world series trophy]}
57 | ]}
58 |
59 | ---Now here is the real data. Extract all unique entities from the list, and make sure to double tag if there is an overlap in entities.
60 | input entities: {{entities}}
--------------------------------------------------------------------------------
/eschergraph/graph/property.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Optional
4 | from typing import TYPE_CHECKING
5 |
6 | from attrs import define
7 | from attrs import field
8 |
9 | from eschergraph.graph.base import EscherBase
10 | from eschergraph.graph.getter_setter import loading_getter_setter
11 | from eschergraph.graph.loading import LoadState
12 | from eschergraph.persistence import Metadata
13 |
14 | # Prevent circular import errors
15 | if TYPE_CHECKING:
16 | from eschergraph.graph.node import Node
17 |
18 |
19 | @loading_getter_setter
20 | @define
21 | class Property(EscherBase):
22 | """The property class.
23 |
24 | Conceptually, we consider a property to be the same as en edge.
25 | Therefore, a property can be considered as sort of an edge between the
26 | node and itself.
27 | """
28 |
29 | node: Node = field(kw_only=True)
30 | _description: Optional[str] = field(default=None, metadata={"group": LoadState.CORE})
31 |
32 | # The type annotation for the properties added by the decorator
33 | description: str = field(init=False)
34 |
35 | @classmethod
36 | def create(
37 | cls,
38 | node: Node,
39 | description: str,
40 | metadata: Optional[set[Metadata]] = None,
41 | ) -> Property:
42 | """Create a new property.
43 |
44 | The property that is created is automatically added to the specified node.
45 |
46 | Args:
47 | node (Node): The node to which the property belongs.
48 | description (str): The property's description.
49 | metadata (Optional[set[Metadata]]): The optional set with metadata about the property's extraction.
50 |
51 |
52 | Returns:
53 | The newly created property.
54 | """
55 | # The same repository as the node
56 | property: Property = cls(
57 | node=node,
58 | description=description,
59 | repository=node.repository,
60 | metadata=metadata if metadata else set(),
61 | loadstate=LoadState.FULL,
62 | )
63 |
64 | # Add the property to the node
65 | node.properties.append(property)
66 |
67 | return property
68 |
69 | def __eq__(self, other: object) -> bool:
70 | """The equals method for a property.
71 |
72 | Two property objects are considered equal if they have the same description, and
73 | if they belong to the same node.
74 |
75 | Args:
76 | other (object): The object to compare the property to.
77 |
78 | Returns:
79 | True if equal and False otherwise.
80 | """
81 | if isinstance(other, Property):
82 | return self.description == other.description and self.node.id == other.node.id
83 | return False
84 |
85 | def __hash__(self) -> int:
86 | """The hash function for a property.
87 |
88 | The hash is computed based on the id as this uniquely defines the property.
89 |
90 | Returns:
91 | The computed hash value, which is an integer.
92 | """
93 | return hash(self.id)
94 |
--------------------------------------------------------------------------------
/test_files/txt_file.txt:
--------------------------------------------------------------------------------
1 | The Architects of Tomorrow
2 | In a small, dimly lit room, a team of programmers sat hunched over their keyboards. The hum of servers filled
3 | the air as lines of code streamed across their screens. These weren’t just any programmers—they were the
4 | Architects of Tomorrow.Sarah, the team leader, paused to stretch her fingers. She glanced around the room
5 | at her team—Mark, the algorithm wizard; Aisha, the cybersecurity guru; and Leo, the interface designer who
6 | could make code come alive on screen. They were deep in the final stages of a project they had been
7 | working on for years: a new kind of network, one that would connect not just devices, but ideas, dreams, and
8 | potential.This wasn’t just about faster internet or smarter gadgets. The network they were building could
9 | amplify human creativity, bridging minds across the globe. It could help a scientist in Tokyo collaborate in
10 | real-time with an artist in Paris, or a teacher in Nairobi to instantly share lessons with students in New York. It
11 | was a web of thought, a tapestry of innovation that would redefine what it meant to be connected.“Do you
12 | think they’ll understand it?” Mark asked, breaking the silence. He was staring at a complex algorithm on his
13 | screen, one that would enable the network to learn and grow with its users.Sarah smiled. “They don’t have to
14 | understand it, just like we don’t need to understand every neuron in our brains to think. They’ll feel it. They’ll
15 | see the world changing, and they’ll be a part of it.”Aisha nodded, her eyes flicking to a series of security
16 | protocols she had been fine-tuning. “As long as we protect it. This kind of power... it needs to be
17 | safeguarded. We can’t let it be misused.”“That’s why we’re here,” Leo said, his voice calm but determined.
18 | “We’re building something that transcends what’s been done before. Something that can’t just be reduced to
19 | ones and zeros. It’s about connection—real, human connection.”Hours blurred into days as the team worked
20 | tirelessly, their passion fueling each keystroke. And then, one quiet morning, as the sun’s first light filtered into
21 | their room, it was done.Sarah took a deep breath and hit the final key. The network hummed to life, invisible
22 | but powerful. All across the world, people began to feel a shift—subtle at first, like a whisper of inspiration. An
23 | artist suddenly saw the final stroke of their masterpiece. A doctor discovered the missing link in a cure. A
24 | child, halfway across the world, picked up a book and understood it in a way they never had before.The
25 | programmers sat back, watching the world awaken to their creation. It wasn’t just code anymore. It was hope,
26 | opportunity, and a future they had shaped with their own hands.And as the world began to connect in ways it
27 | had never imagined, the Architects of Tomorrow quietly closed their laptops. Their work was done, but the
28 | future they had built was just beginning.
--------------------------------------------------------------------------------
/eschergraph/agents/llm.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC
4 | from abc import abstractmethod
5 | from typing import Any
6 |
7 | from attrs import define
8 | from attrs import field
9 |
10 |
11 | @define
12 | class FunctionCall:
13 | """The function call as returned by the model.
14 |
15 | The arguments are a JSON representation of the arguments that need to be
16 | supplied to the function. They still need to be validated.
17 | """
18 |
19 | name: str
20 | arguments: dict[str, Any]
21 |
22 |
23 | @define
24 | class TokenUsage:
25 | """Information on the tokens used by the LLM."""
26 |
27 | prompt_tokens: int
28 | completion_tokens: int
29 | total_tokens: int
30 |
31 |
32 | @define
33 | class ModelProvider(ABC):
34 | """The abstract base class for all the LLMs used in the package."""
35 |
36 | required_credentials: list[str] = ["OPENAI_API_KEY"]
37 | tokens: list[TokenUsage] = field(factory=list)
38 | max_threads: int = field(default=10)
39 |
40 | @abstractmethod
41 | def get_model_name(self) -> str:
42 | """Get the model name of the model provider.
43 |
44 | Returns:
45 | The string name of the used llm model
46 | """
47 | raise NotImplementedError
48 |
49 | @abstractmethod
50 | def get_plain_response(self, prompt: str) -> str | None:
51 | """Get a plain text response from an LLM.
52 |
53 | Args:
54 | prompt (str): The prompt to send to the LLM.
55 |
56 | Returns:
57 | The response from the LLM.
58 | """
59 | raise NotImplementedError
60 |
61 | @abstractmethod
62 | def get_multi_modal_response(self, prompt: str, image_path: str) -> Any:
63 | """Get a text response from OpenAI.
64 |
65 | Note that the model that is used is specified when instantiating the class.
66 |
67 | Args:
68 | prompt (str): The user prompt that is sent to the LLM.
69 | image_path (str): The image to be analysed with the text.
70 |
71 | Returns:
72 | The answer given or None.
73 | """
74 | raise NotImplementedError
75 |
76 | @abstractmethod
77 | def get_formatted_response(self, prompt: str, response_format: Any) -> str | None:
78 | """Get a formatted response from an LLM.
79 |
80 | Args:
81 | prompt (str): The user prompt that is sent to the LLM.
82 | response_format (Any): The format that the response should follow.
83 |
84 | Returns:
85 | Formatted answer
86 | """
87 | raise NotImplementedError
88 |
89 | @abstractmethod
90 | def get_json_response(self, prompt: str) -> dict[str, Any]:
91 | """Get a JSON response from an LLM.
92 |
93 | The JSON output is parsed into a Python dictionary before it is returned.
94 |
95 | Args:
96 | prompt (str): The prompt to send to the LLM.
97 |
98 | Returns:
99 | The parsed dictionary object.
100 | """
101 | raise NotImplementedError
102 |
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/json_table.jinja:
--------------------------------------------------------------------------------
1 | You are given the markdown of a table. The table may contain information about various entities and their relationships.
2 | The goal is to map all the relations between the table and the relevant entities, and to describe all relevant information in the table.
3 |
4 | The table name will be regarded as if it is an entity.
5 | Extract the entities and describe the relationships between the table and these entities in a structured format. Entities have to be named entities.
6 |
7 | There might be possible relations between entities mentioned in the table. Also extract these relations.
8 |
9 | Analyse the table and the caption and give a rich description for the table.
10 | Give the table a name based on the caption; make sure to include that it is a table in the name.
11 |
12 | --JSON Output format:--
13 | {
14 | "entities": [
15 | {"name": "", "description": ""},
16 | {"name": "", "description": ""}
17 | ],
18 | "relationships": [
19 | {"source": "", "target": "", "relationship": ""},
20 | {"source": "", "target": "", "relationship": ""},
21 | {"source": "", "target": "", "relationship": ""}
22 | ]
23 | }
24 |
25 | --Example:--
26 | For a table describing sales performance across different regions:
27 |
28 | - **Table Name:** Sales Report Q3
29 | - **Entity 1:** North America (NA)
30 | - **Entity 2:** Europe (EU)
31 | - **Entity 3:** Asia Pacific (APAC)
32 | Caption: Sales Report Q3 for hypothetical company
33 | Answer:
34 | {
35 | "entities": [
36 | {"name": "Sales Report Q3 ", "description": "Sales Report Q3 for hypothetical company description the regions North American (NA), Europe (EU) and Asia Pacific (APAC)"},
37 | {"name": "North America (NA) ", "description": ""}
38 | {"name": "Europe (EU)", "description": ""}
39 | {"name": "Asia Pacific (APAC) ", "description": ""}
40 |
41 | ],
42 | "relationships": [
43 | {"source": "Table 1 - Sales Report Q3", "target": "North America (NA)", "relationship": "Has a table sales report in Q3 of x for hypothetical company"},
44 | {"source": "Table 1 - Sales Report Q3", "target": "Europe (EU)", "relationship": "Has a table sales report in Q3 of y for hypothetical company"},
45 | {"source": "Table 1 - Sales Report Q3", "target": "Asia Pacific (APAC)", "relationship": "Has a table sales report in Q3 of z for hypothetical company"}
46 | ]
47 | }
48 |
49 | -Real Data-
50 | ######################
51 |
52 | {{ markdown_table }}
53 | Table caption: {{ table_caption }}
54 | These are the keywords of the document in which the table appears {{ keywords }}
55 |
56 | ######################
57 |
--------------------------------------------------------------------------------
/docs/docs/explained-eschergraph/File Parsing.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 1
3 | ---
4 |
5 | # File Parsing
6 |
7 | ### PDF files
8 |
9 | Parsing PDF files can be challenging due to difficulties in extracting and chunking text accurately. For PDF files, EscherGraph utilizes two open-source document layout models developed by [HURIDOCS](https://github.com/huridocs/pdf-document-layout-analysis). We use their lightweight LightGBM models to make the package easy to set up and run. These models leverage XML data extracted by Poppler for analysis.
10 |
11 | We are actively working on enhancing EscherGraph to be multimodal. The VGT (Vision Grid Transformer) model from HURIDOCS will enable this advancement; however, this model is too large to run on a local device and requires a GPU.
12 |
13 | For paragraph detection and chunking, we use their models within our parser.
14 |
15 | ```python
16 | from eschergraph.builder.reader.reader import Reader, Chunk
17 | file_location = 'test_files/Attention is All You Need.pdf'
18 |
19 | reader = Reader(
20 |     file_location=file_location
21 | )
22 | reader.parse()
23 | chunks: list[Chunk] = reader.chunks
24 | ```
25 |
26 | This is the Chunk object definition.
27 |
28 | ```python
29 | @define
30 | class Chunk:
31 | """The chunk object."""
32 |
33 | text: str
34 | chunk_id: int
35 | doc_id: UUID
36 | page_num: Optional[int]
37 | doc_name: str
38 | ```
39 | ### TXT files
40 | For TXT files, we use the Langchain recursive splitter, with a standard chunk size of 1500 characters and an overlap of 300 characters.
41 | ```python
42 | from eschergraph.builder.reader.reader import Reader
43 | file_location = 'test_files/txt_file.txt'
44 |
45 | reader = Reader(
46 | file_location = file_location,
47 | chunk_size = 1500,
48 | overlap = 300
49 | )
50 | reader.parse()
51 | chunks = reader.chunks
52 | ```
53 |
54 | ## Poppler disclaimer
55 | As mentioned previously, our PDF parser uses Poppler internally to convert PDF into XML. Therefore, you are required to have Poppler installed when building a graph from PDF files with our package. Unfortunately, it can be quite a hassle to install Poppler on Windows. To mitigate this, our package will automatically install Poppler on Windows if it is not already present. We do this by checking whether the required functionality is on the PATH; if not, we download a Poppler binary from [poppler-windows](https://github.com/oschwartz10612/poppler-windows). The zip file is then extracted and placed in the package's source. It is only during runtime that the binary is placed on the PATH and executed. Hence, this will only occur within the process that runs EscherGraph whilst parsing a PDF.
56 |
57 | We wanted to be fully transparent about this, since a package downloading and running binaries on your hardware can also be done with malicious intent. However, we have done this to make it as easy as possible for Windows users to use our package. If interested, the corresponding code can be found in `eschergraph/tools/fast_pdf_parse/parser.py`.
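58 |
59 | As an illustrative sketch (not the actual implementation), the check described above boils down to something like:
60 |
61 | ```python
62 | import shutil
63 |
64 | def poppler_available() -> bool:
65 |     # pdftohtml is the Poppler utility used to convert a PDF into XML
66 |     return shutil.which("pdftohtml") is not None
67 | ```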
--------------------------------------------------------------------------------
/eschergraph/agents/providers/jina.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from typing import Any
5 |
6 | import requests
7 | from attrs import define
8 | from requests import Response
9 |
10 | from eschergraph.agents.reranker import Reranker
11 | from eschergraph.agents.reranker import RerankerResult
12 | from eschergraph.exceptions import CredentialException
13 | from eschergraph.exceptions import ExternalProviderException
14 |
15 |
16 | @define
17 | class JinaReranker(Reranker):
18 | """A reranker that uses Jina's API to rerank a list of documents based on their relevance to a query.
19 |
20 | Methods:
21 |       rerank(query: str, text_list: list[str], top_n: int) -> list[RerankerResult]:
22 |         Sends a request to Jina's API to rerank the provided text list according to the query.
23 |       get_model_name() -> str: Returns the name of the model.
24 | """
25 |
26 | required_credentials: list[str] = ["JINA_API_KEY"]
27 | model: str = "jina-reranker-v2-base-multilingual"
28 |
29 | def get_model_name(self) -> str:
30 | """Returns the name of the model."""
31 | return self.model
32 |
33 | def rerank(
34 | self, query: str, text_list: list[str], top_n: int
35 | ) -> list[RerankerResult]:
36 | """Reranks a list of text documents based on their relevance to the query using Jina's API.
37 |
38 | Args:
39 | query (str): The query string for which documents are being reranked.
40 | text_list (list[str]): The list of documents (texts) to be reranked.
41 | top_n (int): The number of top relevant documents to return.
42 |
43 | Returns:
44 |       list[RerankerResult]: A list of reranked items with their relevance scores and text.
45 |         An ExternalProviderException is raised if the request fails.
46 | """
47 | if not text_list:
48 | return []
49 |
50 |     api_key: str | None = os.getenv("JINA_API_KEY")
51 |
52 | if not api_key:
53 | raise CredentialException("No API key for the Jina Reranker has been set")
54 |
55 | url = "https://api.jina.ai/v1/rerank"
56 | headers = {
57 | "Content-Type": "application/json",
58 | "Authorization": f"Bearer {api_key}",
59 | }
60 | data = {
61 | "model": self.model,
62 | "query": query,
63 | "documents": text_list,
64 | "top_n": top_n,
65 | }
66 |
67 | try:
68 | response: Response = requests.post(url, headers=headers, json=data)
69 | response.raise_for_status()
70 | response_json: Any = response.json()
71 |
72 | return [
73 | RerankerResult(
74 | index=r["index"],
75 | relevance_score=r["relevance_score"],
76 | text=r["document"]["text"],
77 | )
78 | for r in response_json.get("results", [])
79 | ]
80 |
81 | except requests.RequestException as e:
82 | raise ExternalProviderException(f"Request failed: {e}")
83 | except ValueError as e:
84 | raise ExternalProviderException(f"Something went wrong parsing the resulf: {e}")
85 |
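86 | # A minimal usage sketch (assumes the JINA_API_KEY environment variable is set):
87 | #
88 | #   reranker = JinaReranker()
89 | #   results = reranker.rerank(
90 | #     query="What is EscherGraph?",
91 | #     text_list=["EscherGraph builds knowledge graphs.", "An unrelated sentence."],
92 | #     top_n=1,
93 | #   )
94 | #   for r in results:
95 | #     print(r.index, r.relevance_score, r.text)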
--------------------------------------------------------------------------------
/eschergraph/graph/search/global_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Any
4 | from typing import Optional
5 | from typing import TYPE_CHECKING
6 | from uuid import UUID
7 |
8 | from eschergraph.agents.jinja_helper import process_template
9 | from eschergraph.config import GLOBAL_SEARCH_TEMPLATE
10 | from eschergraph.config import MAIN_COLLECTION
11 | from eschergraph.graph.search.attribute_search import AttributeSearch
12 | from eschergraph.graph.search.quick_search import rerank_and_filter_attributes
13 | from eschergraph.persistence.vector_db.vector_search_result import VectorSearchResult
14 |
15 | if TYPE_CHECKING:
16 | from eschergraph.graph import Graph
17 |
18 |
19 | def global_search(
20 | graph: Graph, query: str, doc_filter: Optional[list[UUID]] = None
21 | ) -> str:
22 | """Search a graph globally through its communities.
23 |
24 |   Note that the findings for a community should be sorted; this is the default behavior when building a graph.
25 |
26 | Args:
27 | graph (Graph): The graph object representing the data structure.
28 | query (str): The query string used to search within the graph.
29 |     doc_filter (Optional[list[UUID]]): The optional list of document ids to filter for.
30 |
31 | Returns:
32 |     str: The processed response from the graph model based on the search results.
33 | """
34 | extractions: list[AttributeSearch] = get_relevant_extractions(
35 | graph, query, doc_filter
36 | )
37 | ans_template: str = GLOBAL_SEARCH_TEMPLATE
38 | context: str = "\n".join([a.text for a in extractions])
39 | full_prompt: str = process_template(
40 | ans_template, {"CONTEXT": context, "QUERY": query}
41 | )
42 | response: str | None = graph.model.get_plain_response(full_prompt)
43 | if not response:
44 | return ""
45 |
46 | return response
47 |
48 |
49 | def get_relevant_extractions(
50 | graph: Graph, prompt: str, doc_filter: Optional[list[UUID]] = None
51 | ) -> list[AttributeSearch]:
52 | """Extract relevant attributes from the graph based on the search prompt.
53 |
54 | Args:
55 | graph (Graph): The graph object containing the data to search through.
56 | prompt (str): The query prompt used to perform the attribute search.
57 |     doc_filter (Optional[list[UUID]]): The optional list of document ids to filter for.
58 |
59 | Returns:
60 | list[AttributeSearch]: A list of relevant attributes extracted from the graph, after filtering and reranking.
61 | """
62 | # Perform the search at level 1
63 | search_metadata: dict[str, Any] = {"level": 1}
64 |
65 | if doc_filter:
66 | search_metadata["document_id"] = [str(id) for id in doc_filter]
67 |
68 | attributes_results: list[VectorSearchResult] = graph.vector_db.search(
69 | query=prompt,
70 | top_n=15,
71 | metadata=search_metadata,
72 | collection_name=MAIN_COLLECTION,
73 | )
74 |
75 | results: list[AttributeSearch] = rerank_and_filter_attributes(
76 | graph=graph, query=prompt, attributes_results=attributes_results, threshold=0
77 | )
78 | return results
79 |
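80 | # A minimal usage sketch (assumes an already-built graph; the names are illustrative):
81 | #
82 | #   answer: str = global_search(graph, "What are the key findings across documents?")
83 | #   filtered: str = global_search(graph, "Summarize this report", doc_filter=[doc_id])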
--------------------------------------------------------------------------------
/eschergraph/graph/base.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Optional
4 | from typing import TYPE_CHECKING
5 | from uuid import UUID
6 | from uuid import uuid4
7 |
8 | from attrs import define
9 | from attrs import field
10 | from attrs import fields_dict
11 |
12 | from eschergraph.graph.loading import LoadState
13 | from eschergraph.persistence import Repository
14 |
15 | # To prevent circular import errors
16 | if TYPE_CHECKING:
17 | from eschergraph.persistence import Metadata
18 |
19 |
20 | @define
21 | class EscherBase:
22 | """The base class for objects in the package that need to be persisted."""
23 |
24 | id: UUID = field(factory=uuid4, metadata={"group": LoadState.REFERENCE})
25 | _metadata: Optional[set[Metadata]] = field(
26 | default=None, hash=False, metadata={"group": LoadState.CORE}
27 | )
28 | _loadstate: LoadState = field(default=LoadState.REFERENCE)
29 | """The attribute that keeps track of the loading state of a Node."""
30 | repository: Repository = field(kw_only=True)
31 |
32 | # Type annotation for the dynamically added properties in the child classes
33 | metadata: set[Metadata] = field(init=False)
34 |
35 | def _check_loadstate(self, attr_name: str) -> None:
36 | """Check if the attribute has been loaded by the current loadstate.
37 |
38 | If not enough has been loaded, then load more instance data from the repository.
39 |
40 | Args:
41 | attr_name (str): The name of the attribute that starts with an underscore.
42 | """
43 | required_loadstate: LoadState = fields_dict(self.__class__)[attr_name].metadata[
44 | "group"
45 | ]
46 |
47 | # Load more instance data from the repository if load state is too small
48 | if self.loadstate.value < required_loadstate.value:
49 | self.repository.load(self, loadstate=required_loadstate)
50 | self._loadstate = required_loadstate
51 |
52 | @property
53 | def loadstate(self) -> LoadState:
54 | """The getter for the loadstate of an EscherGraph object.
55 |
56 | Returns:
57 |       The object's loadstate.
59 | """
60 | return self._loadstate
61 |
62 | @loadstate.setter
63 | def loadstate(self, loadstate: LoadState) -> None:
64 | """The setter for the loadstate of the EscherGraph base object.
65 |
66 |     We use a custom setter because we need to make sure that the value of the loadstate
67 |     reflects the attributes that are loaded. In addition, the loadstate cannot yet decrease,
68 |     as we are not yet removing attributes from a class.
69 |
70 | Args:
71 | loadstate (LoadState): The loadstate to set and the state in which the object should
72 | be loaded.
73 | """
74 | # Do nothing if this decreases the loadstate
75 | if loadstate.value <= self._loadstate.value:
76 | return
77 |
78 | self.repository.load(self, loadstate=loadstate)
79 | self._loadstate = loadstate
80 |
81 | def __hash__(self) -> int:
82 | """The hash method for an EscherBase object.
83 |
84 | It only uses the id.
85 |
86 | Returns:
87 | The integer hash value.
88 | """
89 | return hash(self.id)
90 |
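91 | # A sketch of how a child class typically exposes a lazily-loaded attribute via
92 | # _check_loadstate (illustrative; the real getters and setters are added
93 | # dynamically by the @loading_getter_setter decorator in
94 | # eschergraph/graph/getter_setter.py):
95 | #
96 | #   @property
97 | #   def metadata(self) -> set[Metadata]:
98 | #     self._check_loadstate("_metadata")
99 | #     return self._metadata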
--------------------------------------------------------------------------------
/tests_integration/chunk_optimized.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import time
4 | from typing import cast
5 |
6 | from eschergraph.agents.jinja_helper import process_template
7 | from eschergraph.agents.providers.jina import JinaReranker
8 | from eschergraph.agents.providers.openai import OpenAIModel
9 | from eschergraph.agents.providers.openai import OpenAIProvider
10 | from eschergraph.builder.build_log import BuildLog
11 | from eschergraph.builder.build_log import NodeEdgeExt
12 | from eschergraph.builder.build_pipeline import BuildPipeline
13 | from eschergraph.builder.models import Chunk
14 | from eschergraph.builder.reader.reader import Reader
15 | from eschergraph.config import JSON_BUILD
16 | from eschergraph.config import JSON_PROPERTY
17 | from eschergraph.graph.graph import Graph
18 | from eschergraph.persistence.metadata import Metadata
19 |
20 |
21 | def chunk_optimizer():
22 | # The temporary directory (clean run for each test)
23 |
24 | builder = BuildPipeline(
25 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI), reranker=JinaReranker()
26 | )
27 | test_file: str = "test_files/test_file_2.pdf"
28 |
29 | chunks: list[Chunk] = Reader(file_location=test_file, optimal_tokens=400).parse()
30 |
31 | for i in range(2):
32 | chunk = chunks[i]
33 | prompt_formatted: str = process_template(JSON_BUILD, {"input_text": chunk.text})
34 |
35 | answer = builder.model.get_json_response(prompt=prompt_formatted)
36 | json_nodes_edges: NodeEdgeExt = cast(NodeEdgeExt, answer)
37 | metadata: Metadata = Metadata(document_id=chunk.doc_id, chunk_id=chunk.chunk_id)
38 | log = BuildLog(
39 | chunk_text=chunk.text,
40 | metadata=metadata,
41 | nodes=json_nodes_edges["entities"],
42 | edges=json_nodes_edges["relationships"],
43 | )
44 | # node properties
45 | node_names: list[str] = [node["name"] for node in log.nodes]
46 | if not node_names:
47 | return
48 |
49 |     prompt_formatted = process_template(
50 | JSON_PROPERTY,
51 | {
52 | "current_nodes": ", ".join(node_names),
53 | "input_text": log.chunk_text,
54 | },
55 | )
56 | properties: dict[str, list[dict[str, list[str]]]] = builder.model.get_json_response(
57 | prompt=prompt_formatted
58 | )
59 |
60 | print("TEXT")
61 | print(chunk.text)
62 | print("EXTRACT")
63 | print(node_names)
64 | print(properties)
65 | print("EDGES")
66 | for e in log.edges:
67 | print(e)
68 |
69 |
70 | TEST_FILE_2 = "test_files/test_file_2.pdf"
71 |
72 |
73 | def search_check():
74 | # Set up all the graph dependencies
75 | graph_name: str = "eschergraph4"
76 |
77 | graph: Graph = Graph(
78 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI),
79 | name=graph_name,
80 | )
81 | # graph.build(files=TEST_FILE_2)
82 |
83 | query = "Who is Mahmood Sher-Jan?"
84 | r = graph.search(query)
85 | print(r)
86 |
87 |
88 | def test_search_graph() -> None:
89 | """Tests the search functionality of a Graph object."""
90 | t = time.time()
91 | openai_client = OpenAIProvider(model=OpenAIModel.GPT_4o_MINI)
92 | reranker_client = JinaReranker()
93 | graph: Graph = Graph(name="my graph", model=openai_client, reranker=reranker_client)
94 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_tokens_type_trainer/token_type_trainer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | from tqdm import tqdm
7 |
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token import (
9 | PdfToken,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
12 | TokenType,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.pdf_trainer import (
15 | PdfTrainer,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.token_features import (
18 | TokenFeatures,
19 | )
20 |
21 |
22 | class TokenTypeTrainer(PdfTrainer):
23 | def get_model_input(self) -> np.ndarray:
24 | features_rows = []
25 |
26 |     context_size = self.model_configuration.context_size
27 | for token_features, page in self.loop_token_features():
28 | page_tokens = [
29 | self.get_padding_token(segment_number=i - 999999, page_number=page.page_number)
30 |         for i in range(context_size)
31 | ]
32 | page_tokens += page.tokens
33 | page_tokens += [
34 | self.get_padding_token(segment_number=999999 + i, page_number=page.page_number)
35 |         for i in range(context_size)
36 | ]
37 |
38 |       tokens_indexes = range(context_size, len(page_tokens) - context_size)
39 | page_features = [
40 | self.get_context_features(token_features, page_tokens, i)
41 | for i in tokens_indexes
42 | ]
43 | features_rows.extend(page_features)
44 |
45 | return self.features_rows_to_x(features_rows)
46 |
47 | def loop_token_features(self):
48 | for pdf_features in tqdm(self.pdfs_features):
49 | token_features = TokenFeatures(pdf_features)
50 |
51 | for page in pdf_features.pages:
52 | if not page.tokens:
53 | continue
54 |
55 | yield token_features, page
56 |
57 | def get_context_features(
58 | self, token_features: TokenFeatures, page_tokens: list[PdfToken], token_index: int
59 | ):
60 | token_row_features = []
61 | first_token_from_context = token_index - self.model_configuration.context_size
62 | for i in range(self.model_configuration.context_size * 2):
63 | first_token = page_tokens[first_token_from_context + i]
64 | second_token = page_tokens[first_token_from_context + i + 1]
65 | token_row_features.extend(
66 | token_features.get_features(first_token, second_token, page_tokens)
67 | )
68 |
69 | return token_row_features
70 |
71 |   def predict(self, model_path: str | Path | None = None):
72 | predictions = super().predict(model_path)
73 | predictions_assigned = 0
74 | for token_features, page in self.loop_token_features():
75 | for token, prediction in zip(
76 | page.tokens,
77 | predictions[predictions_assigned : predictions_assigned + len(page.tokens)],
78 | ):
79 | token.prediction = int(np.argmax(prediction))
80 |
81 | predictions_assigned += len(page.tokens)
82 |
83 |   def set_token_types(self, model_path: str | Path | None = None):
84 | self.predict(model_path)
85 | for token in self.loop_tokens():
86 | token.token_type = TokenType.from_index(token.prediction)
87 |
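88 | # Sketch of the token layout built in get_model_input for context_size = 2:
89 | #
90 | #   [pad, pad, t0, t1, ..., tN, pad, pad]
91 | #
92 | # Every real token then has context_size neighbours on each side when its
93 | # context features are collected by get_context_features.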
--------------------------------------------------------------------------------
/eschergraph/agents/prompts/community_prompt.jinja:
--------------------------------------------------------------------------------
1 |
2 | You are an AI assistant that helps a human analyst to perform general information discovery. Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.
3 |
4 | # Goal
5 | Write a comprehensive report of a community, given a list of entities that belong to the community as well as their relationships and optional properties. The report will be used to inform decision-makers about information associated with the community and their potential impact. The content of this report includes an overview of the community's key entities, their legal compliance, technical capabilities, reputation, and noteworthy claims.
6 |
7 | # Report Structure
8 |
9 | The report should include the following sections:
10 |
11 | - TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title.
12 | - SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities.
13 | - DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules below. Be comprehensive.
14 |
15 | Return output as a well-formed JSON-formatted string with the following format:
16 | {
17 | "title": ,
18 | "summary": ,
19 | "findings": [
20 | {
21 | "summary":,
22 | "explanation":
23 | },
24 | {
25 | "summary":,
26 | "explanation":
27 | }
28 | ]
29 | }
30 |
31 | # Grounding Rules
32 |
33 | Do not include information with no supporting evidence in the data.
34 |
35 |
36 | # Real Data
37 |
38 | Use the following text for your answer. Do not make anything up in your answer.
39 |
40 | Edges:
41 | {{ relationships }}
42 |
43 | Properties:
44 | {{ properties }}
45 |
46 | The report should include the following sections:
47 |
48 | - TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title.
49 | - SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities.
50 | - DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules above. Be comprehensive.
51 |
52 | Return output as a well-formed, JSON-only formatted string with the following format:
53 | {
54 | "title": ,
55 | "summary": ,
56 | "findings": [
57 | {
58 | "summary":,
59 | "explanation":
60 | },
61 | {
62 | "summary":,
63 | "explanation":
64 | }
65 | ]
66 | }
67 |
68 |
69 | Output:
70 |
--------------------------------------------------------------------------------
/eschergraph/visualization/visualizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING
4 | from uuid import UUID
5 |
6 | import seaborn as sns
7 | from pyvis.network import Network
8 |
9 | from eschergraph.graph import Edge
10 | from eschergraph.graph.community_alg import get_leidenalg_communities
11 |
12 | if TYPE_CHECKING:
13 | from eschergraph.graph import Graph
14 | from eschergraph.graph import Node
15 |
16 |
17 | # TODO: add level and graph name to the visualization
18 | class Visualizer:
19 | """The visualizer for EscherGraphs."""
20 |
21 | @staticmethod
22 | def visualize_graph(
23 | graph: Graph, level: int = 0, save_location: str = "graph_visual.html"
24 | ) -> None:
25 | """Visualize a level of a graph.
26 |
27 | Args:
28 | graph (Graph): The graph to visualize.
29 | level (int): The level of the graph that needs to be visualized.
30 | save_location (str): The location to save the generated visual.
31 | """
32 | nodes: list[Node] = graph.repository.get_all_at_level(level=level)
33 | edges: list[Edge] = [edge for node in nodes for edge in node.edges]
34 | node_ids: list[list[UUID]] = get_leidenalg_communities(nodes).partitions
35 | node_dict: dict[UUID, Node] = {node.id: node for node in nodes}
36 |
37 | # Transform the list of node_ids into a list of nodes
38 | comms: list[list[Node]] = []
39 | for comm in node_ids:
40 | comm_nodes: list[Node] = []
41 | for id in comm:
42 | comm_nodes.append(node_dict[id])
43 |
44 | comms.append(comm_nodes)
45 |
46 | Visualizer.visualize_community_graph(
47 | comms=comms, edges=edges, save_location=save_location
48 | )
49 |
50 | @staticmethod
51 | def visualize_community_graph(
52 | comms: list[list[Node]],
53 | edges: list[Edge],
54 | save_location: str = "community_visual.html",
55 | ) -> None:
56 | """Visualize a graph of communities.
57 |
58 | Communities are provided in a list containing lists of nodes, where each
59 | list of nodes corresponds to a community.
60 |
61 | Args:
62 |       comms (list[list[Node]]): A list of communities.
63 | edges (list[Edge]): The list of edges in the community graph.
64 | save_location (str): The location to save the generated visual.
65 | """
66 | palette: list[str] = sns.color_palette("hls", len(comms)).as_hex()
67 | net = Network(
68 | notebook=False,
69 | cdn_resources="remote",
70 | height="900px",
71 | width="100%",
72 | select_menu=True,
73 | filter_menu=False,
74 | )
75 |
76 | # Map all nodes in comms to their id for lookup
77 | node_id: dict[UUID, Node] = {nd.id: nd for comm in comms for nd in comm}
78 |
79 | for idx, comm in enumerate(comms):
80 | for nd in comm:
81 | net.add_node(
82 | nd.name,
83 | label=nd.name,
84 | title=nd.description,
85 | value=len(nd.edges),
86 | color=palette[idx],
87 | )
88 |
89 | for edge in edges:
90 | net.add_edge(
91 | node_id[edge.frm.id].name, node_id[edge.to.id].name, title=edge.description
92 | )
93 |
94 | net.force_atlas_2based(central_gravity=0.015, gravity=-31)
95 | net.show_buttons(filter_=["physics"])
96 |
97 | net.show(name=save_location, notebook=False)
98 |
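99 | # A minimal usage sketch (assumes an already-built graph):
100 | #
101 | #   Visualizer.visualize_graph(graph, level=0, save_location="graph_visual.html")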
--------------------------------------------------------------------------------
/eschergraph/tools/estimator.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | class Estimator:
5 | """This is a class for estimating the cost and time to build a graph from a document."""
6 |
7 | @staticmethod
8 | def get_cost_indication(total_tokens: int, model: str) -> float:
9 | """Estimates the cost based on the number of tokens and the model used.
10 |
11 | Args:
12 | total_tokens (int): The total number of tokens.
13 | model (str): The model used for estimation ('gpt-4o' or 'gpt-4o-mini').
14 |
15 | Returns:
16 | float: The estimated cost of processing.
17 | """
18 | # Initialize variables
19 | prompt_cost: float = 0.0
20 | completion_cost: float = 0.0
21 |
22 |     # For each chunk, 2 LLM calls are performed, plus an average of 2 LLM calls per page for the node matcher.
23 |     # Some calls are also used for community building. Overall, we estimate roughly 2.5 LLM passes over the tokens.
24 | llm_calls_per_token_estimation = 2.5
25 |
26 |     # Assume that completion tokens amount to roughly a quarter of the prompt tokens
27 | if model == "gpt-4o":
28 | prompt_cost = (total_tokens / 1e6) * 5.00
29 | completion_cost = (total_tokens / 1e6) * 15.00
30 | elif model == "gpt-4o-mini":
31 | prompt_cost = (total_tokens / 1e6) * 0.150
32 | completion_cost = (total_tokens / 1e6) * 0.600
33 | else:
34 | raise ValueError("Invalid model specified.")
35 |
36 | building_cost: float = prompt_cost + (completion_cost / 4)
37 | return round(building_cost * llm_calls_per_token_estimation, 4)
38 |
39 | @staticmethod
40 | def get_time_indication(num_chunks: int, model: str) -> str:
41 | """Estimates the time required to process the document based on the number of chunks and the model used.
42 |
43 | Args:
44 | num_chunks (int): The number of chunks to process.
45 | model (str): The model used for estimation ('gpt-4o' or 'gpt-4o-mini').
46 |
47 | Returns:
48 | str: The estimated time to complete the processing, either in seconds or minutes.
49 | """
50 | # Determine average time per chunk based on model
51 | average_time_per_chunk: int = 4 if model == "gpt-4o" else 2
52 |
53 | max_workers: int = 2 # as used in ThreadPoolExecutor
54 |
55 | # If number of chunks is less than or equal to max_workers,
56 | # the time taken would be approximately the time for one chunk.
57 | if num_chunks <= max_workers:
58 | estimated_time = average_time_per_chunk
59 | else:
60 | # Calculate the time for full batches and any remaining chunks
61 | full_batches = num_chunks // max_workers
62 | remaining_chunks = num_chunks % max_workers
63 |
64 | estimated_time = full_batches * average_time_per_chunk
65 | if remaining_chunks > 0:
66 | estimated_time += average_time_per_chunk
67 |
68 |     node_matcher_delay = num_chunks * average_time_per_chunk
69 | community_building_delay = num_chunks * average_time_per_chunk
70 |
71 |     estimated_time = estimated_time + node_matcher_delay + community_building_delay
72 |
73 | # If the estimated time is more than 60 seconds, return time in minutes
74 | if estimated_time > 60:
75 | minutes = round(estimated_time / 60, 3)
76 | return f"{minutes} minute{'s' if minutes > 1 else ''}"
77 | else:
78 | return f"{round(estimated_time, 3)} seconds"
79 |
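80 | # A worked example of the cost formula above for 100,000 tokens on gpt-4o-mini:
81 | #
82 | #   prompt_cost     = (100_000 / 1e6) * 0.150 = 0.015
83 | #   completion_cost = (100_000 / 1e6) * 0.600 = 0.060
84 | #   building_cost   = 0.015 + 0.060 / 4       = 0.030
85 | #   estimate        = round(0.030 * 2.5, 4)   = 0.075
86 | #
87 | #   Estimator.get_cost_indication(100_000, "gpt-4o-mini")  # -> 0.075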
--------------------------------------------------------------------------------
/tests/agents/test_jinja_prompt_helper.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 | from assertpy import assert_that
5 | from jinja2 import Environment
6 | from jinja2 import FileSystemLoader
7 | from jinja2 import select_autoescape
8 |
9 | from eschergraph.agents.jinja_helper import extract_variables
10 | from eschergraph.agents.jinja_helper import process_template
11 | from eschergraph.exceptions import PromptFormattingException
12 |
13 | json_build_template: str = """-Goal-
14 | Extract all relevant information from the provided text into a graph representation containing entities and relations.
15 | The most important part is that you try to represent all the information in the provided text in a structured format!
16 |
17 | -Steps-
18 | 1. Identify all named entities in singular form. For people please include the entire name. Entities can also be technologies.
19 | For each identified entity, extract the following information:
20 | - entity_name: Name of the entity
21 | - entity_description: Comprehensive description of the entity's attributes and activities
22 |
23 | Format each entity output as a JSON entry with the following format:
24 |
25 | {"name": , "description": }
26 |
27 | 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
28 | For each pair of related entities, extract the following information:
29 | - source_entity: name of the source entity, as identified in step 1
30 | - target_entity: name of the target entity, as identified in step 1
31 | - relationship_description: explanation as to why you think the source entity and the target entity are related to each other
32 |
33 | Format each relationship as a JSON entry with the following format:
34 |
35 | {"source": , "target": , "relationship": }
36 |
37 | 3. Return output in English as a single list of all JSON entities and relationships identified in steps 1 and 2.
38 | return the JSON like this:
39 |
40 | {
41 | 'entities': [{"name": , "description": }, {"name": , "description": }],
42 | 'relationships':[{"source": , "target": , "relationship": }, and more]
43 | }
44 |
45 | However, only extract entities that are specific so avoid extracting entities like CEO or employee, but instead
46 | extract only named entities.
47 |
48 | -Real Data-
49 | ######################
50 | text: This is a test
51 | ######################
52 | output:"""
53 |
54 | input_text: str = "This is a test"
55 |
56 |
57 | def test_templating_function_json_build_empty_data() -> None:
58 | with pytest.raises(PromptFormattingException):
59 | process_template(template_file="json_build.jinja", data={})
60 |
61 |
62 | def test_templating_function_json_property_missing_data() -> None:
63 | with pytest.raises(PromptFormattingException):
64 | process_template(
65 | template_file="json_property.jinja", data={"input_text": input_text}
66 | )
67 |
68 |
69 | def test_extract_variables() -> None:
70 | jinja_env: Environment = Environment(
71 | loader=FileSystemLoader(searchpath="./eschergraph/agents/prompts"),
72 | autoescape=select_autoescape(),
73 | )
74 |
75 | assert_that(
76 | extract_variables("json_property.jinja", jinja_env)
77 | ).does_not_contain_duplicates().contains_only("input_text", "current_nodes")
78 |
--------------------------------------------------------------------------------
/tests/graph/search/test_global_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import patch
4 | from uuid import UUID
5 | from uuid import uuid4
6 |
7 | from eschergraph.config import MAIN_COLLECTION
8 | from eschergraph.graph.graph import Graph
9 | from eschergraph.graph.search.global_search import AttributeSearch
10 | from eschergraph.graph.search.global_search import get_relevant_extractions
11 | from eschergraph.graph.search.global_search import global_search
12 |
13 |
14 | def test_global_search(graph_unit: Graph) -> None:
15 | query = "test query"
16 | context = "Attribute 1\nAttribute 2"
17 | full_prompt = "Processed template with context and query"
18 |
19 | with patch(
20 | "eschergraph.graph.search.global_search.get_relevant_extractions"
21 | ) as mock_get_extractions:
22 | with patch(
23 | "eschergraph.graph.search.global_search.process_template"
24 | ) as mock_process_template:
25 | mock_get_extractions.return_value = [
26 | AttributeSearch(text="Attribute 1", metadata=None, parent_nodes=[""]),
27 | AttributeSearch(text="Attribute 2", metadata=None, parent_nodes=[""]),
28 | ]
29 | mock_process_template.return_value = full_prompt
30 | graph_unit.model.get_plain_response.return_value = "Generated answer"
31 |
32 | result = global_search(graph_unit, query)
33 |
34 | assert result == "Generated answer"
35 | mock_get_extractions.assert_called_once_with(graph_unit, query, None)
36 | mock_process_template.assert_called_once_with(
37 | "search/global_search_context.jinja", {"CONTEXT": context, "QUERY": query}
38 | )
39 | graph_unit.model.get_plain_response.assert_called_once_with(full_prompt)
40 |
41 |
42 | def test_global_search_get_relevant_extractions(graph_unit: Graph) -> None:
43 | prompt = "test prompt"
44 | search_results = [
45 | {"chunk": "Chunk 1", "metadata": {"level": 1}},
46 | {"chunk": "Chunk 2", "metadata": {"level": 1}},
47 | {
48 | "chunk": 123,
49 | "metadata": {"level": 1},
50 | }, # This should be filtered out as it's not a string
51 | ]
52 | reranked_results = [
53 | AttributeSearch(text="Reranked Chunk 1", metadata=None, parent_nodes=[""]),
54 | AttributeSearch(text="Reranked Chunk 2", metadata=None, parent_nodes=[""]),
55 | ]
56 |
57 | with patch(
58 | "eschergraph.graph.search.global_search.rerank_and_filter_attributes",
59 | return_value=reranked_results,
60 | ):
61 | graph_unit.vector_db.search.return_value = search_results
62 |
63 | get_relevant_extractions(graph_unit, prompt)
64 |
65 | graph_unit.vector_db.search.assert_called_once_with(
66 | query=prompt, top_n=15, metadata={"level": 1}, collection_name=MAIN_COLLECTION
67 | )
68 |
69 |
70 | def test_global_search_with_doc_filter(graph_unit: Graph) -> None:
71 | doc_filter: list[UUID] = [uuid4() for _ in range(10)]
72 |
73 | global_search(graph_unit, "test_query", doc_filter=doc_filter)
74 |
75 | graph_unit.vector_db.search.assert_called_once_with(
76 | query="test_query",
77 | top_n=15,
78 | metadata={"level": 1, "document_id": [str(id) for id in doc_filter]},
79 | collection_name=MAIN_COLLECTION,
80 | )
81 |
82 |
83 | def test_global_search_without_doc_filter(graph_unit: Graph) -> None:
84 | global_search(graph_unit, "test_query")
85 |
86 | graph_unit.vector_db.search.assert_called_once_with(
87 | query="test_query", top_n=15, metadata={"level": 1}, collection_name=MAIN_COLLECTION
88 | )
89 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/fast_trainer/paragraph_extractor_trainer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | from eschergraph.builder.reader.pdf_document_layout_analysis.fast_trainer.paragraph import (
6 | Paragraph,
7 | )
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token import (
9 | PdfToken,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_segment import (
12 | PdfSegment,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
15 | TokenType,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.token_features import (
18 | TokenFeatures,
19 | )
20 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_tokens_type_trainer.token_type_trainer import (
21 | TokenTypeTrainer,
22 | )
23 |
24 |
25 | class ParagraphExtractorTrainer(TokenTypeTrainer):
26 | def get_context_features(
27 | self, token_features: TokenFeatures, page_tokens: list[PdfToken], token_index: int
28 | ):
29 |     token_row_features = []
30 | first_token_from_context = token_index - self.model_configuration.context_size
31 | for i in range(self.model_configuration.context_size * 2):
32 | first_token = page_tokens[first_token_from_context + i]
33 | second_token = page_tokens[first_token_from_context + i + 1]
34 | features = token_features.get_features(first_token, second_token, page_tokens)
35 | features += self.get_paragraph_extraction_features(first_token, second_token)
36 | token_row_features.extend(features)
37 |
38 | return token_row_features
39 |
40 | @staticmethod
41 | def get_paragraph_extraction_features(
42 | first_token: PdfToken, second_token: PdfToken
43 | ) -> list[int]:
44 | one_hot_token_type_1 = [
45 | 1 if token_type == first_token.token_type else 0 for token_type in TokenType
46 | ]
47 | one_hot_token_type_2 = [
48 | 1 if token_type == second_token.token_type else 0 for token_type in TokenType
49 | ]
50 | return one_hot_token_type_1 + one_hot_token_type_2
51 |
52 | def loop_token_next_token(self):
53 | for pdf_features in self.pdfs_features:
54 | for page in pdf_features.pages:
55 | if not page.tokens:
56 | continue
57 | if len(page.tokens) == 1:
58 | yield page, page.tokens[0], page.tokens[0]
59 | for token, next_token in zip(page.tokens, page.tokens[1:]):
60 | yield page, token, next_token
61 |
62 | def get_pdf_segments(
63 | self, paragraph_extractor_model_path: str | Path
64 | ) -> list[PdfSegment]:
65 | paragraphs = self.get_paragraphs(paragraph_extractor_model_path)
66 | pdf_segments = [
67 | PdfSegment.from_pdf_tokens(paragraph.tokens, paragraph.pdf_name)
68 | for paragraph in paragraphs
69 | ]
70 |
71 | return pdf_segments
72 |
73 |   def get_paragraphs(self, paragraph_extractor_model_path: str | Path) -> list[Paragraph]:
74 | self.predict(paragraph_extractor_model_path)
75 | paragraphs: list[Paragraph] = []
76 | last_page = None
77 | for page, token, next_token in self.loop_token_next_token():
78 | if last_page != page:
79 | last_page = page
80 | paragraphs.append(Paragraph([token], page.pdf_name))
81 | if token == next_token:
82 | continue
83 | if token.prediction:
84 | paragraphs[-1].add_token(next_token)
85 | continue
86 | paragraphs.append(Paragraph([next_token], page.pdf_name))
87 |
88 | return paragraphs
89 |
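90 | # How get_paragraphs groups tokens (derived from the loop above): a token
91 | # prediction of 1 means "the next token belongs to the same paragraph", so a
92 | # new Paragraph is only started when the prediction is 0 or when a new page
93 | # begins.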
--------------------------------------------------------------------------------
/eschergraph/graph/edge.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Optional
4 | from typing import TYPE_CHECKING
5 |
6 | from attrs import define
7 | from attrs import field
8 |
9 | from eschergraph.exceptions import EdgeCreationException
10 | from eschergraph.exceptions import RepositoryException
11 | from eschergraph.graph.base import EscherBase
12 | from eschergraph.graph.getter_setter import loading_getter_setter
13 | from eschergraph.graph.loading import LoadState
14 | from eschergraph.persistence import Metadata
15 |
16 | # Prevent circular import errors
17 | if TYPE_CHECKING:
18 | from eschergraph.graph.node import Node
19 |
20 |
21 | @loading_getter_setter
22 | @define
23 | class Edge(EscherBase):
24 | """The edge in an EscherGraph.
25 |
26 |   Although we specify from and to nodes, edges are actually undirected,
27 | as they are richly descriptive. This is also reflected in the equals method.
28 |
29 | Note that the loadstate for an Edge is directly passed on to the two nodes that are
30 | connected by the edge.
31 | """
32 |
33 | frm: Node = field(kw_only=True)
34 | to: Node = field(kw_only=True)
35 | _description: Optional[str] = field(default=None, metadata={"group": LoadState.CORE})
36 |
37 | # The type annotation for the dynamically added property
38 | description: str = field(init=False)
39 |
40 | @classmethod
41 | def create(
42 | cls,
43 | frm: Node,
44 | to: Node,
45 | description: str,
46 | metadata: Optional[set[Metadata]] = None,
47 | ) -> Edge:
48 | """The method that allows for the creation of a new edge.
49 |
50 |     Note that edges do have a to and a from node, but they
51 |     are undirected. This is also reflected in the equals method.
52 |
53 | Args:
54 | frm (Node): The from node in the edge.
55 | to (Node): The to node in the edge.
56 | description (str): A rich description of the relation.
57 | metadata (Optional[set[Metadata]]): The optional metadata for the edge.
58 |
59 | Returns:
60 | A new edge.
61 | """
62 | if frm.id == to.id:
63 | raise EdgeCreationException(
64 | "An edge should be created between two different nodes."
65 | )
66 |
67 |     if frm.repository is not to.repository:
68 | raise RepositoryException(
69 | "The two nodes that are connected by an edge need to have the same repository."
70 | )
71 |
72 | edge: Edge = cls(
73 | frm=frm,
74 | to=to,
75 | description=description,
76 | repository=frm.repository,
77 | metadata=metadata if metadata else set(),
78 | loadstate=LoadState.FULL,
79 | )
80 |
81 | # Add the edge to the nodes
82 | frm.edges.add(edge)
83 | to.edges.add(edge)
84 |
85 | return edge
86 |
87 | def __eq__(self, other: object) -> bool:
88 | """The equals method for an edge.
89 |
90 | Two edges are equal if they have the same description and run between the same nodes.
91 |
92 | Args:
93 | other (object): The object to compare the edge to.
94 |
95 | Returns:
96 | True if equal and false otherwise.
97 | """
98 | if isinstance(other, Edge):
99 | return {self.frm.id, self.to.id} == {
100 | other.frm.id,
101 | other.to.id,
102 | } and self.description == other.description
103 |
104 | return False
105 |
106 | def __hash__(self) -> int:
107 | """The hash function for an edge.
108 |
109 | Returns:
110 | The integer hash value for an edge.
111 | """
112 | return hash((self.id, self.frm.id, self.to.id, self.description))
113 |
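114 | # Because equality is undirected, an edge created with frm=a, to=b compares
115 | # equal to one created with frm=b, to=a, provided the descriptions match.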
--------------------------------------------------------------------------------
/tests/builder/test_building_tools.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import MagicMock
4 | from uuid import uuid4
5 |
6 | from faker import Faker
7 |
8 | from eschergraph.builder.building_tools import BuildingTools
9 | from eschergraph.builder.models import Chunk
10 | from eschergraph.builder.models import ProcessedFile
11 | from eschergraph.builder.reader.reader import Reader
12 |
13 | faker: Faker = Faker()
14 |
15 |
16 | def test_check_node_ext_pass() -> None:
17 | valid_node_ext = {"name": "Node1", "description": "A sample node"}
18 | assert BuildingTools.check_node_ext(valid_node_ext) == True
19 |
20 |
21 | def test_check_node_ext_fail() -> None:
22 | invalid_node_ext = {
23 | "name": "Node1",
24 | "desc": "A sample node", # Incorrect key
25 | }
26 | assert BuildingTools.check_node_ext(invalid_node_ext) == False
27 |
28 |
29 | def test_check_edge_ext_pass() -> None:
30 | valid_edge_ext = {
31 | "source": "Node1",
32 | "target": "Node2",
33 | "relationship": "connected_to",
34 | }
35 | assert BuildingTools.check_edge_ext(valid_edge_ext) == True
36 |
37 |
38 | def test_check_edge_ext_fail() -> None:
39 | invalid_edge_ext = {
40 | "source": "Node1",
41 | "target": "Node2",
42 | "relation": "connected_to", # Incorrect key
43 | }
44 | assert BuildingTools.check_edge_ext(invalid_edge_ext) == False
45 |
46 |
47 | def test_check_property_ext_pass() -> None:
48 | valid_property_ext = {"entity_name": "Entity1", "properties": ["prop1", "prop2"]}
49 | assert BuildingTools.check_property_ext(valid_property_ext) == True
50 |
51 |
52 | def test_check_property_ext_fail() -> None:
53 | invalid_property_ext = {
54 | "entity_name": "Entity1",
55 | "properties": "prop1, prop2", # Incorrect type
56 | }
57 | assert BuildingTools.check_property_ext(invalid_property_ext) == False
58 |
59 |
60 | def test_check_node_edge_ext_pass() -> None:
61 | valid_node_edge_ext = {
62 | "entities": [{"name": "Node1", "description": "A sample node"}],
63 | "relationships": [
64 | {"source": "Node1", "target": "Node2", "relationship": "connected_to"}
65 | ],
66 | }
67 | assert BuildingTools.check_node_edge_ext(valid_node_edge_ext) == True
68 |
69 |
70 | def test_check_node_edge_ext_fail() -> None:
71 | invalid_node_edge_ext = {
72 | "entities": [
73 | {"name": "Node1", "desc": "A sample node"} # Incorrect key
74 | ],
75 | "relationships": [
76 | {"source": "Node1", "target": "Node2", "rel": "connected_to"} # Incorrect key
77 | ],
78 | }
79 | assert BuildingTools.check_node_edge_ext(invalid_node_edge_ext) == False
80 |
81 |
82 | def test_process_files_empty() -> None:
83 | assert BuildingTools.process_files(files=[], multi_modal=False) == []
84 |
85 |
86 | def test_process_files_single_file() -> None:
87 | reader_mock: MagicMock = MagicMock(spec=Reader)
88 | # Set the mock to return itself for initialization
89 | reader_mock.return_value = reader_mock
90 | reader_mock.chunks = [
91 | Chunk(text=text, chunk_id=idx, doc_id=uuid4(), page_num=idx)
92 | for idx, text in enumerate(faker.texts(nb_texts=15, max_nb_chars=80))
93 | ]
94 | reader_mock.total_tokens = 10000
95 | reader_mock.full_text = faker.text(max_nb_chars=1200)
96 | reader_mock.visual_elements = None
97 |
98 | processed: list[ProcessedFile] = BuildingTools.process_files(
99 | files=["./test_files/test.pdf"], multi_modal=False, reader_impl=reader_mock
100 | )
101 | processed_file: ProcessedFile = processed[0]
102 |
103 | assert len(processed) == 1
104 | assert processed_file.chunks == reader_mock.chunks
105 | assert processed_file.full_text == reader_mock.full_text
106 | assert processed_file.visual_elements is None
107 | assert processed_file.document.token_num == reader_mock.total_tokens
108 | assert processed_file.document.chunk_num == 15
109 |
--------------------------------------------------------------------------------
/eschergraph/tools/prepare_sync_data.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from uuid import UUID
4 |
5 | from eschergraph.graph.edge import Edge
6 | from eschergraph.graph.node import Node
7 | from eschergraph.graph.property import Property
8 | from eschergraph.persistence.change_log import Action
9 | from eschergraph.persistence.change_log import ChangeLog
10 | from eschergraph.persistence.repository import Repository
11 |
12 |
13 | def prepare_sync_data(
14 | repository: Repository,
15 | ) -> tuple[
16 | list[tuple[UUID, str, dict[str, str | int]]],
17 | list[UUID],
18 | ]:
19 | """Prepares data for synchronization with the vector database.
20 |
21 | Args:
22 | repository (Repository): The graph's repository.
23 |
24 | Returns:
25 |     tuple: A tuple containing a list of (id, document string, metadata) tuples to create and a list of ids to delete.
26 | """
27 | change_logs: list[ChangeLog] = repository.get_change_log()
28 |
29 | # Map each object id to its change_logs
30 | objects_logs: dict[UUID, list[ChangeLog]] = {log.id: [] for log in change_logs}
31 | for log in change_logs:
32 | objects_logs[log.id].append(log)
33 |
34 | ids_to_create, ids_to_delete = _get_actions_for_objects(objects_logs)
35 | create_main: list[tuple[UUID, str, dict[str, str | int]]] = []
36 |
37 | for id in ids_to_create:
38 | cur_log: ChangeLog = objects_logs[id][0]
39 | # Prepare metadata based on log type
40 | if cur_log.type == Node:
41 | node: Node | None = repository.get_node_by_id(id)
42 | if not node:
43 | continue
44 |
45 |       # We add the document_id to all the objects
46 | md_node: dict[str, str | int] = {
47 | "level": cur_log.level,
48 | "type": "node",
49 | "document_id": _get_node_document_id(node),
50 | }
51 | node_string = node.name + ", " + node.description
52 | create_main.append((id, node_string, md_node))
53 |
54 | elif cur_log.type == Edge:
55 | edge: Edge | None = repository.get_edge_by_id(id)
56 | if not edge:
57 | continue
58 | md_edge: dict[str, str | int] = {
59 | "level": cur_log.level,
60 | "type": "edge",
61 | "document_id": _get_node_document_id(edge.frm),
62 | }
63 | create_main.append((id, edge.description, md_edge))
64 |
65 | elif cur_log.type == Property:
66 | property: Property | None = repository.get_property_by_id(id)
67 | if not property:
68 | continue
69 | md_prop: dict[str, str | int] = {
70 | "level": cur_log.level,
71 | "type": "property",
72 | "document_id": _get_node_document_id(property.node),
73 | }
74 | property_string = property.node.name + ", " + property.description
75 | create_main.append((id, property_string, md_prop))
76 |
77 | return create_main, ids_to_delete
78 |
79 |
80 | def _get_actions_for_objects(
81 | objects_logs: dict[UUID, list[ChangeLog]],
82 | ) -> tuple[list[UUID], list[UUID]]:
83 | ids_to_delete: list[UUID] = []
84 | ids_to_create: list[UUID] = []
85 | for id, object_logs in objects_logs.items():
86 | # Create a set of actions for the object
87 | actions: set[Action] = {log.action for log in object_logs}
88 | if not Action.CREATE in actions:
89 | ids_to_delete.append(id)
90 | if not Action.DELETE in actions:
91 | ids_to_create.append(id)
92 |
93 | return ids_to_create, ids_to_delete
94 |
95 |
96 | def _get_node_document_id(node: Node) -> str:
97 | """Returns the UUID of the node's document_id in string format.
98 |
99 | Currently, all graph objects do still exclusively belong to a single
100 | document as we have not added inter-document merging or
101 | edge finding. As soon as this is added, this logic will change.
102 |
103 | Args:
104 | node (Node): The node to get the document_id for.
105 |
106 | Returns:
107 | The UUID as a string.
108 | """
109 | cur_level: int = node.level
110 | cur_node: Node = node
111 |
112 | # Get the metadata on a level 0 child node
113 | while cur_level > 0:
114 | cur_node = cur_node.child_nodes[0]
115 | cur_level -= 1
116 |
117 | return str(next(iter(cur_node.metadata)).document_id)
118 |
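119 | # How _get_actions_for_objects maps an object's accumulated change logs to
120 | # sync actions (derived from the set logic above):
121 | #
122 | #   {CREATE}         -> create (new object, embed it)
123 | #   {UPDATE}         -> delete + create (re-embed the updated object)
124 | #   {CREATE, DELETE} -> neither (created and removed before this sync)
125 | #   {DELETE}         -> delete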
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | node_modules
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | # The default graph save location
165 | **/eschergraph_storage/**
166 |
167 | # The visualization files
168 | community_visual.html
169 |
170 | # Downloaded parser models
171 | **/fast_models/**
172 |
173 | # Packaged binaries for the code
174 | **/bins/**
--------------------------------------------------------------------------------
/tests/graph/search/test_quick_search.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import random
4 | from unittest.mock import MagicMock
5 | from unittest.mock import patch
6 | from uuid import UUID
7 | from uuid import uuid4
8 |
9 | from eschergraph.agents.jinja_helper import process_template
10 | from eschergraph.agents.reranker import RerankerResult
11 | from eschergraph.config import MAIN_COLLECTION
12 | from eschergraph.graph.graph import Graph
13 | from eschergraph.graph.search.quick_search import quick_search
14 | from eschergraph.graph.search.quick_search import RAGAnswer
15 | from eschergraph.graph.search.quick_search import rerank_and_filter_attributes
16 | from eschergraph.persistence.vector_db.vector_search_result import VectorSearchResult
17 | from tests.persistence.vector_db.help import generate_vector_search_results
18 |
19 | RAG_SEARCH = "search/question_with_context.jinja"
20 |
21 |
22 | def test_quick_search_empty_query(graph_unit: Graph) -> None:
23 |   rag_answer: RAGAnswer = quick_search(graph_unit, "")
24 |   assert rag_answer.answer == "please ask a question"
25 |
26 |
27 | def test_quick_search_no_attributes_found(graph_unit: Graph) -> None:
28 | with patch(
29 | "eschergraph.graph.search.quick_search.get_attributes_search", return_value=[]
30 | ):
31 | graph_unit.model.get_plain_response.return_value = "No results found"
32 |     rag_answer: RAGAnswer = quick_search(graph_unit, "test query")
33 |     assert rag_answer.answer == "No results found"
34 | graph_unit.model.get_plain_response.assert_called_with(
35 | process_template(
36 | RAG_SEARCH,
37 | data={
38 | "CONTEXT": "Nothing found in the graph regarding this question!",
39 | "QUERY": "test query",
40 | },
41 | )
42 | )
43 |
44 |
45 | def test_rerank_and_filter_no_attributes(graph_unit: Graph) -> None:
46 | mock_filter_attributes: MagicMock = MagicMock()
47 | with patch(
48 | "eschergraph.graph.search.quick_search.filter_attributes", mock_filter_attributes
49 | ):
50 | graph_unit.reranker.rerank.return_value = []
51 | rerank_and_filter_attributes(graph_unit, "test query", [])
52 | graph_unit.reranker.rerank.assert_called_once_with("test query", [], top_n=0)
53 | mock_filter_attributes.assert_called_once_with(graph_unit, [], {}, 0.2)
54 |
55 |
56 | def test_rerank_and_filter_attributes(graph_unit: Graph) -> None:
57 | attributes_results: list[VectorSearchResult] = generate_vector_search_results(
58 | num_results=2
59 | )
60 | rerank_result: list[RerankerResult] = [
61 | RerankerResult(
62 | index=1, relevance_score=random.uniform(0, 1), text=attributes_results[1].chunk
63 | ),
64 | RerankerResult(
65 | index=0, relevance_score=random.uniform(0, 1), text=attributes_results[0].chunk
66 | ),
67 | ]
68 | graph_unit.reranker.rerank.return_value = rerank_result
69 | mock_filter_attributes: MagicMock = MagicMock()
70 | with patch(
71 | "eschergraph.graph.search.quick_search.filter_attributes", mock_filter_attributes
72 | ):
73 | rerank_and_filter_attributes(
74 | graph_unit, "test query", attributes_results, threshold=0.2
75 | )
76 |
77 | graph_unit.reranker.rerank.assert_called_once_with(
78 | "test query", [attributes_results[0].chunk, attributes_results[1].chunk], top_n=2
79 | )
80 | mock_filter_attributes.assert_called_once_with(
81 | graph_unit, rerank_result, {r.chunk: r for r in attributes_results}, 0.2
82 | )
83 |
84 |
85 | def test_quick_search_with_doc_filter(graph_unit: Graph) -> None:
86 | doc_filter: list[UUID] = [uuid4() for _ in range(10)]
87 |
88 | quick_search(graph_unit, "test_query", doc_filter=doc_filter)
89 |
90 | graph_unit.vector_db.search.assert_called_once_with(
91 | query="test_query",
92 | top_n=40,
93 |     metadata={"level": 0, "document_id": [str(doc_id) for doc_id in doc_filter]},
94 | collection_name=MAIN_COLLECTION,
95 | )
96 |
97 |
98 | def test_quick_search_without_doc_filter(graph_unit: Graph) -> None:
99 | quick_search(graph_unit, "test_query")
100 |
101 | graph_unit.vector_db.search.assert_called_once_with(
102 | query="test_query", top_n=40, metadata={"level": 0}, collection_name=MAIN_COLLECTION
103 | )
104 |
--------------------------------------------------------------------------------
/eschergraph/builder/reader/pdf_document_layout_analysis/pdf_features/pdf_token.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from lxml.etree import ElementBase
4 |
5 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_font import (
6 | PdfFont,
7 | )
8 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.pdf_token_context import (
9 | PdfTokenContext,
10 | )
11 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_features.rectangle import (
12 | Rectangle,
13 | )
14 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.label import (
15 | Label,
16 | )
17 | from eschergraph.builder.reader.pdf_document_layout_analysis.pdf_token_type_labels.token_type import (
18 | TokenType,
19 | )
20 |
21 |
22 | class PdfToken:
23 | def __init__(
24 | self,
25 |     page_number: int,
26 | tag_id: str,
27 | content: str,
28 | pdf_font: PdfFont,
29 | reading_order_no: int,
30 | bounding_box: Rectangle,
31 | token_type: TokenType,
32 | ):
33 | self.page_number = int(page_number)
34 | self.id: str = tag_id
35 | self.content: str = content
36 | self.font: PdfFont = pdf_font
37 | self.reading_order_no: int = reading_order_no
38 | self.bounding_box: Rectangle = bounding_box
39 | self.token_type: TokenType = token_type
40 | self.pdf_token_context: PdfTokenContext = PdfTokenContext()
41 | self.prediction: int = 0
42 |
43 |   def same_line(self, token: "PdfToken") -> bool:
44 | if self.bounding_box.bottom < token.bounding_box.top:
45 | return False
46 |
47 | if token.bounding_box.bottom < self.bounding_box.top:
48 | return False
49 |
50 | return True
51 |
52 | @staticmethod
53 |   def from_poppler_etree(page_number: int, xml_tag: ElementBase, pdf_font: PdfFont) -> "PdfToken":
54 | if "id" in xml_tag.attrib:
55 | tag_id = xml_tag.attrib["id"]
56 | else:
57 | tag_id = "tag"
58 |
59 | content = "".join(xml_tag.itertext()).strip()
60 | reading_order_no = (
61 | int(xml_tag.attrib["reading_order_no"])
62 | if "reading_order_no" in xml_tag.attrib
63 | else -1
64 | )
65 | bounding_box = Rectangle.from_poppler_tag_etree(xml_tag)
66 | token_type = TokenType.TEXT
67 |
68 | return PdfToken(
69 | page_number, tag_id, content, pdf_font, reading_order_no, bounding_box, token_type
70 | )
71 |
72 |   def get_label_intersection_percentage(self, label: Label) -> float:
73 | label_bounding_box = Rectangle.from_width_height(
74 | left=label.left, top=label.top, width=label.width, height=label.height
75 | )
76 |
77 | return self.bounding_box.get_intersection_percentage(label_bounding_box)
78 |
79 |   def get_same_line_tokens(self, page_tokens: list["PdfToken"]) -> list["PdfToken"]:
80 | top, height = self.bounding_box.top, self.bounding_box.height
81 |
82 | same_line_tokens = [
83 | each_token
84 | for each_token in page_tokens
85 | if top <= each_token.bounding_box.top < (top + height)
86 | or top < each_token.bounding_box.bottom <= (top + height)
87 | ]
88 |
89 | return same_line_tokens
90 |
91 |   def get_context(self, page_tokens: list["PdfToken"]) -> None:
92 | left, right = self.bounding_box.left, self.bounding_box.right
93 |
94 | self.pdf_token_context.left_of_tokens_on_the_left = left
95 |
96 | same_line_tokens = self.get_same_line_tokens(page_tokens)
97 |
98 | on_the_left = [
99 | each_token
100 | for each_token in same_line_tokens
101 | if each_token.bounding_box.right < right
102 | ]
103 | on_the_right = [
104 | each_token
105 | for each_token in same_line_tokens
106 | if left < each_token.bounding_box.left
107 | ]
108 |
109 | if on_the_left:
110 | self.pdf_token_context.right_of_token_on_the_left = max([
111 | x.bounding_box.right for x in on_the_left
112 | ])
113 | self.pdf_token_context.left_of_token_on_the_left = min([
114 | x.bounding_box.left for x in on_the_left
115 | ])
116 |
117 | if on_the_right:
118 | self.pdf_token_context.left_of_token_on_the_right = min([
119 | x.bounding_box.left for x in on_the_right
120 | ])
121 | self.pdf_token_context.right_of_token_on_the_right = max([
122 | x.bounding_box.right for x in on_the_right
123 | ])
124 |
--------------------------------------------------------------------------------
/eschergraph/tools/fuzzy_matcher.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from concurrent.futures import as_completed
4 | from concurrent.futures import ThreadPoolExecutor
5 |
6 | from fuzzywuzzy import fuzz
7 |
8 |
9 | class FuzzyMatcher:
10 | """Matching node names based on Levenshtein distance."""
11 |
12 | @staticmethod
13 | def get_match_sets(names: list[str]) -> list[set[str]]:
14 | """Get the sets of matches for the provided names.
15 |
16 | Args:
17 | names (list[str]): The list of names.
18 |
19 | Returns:
20 |       A list of sets, where each set contains names of similar nodes.
21 |
22 | """
23 | matches: dict[str, list[str]] = FuzzyMatcher._match_nodes(names)
24 | return FuzzyMatcher._match_sets(matches)
25 |
26 | @staticmethod
27 | def _match_nodes(node_names: list[str]) -> dict[str, list[str]]:
28 | """Matches nodes in a graph based on similarity to provided node names.
29 |
30 | :param graph: The graph containing nodes to be matched.
31 | :param node_names: A list of node names to be matched against the graph.
32 | :return: A dictionary where keys are node names and values are lists of matching nodes.
33 | """
34 | result: dict[str, list[str]] = dict()
35 |
36 | with ThreadPoolExecutor(max_workers=10) as executor:
37 | futures = [
38 | executor.submit(FuzzyMatcher._find_matches, name, node_names)
39 | for name in node_names
40 | ]
41 | for future in as_completed(futures):
42 | name, match_nodes = future.result()
43 | if match_nodes:
44 | result[name] = match_nodes
45 | return result
46 |
47 | @staticmethod
48 | def _is_similar(name1: str, name2: str) -> bool:
49 | """Checks if two node names are sufficiently similar using fuzzy matching.
50 |
51 | Args:
52 | name1 (str): The first name.
53 | name2 (str): The second name.
54 |
55 | Returns:
56 | True if sufficiently similar, False otherwise.
57 | """
58 | return bool(fuzz.token_set_ratio(name1, name2) >= 95)
59 |
60 | @staticmethod
61 | def _find_matches(query: str, names: list[str]) -> tuple[str, list[str]]:
62 | """Finds matches for a given query string within a list of names.
63 |
64 | Args:
65 | query (str): The query string to find matches for.
66 | names (list[str]): A list of node names to match against.
67 |
68 | Returns:
69 | A tuple where the first element is the query string
70 | and the second is a list of matching node names.
71 | """
72 | matches = []
73 | for name in names:
74 | if FuzzyMatcher._is_similar(query, name) and query != name:
75 | matches.append(name)
76 | return query, matches
77 |
78 | @staticmethod
79 | def _match_sets(matches: dict[str, list[str]]) -> list[set[str]]:
80 | """Group similar nodes into sets based on matching provided.
81 |
82 | Args:
83 | matches (dict[str, list[str]]): A dictionary that contains the list of matches
84 | under each node name.
85 |
86 | Returns:
87 | A list of sets, where each set contains names of similar nodes.
88 | """
89 | nodes_visited: set[str] = set()
90 | merged: list[set[str]] = []
91 |
92 | for key in matches.keys():
93 | if key in nodes_visited:
94 | continue
95 | cluster = FuzzyMatcher._vertical_matching(
96 | nodes_visited=nodes_visited,
97 | cluster={key},
98 | matches={k: set(v) for k, v in matches.items()},
99 | current=key,
100 | )
101 | merged.append(cluster)
102 |
103 | return merged
104 |
105 | @staticmethod
106 | def _vertical_matching(
107 | nodes_visited: set[str],
108 | cluster: set[str],
109 | matches: dict[str, set[str]],
110 | current: str,
111 | ) -> set[str]:
112 | """Recursively matches nodes to form clusters of similar nodes.
113 |
114 | Args:
115 | nodes_visited (set[str]): Set with visited node names.
116 | cluster (set[str]): Set with all (recursively) matched nodes.
117 |       matches (dict[str, set[str]]): All fuzzy matches for each node.
118 | current (str): Name of the current node.
119 |
120 | Returns:
121 | The cluster of similar nodes as a set.
122 | """
123 | nodes_visited.add(current)
124 |
125 | for match in matches[current]:
126 | if match not in nodes_visited:
127 | cluster.add(match)
128 | cluster = FuzzyMatcher._vertical_matching(
129 | nodes_visited, cluster, matches, match
130 | )
131 | return cluster
132 |
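A minimal usage sketch for `FuzzyMatcher` (illustrative names; note that `_match_nodes` only keeps names with at least one match, so unmatched names do not come back as singleton sets):

```python
from eschergraph.tools.fuzzy_matcher import FuzzyMatcher

# Variants of the same entity name, plus one unrelated name.
names = ["OpenAI", "openai", "OpenAI Inc.", "Microsoft"]

# Names with a token_set_ratio of at least 95 are grouped transitively.
clusters = FuzzyMatcher.get_match_sets(names)
# e.g. [{"OpenAI", "openai", "OpenAI Inc."}] ("Microsoft" matches nothing)
```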
--------------------------------------------------------------------------------
/docs/docs/getting_started.md:
--------------------------------------------------------------------------------
1 | ---
2 | sidebar_position: 1
3 | ---
4 |
5 | # Getting started
6 |
7 | Let's learn how to build and RAG-search with **EscherGraph** in under 5 minutes.
8 |
9 | ## Installing
10 | Install the package in your Python environment with the following command.
11 |
12 | ```bash
13 | pip install eschergraph
14 | ```
15 |
16 | To build and search with EscherGraph, you need an LLM, an embedding model, and a reranker. We recommend using OpenAI's GPT-4o and text-embedding-3-large models, and the jina-reranker-v2-base-multilingual from Jina AI. These are also the defaults.
17 | In the upcoming examples, we will assume that these defaults are used.
18 |
19 | If you still need a Jina AI API key, you can get one with 1 million free tokens, without registration, [here](https://jina.ai/).
20 |
21 | ## Credentials
22 | The API keys needed to connect with external APIs can be supplied to the graph in two ways:
23 | 1. via environment variables;
24 | 2. optional keyword arguments when instantiating the graph.
25 |
26 | Below we will consider both ways in which a graph instance can be created. Note that it is also possible to supply the required credentials using a combination of these methods, as long as each key is supplied at least once (see the sketch below).
27 |
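For instance, a minimal sketch of the combined approach, with the OpenAI key loaded from the environment and the Jina key passed as a keyword argument (both methods are shown individually below):

```python
from dotenv import load_dotenv
from eschergraph import Graph

load_dotenv()  # loads OPENAI_API_KEY from the .env file

# The Jina key is supplied directly as a keyword argument instead.
graph = Graph(name="pink_graph", jina_api_key="...")
```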
28 | ## Initialize graph
29 | ### 1. Environment variables
30 | First, put your Jina AI and OpenAI API keys in a `.env` file.
31 |
32 | ```bash
33 | # .env file
34 | OPENAI_API_KEY=...
35 | JINA_API_KEY=...
36 | ```
37 |
38 | Then, when instantiating a graph, make sure the environment variables are loaded.
39 | For example, you can use the `load_dotenv` function from the library `python-dotenv` to load them
40 | from a `.env` file.
41 | ```python
42 | from dotenv import load_dotenv
43 | from eschergraph import Graph
44 |
45 | load_dotenv()
46 |
47 | graph = Graph(name="pink_graph")
48 | ```
49 |
50 | ### 2. Keyword arguments
51 | ```python
52 | from eschergraph import Graph
53 |
54 | graph = Graph(
55 | name="pink_graph",
56 | openai_api_key="...",
57 | jina_api_key="..."
58 | )
59 | ```
60 |
61 | Currently, the supported models are GPT-4o and GPT-4o-mini. We recommend always using GPT-4o for graph building, since GPT-4o-mini introduces too much noise when building a graph. However, GPT-4o-mini is perfectly fine for playing around and testing. To initialize a graph with GPT-4o-mini, do the following.
62 |
63 | ```python
64 | from eschergraph import Graph
65 | from eschergraph.agents import OpenAIProvider
66 | from eschergraph.agents import OpenAIModel
67 |
68 | graph = Graph(
69 | name="pink_graph",
70 | model=OpenAIProvider(model=OpenAIModel.GPT_4o_MINI)
71 | )
72 | ```
73 |
74 | Now that we have a graph instance, you will see that all basic operations are straightforward.
75 |
76 | ## Build graph
77 | ```python
78 | my_file1 = "test_files/Attention Is All You Need.pdf"
79 |
80 | graph.build(files=my_file1)
81 |
82 | # Adding more files to the graph is possible by simply building again:
83 | my_file2 = "test_files/test_file2.txt"
84 | my_file3 = "test_files/test_file3.pdf"
85 |
86 | graph.build(files=[my_file2, my_file3])
87 | ```
88 | Build adds documents to the graph: simply specify the filepaths of the files you want to add, either as a single string or as a list of filepaths.
89 |
90 | ## Search
91 | ### Local RAG search
92 | A local RAG search uses the information stored in the graph to generate an answer using the most relevant information as extracted from the source.
93 | ```python
94 | question = "On which hardware chips were the initial models trained?"
95 |
96 | answer = graph.search(question)
97 | print(answer)
98 | ```
99 | Local search considers all nodes, edges, and properties to select the most relevant context using embedding similarity and reranking, as sketched below.
100 |
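Under the hood, local search follows a retrieve-then-rerank pattern. A simplified sketch of that flow, using the `vector_db` and `reranker` components the graph exposes (illustrative, not the exact internal API):

```python
from eschergraph.config import MAIN_COLLECTION

# 1. Embedding similarity: fetch candidate chunks from the vector database.
results = graph.vector_db.search(
    query=question, top_n=40, metadata={"level": 0}, collection_name=MAIN_COLLECTION
)

# 2. Reranking: reorder the candidates by relevance to the question.
ranked = graph.reranker.rerank(question, [r.chunk for r in results], top_n=len(results))

# 3. Keep only sufficiently relevant chunks as context for the LLM.
context = [r.text for r in ranked if r.relevance_score >= 0.2]
```

The underlying `quick_search` function also accepts a `doc_filter` list of document UUIDs to restrict the search to specific documents.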
101 | ### Global RAG search
102 | ```python
103 | global_question = "What are the conclusions from the paper?"
104 |
105 | answer = graph.global_search(global_question)
106 | print(answer)
107 | ```
108 | A global search considers the higher levels of the graph and is great for answering general, topic-level questions about the files in the graph.
109 | For example, it can be used to draw conclusions and interpret sentiment in a text.
110 |
111 | ## Visualize
112 | ### Dashboard
113 | ```python
114 | graph.dashboard()
115 | ```
116 | Print general info and statistics about the graph using the dashboard.
117 |
118 | ### Interactive plot
119 | An interactive plot of the graph's lowest level and its community level can be generated easily as well.
120 | ```python
121 | graph.visualize()
122 | ```
123 |
124 | 
125 |
126 |
127 |
--------------------------------------------------------------------------------