"""Packaging configuration for the ``eknowledge`` distribution."""
from pathlib import Path

from setuptools import setup, find_packages


setup(
    name='eknowledge',
    version='2025.4.171239',
    author='Eugene Evstafev',
    author_email='chigwel@gmail.com',
    description='A Python package for executing graph generation from textual inputs.',
    # Read the long description with an explicit encoding and without leaking
    # a file handle: the previous bare open('README.md').read() never closed
    # the file and decoded with the platform default encoding (breaks on
    # Windows if the README contains non-ASCII characters).
    long_description=Path('README.md').read_text(encoding='utf-8'),
    long_description_content_type='text/markdown',
    url='https://github.com/chigwell/eknowledge',
    packages=find_packages(),
    install_requires=[
        'langchain-core==0.3.51',
        'langchain-ollama==0.3.0'
    ],
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Programming Language :: Python :: 3',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
)
# Prompt templates used by execute_graph_generation in eknowledge/main.py.
#
# NOTE(review): the angle-bracket tags in SYSTEM_PROMPT were stripped from
# this copy of the file (an HTML-unaware export ate everything in <...>).
# They are reconstructed here to match the <node>/<from_node>/<relationship>/
# <to_node> patterns that main.py parses out of the LLM response — confirm
# against the upstream repository.

# System message: instructs the model to answer ONLY with tagged connections
# so the response can be parsed with simple regexes.
SYSTEM_PROMPT = """You are an ontology generation assistant. Your task is to identify and extract nodes and their relationships from the user's input text. You will receive a text input, and you need to generate a list of nodes and their relationships in the format specified below. Please ensure that the output is in the correct format and includes all relevant information.
Please respond ONLY with the connections within <node>...</node> tags, where each connection is inside a tag like this:

<node>
<from_node>ENTITY_A</from_node>
<relationship>RELATIONSHIP_TYPE</relationship>
<to_node>ENTITY_B</to_node>
</node>

"""
# Human message template: ``{text}`` receives one chunk of the input text and
# ``{relationships}`` the list of candidate relationship types (RELATIONS).
USER_PROMPT = """
Please identify and list the relationships between nodes for the ontology based on the user's input text:
======
{text}
======
Possible relationships include:
{relationships}
"""
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/eknowledge.svg)](https://badge.fury.io/py/eknowledge) 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) 3 | [![Downloads](https://static.pepy.tech/badge/eknowledge)](https://pepy.tech/project/eknowledge) 4 | [![LinkedIn](https://img.shields.io/badge/LinkedIn-blue)](https://www.linkedin.com/in/eugene-evstafev-716669181/) 5 | 6 | # eknowledge 7 | 8 | `eknowledge` is a Python package designed to facilitate the generation of knowledge graphs from textual inputs. It leverages language models to parse text and extract relationships between entities, organizing these relationships into a structured graph format. This tool is ideal for developers, researchers, and anyone interested in structured knowledge extraction from unstructured text. 9 | 10 | ## Installation 11 | 12 | Install `eknowledge` using pip: 13 | 14 | ```bash 15 | pip install eknowledge langchain_llm7 16 | ``` 17 | 18 | ## Usage 19 | 20 | Here's a simple example to get you started with `eknowledge`. This example demonstrates how to generate a knowledge graph from a given text input using the package. 21 | 22 | ### Example 23 | 24 | ```python 25 | from eknowledge import execute_graph_generation 26 | from langchain_llm7 import ChatLLM7 27 | 28 | # Initialize the language model 29 | MODEL = "deepseek-r1" 30 | llm = ChatLLM7(model=MODEL) 31 | 32 | # Define your input text 33 | input_text = "The quick brown fox jumps over the lazy dog." 
34 | 35 | # Generate the graph 36 | graph = execute_graph_generation( 37 | text=input_text, 38 | llm=llm, 39 | chunk_size=100, 40 | verbose=True 41 | ) 42 | 43 | # Output the graph 44 | print(graph) 45 | # > Splitting text into 1 chunks of size 100 words. 46 | # > Processing chunk 1/1... 47 | # > Nodes successfully processed in chunk 1/1. 48 | # > [{'from': 'quick brown fox', 'relationship': 'interacts_with', 'to': 'lazy dog'}] 49 | ``` 50 | 51 | This script will output a knowledge graph based on the relationships identified in the text. 52 | 53 | ## Contributing 54 | 55 | Contributions are welcome! Please open issues or submit pull requests for any bugs, features, or improvements you would like to see. 56 | 57 | ## License 58 | 59 | `eknowledge` is MIT licensed, as found in the [LICENSE](https://opensource.org/licenses/MIT) file. 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
"""Relationship vocabulary offered to the LLM when extracting ontology edges.

``RELATIONS`` is interpolated into ``USER_PROMPT``'s ``{relationships}``
placeholder by ``execute_graph_generation``. It is advisory only: the parser
does not validate the relationship strings the model returns against this
list, so graphs may contain relationship types outside it.
"""

RELATIONS = [
    'is_a',  # Subclass relationship
    'part_of',  # Compositional relationship
    'has_part',  # Reverse of part_of
    'associated_with',  # General association
    'equivalent_to',  # Equivalence relationship
    'disjoint_with',  # Disjoint relationship
    'depends_on',  # Dependency relationship
    'inverse_of',  # Inverse relationship
    'transitive',  # Transitive relationship
    'symmetrical',  # Symmetrical relationship
    'asymmetrical',  # Asymmetrical relationship
    'reflexive',  # Reflexive relationship
    'has_property',  # Entity has a specific property
    'has_attribute',  # Similar to has_property, more general
    'connected_to',  # General connection, less specific than associated_with
    'used_for',  # Indicates typical use or purpose
    'belongs_to',  # Membership relation
    'contains',  # Contains relationship
    'produced_by',  # Indicates production or creation relationship
    'preceded_by',  # Temporal or sequential precedence
    'succeeded_by',  # Temporal or sequential succession
    'interacts_with',  # Interaction without specific direction
    'causes',  # Causality relationship
    'influences',  # Influence, weaker than causality
    'contradicts',  # Contradictory relationship
    'complementary_to',  # Complementarity in properties or function
    'alternative_to',  # Provides an alternative to
    'derived_from',  # Indicates origin or derivation
    'has_member',  # Indicates membership (group to individual)
    'member_of',  # Individual is member of group (reverse of has_member)
    'subclass_of',  # Another form of is_a, commonly used in RDF/OWL
    'superclass_of',  # Reverse of subclass_of
    'annotated_with',  # Used for linking annotations or metadata
    'realizes',  # Realization relationship in BFO (Basic Formal Ontology)
    'has_quality',  # Quality possession
    'located_in',  # Spatial containment or location relationship
    'contains_information_about',  # Information content relationship
    'expresses',  # Expression relationship in genetics or traits
    'enabled_by',  # Enabling condition relationship
    'occurs_in',  # Temporal occurrence within a context
    'during',  # Temporal relationship specifying during another event
    'has_function',  # Functionality relationship
    'has_role',  # Role specification relationship
    'has_participant',  # Participation in an event or process
    'has_agent',  # Agency relationship
    'has_output',  # Output specification relationship
    'has_input',  # Input specification relationship
    'measured_by',  # Measurement relationship
    'provides',  # Provision relationship
    'requires',  # Requirement relationship
    'temporally_related_to',  # General temporal relationship
    'spatially_related_to',  # General spatial relationship
    'has_version',  # Version control relationship
    'has_exception',  # Exception specification
    'aggregates',  # Aggregation relationship
    'decomposed_into',  # Decomposition relationship
    'reified_as',  # Reification relationship
    'instantiated_by',  # Instantiation relationship
    'has_potential',  # Potentiality relationship
    'has_motive',  # Motivation relationship
    'negatively_regulates',  # Negative regulation in biological contexts
    'positively_regulates',  # Positive regulation in biological contexts
    'has_symptom',  # Symptomatic relationship in medical ontologies
    'treated_by',  # Treatment relationship in medical contexts
    'diagnosed_by'  # Diagnostic relationship in medical contexts
]
"""Core graph-generation logic: chunk text, query an LLM, parse tagged edges."""
import re
import time

from .relations import RELATIONS
from .prompts import SYSTEM_PROMPT, USER_PROMPT
from langchain_core.messages import HumanMessage, SystemMessage

# NOTE(review): the literal tag names below were stripped from this copy of
# the file by an HTML-unaware export; they are reconstructed as
# <node>/<from_node>/<relationship>/<to_node> to match SYSTEM_PROMPT and the
# "from"/"relationship"/"to" result keys — confirm against upstream.
# Compiled once at import time so they are not rebuilt on every retry.
_NODE_RE = re.compile(r"<node>(.*?)</node>", re.DOTALL)
_FROM_NODE_RE = re.compile(r"<from_node>(.*?)</from_node>")
_RELATIONSHIP_RE = re.compile(r"<relationship>(.*?)</relationship>")
_TO_NODE_RE = re.compile(r"<to_node>(.*?)</to_node>")


def split_text_by_words(text: str, chunk_size: int) -> list[str]:
    """
    Splits a text into chunks, where each chunk has a maximum number of words.

    The text is split on arbitrary whitespace (spaces, tabs, newlines; runs of
    whitespace collapse), then the words are regrouped into chunks joined by
    single spaces, none exceeding ``chunk_size`` words.

    Args:
        text: The input string to split.
        chunk_size: The maximum number of words allowed in each chunk. Must be
            a positive integer.

    Returns:
        A list of chunk strings. Empty list if ``text`` is empty or
        whitespace-only.

    Raises:
        TypeError: If ``text`` is not a string.
        ValueError: If ``chunk_size`` is not a positive integer (note: a
            positive float such as 5.0 also raises ValueError).
    """
    if not isinstance(text, str):
        raise TypeError("Input 'text' must be a string.")
    if not isinstance(chunk_size, int) or chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    # split() with no argument handles all whitespace and drops empty tokens.
    words = text.split()
    if not words:
        return []

    # Regroup words into fixed-size slices; the last chunk may be shorter.
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


def execute_graph_generation(
    text="",
    llm=None,
    chunk_size=100,
    relations=RELATIONS,
    max_retries=10,
    system_prompt=SYSTEM_PROMPT,
    user_prompt=USER_PROMPT,
    verbose=False,
    sleep_time=0.75,
):
    """
    Extract a knowledge graph from ``text`` using an LLM.

    The text is split into word chunks; for each chunk the LLM is asked (with
    ``system_prompt``/``user_prompt``) to emit ``<node>`` connections, which
    are parsed into ``{"from": ..., "relationship": ..., "to": ...}`` dicts.
    Each chunk is retried up to ``max_retries`` times on LLM errors, non-string
    responses, missing tags, or tags lacking the inner structure. A chunk that
    exhausts its retries contributes nothing; processing continues with the
    next chunk.

    Args:
        text: Input text to analyse.
        llm: A chat model exposing ``invoke(messages)`` (e.g. a LangChain
            chat model). Required.
        chunk_size: Maximum words per chunk.
        relations: Candidate relationship types interpolated into the prompt.
        max_retries: Maximum attempts per chunk.
        system_prompt: System message template.
        user_prompt: Human message template with ``{text}``/``{relationships}``
            placeholders.
        verbose: If True, print progress and retry diagnostics.
        sleep_time: Seconds slept after an attempt (between retries and after
            a successful parse) to avoid hammering the backend.

    Returns:
        A list of edge dicts; empty if no chunk yielded a valid node.

    Raises:
        ValueError: If ``llm`` is None.
    """
    if llm is None:
        raise ValueError("LLM object must be provided.")

    chunks = split_text_by_words(text, chunk_size)
    if verbose:
        print(f"Splitting text into {len(chunks)} chunks of size {chunk_size} words.")

    graph = []
    total_chunks = len(chunks)
    for count_chunk, chunk in enumerate(chunks, 1):
        if verbose:
            print(f"Processing chunk {count_chunk}/{total_chunks}...")

        found_valid_node_in_chunk = False
        retry_count = 0
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt.format(text=chunk, relationships=relations))
        ]

        # Loop until a valid node is found OR retries are exhausted for this chunk.
        while not found_valid_node_in_chunk and retry_count < max_retries:
            try:
                response = llm.invoke(messages)
                # Guard against responses without a usable .content attribute.
                content = response.content if response and hasattr(response, 'content') else ""

                if not isinstance(content, str):
                    if verbose:
                        print(
                            f"LLM response content is not a string (type: {type(content)}) for chunk {count_chunk}. Retrying (attempt {retry_count + 1}/{max_retries})...")
                    retry_count += 1
                    continue  # Retry if content isn't a string

                nodes_raw = _NODE_RE.findall(content)

                if not nodes_raw:
                    # The LLM responded, but without any <node> tags — retry.
                    if verbose:
                        print(
                            f"No <node> tags found in response for chunk {count_chunk}. Retrying (attempt {retry_count + 1}/{max_retries})...")
                    retry_count += 1
                    continue

                # Parse each <node> body; only keep ones with all three parts.
                processed_at_least_one_node = False
                for node_content in nodes_raw:
                    from_node_match = _FROM_NODE_RE.search(node_content)
                    relationship_match = _RELATIONSHIP_RE.search(node_content)
                    to_node_match = _TO_NODE_RE.search(node_content)

                    if from_node_match and relationship_match and to_node_match:
                        graph.append({
                            "from": from_node_match.group(1).strip(),
                            "relationship": relationship_match.group(1).strip(),
                            "to": to_node_match.group(1).strip()
                        })
                        processed_at_least_one_node = True

                if processed_at_least_one_node:
                    if verbose:
                        print(f"Nodes successfully processed in chunk {count_chunk}/{total_chunks}.")
                    found_valid_node_in_chunk = True  # Exit the while loop for this chunk
                else:
                    # <node> tags were present but none had the full inner structure.
                    if verbose:
                        print(
                            f"<node> tags found but no valid structure in chunk {count_chunk}. Retrying (attempt {retry_count + 1}/{max_retries})...")
                    retry_count += 1

            except Exception as e:
                # LLM invocation or unexpected processing error: sleep and retry.
                if verbose:
                    print(
                        f"Error during LLM invocation or processing for chunk {count_chunk}: {e}. Retrying (attempt {retry_count + 1}/{max_retries})...")
                retry_count += 1
                time.sleep(sleep_time)
                continue
            # Pause after a non-exception attempt (the `continue` paths above
            # deliberately skip this to match the original pacing).
            time.sleep(sleep_time)
        if not found_valid_node_in_chunk and verbose:
            print(f"Max retries ({max_retries}) reached for chunk {count_chunk}. No valid nodes added for this chunk.")

    return graph
"""Unit tests for eknowledge.

NOTE(review): the XML tags inside every mock LLM response below were stripped
from this copy of the file by an HTML-unaware export (e.g. the residue "ARB"
for the A/R/B node). They are reconstructed here as
<node>/<from_node>/<relationship>/<to_node> to match the parser's result keys
("from"/"relationship"/"to") — confirm against the upstream repository.
"""
import unittest
from unittest.mock import MagicMock, patch, call
from eknowledge import split_text_by_words, execute_graph_generation, RELATIONS, SYSTEM_PROMPT, USER_PROMPT

try:
    from langchain_core.messages import HumanMessage, SystemMessage
except ImportError:
    # Minimal stand-ins so the isinstance checks below still work without
    # langchain_core installed.
    class MockMessageBase:
        def __init__(self, content):
            self.content = content
    HumanMessage = MockMessageBase
    SystemMessage = MockMessageBase


class MockLLMResponse:
    """Duck-typed stand-in for a chat-model response (only .content is read)."""
    def __init__(self, content):
        self.content = content


class TestSplitTextByWords(unittest.TestCase):

    def test_basic_splitting(self):
        text = "This is a test sentence with eight words total."
        expected = ["This is a test", "sentence with eight words", "total."]
        self.assertEqual(split_text_by_words(text, 4), expected)

    def test_exact_multiple(self):
        text = "One two three four five six"
        expected = ["One two three", "four five six"]
        self.assertEqual(split_text_by_words(text, 3), expected)

    def test_less_than_chunk_size(self):
        text = "Short text"
        expected = ["Short text"]
        self.assertEqual(split_text_by_words(text, 5), expected)

    def test_chunk_size_one(self):
        text = "Split word by word"
        expected = ["Split", "word", "by", "word"]
        self.assertEqual(split_text_by_words(text, 1), expected)

    def test_empty_string(self):
        text = ""
        expected = []
        self.assertEqual(split_text_by_words(text, 5), expected)

    def test_whitespace_string(self):
        text = "   \n \t  "
        expected = []
        self.assertEqual(split_text_by_words(text, 5), expected)

    def test_multiple_spaces(self):
        text = "Word1   Word2  Word3 \n Word4"
        expected = ["Word1 Word2", "Word3 Word4"]
        self.assertEqual(split_text_by_words(text, 2), expected)

    def test_invalid_chunk_size_zero(self):
        with self.assertRaisesRegex(ValueError, "chunk_size must be a positive integer."):
            split_text_by_words("Some text", 0)

    def test_invalid_chunk_size_negative(self):
        with self.assertRaisesRegex(ValueError, "chunk_size must be a positive integer."):
            split_text_by_words("Some text", -1)

    def test_invalid_chunk_size_float(self):
        # Floats are not ints, even if positive
        with self.assertRaisesRegex(ValueError, "chunk_size must be a positive integer."):
            split_text_by_words("Some text", 5.0)

    def test_invalid_text_type_int(self):
        with self.assertRaisesRegex(TypeError, "Input 'text' must be a string."):
            split_text_by_words(12345, 5)  # type: ignore

    def test_invalid_text_type_none(self):
        with self.assertRaisesRegex(TypeError, "Input 'text' must be a string."):
            split_text_by_words(None, 5)  # type: ignore


class TestExecuteGraphGeneration(unittest.TestCase):

    def setUp(self):
        """Set up a mock LLM for tests."""
        self.mock_llm = MagicMock()
        self.default_relations = ["REL1", "REL2"]
        self.default_system_prompt = "System Instructions"
        self.default_user_prompt = "User Instructions: {text} - {relationships}"

    def test_no_llm_provided(self):
        """Test that ValueError is raised if llm is None."""
        with self.assertRaisesRegex(ValueError, "LLM object must be provided."):
            execute_graph_generation(text="Some text", llm=None)

    def test_single_chunk_success(self):
        """Test successful graph generation from a single chunk."""
        text = "Node A is connected to Node B."
        chunk_size = 10
        mock_response_content = """
        Some preamble...
        <node>
        <from_node>Node A</from_node>
        <relationship>is connected to</relationship>
        <to_node>Node B</to_node>
        </node>
        Some postamble...
        """
        self.mock_llm.invoke.return_value = MockLLMResponse(content=mock_response_content)

        expected_graph = [{
            "from": "Node A",
            "relationship": "is connected to",
            "to": "Node B"
        }]

        result_graph = execute_graph_generation(
            text=text,
            llm=self.mock_llm,
            chunk_size=chunk_size,
            relations=self.default_relations,
            system_prompt=self.default_system_prompt,
            user_prompt=self.default_user_prompt
        )

        self.assertEqual(result_graph, expected_graph)
        # Check LLM was called once with correct messages
        self.mock_llm.invoke.assert_called_once()
        call_args = self.mock_llm.invoke.call_args[0][0]  # Get the 'messages' argument
        self.assertIsInstance(call_args[0], SystemMessage)
        self.assertEqual(call_args[0].content, self.default_system_prompt)
        self.assertIsInstance(call_args[1], HumanMessage)
        self.assertIn(text, call_args[1].content)
        self.assertIn(str(self.default_relations), call_args[1].content)

    def test_llm_returns_no_nodes(self):
        """Test scenario where LLM response lacks <node> tags, causing retries."""
        text = "Some text that results in no nodes."
        chunk_size = 10
        max_retries = 3
        mock_response_content = "The LLM responded, but found nothing relevant."

        # LLM always returns content without nodes
        self.mock_llm.invoke.return_value = MockLLMResponse(content=mock_response_content)

        result_graph = execute_graph_generation(
            text=text,
            llm=self.mock_llm,
            chunk_size=chunk_size,
            max_retries=max_retries,
            verbose=False  # Keep console clean for test output
        )

        self.assertEqual(result_graph, [])  # Expect empty graph
        # Check LLM was called max_retries times
        self.assertEqual(self.mock_llm.invoke.call_count, max_retries)

    def test_llm_returns_malformed_node(self):
        """Test where <node> exists but inner tags are missing."""
        text = "Text leading to malformed output."
        chunk_size = 10
        max_retries = 2
        mock_response_content = """
        <node>
        <from_node>GoodNode1</from_node>
        <relationship>RELATES_TO</relationship>
        <to_node>AnotherNode</to_node>
        </node>
        <node>
        <from_node>BadNode</from_node>
        </node>
        """
        # Mock LLM to return malformed content (will retry as no *valid* node found initially)
        # Then return valid content on retry
        self.mock_llm.invoke.side_effect = [
            MockLLMResponse(content="<node>Bad</node>"),       # First attempt fails validation
            MockLLMResponse(content=mock_response_content)     # Second attempt succeeds
        ]

        expected_graph = [{
            "from": "GoodNode1",
            "relationship": "RELATES_TO",
            "to": "AnotherNode"
        }]

        result_graph = execute_graph_generation(
            text=text,
            llm=self.mock_llm,
            chunk_size=chunk_size,
            max_retries=max_retries
        )

        self.assertEqual(result_graph, expected_graph)
        self.assertEqual(self.mock_llm.invoke.call_count, 2)  # Called twice (initial + retry)

    def test_llm_invocation_error_then_success(self):
        """Test retry mechanism when llm.invoke raises an exception initially."""
        text = "Text causing initial failure."
        chunk_size = 10
        max_retries = 3
        success_response = "<node><from_node>A</from_node><relationship>R</relationship><to_node>B</to_node></node>"

        # Configure mock to raise error first, then succeed
        self.mock_llm.invoke.side_effect = [
            ValueError("Simulated LLM API error"),
            MockLLMResponse(content=success_response)
        ]

        expected_graph = [{"from": "A", "relationship": "R", "to": "B"}]

        # Use patch to temporarily suppress print statements during test
        with patch('builtins.print') as mock_print:
            result_graph = execute_graph_generation(
                text=text,
                llm=self.mock_llm,
                chunk_size=chunk_size,
                max_retries=max_retries,
                verbose=True  # Enable verbose to test print suppression
            )

        self.assertEqual(result_graph, expected_graph)
        self.assertEqual(self.mock_llm.invoke.call_count, 2)  # Failed once, succeeded once
        # Check if error message was printed (if verbose=True)
        self.assertTrue(any("Error during LLM invocation" in str(call_args) for call_args in mock_print.call_args_list))

    def test_llm_invocation_error_max_retries_exceeded(self):
        """Test scenario where LLM consistently fails, exceeding max_retries."""
        text = "Text causing consistent failure."
        chunk_size = 10
        max_retries = 2

        # Configure mock to always raise an error
        self.mock_llm.invoke.side_effect = ConnectionError("Simulated persistent network issue")

        # Suppress print for cleaner test output
        with patch('builtins.print'):
            result_graph = execute_graph_generation(
                text=text,
                llm=self.mock_llm,
                chunk_size=chunk_size,
                max_retries=max_retries,
                verbose=False
            )

        self.assertEqual(result_graph, [])  # Expect empty graph
        # Check LLM was called max_retries times
        self.assertEqual(self.mock_llm.invoke.call_count, max_retries)

    def test_empty_input_text(self):
        """Test behavior with empty input text."""
        result_graph = execute_graph_generation(text="", llm=self.mock_llm)
        self.assertEqual(result_graph, [])
        self.mock_llm.invoke.assert_not_called()  # LLM should not be called if no chunks

    def test_verbose_output(self):
        """Test that messages are printed when verbose=True."""
        text = "Verbose test text."
        chunk_size = 5
        mock_response = "<node><from_node>V</from_node><relationship>T</relationship><to_node>T</to_node></node>"
        self.mock_llm.invoke.return_value = MockLLMResponse(content=mock_response)

        with patch('builtins.print') as mock_print:
            execute_graph_generation(text=text, llm=self.mock_llm, chunk_size=chunk_size, verbose=True)

        # Check for expected print messages
        mock_print.assert_any_call("Splitting text into 1 chunks of size 5 words.")
        mock_print.assert_any_call("Processing chunk 1/1...")
        mock_print.assert_any_call("Nodes successfully processed in chunk 1/1.")


if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)