├── .github ├── CONTRIBUTOR_AGREEMENT.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ ├── documentation_improvement.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── contributors │ └── .gitkeep └── workflows │ ├── README.md │ ├── ci-tests.yml │ ├── codeql.yml │ ├── contributor-agreement-check.yml │ ├── daily-import-test.yml │ └── docs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── SECURITY.md ├── contextgem ├── __init__.py ├── internal │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── attrs.py │ │ ├── concepts.py │ │ ├── examples.py │ │ ├── instances.py │ │ ├── items.py │ │ ├── llms.py │ │ ├── mixins.py │ │ ├── paras_and_sents.py │ │ └── serialization.py │ ├── converters │ │ ├── __init__.py │ │ └── docx │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── exceptions.py │ │ │ ├── namespaces.py │ │ │ └── package.py │ ├── data_models.py │ ├── decorators.py │ ├── items.py │ ├── llm_output_structs │ │ ├── __init__.py │ │ ├── aspect_structs.py │ │ ├── concept_structs.py │ │ └── utils.py │ ├── loggers.py │ ├── prompts │ │ ├── extract_aspect_items.j2 │ │ └── extract_concept_items.j2 │ ├── system │ │ └── default_system_message.j2 │ ├── typings │ │ ├── __init__.py │ │ ├── aliases.py │ │ ├── strings_to_types.py │ │ ├── typed_class_utils.py │ │ ├── types_normalization.py │ │ ├── types_to_strings.py │ │ └── user_type_hints_validation.py │ └── utils.py └── public │ ├── __init__.py │ ├── aspects.py │ ├── concepts.py │ ├── converters │ ├── __init__.py │ └── docx.py │ ├── data_models.py │ ├── documents.py │ ├── examples.py │ ├── images.py │ ├── llms.py │ ├── paragraphs.py │ ├── pipelines.py │ ├── sentences.py │ └── utils.py ├── dev ├── __init__.py ├── content_snippets │ └── feature_table.html ├── generate_notebooks.py ├── notebooks │ ├── docs │ │ ├── advanced │ │ │ ├── advanced_aspects_and_concepts_document.ipynb │ │ │ ├── advanced_aspects_with_concepts.ipynb │ │ │ └── advanced_multiple_docs_pipeline.ipynb │ │ ├── aspects │ │ │ ├── aspect_with_concepts.ipynb │ │ │ ├── aspect_with_justifications.ipynb │ │ │ ├── aspect_with_sub_aspects.ipynb │ │ │ ├── basic_aspect.ipynb │ │ │ └── complex_hierarchy.ipynb │ │ ├── concepts │ │ │ ├── boolean_concept │ │ │ │ ├── boolean_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── date_concept │ │ │ │ ├── date_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── json_object_concept │ │ │ │ ├── adding_examples.ipynb │ │ │ │ ├── json_object_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── label_concept │ │ │ │ ├── document_aspect_analysis.ipynb │ │ │ │ ├── label_concept.ipynb │ │ │ │ ├── multi_label_classification.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── numerical_concept │ │ │ │ ├── numerical_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── rating_concept │ │ │ │ ├── multiple_ratings.ipynb │ │ │ │ ├── rating_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ └── string_concept │ │ │ │ ├── adding_examples.ipynb │ │ │ │ ├── refs_and_justifications.ipynb │ │ │ │ └── string_concept.ipynb │ │ ├── llms │ │ │ └── llm_extraction_methods │ │ │ │ ├── extract_all.ipynb │ │ │ │ ├── extract_aspects_from_document.ipynb │ │ │ │ ├── extract_concepts_from_aspect.ipynb │ │ │ │ └── extract_concepts_from_document.ipynb │ │ └── quickstart │ │ │ ├── quickstart_aspect.ipynb │ │ │ ├── quickstart_concept_aspect.ipynb │ │ │ ├── quickstart_concept_document_text.ipynb │ 
│ │ ├── quickstart_concept_document_vision.ipynb │ │ │ └── quickstart_sub_aspect.ipynb │ └── readme │ │ ├── docx_converter.ipynb │ │ ├── llm_chat.ipynb │ │ ├── quickstart_aspect.ipynb │ │ └── quickstart_concept.ipynb ├── populate_project_readme.py ├── readme.template.md ├── requirements │ ├── requirements.dev.txt │ └── requirements.main.txt └── usage_examples │ ├── __init__.py │ ├── docs │ ├── __init__.py │ ├── advanced │ │ ├── __init__.py │ │ ├── advanced_aspects_and_concepts_document.py │ │ ├── advanced_aspects_with_concepts.py │ │ └── advanced_multiple_docs_pipeline.py │ ├── aspects │ │ ├── __init__.py │ │ ├── aspect_with_concepts.py │ │ ├── aspect_with_justifications.py │ │ ├── aspect_with_sub_aspects.py │ │ ├── basic_aspect.py │ │ └── complex_hierarchy.py │ ├── concepts │ │ ├── __init__.py │ │ ├── boolean_concept │ │ │ ├── __init__.py │ │ │ ├── boolean_concept.py │ │ │ └── refs_and_justifications.py │ │ ├── date_concept │ │ │ ├── __init__.py │ │ │ ├── date_concept.py │ │ │ └── refs_and_justifications.py │ │ ├── json_object_concept │ │ │ ├── __init__.py │ │ │ ├── adding_examples.py │ │ │ ├── json_object_concept.py │ │ │ ├── refs_and_justifications.py │ │ │ └── structure │ │ │ │ ├── __init__.py │ │ │ │ ├── nested_class_structure.py │ │ │ │ ├── nested_structure.py │ │ │ │ ├── simple_class_structure.py │ │ │ │ └── simple_structure.py │ │ ├── label_concept │ │ │ ├── __init__.py │ │ │ ├── document_aspect_analysis.py │ │ │ ├── label_concept.py │ │ │ ├── multi_label_classification.py │ │ │ └── refs_and_justifications.py │ │ ├── numerical_concept │ │ │ ├── __init__.py │ │ │ ├── numerical_concept.py │ │ │ └── refs_and_justifications.py │ │ ├── rating_concept │ │ │ ├── __init__.py │ │ │ ├── multiple_ratings.py │ │ │ ├── rating_concept.py │ │ │ └── refs_and_justifications.py │ │ └── string_concept │ │ │ ├── __init__.py │ │ │ ├── adding_examples.py │ │ │ ├── refs_and_justifications.py │ │ │ └── string_concept.py │ ├── llm_config │ │ ├── __init__.py │ │ ├── cost_tracking.py │ │ ├── detailed_usage.py │ │ ├── fallback_llm.py │ │ ├── llm_api.py │ │ ├── llm_group.py │ │ ├── llm_local.py │ │ ├── o1_o4.py │ │ └── tracking_usage_and_cost.py │ ├── llms │ │ ├── __init__.py │ │ ├── llm_extraction_methods │ │ │ ├── __init__.py │ │ │ ├── extract_all.py │ │ │ ├── extract_aspects_from_document.py │ │ │ ├── extract_concepts_from_aspect.py │ │ │ └── extract_concepts_from_document.py │ │ └── llm_init │ │ │ ├── __init__.py │ │ │ ├── llm_api.py │ │ │ ├── llm_local.py │ │ │ └── lm_studio_connection_error_fix.py │ ├── optimizations │ │ ├── __init__.py │ │ ├── optimization_accuracy.py │ │ ├── optimization_choosing_llm.py │ │ ├── optimization_cost.py │ │ ├── optimization_long_docs.py │ │ └── optimization_speed.py │ ├── quickstart │ │ ├── __init__.py │ │ ├── quickstart_aspect.py │ │ ├── quickstart_concept_aspect.py │ │ ├── quickstart_concept_document_text.py │ │ ├── quickstart_concept_document_vision.py │ │ └── quickstart_sub_aspect.py │ └── serialization │ │ ├── __init__.py │ │ └── serialization.py │ ├── docstrings │ ├── __init__.py │ ├── aspects │ │ ├── __init__.py │ │ └── def_aspect.py │ ├── concepts │ │ ├── __init__.py │ │ ├── def_boolean_concept.py │ │ ├── def_date_concept.py │ │ ├── def_json_object_concept.py │ │ ├── def_label_concept.py │ │ ├── def_numerical_concept.py │ │ ├── def_rating_concept.py │ │ └── def_string_concept.py │ ├── data_models │ │ ├── __init__.py │ │ ├── def_llm_pricing.py │ │ └── def_rating_scale.py │ ├── documents │ │ ├── __init__.py │ │ └── def_document.py │ ├── examples │ │ ├── __init__.py │ 
│ ├── def_example_json_object.py │ │ └── def_example_string.py │ ├── images │ │ ├── __init__.py │ │ └── def_image.py │ ├── llms │ │ ├── __init__.py │ │ ├── def_llm.py │ │ └── def_llm_group.py │ ├── paragraphs │ │ ├── __init__.py │ │ └── def_paragraph.py │ ├── pipelines │ │ ├── __init__.py │ │ └── def_pipeline.py │ ├── sentences │ │ ├── __init__.py │ │ └── def_sentence.py │ └── utils │ │ ├── __init__.py │ │ ├── json_object_cls_struct.py │ │ └── reload_logger_settings.py │ ├── readme │ ├── __init__.py │ ├── docx_converter.py │ ├── llm_chat.py │ ├── quickstart_aspect.py │ └── quickstart_concept.py │ └── vs_other_frameworks │ ├── __init__.py │ ├── advanced │ ├── __init__.py │ ├── instructor.py │ ├── langchain.py │ └── llama_index.py │ └── basic │ ├── __init__.py │ ├── instructor.py │ ├── langchain.py │ ├── llama_index.py │ └── llama_index_rag.py ├── docs ├── Makefile ├── build_raw_docs_for_llm.py ├── docs-raw-for-llm.txt ├── make.bat └── source │ ├── _static │ ├── contextgem_component_examples.png │ ├── contextgem_how_it_works_infographics.png │ ├── contextgem_readme_header.png │ ├── contextgem_website_preview.png │ ├── custom.css │ ├── docs_preview_image_aspects.png │ ├── docs_preview_image_boolean_concept.png │ ├── docs_preview_image_date_concept.png │ ├── docs_preview_image_json_object_concept.png │ ├── docs_preview_image_label_concept.png │ ├── docs_preview_image_numerical_concept.png │ ├── docs_preview_image_rating_concept.png │ ├── docs_preview_image_string_concept.png │ ├── favicon.ico │ ├── readme_code_snippet.png │ └── tab_solid.png │ ├── advanced_usage.rst │ ├── api │ ├── aspects.rst │ ├── concepts.rst │ ├── converters.rst │ ├── data_models.rst │ ├── documents.rst │ ├── examples.rst │ ├── images.rst │ ├── llms.rst │ ├── paragraphs.rst │ ├── pipelines.rst │ ├── sentences.rst │ └── utils.rst │ ├── aspects │ └── aspects.rst │ ├── concepts │ ├── boolean_concept.rst │ ├── date_concept.rst │ ├── json_object_concept.rst │ ├── label_concept.rst │ ├── numerical_concept.rst │ ├── rating_concept.rst │ ├── string_concept.rst │ └── supported_concepts.rst │ ├── conf.py │ ├── converters │ └── docx.rst │ ├── how_it_works.rst │ ├── index.rst │ ├── installation.rst │ ├── llms │ ├── llm_config.rst │ ├── llm_extraction_methods.rst │ └── supported_llms.rst │ ├── motivation.rst │ ├── optimizations │ ├── optimization_accuracy.rst │ ├── optimization_choosing_llm.rst │ ├── optimization_cost.rst │ ├── optimization_long_docs.rst │ └── optimization_speed.rst │ ├── quickstart.rst │ ├── robots.txt │ ├── serialization.rst │ └── vs_other_frameworks.rst ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── cassettes ├── TestAll.test_aspect_extraction_from_paragraphs[llm0].yaml ├── TestAll.test_aspect_extraction_from_paragraphs[llm1].yaml ├── TestAll.test_chat.yaml ├── TestAll.test_docx_converter_llm_extract.yaml ├── TestAll.test_extract_all[llm0-document0].yaml ├── TestAll.test_extract_all[llm0-document1].yaml ├── TestAll.test_extract_all[llm0-document2].yaml ├── TestAll.test_extract_all[llm1-document0].yaml ├── TestAll.test_extract_all[llm1-document1].yaml ├── TestAll.test_extract_all[llm1-document2].yaml ├── TestAll.test_extract_aspects_from_document[llm0].yaml ├── TestAll.test_extract_aspects_from_document[llm1].yaml ├── TestAll.test_extract_complex_json_object_concept.yaml ├── TestAll.test_extract_concepts_from_aspect[llm0].yaml ├── TestAll.test_extract_concepts_from_aspect[llm1].yaml ├── TestAll.test_extract_concepts_from_document[llm0].yaml ├── TestAll.test_extract_concepts_from_document[llm1].yaml 
├── TestAll.test_extract_label_concept[llm0].yaml ├── TestAll.test_extract_label_concept[llm1].yaml ├── TestAll.test_extract_with_fallback.yaml ├── TestAll.test_local_llms.yaml ├── TestAll.test_serialization_and_cloning[llm0-document0].yaml ├── TestAll.test_serialization_and_cloning[llm0-document1].yaml ├── TestAll.test_serialization_and_cloning[llm0-document2].yaml ├── TestAll.test_serialization_and_cloning[llm1-document0].yaml ├── TestAll.test_serialization_and_cloning[llm1-document1].yaml ├── TestAll.test_serialization_and_cloning[llm1-document2].yaml ├── TestAll.test_system_messages.yaml ├── TestAll.test_usage_examples.yaml └── TestAll.test_vision[image0].yaml ├── conftest.py ├── custom_prompts ├── custom_prompt_aspects_no_tags.j2 ├── custom_prompt_aspects_with_tags.j2 ├── custom_prompt_concepts_no_tags.j2 └── custom_prompt_concepts_with_tags.j2 ├── docx_files ├── badly_formatted.docx └── en_nda_with_anomalies.docx ├── invoices ├── invoice.jpg ├── invoice.png ├── invoice.webp └── invoice2.jpg ├── ndas ├── en_nda_with_anomalies.txt ├── ua_nda_with_anomalies.txt └── zh_nda_with_anomalies.txt ├── other_files └── complex_user_profile.txt ├── test_all.py └── utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | --- 8 | 9 | ## Description 10 | A clear and concise description of what the bug is. 11 | 12 | ## Steps to Reproduce 13 | Steps to reproduce the behavior. 14 | 15 | ## Expected Behavior 16 | A clear and concise description of what you expected to happen. 17 | 18 | ## Actual Behavior 19 | A clear and concise description of what actually happened. 20 | 21 | ## Environment 22 | - OS: [e.g. Windows 11, Ubuntu 24.04] 23 | - Python version: [e.g. 3.13.2] 24 | - contextgem version: [e.g. 0.1.0] 25 | - Any other relevant environment details 26 | 27 | ## Error Logs 28 | ``` 29 | Paste any error logs or traceback here 30 | ``` 31 | 32 | ## Additional Context 33 | Add any other context about the problem here. 34 | 35 | ## Possible Solution 36 | If you have suggestions on how to fix the issue, please describe them here. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Have a question? Just ask :) 4 | url: https://github.com/shcherbak-ai/contextgem/discussions/new/ 5 | about: For questions or discussions that aren't bugs or feature requests 6 | - name: Documentation 7 | url: https://contextgem.dev 8 | about: Check the documentation for usage information and guides -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Improvement 3 | about: Suggest improvements to our documentation 4 | title: '[DOCS] ' 5 | labels: documentation 6 | assignees: '' 7 | --- 8 | 9 | ## What Documentation Needs Improvement? 10 | Provide links or describe the current documentation that needs to be improved. 11 | 12 | ## What's Wrong or Missing? 13 | A clear and concise description of what's wrong with the current documentation or what information is missing. 
14 | 15 | ## Suggested Improvement 16 | A clear and concise description of how you think the documentation should be improved. 17 | 18 | ## Additional Context 19 | Add any other context about the documentation request here. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for this project 4 | title: '[FEATURE] ' 5 | labels: enhancement 6 | assignees: '' 7 | --- 8 | 9 | ## Problem Statement 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | 12 | ## Proposed Solution 13 | A clear and concise description of what you want to happen. 14 | 15 | ## Alternatives Considered 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | ## Additional Context 19 | Add any other context, mockups, or examples about the feature request here. 20 | 21 | ## Implementation Ideas 22 | If you have ideas about how this could be implemented, please share them here. -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 9 | 10 | ## Related Issues 11 | 12 | 13 | ## Types of change 14 | 16 | - [ ] Bug fix (non-breaking change which fixes an issue) 17 | - [ ] New feature (non-breaking change which adds functionality) 18 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 19 | - [ ] Documentation update 20 | - [ ] Performance improvement 21 | - [ ] Code cleanup or refactor 22 | 23 | ## How to Test 24 | 25 | 26 | ## Checklist 27 | 29 | - [ ] I confirm that I have the right to submit this contribution and grant all the rights specified in the Contributor Agreement. 30 | - [ ] I have read, agreed to, filled in, and included my Contributor Agreement in `.github/contributors/[my-username].md`. 31 | - [ ] I ran the tests, and all new and existing tests passed. 32 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. -------------------------------------------------------------------------------- /.github/contributors/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/.github/contributors/.gitkeep -------------------------------------------------------------------------------- /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Workflows 2 | 3 | This directory contains GitHub Actions workflow configurations for continuous integration (CI) of the ContextGem project. 
4 | 5 | 6 | ## Available Workflows 7 | 8 | ### tests (`ci-tests.yml`) 9 | 10 | **Features:** 11 | - Runs on multiple operating systems (Ubuntu, macOS, Windows) 12 | - Tests across Python versions 3.10, 3.11, 3.12, and 3.13 13 | - Checks formatting with Black 14 | - Runs test suite with VCR (recorded API responses) 15 | - Generates test coverage reports 16 | 17 | **Trigger:** 18 | - Automatically runs on push and pull request events on the main branch 19 | - Can be triggered manually through the GitHub Actions UI 20 | 21 | **Environment Variables:** 22 | - This workflow uses the following environment variables: 23 | - `CONTEXTGEM_OPENAI_API_KEY`: Secret OpenAI API key 24 | - `GIST_SECRET`: Secret token to upload coverage results to a gist for badge generation 25 | 26 | ### CodeQL Analysis (`codeql.yml`) 27 | 28 | This workflow performs code security scanning using GitHub's CodeQL analysis engine. 29 | 30 | **Features:** 31 | - Scans Python codebase for security vulnerabilities and coding errors 32 | - Analyzes code quality and identifies potential issues 33 | - Results are available in the Security tab of the repository 34 | 35 | **Trigger:** 36 | - Automatically runs on push and pull request events on the main and dev branches 37 | - Scheduled to run weekly 38 | - Can be triggered manually through the GitHub Actions UI 39 | 40 | ### Documentation Build (`docs.yml`) 41 | 42 | This workflow builds and deploys the project documentation to GitHub Pages. 43 | 44 | **Features:** 45 | - Builds documentation using Sphinx 46 | - Deploys documentation to GitHub Pages when merged to main 47 | - Creates preview builds on pull requests 48 | 49 | **Trigger:** 50 | - Automatically runs on push and pull request events on the main branch 51 | - Can be triggered manually through the GitHub Actions UI 52 | 53 | ### Check Contributor Agreement (`contributor-agreement-check.yml`) 54 | 55 | This workflow ensures all contributors have signed the Contributor Agreement by checking for properly filled agreement files. 
56 | 57 | **Features:** 58 | - Verifies that each contributor has a signed agreement file 59 | - Ensures agreement files are not empty and contain the contributor's username 60 | - Prevents deletion of existing contributor agreement files 61 | - Posts helpful comments on PRs when agreement requirements aren't met 62 | 63 | **Trigger:** 64 | - Automatically runs on all pull request events (opened, synchronized, reopened) 65 | 66 | 67 | ## Running Workflows 68 | 69 | - **tests:** These run automatically on push/PR to the main branch 70 | - **CodeQL Analysis:** Runs automatically on push/PR to main/dev, weekly, and manually 71 | - **Documentation Build:** Runs automatically on push/PR to main and manually 72 | - **Check Contributor Agreement:** Runs automatically on all PRs 73 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ main, dev ] 6 | pull_request: 7 | branches: [ main, dev ] 8 | schedule: 9 | - cron: '0 0 * * 0' # Run once per week at midnight on Sunday 10 | workflow_dispatch: 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | permissions: 17 | actions: read 18 | contents: read 19 | security-events: write 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | language: [ 'python' ] 25 | 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | 30 | - name: Initialize CodeQL 31 | uses: github/codeql-action/init@v3 32 | with: 33 | languages: ${{ matrix.language }} 34 | 35 | - name: Set up Python 36 | uses: actions/setup-python@v5 37 | with: 38 | python-version: '3.13' 39 | 40 | - name: Install Poetry 41 | uses: snok/install-poetry@v1 42 | with: 43 | virtualenvs-create: true 44 | virtualenvs-in-project: true 45 | installer-parallel: true 46 | 47 | - name: Load cached pip wheels 48 | id: cached-pip-wheels 49 | uses: actions/cache@v4 50 | with: 51 | path: | 52 | ~/.cache/pip 53 | ~/Library/Caches/pip 54 | ~\AppData\Local\pip\Cache 55 | key: pip-${{ runner.os }}-python-${{ hashFiles('**/poetry.lock') }} 56 | 57 | - name: Install dependencies 58 | run: poetry install --no-interaction --with dev --no-root 59 | 60 | - name: Perform CodeQL Analysis 61 | uses: github/codeql-action/analyze@v3 62 | with: 63 | category: "/language:${{matrix.language}}" -------------------------------------------------------------------------------- /.github/workflows/daily-import-test.yml: -------------------------------------------------------------------------------- 1 | name: Daily Import Test 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' # Run daily at midnight UTC 6 | workflow_dispatch: # Allow manual triggering 7 | 8 | jobs: 9 | import-test: 10 | runs-on: macos-latest 11 | 12 | steps: 13 | - name: Set up Python 3.13 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.13' 17 | 18 | - name: Install contextgem from PyPI 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install contextgem 22 | 23 | - name: Test import 24 | run: | 25 | python -c "import contextgem; print(f'Successfully imported contextgem version {contextgem.__version__}')" -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: build docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: 
[ main ] 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | 15 | concurrency: 16 | group: "pages" 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | build: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v4 25 | 26 | - name: Setup Python 27 | id: setup-python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: '3.13' 31 | 32 | - name: Install Poetry 33 | uses: snok/install-poetry@v1 34 | with: 35 | virtualenvs-create: true 36 | virtualenvs-in-project: true 37 | installer-parallel: true 38 | 39 | - name: Load cached pip wheels 40 | id: cached-pip-wheels 41 | uses: actions/cache@v4 42 | with: 43 | path: | 44 | ~/.cache/pip 45 | ~/Library/Caches/pip 46 | ~\AppData\Local\pip\Cache 47 | key: pip-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 48 | 49 | - name: Install dependencies 50 | run: poetry install --no-interaction --with dev --no-root 51 | 52 | - name: Build documentation 53 | run: | 54 | cd docs 55 | poetry run sphinx-build -b html source _build/html -v -E 56 | 57 | - name: Create .nojekyll file 58 | run: touch docs/_build/html/.nojekyll 59 | 60 | - name: Upload artifact 61 | uses: actions/upload-pages-artifact@v3 62 | with: 63 | path: ./docs/_build/html 64 | 65 | deploy: 66 | environment: 67 | name: github-pages 68 | url: ${{ steps.deployment.outputs.page_url }} 69 | runs-on: ubuntu-latest 70 | needs: build 71 | if: github.ref == 'refs/heads/main' 72 | steps: 73 | - name: Deploy to GitHub Pages 74 | id: deployment 75 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | .pytest_cache 4 | 5 | env 6 | .env 7 | venv 8 | .venv 9 | .coverage 10 | .cz.msg 11 | .vscode 12 | ~$* 13 | *.tmp 14 | 15 | notebooks 16 | htmlcov 17 | coverage_annotate 18 | !dev/notebooks 19 | docs/build 20 | dist 21 | .DS_Store 22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | # Commitizen hook for conventional commits 4 | - repo: https://github.com/commitizen-tools/commitizen 5 | rev: v4.5.1 6 | hooks: 7 | - id: commitizen 8 | stages: [commit-msg] 9 | 10 | # Custom local hooks 11 | - repo: local 12 | hooks: 13 | 14 | # Use locally installed Black 15 | - id: black 16 | name: Black code formatter 17 | entry: poetry run black contextgem dev tests 18 | pass_filenames: false 19 | language: system 20 | files: '\.py$' 21 | stages: [pre-commit] 22 | 23 | # Use locally installed isort 24 | - id: isort 25 | name: Sort imports 26 | entry: poetry run isort contextgem dev tests 27 | pass_filenames: false 28 | language: system 29 | files: '\.py$' 30 | stages: [pre-commit] 31 | 32 | # Poetry check 33 | - id: poetry 34 | name: Poetry check 35 | entry: poetry check --lock 36 | pass_filenames: false 37 | language: system 38 | stages: [pre-commit] 39 | 40 | # Export requirements files 41 | - id: export-requirements 42 | name: Export requirements files 43 | entry: python 44 | args: ["-c", "import subprocess; subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 'dev/requirements/requirements.main.txt']); subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 
'dev/requirements/requirements.dev.txt', '--with', 'dev'])"] 45 | language: python 46 | pass_filenames: false 47 | always_run: true 48 | stages: [pre-commit] 49 | 50 | # Update README.md from template 51 | - id: update-readme 52 | name: Update README.md 53 | entry: python 54 | args: ["-c", "import subprocess; subprocess.run(['python', 'dev/populate_project_readme.py'])"] 55 | language: python 56 | pass_filenames: false 57 | always_run: true 58 | stages: [pre-commit] 59 | 60 | # Build raw docs for LLM 61 | - id: build-raw-docs 62 | name: Build raw docs for LLM 63 | entry: python docs/build_raw_docs_for_llm.py 64 | language: system 65 | pass_filenames: false 66 | always_run: true 67 | stages: [pre-commit] 68 | 69 | # Generate example notebooks 70 | - id: generate-notebooks 71 | name: Generate example notebooks 72 | entry: python dev/generate_notebooks.py 73 | language: system 74 | pass_filenames: false 75 | always_run: true 76 | stages: [pre-commit] 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to ContextGem will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), with the following additional categories: 5 | 6 | - **Refactor**: Code reorganization that doesn't change functionality but improves structure or maintainability 7 | 8 | ## [0.6.1](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.6.1) - 2025-06-04 9 | ### Changed 10 | - Updated documentation for LM Studio models to clarify dummy API key requirement 11 | 12 | ## [0.6.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.6.0) - 2025-06-03 13 | ### Added 14 | - LabelConcept - a classification concept type that categorizes content using predefined labels. 15 | 16 | ## [0.5.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.5.0) - 2025-05-29 17 | ### Fixed 18 | - Params handling for reasoning (CoT-capable) models other than OpenAI o-series. Enabled automatic retry of LLM calls with dropping unsupported params if such unsupported params were set for the model. Improved handling and validation of LLM call params. 19 | 20 | ### Changed 21 | - Migrated to wtpsplit-lite - a lightweight version of wtpsplit that only retains accelerated ONNX inference of SaT models with minimal dependencies. 22 | 23 | ## [0.4.1](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.4.1) - 2025-05-26 24 | ### Added 25 | - Comprehensive docs on extracting aspects, extracting concepts, and LLM extraction methods 26 | 27 | ## [0.4.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.4.0) - 2025-05-20 28 | ### Added 29 | - Support for local SaT model paths in Document's `sat_model_id` parameter 30 | 31 | ## [0.3.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.3.0) - 2025-05-19 32 | ### Added 33 | - Expanded JsonObjectConcept to support nested class hierarchies, nested dictionary structures, lists containing objects, and literal types. 34 | 35 | ## [0.2.4](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.4) - 2025-05-09 36 | ### Fixed 37 | - Removed 'think' tags and content from LLM outputs (e.g. 
when using DeepSeek R1 via Ollama) which was breaking JSON parsing and validation
38 |
39 | ### Added
40 | - Documentation for cloud/local LLMs and LLM configuration guide
41 |
42 | ## [0.2.3](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.3) - 2025-05-04
43 | ### Changed
44 | - Updated litellm dependency version after encoding bug has been fixed upstream
45 |
46 | ## [0.2.2](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.2) - 2025-05-02
47 | ### Refactor
48 | - Refactored DOCX converter internals for better maintainability
49 |
50 | ## [0.2.1](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.1) - 2025-04-30
51 | ### Fixed
52 | - Fixed litellm dependency issue, pinning to version ==1.67.1 to avoid encoding bug in newer versions of litellm
53 |
54 | ## [0.2.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.0) - 2025-04-21
55 | ### Added
56 | - Added DocxConverter for converting DOCX files into ContextGem Document objects
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 |   - family-names: Shcherbak
5 |     given-names: Sergii
6 |     email: sergii@shcherbak.ai
7 | title: "ContextGem: Effortless LLM extraction from documents"
8 | date-released: 2025-04-02
9 | url: "https://github.com/shcherbak-ai/contextgem"
10 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | ContextGem - Effortless LLM extraction from documents
2 | ======================================================
3 |
4 | Copyright (c) 2025 Shcherbak AI AS
5 | All rights reserved
6 | Developed by Sergii Shcherbak
7 |
8 | This software is licensed under the Apache License, Version 2.0 (the "License");
9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 |
12 | http://www.apache.org/licenses/LICENSE-2.0
13 |
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 |
20 | ==============================================================================
21 | THIRD-PARTY COMPONENTS
22 | ==============================================================================
23 |
24 | This software includes the following third-party components:
25 |
26 | Core Dependencies:
27 | - aiolimiter: Rate limiting for asynchronous operations
28 | - Jinja2: Template engine
29 | - litellm: LLM interface library (this software uses only MIT-licensed portions of LiteLLM and does not utilize any components from the enterprise/ directory)
30 | - loguru: Logging utility
31 | - pydantic: Data validation
32 | - python-ulid: ULID generation
33 | - wtpsplit-lite: Lightweight version of wtpsplit that only retains accelerated ONNX inference of SaT models with minimal dependencies
34 |
35 | Development Dependencies:
36 | - black: Code formatting
37 | - commitizen: Conventional commit tool and release management
38 | - coverage: Test coverage measurement
39 | - isort: Sorting imports
40 | - nbformat: Notebook format utilities
41 | - pip-tools: Dependency management
42 | - pre-commit: Pre-commit hooks
43 | - pytest: Testing framework
44 | - pytest-cov: Coverage plugin for pytest
45 | - pytest-recording: Recording HTTP interactions for tests
46 | - python-dotenv: Environment variable management
47 | - sphinx: Documentation generator
48 | - sphinx-autodoc-typehints: Type annotation support for Sphinx
49 | - sphinx-book-theme: Book-like theme for Sphinx
50 | - sphinx-copybutton: Adds copy button to code blocks in Sphinx docs
51 | - sphinx-design: Component library for Sphinx documentation
52 | - sphinx-sitemap: Generates XML sitemaps for Sphinx documentation
53 | - sphinxext-opengraph: OpenGraph metadata support for Sphinx documentation
54 |
55 | Each of these components may have their own licenses. Users should refer to the
56 | respective project repositories for detailed license information.
57 |
58 | ==============================================================================
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 |
4 | ## Supported Versions
5 |
6 | We maintain security practices for the latest release of this library. Older versions may not receive security updates.
7 |
8 |
9 | ## Security Testing
10 |
11 | This project is automatically tested for security issues using [CodeQL](https://codeql.github.com/) static analysis (run via GitHub Actions).
12 |
13 | We also use [Snyk](https://snyk.io) as needed for supplementary dependency vulnerability monitoring.
14 |
15 |
16 | ## Data Privacy
17 |
18 | This library uses LiteLLM as a local Python package to communicate with LLM providers using a unified interface. No data or telemetry is transmitted to LiteLLM servers, as the SDK is run entirely within the user's environment. According to LiteLLM's documentation, self-hosted or local SDK use involves no data storage and no telemetry. For details, see [LiteLLM's documentation](https://docs.litellm.ai/docs/data_security).
19 |
20 |
21 | ## Reporting a Vulnerability
22 |
23 | We value the security community's role in protecting our users.
If you discover a potential security issue in this project, please report it as follows: 24 | 25 | 📧 **Email**: `sergii@shcherbak.ai` 26 | 27 | When reporting, please include: 28 | - A detailed description of the issue 29 | - Steps to reproduce the vulnerability 30 | - Any relevant logs, context, or configurations 31 | 32 | We aim to respond promptly to all valid reports. Please note that we do not currently offer a bug bounty program. 33 | 34 | 35 | ## Questions? 36 | 37 | If you’re unsure whether something is a vulnerability or just a bug, feel free to reach out via the email above before submitting a full report. 38 | -------------------------------------------------------------------------------- /contextgem/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | ContextGem - Effortless LLM extraction from documents 21 | """ 22 | 23 | __version__ = "0.6.1" 24 | __author__ = "Shcherbak AI AS" 25 | 26 | from contextgem.public import ( 27 | Aspect, 28 | BooleanConcept, 29 | DateConcept, 30 | Document, 31 | DocumentLLM, 32 | DocumentLLMGroup, 33 | DocumentPipeline, 34 | DocxConverter, 35 | Image, 36 | JsonObjectClassStruct, 37 | JsonObjectConcept, 38 | JsonObjectExample, 39 | LabelConcept, 40 | LLMPricing, 41 | NumericalConcept, 42 | Paragraph, 43 | RatingConcept, 44 | RatingScale, 45 | Sentence, 46 | StringConcept, 47 | StringExample, 48 | image_to_base64, 49 | reload_logger_settings, 50 | ) 51 | 52 | __all__ = [ 53 | # Aspects 54 | "Aspect", 55 | # Concepts 56 | "StringConcept", 57 | "BooleanConcept", 58 | "NumericalConcept", 59 | "RatingConcept", 60 | "JsonObjectConcept", 61 | "DateConcept", 62 | "LabelConcept", 63 | # Documents 64 | "Document", 65 | # Pipelines 66 | "DocumentPipeline", 67 | # Paragraphs 68 | "Paragraph", 69 | # Sentences 70 | "Sentence", 71 | # Images 72 | "Image", 73 | # Examples 74 | "StringExample", 75 | "JsonObjectExample", 76 | # LLMs 77 | "DocumentLLM", 78 | "DocumentLLMGroup", 79 | # Data models 80 | "LLMPricing", 81 | "RatingScale", 82 | # Utils 83 | "image_to_base64", 84 | "reload_logger_settings", 85 | "JsonObjectClassStruct", 86 | # Converters 87 | "DocxConverter", 88 | ] 89 | -------------------------------------------------------------------------------- /contextgem/internal/base/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.base.attrs import ( 20 | _AssignedAspectsProcessor, 21 | _AssignedConceptsProcessor, 22 | _AssignedInstancesProcessor, 23 | _ExtractedItemsAttributeProcessor, 24 | _RefParasAndSentsAttrituteProcessor, 25 | ) 26 | from contextgem.internal.base.concepts import _Concept 27 | from contextgem.internal.base.instances import _InstanceBase 28 | from contextgem.internal.base.items import _ExtractedItem 29 | from contextgem.internal.base.mixins import _PostInitCollectorMixin 30 | from contextgem.internal.base.paras_and_sents import _ParasAndSentsBase 31 | 32 | __all__ = [ 33 | # Instances 34 | "_InstanceBase", 35 | # Attrs processors 36 | "_AssignedAspectsProcessor", 37 | "_AssignedConceptsProcessor", 38 | "_AssignedInstancesProcessor", 39 | "_ExtractedItemsAttributeProcessor", 40 | "_RefParasAndSentsAttrituteProcessor", 41 | # Mixins 42 | "_PostInitCollectorMixin", 43 | # Concepts 44 | "_Concept", 45 | # Extracted items 46 | "_ExtractedItem", 47 | # Paragraphs and sentences 48 | "_ParasAndSentsBase", 49 | ] 50 | -------------------------------------------------------------------------------- /contextgem/internal/base/examples.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining the base classes for example subclasses. 21 | 22 | This module provides the foundational class structure for examples that can be used 23 | in the ContextGem framework. Examples serve as user-provided samples for extraction tasks, 24 | helping to guide and improve the extraction process by providing reference patterns 25 | or expected outputs. 26 | """ 27 | 28 | from __future__ import annotations 29 | 30 | from typing import Any 31 | 32 | from contextgem.internal.base.instances import _InstanceBase 33 | 34 | 35 | class _Example(_InstanceBase): 36 | """ 37 | Base class that represents an example for extraction tasks in the ContextGem framework. 38 | 39 | Examples serve as user-provided samples that guide the extraction process by 40 | demonstrating expected patterns or outputs for specific extraction tasks. 41 | 42 | :ivar content: Arbitrary content associated with the example. 
43 | :type content: Any 44 | """ 45 | 46 | content: Any 47 | -------------------------------------------------------------------------------- /contextgem/internal/base/items.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining the base classes for item subclasses. 21 | 22 | This module provides the foundational class structure for items that can be extracted 23 | from aspects or documents in the ContextGem framework. Items serve as the basic units of information 24 | extracted from aspects or documents, providing a structured way to store and process extracted data. 25 | """ 26 | 27 | from __future__ import annotations 28 | 29 | from typing import Any, Optional 30 | 31 | from pydantic import Field, PrivateAttr 32 | 33 | from contextgem.internal.base.attrs import _RefParasAndSentsAttrituteProcessor 34 | from contextgem.internal.decorators import _post_init_method 35 | from contextgem.internal.typings.aliases import NonEmptyStr 36 | from contextgem.public.paragraphs import Paragraph 37 | from contextgem.public.sentences import Sentence 38 | 39 | 40 | class _ExtractedItem(_RefParasAndSentsAttrituteProcessor): 41 | """ 42 | Base class for items extracted from aspects or documents in the ContextGem framework. 43 | 44 | This class provides a structured way to store extracted information along with 45 | optional justification and reference data. 46 | 47 | :ivar value: The extracted information value. 48 | :type value: Any 49 | :ivar justification: Optional explanation providing context for the extraction. 50 | Defaults to None. 51 | :type justification: Optional[NonEmptyStr] 52 | :ivar reference_paragraphs: List of paragraphs referenced by this item. 53 | :type reference_paragraphs: list[Paragraph] 54 | :ivar reference_sentences: List of sentences referenced by this item. 55 | :type reference_sentences: list[Sentence] 56 | """ 57 | 58 | value: Any = Field(..., frozen=True) 59 | justification: Optional[NonEmptyStr] = Field(default=None, frozen=True) 60 | 61 | _reference_paragraphs: list[Paragraph] = PrivateAttr(default_factory=list) 62 | _reference_sentences: list[Sentence] = PrivateAttr(default_factory=list) 63 | 64 | @_post_init_method 65 | def _post_init(self, __context): 66 | if self.__class__ == _ExtractedItem: 67 | raise TypeError("Cannot instantiate base class directly") 68 | -------------------------------------------------------------------------------- /contextgem/internal/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.converters.docx import ( 20 | WORD_XML_NAMESPACES, 21 | DocxContentError, 22 | DocxConverterError, 23 | DocxFormatError, 24 | DocxXmlError, 25 | _DocxConverterBase, 26 | _DocxPackage, 27 | ) 28 | 29 | __all__ = [ 30 | "WORD_XML_NAMESPACES", 31 | "DocxConverterError", 32 | "DocxFormatError", 33 | "DocxXmlError", 34 | "DocxContentError", 35 | "_DocxConverterBase", 36 | "_DocxPackage", 37 | ] 38 | -------------------------------------------------------------------------------- /contextgem/internal/converters/docx/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.converters.docx.base import _DocxConverterBase 20 | from contextgem.internal.converters.docx.exceptions import ( 21 | DocxContentError, 22 | DocxConverterError, 23 | DocxFormatError, 24 | DocxXmlError, 25 | ) 26 | from contextgem.internal.converters.docx.namespaces import WORD_XML_NAMESPACES 27 | from contextgem.internal.converters.docx.package import _DocxPackage 28 | 29 | __all__ = [ 30 | "_DocxConverterBase", 31 | "DocxConverterError", 32 | "DocxFormatError", 33 | "DocxXmlError", 34 | "DocxContentError", 35 | "WORD_XML_NAMESPACES", 36 | "_DocxPackage", 37 | ] 38 | -------------------------------------------------------------------------------- /contextgem/internal/converters/docx/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | """ 20 | Exceptions for the DOCX converter module. 21 | 22 | This module defines custom exception classes used by the DOCX converter 23 | to handle various error conditions that may occur during document processing. 24 | These exceptions provide more specific error information than generic exceptions, 25 | making it easier to diagnose and handle problems when working with DOCX files. 26 | """ 27 | 28 | 29 | # Define custom exceptions 30 | class DocxConverterError(Exception): 31 | """Base exception class for DOCX converter errors.""" 32 | 33 | pass 34 | 35 | 36 | class DocxFormatError(DocxConverterError): 37 | """Exception raised when the DOCX file format is invalid or corrupted.""" 38 | 39 | pass 40 | 41 | 42 | class DocxXmlError(DocxConverterError): 43 | """Exception raised when there's an error parsing XML in the DOCX file.""" 44 | 45 | pass 46 | 47 | 48 | class DocxContentError(DocxConverterError): 49 | """Exception raised when required content is missing from the DOCX file.""" 50 | 51 | pass 52 | -------------------------------------------------------------------------------- /contextgem/internal/converters/docx/namespaces.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Namespaces for the DOCX converter module. 21 | 22 | This module defines the XML namespaces used in DOCX files for different elements. 23 | It provides a dictionary of namespace URIs mapped to their prefixes, which are 24 | used to parse and process the XML content of DOCX files. 25 | """ 26 | 27 | 28 | # Define XML namespaces used in DOCX files 29 | WORD_XML_NAMESPACES = { 30 | "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", 31 | "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", 32 | "a": "http://schemas.openxmlformats.org/drawingml/2006/main", 33 | "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture", 34 | "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", 35 | "rels": "http://schemas.openxmlformats.org/package/2006/relationships", 36 | "v": "urn:schemas-microsoft-com:vml", 37 | "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", 38 | } 39 | -------------------------------------------------------------------------------- /contextgem/internal/llm_output_structs/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.llm_output_structs.aspect_structs import ( 20 | _get_aspect_extraction_output_struct, 21 | ) 22 | from contextgem.internal.llm_output_structs.concept_structs import ( 23 | _get_concept_extraction_output_struct, 24 | _LabelConceptItemValueModel, 25 | ) 26 | from contextgem.internal.llm_output_structs.utils import _create_root_model 27 | 28 | __all__ = [ 29 | # Utils 30 | "_create_root_model", 31 | # Aspect structs 32 | "_get_aspect_extraction_output_struct", 33 | # Concept structs 34 | "_get_concept_extraction_output_struct", 35 | "_LabelConceptItemValueModel", 36 | ] 37 | -------------------------------------------------------------------------------- /contextgem/internal/llm_output_structs/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining utility functions for dynamically computing LLM output validation structures. 21 | """ 22 | 23 | from pydantic import RootModel 24 | 25 | 26 | def _create_root_model(name: str, root_type: type): 27 | """ 28 | Creates a dynamic model class extending RootModel for a specified type. 29 | 30 | :param name: The name of the new class to be created. 31 | :type name: str 32 | :param root_type: The root type to be used as a parameter for RootModel. 33 | :type root_type: type 34 | :return: A dynamically created class inheriting from RootModel 35 | parameterized with the given type. 36 | :rtype: type 37 | """ 38 | return type(name, (RootModel[root_type],), {}) 39 | -------------------------------------------------------------------------------- /contextgem/internal/typings/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.typings.aliases import ( 20 | AssignedInstancesAttrName, 21 | AsyncCalsAndKwargs, 22 | DefaultDecimalField, 23 | DefaultPromptType, 24 | ExtractedInstanceType, 25 | JustificationDepth, 26 | LanguageRequirement, 27 | LLMRoleAny, 28 | LLMRoleAspect, 29 | NonEmptyStr, 30 | ReferenceDepth, 31 | SaTModelId, 32 | StandardSaTModelId, 33 | ) 34 | from contextgem.internal.typings.strings_to_types import _deserialize_type_hint 35 | from contextgem.internal.typings.typed_class_utils import ( 36 | _get_model_fields, 37 | _is_typed_class, 38 | _raise_dict_class_type_error, 39 | ) 40 | from contextgem.internal.typings.types_normalization import _normalize_type_annotation 41 | from contextgem.internal.typings.types_to_strings import ( 42 | _format_dict_structure, 43 | _format_type, 44 | _is_json_serializable_type, 45 | _JsonObjectItemStructure, 46 | _raise_json_serializable_type_error, 47 | _serialize_type_hint, 48 | ) 49 | from contextgem.internal.typings.user_type_hints_validation import ( 50 | _dynamic_pydantic_model, 51 | ) 52 | 53 | __all__ = [ 54 | # Aliases 55 | "NonEmptyStr", 56 | "LLMRoleAny", 57 | "LLMRoleAspect", 58 | "AssignedInstancesAttrName", 59 | "ExtractedInstanceType", 60 | "DefaultPromptType", 61 | "ReferenceDepth", 62 | "SaTModelId", 63 | "StandardSaTModelId", 64 | "LanguageRequirement", 65 | "JustificationDepth", 66 | "AsyncCalsAndKwargs", 67 | "DefaultDecimalField", 68 | # Strings to types 69 | "_deserialize_type_hint", 70 | # Types to strings 71 | "_is_json_serializable_type", 72 | "_format_type", 73 | "_JsonObjectItemStructure", 74 | "_serialize_type_hint", 75 | "_format_dict_structure", 76 | "_raise_json_serializable_type_error", 77 | # User type hints validation 78 | "_dynamic_pydantic_model", 79 | # Typed class utils 80 | "_is_typed_class", 81 | "_get_model_fields", 82 | "_raise_dict_class_type_error", 83 | # Types normalization 84 | "_normalize_type_annotation", 85 | ] 86 | -------------------------------------------------------------------------------- /contextgem/internal/typings/aliases.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining type aliases used throughout the ContextGem framework. 21 | 22 | This module provides standardized type definitions and aliases that ensure 23 | consistent typing across the codebase. It includes specialized string types, 24 | literal types for configuration options, and compatibility solutions for 25 | different Python versions. 
26 | """ 27 | 28 | import sys 29 | from decimal import Decimal 30 | from pathlib import Path 31 | from typing import Annotated, Any, Callable, Coroutine, Literal, TypeVar, Union 32 | 33 | from pydantic import Field, StrictStr, StringConstraints 34 | 35 | if sys.version_info >= (3, 11): 36 | from typing import Self 37 | else: 38 | Self = TypeVar("Self") 39 | 40 | NonEmptyStr = Annotated[ 41 | StrictStr, StringConstraints(strip_whitespace=True, min_length=1) 42 | ] 43 | 44 | LLMRoleAny = Literal[ 45 | "extractor_text", "reasoner_text", "extractor_vision", "reasoner_vision" 46 | ] 47 | 48 | LLMRoleAspect = Literal["extractor_text", "reasoner_text"] 49 | 50 | AssignedInstancesAttrName = Literal["aspects", "concepts"] 51 | 52 | DefaultPromptType = Literal["aspects", "concepts"] 53 | 54 | ExtractedInstanceType = Literal["aspect", "concept"] 55 | 56 | ReferenceDepth = Literal["paragraphs", "sentences"] 57 | 58 | ClassificationType = Literal["multi_class", "multi_label"] 59 | 60 | # Define standard SaT model IDs as a separate type 61 | StandardSaTModelId = Literal[ 62 | "sat-1l", 63 | "sat-1l-sm", 64 | "sat-3l", 65 | "sat-3l-sm", 66 | "sat-6l", 67 | "sat-6l-sm", 68 | "sat-9l", 69 | "sat-12l", 70 | "sat-12l-sm", 71 | ] 72 | 73 | # Combined type for sat_model_id parameter 74 | SaTModelId = Union[ 75 | StandardSaTModelId, 76 | str, # Local path as a string 77 | Path, # Local path as a Path object 78 | ] 79 | 80 | LanguageRequirement = Literal["en", "adapt"] 81 | 82 | JustificationDepth = Literal["brief", "balanced", "comprehensive"] 83 | 84 | AsyncCalsAndKwargs = list[ 85 | tuple[Callable[..., Coroutine[Any, Any, Any]], dict[str, Any]] 86 | ] 87 | 88 | DefaultDecimalField = Field( 89 | default_factory=lambda: Decimal("0.00000"), ge=Decimal("0.00000") 90 | ) 91 | 92 | ReasoningEffort = Literal["low", "medium", "high"] 93 | 94 | RawTextMode = Literal["raw", "markdown"] 95 | -------------------------------------------------------------------------------- /contextgem/public/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | from contextgem.public.aspects import Aspect 20 | from contextgem.public.concepts import ( 21 | BooleanConcept, 22 | DateConcept, 23 | JsonObjectConcept, 24 | LabelConcept, 25 | NumericalConcept, 26 | RatingConcept, 27 | StringConcept, 28 | ) 29 | from contextgem.public.converters import DocxConverter 30 | from contextgem.public.data_models import LLMPricing, RatingScale 31 | from contextgem.public.documents import Document 32 | from contextgem.public.examples import JsonObjectExample, StringExample 33 | from contextgem.public.images import Image 34 | from contextgem.public.llms import DocumentLLM, DocumentLLMGroup 35 | from contextgem.public.paragraphs import Paragraph 36 | from contextgem.public.pipelines import DocumentPipeline 37 | from contextgem.public.sentences import Sentence 38 | from contextgem.public.utils import ( 39 | JsonObjectClassStruct, 40 | image_to_base64, 41 | reload_logger_settings, 42 | ) 43 | 44 | __all__ = [ 45 | # Aspects 46 | "Aspect", 47 | # Concepts 48 | "StringConcept", 49 | "BooleanConcept", 50 | "NumericalConcept", 51 | "RatingConcept", 52 | "JsonObjectConcept", 53 | "DateConcept", 54 | "LabelConcept", 55 | # Documents 56 | "Document", 57 | # Pipelines 58 | "DocumentPipeline", 59 | # Paragraphs 60 | "Paragraph", 61 | # Sentences 62 | "Sentence", 63 | # Images 64 | "Image", 65 | # Examples 66 | "StringExample", 67 | "JsonObjectExample", 68 | # LLMs 69 | "DocumentLLM", 70 | "DocumentLLMGroup", 71 | # Data models 72 | "LLMPricing", 73 | "RatingScale", 74 | # Utils 75 | "image_to_base64", 76 | "reload_logger_settings", 77 | "JsonObjectClassStruct", 78 | # Converters 79 | "DocxConverter", 80 | ] 81 | -------------------------------------------------------------------------------- /contextgem/public/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.public.converters.docx import DocxConverter 20 | 21 | __all__ = [ 22 | "DocxConverter", 23 | ] 24 | -------------------------------------------------------------------------------- /contextgem/public/images.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module for handling document images. 21 | 22 | This module provides the Image class, which represents visual content that can be attached to 23 | or fully represent a document. Images are stored in base64-encoded format with specified MIME types 24 | to ensure proper handling. 25 | 26 | The module supports common image formats (JPEG, PNG, WebP) and integrates with the broader ContextGem 27 | framework for document analysis that includes visual content alongside textual information. 28 | """ 29 | 30 | from __future__ import annotations 31 | 32 | from typing import Literal 33 | 34 | from contextgem.internal.base.instances import _InstanceBase 35 | from contextgem.internal.typings.aliases import NonEmptyStr 36 | 37 | 38 | class Image(_InstanceBase): 39 | """ 40 | Represents an image with specified MIME type and base64-encoded data. 41 | An image is typically attached to a document, or fully represents a document. 42 | 43 | :ivar mime_type: The MIME type of the image. This must be one of the 44 | predefined valid types ("image/jpg", "image/jpeg", "image/png", 45 | "image/webp"). 46 | :type mime_type: Literal["image/jpg", "image/jpeg", "image/png", 47 | "image/webp"] 48 | :ivar base64_data: The base64-encoded data of the image. The util function 49 | `image_to_base64()` from contextgem.public.utils can be used to encode images to base64. 50 | :type base64_data: NonEmptyStr 51 | 52 | Note: 53 | - Attached to documents: 54 | An image must be attached to a document. A document can have multiple images. 55 | 56 | - Extraction types: 57 | Only concept extraction is supported for images. Use LLM with role ``"extractor_vision"`` 58 | or ``"reasoner_vision"`` to extract concepts from images. 59 | 60 | Example: 61 | .. literalinclude:: ../../../dev/usage_examples/docstrings/images/def_image.py 62 | :language: python 63 | :caption: Image definition 64 | """ 65 | 66 | mime_type: Literal["image/jpg", "image/jpeg", "image/png", "image/webp"] 67 | base64_data: NonEmptyStr 68 | -------------------------------------------------------------------------------- /contextgem/public/sentences.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module for handling document sentences. 21 | 22 | This module provides the Sentence class, which represents a structured unit of text 23 | within a document paragraph. Sentences are the fundamental building blocks of text analysis, 24 | containing the raw text content of individual statements. 25 | 26 | The module supports validation to ensure data integrity and integrates with the paragraph 27 | structure to maintain the hierarchical organization of document content. 
28 | """ 29 | 30 | from __future__ import annotations 31 | 32 | from pydantic import Field 33 | 34 | from contextgem.internal.base.paras_and_sents import _ParasAndSentsBase 35 | from contextgem.internal.typings.aliases import NonEmptyStr 36 | 37 | 38 | class Sentence(_ParasAndSentsBase): 39 | """ 40 | Represents a sentence within a document paragraph. 41 | 42 | Sentences are immutable text units that serve as the fundamental building blocks for 43 | document analysis. The raw text content is preserved and cannot be modified after 44 | initialization to maintain data integrity. 45 | 46 | :ivar raw_text: The complete text content of the sentence. This value is frozen after initialization. 47 | :type raw_text: NonEmptyStr 48 | 49 | Note: 50 | Normally, you do not need to construct sentences manually, as they are populated automatically 51 | from document's ``raw_text`` or ``paragraphs`` attributes. Only use this constructor for 52 | advanced use cases, such as when you have a custom paragraph/sentence segmentation tool. 53 | 54 | Example: 55 | .. literalinclude:: ../../../dev/usage_examples/docstrings/sentences/def_sentence.py 56 | :language: python 57 | :caption: Sentence definition 58 | """ 59 | 60 | raw_text: NonEmptyStr = Field(..., frozen=True) 61 | -------------------------------------------------------------------------------- /dev/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/__init__.py -------------------------------------------------------------------------------- /dev/notebooks/readme/docx_converter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cell_0", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using ContextGem's DocxConverter" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "cell_1", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%pip install -U contextgem" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "cell_2", 24 | "metadata": {}, 25 | "source": [ 26 | "To run the extraction, please provide your LLM details in the ``DocumentLLM(...)`` constructor further below." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "cell_3", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Using ContextGem's DocxConverter\n", 37 | "\n", 38 | "from contextgem import DocxConverter\n", 39 | "\n", 40 | "converter = DocxConverter()\n", 41 | "\n", 42 | "# Convert a DOCX file to an LLM-ready ContextGem Document\n", 43 | "# from path\n", 44 | "document = converter.convert(\"path/to/document.docx\")\n", 45 | "# or from file object\n", 46 | "with open(\"path/to/document.docx\", \"rb\") as docx_file_object:\n", 47 | " document = converter.convert(docx_file_object)\n", 48 | "\n", 49 | "# You can also use it as a standalone text extractor\n", 50 | "docx_text = converter.convert_to_text_format(\n", 51 | " \"path/to/document.docx\",\n", 52 | " output_format=\"markdown\", # or \"raw\"\n", 53 | ")\n" 54 | ] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Python 3", 60 | "language": "python", 61 | "name": "python3" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.10.0" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 5 78 | } -------------------------------------------------------------------------------- /dev/notebooks/readme/llm_chat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cell_0", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using LLMs for chat (text + vision), with fallback LLM support" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "cell_1", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%pip install -U contextgem" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "cell_2", 24 | "metadata": {}, 25 | "source": [ 26 | "To run the extraction, please provide your LLM details in the ``DocumentLLM(...)`` constructor further below." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "cell_3", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Using LLMs for chat (text + vision), with fallback LLM support\n", 37 | "\n", 38 | "import os\n", 39 | "\n", 40 | "from contextgem import DocumentLLM\n", 41 | "\n", 42 | "# from contextgem import Image\n", 43 | "\n", 44 | "main_model = DocumentLLM(\n", 45 | " model=\"openai/gpt-4o\", # or another provider/model\n", 46 | " api_key=os.getenv(\"CONTEXTGEM_OPENAI_API_KEY\"), # your API key for the LLM provider\n", 47 | ")\n", 48 | "\n", 49 | "# Optional: fallback LLM\n", 50 | "fallback_model = DocumentLLM(\n", 51 | " model=\"openai/gpt-4o-mini\", # or another provider/model\n", 52 | " api_key=os.getenv(\"CONTEXTGEM_OPENAI_API_KEY\"), # your API key for the LLM provider\n", 53 | " is_fallback=True,\n", 54 | ")\n", 55 | "main_model.fallback_llm = fallback_model\n", 56 | "\n", 57 | "response = main_model.chat(\n", 58 | " \"Hello\",\n", 59 | " # images=[Image(...)]\n", 60 | ")\n", 61 | "# or `response = await main_model.chat_async(...)`\n", 62 | "\n", 63 | "print(response)\n" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": "python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.10.0" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 5 88 | } -------------------------------------------------------------------------------- /dev/populate_project_readme.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | This script populates the README.md file with code examples from the usage_examples/ directory. 
21 | 22 | To use it, run: 23 | 24 | ```bash 25 | python dev/populate_project_readme.py 26 | ``` 27 | """ 28 | 29 | README_TEMPLATE_PATH = "dev/README.TEMPLATE.md" 30 | README_OUTPUT_PATH = "README.md" 31 | README_FOOTER = "" 32 | 33 | 34 | def generate_readme(): 35 | with open(README_TEMPLATE_PATH, "r", encoding="utf-8") as template_file: 36 | template = template_file.read() 37 | 38 | # Replace markers with actual code examples 39 | for example_file, marker in USAGE_EXAMPLES_MAPPING.items(): 40 | code_snippet = extract_code_from_file(example_file) 41 | template = template.replace(marker, code_snippet) 42 | 43 | with open(README_OUTPUT_PATH, "w", encoding="utf-8") as readme_file: 44 | readme_file.write(template) 45 | readme_file.write(README_FOOTER) 46 | print("Project README.md file populated successfully.") 47 | 48 | 49 | def extract_code_from_file(file_path): 50 | with open(file_path, "r", encoding="utf-8") as f: 51 | content = f.read() 52 | return content 53 | 54 | 55 | # Map example files to markers in the template 56 | USAGE_EXAMPLES_MAPPING = { 57 | "dev/content_snippets/feature_table.html": "{{FEATURE_TABLE}}", 58 | "dev/usage_examples/readme/quickstart_aspect.py": "{{QUICKSTART_ASPECT}}", 59 | "dev/usage_examples/readme/quickstart_concept.py": "{{QUICKSTART_CONCEPT}}", 60 | "dev/usage_examples/readme/docx_converter.py": "{{DOCX_CONVERTER}}", 61 | } 62 | 63 | if __name__ == "__main__": 64 | generate_readme() 65 | -------------------------------------------------------------------------------- /dev/usage_examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/advanced/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/advanced/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/aspects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/aspects/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/aspects/aspect_with_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Aspect Extraction with Justifications 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Create a document instance 8 | doc = Document( 9 | raw_text=( 10 | "NON-DISCLOSURE AGREEMENT\n" 11 | "\n" 12 | 'This Non-Disclosure Agreement ("Agreement") is entered into between TechCorp Inc. 
' 13 | '("Disclosing Party") and Innovation Labs LLC ("Receiving Party") on January 15, 2024.\n' 14 | "...\n" 15 | ), 16 | ) 17 | 18 | # Define a single aspect focused on NDA direction with justifications 19 | nda_direction_aspect = Aspect( 20 | name="NDA Direction", 21 | description="Provisions informing the NDA direction (whether mutual or one-way) and information flow between parties", 22 | add_justifications=True, 23 | justification_depth="balanced", 24 | justification_max_sents=4, 25 | ) 26 | 27 | # Add the aspect to the document 28 | doc.aspects = [nda_direction_aspect] 29 | 30 | # Configure DocumentLLM with your API parameters 31 | llm = DocumentLLM( 32 | model="azure/gpt-4.1-mini", 33 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 34 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 35 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 36 | ) 37 | 38 | # Extract the aspect with justifications 39 | nda_direction_aspect = llm.extract_aspects_from_document(doc)[0] 40 | for i, item in enumerate(nda_direction_aspect.extracted_items, 1): 41 | print(f"- {i}. {item.value}") 42 | print(f" Justification: {item.justification}") 43 | print() 44 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/aspects/basic_aspect.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Aspect Extraction 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Create a document instance 8 | doc = Document( 9 | raw_text=( 10 | "Software License Agreement\n" 11 | "This software license agreement (Agreement) is entered into between Tech Corp (Licensor) and Client Corp (Licensee).\n" 12 | "...\n" 13 | "2. Term and Termination\n" 14 | "This Agreement shall commence on the Effective Date and shall continue for a period of three (3) years, " 15 | "unless earlier terminated in accordance with the provisions hereof. Either party may terminate this Agreement " 16 | "upon thirty (30) days written notice to the other party.\n" 17 | "\n" 18 | "3. Payment Terms\n" 19 | "Licensee agrees to pay Licensor an annual license fee of $10,000, payable within thirty (30) days of the " 20 | "invoice date. 
Late payments shall incur a penalty of 1.5% per month.\n" 21 | "...\n" 22 | ), 23 | ) 24 | 25 | # Define an aspect to extract the termination clause 26 | termination_aspect = Aspect( 27 | name="Termination Clauses", 28 | description="Sections describing how and when the agreement can be terminated, including notice periods and conditions", 29 | ) 30 | 31 | # Add the aspect to the document 32 | doc.add_aspects([termination_aspect]) 33 | 34 | # Configure DocumentLLM with your API parameters 35 | llm = DocumentLLM( 36 | model="azure/gpt-4.1-mini", 37 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 38 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 39 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 40 | ) 41 | 42 | # Extract the aspect from the document 43 | termination_aspect = llm.extract_aspects_from_document(doc)[0] 44 | 45 | # Access the extracted information 46 | print("Extracted Termination Clauses:") 47 | for item in termination_aspect.extracted_items: 48 | print(f"- {item.value}") 49 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/boolean_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/boolean_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/boolean_concept/boolean_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: BooleanConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import BooleanConcept, Document, DocumentLLM 6 | 7 | # Create a Document object from text 8 | doc = Document( 9 | raw_text="This document contains confidential information and should not be shared publicly." 
10 | ) 11 | 12 | # Define a BooleanConcept to detect confidential content 13 | confidentiality_concept = BooleanConcept( 14 | name="Is confidential", 15 | description="Whether the document contains confidential information", 16 | ) 17 | 18 | # Attach the concept to the document 19 | doc.add_concepts([confidentiality_concept]) 20 | 21 | # Configure DocumentLLM with your API parameters 22 | llm = DocumentLLM( 23 | model="azure/gpt-4.1-mini", 24 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 25 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 26 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 27 | ) 28 | 29 | # Extract the concept from the document 30 | confidentiality_concept = llm.extract_concepts_from_document(doc)[0] 31 | 32 | # Print the extracted value 33 | print(confidentiality_concept.extracted_items[0].value) # Output: True 34 | # Or access the extracted value from the document object 35 | print(doc.concepts[0].extracted_items[0].value) # Output: True 36 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/boolean_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: BooleanConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import BooleanConcept, Document, DocumentLLM 6 | 7 | # Sample document text containing policy information 8 | policy_text = """ 9 | Company Data Retention Policy (Updated 2024) 10 | 11 | All customer data must be encrypted at rest and in transit using industry-standard encryption protocols. 12 | Personal information should be retained for no longer than 3 years after the customer relationship ends. 13 | Employees are required to complete data privacy training annually. 
14 | """ 15 | 16 | # Create a Document from the text 17 | doc = Document(raw_text=policy_text) 18 | 19 | # Create a BooleanConcept with justifications and references enabled 20 | compliance_concept = BooleanConcept( 21 | name="Has encryption requirement", 22 | description="Whether the document specifies that data must be encrypted", 23 | add_justifications=True, # Enable justifications to understand reasoning 24 | justification_depth="brief", 25 | justification_max_sents=1, # Allow up to 1 sentences for each justification 26 | add_references=True, # Include references to source text 27 | reference_depth="sentences", # Reference specific sentences rather than paragraphs 28 | ) 29 | 30 | # Attach the concept to the document 31 | doc.add_concepts([compliance_concept]) 32 | 33 | # Configure DocumentLLM with your API parameters 34 | llm = DocumentLLM( 35 | model="azure/gpt-4o-mini", 36 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 37 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 38 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 39 | ) 40 | 41 | # Extract the concept 42 | compliance_concept = llm.extract_concepts_from_document(doc)[0] 43 | 44 | # Print the extracted value with justification and references 45 | print(f"Has encryption requirement: {compliance_concept.extracted_items[0].value}") 46 | print(f"\nJustification: {compliance_concept.extracted_items[0].justification}") 47 | print("\nSource references:") 48 | for sent in compliance_concept.extracted_items[0].reference_sentences: 49 | print(f"- {sent.raw_text}") 50 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/date_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/date_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/date_concept/date_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: DateConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import DateConcept, Document, DocumentLLM 6 | 7 | # Create a Document object from text 8 | doc = Document( 9 | raw_text="The research paper was published on March 15, 2025 and has been cited 42 times since." 
10 | ) 11 | 12 | # Define a DateConcept to extract the publication date 13 | date_concept = DateConcept( 14 | name="Publication date", 15 | description="The date when the paper was published", 16 | ) 17 | 18 | # Attach the concept to the document 19 | doc.add_concepts([date_concept]) 20 | 21 | # Configure DocumentLLM with your API parameters 22 | llm = DocumentLLM( 23 | model="azure/gpt-4.1-mini", 24 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 25 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 26 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 27 | ) 28 | 29 | # Extract the concept from the document 30 | date_concept = llm.extract_concepts_from_document(doc)[0] 31 | 32 | # Print the extracted value 33 | print( 34 | type(date_concept.extracted_items[0].value), date_concept.extracted_items[0].value 35 | ) 36 | # Output: 2025-03-15 37 | 38 | # Or access the extracted value from the document object 39 | print( 40 | type(doc.concepts[0].extracted_items[0].value), 41 | doc.concepts[0].extracted_items[0].value, 42 | ) 43 | # Output: 2025-03-15 44 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/date_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: DateConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import DateConcept, Document, DocumentLLM 6 | 7 | # Sample document text containing project timeline information 8 | project_text = """ 9 | Project Timeline: Website Redesign 10 | 11 | The website redesign project officially kicked off on March 1, 2024. 12 | The development team has estimated the project will take 4 months to complete. 13 | 14 | Key milestones: 15 | - Design phase: 1 month 16 | - Development phase: 2 months 17 | - Testing and deployment: 1 month 18 | 19 | The marketing team needs the final completion date to plan the launch campaign. 
20 | """ 21 | 22 | # Create a Document from the text 23 | doc = Document(raw_text=project_text) 24 | 25 | # Create a DateConcept to calculate the project completion date 26 | completion_date_concept = DateConcept( 27 | name="Project completion date", 28 | description="The final completion date for the website redesign project", 29 | add_justifications=True, # enable justifications to understand extraction logic 30 | justification_depth="balanced", 31 | justification_max_sents=3, # allow up to 3 sentences for the calculation justification 32 | add_references=True, # include references to source text 33 | reference_depth="sentences", # reference specific sentences rather than paragraphs 34 | singular_occurrence=True, # extract only one calculated date 35 | ) 36 | 37 | # Attach the concept to the document 38 | doc.add_concepts([completion_date_concept]) 39 | 40 | # Configure DocumentLLM 41 | llm = DocumentLLM( 42 | model="azure/gpt-4.1", 43 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 44 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 45 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 46 | ) 47 | 48 | # Extract the concept 49 | completion_date_concept = llm.extract_concepts_from_document(doc)[0] 50 | 51 | # Print the calculated completion date with justification and references 52 | print("Calculated project completion date:") 53 | extracted_item = completion_date_concept.extracted_items[ 54 | 0 55 | ] # get the single calculated date 56 | print(f"\nCompletion Date: {extracted_item.value}") # expected output: 2024-07-01 57 | print(f"Calculation Justification: {extracted_item.justification}") 58 | print("Source references used for calculation:") 59 | for sent in extracted_item.reference_sentences: 60 | print(f"- {sent.raw_text}") 61 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/json_object_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/json_object_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: JsonObjectConcept Extraction 2 | 3 | import os 4 | from pprint import pprint 5 | from typing import Literal 6 | 7 | from contextgem import Document, DocumentLLM, JsonObjectConcept 8 | 9 | # Define product information text 10 | product_text = """ 11 | Product: Smart Fitness Watch X7 12 | Price: $199.99 13 | Features: Heart rate monitoring, GPS tracking, Sleep analysis 14 | Battery Life: 5 days 15 | Water Resistance: IP68 16 | Available Colors: Black, Silver, Blue 17 | Customer Rating: 4.5/5 18 | """ 19 | 20 | # Create a Document object from text 21 | doc = Document(raw_text=product_text) 22 | 23 | # Define a JsonObjectConcept with a structure for product information 24 | product_concept = JsonObjectConcept( 25 | name="Product Information", 26 | description="Extract detailed product information including name, price, features, and specifications", 27 | structure={ 28 | "name": str, 29 | "price": float, 30 | "features": list[str], 31 | "specifications": { 32 | "battery_life": str, 33 | "water_resistance": Literal["IP67", "IP68", "IPX7", "Not water resistant"], 34 | }, 35 | "available_colors": 
list[str], 36 | "customer_rating": float, 37 | }, 38 | ) 39 | 40 | # Attach the concept to the document 41 | doc.add_concepts([product_concept]) 42 | 43 | # Configure DocumentLLM with your API parameters 44 | llm = DocumentLLM( 45 | model="azure/gpt-4.1-mini", 46 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 47 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 48 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 49 | ) 50 | 51 | # Extract the concept from the document 52 | product_concept = llm.extract_concepts_from_document(doc)[0] 53 | 54 | # Print the extracted structured data 55 | extracted_product = product_concept.extracted_items[0].value 56 | pprint(extracted_product) 57 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/json_object_concept/structure/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/nested_class_structure.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from contextgem import JsonObjectConcept 4 | from contextgem.public.utils import JsonObjectClassStruct 5 | 6 | # Use dataclasses to define the structure of the JSON object 7 | 8 | 9 | # All classes in the nested class structure must inherit from JsonObjectClassStruct 10 | # to enable automatic conversion of the class hierarchy to a dictionary structure 11 | # for JsonObjectConcept 12 | @dataclass 13 | class Location(JsonObjectClassStruct): 14 | latitude: float 15 | longitude: float 16 | altitude: float 17 | 18 | 19 | @dataclass 20 | class Sensor(JsonObjectClassStruct): 21 | id: str 22 | type: str 23 | location: Location # reference to another class 24 | active: bool 25 | 26 | 27 | @dataclass 28 | class SensorNetwork(JsonObjectClassStruct): 29 | network_id: str 30 | primary_sensor: Sensor # reference to another class 31 | backup_sensors: list[Sensor] # list of another class 32 | 33 | 34 | sensor_network_concept = JsonObjectConcept( 35 | name="IoT Sensor Network", 36 | description="Configuration for a network of IoT sensors", 37 | structure=SensorNetwork, # nested class structure 38 | ) 39 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/nested_structure.py: -------------------------------------------------------------------------------- 1 | from contextgem import JsonObjectConcept 2 | 3 | device_config_concept = JsonObjectConcept( 4 | name="Device Configuration", 5 | description="Configuration details for a networked device", 6 | structure={ 7 | "device": {"id": str, "type": str, "model": str}, 8 | "network": {"ip_address": str, "subnet_mask": str, "gateway": str}, 9 | "settings": {"enabled": bool, "mode": str}, 10 | }, 11 | ) 12 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/simple_class_structure.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from contextgem import JsonObjectConcept 4 | 5 | 6 | # Use a 
Pydantic model to define the structure of the JSON object 7 | class ProductSpec(BaseModel): 8 | name: str 9 | version: str 10 | features: list[str] 11 | 12 | 13 | product_spec_concept = JsonObjectConcept( 14 | name="Product Specification", 15 | description="Technical specifications for a product", 16 | structure=ProductSpec, 17 | ) 18 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/simple_structure.py: -------------------------------------------------------------------------------- 1 | from contextgem import JsonObjectConcept 2 | 3 | product_info_concept = JsonObjectConcept( 4 | name="Product Information", 5 | description="Product details", 6 | structure={ 7 | "name": str, 8 | "price": float, 9 | "is_available": bool, 10 | "ratings": list[float], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/label_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/label_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/label_concept/label_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Contract Type Classification using LabelConcept 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, LabelConcept 6 | 7 | # Create a Document object from legal document text 8 | legal_doc_text = """ 9 | NON-DISCLOSURE AGREEMENT 10 | 11 | This Non-Disclosure Agreement ("Agreement") is entered into as of January 15, 2025, by and between TechCorp Inc., a Delaware corporation ("Disclosing Party"), and DataSystems LLC, a California limited liability company ("Receiving Party"). 12 | 13 | WHEREAS, Disclosing Party possesses certain confidential information relating to its proprietary technology and business operations; 14 | 15 | NOW, THEREFORE, in consideration of the mutual covenants contained herein, the parties agree as follows: 16 | 17 | 1. CONFIDENTIAL INFORMATION 18 | The term "Confidential Information" shall mean any and all non-public information... 19 | 20 | 2. OBLIGATIONS OF RECEIVING PARTY 21 | Receiving Party agrees to hold all Confidential Information in strict confidence... 
22 | """ 23 | 24 | doc = Document(raw_text=legal_doc_text) 25 | 26 | # Define a LabelConcept for contract type classification 27 | contract_type_concept = LabelConcept( 28 | name="Contract Type", 29 | description="Classify the type of contract", 30 | labels=["NDA", "Consultancy Agreement", "Privacy Policy", "Other"], 31 | classification_type="multi_class", # only one label can be selected (mutually exclusive labels) 32 | singular_occurrence=True, # expect only one classification result 33 | ) 34 | print(contract_type_concept._format_labels_in_prompt) 35 | 36 | # Attach the concept to the document 37 | doc.add_concepts([contract_type_concept]) 38 | 39 | # Configure DocumentLLM with your API parameters 40 | llm = DocumentLLM( 41 | model="azure/gpt-4.1-mini", 42 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 43 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 44 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 45 | ) 46 | 47 | # Extract the concept from the document 48 | contract_type_concept = llm.extract_concepts_from_document(doc)[0] 49 | 50 | # Check if any labels were extracted 51 | if contract_type_concept.extracted_items: 52 | # Get the classified document type 53 | classified_type = contract_type_concept.extracted_items[0].value 54 | print(f"Document classified as: {classified_type}") # Output: ['NDA'] 55 | else: 56 | print("No applicable labels found for this document") 57 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/numerical_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/numerical_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/numerical_concept/numerical_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: NumericalConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, NumericalConcept 6 | 7 | # Create a Document object from text 8 | doc = Document( 9 | raw_text="The latest smartphone model costs $899.99 and will be available next week." 
10 | ) 11 | 12 | # Define a NumericalConcept to extract the price 13 | price_concept = NumericalConcept( 14 | name="Product price", 15 | description="The price of the product", 16 | numeric_type="float", # We expect a decimal price 17 | ) 18 | 19 | # Attach the concept to the document 20 | doc.add_concepts([price_concept]) 21 | 22 | # Configure DocumentLLM with your API parameters 23 | llm = DocumentLLM( 24 | model="azure/gpt-4.1-mini", 25 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 26 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 27 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 28 | ) 29 | 30 | # Extract the concept from the document 31 | price_concept = llm.extract_concepts_from_document(doc)[0] 32 | 33 | # Print the extracted value 34 | print(price_concept.extracted_items[0].value) # Output: 899.99 35 | # Or access the extracted value from the document object 36 | print(doc.concepts[0].extracted_items[0].value) # Output: 899.99 37 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/numerical_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: NumericalConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, NumericalConcept 6 | 7 | # Document with values that require calculation/inference 8 | report_text = """ 9 | Quarterly Sales Report - Q2 2023 10 | 11 | Product A: Sold 450 units at $75 each 12 | Product B: Sold 320 units at $125 each 13 | Product C: Sold 180 units at $95 each 14 | 15 | Marketing expenses: $28,500 16 | Operating costs: $42,700 17 | """ 18 | 19 | # Create a Document from the text 20 | doc = Document(raw_text=report_text) 21 | 22 | # Create a NumericalConcept for total revenue 23 | total_revenue_concept = NumericalConcept( 24 | name="Total quarterly revenue", 25 | description="The total revenue calculated by multiplying units sold by their price", 26 | add_justifications=True, 27 | justification_depth="comprehensive", # Detailed justification to show calculation steps 28 | justification_max_sents=4, # Maximum number of sentences for justification 29 | add_references=True, 30 | reference_depth="paragraphs", # Reference specific paragraphs 31 | singular_occurrence=True, # Ensure that the data is merged into a single item 32 | ) 33 | 34 | # Attach the concept to the document 35 | doc.add_concepts([total_revenue_concept]) 36 | 37 | # Configure DocumentLLM with your API parameters 38 | llm = DocumentLLM( 39 | model="azure/o4-mini", 40 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 41 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 42 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 43 | ) 44 | 45 | # Extract the concept 46 | total_revenue_concept = llm.extract_concepts_from_document(doc)[0] 47 | 48 | # Print the extracted inferred value with justification 49 | print("Calculated total quarterly revenue:") 50 | for item in total_revenue_concept.extracted_items: 51 | print(f"\nTotal Revenue: {item.value}") 52 | print(f"Calculation Justification: {item.justification}") 53 | print("Source references:") 54 | for para in item.reference_paragraphs: 55 | print(f"- {para.raw_text}") 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/rating_concept/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/rating_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/rating_concept/rating_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: RatingConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, RatingConcept, RatingScale 6 | 7 | # Create a Document object from text describing a product without an explicit rating 8 | smartphone_description = ( 9 | "This smartphone features a 5000mAh battery that lasts all day with heavy use. " 10 | "The display is 6.7 inch AMOLED with 120Hz refresh rate. " 11 | "Camera system includes a 50MP main sensor, 12MP ultrawide, and 8MP telephoto lens. " 12 | "The phone runs on the latest processor with 8GB RAM and 256GB storage. " 13 | "It has IP68 water resistance and Gorilla Glass Victus protection." 14 | ) 15 | 16 | doc = Document(raw_text=smartphone_description) 17 | 18 | # Define a RatingConcept that requires analysis to determine a rating 19 | product_quality = RatingConcept( 20 | name="Product Quality Rating", 21 | description=( 22 | "Evaluate the overall quality of the smartphone based on its specifications, " 23 | "features, and adherence to industry best practices" 24 | ), 25 | rating_scale=RatingScale(start=1, end=10), 26 | add_justifications=True, # include justification for the rating 27 | justification_depth="balanced", 28 | justification_max_sents=5, 29 | ) 30 | 31 | # Attach the concept to the document 32 | doc.add_concepts([product_quality]) 33 | 34 | # Configure DocumentLLM with your API parameters 35 | llm = DocumentLLM( 36 | model="azure/gpt-4.1", 37 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 38 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 39 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 40 | ) 41 | 42 | # Extract the concept from the document - the LLM will analyze and assign a rating 43 | product_quality = llm.extract_concepts_from_document(doc)[0] 44 | 45 | # Print the calculated rating 46 | print(f"Quality Rating: {product_quality.extracted_items[0].value}") 47 | # Print the justification 48 | print(f"Justification: {product_quality.extracted_items[0].justification}") 49 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/rating_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: RatingConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, RatingConcept, RatingScale 6 | 7 | # Sample document text about a software product with various aspects 8 | software_review = """ 9 | Software Review: ProjectManager Pro 5.0 10 | 11 | User Interface: The interface is clean and modern, with intuitive navigation. New users can quickly find what they need without extensive training. The dashboard provides a comprehensive overview of project status. 12 | 13 | Performance: The application loads quickly even with large projects. Resource-intensive operations like generating reports occasionally cause minor lag on older systems. The mobile app performs exceptionally well, even on limited bandwidth. 
14 | 15 | Features: Project templates are well-designed and cover most common project types. Task dependencies are easily managed, and the Gantt chart visualization is excellent. However, the software lacks advanced risk management tools that competitors offer. 16 | 17 | Support: The documentation is comprehensive and well-organized. Customer service response time averages 4 hours, which is acceptable but not industry-leading. The knowledge base needs more video tutorials. 18 | """ 19 | 20 | # Create a Document from the text 21 | doc = Document(raw_text=software_review) 22 | 23 | # Create a RatingConcept with justifications and references enabled 24 | usability_rating_concept = RatingConcept( 25 | name="Software usability rating", 26 | description="Evaluate the overall usability of the software on a scale of 1-10 based on UI design, intuitiveness, and learning curve", 27 | rating_scale=RatingScale(start=1, end=10), 28 | add_justifications=True, # enable justifications to explain the rating 29 | justification_depth="comprehensive", # provide detailed reasoning 30 | justification_max_sents=5, # allow up to 5 sentences for justification 31 | add_references=True, # include references to source text 32 | reference_depth="sentences", # reference specific sentences rather than paragraphs 33 | ) 34 | 35 | # Attach the concept to the document 36 | doc.add_concepts([usability_rating_concept]) 37 | 38 | # Configure DocumentLLM with your API parameters 39 | llm = DocumentLLM( 40 | model="azure/gpt-4.1", 41 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 42 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 43 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 44 | ) 45 | 46 | # Extract the concept 47 | usability_rating_concept = llm.extract_concepts_from_document(doc)[0] 48 | 49 | # Print the extracted rating item with justification and references 50 | extracted_item = usability_rating_concept.extracted_items[0] 51 | print(f"Software Usability Rating: {extracted_item.value}/10") 52 | print(f"\nJustification: {extracted_item.justification}") 53 | print("\nSource references:") 54 | for sent in extracted_item.reference_sentences: 55 | print(f"- {sent.raw_text}") 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/string_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/adding_examples.py: -------------------------------------------------------------------------------- 1 | # ContextGem: StringConcept Extraction with Examples 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept, StringExample 6 | 7 | # Create a Document object from text 8 | contract_text = """ 9 | SERVICE AGREEMENT 10 | This Service Agreement (the "Agreement") is entered into as of January 15, 2025 by and between: 11 | XYZ Innovations Inc., a Delaware corporation with offices at 123 Tech Avenue, San Francisco, CA 12 | ("Provider"), and 13 | Omega Enterprises LLC, a New York limited liability company with offices at 456 Business Plaza, 14 | New York, NY ("Customer"). 
15 | """ 16 | doc = Document(raw_text=contract_text) 17 | 18 | # Create a StringConcept for extracting parties and their roles 19 | parties_concept = StringConcept( 20 | name="Contract parties", 21 | description="Names of parties and their roles in the contract", 22 | examples=[ 23 | StringExample(content="Acme Corporation (Supplier)"), 24 | StringExample(content="TechGroup Inc. (Client)"), 25 | ], # add examples providing additional guidance to the LLM 26 | ) 27 | 28 | # Attach the concept to the document 29 | doc.add_concepts([parties_concept]) 30 | 31 | # Configure DocumentLLM with your API parameters 32 | llm = DocumentLLM( 33 | model="azure/gpt-4.1-mini", 34 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 35 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 36 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 37 | ) 38 | 39 | # Extract the concept from the document 40 | parties_concept = llm.extract_concepts_from_document(doc)[0] 41 | 42 | # Print the extracted parties and their roles 43 | print("Extracted parties and roles:") 44 | for item in parties_concept.extracted_items: 45 | print(f"- {item.value}") 46 | 47 | # Expected output: 48 | # - XYZ Innovations Inc. (Provider) 49 | # - Omega Enterprises LLC (Customer) 50 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: StringConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept 6 | 7 | # Sample document text containing financial information 8 | financial_text = """ 9 | 2024 Financial Performance Summary 10 | 11 | Revenue increased to $120 million in fiscal year 2024, representing 15% growth compared to the previous year. This growth was primarily driven by the expansion of our enterprise client base and the successful launch of our premium service tier. 12 | 13 | The Board has recommended a dividend of $1.25 per share, which will be payable to shareholders of record as of March 15, 2025. 
14 | """ 15 | 16 | # Create a Document from the text 17 | doc = Document(raw_text=financial_text) 18 | 19 | # Create a StringConcept with justifications and references enabled 20 | key_figures_concept = StringConcept( 21 | name="Financial key figures", 22 | description="Important financial metrics and figures mentioned in the report", 23 | add_justifications=True, # enable justifications to understand extraction reasoning 24 | justification_depth="balanced", 25 | justification_max_sents=3, # allow up to 3 sentences for each justification 26 | add_references=True, # include references to source text 27 | reference_depth="sentences", # reference specific sentences rather than paragraphs 28 | ) 29 | 30 | # Attach the concept to the document 31 | doc.add_concepts([key_figures_concept]) 32 | 33 | # Configure DocumentLLM with your API parameters 34 | llm = DocumentLLM( 35 | model="azure/gpt-4o-mini", 36 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 37 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 38 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 39 | ) 40 | 41 | # Extract the concept 42 | key_figures_concept = llm.extract_concepts_from_document(doc)[0] 43 | 44 | # Print the extracted items with justifications and references 45 | print("Extracted financial key figures:") 46 | for item in key_figures_concept.extracted_items: 47 | print(f"\nFigure: {item.value}") 48 | print(f"Justification: {item.justification}") 49 | print("Source references:") 50 | for sent in item.reference_sentences: 51 | print(f"- {sent.raw_text}") 52 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/string_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: StringConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept 6 | 7 | # Create a Document object from text 8 | doc = Document(raw_text="My name is John Smith and I am 30 years old.") 9 | 10 | # Define a StringConcept to extract a person's name 11 | name_concept = StringConcept( 12 | name="Person name", 13 | description="Full name of the person", 14 | ) 15 | 16 | # Attach the concept to the document 17 | doc.add_concepts([name_concept]) 18 | 19 | # Configure DocumentLLM with your API parameters 20 | llm = DocumentLLM( 21 | model="azure/gpt-4.1-mini", 22 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 23 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 24 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 25 | ) 26 | 27 | # Extract the concept from the document 28 | name_concept = llm.extract_concepts_from_document(doc)[0] 29 | 30 | # Get the extracted value 31 | print(name_concept.extracted_items[0].value) # Output: "John Smith" 32 | # Or access the extracted value from the document object 33 | print(doc.concepts[0].extracted_items[0].value) # Output: "John Smith" 34 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llm_config/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/cost_tracking.py: 
-------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, LLMPricing 2 | 3 | llm = DocumentLLM( 4 | model="openai/gpt-4o-mini", 5 | api_key="", 6 | pricing_details=LLMPricing( 7 | input_per_1m_tokens=0.150, # Cost per 1M input tokens 8 | output_per_1m_tokens=0.600, # Cost per 1M output tokens 9 | ), 10 | ) 11 | 12 | # Perform some extraction tasks 13 | 14 | # Later, you can check the cost 15 | cost_info = llm.get_cost() 16 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/detailed_usage.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="openai/gpt-4.1", 5 | api_key="", 6 | ) 7 | 8 | # Perform some extraction tasks 9 | 10 | usage_info = llm.get_usage() 11 | 12 | # Access the first usage container in the list (for the primary LLM) 13 | llm_usage = usage_info[0] 14 | 15 | # Get detailed call information 16 | for call in llm_usage.usage.calls: 17 | print(f"Prompt: {call.prompt}") 18 | print(f"Response: {call.response}") # original, unprocessed response 19 | print(f"Sent at: {call.timestamp_sent}") 20 | print(f"Received at: {call.timestamp_received}") 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/fallback_llm.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | # Primary LLM 4 | primary_llm = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="", 7 | role="extractor_text", # default role 8 | ) 9 | 10 | # Fallback LLM 11 | fallback_llm = DocumentLLM( 12 | model="anthropic/claude-3-5-haiku", 13 | api_key="", 14 | role="extractor_text", # Must match the primary LLM's role 15 | is_fallback=True, 16 | ) 17 | 18 | # Assign fallback LLM to primary 19 | primary_llm.fallback_llm = fallback_llm 20 | 21 | # Then use the primary LLM as usual 22 | # document = primary_llm.extract_all(document) 23 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/llm_api.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="openai/gpt-4o-mini", # Format: <provider>/<model_name> 5 | api_key="", 6 | ) 7 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/llm_group.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, DocumentLLMGroup 2 | 3 | # Create LLMs with different roles 4 | text_extractor = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="", 7 | role="extractor_text", 8 | output_language="adapt", 9 | ) 10 | 11 | text_reasoner = DocumentLLM( 12 | model="openai/o3-mini", 13 | api_key="", 14 | role="reasoner_text", 15 | max_completion_tokens=16000, 16 | reasoning_effort="high", 17 | output_language="adapt", 18 | ) 19 | 20 | # Create a group 21 | llm_group = DocumentLLMGroup( 22 | llms=[text_extractor, text_reasoner], 23 | output_language="adapt", # All LLMs in the group must share the same output language setting 24 | ) 25 | 26 | # Then use the group as usual 27 | # document = llm_group.extract_all(document) 28 | --------------------------------------------------------------------------------
/dev/usage_examples/docs/llm_config/llm_local.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | local_llm = DocumentLLM( 4 | model="ollama/llama3.1:8b", 5 | api_base="http://localhost:11434", # Default Ollama endpoint 6 | ) 7 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/o1_o4.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="openai/o3-mini", 5 | api_key="", 6 | max_completion_tokens=8000, # Specific to reasoning (CoT-capable) models 7 | reasoning_effort="medium", # Optional: "low", "medium", "high" 8 | ) 9 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/tracking_usage_and_cost.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="anthropic/claude-3-5-haiku", 5 | api_key="", 6 | ) 7 | 8 | # Perform some extraction tasks 9 | 10 | # Get usage statistics 11 | usage_info = llm.get_usage() 12 | 13 | # Get cost statistics 14 | cost_info = llm.get_cost() 15 | 16 | # Reset usage and cost statistics 17 | llm.reset_usage_and_cost() 18 | 19 | # The same methods are available for LLM groups, with optional filtering by LLM role 20 | # usage_info = llm_group.get_usage(llm_role="extractor_text") 21 | # cost_info = llm_group.get_cost(llm_role="extractor_text") 22 | # llm_group.reset_usage_and_cost(llm_role="extractor_text") 23 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llms/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llms/llm_extraction_methods/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_all.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting All Aspects and Concepts from Document 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, StringConcept 6 | 7 | # Sample text content 8 | text_content = """ 9 | John Smith is a 30-year-old software engineer working at TechCorp. 10 | He has 5 years of experience in Python development and leads a team of 8 developers. 11 | His annual salary is $95,000 and he graduated from MIT with a Computer Science degree. 
12 | """ 13 | 14 | # Create a Document object from text 15 | doc = Document(raw_text=text_content) 16 | 17 | # Define aspects and concepts directly on the document 18 | doc.aspects = [ 19 | Aspect( 20 | name="Professional Information", 21 | description="Information about the person's career, job, and work experience", 22 | ) 23 | ] 24 | 25 | doc.concepts = [ 26 | StringConcept( 27 | name="Person name", 28 | description="Full name of the person", 29 | ) 30 | ] 31 | 32 | # Configure DocumentLLM with your API parameters 33 | llm = DocumentLLM( 34 | model="azure/gpt-4.1-mini", 35 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 36 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 37 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 38 | ) 39 | 40 | # Extract all aspects and concepts from the document 41 | processed_doc = llm.extract_all(doc) 42 | 43 | # Access extracted aspect information 44 | aspect = processed_doc.aspects[0] 45 | print(f"Aspect: {aspect.name}") 46 | print(f"Extracted items: {[item.value for item in aspect.extracted_items]}") 47 | 48 | # Access extracted concept information 49 | concept = processed_doc.concepts[0] 50 | print(f"Concept: {concept.name}") 51 | print(f"Extracted value: {concept.extracted_items[0].value}") 52 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_aspects_from_document.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting Aspects from Documents 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Sample text content 8 | text_content = """ 9 | TechCorp is a leading software development company founded in 2015 with headquarters in San Francisco. 10 | The company specializes in cloud-based solutions and has grown to 500 employees across 12 countries. 11 | Their flagship product, CloudManager Pro, serves over 10,000 enterprise clients worldwide. 12 | TechCorp reported $50 million in revenue for 2023, representing a 25% growth from the previous year. 13 | The company is known for its innovative AI-powered analytics platform and excellent customer support. 14 | They recently expanded into the European market and plan to launch three new products in 2024. 
15 | """ 16 | 17 | # Create a Document object from text 18 | doc = Document(raw_text=text_content) 19 | 20 | # Define aspects to extract from the document 21 | doc.aspects = [ 22 | Aspect( 23 | name="Company Overview", 24 | description="Basic information about the company, founding, location, and size", 25 | ), 26 | Aspect( 27 | name="Financial Performance", 28 | description="Revenue, growth metrics, and financial indicators", 29 | ), 30 | Aspect( 31 | name="Products and Services", 32 | description="Information about the company's products, services, and offerings", 33 | ), 34 | ] 35 | 36 | # Configure DocumentLLM with your API parameters 37 | llm = DocumentLLM( 38 | model="azure/gpt-4.1-mini", 39 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 40 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 41 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 42 | ) 43 | 44 | # Extract aspects from the document 45 | extracted_aspects = llm.extract_aspects_from_document(doc) 46 | 47 | # Access extracted aspect information 48 | for aspect in extracted_aspects: 49 | print(f"Aspect: {aspect.name}") 50 | print(f"Extracted items: {[item.value for item in aspect.extracted_items]}") 51 | print("---") 52 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_concepts_from_aspect.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting Concepts from Specific Aspects 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, NumericalConcept, StringConcept 6 | 7 | # Sample text content 8 | text_content = """ 9 | DataFlow Systems is an innovative fintech startup that was established in 2020 in Austin, Texas. 10 | The company has rapidly grown to 150 employees and operates in 8 major cities across North America. 11 | DataFlow's core platform, FinanceStream, is used by more than 5,000 small businesses for automated accounting. 12 | In their latest financial report, DataFlow Systems announced $12 million in annual revenue for 2024. 13 | This represents an impressive 40% increase compared to their 2023 performance. 14 | The company has secured $25 million in Series B funding and plans to expand internationally next year. 
15 | """ 16 | 17 | # Create a Document object from text 18 | doc = Document(raw_text=text_content) 19 | 20 | # Define an aspect to extract from the document 21 | financial_aspect = Aspect( 22 | name="Financial Performance", 23 | description="Revenue, growth metrics, and financial indicators", 24 | ) 25 | 26 | # Add concepts to the aspect 27 | financial_aspect.concepts = [ 28 | StringConcept( 29 | name="Annual Revenue", 30 | description="Total revenue reported for the year", 31 | ), 32 | NumericalConcept( 33 | name="Growth Rate", 34 | description="Percentage growth rate compared to previous year", 35 | numeric_type="float", 36 | ), 37 | NumericalConcept( 38 | name="Revenue Year", 39 | description="The year for which revenue is reported", 40 | ), 41 | ] 42 | 43 | # Attach the aspect to the document 44 | doc.aspects = [financial_aspect] 45 | 46 | # Configure DocumentLLM with your API parameters 47 | llm = DocumentLLM( 48 | model="azure/gpt-4.1", 49 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 50 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 51 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 52 | ) 53 | 54 | # First, extract the aspect from the document (required before concept extraction) 55 | extracted_aspects = llm.extract_aspects_from_document(doc) 56 | financial_aspect = extracted_aspects[0] 57 | 58 | # Extract concepts from the specific aspect 59 | extracted_concepts = llm.extract_concepts_from_aspect(financial_aspect, doc) 60 | 61 | # Access extracted concepts for the aspect 62 | print(f"Aspect: {financial_aspect.name}") 63 | print(f"Extracted items: {[item.value for item in financial_aspect.extracted_items]}") 64 | print("\nConcepts extracted from this aspect:") 65 | for concept in extracted_concepts: 66 | print(f" {concept.name}: {[item.value for item in concept.extracted_items]}") 67 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_concepts_from_document.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting Concepts Directly from Documents 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, NumericalConcept, StringConcept 6 | 7 | # Sample text content 8 | text_content = """ 9 | GreenTech Solutions is an environmental technology company founded in 2018 in Portland, Oregon. 10 | The company develops sustainable energy solutions and has 75 employees working remotely across the United States. 11 | Their primary product, EcoMonitor, helps businesses track carbon emissions and has been adopted by 2,500 organizations. 12 | GreenTech Solutions reported strong financial performance with $8.5 million in revenue for 2024. 13 | The company's CEO, Sarah Johnson, announced plans to achieve carbon neutrality by 2025. 14 | They recently opened a new research facility in Seattle and hired 20 additional engineers. 
15 | """ 16 | 17 | # Create a Document object from text 18 | doc = Document(raw_text=text_content) 19 | 20 | # Define concepts to extract from the document 21 | doc.concepts = [ 22 | StringConcept( 23 | name="Company Name", 24 | description="Full name of the company", 25 | ), 26 | StringConcept( 27 | name="CEO Name", 28 | description="Full name of the company's CEO", 29 | ), 30 | NumericalConcept( 31 | name="Employee Count", 32 | description="Total number of employees at the company", 33 | numeric_type="int", 34 | ), 35 | StringConcept( 36 | name="Annual Revenue", 37 | description="Company's total revenue for the year", 38 | ), 39 | ] 40 | 41 | # Configure DocumentLLM with your API parameters 42 | llm = DocumentLLM( 43 | model="azure/gpt-4.1", 44 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 45 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 46 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 47 | ) 48 | 49 | # Extract concepts from the document 50 | extracted_concepts = llm.extract_concepts_from_document(doc) 51 | 52 | # Access extracted concept information 53 | print("Concepts extracted from document:") 54 | for concept in extracted_concepts: 55 | print(f" {concept.name}: {[item.value for item in concept.extracted_items]}") 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llms/llm_init/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/llm_api.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | # Pattern for using any cloud LLM provider 4 | llm = DocumentLLM( 5 | model="<provider>/<model_name>", 6 | api_key="", 7 | ) 8 | 9 | # Example - Using OpenAI LLM 10 | llm_openai = DocumentLLM( 11 | model="openai/gpt-4.1-mini", 12 | api_key="", 13 | # see DocumentLLM API reference for all configuration options 14 | ) 15 | 16 | # Example - Using Azure OpenAI LLM 17 | llm_azure_openai = DocumentLLM( 18 | model="azure/o4-mini", 19 | api_key="", 20 | api_version="", 21 | api_base="", 22 | # see DocumentLLM API reference for all configuration options 23 | ) 24 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/llm_local.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | local_llm = DocumentLLM( 4 | model="ollama/<model_name>", 5 | api_base="http://localhost:11434", # Default Ollama endpoint 6 | ) 7 | 8 | # Example - Using Llama 3.1 LLM via Ollama 9 | llm_llama = DocumentLLM( 10 | model="ollama/llama3.1:8b", 11 | api_base="http://localhost:11434", 12 | # see DocumentLLM API reference for all configuration options 13 | ) 14 | 15 | # Example - Using DeepSeek R1 reasoning model via Ollama 16 | llm_deepseek = DocumentLLM( 17 | model="ollama/deepseek-r1:32b", 18 | api_base="http://localhost:11434", 19 | # see DocumentLLM API reference for all configuration options 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/lm_studio_connection_error_fix.py: -------------------------------------------------------------------------------- 1 |
from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="lm_studio/meta-llama-3.1-8b-instruct", 5 | api_base="http://localhost:1234/v1", 6 | api_key="dummy-key", # dummy key to avoid connection error 7 | ) 8 | 9 | # This is a known issue with calling LM Studio API in litellm: 10 | # https://github.com/openai/openai-python/issues/961 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/optimizations/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_accuracy.py: -------------------------------------------------------------------------------- 1 | # Example of optimizing extraction for accuracy 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept, StringExample 6 | 7 | # Define document 8 | doc = Document( 9 | raw_text="Non-Disclosure Agreement...", 10 | sat_model_id="sat-6l-sm", # default is "sat-3l-sm" 11 | paragraph_segmentation_mode="sat", # default is "newlines" 12 | # sentence segmentation mode is always "sat", as other approaches proved to be less accurate 13 | ) 14 | 15 | # Define document concepts 16 | doc.concepts = [ 17 | StringConcept( 18 | name="Title", # A very simple concept, just an example for testing purposes 19 | description="Title of the document", 20 | add_justifications=True, # enable justifications 21 | justification_depth="brief", # default 22 | examples=[ 23 | StringExample( 24 | content="Supplier Agreement", 25 | ) 26 | ], 27 | ), 28 | # ... add other concepts ... 29 | ] 30 | 31 | # ... attach other aspects/concepts to the document ... 32 | 33 | # Define and configure LLM 34 | llm = DocumentLLM( 35 | model="openai/gpt-4o", 36 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 37 | fallback_llm=DocumentLLM( 38 | model="openai/gpt-4-turbo", 39 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 40 | is_fallback=True, 41 | ), # configure a fallback LLM 42 | ) 43 | 44 | # Extract data from document with specific configuration options 45 | doc = llm.extract_all( 46 | doc, 47 | max_paragraphs_to_analyze_per_call=30, # limit the number of paragraphs to analyze in an individual LLM call 48 | max_items_per_call=1, # limit the number of aspects/concepts to analyze in an individual LLM call 49 | use_concurrency=True, # optional: enable concurrent extractions 50 | ) 51 | 52 | # ... use the extracted data ... 
53 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_choosing_llm.py: -------------------------------------------------------------------------------- 1 | # Example of selecting different LLMs for different tasks 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, DocumentLLMGroup, StringConcept 6 | 7 | # Define LLMs 8 | base_llm = DocumentLLM( 9 | model="openai/gpt-4o-mini", 10 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 11 | role="extractor_text", # default 12 | ) 13 | 14 | # Optional - attach a fallback LLM 15 | base_llm_fallback = DocumentLLM( 16 | model="openai/gpt-3.5-turbo", 17 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 18 | role="extractor_text", # must have the same role as the parent LLM 19 | is_fallback=True, 20 | ) 21 | base_llm.fallback_llm = base_llm_fallback 22 | 23 | advanced_llm = DocumentLLM( 24 | model="openai/gpt-4o", # can be a larger model (reasoning or non-reasoning) 25 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 26 | role="reasoner_text", 27 | ) 28 | 29 | # You can organize LLMs in a group to use them in a pipeline 30 | llm_group = DocumentLLMGroup( 31 | llms=[base_llm, advanced_llm], 32 | ) 33 | 34 | # Assign the existing LLMs to aspects/concepts 35 | document = Document( 36 | raw_text="document_text", 37 | aspects=[ 38 | Aspect( 39 | name="aspect_name", 40 | description="aspect_description", 41 | llm_role="extractor_text", 42 | concepts=[ 43 | StringConcept( 44 | name="concept_name", 45 | description="concept_description", 46 | llm_role="reasoner_text", 47 | ) 48 | ], 49 | ) 50 | ], 51 | ) 52 | 53 | # Then use the LLM group to extract all information from the document 54 | # This will use different LLMs for different aspects/concepts under the hood 55 | # document = llm_group.extract_all(document) 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_cost.py: -------------------------------------------------------------------------------- 1 | # Example of optimizing extraction for cost 2 | 3 | import os 4 | 5 | from contextgem import DocumentLLM, LLMPricing 6 | 7 | llm = DocumentLLM( 8 | model="openai/gpt-4o-mini", 9 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 10 | pricing_details=LLMPricing( 11 | input_per_1m_tokens=0.150, 12 | output_per_1m_tokens=0.600, 13 | ), # add pricing details to track costs 14 | ) 15 | 16 | # ... use the LLM for extraction ... 17 | 18 | # ... monitor usage and cost ... 19 | usage = llm.get_usage() # get the usage details, including tokens and calls' details. 20 | cost = llm.get_cost() # get the cost details, including input, output, and total costs. 21 | print(usage) 22 | print(cost) 23 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_long_docs.py: -------------------------------------------------------------------------------- 1 | # Example of configuring LLM extraction to process long documents 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM 6 | 7 | # Define document 8 | long_doc = Document( 9 | raw_text="long_document_text", 10 | ) 11 | 12 | # ... attach aspects/concepts to the document ...
13 | 14 | # Define and configure LLM 15 | llm = DocumentLLM( 16 | model="openai/gpt-4o-mini", 17 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 18 | ) 19 | 20 | # Extract data from document with specific configuration options 21 | long_doc = llm.extract_all( 22 | long_doc, 23 | max_paragraphs_to_analyze_per_call=50, # limit the number of paragraphs to analyze in an individual LLM call 24 | max_items_per_call=2, # limit the number of aspects/concepts to analyze in an individual LLM call 25 | use_concurrency=True, # optional: enable concurrent extractions 26 | ) 27 | 28 | # ... use the extracted data ... 29 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_speed.py: -------------------------------------------------------------------------------- 1 | # Example of optimizing extraction for speed 2 | 3 | import os 4 | 5 | from aiolimiter import AsyncLimiter 6 | 7 | from contextgem import Document, DocumentLLM 8 | 9 | # Define document 10 | document = Document( 11 | raw_text="document_text", 12 | # aspects=[Aspect(...), ...], 13 | # concepts=[Concept(...), ...], 14 | ) 15 | 16 | # Define LLM with a fallback model 17 | llm = DocumentLLM( 18 | model="openai/gpt-4o-mini", 19 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 20 | async_limiter=AsyncLimiter( 21 | 10, 5 22 | ), # e.g. 10 acquisitions per 5-second period; adjust to your LLM API setup 23 | fallback_llm=DocumentLLM( 24 | model="openai/gpt-3.5-turbo", 25 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 26 | is_fallback=True, 27 | async_limiter=AsyncLimiter( 28 | 20, 5 29 | ), # e.g. 20 acquisitions per 5-second period; adjust to your LLM API setup 30 | ), 31 | ) 32 | 33 | # Use the LLM for extraction with concurrency enabled 34 | llm.extract_all(document, use_concurrency=True) 35 | 36 | # ... use the extracted data ... 
37 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/quickstart/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_aspect.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting aspect from a document 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Example document instance 8 | # Document content is shortened for brevity 9 | doc = Document( 10 | raw_text=( 11 | "Consultancy Agreement\n" 12 | "This agreement between Company A (Supplier) and Company B (Customer)...\n" 13 | "The term of the agreement is 1 year from the Effective Date...\n" 14 | "The Supplier shall provide consultancy services as described in Annex 2...\n" 15 | "The Customer shall pay the Supplier within 30 calendar days of receiving an invoice...\n" 16 | "This agreement is governed by the laws of Norway...\n" 17 | ), 18 | ) 19 | 20 | # Define an aspect with optional concept(s), using natural language 21 | doc_aspect = Aspect( 22 | name="Governing law", 23 | description="Clauses defining the governing law of the agreement", 24 | reference_depth="sentences", 25 | ) 26 | 27 | # Add aspects to the document 28 | doc.add_aspects([doc_aspect]) 29 | # (add more aspects to the document, if needed) 30 | 31 | # Create an LLM for extraction 32 | llm = DocumentLLM( 33 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 34 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 35 | ) 36 | 37 | # Extract information from the document 38 | extracted_aspects = llm.extract_aspects_from_document(doc) 39 | # or use async version llm.extract_aspects_from_document_async(doc) 40 | 41 | # Access extracted information 42 | print("Governing law aspect:") 43 | print( 44 | extracted_aspects[0].extracted_items 45 | ) # extracted aspect items with references to sentences 46 | # or doc.get_aspect_by_name("Governing law").extracted_items 47 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_concept_aspect.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting a concept from an aspect 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, StringConcept, StringExample 6 | 7 | # Example document instance 8 | # Document content is shortened for brevity 9 | doc = Document( 10 | raw_text=( 11 | "Employment Agreement\n" 12 | "This agreement between TechCorp Inc. 
(Employer) and Jane Smith (Employee)...\n" 13 | "The employment shall commence on January 15, 2023 and continue until terminated...\n" 14 | "The Employee shall work as a Senior Software Engineer reporting to the CTO...\n" 15 | "The Employee shall receive an annual salary of $120,000 paid monthly...\n" 16 | "The Employee is entitled to 20 days of paid vacation per year...\n" 17 | "The Employee agrees to a notice period of 30 days for resignation...\n" 18 | "This agreement is governed by the laws of California...\n" 19 | ), 20 | ) 21 | 22 | # Define an aspect with a specific concept, using natural language 23 | doc_aspect = Aspect( 24 | name="Compensation", 25 | description="Clauses defining the compensation and benefits for the employee", 26 | reference_depth="sentences", 27 | ) 28 | 29 | # Define a concept within the aspect 30 | aspect_concept = StringConcept( 31 | name="Annual Salary", 32 | description="The annual base salary amount specified in the employment agreement", 33 | examples=[ # optional 34 | StringExample( 35 | content="$X per year", # guidance regarding format 36 | ) 37 | ], 38 | add_references=True, 39 | reference_depth="sentences", 40 | ) 41 | 42 | # Add the concept to the aspect 43 | doc_aspect.add_concepts([aspect_concept]) 44 | # (add more concepts to the aspect, if needed) 45 | 46 | # Add the aspect to the document 47 | doc.add_aspects([doc_aspect]) 48 | # (add more aspects to the document, if needed) 49 | 50 | # Create an LLM for extraction 51 | llm = DocumentLLM( 52 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 53 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 54 | ) 55 | 56 | # Extract information from the document 57 | doc = llm.extract_all(doc) 58 | # or use async version llm.extract_all_async(doc) 59 | 60 | # Access extracted information in the document object 61 | print("Compensation aspect:") 62 | print( 63 | doc.get_aspect_by_name("Compensation").extracted_items 64 | ) # extracted aspect items with references to sentences 65 | print("Annual Salary concept:") 66 | print( 67 | doc.get_aspect_by_name("Compensation") 68 | .get_concept_by_name("Annual Salary") 69 | .extracted_items 70 | ) # extracted concept items with references to sentences 71 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_concept_document_text.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting a concept from a document 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, JsonObjectConcept, JsonObjectExample 6 | 7 | # Example document instance 8 | # Document content is shortened for brevity 9 | doc = Document( 10 | raw_text=( 11 | "Statement of Work\n" 12 | "Project: Cloud Migration Initiative\n" 13 | "Client: Acme Corporation\n" 14 | "Contractor: TechSolutions Inc.\n\n" 15 | "Project Timeline:\n" 16 | "Start Date: March 1, 2025\n" 17 | "End Date: August 31, 2025\n\n" 18 | "Deliverables:\n" 19 | "1. Infrastructure assessment report (Due: March 15, 2025)\n" 20 | "2. Migration strategy document (Due: April 10, 2025)\n" 21 | "3. Test environment setup (Due: May 20, 2025)\n" 22 | "4. Production migration (Due: July 15, 2025)\n" 23 | "5. Post-migration support (Due: August 31, 2025)\n\n" 24 | "Budget: $250,000\n" 25 | "Payment Schedule: 20% upfront, 30% at midpoint, 50% upon completion\n" 26 | ), 27 | ) 28 | 29 | # Define a document-level concept using e.g. 
JsonObjectConcept 30 | # This will extract structured data from the entire document 31 | doc_concept = JsonObjectConcept( 32 | name="Project Details", 33 | description="Key project information including timeline, deliverables, and budget", 34 | structure={ 35 | "project_name": str, 36 | "client": str, 37 | "contractor": str, 38 | "budget": str, 39 | "payment_terms": str, 40 | }, # simply use a dictionary with type hints (including generic aliases and union types) 41 | add_references=True, 42 | reference_depth="paragraphs", 43 | ) 44 | 45 | # Add the concept to the document 46 | doc.add_concepts([doc_concept]) 47 | # (add more concepts to the document, if needed) 48 | 49 | # Create an LLM for extraction 50 | llm = DocumentLLM( 51 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 52 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 53 | ) 54 | 55 | # Extract information from the document 56 | extracted_concepts = llm.extract_concepts_from_document(doc) 57 | # or use async version llm.extract_concepts_from_document_async(doc) 58 | 59 | # Access extracted information 60 | print("Project Details:") 61 | print( 62 | extracted_concepts[0].extracted_items 63 | ) # extracted concept items with references to paragraphs 64 | # Or doc.get_concept_by_name("Project Details").extracted_items 65 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_concept_document_vision.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting concept from a document with an image 2 | 3 | import os 4 | from pathlib import Path 5 | 6 | from contextgem import Document, DocumentLLM, Image, NumericalConcept, image_to_base64 7 | 8 | # Path adapted for testing 9 | current_file = Path(__file__).resolve() 10 | root_path = current_file.parents[4] 11 | image_path = root_path / "tests" / "invoices" / "invoice.jpg" 12 | 13 | # Create an image instance 14 | doc_image = Image(mime_type="image/jpg", base64_data=image_to_base64(image_path)) 15 | 16 | # Example document instance holding only the image 17 | doc = Document( 18 | images=[doc_image], # may contain multiple images 19 | ) 20 | 21 | # Define a concept to extract the invoice total amount 22 | doc_concept = NumericalConcept( 23 | name="Invoice Total", 24 | description="The total amount to be paid as shown on the invoice", 25 | numeric_type="float", 26 | llm_role="extractor_vision", # use vision model 27 | ) 28 | 29 | # Add concept to the document 30 | doc.add_concepts([doc_concept]) 31 | # (add more concepts to the document, if needed) 32 | 33 | # Create an LLM for extraction 34 | llm = DocumentLLM( 35 | model="openai/gpt-4o-mini", # Using a model with vision capabilities 36 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 37 | role="extractor_vision", # mark LLM as vision model 38 | ) 39 | 40 | # Extract information from the document 41 | extracted_concepts = llm.extract_concepts_from_document(doc) 42 | # or use async version: await llm.extract_concepts_from_document_async(doc) 43 | 44 | # Access extracted information 45 | print("Invoice Total:") 46 | print(extracted_concepts[0].extracted_items) # extracted concept items 47 | # or doc.get_concept_by_name("Invoice Total").extracted_items 48 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_sub_aspect.py: 
-------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting an aspect with sub-aspects 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Sample document (content shortened for brevity) 8 | contract_text = """ 9 | EMPLOYMENT AGREEMENT 10 | ... 11 | 8. TERMINATION 12 | 8.1 Termination by the Company. The Company may terminate the Employee's employment for Cause at any time upon written notice. 13 | "Cause" shall mean: (i) Employee's material breach of this Agreement; (ii) Employee's conviction of a felony; or 14 | (iii) Employee's willful misconduct that causes material harm to the Company. 15 | 8.2 Termination by the Employee. The Employee may terminate employment for Good Reason upon 30 days' written notice to the Company. 16 | "Good Reason" shall mean a material reduction in Employee's base salary or a material diminution in Employee's duties. 17 | 8.3 Severance. If the Employee's employment is terminated by the Company without Cause or by the Employee for Good Reason, 18 | the Employee shall be entitled to receive severance pay equal to six (6) months of the Employee's base salary. 19 | ... 20 | """ 21 | 22 | doc = Document(raw_text=contract_text) 23 | 24 | # Define termination aspect with practical sub-aspects 25 | termination_aspect = Aspect( 26 | name="Termination", 27 | description="Provisions related to the termination of employment", 28 | aspects=[ # assign sub-aspects (optional) 29 | Aspect( 30 | name="Company Termination Rights", 31 | description="Conditions under which the company can terminate employment", 32 | ), 33 | Aspect( 34 | name="Employee Termination Rights", 35 | description="Conditions under which the employee can terminate employment", 36 | ), 37 | Aspect( 38 | name="Severance Terms", 39 | description="Compensation or benefits provided upon termination", 40 | ), 41 | ], 42 | ) 43 | 44 | # Add the aspect to the document. Sub-aspects are added with the parent aspect. 45 | doc.add_aspects([termination_aspect]) 46 | # (add more aspects to the document, if needed) 47 | 48 | # Create an LLM for extraction 49 | llm = DocumentLLM( 50 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 51 | api_key=os.environ.get( 52 | "CONTEXTGEM_OPENAI_API_KEY" 53 | ), # your API key of the LLM provider 54 | ) 55 | 56 | # Extract all information from the document 57 | doc = llm.extract_all(doc) 58 | 59 | # Get results with references in the document object 60 | print("\nTermination aspect:\n") 61 | termination_aspect = doc.get_aspect_by_name("Termination") 62 | for sub_aspect in termination_aspect.aspects: 63 | print(sub_aspect.name) 64 | for item in sub_aspect.extracted_items: 65 | print(item.value) 66 | print("\n") 67 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/serialization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/serialization/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/serialization/serialization.py: -------------------------------------------------------------------------------- 1 | # Example of serializing and deserializing ContextGem document, 2 | # document pipeline, and LLM config. 
3 | 4 | import os 5 | from pathlib import Path 6 | 7 | from contextgem import ( 8 | Aspect, 9 | BooleanConcept, 10 | Document, 11 | DocumentLLM, 12 | DocumentPipeline, 13 | DocxConverter, 14 | StringConcept, 15 | ) 16 | 17 | # Create a document object 18 | converter = DocxConverter() 19 | docx_path = str( 20 | Path(__file__).resolve().parents[4] 21 | / "tests" 22 | / "docx_files" 23 | / "en_nda_with_anomalies.docx" 24 | ) # your file path here (Path adapted for testing) 25 | doc = converter.convert(docx_path, strict_mode=True) 26 | 27 | # Create a document pipeline 28 | document_pipeline = DocumentPipeline( 29 | aspects=[ 30 | Aspect( 31 | name="Categories of confidential information", 32 | description="Clauses describing confidential information covered by the NDA", 33 | concepts=[ 34 | StringConcept( 35 | name="Types of disclosure", 36 | description="Types of disclosure of confidential information", 37 | ), 38 | # ... 39 | ], 40 | ), 41 | # ... 42 | ], 43 | concepts=[ 44 | BooleanConcept( 45 | name="Is mutual", 46 | description="Whether the NDA is mutual (both parties act as discloser/recipient)", 47 | add_justifications=True, 48 | ), 49 | # ... 50 | ], 51 | ) 52 | 53 | # Attach the pipeline to the document 54 | doc.assign_pipeline(document_pipeline) 55 | 56 | # Configure a document LLM with your API parameters 57 | llm = DocumentLLM( 58 | model="azure/gpt-4.1-mini", 59 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 60 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 61 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 62 | ) 63 | 64 | # Extract data from the document 65 | doc = llm.extract_all(doc) 66 | 67 | # Serialize the LLM config, pipeline and document 68 | llm_config_json = llm.to_json() # or to_dict() / to_disk() 69 | document_pipeline_json = document_pipeline.to_json() # or to_dict() / to_disk() 70 | processed_doc_json = doc.to_json() # or to_dict() / to_disk() 71 | 72 | # Deserialize the LLM config, pipeline and document 73 | llm_deserialized = DocumentLLM.from_json( 74 | llm_config_json 75 | ) # or from_dict() / from_disk() 76 | document_pipeline_deserialized = DocumentPipeline.from_json( 77 | document_pipeline_json 78 | ) # or from_dict() / from_disk() 79 | processed_doc_deserialized = Document.from_json( 80 | processed_doc_json 81 | ) # or from_dict() / from_disk() 82 | 83 | # All extracted data is preserved! 
84 | assert processed_doc_deserialized.aspects[0].concepts[0].extracted_items 85 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/aspects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/aspects/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/aspects/def_aspect.py: -------------------------------------------------------------------------------- 1 | from contextgem import Aspect 2 | 3 | # Define an aspect focused on termination clauses 4 | termination_aspect = Aspect( 5 | name="Termination provisions", 6 | description="Contract termination conditions, notice requirements, and severance terms.", 7 | reference_depth="sentences", 8 | add_justifications=True, 9 | justification_depth="comprehensive", 10 | ) 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/concepts/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_boolean_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import BooleanConcept 2 | 3 | # Create the concept with specific configuration 4 | has_confidentiality = BooleanConcept( 5 | name="Contains confidentiality clause", 6 | description="Determines whether the contract includes provisions requiring parties to maintain confidentiality", 7 | llm_role="reasoner_text", 8 | singular_occurrence=True, 9 | add_justifications=True, 10 | justification_depth="brief", 11 | ) 12 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_date_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import DateConcept 2 | 3 | # Create a date concept to extract the effective date of the contract 4 | effective_date = DateConcept( 5 | name="Effective date", 6 | description="The effective date as specified in the contract", 7 | add_references=True, # Include references to where dates were found 8 | singular_occurrence=True, # Only extract one effective date per document 9 | ) 10 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_json_object_concept.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from contextgem import JsonObjectConcept 4 | 5 | # Define a JSON object concept for capturing address information 6 | address_info_concept = JsonObjectConcept( 7 | name="Address information", 8 | description=( 9 | "Structured address data from text
including street, " 10 | "city, state, postal code, and country." 11 | ), 12 | structure={ 13 | "street": str | None, 14 | "city": str | None, 15 | "state": str | None, 16 | "postal_code": str | None, 17 | "country": str | None, 18 | "address_type": Literal["residential", "business"] | None, 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_label_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import LabelConcept 2 | 3 | # Multi-class classification: single label selection 4 | document_type_concept = LabelConcept( 5 | name="Document Type", 6 | description="Classify the type of legal document", 7 | labels=["NDA", "Consultancy Agreement", "Privacy Policy", "Other"], 8 | classification_type="multi_class", 9 | singular_occurrence=True, 10 | ) 11 | 12 | # Multi-label classification: multiple label selection 13 | content_topics_concept = LabelConcept( 14 | name="Content Topics", 15 | description="Identify all relevant topics covered in the document", 16 | labels=["Finance", "Legal", "Technology", "HR", "Operations", "Marketing"], 17 | classification_type="multi_label", 18 | add_justifications=True, 19 | justification_depth="brief", # add justifications for the selected labels 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_numerical_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import NumericalConcept 2 | 3 | # Create concepts for different numerical values in the contract 4 | payment_amount = NumericalConcept( 5 | name="Payment amount", 6 | description="The monetary value to be paid according to the contract terms", 7 | numeric_type="float", 8 | llm_role="extractor_text", 9 | add_references=True, 10 | reference_depth="sentences", 11 | ) 12 | 13 | payment_days = NumericalConcept( 14 | name="Payment term days", 15 | description="The number of days within which payment must be made", 16 | numeric_type="int", 17 | llm_role="extractor_text", 18 | add_justifications=True, 19 | justification_depth="balanced", 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_rating_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import RatingConcept, RatingScale 2 | 3 | # Create a rating scale for contract fairness evaluation 4 | fairness_scale = RatingScale(start=1, end=5) 5 | 6 | # Create a concept to rate the fairness of contract terms 7 | fairness_rating = RatingConcept( 8 | name="Contract fairness rating", 9 | description="Evaluation of how balanced and fair the contract terms are for all parties", 10 | rating_scale=fairness_scale, 11 | llm_role="reasoner_text", 12 | add_justifications=True, 13 | justification_depth="comprehensive", 14 | justification_max_sents=10, 15 | ) 16 | 17 | # Create a clarity scale for contract language evaluation 18 | clarity_scale = RatingScale(start=1, end=10) 19 | 20 | # Create a concept to rate the clarity of contract language 21 | clarity_rating = RatingConcept( 22 | name="Language clarity rating", 23 | description="Assessment of how clear and unambiguous the contract language is", 24 | rating_scale=clarity_scale, 25 | llm_role="reasoner_text", 26 | add_justifications=True, 27 | justification_depth="balanced", 28 | 
justification_max_sents=3, 29 | ) 30 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_string_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import StringConcept, StringExample 2 | 3 | # Define a string concept for identifying contract party names 4 | # and their roles in the contract 5 | party_names_and_roles_concept = StringConcept( 6 | name="Party names and roles", 7 | description=( 8 | "Names of all parties entering into the agreement " 9 | "and their contractual roles" 10 | ), 11 | examples=[ 12 | StringExample( 13 | content="X (Client)", # guidance regarding format 14 | ) 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/data_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/data_models/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/data_models/def_llm_pricing.py: -------------------------------------------------------------------------------- 1 | from contextgem import LLMPricing 2 | 3 | # Create a pricing model for an LLM (openai/o3-mini example) 4 | pricing = LLMPricing( 5 | input_per_1m_tokens=1.10, # $1.10 per million input tokens 6 | output_per_1m_tokens=4.40, # $4.40 per million output tokens 7 | ) 8 | 9 | # LLMPricing objects are immutable 10 | try: 11 | pricing.input_per_1m_tokens = 0.7 12 | except ValueError as e: 13 | print(f"Error when trying to modify pricing: {e}") 14 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/data_models/def_rating_scale.py: -------------------------------------------------------------------------------- 1 | from contextgem import RatingScale 2 | 3 | # Create a rating scale with default values (0 to 10) 4 | default_scale = RatingScale() 5 | 6 | # Create a custom rating scale (1 to 5) 7 | custom_scale = RatingScale( 8 | start=1, 9 | end=5, 10 | ) 11 | 12 | # RatingScale objects are immutable 13 | try: 14 | custom_scale.end = 7 15 | except ValueError as e: 16 | print(f"Error when trying to modify rating scale: {e}") 17 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/documents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/documents/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/documents/def_document.py: -------------------------------------------------------------------------------- 1 | from contextgem import Document 2 | 3 | # Create a document with raw text content 4 | contract_document = Document( 5 | raw_text=( 6 | "...This agreement is effective as of January 1, 2025.\n\n" 7 | "All parties must comply with the terms outlined herein. The terms include " 8 | "monthly reporting requirements and quarterly performance reviews.\n\n" 9 | "Failure to adhere to these terms may result in termination of the agreement. 
" 10 | "Additionally, any breach of confidentiality will be subject to penalties as " 11 | "described in this agreement.\n\n" 12 | "This agreement shall remain in force for a period of three (3) years unless " 13 | "otherwise terminated according to the provisions stated above..." 14 | ), 15 | paragraph_segmentation_mode="newlines", # Default mode, splits on newlines 16 | ) 17 | 18 | # Create a document with more advanced paragraph segmentation using a SaT model 19 | report_document = Document( 20 | raw_text=( 21 | "Executive Summary " 22 | "This report outlines our quarterly performance. " 23 | "Revenue increased by [15%] compared to the previous quarter.\n\n" 24 | "Customer satisfaction metrics show positive trends across all regions..." 25 | ), 26 | paragraph_segmentation_mode="sat", # Use SaT model for intelligent paragraph segmentation 27 | sat_model_id="sat-3l-sm", # Specify which SaT model to use 28 | ) 29 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/examples/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/examples/def_example_json_object.py: -------------------------------------------------------------------------------- 1 | from contextgem import JsonObjectConcept, JsonObjectExample 2 | 3 | # Create a JSON object example 4 | json_example = JsonObjectExample( 5 | content={ 6 | "name": "John Doe", 7 | "education": "Bachelor's degree in Computer Science", 8 | "skills": ["Python", "Machine Learning", "Data Analysis"], 9 | "hobbies": ["Reading", "Traveling", "Gaming"], 10 | } 11 | ) 12 | 13 | 14 | # Define a structure for JSON object concept 15 | class PersonInfo: 16 | name: str 17 | education: str 18 | skills: list[str] 19 | hobbies: list[str] 20 | 21 | 22 | # Also works as a dict with type hints, e.g. 
23 | # PersonInfo = { 24 | # "name": str, 25 | # "education": str, 26 | # "skills": list[str], 27 | # "hobbies": list[str], 28 | # } 29 | 30 | # Attach JSON example to a JsonObjectConcept 31 | json_concept = JsonObjectConcept( 32 | name="Candidate info", 33 | description="Structured information about a job candidate", 34 | structure=PersonInfo, # Define the expected structure 35 | examples=[json_example], # Attach the example to the concept (optional) 36 | ) 37 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/examples/def_example_string.py: -------------------------------------------------------------------------------- 1 | from contextgem import StringConcept, StringExample 2 | 3 | # Create string examples 4 | string_examples = [ 5 | StringExample(content="X (Client)"), 6 | StringExample(content="Y (Supplier)"), 7 | ] 8 | 9 | # Attach string examples to a StringConcept 10 | string_concept = StringConcept( 11 | name="Contract party name and role", 12 | description="The name and role of the contract party", 13 | examples=string_examples, # Attach the example to the concept (optional) 14 | ) 15 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/images/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/images/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/images/def_image.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from contextgem import Document, Image, image_to_base64 4 | 5 | # Path is adapted for doc tests 6 | current_file = Path(__file__).resolve() 7 | root_path = current_file.parents[4] 8 | 9 | # Using the utility function to convert an image file to base64 10 | image_path = root_path / "tests" / "invoices" / "invoice.jpg" 11 | base64_data = image_to_base64(image_path) 12 | 13 | # Create an image instance with the base64-encoded data 14 | jpg_image = Image(mime_type="image/jpg", base64_data=base64_data) 15 | 16 | # Using pre-encoded base64 data directly 17 | png_image = Image( 18 | mime_type="image/png", base64_data="base64-string" # image as a base64 string 19 | ) 20 | 21 | # Using a different supported image format 22 | webp_image = Image( 23 | mime_type="image/webp", 24 | base64_data=image_to_base64(root_path / "tests" / "invoices" / "invoice.webp"), 25 | ) 26 | 27 | # Attaching an image to a document 28 | # Documents can contain both text and multiple images, or just images 29 | 30 | # Create a document with text content 31 | text_document = Document( 32 | raw_text="This is a document with an attached image that shows an invoice.", 33 | images=[jpg_image], 34 | ) 35 | 36 | # Create a document with only image content (no text) 37 | image_only_document = Document(images=[jpg_image]) 38 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/llms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/llms/__init__.py -------------------------------------------------------------------------------- 
/dev/usage_examples/docstrings/llms/def_llm.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, LLMPricing 2 | 3 | # Create a single LLM for text extraction 4 | text_extractor = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="your-api-key", # Replace with your actual API key 7 | role="extractor_text", # Role for text extraction 8 | pricing_details=LLMPricing( # optional 9 | input_per_1m_tokens=0.150, output_per_1m_tokens=0.600 10 | ), 11 | ) 12 | 13 | # Create a fallback LLM in case the primary model fails 14 | fallback_text_extractor = DocumentLLM( 15 | model="anthropic/claude-3-7-sonnet", 16 | api_key="your-anthropic-api-key", # Replace with your actual API key 17 | role="extractor_text", # must be the same as the role of the primary LLM 18 | is_fallback=True, 19 | pricing_details=LLMPricing( # optional 20 | input_per_1m_tokens=3.00, output_per_1m_tokens=15.00 21 | ), 22 | ) 23 | # Assign the fallback LLM to the primary LLM 24 | text_extractor.fallback_llm = fallback_text_extractor 25 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/llms/def_llm_group.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, DocumentLLMGroup 2 | 3 | # Create a text extractor LLM with a fallback 4 | text_extractor = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="your-openai-api-key", # Replace with your actual API key 7 | role="extractor_text", 8 | ) 9 | 10 | # Create a fallback LLM for the text extractor 11 | text_extractor_fallback = DocumentLLM( 12 | model="anthropic/claude-3-5-haiku", 13 | api_key="your-anthropic-api-key", # Replace with your actual API key 14 | role="extractor_text", # Must have the same role as the primary LLM 15 | is_fallback=True, 16 | ) 17 | 18 | # Assign the fallback LLM to the primary text extractor 19 | text_extractor.fallback_llm = text_extractor_fallback 20 | 21 | # Create a text reasoner LLM 22 | text_reasoner = DocumentLLM( 23 | model="openai/o3-mini", 24 | api_key="your-openai-api-key", # Replace with your actual API key 25 | role="reasoner_text", # For more complex tasks that require reasoning 26 | ) 27 | 28 | # Create a vision extractor LLM 29 | vision_extractor = DocumentLLM( 30 | model="openai/gpt-4o-mini", 31 | api_key="your-openai-api-key", # Replace with your actual API key 32 | role="extractor_vision", # For handling images 33 | ) 34 | 35 | # Create a vision reasoner LLM 36 | vision_reasoner = DocumentLLM( 37 | model="openai/gpt-4o", 38 | api_key="your-openai-api-key", 39 | role="reasoner_vision", # For more complex vision tasks that require reasoning 40 | ) 41 | 42 | # Create a DocumentLLMGroup with all four LLMs 43 | llm_group = DocumentLLMGroup( 44 | llms=[text_extractor, text_reasoner, vision_extractor, vision_reasoner], 45 | output_language="en", # All LLMs must have the same output language ("en" is default) 46 | ) 47 | # This group will have 5 LLMs: four main ones, with different roles, 48 | # and one fallback LLM for a specific LLM. Each LLM can have a fallback LLM. 
49 | 50 | # Get usage statistics for the whole group or for a specific role 51 | group_usage = llm_group.get_usage() 52 | text_extractor_usage = llm_group.get_usage(llm_role="extractor_text") 53 | 54 | # Get cost statistics for the whole group or for a specific role 55 | all_costs = llm_group.get_cost() 56 | text_extractor_cost = llm_group.get_cost(llm_role="extractor_text") 57 | 58 | # Reset usage and cost statistics for the whole group or for a specific role 59 | llm_group.reset_usage_and_cost() 60 | llm_group.reset_usage_and_cost(llm_role="extractor_text") 61 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/paragraphs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/paragraphs/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/paragraphs/def_paragraph.py: -------------------------------------------------------------------------------- 1 | from contextgem import Paragraph 2 | 3 | # Create a paragraph with raw text content 4 | contract_paragraph = Paragraph( 5 | raw_text=( 6 | "This agreement is effective as of January 1, 2025. " 7 | "All parties must comply with the terms outlined herein. " 8 | "Failure to adhere to these terms may result in termination of the agreement." 9 | ) 10 | ) 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/pipelines/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/pipelines/def_pipeline.py: -------------------------------------------------------------------------------- 1 | from contextgem import ( 2 | Aspect, 3 | BooleanConcept, 4 | DateConcept, 5 | Document, 6 | DocumentPipeline, 7 | StringConcept, 8 | ) 9 | 10 | # Create a pipeline for NDA (Non-Disclosure Agreement) review 11 | nda_pipeline = DocumentPipeline( 12 | aspects=[ 13 | Aspect( 14 | name="Confidential information", 15 | description="Clauses defining the confidential information", 16 | ), 17 | Aspect( 18 | name="Exclusions", 19 | description="Clauses defining exclusions from confidential information", 20 | ), 21 | Aspect( 22 | name="Obligations", 23 | description="Clauses defining confidentiality obligations", 24 | ), 25 | Aspect( 26 | name="Liability", 27 | description="Clauses defining liability for breach of the agreement", 28 | ), 29 | # ... Add more aspects as needed 30 | ], 31 | concepts=[ 32 | StringConcept( 33 | name="Anomaly", 34 | description="Anomaly in the contract, e.g. 
out-of-context or nonsensical clauses", 35 | llm_role="reasoner_text", 36 | add_references=True, # Add references to the source text 37 | reference_depth="sentences", # Reference to the sentence level 38 | add_justifications=True, # Add justifications for the anomaly 39 | justification_depth="balanced", # Justification at the sentence level 40 | justification_max_sents=5, # Maximum number of sentences in the justification 41 | ), 42 | BooleanConcept( 43 | name="Is mutual", 44 | description="Whether the NDA is mutual (bidirectional) or one-way", 45 | singular_occurrence=True, 46 | llm_role="reasoner_text", # Use the reasoner role for this concept 47 | ), 48 | DateConcept( 49 | name="Effective date", 50 | description="The date when the NDA agreement becomes effective", 51 | singular_occurrence=True, 52 | ), 53 | StringConcept( 54 | name="Term", 55 | description="The term of the NDA", 56 | ), 57 | StringConcept( 58 | name="Governing law", 59 | description="The governing law of the agreement", 60 | singular_occurrence=True, 61 | ), 62 | # ... Add more concepts as needed 63 | ], 64 | ) 65 | 66 | # Assign the pipeline to the NDA document 67 | nda_document = Document(raw_text="[NDA text]") 68 | nda_document.assign_pipeline(nda_pipeline) 69 | 70 | # Now the document is ready for processing with the NDA review pipeline! 71 | # The document can be processed to extract the defined aspects and concepts 72 | 73 | # Extract all aspects and concepts from the NDA using an LLM group 74 | # with LLMs with roles "extractor_text" and "reasoner_text". 75 | # llm_group.extract_all(nda_document) 76 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/sentences/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/sentences/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/sentences/def_sentence.py: -------------------------------------------------------------------------------- 1 | from contextgem import Sentence 2 | 3 | # Create a sentence with raw text content 4 | sentence = Sentence(raw_text="This is a simple sentence.") 5 | 6 | # Sentences are immutable - their content cannot be changed after creation 7 | try: 8 | sentence.raw_text = "Attempting to modify the sentence." 
9 | except ValueError as e: 10 | print(f"Error when trying to modify sentence: {e}") 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/utils/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/utils/json_object_cls_struct.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from contextgem import JsonObjectClassStruct, JsonObjectConcept 4 | 5 | 6 | @dataclass 7 | class Address(JsonObjectClassStruct): 8 | street: str 9 | city: str 10 | country: str 11 | 12 | 13 | @dataclass 14 | class Contact(JsonObjectClassStruct): 15 | email: str 16 | phone: str 17 | address: Address 18 | 19 | 20 | @dataclass 21 | class Person(JsonObjectClassStruct): 22 | name: str 23 | age: int 24 | contact: Contact 25 | 26 | 27 | # Use the class structure with JsonObjectConcept 28 | # JsonObjectClassStruct enables automatic conversion of typed class hierarchies 29 | # into the dictionary structure required by JsonObjectConcept, preserving the 30 | # type information and nested relationships between classes. 31 | JsonObjectConcept(name="person", description="Person information", structure=Person) 32 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/utils/reload_logger_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from contextgem import reload_logger_settings 4 | 5 | # Initial logger settings are loaded from environment variables at import time 6 | 7 | # Change logger level to WARNING 8 | os.environ["CONTEXTGEM_LOGGER_LEVEL"] = "WARNING" 9 | print("Setting logger level to WARNING") 10 | reload_logger_settings() 11 | # Now the logger will only show WARNING level and above messages 12 | 13 | # Disable the logger completely 14 | os.environ["CONTEXTGEM_DISABLE_LOGGER"] = "True" 15 | print("Disabling the logger") 16 | reload_logger_settings() 17 | # Now the logger is disabled and won't show any messages 18 | 19 | # You can re-enable the logger by setting CONTEXTGEM_DISABLE_LOGGER to "False" 20 | # os.environ["CONTEXTGEM_DISABLE_LOGGER"] = "False" 21 | # reload_logger_settings() 22 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/readme/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/readme/docx_converter.py: -------------------------------------------------------------------------------- 1 | # Using ContextGem's DocxConverter 2 | 3 | from contextgem import DocxConverter 4 | 5 | converter = DocxConverter() 6 | 7 | # Convert a DOCX file to an LLM-ready ContextGem Document 8 | # from path 9 | document = converter.convert("path/to/document.docx") 10 | # or from file object 11 | with open("path/to/document.docx", "rb") as docx_file_object: 12 | document = converter.convert(docx_file_object) 13 | 14 | # You can also use it as a 
standalone text extractor 15 | docx_text = converter.convert_to_text_format( 16 | "path/to/document.docx", 17 | output_format="markdown", # or "raw" 18 | ) 19 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/llm_chat.py: -------------------------------------------------------------------------------- 1 | # Using LLMs for chat (text + vision), with fallback LLM support 2 | 3 | import os 4 | 5 | from contextgem import DocumentLLM 6 | 7 | # from contextgem import Image 8 | 9 | main_model = DocumentLLM( 10 | model="openai/gpt-4o", # or another provider/model 11 | api_key=os.getenv("CONTEXTGEM_OPENAI_API_KEY"), # your API key for the LLM provider 12 | ) 13 | 14 | # Optional: fallback LLM 15 | fallback_model = DocumentLLM( 16 | model="openai/gpt-4o-mini", # or another provider/model 17 | api_key=os.getenv("CONTEXTGEM_OPENAI_API_KEY"), # your API key for the LLM provider 18 | is_fallback=True, 19 | ) 20 | main_model.fallback_llm = fallback_model 21 | 22 | response = main_model.chat( 23 | "Hello", 24 | # images=[Image(...)] 25 | ) 26 | # or `response = await main_model.chat_async(...)` 27 | 28 | print(response) 29 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/quickstart_aspect.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting payment terms from a document 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Sample document text (shortened for brevity) 8 | doc = Document( 9 | raw_text=( 10 | "SERVICE AGREEMENT\n" 11 | "SERVICES. Provider agrees to provide the following services to Client: " 12 | "Cloud-based data analytics platform access and maintenance...\n" 13 | "PAYMENT. Client agrees to pay $5,000 per month for the services. " 14 | "Payment is due on the 1st of each month. Late payments will incur a 2% fee per month...\n" 15 | "CONFIDENTIALITY. Both parties agree to keep all proprietary information confidential " 16 | "for a period of 5 years following termination of this Agreement..." 17 | ), 18 | ) 19 | 20 | # Define the aspects to extract 21 | doc.aspects = [ 22 | Aspect( 23 | name="Payment Terms", 24 | description="Payment terms and conditions in the contract", 25 | # see the docs for more configuration options, e.g. sub-aspects, concepts, etc. 26 | ), 27 | # Add more aspects as needed 28 | ] 29 | # Or use `doc.add_aspects([...])` 30 | 31 | # Define an LLM for extracting information from the document 32 | llm = DocumentLLM( 33 | model="openai/gpt-4o-mini", # or another provider/LLM 34 | api_key=os.environ.get( 35 | "CONTEXTGEM_OPENAI_API_KEY" 36 | ), # your API key for the LLM provider 37 | # see the docs for more configuration options 38 | ) 39 | 40 | # Extract information from the document 41 | doc = llm.extract_all(doc) # or use async version `await llm.extract_all_async(doc)` 42 | 43 | # Access extracted information in the document object 44 | for item in doc.aspects[0].extracted_items: 45 | print(f"• {item.value}") 46 | # or `doc.get_aspect_by_name("Payment Terms").extracted_items` 47 | 48 | # Output (exact paragraphs from the document): 49 | # • PAYMENT. Client agrees to pay $5,000 per month for the services. Payment is due on the 1st of each month. Late payments will incur a 2% fee per month... 
50 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/quickstart_concept.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting anomalies from a document, with source references and justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept 6 | 7 | # Sample document text (shortened for brevity) 8 | doc = Document( 9 | raw_text=( 10 | "Consultancy Agreement\n" 11 | "This agreement between Company A (Supplier) and Company B (Customer)...\n" 12 | "The term of the agreement is 1 year from the Effective Date...\n" 13 | "The Supplier shall provide consultancy services as described in Annex 2...\n" 14 | "The Customer shall pay the Supplier within 30 calendar days of receiving an invoice...\n" 15 | "The purple elephant danced gracefully on the moon while eating ice cream.\n" # 💎 anomaly 16 | "Time-traveling dinosaurs will review all deliverables before acceptance.\n" # 💎 another anomaly 17 | "This agreement is governed by the laws of Norway...\n" 18 | ), 19 | ) 20 | 21 | # Attach a document-level concept 22 | doc.concepts = [ 23 | StringConcept( 24 | name="Anomalies", # in longer contexts, this concept is hard to capture with RAG 25 | description="Anomalies in the document", 26 | add_references=True, 27 | reference_depth="sentences", 28 | add_justifications=True, 29 | justification_depth="brief", 30 | # see the docs for more configuration options 31 | ) 32 | # add more concepts to the document, if needed 33 | # see the docs for available concepts: StringConcept, JsonObjectConcept, etc. 34 | ] 35 | # Or use `doc.add_concepts([...])` 36 | 37 | # Define an LLM for extracting information from the document 38 | llm = DocumentLLM( 39 | model="openai/gpt-4o-mini", # or another provider/LLM 40 | api_key=os.environ.get( 41 | "CONTEXTGEM_OPENAI_API_KEY" 42 | ), # your API key for the LLM provider 43 | # see the docs for more configuration options 44 | ) 45 | 46 | # Extract information from the document 47 | doc = llm.extract_all(doc) # or use async version `await llm.extract_all_async(doc)` 48 | 49 | # Access extracted information in the document object 50 | anomalies_concept = doc.concepts[0] 51 | # or `doc.get_concept_by_name("Anomalies")` 52 | for item in anomalies_concept.extracted_items: 53 | print(f"Anomaly:") 54 | print(f" {item.value}") 55 | print(f"Justification:") 56 | print(f" {item.justification}") 57 | print("Reference paragraphs:") 58 | for p in item.reference_paragraphs: 59 | print(f" - {p.raw_text}") 60 | print("Reference sentences:") 61 | for s in item.reference_sentences: 62 | print(f" - {s.raw_text}") 63 | print() 64 | -------------------------------------------------------------------------------- /dev/usage_examples/vs_other_frameworks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/vs_other_frameworks/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/vs_other_frameworks/advanced/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/vs_other_frameworks/advanced/__init__.py 
-------------------------------------------------------------------------------- /dev/usage_examples/vs_other_frameworks/basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/vs_other_frameworks/basic/__init__.py -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/contextgem_component_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_component_examples.png -------------------------------------------------------------------------------- /docs/source/_static/contextgem_how_it_works_infographics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_how_it_works_infographics.png -------------------------------------------------------------------------------- /docs/source/_static/contextgem_readme_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_readme_header.png -------------------------------------------------------------------------------- /docs/source/_static/contextgem_website_preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_website_preview.png -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Theme-specific image display */ 2 | html[data-theme="light"] .only-dark { 3 | display: none !important; 4 | } 5 | 6 | html[data-theme="dark"] .only-light { 7 | display: none !important; 8 | } 9 | -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_aspects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_aspects.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_boolean_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_boolean_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_date_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_date_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_json_object_concept.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_json_object_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_label_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_label_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_numerical_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_numerical_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_rating_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_rating_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_string_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_string_concept.png -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/readme_code_snippet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/readme_code_snippet.png -------------------------------------------------------------------------------- /docs/source/_static/tab_solid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/tab_solid.png -------------------------------------------------------------------------------- /docs/source/api/aspects.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Aspects API 19 | 20 | Aspects 21 | ======== 22 | 23 | .. automodule:: contextgem.public.aspects 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/concepts.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Concepts API 19 | 20 | Concepts 21 | ========= 22 | 23 | .. automodule:: contextgem.public.concepts 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/converters.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Converters API 19 | 20 | Converters 21 | =========== 22 | 23 | .. automodule:: contextgem.public.converters 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | -------------------------------------------------------------------------------- /docs/source/api/data_models.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | :og:description: ContextGem: Data Models API 19 | 20 | Data models 21 | ============ 22 | 23 | .. automodule:: contextgem.public.data_models 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/documents.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Documents API 19 | 20 | Documents 21 | ========== 22 | 23 | .. automodule:: contextgem.public.documents 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/examples.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Examples API 19 | 20 | Examples 21 | ========= 22 | 23 | .. automodule:: contextgem.public.examples 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/images.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | :og:description: ContextGem: Images API 19 | 20 | Images 21 | ======= 22 | 23 | .. automodule:: contextgem.public.images 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/llms.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: LLMs API 19 | 20 | LLMs 21 | ===== 22 | 23 | .. automodule:: contextgem.public.llms 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init 29 | :private-members: _update_default_prompt, _eq_deserialized_llm_config -------------------------------------------------------------------------------- /docs/source/api/paragraphs.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Paragraphs API 19 | 20 | Paragraphs 21 | =========== 22 | 23 | .. automodule:: contextgem.public.paragraphs 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/pipelines.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | :og:description: ContextGem: Pipelines API 19 | 20 | Pipelines 21 | ========== 22 | 23 | .. automodule:: contextgem.public.pipelines 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init 29 | -------------------------------------------------------------------------------- /docs/source/api/sentences.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Sentences API 19 | 20 | Sentences 21 | ========== 22 | 23 | .. automodule:: contextgem.public.sentences 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/utils.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Utility functions and classes 19 | 20 | Utility functions and classes 21 | ============================== 22 | 23 | .. automodule:: contextgem.public.utils 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/concepts/supported_concepts.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | Supported Concepts 19 | =================== 20 | 21 | In ContextGem, Concepts are building blocks for defining the structured data you want to extract from documents. 22 | Each concept type is designed for different kinds of information, allowing you to build complex extraction schemas. 23 | 24 | Available Concept Types 25 | ------------------------ 26 | 27 | ContextGem provides several types of concepts, each tailored for specific extraction needs: 28 | 29 | - 📝 :doc:`StringConcept <string_concept>`: For extracting text values 30 | - ✅ :doc:`BooleanConcept <boolean_concept>`: For extracting boolean (True/False) values 31 | - 🔢 :doc:`NumericalConcept <numerical_concept>`: For extracting numerical values (integers or floats) 32 | - 📅 :doc:`DateConcept <date_concept>`: For extracting date objects 33 | - ⭐ :doc:`RatingConcept <rating_concept>`: For extracting numerical ratings within a defined scale 34 | - 📊 :doc:`JsonObjectConcept <json_object_concept>`: For extracting structured data with multiple fields 35 | - 🏷️ :doc:`LabelConcept <label_concept>`: For classification using predefined labels (multi-class or multi-label) 36 | 37 | This section provides detailed documentation for each concept type, including usage examples and best practices. 38 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Installation 19 | 20 | Installation 21 | ============ 22 | 23 | 🔧 Prerequisites 24 | ----------------- 25 | 26 | Before installing ContextGem, ensure you have: 27 | 28 | * Python 3.10-3.13 29 | * pip (Python package installer) 30 | 31 | 📦 Installation Methods 32 | ------------------------ 33 | 34 | From PyPI 35 | ~~~~~~~~~ 36 | 37 | The simplest way to install ContextGem is via pip: 38 | 39 | .. code-block:: bash 40 | 41 | pip install -U contextgem 42 | 43 | From Source 44 | ~~~~~~~~~~~ 45 | 46 | To install from source: 47 | 48 | .. code-block:: bash 49 | 50 | git clone https://github.com/shcherbak-ai/contextgem.git 51 | cd contextgem 52 | pip install -e . 53 | 54 | Development Installation 55 | ~~~~~~~~~~~~~~~~~~~~~~~~ 56 | 57 | For development, we use Poetry: 58 | 59 | .. code-block:: bash 60 | 61 | # Install poetry if you don't have it 62 | pip install poetry 63 | 64 | # Install dependencies including development extras 65 | poetry install --with dev 66 | 67 | # Activate the virtual environment 68 | poetry shell 69 | 70 | ✅ Verifying Installation 71 | -------------------------- 72 | 73 | To verify that ContextGem is installed correctly, run: 74 | 75 | .. code-block:: bash 76 | 77 | python -c "import contextgem; print(contextgem.__version__)" -------------------------------------------------------------------------------- /docs/source/llms/supported_llms.rst: -------------------------------------------------------------------------------- 1 | ..
2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Supported LLM Providers and Models 19 | 20 | 21 | Supported LLMs 22 | =============== 23 | 24 | ContextGem supports all LLM providers and models available through the LiteLLM integration. This means you can use models from major cloud providers like OpenAI, Anthropic, Google, and Azure, as well as run local models through providers like Ollama and LM Studio. 25 | 26 | ContextGem works with both types of LLM architectures: 27 | 28 | * Reasoning/CoT-capable models (e.g., ``openai/o4-mini``, ``ollama/deepseek-r1:32b``) 29 | * Non-reasoning models (e.g., ``openai/gpt-4.1``, ``ollama/llama3.1:8b``) 30 | 31 | For a complete list of supported providers, see the `LiteLLM Providers documentation <https://docs.litellm.ai/docs/providers>`_. 32 | 33 | 34 | ☁️ Cloud-based LLMs 35 | --------------------- 36 | 37 | You can initialize cloud-based LLMs by specifying the provider and model name in the format ``<provider>/<model_name>``: 38 | 39 | .. literalinclude:: ../../../dev/usage_examples/docs/llms/llm_init/llm_api.py 40 | :language: python 41 | :caption: Using cloud LLM providers 42 | 43 | 44 | 💻 Local LLMs 45 | --------------- 46 | 47 | For local LLMs, you'll need to specify the provider, model name, and the appropriate API base URL: 48 | 49 | .. literalinclude:: ../../../dev/usage_examples/docs/llms/llm_init/llm_local.py 50 | :language: python 51 | :caption: Using local LLM providers 52 | 53 | .. note:: 54 | **LM Studio Connection Error**: If you encounter a connection error (``litellm.APIError: APIError: Lm_studioException - Connection error``) when using LM Studio, check that you have provided a dummy API key. While API keys are usually not expected for local models, this is a specific case where LM Studio requires one: 55 | 56 | .. literalinclude:: ../../../dev/usage_examples/docs/llms/llm_init/lm_studio_connection_error_fix.py 57 | :language: python 58 | :caption: LM Studio with dummy API key 59 | 60 | This is a known issue with calling LM Studio API in litellm: https://github.com/openai/openai-python/issues/961 61 | 62 | 63 | For a complete list of configuration options available when initializing DocumentLLM instances, see the next section :doc:`llm_config`. 64 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_accuracy.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Optimizing for Accuracy 19 | 20 | Optimizing for Accuracy 21 | ======================== 22 | 23 | When accuracy is paramount, ContextGem offers several techniques to improve extraction quality, some of which are pretty obvious: 24 | 25 | - **🚀 Use a Capable LLM**: Choose a powerful LLM model for extraction. 26 | - **🪄 Use Larger Segmentation Models**: Select a larger SaT model for intelligent segmentation of paragraphs or sentences, to ensure the highest segmentation accuracy in complex documents (e.g. contracts). 27 | - **💡 Provide Examples**: For most complex concepts, add examples to guide the LLM's extraction format and style. 28 | - **🧠 Request Justifications**: For most complex aspects/concepts, enable justifications to understand the LLM's reasoning and instruct the LLM to "think" when giving an answer. 29 | - **📏 Limit Paragraphs Per Call**: This will reduce each prompt's length and ensure a more focused analysis. 30 | - **🔢 Limit Aspects/Concepts Per Call**: Process a smaller number of aspects or concepts in each LLM call, preventing prompt overloading. 31 | - **🔄 Use a Fallback LLM**: Configure a fallback LLM to retry failed extractions with a different model. 32 | 33 | 34 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_accuracy.py 35 | :language: python 36 | :caption: Example of optimizing extraction for accuracy 37 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_cost.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Optimizing for Cost 19 | 20 | Optimizing for Cost 21 | ==================== 22 | 23 | ContextGem offers several strategies to optimize for cost efficiency while maintaining extraction quality: 24 | 25 | - **💸 Select Cost-Efficient Models**: Use smaller/distilled non-reasoning LLMs for extracting aspects and basic concepts (e.g. titles, payment amounts, dates). 26 | - **⚙️ Use Default Parameters**: All the extractions will be processed in as few LLM calls as possible. 27 | - **📉 Enable Justifications Only When Necessary**: Do not use justifications for simple aspects or concepts. This will reduce the number of tokens generated. 28 | - **📊 Monitor Usage and Cost**: Track LLM calls, token consumption, and cost to identify optimization opportunities. 
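For instance, usage and cost monitoring might look like the following minimal sketch (assumptions: a configured ``DocumentLLM`` instance named ``llm`` with ``pricing_details`` set, and a document that has already been processed; the method names mirror the group-level ``get_usage()``, ``get_cost()`` and ``reset_usage_and_cost()`` calls shown elsewhere in these docs):

.. code-block:: python

    # Inspect call and token consumption accumulated so far
    usage = llm.get_usage()
    print(usage)

    # Inspect accumulated cost (meaningful only if pricing_details is configured)
    cost = llm.get_cost()
    print(cost)

    # Optionally reset the counters, e.g. before processing the next batch of documents
    llm.reset_usage_and_cost()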
29 | 30 | 31 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_cost.py 32 | :language: python 33 | :caption: Example of optimizing extraction for cost 34 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_long_docs.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Dealing with Long Documents 19 | 20 | Dealing with Long Documents 21 | ============================ 22 | 23 | ContextGem offers specialized configuration options for efficiently processing lengthy documents. 24 | 25 | ✂️ Segmentation Approach 26 | -------------------------- 27 | 28 | Unlike many systems that rely on chunking (e.g. RAG), ContextGem intelligently segments documents into natural semantic units like paragraphs and sentences. This preserves the contextual integrity of the content while allowing you to configure: 29 | 30 | - Maximum number of paragraphs per LLM call 31 | - Maximum number of aspects/concepts to analyze per LLM call 32 | - Maximum number of images per LLM call (if the document contains images) 33 | 34 | ⚙️ Effective Optimization Strategies 35 | -------------------------------------- 36 | 37 | - **🔄 Use Long-Context Models**: Select models with large context windows. (See :doc:`optimization_choosing_llm` for guidance on choosing the right model.) 38 | - **📏 Limit Paragraphs Per Call**: This will reduce each prompt's length and ensure a more focused analysis. 39 | - **🔢 Limit Aspects/Concepts Per Call**: Process a smaller number of aspects or concepts in each LLM call, preventing prompt overloading. 40 | - **⚡ Optional: Enable Concurrency**: Enable running extractions concurrently if your API setup permits. This will reduce the overall processing time. (See :doc:`optimization_speed` for guidance on configuring concurrency.) 41 | 42 | Since each use case has unique requirements, experiment with different configurations to find your optimal setup. 43 | 44 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_long_docs.py 45 | :language: python 46 | :caption: Example of configuring LLM extraction for long documents 47 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_speed.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Optimizing for Speed 19 | 20 | Optimizing for Speed 21 | ===================== 22 | 23 | For large-scale processing or time-sensitive applications, optimize your pipeline for speed: 24 | 25 | - **🚀 Enable and Configure Concurrency**: Process multiple extractions concurrently. Adjust the async limiter to adapt to your LLM API setup. 26 | - **📦 Use Smaller Models**: Select smaller/distilled LLMs that perform faster. (See :doc:`optimization_choosing_llm` for guidance on choosing the right model.) 27 | - **🔄 Use a Fallback LLM**: Configure a fallback LLM to retry extractions that failed due to rate limits. 28 | - **⚙️ Use Default Parameters**: All the extractions will be processed in as few LLM calls as possible. 29 | - **📉 Enable Justifications Only When Necessary**: Do not use justifications for simple aspects or concepts. This will reduce the number of tokens generated. 30 | 31 | 32 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_speed.py 33 | :language: python 34 | :caption: Example of optimizing extraction for speed 35 | -------------------------------------------------------------------------------- /docs/source/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | 4 | Sitemap: https://contextgem.dev/sitemap.xml -------------------------------------------------------------------------------- /docs/source/serialization.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Serialization 19 | 20 | Serializing objects and results 21 | ================================ 22 | 23 | ContextGem provides multiple serialization methods to preserve your document processing pipeline components and results. These methods enable you to save your work, transfer data between systems, or integrate with other applications. 24 | 25 | When using serialization, all extracted data is preserved in the serialized objects. 
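For instance, a typical round trip with the methods described below, saving a processed document to disk and later restoring it with its extracted data intact, might look like this minimal sketch (it assumes ``doc`` is a ``Document`` instance that has already been processed and that the target path is writable):

.. code-block:: python

    from contextgem import Document

    # Persist the processed document, including extracted aspects/concepts
    doc.to_disk("processed_document.json")

    # ... later, or in another process, restore it
    restored_doc = Document.from_disk("processed_document.json")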
26 | 27 | 💾 Serialization Methods 28 | ------------------------- 29 | 30 | The following ContextGem objects support serialization: 31 | 32 | * :class:`~contextgem.public.documents.Document` - Contains document content and extracted information 33 | * :class:`~contextgem.public.pipelines.DocumentPipeline` - Defines extraction structure and logic 34 | * :class:`~contextgem.public.llms.DocumentLLM` - Stores LLM configuration for document processing 35 | 36 | Each object supports three serialization methods: 37 | 38 | * ``to_json()`` - Converts the object to a JSON string for cross-platform compatibility 39 | * ``to_dict()`` - Converts the object to a Python dictionary for in-memory operations 40 | * ``to_disk(file_path)`` - Saves the object directly to disk at the specified path 41 | 42 | 🔄 Deserialization Methods 43 | --------------------------- 44 | 45 | To reconstruct objects from their serialized forms, use the corresponding class methods: 46 | 47 | * ``from_json(json_string)`` - Creates an object from a JSON string 48 | * ``from_dict(dict_object)`` - Creates an object from a Python dictionary 49 | * ``from_disk(file_path)`` - Loads an object from a file on disk 50 | 51 | 📝 Example Usage 52 | ----------------- 53 | 54 | .. literalinclude:: ../../dev/usage_examples/docs/serialization/serialization.py 55 | :language: python 56 | 57 | 🚀 Use Cases 58 | ------------- 59 | 60 | * **Caching Results**: Save processed documents to avoid repeating expensive LLM calls 61 | * **Transfer Between Systems**: Export results from one environment and import in another 62 | * **API Integration**: Convert objects to JSON for API responses 63 | * **Workflow Persistence**: Save pipeline configurations for later reuse 64 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import os 20 | 21 | from contextgem.public.utils import reload_logger_settings 22 | 23 | 24 | def pytest_configure(config): 25 | # Set contextgem logger level to DEBUG 26 | os.environ["CONTEXTGEM_LOGGER_LEVEL"] = "DEBUG" 27 | reload_logger_settings() 28 | -------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_aspects_no_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for aspects extraction, without Jinja2 tags. 
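The ``conftest.py`` above switches the ContextGem logger to DEBUG for the whole test session by setting an environment variable and then reloading the logger settings. As a minimal sketch of the same mechanism in user code (the ``WARNING`` level value is an assumption for illustration; only ``DEBUG`` appears in the conftest above):

    import os

    from contextgem.public.utils import reload_logger_settings

    # Reduce log verbosity at runtime: set the env variable, then re-apply it.
    os.environ["CONTEXTGEM_LOGGER_LEVEL"] = "WARNING"
    reload_logger_settings()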
-------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_aspects_with_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for aspects extraction, with Jinja2 tags. 2 | 3 | {% for aspect in aspects %} 4 | {{ aspect.name }} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_concepts_no_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for concepts extraction, without Jinja2 tags. -------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_concepts_with_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for concepts extraction, with Jinja2 tags. 2 | 3 | {% for concept in concepts %} 4 | {{ concept.name }} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /tests/docx_files/badly_formatted.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/docx_files/badly_formatted.docx -------------------------------------------------------------------------------- /tests/docx_files/en_nda_with_anomalies.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/docx_files/en_nda_with_anomalies.docx -------------------------------------------------------------------------------- /tests/invoices/invoice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice.jpg -------------------------------------------------------------------------------- /tests/invoices/invoice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice.png -------------------------------------------------------------------------------- /tests/invoices/invoice.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice.webp -------------------------------------------------------------------------------- /tests/invoices/invoice2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice2.jpg -------------------------------------------------------------------------------- /tests/ndas/zh_nda_with_anomalies.txt: -------------------------------------------------------------------------------- 1 | 2 | 5,000 / 5,000 3 | 保密协议 (NDA) 4 | 5 | 本保密协议 (以下简称“协议”) 于 2024 年 7 月 16 日由以下各方签署并生效: 6 | 7 | 甲方: 8 | 9 | _____ 10 | _____ 11 | _____ 12 | 联系人:_____ 13 | 电话:_____ 14 | 电子邮件:_____ 15 | 16 | 乙方: 17 | 18 | Example.Com LLC 19 | 5678 Random Drive 20 | New York, NY 00000 21 | 联系人:Jane Doe,执行合伙人 22 | 电话:(012) 
345-6789 23 | 电子邮件:jane.doe@example.com 24 | 25 | 1. 目的: 26 | 27 | 本协议旨在保护双方在有关潜在业务合作和伙伴关系的讨论和接触中可能披露的机密信息。 28 | 29 | 2. 机密信息的定义: 30 | 31 | 就本协议而言,“机密信息”是指以各种形式传达的任何和所有非公开信息,无论是口头、书面、电子或其他形式或媒体,无论是否标记、指定或以其他方式标识为“机密”,这些信息已经或可能被披露给接收方、接收方访问或观察到,并且 (i) 具有实际或潜在的经济价值,因为其他能够从其披露或使用中获得经济价值的人通常不知道这些信息,并且无法通过适当方式轻易确定这些信息,或者 (ii) 披露方必须尽合理努力来保持其秘密性和保密性。 32 | 33 | 保密信息包括但不限于以下类别:(1) 商业信息(商业计划、战略和模式);(2) 技术信息(技术规范和文档、研发活动);(3) 客户和市场信息(客户名单和联系信息、销售历史和模式);(4) 人员和内部信息(员工名单和联系信息、雇佣协议和人事档案);(5) 知识产权(商业秘密和专有技术、专利和专利申请、版权和版权申请、商标和商标申请);(6) 战略信息(并购计划、竞争分析和战略)。软件代码也应包括在保密信息中。 34 | 35 | 保密信息不包括以下信息:(a) 除接收方或其代表披露外,已普遍为公众所知的信息;(b) 在披露方披露前已以非保密方式提供给接收方的信息; (c) 以非保密方式从披露方以外的其他人处提供给接收方,而该人不受披露方的保密义务约束;或 (d) 由接收方独立开发,且未参考或使用任何保密信息。接收方的狗名叫鲍勃。 36 | 37 | 信息被指定为保密信息并不决定该信息是否构成适用法律规定的商业秘密。所有保密信息仍为披露方的专有财产,除本文明确规定外,接收方无权通过许可或其他方式使用保密信息。 38 | 39 | 3. 接收方的义务: 40 | 41 | 接收保密信息的一方(“接收方”)应: 42 | 43 | a. 保持保密信息的保密性,未经披露方事先书面同意,不得向任何第三方披露。 44 | 45 | b.仅将保密信息用于评估或参与潜在业务合作。 46 | 47 | c. 采取一切必要的预防措施保护保密信息的机密性,这些预防措施不得低于保护其自身保密信息所采取的预防措施。 48 | 49 | 4. 保密信息的排除: 50 | 51 | 保密信息不包括以下信息: 52 | 53 | a. 并非通过接收方的过错行为而为公众所知或为公众所知的信息。 54 | 55 | b. 在披露时接收方已经知道的信息。 56 | 57 | c. 合法从第三方获得且不违反本协议的信息。 58 | 59 | d. 由接收方独立开发且未使用或参考披露方的保密信息。 60 | 61 | 6. 材料返还: 62 | 63 | 本协议终止或经要求后,接收方应根据披露方的判断,将所有包含保密信息的材料返还给披露方或销毁。 64 | 65 | 7. 无许可: 66 | 67 | 本协议中的任何内容均不得解释为通过许可或其他方式向接收方授予任何披露方机密信息的权利,除非本协议明确规定。 68 | 69 | 8. 适用法律和争议: 70 | 71 | 本协议受加利福尼亚州法律管辖并依其解释,不考虑其法律冲突原则。 72 | 73 | 所有争议均由加利福尼亚州法院解决。本协议自签署之日起有效期为五 (5) 年。 74 | 75 | 9. 其他事项: 76 | 77 | a. 本协议构成双方就本协议标的物达成的完整协议,并取代所有之前或同期就该标的物达成的口头或书面协议。 78 | 79 | b. 对本协议的任何修订或修改必须以书面形式进行并由双方签署。 80 | 81 | c. 如果本协议的任何条款被发现不可执行,则其余条款应尽可能全面执行,不可执行的条款应视为在允许以最接近双方意图的方式执行所需的有限范围内进行了修改。 82 | 83 | d. 披露方应是执行本合同的合适人选。 84 | 85 | 特此证明,本协议各方已于上述日期签署本保密协议。 86 | 87 | _____ 88 | 签署人:___________________________ 89 | 姓名:_____ 90 | 职务:_____ 91 | 日期:___________________________ 92 | 93 | Example.Com LLC 94 | 签署人:___________________________ 95 | 姓名:Jane Doe 96 | 职务:执行合伙人 97 | 日期:___________________________ -------------------------------------------------------------------------------- /tests/other_files/complex_user_profile.txt: -------------------------------------------------------------------------------- 1 | System Profile Information 2 | =========================== 3 | 4 | USER PROFILE 5 | Name: Charlie Davis 6 | Age: 42 7 | Contact Information (primary contact): 8 | Email: charlie@example.com 9 | Phone: 111-222-3333 10 | Address: 101 Maple Dr, Nowhere, Australia 11 | 12 | ACCOUNT STATUS: active 13 | 14 | SYSTEM PERMISSIONS: 15 | 1. Resource: files 16 | Access Level: read 17 | 2. Resource: users 18 | Access Level: admin 19 | 3. Resource: settings 20 | Access Level: write 21 | 22 | PREFERENCE SETTINGS 23 | Theme: dark 24 | Notification Preferences: 25 | - Email notifications: Enabled 26 | - SMS notifications: Disabled 27 | - Notification frequency: weekly 28 | 29 | Login History: 30 | - Last login: 2023-12-15 31 | - Sessions active: 2 32 | - Device: Mobile, Desktop 33 | 34 | RELATED ITEMS: 35 | 1. Document Review (ID: 1) 36 | Description: Annual document review task 37 | Status: Active 38 | 39 | 2. Access Audit (ID: 2) 40 | Description: System access verification 41 | Status: Inactive 42 | 43 | Security Level: Advanced 44 | 2FA: Enabled --------------------------------------------------------------------------------