├── .github ├── CONTRIBUTOR_AGREEMENT.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ ├── documentation_improvement.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── contributors │ └── .gitkeep └── workflows │ ├── README.md │ ├── ci-tests.yml │ ├── codeql.yml │ ├── contributor-agreement-check.yml │ ├── daily-import-test.yml │ └── docs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── SECURITY.md ├── contextgem ├── __init__.py ├── internal │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── attrs.py │ │ ├── concepts.py │ │ ├── examples.py │ │ ├── instances.py │ │ ├── items.py │ │ ├── llms.py │ │ ├── mixins.py │ │ ├── paras_and_sents.py │ │ └── serialization.py │ ├── converters │ │ ├── __init__.py │ │ └── docx │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── exceptions.py │ │ │ ├── namespaces.py │ │ │ └── package.py │ ├── data_models.py │ ├── decorators.py │ ├── items.py │ ├── llm_output_structs │ │ ├── __init__.py │ │ ├── aspect_structs.py │ │ ├── concept_structs.py │ │ └── utils.py │ ├── loggers.py │ ├── prompts │ │ ├── extract_aspect_items.j2 │ │ └── extract_concept_items.j2 │ ├── system │ │ └── default_system_message.j2 │ ├── typings │ │ ├── __init__.py │ │ ├── aliases.py │ │ ├── strings_to_types.py │ │ ├── typed_class_utils.py │ │ ├── types_normalization.py │ │ ├── types_to_strings.py │ │ └── user_type_hints_validation.py │ └── utils.py └── public │ ├── __init__.py │ ├── aspects.py │ ├── concepts.py │ ├── converters │ ├── __init__.py │ └── docx.py │ ├── data_models.py │ ├── documents.py │ ├── examples.py │ ├── images.py │ ├── llms.py │ ├── paragraphs.py │ ├── pipelines.py │ ├── sentences.py │ └── utils.py ├── dev ├── __init__.py ├── content_snippets │ └── feature_table.html ├── generate_notebooks.py ├── notebooks │ ├── docs │ │ ├── advanced │ │ │ ├── advanced_aspects_and_concepts_document.ipynb │ │ │ ├── advanced_aspects_with_concepts.ipynb │ │ │ └── advanced_multiple_docs_pipeline.ipynb │ │ ├── aspects │ │ │ ├── aspect_with_concepts.ipynb │ │ │ ├── aspect_with_justifications.ipynb │ │ │ ├── aspect_with_sub_aspects.ipynb │ │ │ ├── basic_aspect.ipynb │ │ │ └── complex_hierarchy.ipynb │ │ ├── concepts │ │ │ ├── boolean_concept │ │ │ │ ├── boolean_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── date_concept │ │ │ │ ├── date_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── json_object_concept │ │ │ │ ├── adding_examples.ipynb │ │ │ │ ├── json_object_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── label_concept │ │ │ │ ├── document_aspect_analysis.ipynb │ │ │ │ ├── label_concept.ipynb │ │ │ │ ├── multi_label_classification.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── numerical_concept │ │ │ │ ├── numerical_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ ├── rating_concept │ │ │ │ ├── multiple_ratings.ipynb │ │ │ │ ├── rating_concept.ipynb │ │ │ │ └── refs_and_justifications.ipynb │ │ │ └── string_concept │ │ │ │ ├── adding_examples.ipynb │ │ │ │ ├── refs_and_justifications.ipynb │ │ │ │ └── string_concept.ipynb │ │ ├── llms │ │ │ └── llm_extraction_methods │ │ │ │ ├── extract_all.ipynb │ │ │ │ ├── extract_aspects_from_document.ipynb │ │ │ │ ├── extract_concepts_from_aspect.ipynb │ │ │ │ └── extract_concepts_from_document.ipynb │ │ └── quickstart │ │ │ ├── quickstart_aspect.ipynb │ │ │ ├── quickstart_concept_aspect.ipynb │ │ │ ├── quickstart_concept_document_text.ipynb │ 
│ │ ├── quickstart_concept_document_vision.ipynb │ │ │ └── quickstart_sub_aspect.ipynb │ └── readme │ │ ├── docx_converter.ipynb │ │ ├── llm_chat.ipynb │ │ ├── quickstart_aspect.ipynb │ │ └── quickstart_concept.ipynb ├── populate_project_readme.py ├── readme.template.md ├── requirements │ ├── requirements.dev.txt │ └── requirements.main.txt └── usage_examples │ ├── __init__.py │ ├── docs │ ├── __init__.py │ ├── advanced │ │ ├── __init__.py │ │ ├── advanced_aspects_and_concepts_document.py │ │ ├── advanced_aspects_with_concepts.py │ │ └── advanced_multiple_docs_pipeline.py │ ├── aspects │ │ ├── __init__.py │ │ ├── aspect_with_concepts.py │ │ ├── aspect_with_justifications.py │ │ ├── aspect_with_sub_aspects.py │ │ ├── basic_aspect.py │ │ └── complex_hierarchy.py │ ├── concepts │ │ ├── __init__.py │ │ ├── boolean_concept │ │ │ ├── __init__.py │ │ │ ├── boolean_concept.py │ │ │ └── refs_and_justifications.py │ │ ├── date_concept │ │ │ ├── __init__.py │ │ │ ├── date_concept.py │ │ │ └── refs_and_justifications.py │ │ ├── json_object_concept │ │ │ ├── __init__.py │ │ │ ├── adding_examples.py │ │ │ ├── json_object_concept.py │ │ │ ├── refs_and_justifications.py │ │ │ └── structure │ │ │ │ ├── __init__.py │ │ │ │ ├── nested_class_structure.py │ │ │ │ ├── nested_structure.py │ │ │ │ ├── simple_class_structure.py │ │ │ │ └── simple_structure.py │ │ ├── label_concept │ │ │ ├── __init__.py │ │ │ ├── document_aspect_analysis.py │ │ │ ├── label_concept.py │ │ │ ├── multi_label_classification.py │ │ │ └── refs_and_justifications.py │ │ ├── numerical_concept │ │ │ ├── __init__.py │ │ │ ├── numerical_concept.py │ │ │ └── refs_and_justifications.py │ │ ├── rating_concept │ │ │ ├── __init__.py │ │ │ ├── multiple_ratings.py │ │ │ ├── rating_concept.py │ │ │ └── refs_and_justifications.py │ │ └── string_concept │ │ │ ├── __init__.py │ │ │ ├── adding_examples.py │ │ │ ├── refs_and_justifications.py │ │ │ └── string_concept.py │ ├── llm_config │ │ ├── __init__.py │ │ ├── cost_tracking.py │ │ ├── detailed_usage.py │ │ ├── fallback_llm.py │ │ ├── llm_api.py │ │ ├── llm_group.py │ │ ├── llm_local.py │ │ ├── o1_o4.py │ │ └── tracking_usage_and_cost.py │ ├── llms │ │ ├── __init__.py │ │ ├── llm_extraction_methods │ │ │ ├── __init__.py │ │ │ ├── extract_all.py │ │ │ ├── extract_aspects_from_document.py │ │ │ ├── extract_concepts_from_aspect.py │ │ │ └── extract_concepts_from_document.py │ │ └── llm_init │ │ │ ├── __init__.py │ │ │ ├── llm_api.py │ │ │ ├── llm_local.py │ │ │ └── lm_studio_connection_error_fix.py │ ├── optimizations │ │ ├── __init__.py │ │ ├── optimization_accuracy.py │ │ ├── optimization_choosing_llm.py │ │ ├── optimization_cost.py │ │ ├── optimization_long_docs.py │ │ └── optimization_speed.py │ ├── quickstart │ │ ├── __init__.py │ │ ├── quickstart_aspect.py │ │ ├── quickstart_concept_aspect.py │ │ ├── quickstart_concept_document_text.py │ │ ├── quickstart_concept_document_vision.py │ │ └── quickstart_sub_aspect.py │ └── serialization │ │ ├── __init__.py │ │ └── serialization.py │ ├── docstrings │ ├── __init__.py │ ├── aspects │ │ ├── __init__.py │ │ └── def_aspect.py │ ├── concepts │ │ ├── __init__.py │ │ ├── def_boolean_concept.py │ │ ├── def_date_concept.py │ │ ├── def_json_object_concept.py │ │ ├── def_label_concept.py │ │ ├── def_numerical_concept.py │ │ ├── def_rating_concept.py │ │ └── def_string_concept.py │ ├── data_models │ │ ├── __init__.py │ │ ├── def_llm_pricing.py │ │ └── def_rating_scale.py │ ├── documents │ │ ├── __init__.py │ │ └── def_document.py │ ├── examples │ │ ├── __init__.py │ 
│ ├── def_example_json_object.py │ │ └── def_example_string.py │ ├── images │ │ ├── __init__.py │ │ └── def_image.py │ ├── llms │ │ ├── __init__.py │ │ ├── def_llm.py │ │ └── def_llm_group.py │ ├── paragraphs │ │ ├── __init__.py │ │ └── def_paragraph.py │ ├── pipelines │ │ ├── __init__.py │ │ └── def_pipeline.py │ ├── sentences │ │ ├── __init__.py │ │ └── def_sentence.py │ └── utils │ │ ├── __init__.py │ │ ├── json_object_cls_struct.py │ │ └── reload_logger_settings.py │ ├── readme │ ├── __init__.py │ ├── docx_converter.py │ ├── llm_chat.py │ ├── quickstart_aspect.py │ └── quickstart_concept.py │ └── vs_other_frameworks │ ├── __init__.py │ ├── advanced │ ├── __init__.py │ ├── instructor.py │ ├── langchain.py │ └── llama_index.py │ └── basic │ ├── __init__.py │ ├── instructor.py │ ├── langchain.py │ ├── llama_index.py │ └── llama_index_rag.py ├── docs ├── Makefile ├── build_raw_docs_for_llm.py ├── docs-raw-for-llm.txt ├── make.bat └── source │ ├── _static │ ├── contextgem_component_examples.png │ ├── contextgem_how_it_works_infographics.png │ ├── contextgem_readme_header.png │ ├── contextgem_website_preview.png │ ├── custom.css │ ├── docs_preview_image_aspects.png │ ├── docs_preview_image_boolean_concept.png │ ├── docs_preview_image_date_concept.png │ ├── docs_preview_image_json_object_concept.png │ ├── docs_preview_image_label_concept.png │ ├── docs_preview_image_numerical_concept.png │ ├── docs_preview_image_rating_concept.png │ ├── docs_preview_image_string_concept.png │ ├── favicon.ico │ ├── readme_code_snippet.png │ └── tab_solid.png │ ├── advanced_usage.rst │ ├── api │ ├── aspects.rst │ ├── concepts.rst │ ├── converters.rst │ ├── data_models.rst │ ├── documents.rst │ ├── examples.rst │ ├── images.rst │ ├── llms.rst │ ├── paragraphs.rst │ ├── pipelines.rst │ ├── sentences.rst │ └── utils.rst │ ├── aspects │ └── aspects.rst │ ├── concepts │ ├── boolean_concept.rst │ ├── date_concept.rst │ ├── json_object_concept.rst │ ├── label_concept.rst │ ├── numerical_concept.rst │ ├── rating_concept.rst │ ├── string_concept.rst │ └── supported_concepts.rst │ ├── conf.py │ ├── converters │ └── docx.rst │ ├── how_it_works.rst │ ├── index.rst │ ├── installation.rst │ ├── llms │ ├── llm_config.rst │ ├── llm_extraction_methods.rst │ └── supported_llms.rst │ ├── motivation.rst │ ├── optimizations │ ├── optimization_accuracy.rst │ ├── optimization_choosing_llm.rst │ ├── optimization_cost.rst │ ├── optimization_long_docs.rst │ └── optimization_speed.rst │ ├── quickstart.rst │ ├── robots.txt │ ├── serialization.rst │ └── vs_other_frameworks.rst ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── cassettes ├── TestAll.test_aspect_extraction_from_paragraphs[llm0].yaml ├── TestAll.test_aspect_extraction_from_paragraphs[llm1].yaml ├── TestAll.test_chat.yaml ├── TestAll.test_docx_converter_llm_extract.yaml ├── TestAll.test_extract_all[llm0-document0].yaml ├── TestAll.test_extract_all[llm0-document1].yaml ├── TestAll.test_extract_all[llm0-document2].yaml ├── TestAll.test_extract_all[llm1-document0].yaml ├── TestAll.test_extract_all[llm1-document1].yaml ├── TestAll.test_extract_all[llm1-document2].yaml ├── TestAll.test_extract_aspects_from_document[llm0].yaml ├── TestAll.test_extract_aspects_from_document[llm1].yaml ├── TestAll.test_extract_complex_json_object_concept.yaml ├── TestAll.test_extract_concepts_from_aspect[llm0].yaml ├── TestAll.test_extract_concepts_from_aspect[llm1].yaml ├── TestAll.test_extract_concepts_from_document[llm0].yaml ├── TestAll.test_extract_concepts_from_document[llm1].yaml 
├── TestAll.test_extract_label_concept[llm0].yaml ├── TestAll.test_extract_label_concept[llm1].yaml ├── TestAll.test_extract_with_fallback.yaml ├── TestAll.test_local_llms.yaml ├── TestAll.test_serialization_and_cloning[llm0-document0].yaml ├── TestAll.test_serialization_and_cloning[llm0-document1].yaml ├── TestAll.test_serialization_and_cloning[llm0-document2].yaml ├── TestAll.test_serialization_and_cloning[llm1-document0].yaml ├── TestAll.test_serialization_and_cloning[llm1-document1].yaml ├── TestAll.test_serialization_and_cloning[llm1-document2].yaml ├── TestAll.test_system_messages.yaml ├── TestAll.test_usage_examples.yaml └── TestAll.test_vision[image0].yaml ├── conftest.py ├── custom_prompts ├── custom_prompt_aspects_no_tags.j2 ├── custom_prompt_aspects_with_tags.j2 ├── custom_prompt_concepts_no_tags.j2 └── custom_prompt_concepts_with_tags.j2 ├── docx_files ├── badly_formatted.docx └── en_nda_with_anomalies.docx ├── invoices ├── invoice.jpg ├── invoice.png ├── invoice.webp └── invoice2.jpg ├── ndas ├── en_nda_with_anomalies.txt ├── ua_nda_with_anomalies.txt └── zh_nda_with_anomalies.txt ├── other_files └── complex_user_profile.txt ├── test_all.py └── utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | --- 8 | 9 | ## Description 10 | A clear and concise description of what the bug is. 11 | 12 | ## Steps to Reproduce 13 | Steps to reproduce the behavior. 14 | 15 | ## Expected Behavior 16 | A clear and concise description of what you expected to happen. 17 | 18 | ## Actual Behavior 19 | A clear and concise description of what actually happened. 20 | 21 | ## Environment 22 | - OS: [e.g. Windows 11, Ubuntu 24.04] 23 | - Python version: [e.g. 3.13.2] 24 | - contextgem version: [e.g. 0.1.0] 25 | - Any other relevant environment details 26 | 27 | ## Error Logs 28 | ``` 29 | Paste any error logs or traceback here 30 | ``` 31 | 32 | ## Additional Context 33 | Add any other context about the problem here. 34 | 35 | ## Possible Solution 36 | If you have suggestions on how to fix the issue, please describe them here. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Have a question? Just ask :) 4 | url: https://github.com/shcherbak-ai/contextgem/discussions/new/ 5 | about: For questions or discussions that aren't bugs or feature requests 6 | - name: Documentation 7 | url: https://contextgem.dev 8 | about: Check the documentation for usage information and guides -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_improvement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation Improvement 3 | about: Suggest improvements to our documentation 4 | title: '[DOCS] ' 5 | labels: documentation 6 | assignees: '' 7 | --- 8 | 9 | ## What Documentation Needs Improvement? 10 | Provide links or describe the current documentation that needs to be improved. 11 | 12 | ## What's Wrong or Missing? 13 | A clear and concise description of what's wrong with the current documentation or what information is missing. 
14 | 15 | ## Suggested Improvement 16 | A clear and concise description of how you think the documentation should be improved. 17 | 18 | ## Additional Context 19 | Add any other context about the documentation request here. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for this project 4 | title: '[FEATURE] ' 5 | labels: enhancement 6 | assignees: '' 7 | --- 8 | 9 | ## Problem Statement 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | 12 | ## Proposed Solution 13 | A clear and concise description of what you want to happen. 14 | 15 | ## Alternatives Considered 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | ## Additional Context 19 | Add any other context, mockups, or examples about the feature request here. 20 | 21 | ## Implementation Ideas 22 | If you have ideas about how this could be implemented, please share them here. -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 9 | 10 | ## Related Issues 11 | 12 | 13 | ## Types of change 14 | 16 | - [ ] Bug fix (non-breaking change which fixes an issue) 17 | - [ ] New feature (non-breaking change which adds functionality) 18 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 19 | - [ ] Documentation update 20 | - [ ] Performance improvement 21 | - [ ] Code cleanup or refactor 22 | 23 | ## How to Test 24 | 25 | 26 | ## Checklist 27 | 29 | - [ ] I confirm that I have the right to submit this contribution and grant all the rights specified in the Contributor Agreement. 30 | - [ ] I have read, agreed to, filled in, and included my Contributor Agreement in `.github/contributors/[my-username].md`. 31 | - [ ] I ran the tests, and all new and existing tests passed. 32 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. -------------------------------------------------------------------------------- /.github/contributors/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/.github/contributors/.gitkeep -------------------------------------------------------------------------------- /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Workflows 2 | 3 | This directory contains GitHub Actions workflow configurations for continuous integration (CI) of the ContextGem project. 
4 | 5 | 6 | ## Available Workflows 7 | 8 | ### tests (`ci-tests.yml`) 9 | 10 | **Features:** 11 | - Runs on multiple operating systems (Ubuntu, macOS, Windows) 12 | - Tests across Python versions 3.10, 3.11, 3.12, and 3.13 13 | - Checks formatting with Black 14 | - Runs test suite with VCR (recorded API responses) 15 | - Generates test coverage reports 16 | 17 | **Trigger:** 18 | - Automatically runs on push and pull request events on the main branch 19 | - Can be triggered manually through the GitHub Actions UI 20 | 21 | **Environment Variables:** 22 | - This workflow uses the following environment variables: 23 | - `CONTEXTGEM_OPENAI_API_KEY`: Secret OpenAI API key 24 | - `GIST_SECRET`: Secret token to upload coverage results to a gist for badge generation 25 | 26 | ### CodeQL Analysis (`codeql.yml`) 27 | 28 | This workflow performs code security scanning using GitHub's CodeQL analysis engine. 29 | 30 | **Features:** 31 | - Scans Python codebase for security vulnerabilities and coding errors 32 | - Analyzes code quality and identifies potential issues 33 | - Results are available in the Security tab of the repository 34 | 35 | **Trigger:** 36 | - Automatically runs on push and pull request events on the main and dev branches 37 | - Scheduled to run weekly 38 | - Can be triggered manually through the GitHub Actions UI 39 | 40 | ### Documentation Build (`docs.yml`) 41 | 42 | This workflow builds and deploys the project documentation to GitHub Pages. 43 | 44 | **Features:** 45 | - Builds documentation using Sphinx 46 | - Deploys documentation to GitHub Pages when merged to main 47 | - Creates preview builds on pull requests 48 | 49 | **Trigger:** 50 | - Automatically runs on push and pull request events on the main branch 51 | - Can be triggered manually through the GitHub Actions UI 52 | 53 | ### Check Contributor Agreement (`contributor-agreement-check.yml`) 54 | 55 | This workflow ensures all contributors have signed the Contributor Agreement by checking for properly filled agreement files. 
56 | 57 | **Features:** 58 | - Verifies that each contributor has a signed agreement file 59 | - Ensures agreement files are not empty and contain the contributor's username 60 | - Prevents deletion of existing contributor agreement files 61 | - Posts helpful comments on PRs when agreement requirements aren't met 62 | 63 | **Trigger:** 64 | - Automatically runs on all pull request events (opened, synchronized, reopened) 65 | 66 | 67 | ## Running Workflows 68 | 69 | - **tests:** These run automatically on push/PR to the main branch 70 | - **CodeQL Analysis:** Runs automatically on push/PR to main/dev, weekly, and manually 71 | - **Documentation Build:** Runs automatically on push/PR to main and manually 72 | - **Check Contributor Agreement:** Runs automatically on all PRs 73 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ main, dev ] 6 | pull_request: 7 | branches: [ main, dev ] 8 | schedule: 9 | - cron: '0 0 * * 0' # Run once per week at midnight on Sunday 10 | workflow_dispatch: 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | runs-on: ubuntu-latest 16 | permissions: 17 | actions: read 18 | contents: read 19 | security-events: write 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | language: [ 'python' ] 25 | 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | 30 | - name: Initialize CodeQL 31 | uses: github/codeql-action/init@v3 32 | with: 33 | languages: ${{ matrix.language }} 34 | 35 | - name: Set up Python 36 | uses: actions/setup-python@v5 37 | with: 38 | python-version: '3.13' 39 | 40 | - name: Install Poetry 41 | uses: snok/install-poetry@v1 42 | with: 43 | virtualenvs-create: true 44 | virtualenvs-in-project: true 45 | installer-parallel: true 46 | 47 | - name: Load cached pip wheels 48 | id: cached-pip-wheels 49 | uses: actions/cache@v4 50 | with: 51 | path: | 52 | ~/.cache/pip 53 | ~/Library/Caches/pip 54 | ~\AppData\Local\pip\Cache 55 | key: pip-${{ runner.os }}-python-${{ hashFiles('**/poetry.lock') }} 56 | 57 | - name: Install dependencies 58 | run: poetry install --no-interaction --with dev --no-root 59 | 60 | - name: Perform CodeQL Analysis 61 | uses: github/codeql-action/analyze@v3 62 | with: 63 | category: "/language:${{matrix.language}}" -------------------------------------------------------------------------------- /.github/workflows/daily-import-test.yml: -------------------------------------------------------------------------------- 1 | name: Daily Import Test 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' # Run daily at midnight UTC 6 | workflow_dispatch: # Allow manual triggering 7 | 8 | jobs: 9 | import-test: 10 | runs-on: macos-latest 11 | 12 | steps: 13 | - name: Set up Python 3.13 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.13' 17 | 18 | - name: Install contextgem from PyPI 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install contextgem 22 | 23 | - name: Test import 24 | run: | 25 | python -c "import contextgem; print(f'Successfully imported contextgem version {contextgem.__version__}')" -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: build docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: 
[ main ] 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | 15 | concurrency: 16 | group: "pages" 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | build: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v4 25 | 26 | - name: Setup Python 27 | id: setup-python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: '3.13' 31 | 32 | - name: Install Poetry 33 | uses: snok/install-poetry@v1 34 | with: 35 | virtualenvs-create: true 36 | virtualenvs-in-project: true 37 | installer-parallel: true 38 | 39 | - name: Load cached pip wheels 40 | id: cached-pip-wheels 41 | uses: actions/cache@v4 42 | with: 43 | path: | 44 | ~/.cache/pip 45 | ~/Library/Caches/pip 46 | ~\AppData\Local\pip\Cache 47 | key: pip-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 48 | 49 | - name: Install dependencies 50 | run: poetry install --no-interaction --with dev --no-root 51 | 52 | - name: Build documentation 53 | run: | 54 | cd docs 55 | poetry run sphinx-build -b html source _build/html -v -E 56 | 57 | - name: Create .nojekyll file 58 | run: touch docs/_build/html/.nojekyll 59 | 60 | - name: Upload artifact 61 | uses: actions/upload-pages-artifact@v3 62 | with: 63 | path: ./docs/_build/html 64 | 65 | deploy: 66 | environment: 67 | name: github-pages 68 | url: ${{ steps.deployment.outputs.page_url }} 69 | runs-on: ubuntu-latest 70 | needs: build 71 | if: github.ref == 'refs/heads/main' 72 | steps: 73 | - name: Deploy to GitHub Pages 74 | id: deployment 75 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | .pytest_cache 4 | 5 | env 6 | .env 7 | venv 8 | .venv 9 | .coverage 10 | .cz.msg 11 | .vscode 12 | ~$* 13 | *.tmp 14 | 15 | notebooks 16 | htmlcov 17 | coverage_annotate 18 | !dev/notebooks 19 | docs/build 20 | dist 21 | .DS_Store 22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | # Commitizen hook for conventional commits 4 | - repo: https://github.com/commitizen-tools/commitizen 5 | rev: v4.5.1 6 | hooks: 7 | - id: commitizen 8 | stages: [commit-msg] 9 | 10 | # Custom local hooks 11 | - repo: local 12 | hooks: 13 | 14 | # Use locally installed Black 15 | - id: black 16 | name: Black code formatter 17 | entry: poetry run black contextgem dev tests 18 | pass_filenames: false 19 | language: system 20 | files: '\.py$' 21 | stages: [pre-commit] 22 | 23 | # Use locally installed isort 24 | - id: isort 25 | name: Sort imports 26 | entry: poetry run isort contextgem dev tests 27 | pass_filenames: false 28 | language: system 29 | files: '\.py$' 30 | stages: [pre-commit] 31 | 32 | # Poetry check 33 | - id: poetry 34 | name: Poetry check 35 | entry: poetry check --lock 36 | pass_filenames: false 37 | language: system 38 | stages: [pre-commit] 39 | 40 | # Export requirements files 41 | - id: export-requirements 42 | name: Export requirements files 43 | entry: python 44 | args: ["-c", "import subprocess; subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 'dev/requirements/requirements.main.txt']); subprocess.run(['poetry', 'export', '-f', 'requirements.txt', '--output', 
'dev/requirements/requirements.dev.txt', '--with', 'dev'])"] 45 | language: python 46 | pass_filenames: false 47 | always_run: true 48 | stages: [pre-commit] 49 | 50 | # Update README.md from template 51 | - id: update-readme 52 | name: Update README.md 53 | entry: python 54 | args: ["-c", "import subprocess; subprocess.run(['python', 'dev/populate_project_readme.py'])"] 55 | language: python 56 | pass_filenames: false 57 | always_run: true 58 | stages: [pre-commit] 59 | 60 | # Build raw docs for LLM 61 | - id: build-raw-docs 62 | name: Build raw docs for LLM 63 | entry: python docs/build_raw_docs_for_llm.py 64 | language: system 65 | pass_filenames: false 66 | always_run: true 67 | stages: [pre-commit] 68 | 69 | # Generate example notebooks 70 | - id: generate-notebooks 71 | name: Generate example notebooks 72 | entry: python dev/generate_notebooks.py 73 | language: system 74 | pass_filenames: false 75 | always_run: true 76 | stages: [pre-commit] 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to ContextGem will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), with the following additional categories: 5 | 6 | - **Refactor**: Code reorganization that doesn't change functionality but improves structure or maintainability 7 | 8 | ## [0.6.1](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.6.1) - 2025-06-04 9 | ### Changed 10 | - Updated documentation for LM Studio models to clarify dummy API key requirement 11 | 12 | ## [0.6.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.6.0) - 2025-06-03 13 | ### Added 14 | - LabelConcept - a classification concept type that categorizes content using predefined labels. 15 | 16 | ## [0.5.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.5.0) - 2025-05-29 17 | ### Fixed 18 | - Params handling for reasoning (CoT-capable) models other than OpenAI o-series. Enabled automatic retry of LLM calls with dropping unsupported params if such unsupported params were set for the model. Improved handling and validation of LLM call params. 19 | 20 | ### Changed 21 | - Migrated to wtpsplit-lite - a lightweight version of wtpsplit that only retains accelerated ONNX inference of SaT models with minimal dependencies. 22 | 23 | ## [0.4.1](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.4.1) - 2025-05-26 24 | ### Added 25 | - Comprehensive docs on extracting aspects, extracting concepts, and LLM extraction methods 26 | 27 | ## [0.4.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.4.0) - 2025-05-20 28 | ### Added 29 | - Support for local SaT model paths in Document's `sat_model_id` parameter 30 | 31 | ## [0.3.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.3.0) - 2025-05-19 32 | ### Added 33 | - Expanded JsonObjectConcept to support nested class hierarchies, nested dictionary structures, lists containing objects, and literal types. 34 | 35 | ## [0.2.4](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.4) - 2025-05-09 36 | ### Fixed 37 | - Removed 'think' tags and content from LLM outputs (e.g. 
when using DeepSeek R1 via Ollama) which was breaking JSON parsing and validation
38 |
39 | ### Added
40 | - Documentation for cloud/local LLMs and LLM configuration guide
41 |
42 | ## [0.2.3](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.3) - 2025-05-04
43 | ### Changed
44 | - Updated litellm dependency version after encoding bug has been fixed upstream
45 |
46 | ## [0.2.2](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.2) - 2025-05-02
47 | ### Refactor
48 | - Refactored DOCX converter internals for better maintainability
49 |
50 | ## [0.2.1](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.1) - 2025-04-30
51 | ### Fixed
52 | - Fixed litellm dependency issue, pinning to version ==1.67.1 to avoid encoding bug in newer versions of litellm
53 |
54 | ## [0.2.0](https://github.com/shcherbak-ai/contextgem/releases/tag/v0.2.0) - 2025-04-21
55 | ### Added
56 | - Added DocxConverter for converting DOCX files into ContextGem Document objects
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 |   - family-names: Shcherbak
5 |     given-names: Sergii
6 |     email: sergii@shcherbak.ai
7 | title: "ContextGem: Effortless LLM extraction from documents"
8 | date-released: 2025-04-02
9 | url: "https://github.com/shcherbak-ai/contextgem"
10 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | ContextGem - Effortless LLM extraction from documents
2 | ======================================================
3 |
4 | Copyright (c) 2025 Shcherbak AI AS
5 | All rights reserved
6 | Developed by Sergii Shcherbak
7 |
8 | This software is licensed under the Apache License, Version 2.0 (the "License");
9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 |
12 | http://www.apache.org/licenses/LICENSE-2.0
13 |
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 |
20 | ==============================================================================
21 | THIRD-PARTY COMPONENTS
22 | ==============================================================================
23 |
24 | This software includes the following third-party components:
25 |
26 | Core Dependencies:
27 | - aiolimiter: Rate limiting for asynchronous operations
28 | - Jinja2: Template engine
29 | - litellm: LLM interface library (this software uses only MIT-licensed portions of LiteLLM and does not utilize any components from the enterprise/ directory)
30 | - loguru: Logging utility
31 | - pydantic: Data validation
32 | - python-ulid: ULID generation
33 | - wtpsplit-lite: Lightweight version of wtpsplit that only retains accelerated ONNX inference of SaT models with minimal dependencies
34 |
35 | Development Dependencies:
36 | - black: Code formatting
37 | - commitizen: Conventional commit tool and release management
38 | - coverage: Test coverage measurement
39 | - isort: Sorting imports
40 | - nbformat: Notebook format utilities
41 | - pip-tools: Dependency management
42 | - pre-commit: Pre-commit hooks
43 | - pytest: Testing framework
44 | - pytest-cov: Coverage plugin for pytest
45 | - pytest-recording: Recording HTTP interactions for tests
46 | - python-dotenv: Environment variable management
47 | - sphinx: Documentation generator
48 | - sphinx-autodoc-typehints: Type annotation support for Sphinx
49 | - sphinx-book-theme: Book-like theme for Sphinx
50 | - sphinx-copybutton: Adds copy button to code blocks in Sphinx docs
51 | - sphinx-design: Component library for Sphinx documentation
52 | - sphinx-sitemap: Generates XML sitemaps for Sphinx documentation
53 | - sphinxext-opengraph: OpenGraph metadata support for Sphinx documentation
54 |
55 | Each of these components may have their own licenses. Users should refer to the
56 | respective project repositories for detailed license information.
57 |
58 | ==============================================================================
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 |
4 | ## Supported Versions
5 |
6 | We maintain security practices for the latest release of this library. Older versions may not receive security updates.
7 |
8 |
9 | ## Security Testing
10 |
11 | This project is automatically tested for security issues using [CodeQL](https://codeql.github.com/) static analysis (run via GitHub Actions).
12 |
13 | We also use [Snyk](https://snyk.io) as needed for supplementary dependency vulnerability monitoring.
14 |
15 |
16 | ## Data Privacy
17 |
18 | This library uses LiteLLM as a local Python package to communicate with LLM providers using a unified interface. No data or telemetry is transmitted to LiteLLM servers, as the SDK is run entirely within the user's environment. According to LiteLLM's documentation, self-hosted or local SDK use involves no data storage and no telemetry. For details, see [LiteLLM's documentation](https://docs.litellm.ai/docs/data_security).
19 |
20 |
21 | ## Reporting a Vulnerability
22 |
23 | We value the security community's role in protecting our users.
If you discover a potential security issue in this project, please report it as follows: 24 | 25 | 📧 **Email**: `sergii@shcherbak.ai` 26 | 27 | When reporting, please include: 28 | - A detailed description of the issue 29 | - Steps to reproduce the vulnerability 30 | - Any relevant logs, context, or configurations 31 | 32 | We aim to respond promptly to all valid reports. Please note that we do not currently offer a bug bounty program. 33 | 34 | 35 | ## Questions? 36 | 37 | If you’re unsure whether something is a vulnerability or just a bug, feel free to reach out via the email above before submitting a full report. 38 | -------------------------------------------------------------------------------- /contextgem/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | ContextGem - Effortless LLM extraction from documents 21 | """ 22 | 23 | __version__ = "0.6.1" 24 | __author__ = "Shcherbak AI AS" 25 | 26 | from contextgem.public import ( 27 | Aspect, 28 | BooleanConcept, 29 | DateConcept, 30 | Document, 31 | DocumentLLM, 32 | DocumentLLMGroup, 33 | DocumentPipeline, 34 | DocxConverter, 35 | Image, 36 | JsonObjectClassStruct, 37 | JsonObjectConcept, 38 | JsonObjectExample, 39 | LabelConcept, 40 | LLMPricing, 41 | NumericalConcept, 42 | Paragraph, 43 | RatingConcept, 44 | RatingScale, 45 | Sentence, 46 | StringConcept, 47 | StringExample, 48 | image_to_base64, 49 | reload_logger_settings, 50 | ) 51 | 52 | __all__ = [ 53 | # Aspects 54 | "Aspect", 55 | # Concepts 56 | "StringConcept", 57 | "BooleanConcept", 58 | "NumericalConcept", 59 | "RatingConcept", 60 | "JsonObjectConcept", 61 | "DateConcept", 62 | "LabelConcept", 63 | # Documents 64 | "Document", 65 | # Pipelines 66 | "DocumentPipeline", 67 | # Paragraphs 68 | "Paragraph", 69 | # Sentences 70 | "Sentence", 71 | # Images 72 | "Image", 73 | # Examples 74 | "StringExample", 75 | "JsonObjectExample", 76 | # LLMs 77 | "DocumentLLM", 78 | "DocumentLLMGroup", 79 | # Data models 80 | "LLMPricing", 81 | "RatingScale", 82 | # Utils 83 | "image_to_base64", 84 | "reload_logger_settings", 85 | "JsonObjectClassStruct", 86 | # Converters 87 | "DocxConverter", 88 | ] 89 | -------------------------------------------------------------------------------- /contextgem/internal/base/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.base.attrs import ( 20 | _AssignedAspectsProcessor, 21 | _AssignedConceptsProcessor, 22 | _AssignedInstancesProcessor, 23 | _ExtractedItemsAttributeProcessor, 24 | _RefParasAndSentsAttrituteProcessor, 25 | ) 26 | from contextgem.internal.base.concepts import _Concept 27 | from contextgem.internal.base.instances import _InstanceBase 28 | from contextgem.internal.base.items import _ExtractedItem 29 | from contextgem.internal.base.mixins import _PostInitCollectorMixin 30 | from contextgem.internal.base.paras_and_sents import _ParasAndSentsBase 31 | 32 | __all__ = [ 33 | # Instances 34 | "_InstanceBase", 35 | # Attrs processors 36 | "_AssignedAspectsProcessor", 37 | "_AssignedConceptsProcessor", 38 | "_AssignedInstancesProcessor", 39 | "_ExtractedItemsAttributeProcessor", 40 | "_RefParasAndSentsAttrituteProcessor", 41 | # Mixins 42 | "_PostInitCollectorMixin", 43 | # Concepts 44 | "_Concept", 45 | # Extracted items 46 | "_ExtractedItem", 47 | # Paragraphs and sentences 48 | "_ParasAndSentsBase", 49 | ] 50 | -------------------------------------------------------------------------------- /contextgem/internal/base/examples.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining the base classes for example subclasses. 21 | 22 | This module provides the foundational class structure for examples that can be used 23 | in the ContextGem framework. Examples serve as user-provided samples for extraction tasks, 24 | helping to guide and improve the extraction process by providing reference patterns 25 | or expected outputs. 26 | """ 27 | 28 | from __future__ import annotations 29 | 30 | from typing import Any 31 | 32 | from contextgem.internal.base.instances import _InstanceBase 33 | 34 | 35 | class _Example(_InstanceBase): 36 | """ 37 | Base class that represents an example for extraction tasks in the ContextGem framework. 38 | 39 | Examples serve as user-provided samples that guide the extraction process by 40 | demonstrating expected patterns or outputs for specific extraction tasks. 41 | 42 | :ivar content: Arbitrary content associated with the example. 
43 | :type content: Any 44 | """ 45 | 46 | content: Any 47 | -------------------------------------------------------------------------------- /contextgem/internal/base/items.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining the base classes for item subclasses. 21 | 22 | This module provides the foundational class structure for items that can be extracted 23 | from aspects or documents in the ContextGem framework. Items serve as the basic units of information 24 | extracted from aspects or documents, providing a structured way to store and process extracted data. 25 | """ 26 | 27 | from __future__ import annotations 28 | 29 | from typing import Any, Optional 30 | 31 | from pydantic import Field, PrivateAttr 32 | 33 | from contextgem.internal.base.attrs import _RefParasAndSentsAttrituteProcessor 34 | from contextgem.internal.decorators import _post_init_method 35 | from contextgem.internal.typings.aliases import NonEmptyStr 36 | from contextgem.public.paragraphs import Paragraph 37 | from contextgem.public.sentences import Sentence 38 | 39 | 40 | class _ExtractedItem(_RefParasAndSentsAttrituteProcessor): 41 | """ 42 | Base class for items extracted from aspects or documents in the ContextGem framework. 43 | 44 | This class provides a structured way to store extracted information along with 45 | optional justification and reference data. 46 | 47 | :ivar value: The extracted information value. 48 | :type value: Any 49 | :ivar justification: Optional explanation providing context for the extraction. 50 | Defaults to None. 51 | :type justification: Optional[NonEmptyStr] 52 | :ivar reference_paragraphs: List of paragraphs referenced by this item. 53 | :type reference_paragraphs: list[Paragraph] 54 | :ivar reference_sentences: List of sentences referenced by this item. 55 | :type reference_sentences: list[Sentence] 56 | """ 57 | 58 | value: Any = Field(..., frozen=True) 59 | justification: Optional[NonEmptyStr] = Field(default=None, frozen=True) 60 | 61 | _reference_paragraphs: list[Paragraph] = PrivateAttr(default_factory=list) 62 | _reference_sentences: list[Sentence] = PrivateAttr(default_factory=list) 63 | 64 | @_post_init_method 65 | def _post_init(self, __context): 66 | if self.__class__ == _ExtractedItem: 67 | raise TypeError("Cannot instantiate base class directly") 68 | -------------------------------------------------------------------------------- /contextgem/internal/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.converters.docx import ( 20 | WORD_XML_NAMESPACES, 21 | DocxContentError, 22 | DocxConverterError, 23 | DocxFormatError, 24 | DocxXmlError, 25 | _DocxConverterBase, 26 | _DocxPackage, 27 | ) 28 | 29 | __all__ = [ 30 | "WORD_XML_NAMESPACES", 31 | "DocxConverterError", 32 | "DocxFormatError", 33 | "DocxXmlError", 34 | "DocxContentError", 35 | "_DocxConverterBase", 36 | "_DocxPackage", 37 | ] 38 | -------------------------------------------------------------------------------- /contextgem/internal/converters/docx/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.converters.docx.base import _DocxConverterBase 20 | from contextgem.internal.converters.docx.exceptions import ( 21 | DocxContentError, 22 | DocxConverterError, 23 | DocxFormatError, 24 | DocxXmlError, 25 | ) 26 | from contextgem.internal.converters.docx.namespaces import WORD_XML_NAMESPACES 27 | from contextgem.internal.converters.docx.package import _DocxPackage 28 | 29 | __all__ = [ 30 | "_DocxConverterBase", 31 | "DocxConverterError", 32 | "DocxFormatError", 33 | "DocxXmlError", 34 | "DocxContentError", 35 | "WORD_XML_NAMESPACES", 36 | "_DocxPackage", 37 | ] 38 | -------------------------------------------------------------------------------- /contextgem/internal/converters/docx/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | """ 20 | Exceptions for the DOCX converter module. 21 | 22 | This module defines custom exception classes used by the DOCX converter 23 | to handle various error conditions that may occur during document processing. 24 | These exceptions provide more specific error information than generic exceptions, 25 | making it easier to diagnose and handle problems when working with DOCX files. 26 | """ 27 | 28 | 29 | # Define custom exceptions 30 | class DocxConverterError(Exception): 31 | """Base exception class for DOCX converter errors.""" 32 | 33 | pass 34 | 35 | 36 | class DocxFormatError(DocxConverterError): 37 | """Exception raised when the DOCX file format is invalid or corrupted.""" 38 | 39 | pass 40 | 41 | 42 | class DocxXmlError(DocxConverterError): 43 | """Exception raised when there's an error parsing XML in the DOCX file.""" 44 | 45 | pass 46 | 47 | 48 | class DocxContentError(DocxConverterError): 49 | """Exception raised when required content is missing from the DOCX file.""" 50 | 51 | pass 52 | -------------------------------------------------------------------------------- /contextgem/internal/converters/docx/namespaces.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Namespaces for the DOCX converter module. 21 | 22 | This module defines the XML namespaces used in DOCX files for different elements. 23 | It provides a dictionary of namespace URIs mapped to their prefixes, which are 24 | used to parse and process the XML content of DOCX files. 25 | """ 26 | 27 | 28 | # Define XML namespaces used in DOCX files 29 | WORD_XML_NAMESPACES = { 30 | "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", 31 | "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", 32 | "a": "http://schemas.openxmlformats.org/drawingml/2006/main", 33 | "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture", 34 | "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", 35 | "rels": "http://schemas.openxmlformats.org/package/2006/relationships", 36 | "v": "urn:schemas-microsoft-com:vml", 37 | "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", 38 | } 39 | -------------------------------------------------------------------------------- /contextgem/internal/llm_output_structs/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.llm_output_structs.aspect_structs import ( 20 | _get_aspect_extraction_output_struct, 21 | ) 22 | from contextgem.internal.llm_output_structs.concept_structs import ( 23 | _get_concept_extraction_output_struct, 24 | _LabelConceptItemValueModel, 25 | ) 26 | from contextgem.internal.llm_output_structs.utils import _create_root_model 27 | 28 | __all__ = [ 29 | # Utils 30 | "_create_root_model", 31 | # Aspect structs 32 | "_get_aspect_extraction_output_struct", 33 | # Concept structs 34 | "_get_concept_extraction_output_struct", 35 | "_LabelConceptItemValueModel", 36 | ] 37 | -------------------------------------------------------------------------------- /contextgem/internal/llm_output_structs/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining utility functions for dynamically computing LLM output validation structures. 21 | """ 22 | 23 | from pydantic import RootModel 24 | 25 | 26 | def _create_root_model(name: str, root_type: type): 27 | """ 28 | Creates a dynamic model class extending RootModel for a specified type. 29 | 30 | :param name: The name of the new class to be created. 31 | :type name: str 32 | :param root_type: The root type to be used as a parameter for RootModel. 33 | :type root_type: type 34 | :return: A dynamically created class inheriting from RootModel 35 | parameterized with the given type. 36 | :rtype: type 37 | """ 38 | return type(name, (RootModel[root_type],), {}) 39 | -------------------------------------------------------------------------------- /contextgem/internal/typings/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.internal.typings.aliases import ( 20 | AssignedInstancesAttrName, 21 | AsyncCalsAndKwargs, 22 | DefaultDecimalField, 23 | DefaultPromptType, 24 | ExtractedInstanceType, 25 | JustificationDepth, 26 | LanguageRequirement, 27 | LLMRoleAny, 28 | LLMRoleAspect, 29 | NonEmptyStr, 30 | ReferenceDepth, 31 | SaTModelId, 32 | StandardSaTModelId, 33 | ) 34 | from contextgem.internal.typings.strings_to_types import _deserialize_type_hint 35 | from contextgem.internal.typings.typed_class_utils import ( 36 | _get_model_fields, 37 | _is_typed_class, 38 | _raise_dict_class_type_error, 39 | ) 40 | from contextgem.internal.typings.types_normalization import _normalize_type_annotation 41 | from contextgem.internal.typings.types_to_strings import ( 42 | _format_dict_structure, 43 | _format_type, 44 | _is_json_serializable_type, 45 | _JsonObjectItemStructure, 46 | _raise_json_serializable_type_error, 47 | _serialize_type_hint, 48 | ) 49 | from contextgem.internal.typings.user_type_hints_validation import ( 50 | _dynamic_pydantic_model, 51 | ) 52 | 53 | __all__ = [ 54 | # Aliases 55 | "NonEmptyStr", 56 | "LLMRoleAny", 57 | "LLMRoleAspect", 58 | "AssignedInstancesAttrName", 59 | "ExtractedInstanceType", 60 | "DefaultPromptType", 61 | "ReferenceDepth", 62 | "SaTModelId", 63 | "StandardSaTModelId", 64 | "LanguageRequirement", 65 | "JustificationDepth", 66 | "AsyncCalsAndKwargs", 67 | "DefaultDecimalField", 68 | # Strings to types 69 | "_deserialize_type_hint", 70 | # Types to strings 71 | "_is_json_serializable_type", 72 | "_format_type", 73 | "_JsonObjectItemStructure", 74 | "_serialize_type_hint", 75 | "_format_dict_structure", 76 | "_raise_json_serializable_type_error", 77 | # User type hints validation 78 | "_dynamic_pydantic_model", 79 | # Typed class utils 80 | "_is_typed_class", 81 | "_get_model_fields", 82 | "_raise_dict_class_type_error", 83 | # Types normalization 84 | "_normalize_type_annotation", 85 | ] 86 | -------------------------------------------------------------------------------- /contextgem/internal/typings/aliases.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module defining type aliases used throughout the ContextGem framework. 21 | 22 | This module provides standardized type definitions and aliases that ensure 23 | consistent typing across the codebase. It includes specialized string types, 24 | literal types for configuration options, and compatibility solutions for 25 | different Python versions. 
26 | """ 27 | 28 | import sys 29 | from decimal import Decimal 30 | from pathlib import Path 31 | from typing import Annotated, Any, Callable, Coroutine, Literal, TypeVar, Union 32 | 33 | from pydantic import Field, StrictStr, StringConstraints 34 | 35 | if sys.version_info >= (3, 11): 36 | from typing import Self 37 | else: 38 | Self = TypeVar("Self") 39 | 40 | NonEmptyStr = Annotated[ 41 | StrictStr, StringConstraints(strip_whitespace=True, min_length=1) 42 | ] 43 | 44 | LLMRoleAny = Literal[ 45 | "extractor_text", "reasoner_text", "extractor_vision", "reasoner_vision" 46 | ] 47 | 48 | LLMRoleAspect = Literal["extractor_text", "reasoner_text"] 49 | 50 | AssignedInstancesAttrName = Literal["aspects", "concepts"] 51 | 52 | DefaultPromptType = Literal["aspects", "concepts"] 53 | 54 | ExtractedInstanceType = Literal["aspect", "concept"] 55 | 56 | ReferenceDepth = Literal["paragraphs", "sentences"] 57 | 58 | ClassificationType = Literal["multi_class", "multi_label"] 59 | 60 | # Define standard SaT model IDs as a separate type 61 | StandardSaTModelId = Literal[ 62 | "sat-1l", 63 | "sat-1l-sm", 64 | "sat-3l", 65 | "sat-3l-sm", 66 | "sat-6l", 67 | "sat-6l-sm", 68 | "sat-9l", 69 | "sat-12l", 70 | "sat-12l-sm", 71 | ] 72 | 73 | # Combined type for sat_model_id parameter 74 | SaTModelId = Union[ 75 | StandardSaTModelId, 76 | str, # Local path as a string 77 | Path, # Local path as a Path object 78 | ] 79 | 80 | LanguageRequirement = Literal["en", "adapt"] 81 | 82 | JustificationDepth = Literal["brief", "balanced", "comprehensive"] 83 | 84 | AsyncCalsAndKwargs = list[ 85 | tuple[Callable[..., Coroutine[Any, Any, Any]], dict[str, Any]] 86 | ] 87 | 88 | DefaultDecimalField = Field( 89 | default_factory=lambda: Decimal("0.00000"), ge=Decimal("0.00000") 90 | ) 91 | 92 | ReasoningEffort = Literal["low", "medium", "high"] 93 | 94 | RawTextMode = Literal["raw", "markdown"] 95 | -------------------------------------------------------------------------------- /contextgem/public/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | from contextgem.public.aspects import Aspect 20 | from contextgem.public.concepts import ( 21 | BooleanConcept, 22 | DateConcept, 23 | JsonObjectConcept, 24 | LabelConcept, 25 | NumericalConcept, 26 | RatingConcept, 27 | StringConcept, 28 | ) 29 | from contextgem.public.converters import DocxConverter 30 | from contextgem.public.data_models import LLMPricing, RatingScale 31 | from contextgem.public.documents import Document 32 | from contextgem.public.examples import JsonObjectExample, StringExample 33 | from contextgem.public.images import Image 34 | from contextgem.public.llms import DocumentLLM, DocumentLLMGroup 35 | from contextgem.public.paragraphs import Paragraph 36 | from contextgem.public.pipelines import DocumentPipeline 37 | from contextgem.public.sentences import Sentence 38 | from contextgem.public.utils import ( 39 | JsonObjectClassStruct, 40 | image_to_base64, 41 | reload_logger_settings, 42 | ) 43 | 44 | __all__ = [ 45 | # Aspects 46 | "Aspect", 47 | # Concepts 48 | "StringConcept", 49 | "BooleanConcept", 50 | "NumericalConcept", 51 | "RatingConcept", 52 | "JsonObjectConcept", 53 | "DateConcept", 54 | "LabelConcept", 55 | # Documents 56 | "Document", 57 | # Pipelines 58 | "DocumentPipeline", 59 | # Paragraphs 60 | "Paragraph", 61 | # Sentences 62 | "Sentence", 63 | # Images 64 | "Image", 65 | # Examples 66 | "StringExample", 67 | "JsonObjectExample", 68 | # LLMs 69 | "DocumentLLM", 70 | "DocumentLLMGroup", 71 | # Data models 72 | "LLMPricing", 73 | "RatingScale", 74 | # Utils 75 | "image_to_base64", 76 | "reload_logger_settings", 77 | "JsonObjectClassStruct", 78 | # Converters 79 | "DocxConverter", 80 | ] 81 | -------------------------------------------------------------------------------- /contextgem/public/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from contextgem.public.converters.docx import DocxConverter 20 | 21 | __all__ = [ 22 | "DocxConverter", 23 | ] 24 | -------------------------------------------------------------------------------- /contextgem/public/images.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module for handling document images. 21 | 22 | This module provides the Image class, which represents visual content that can be attached to 23 | or fully represent a document. Images are stored in base64-encoded format with specified MIME types 24 | to ensure proper handling. 25 | 26 | The module supports common image formats (JPEG, PNG, WebP) and integrates with the broader ContextGem 27 | framework for document analysis that includes visual content alongside textual information. 28 | """ 29 | 30 | from __future__ import annotations 31 | 32 | from typing import Literal 33 | 34 | from contextgem.internal.base.instances import _InstanceBase 35 | from contextgem.internal.typings.aliases import NonEmptyStr 36 | 37 | 38 | class Image(_InstanceBase): 39 | """ 40 | Represents an image with specified MIME type and base64-encoded data. 41 | An image is typically attached to a document, or fully represents a document. 42 | 43 | :ivar mime_type: The MIME type of the image. This must be one of the 44 | predefined valid types ("image/jpg", "image/jpeg", "image/png", 45 | "image/webp"). 46 | :type mime_type: Literal["image/jpg", "image/jpeg", "image/png", 47 | "image/webp"] 48 | :ivar base64_data: The base64-encoded data of the image. The util function 49 | `image_to_base64()` from contextgem.public.utils can be used to encode images to base64. 50 | :type base64_data: NonEmptyStr 51 | 52 | Note: 53 | - Attached to documents: 54 | An image must be attached to a document. A document can have multiple images. 55 | 56 | - Extraction types: 57 | Only concept extraction is supported for images. Use LLM with role ``"extractor_vision"`` 58 | or ``"reasoner_vision"`` to extract concepts from images. 59 | 60 | Example: 61 | .. literalinclude:: ../../../dev/usage_examples/docstrings/images/def_image.py 62 | :language: python 63 | :caption: Image definition 64 | """ 65 | 66 | mime_type: Literal["image/jpg", "image/jpeg", "image/png", "image/webp"] 67 | base64_data: NonEmptyStr 68 | -------------------------------------------------------------------------------- /contextgem/public/sentences.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | Module for handling document sentences. 21 | 22 | This module provides the Sentence class, which represents a structured unit of text 23 | within a document paragraph. Sentences are the fundamental building blocks of text analysis, 24 | containing the raw text content of individual statements. 25 | 26 | The module supports validation to ensure data integrity and integrates with the paragraph 27 | structure to maintain the hierarchical organization of document content. 
28 | """ 29 | 30 | from __future__ import annotations 31 | 32 | from pydantic import Field 33 | 34 | from contextgem.internal.base.paras_and_sents import _ParasAndSentsBase 35 | from contextgem.internal.typings.aliases import NonEmptyStr 36 | 37 | 38 | class Sentence(_ParasAndSentsBase): 39 | """ 40 | Represents a sentence within a document paragraph. 41 | 42 | Sentences are immutable text units that serve as the fundamental building blocks for 43 | document analysis. The raw text content is preserved and cannot be modified after 44 | initialization to maintain data integrity. 45 | 46 | :ivar raw_text: The complete text content of the sentence. This value is frozen after initialization. 47 | :type raw_text: NonEmptyStr 48 | 49 | Note: 50 | Normally, you do not need to construct sentences manually, as they are populated automatically 51 | from document's ``raw_text`` or ``paragraphs`` attributes. Only use this constructor for 52 | advanced use cases, such as when you have a custom paragraph/sentence segmentation tool. 53 | 54 | Example: 55 | .. literalinclude:: ../../../dev/usage_examples/docstrings/sentences/def_sentence.py 56 | :language: python 57 | :caption: Sentence definition 58 | """ 59 | 60 | raw_text: NonEmptyStr = Field(..., frozen=True) 61 | -------------------------------------------------------------------------------- /dev/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/__init__.py -------------------------------------------------------------------------------- /dev/notebooks/readme/docx_converter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cell_0", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using ContextGem's DocxConverter" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "cell_1", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%pip install -U contextgem" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "cell_2", 24 | "metadata": {}, 25 | "source": [ 26 | "To run the extraction, please provide your LLM details in the ``DocumentLLM(...)`` constructor further below." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "cell_3", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Using ContextGem's DocxConverter\n", 37 | "\n", 38 | "from contextgem import DocxConverter\n", 39 | "\n", 40 | "converter = DocxConverter()\n", 41 | "\n", 42 | "# Convert a DOCX file to an LLM-ready ContextGem Document\n", 43 | "# from path\n", 44 | "document = converter.convert(\"path/to/document.docx\")\n", 45 | "# or from file object\n", 46 | "with open(\"path/to/document.docx\", \"rb\") as docx_file_object:\n", 47 | " document = converter.convert(docx_file_object)\n", 48 | "\n", 49 | "# You can also use it as a standalone text extractor\n", 50 | "docx_text = converter.convert_to_text_format(\n", 51 | " \"path/to/document.docx\",\n", 52 | " output_format=\"markdown\", # or \"raw\"\n", 53 | ")\n" 54 | ] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Python 3", 60 | "language": "python", 61 | "name": "python3" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.10.0" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 5 78 | } -------------------------------------------------------------------------------- /dev/notebooks/readme/llm_chat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cell_0", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using LLMs for chat (text + vision), with fallback LLM support" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "cell_1", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%pip install -U contextgem" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "cell_2", 24 | "metadata": {}, 25 | "source": [ 26 | "To run the extraction, please provide your LLM details in the ``DocumentLLM(...)`` constructor further below." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "cell_3", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Using LLMs for chat (text + vision), with fallback LLM support\n", 37 | "\n", 38 | "import os\n", 39 | "\n", 40 | "from contextgem import DocumentLLM\n", 41 | "\n", 42 | "# from contextgem import Image\n", 43 | "\n", 44 | "main_model = DocumentLLM(\n", 45 | " model=\"openai/gpt-4o\", # or another provider/model\n", 46 | " api_key=os.getenv(\"CONTEXTGEM_OPENAI_API_KEY\"), # your API key for the LLM provider\n", 47 | ")\n", 48 | "\n", 49 | "# Optional: fallback LLM\n", 50 | "fallback_model = DocumentLLM(\n", 51 | " model=\"openai/gpt-4o-mini\", # or another provider/model\n", 52 | " api_key=os.getenv(\"CONTEXTGEM_OPENAI_API_KEY\"), # your API key for the LLM provider\n", 53 | " is_fallback=True,\n", 54 | ")\n", 55 | "main_model.fallback_llm = fallback_model\n", 56 | "\n", 57 | "response = main_model.chat(\n", 58 | " \"Hello\",\n", 59 | " # images=[Image(...)]\n", 60 | ")\n", 61 | "# or `response = await main_model.chat_async(...)`\n", 62 | "\n", 63 | "print(response)\n" 64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": "python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.10.0" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 5 88 | } -------------------------------------------------------------------------------- /dev/populate_project_readme.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | """ 20 | This script populates the README.md file with code examples from the usage_examples/ directory. 
21 | 22 | To use it, run: 23 | 24 | ```bash 25 | python dev/populate_project_readme.py 26 | ``` 27 | """ 28 | 29 | README_TEMPLATE_PATH = "dev/README.TEMPLATE.md" 30 | README_OUTPUT_PATH = "README.md" 31 | README_FOOTER = "" 32 | 33 | 34 | def generate_readme(): 35 | with open(README_TEMPLATE_PATH, "r", encoding="utf-8") as template_file: 36 | template = template_file.read() 37 | 38 | # Replace markers with actual code examples 39 | for example_file, marker in USAGE_EXAMPLES_MAPPING.items(): 40 | code_snippet = extract_code_from_file(example_file) 41 | template = template.replace(marker, code_snippet) 42 | 43 | with open(README_OUTPUT_PATH, "w", encoding="utf-8") as readme_file: 44 | readme_file.write(template) 45 | readme_file.write(README_FOOTER) 46 | print("Project README.md file populated successfully.") 47 | 48 | 49 | def extract_code_from_file(file_path): 50 | with open(file_path, "r", encoding="utf-8") as f: 51 | content = f.read() 52 | return content 53 | 54 | 55 | # Map example files to markers in the template 56 | USAGE_EXAMPLES_MAPPING = { 57 | "dev/content_snippets/feature_table.html": "{{FEATURE_TABLE}}", 58 | "dev/usage_examples/readme/quickstart_aspect.py": "{{QUICKSTART_ASPECT}}", 59 | "dev/usage_examples/readme/quickstart_concept.py": "{{QUICKSTART_CONCEPT}}", 60 | "dev/usage_examples/readme/docx_converter.py": "{{DOCX_CONVERTER}}", 61 | } 62 | 63 | if __name__ == "__main__": 64 | generate_readme() 65 | -------------------------------------------------------------------------------- /dev/usage_examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/advanced/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/advanced/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/aspects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/aspects/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/aspects/aspect_with_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Aspect Extraction with Justifications 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Create a document instance 8 | doc = Document( 9 | raw_text=( 10 | "NON-DISCLOSURE AGREEMENT\n" 11 | "\n" 12 | 'This Non-Disclosure Agreement ("Agreement") is entered into between TechCorp Inc. 
' 13 | '("Disclosing Party") and Innovation Labs LLC ("Receiving Party") on January 15, 2024.\n' 14 | "...\n" 15 | ), 16 | ) 17 | 18 | # Define a single aspect focused on NDA direction with justifications 19 | nda_direction_aspect = Aspect( 20 | name="NDA Direction", 21 | description="Provisions informing the NDA direction (whether mutual or one-way) and information flow between parties", 22 | add_justifications=True, 23 | justification_depth="balanced", 24 | justification_max_sents=4, 25 | ) 26 | 27 | # Add the aspect to the document 28 | doc.aspects = [nda_direction_aspect] 29 | 30 | # Configure DocumentLLM with your API parameters 31 | llm = DocumentLLM( 32 | model="azure/gpt-4.1-mini", 33 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 34 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 35 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 36 | ) 37 | 38 | # Extract the aspect with justifications 39 | nda_direction_aspect = llm.extract_aspects_from_document(doc)[0] 40 | for i, item in enumerate(nda_direction_aspect.extracted_items, 1): 41 | print(f"- {i}. {item.value}") 42 | print(f" Justification: {item.justification}") 43 | print() 44 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/aspects/basic_aspect.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Aspect Extraction 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Create a document instance 8 | doc = Document( 9 | raw_text=( 10 | "Software License Agreement\n" 11 | "This software license agreement (Agreement) is entered into between Tech Corp (Licensor) and Client Corp (Licensee).\n" 12 | "...\n" 13 | "2. Term and Termination\n" 14 | "This Agreement shall commence on the Effective Date and shall continue for a period of three (3) years, " 15 | "unless earlier terminated in accordance with the provisions hereof. Either party may terminate this Agreement " 16 | "upon thirty (30) days written notice to the other party.\n" 17 | "\n" 18 | "3. Payment Terms\n" 19 | "Licensee agrees to pay Licensor an annual license fee of $10,000, payable within thirty (30) days of the " 20 | "invoice date. 
Late payments shall incur a penalty of 1.5% per month.\n" 21 | "...\n" 22 | ), 23 | ) 24 | 25 | # Define an aspect to extract the termination clause 26 | termination_aspect = Aspect( 27 | name="Termination Clauses", 28 | description="Sections describing how and when the agreement can be terminated, including notice periods and conditions", 29 | ) 30 | 31 | # Add the aspect to the document 32 | doc.add_aspects([termination_aspect]) 33 | 34 | # Configure DocumentLLM with your API parameters 35 | llm = DocumentLLM( 36 | model="azure/gpt-4.1-mini", 37 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 38 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 39 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 40 | ) 41 | 42 | # Extract the aspect from the document 43 | termination_aspect = llm.extract_aspects_from_document(doc)[0] 44 | 45 | # Access the extracted information 46 | print("Extracted Termination Clauses:") 47 | for item in termination_aspect.extracted_items: 48 | print(f"- {item.value}") 49 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/boolean_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/boolean_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/boolean_concept/boolean_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: BooleanConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import BooleanConcept, Document, DocumentLLM 6 | 7 | # Create a Document object from text 8 | doc = Document( 9 | raw_text="This document contains confidential information and should not be shared publicly." 
10 | ) 11 | 12 | # Define a BooleanConcept to detect confidential content 13 | confidentiality_concept = BooleanConcept( 14 | name="Is confidential", 15 | description="Whether the document contains confidential information", 16 | ) 17 | 18 | # Attach the concept to the document 19 | doc.add_concepts([confidentiality_concept]) 20 | 21 | # Configure DocumentLLM with your API parameters 22 | llm = DocumentLLM( 23 | model="azure/gpt-4.1-mini", 24 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 25 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 26 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 27 | ) 28 | 29 | # Extract the concept from the document 30 | confidentiality_concept = llm.extract_concepts_from_document(doc)[0] 31 | 32 | # Print the extracted value 33 | print(confidentiality_concept.extracted_items[0].value) # Output: True 34 | # Or access the extracted value from the document object 35 | print(doc.concepts[0].extracted_items[0].value) # Output: True 36 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/boolean_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: BooleanConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import BooleanConcept, Document, DocumentLLM 6 | 7 | # Sample document text containing policy information 8 | policy_text = """ 9 | Company Data Retention Policy (Updated 2024) 10 | 11 | All customer data must be encrypted at rest and in transit using industry-standard encryption protocols. 12 | Personal information should be retained for no longer than 3 years after the customer relationship ends. 13 | Employees are required to complete data privacy training annually. 
14 | """ 15 | 16 | # Create a Document from the text 17 | doc = Document(raw_text=policy_text) 18 | 19 | # Create a BooleanConcept with justifications and references enabled 20 | compliance_concept = BooleanConcept( 21 | name="Has encryption requirement", 22 | description="Whether the document specifies that data must be encrypted", 23 | add_justifications=True, # Enable justifications to understand reasoning 24 | justification_depth="brief", 25 | justification_max_sents=1, # Allow up to 1 sentences for each justification 26 | add_references=True, # Include references to source text 27 | reference_depth="sentences", # Reference specific sentences rather than paragraphs 28 | ) 29 | 30 | # Attach the concept to the document 31 | doc.add_concepts([compliance_concept]) 32 | 33 | # Configure DocumentLLM with your API parameters 34 | llm = DocumentLLM( 35 | model="azure/gpt-4o-mini", 36 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 37 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 38 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 39 | ) 40 | 41 | # Extract the concept 42 | compliance_concept = llm.extract_concepts_from_document(doc)[0] 43 | 44 | # Print the extracted value with justification and references 45 | print(f"Has encryption requirement: {compliance_concept.extracted_items[0].value}") 46 | print(f"\nJustification: {compliance_concept.extracted_items[0].justification}") 47 | print("\nSource references:") 48 | for sent in compliance_concept.extracted_items[0].reference_sentences: 49 | print(f"- {sent.raw_text}") 50 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/date_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/date_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/date_concept/date_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: DateConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import DateConcept, Document, DocumentLLM 6 | 7 | # Create a Document object from text 8 | doc = Document( 9 | raw_text="The research paper was published on March 15, 2025 and has been cited 42 times since." 
10 | ) 11 | 12 | # Define a DateConcept to extract the publication date 13 | date_concept = DateConcept( 14 | name="Publication date", 15 | description="The date when the paper was published", 16 | ) 17 | 18 | # Attach the concept to the document 19 | doc.add_concepts([date_concept]) 20 | 21 | # Configure DocumentLLM with your API parameters 22 | llm = DocumentLLM( 23 | model="azure/gpt-4.1-mini", 24 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 25 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 26 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 27 | ) 28 | 29 | # Extract the concept from the document 30 | date_concept = llm.extract_concepts_from_document(doc)[0] 31 | 32 | # Print the extracted value 33 | print( 34 | type(date_concept.extracted_items[0].value), date_concept.extracted_items[0].value 35 | ) 36 | # Output: 2025-03-15 37 | 38 | # Or access the extracted value from the document object 39 | print( 40 | type(doc.concepts[0].extracted_items[0].value), 41 | doc.concepts[0].extracted_items[0].value, 42 | ) 43 | # Output: 2025-03-15 44 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/date_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: DateConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import DateConcept, Document, DocumentLLM 6 | 7 | # Sample document text containing project timeline information 8 | project_text = """ 9 | Project Timeline: Website Redesign 10 | 11 | The website redesign project officially kicked off on March 1, 2024. 12 | The development team has estimated the project will take 4 months to complete. 13 | 14 | Key milestones: 15 | - Design phase: 1 month 16 | - Development phase: 2 months 17 | - Testing and deployment: 1 month 18 | 19 | The marketing team needs the final completion date to plan the launch campaign. 
20 | """ 21 | 22 | # Create a Document from the text 23 | doc = Document(raw_text=project_text) 24 | 25 | # Create a DateConcept to calculate the project completion date 26 | completion_date_concept = DateConcept( 27 | name="Project completion date", 28 | description="The final completion date for the website redesign project", 29 | add_justifications=True, # enable justifications to understand extraction logic 30 | justification_depth="balanced", 31 | justification_max_sents=3, # allow up to 3 sentences for the calculation justification 32 | add_references=True, # include references to source text 33 | reference_depth="sentences", # reference specific sentences rather than paragraphs 34 | singular_occurrence=True, # extract only one calculated date 35 | ) 36 | 37 | # Attach the concept to the document 38 | doc.add_concepts([completion_date_concept]) 39 | 40 | # Configure DocumentLLM 41 | llm = DocumentLLM( 42 | model="azure/gpt-4.1", 43 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 44 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 45 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 46 | ) 47 | 48 | # Extract the concept 49 | completion_date_concept = llm.extract_concepts_from_document(doc)[0] 50 | 51 | # Print the calculated completion date with justification and references 52 | print("Calculated project completion date:") 53 | extracted_item = completion_date_concept.extracted_items[ 54 | 0 55 | ] # get the single calculated date 56 | print(f"\nCompletion Date: {extracted_item.value}") # expected output: 2024-07-01 57 | print(f"Calculation Justification: {extracted_item.justification}") 58 | print("Source references used for calculation:") 59 | for sent in extracted_item.reference_sentences: 60 | print(f"- {sent.raw_text}") 61 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/json_object_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/json_object_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: JsonObjectConcept Extraction 2 | 3 | import os 4 | from pprint import pprint 5 | from typing import Literal 6 | 7 | from contextgem import Document, DocumentLLM, JsonObjectConcept 8 | 9 | # Define product information text 10 | product_text = """ 11 | Product: Smart Fitness Watch X7 12 | Price: $199.99 13 | Features: Heart rate monitoring, GPS tracking, Sleep analysis 14 | Battery Life: 5 days 15 | Water Resistance: IP68 16 | Available Colors: Black, Silver, Blue 17 | Customer Rating: 4.5/5 18 | """ 19 | 20 | # Create a Document object from text 21 | doc = Document(raw_text=product_text) 22 | 23 | # Define a JsonObjectConcept with a structure for product information 24 | product_concept = JsonObjectConcept( 25 | name="Product Information", 26 | description="Extract detailed product information including name, price, features, and specifications", 27 | structure={ 28 | "name": str, 29 | "price": float, 30 | "features": list[str], 31 | "specifications": { 32 | "battery_life": str, 33 | "water_resistance": Literal["IP67", "IP68", "IPX7", "Not water resistant"], 34 | }, 35 | "available_colors": 
list[str], 36 | "customer_rating": float, 37 | }, 38 | ) 39 | 40 | # Attach the concept to the document 41 | doc.add_concepts([product_concept]) 42 | 43 | # Configure DocumentLLM with your API parameters 44 | llm = DocumentLLM( 45 | model="azure/gpt-4.1-mini", 46 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 47 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 48 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 49 | ) 50 | 51 | # Extract the concept from the document 52 | product_concept = llm.extract_concepts_from_document(doc)[0] 53 | 54 | # Print the extracted structured data 55 | extracted_product = product_concept.extracted_items[0].value 56 | pprint(extracted_product) 57 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/json_object_concept/structure/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/nested_class_structure.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from contextgem import JsonObjectConcept 4 | from contextgem.public.utils import JsonObjectClassStruct 5 | 6 | # Use dataclasses to define the structure of the JSON object 7 | 8 | 9 | # All classes in the nested class structure must inherit from JsonObjectClassStruct 10 | # to enable automatic conversion of the class hierarchy to a dictionary structure 11 | # for JsonObjectConcept 12 | @dataclass 13 | class Location(JsonObjectClassStruct): 14 | latitude: float 15 | longitude: float 16 | altitude: float 17 | 18 | 19 | @dataclass 20 | class Sensor(JsonObjectClassStruct): 21 | id: str 22 | type: str 23 | location: Location # reference to another class 24 | active: bool 25 | 26 | 27 | @dataclass 28 | class SensorNetwork(JsonObjectClassStruct): 29 | network_id: str 30 | primary_sensor: Sensor # reference to another class 31 | backup_sensors: list[Sensor] # list of another class 32 | 33 | 34 | sensor_network_concept = JsonObjectConcept( 35 | name="IoT Sensor Network", 36 | description="Configuration for a network of IoT sensors", 37 | structure=SensorNetwork, # nested class structure 38 | ) 39 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/nested_structure.py: -------------------------------------------------------------------------------- 1 | from contextgem import JsonObjectConcept 2 | 3 | device_config_concept = JsonObjectConcept( 4 | name="Device Configuration", 5 | description="Configuration details for a networked device", 6 | structure={ 7 | "device": {"id": str, "type": str, "model": str}, 8 | "network": {"ip_address": str, "subnet_mask": str, "gateway": str}, 9 | "settings": {"enabled": bool, "mode": str}, 10 | }, 11 | ) 12 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/simple_class_structure.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from contextgem import JsonObjectConcept 4 | 5 | 6 | # Use a 
Pydantic model to define the structure of the JSON object 7 | class ProductSpec(BaseModel): 8 | name: str 9 | version: str 10 | features: list[str] 11 | 12 | 13 | product_spec_concept = JsonObjectConcept( 14 | name="Product Specification", 15 | description="Technical specifications for a product", 16 | structure=ProductSpec, 17 | ) 18 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/json_object_concept/structure/simple_structure.py: -------------------------------------------------------------------------------- 1 | from contextgem import JsonObjectConcept 2 | 3 | product_info_concept = JsonObjectConcept( 4 | name="Product Information", 5 | description="Product details", 6 | structure={ 7 | "name": str, 8 | "price": float, 9 | "is_available": bool, 10 | "ratings": list[float], 11 | }, 12 | ) 13 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/label_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/label_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/label_concept/label_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Contract Type Classification using LabelConcept 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, LabelConcept 6 | 7 | # Create a Document object from legal document text 8 | legal_doc_text = """ 9 | NON-DISCLOSURE AGREEMENT 10 | 11 | This Non-Disclosure Agreement ("Agreement") is entered into as of January 15, 2025, by and between TechCorp Inc., a Delaware corporation ("Disclosing Party"), and DataSystems LLC, a California limited liability company ("Receiving Party"). 12 | 13 | WHEREAS, Disclosing Party possesses certain confidential information relating to its proprietary technology and business operations; 14 | 15 | NOW, THEREFORE, in consideration of the mutual covenants contained herein, the parties agree as follows: 16 | 17 | 1. CONFIDENTIAL INFORMATION 18 | The term "Confidential Information" shall mean any and all non-public information... 19 | 20 | 2. OBLIGATIONS OF RECEIVING PARTY 21 | Receiving Party agrees to hold all Confidential Information in strict confidence... 
22 | """ 23 | 24 | doc = Document(raw_text=legal_doc_text) 25 | 26 | # Define a LabelConcept for contract type classification 27 | contract_type_concept = LabelConcept( 28 | name="Contract Type", 29 | description="Classify the type of contract", 30 | labels=["NDA", "Consultancy Agreement", "Privacy Policy", "Other"], 31 | classification_type="multi_class", # only one label can be selected (mutually exclusive labels) 32 | singular_occurrence=True, # expect only one classification result 33 | ) 34 | print(contract_type_concept._format_labels_in_prompt) 35 | 36 | # Attach the concept to the document 37 | doc.add_concepts([contract_type_concept]) 38 | 39 | # Configure DocumentLLM with your API parameters 40 | llm = DocumentLLM( 41 | model="azure/gpt-4.1-mini", 42 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 43 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 44 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 45 | ) 46 | 47 | # Extract the concept from the document 48 | contract_type_concept = llm.extract_concepts_from_document(doc)[0] 49 | 50 | # Check if any labels were extracted 51 | if contract_type_concept.extracted_items: 52 | # Get the classified document type 53 | classified_type = contract_type_concept.extracted_items[0].value 54 | print(f"Document classified as: {classified_type}") # Output: ['NDA'] 55 | else: 56 | print("No applicable labels found for this document") 57 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/numerical_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/numerical_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/numerical_concept/numerical_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: NumericalConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, NumericalConcept 6 | 7 | # Create a Document object from text 8 | doc = Document( 9 | raw_text="The latest smartphone model costs $899.99 and will be available next week." 
10 | ) 11 | 12 | # Define a NumericalConcept to extract the price 13 | price_concept = NumericalConcept( 14 | name="Product price", 15 | description="The price of the product", 16 | numeric_type="float", # We expect a decimal price 17 | ) 18 | 19 | # Attach the concept to the document 20 | doc.add_concepts([price_concept]) 21 | 22 | # Configure DocumentLLM with your API parameters 23 | llm = DocumentLLM( 24 | model="azure/gpt-4.1-mini", 25 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 26 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 27 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 28 | ) 29 | 30 | # Extract the concept from the document 31 | price_concept = llm.extract_concepts_from_document(doc)[0] 32 | 33 | # Print the extracted value 34 | print(price_concept.extracted_items[0].value) # Output: 899.99 35 | # Or access the extracted value from the document object 36 | print(doc.concepts[0].extracted_items[0].value) # Output: 899.99 37 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/numerical_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: NumericalConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, NumericalConcept 6 | 7 | # Document with values that require calculation/inference 8 | report_text = """ 9 | Quarterly Sales Report - Q2 2023 10 | 11 | Product A: Sold 450 units at $75 each 12 | Product B: Sold 320 units at $125 each 13 | Product C: Sold 180 units at $95 each 14 | 15 | Marketing expenses: $28,500 16 | Operating costs: $42,700 17 | """ 18 | 19 | # Create a Document from the text 20 | doc = Document(raw_text=report_text) 21 | 22 | # Create a NumericalConcept for total revenue 23 | total_revenue_concept = NumericalConcept( 24 | name="Total quarterly revenue", 25 | description="The total revenue calculated by multiplying units sold by their price", 26 | add_justifications=True, 27 | justification_depth="comprehensive", # Detailed justification to show calculation steps 28 | justification_max_sents=4, # Maximum number of sentences for justification 29 | add_references=True, 30 | reference_depth="paragraphs", # Reference specific paragraphs 31 | singular_occurrence=True, # Ensure that the data is merged into a single item 32 | ) 33 | 34 | # Attach the concept to the document 35 | doc.add_concepts([total_revenue_concept]) 36 | 37 | # Configure DocumentLLM with your API parameters 38 | llm = DocumentLLM( 39 | model="azure/o4-mini", 40 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 41 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 42 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 43 | ) 44 | 45 | # Extract the concept 46 | total_revenue_concept = llm.extract_concepts_from_document(doc)[0] 47 | 48 | # Print the extracted inferred value with justification 49 | print("Calculated total quarterly revenue:") 50 | for item in total_revenue_concept.extracted_items: 51 | print(f"\nTotal Revenue: {item.value}") 52 | print(f"Calculation Justification: {item.justification}") 53 | print("Source references:") 54 | for para in item.reference_paragraphs: 55 | print(f"- {para.raw_text}") 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/rating_concept/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/rating_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/rating_concept/rating_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: RatingConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, RatingConcept, RatingScale 6 | 7 | # Create a Document object from text describing a product without an explicit rating 8 | smartphone_description = ( 9 | "This smartphone features a 5000mAh battery that lasts all day with heavy use. " 10 | "The display is 6.7 inch AMOLED with 120Hz refresh rate. " 11 | "Camera system includes a 50MP main sensor, 12MP ultrawide, and 8MP telephoto lens. " 12 | "The phone runs on the latest processor with 8GB RAM and 256GB storage. " 13 | "It has IP68 water resistance and Gorilla Glass Victus protection." 14 | ) 15 | 16 | doc = Document(raw_text=smartphone_description) 17 | 18 | # Define a RatingConcept that requires analysis to determine a rating 19 | product_quality = RatingConcept( 20 | name="Product Quality Rating", 21 | description=( 22 | "Evaluate the overall quality of the smartphone based on its specifications, " 23 | "features, and adherence to industry best practices" 24 | ), 25 | rating_scale=RatingScale(start=1, end=10), 26 | add_justifications=True, # include justification for the rating 27 | justification_depth="balanced", 28 | justification_max_sents=5, 29 | ) 30 | 31 | # Attach the concept to the document 32 | doc.add_concepts([product_quality]) 33 | 34 | # Configure DocumentLLM with your API parameters 35 | llm = DocumentLLM( 36 | model="azure/gpt-4.1", 37 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 38 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 39 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 40 | ) 41 | 42 | # Extract the concept from the document - the LLM will analyze and assign a rating 43 | product_quality = llm.extract_concepts_from_document(doc)[0] 44 | 45 | # Print the calculated rating 46 | print(f"Quality Rating: {product_quality.extracted_items[0].value}") 47 | # Print the justification 48 | print(f"Justification: {product_quality.extracted_items[0].justification}") 49 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/rating_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: RatingConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, RatingConcept, RatingScale 6 | 7 | # Sample document text about a software product with various aspects 8 | software_review = """ 9 | Software Review: ProjectManager Pro 5.0 10 | 11 | User Interface: The interface is clean and modern, with intuitive navigation. New users can quickly find what they need without extensive training. The dashboard provides a comprehensive overview of project status. 12 | 13 | Performance: The application loads quickly even with large projects. Resource-intensive operations like generating reports occasionally cause minor lag on older systems. The mobile app performs exceptionally well, even on limited bandwidth. 
14 | 15 | Features: Project templates are well-designed and cover most common project types. Task dependencies are easily managed, and the Gantt chart visualization is excellent. However, the software lacks advanced risk management tools that competitors offer. 16 | 17 | Support: The documentation is comprehensive and well-organized. Customer service response time averages 4 hours, which is acceptable but not industry-leading. The knowledge base needs more video tutorials. 18 | """ 19 | 20 | # Create a Document from the text 21 | doc = Document(raw_text=software_review) 22 | 23 | # Create a RatingConcept with justifications and references enabled 24 | usability_rating_concept = RatingConcept( 25 | name="Software usability rating", 26 | description="Evaluate the overall usability of the software on a scale of 1-10 based on UI design, intuitiveness, and learning curve", 27 | rating_scale=RatingScale(start=1, end=10), 28 | add_justifications=True, # enable justifications to explain the rating 29 | justification_depth="comprehensive", # provide detailed reasoning 30 | justification_max_sents=5, # allow up to 5 sentences for justification 31 | add_references=True, # include references to source text 32 | reference_depth="sentences", # reference specific sentences rather than paragraphs 33 | ) 34 | 35 | # Attach the concept to the document 36 | doc.add_concepts([usability_rating_concept]) 37 | 38 | # Configure DocumentLLM with your API parameters 39 | llm = DocumentLLM( 40 | model="azure/gpt-4.1", 41 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 42 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 43 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 44 | ) 45 | 46 | # Extract the concept 47 | usability_rating_concept = llm.extract_concepts_from_document(doc)[0] 48 | 49 | # Print the extracted rating item with justification and references 50 | extracted_item = usability_rating_concept.extracted_items[0] 51 | print(f"Software Usability Rating: {extracted_item.value}/10") 52 | print(f"\nJustification: {extracted_item.justification}") 53 | print("\nSource references:") 54 | for sent in extracted_item.reference_sentences: 55 | print(f"- {sent.raw_text}") 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/concepts/string_concept/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/adding_examples.py: -------------------------------------------------------------------------------- 1 | # ContextGem: StringConcept Extraction with Examples 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept, StringExample 6 | 7 | # Create a Document object from text 8 | contract_text = """ 9 | SERVICE AGREEMENT 10 | This Service Agreement (the "Agreement") is entered into as of January 15, 2025 by and between: 11 | XYZ Innovations Inc., a Delaware corporation with offices at 123 Tech Avenue, San Francisco, CA 12 | ("Provider"), and 13 | Omega Enterprises LLC, a New York limited liability company with offices at 456 Business Plaza, 14 | New York, NY ("Customer"). 
15 | """ 16 | doc = Document(raw_text=contract_text) 17 | 18 | # Create a StringConcept for extracting parties and their roles 19 | parties_concept = StringConcept( 20 | name="Contract parties", 21 | description="Names of parties and their roles in the contract", 22 | examples=[ 23 | StringExample(content="Acme Corporation (Supplier)"), 24 | StringExample(content="TechGroup Inc. (Client)"), 25 | ], # add examples providing additional guidance to the LLM 26 | ) 27 | 28 | # Attach the concept to the document 29 | doc.add_concepts([parties_concept]) 30 | 31 | # Configure DocumentLLM with your API parameters 32 | llm = DocumentLLM( 33 | model="azure/gpt-4.1-mini", 34 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 35 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 36 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 37 | ) 38 | 39 | # Extract the concept from the document 40 | parties_concept = llm.extract_concepts_from_document(doc)[0] 41 | 42 | # Print the extracted parties and their roles 43 | print("Extracted parties and roles:") 44 | for item in parties_concept.extracted_items: 45 | print(f"- {item.value}") 46 | 47 | # Expected output: 48 | # - XYZ Innovations Inc. (Provider) 49 | # - Omega Enterprises LLC (Customer) 50 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/refs_and_justifications.py: -------------------------------------------------------------------------------- 1 | # ContextGem: StringConcept Extraction with References and Justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept 6 | 7 | # Sample document text containing financial information 8 | financial_text = """ 9 | 2024 Financial Performance Summary 10 | 11 | Revenue increased to $120 million in fiscal year 2024, representing 15% growth compared to the previous year. This growth was primarily driven by the expansion of our enterprise client base and the successful launch of our premium service tier. 12 | 13 | The Board has recommended a dividend of $1.25 per share, which will be payable to shareholders of record as of March 15, 2025. 
14 | """ 15 | 16 | # Create a Document from the text 17 | doc = Document(raw_text=financial_text) 18 | 19 | # Create a StringConcept with justifications and references enabled 20 | key_figures_concept = StringConcept( 21 | name="Financial key figures", 22 | description="Important financial metrics and figures mentioned in the report", 23 | add_justifications=True, # enable justifications to understand extraction reasoning 24 | justification_depth="balanced", 25 | justification_max_sents=3, # allow up to 3 sentences for each justification 26 | add_references=True, # include references to source text 27 | reference_depth="sentences", # reference specific sentences rather than paragraphs 28 | ) 29 | 30 | # Attach the concept to the document 31 | doc.add_concepts([key_figures_concept]) 32 | 33 | # Configure DocumentLLM with your API parameters 34 | llm = DocumentLLM( 35 | model="azure/gpt-4o-mini", 36 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 37 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 38 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 39 | ) 40 | 41 | # Extract the concept 42 | key_figures_concept = llm.extract_concepts_from_document(doc)[0] 43 | 44 | # Print the extracted items with justifications and references 45 | print("Extracted financial key figures:") 46 | for item in key_figures_concept.extracted_items: 47 | print(f"\nFigure: {item.value}") 48 | print(f"Justification: {item.justification}") 49 | print("Source references:") 50 | for sent in item.reference_sentences: 51 | print(f"- {sent.raw_text}") 52 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/concepts/string_concept/string_concept.py: -------------------------------------------------------------------------------- 1 | # ContextGem: StringConcept Extraction 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept 6 | 7 | # Create a Document object from text 8 | doc = Document(raw_text="My name is John Smith and I am 30 years old.") 9 | 10 | # Define a StringConcept to extract a person's name 11 | name_concept = StringConcept( 12 | name="Person name", 13 | description="Full name of the person", 14 | ) 15 | 16 | # Attach the concept to the document 17 | doc.add_concepts([name_concept]) 18 | 19 | # Configure DocumentLLM with your API parameters 20 | llm = DocumentLLM( 21 | model="azure/gpt-4.1-mini", 22 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 23 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 24 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 25 | ) 26 | 27 | # Extract the concept from the document 28 | name_concept = llm.extract_concepts_from_document(doc)[0] 29 | 30 | # Get the extracted value 31 | print(name_concept.extracted_items[0].value) # Output: "John Smith" 32 | # Or access the extracted value from the document object 33 | print(doc.concepts[0].extracted_items[0].value) # Output: "John Smith" 34 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llm_config/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/cost_tracking.py: 
-------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, LLMPricing 2 | 3 | llm = DocumentLLM( 4 | model="openai/gpt-4o-mini", 5 | api_key="", 6 | pricing_details=LLMPricing( 7 | input_per_1m_tokens=0.150, # Cost per 1M input tokens 8 | output_per_1m_tokens=0.600, # Cost per 1M output tokens 9 | ), 10 | ) 11 | 12 | # Perform some extraction tasks 13 | 14 | # Later, you can check the cost 15 | cost_info = llm.get_cost() 16 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/detailed_usage.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="openai/gpt-4.1", 5 | api_key="", 6 | ) 7 | 8 | # Perform some extraction tasks 9 | 10 | usage_info = llm.get_usage() 11 | 12 | # Access the first usage container in the list (for the primary LLM) 13 | llm_usage = usage_info[0] 14 | 15 | # Get detailed call information 16 | for call in llm_usage.usage.calls: 17 | print(f"Prompt: {call.prompt}") 18 | print(f"Response: {call.response}") # original, unprocessed response 19 | print(f"Sent at: {call.timestamp_sent}") 20 | print(f"Received at: {call.timestamp_received}") 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/fallback_llm.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | # Primary LLM 4 | primary_llm = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="", 7 | role="extractor_text", # default role 8 | ) 9 | 10 | # Fallback LLM 11 | fallback_llm = DocumentLLM( 12 | model="anthropic/claude-3-5-haiku", 13 | api_key="", 14 | role="extractor_text", # Must match the primary LLM's role 15 | is_fallback=True, 16 | ) 17 | 18 | # Assign fallback LLM to primary 19 | primary_llm.fallback_llm = fallback_llm 20 | 21 | # Then use the primary LLM as usual 22 | # document = primary_llm.extract_all(document) 23 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/llm_api.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="openai/gpt-4o-mini", # Format: <provider>/<model_name> 5 | api_key="", 6 | ) 7 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/llm_group.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, DocumentLLMGroup 2 | 3 | # Create LLMs with different roles 4 | text_extractor = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="", 7 | role="extractor_text", 8 | output_language="adapt", 9 | ) 10 | 11 | text_reasoner = DocumentLLM( 12 | model="openai/o3-mini", 13 | api_key="", 14 | role="reasoner_text", 15 | max_completion_tokens=16000, 16 | reasoning_effort="high", 17 | output_language="adapt", 18 | ) 19 | 20 | # Create a group 21 | llm_group = DocumentLLMGroup( 22 | llms=[text_extractor, text_reasoner], 23 | output_language="adapt", # All LLMs in the group must share the same output language setting 24 | ) 25 | 26 | # Then use the group as usual 27 | # document = llm_group.extract_all(document) 28 | --------------------------------------------------------------------------------
/dev/usage_examples/docs/llm_config/llm_local.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | local_llm = DocumentLLM( 4 | model="ollama/llama3.1:8b", 5 | api_base="http://localhost:11434", # Default Ollama endpoint 6 | ) 7 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/o1_o4.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="openai/o3-mini", 5 | api_key="", 6 | max_completion_tokens=8000, # Specific to reasoning (CoT-capable) models 7 | reasoning_effort="medium", # Optional: "low", "medium", "high" 8 | ) 9 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llm_config/tracking_usage_and_cost.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="anthropic/claude-3-5-haiku", 5 | api_key="", 6 | ) 7 | 8 | # Perform some extraction tasks 9 | 10 | # Get usage statistics 11 | usage_info = llm.get_usage() 12 | 13 | # Get cost statistics 14 | cost_info = llm.get_cost() 15 | 16 | # Reset usage and cost statistics 17 | llm.reset_usage_and_cost() 18 | 19 | # The same methods are available for LLM groups, with optional filtering by LLM role 20 | # usage_info = llm_group.get_usage(llm_role="extractor_text") 21 | # cost_info = llm_group.get_cost(llm_role="extractor_text") 22 | # llm_group.reset_usage_and_cost(llm_role="extractor_text") 23 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llms/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llms/llm_extraction_methods/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_all.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting All Aspects and Concepts from Document 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, StringConcept 6 | 7 | # Sample text content 8 | text_content = """ 9 | John Smith is a 30-year-old software engineer working at TechCorp. 10 | He has 5 years of experience in Python development and leads a team of 8 developers. 11 | His annual salary is $95,000 and he graduated from MIT with a Computer Science degree. 
12 | """ 13 | 14 | # Create a Document object from text 15 | doc = Document(raw_text=text_content) 16 | 17 | # Define aspects and concepts directly on the document 18 | doc.aspects = [ 19 | Aspect( 20 | name="Professional Information", 21 | description="Information about the person's career, job, and work experience", 22 | ) 23 | ] 24 | 25 | doc.concepts = [ 26 | StringConcept( 27 | name="Person name", 28 | description="Full name of the person", 29 | ) 30 | ] 31 | 32 | # Configure DocumentLLM with your API parameters 33 | llm = DocumentLLM( 34 | model="azure/gpt-4.1-mini", 35 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 36 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 37 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 38 | ) 39 | 40 | # Extract all aspects and concepts from the document 41 | processed_doc = llm.extract_all(doc) 42 | 43 | # Access extracted aspect information 44 | aspect = processed_doc.aspects[0] 45 | print(f"Aspect: {aspect.name}") 46 | print(f"Extracted items: {[item.value for item in aspect.extracted_items]}") 47 | 48 | # Access extracted concept information 49 | concept = processed_doc.concepts[0] 50 | print(f"Concept: {concept.name}") 51 | print(f"Extracted value: {concept.extracted_items[0].value}") 52 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_aspects_from_document.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting Aspects from Documents 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Sample text content 8 | text_content = """ 9 | TechCorp is a leading software development company founded in 2015 with headquarters in San Francisco. 10 | The company specializes in cloud-based solutions and has grown to 500 employees across 12 countries. 11 | Their flagship product, CloudManager Pro, serves over 10,000 enterprise clients worldwide. 12 | TechCorp reported $50 million in revenue for 2023, representing a 25% growth from the previous year. 13 | The company is known for its innovative AI-powered analytics platform and excellent customer support. 14 | They recently expanded into the European market and plan to launch three new products in 2024. 
15 | """ 16 | 17 | # Create a Document object from text 18 | doc = Document(raw_text=text_content) 19 | 20 | # Define aspects to extract from the document 21 | doc.aspects = [ 22 | Aspect( 23 | name="Company Overview", 24 | description="Basic information about the company, founding, location, and size", 25 | ), 26 | Aspect( 27 | name="Financial Performance", 28 | description="Revenue, growth metrics, and financial indicators", 29 | ), 30 | Aspect( 31 | name="Products and Services", 32 | description="Information about the company's products, services, and offerings", 33 | ), 34 | ] 35 | 36 | # Configure DocumentLLM with your API parameters 37 | llm = DocumentLLM( 38 | model="azure/gpt-4.1-mini", 39 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 40 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 41 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 42 | ) 43 | 44 | # Extract aspects from the document 45 | extracted_aspects = llm.extract_aspects_from_document(doc) 46 | 47 | # Access extracted aspect information 48 | for aspect in extracted_aspects: 49 | print(f"Aspect: {aspect.name}") 50 | print(f"Extracted items: {[item.value for item in aspect.extracted_items]}") 51 | print("---") 52 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_concepts_from_aspect.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting Concepts from Specific Aspects 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, NumericalConcept, StringConcept 6 | 7 | # Sample text content 8 | text_content = """ 9 | DataFlow Systems is an innovative fintech startup that was established in 2020 in Austin, Texas. 10 | The company has rapidly grown to 150 employees and operates in 8 major cities across North America. 11 | DataFlow's core platform, FinanceStream, is used by more than 5,000 small businesses for automated accounting. 12 | In their latest financial report, DataFlow Systems announced $12 million in annual revenue for 2024. 13 | This represents an impressive 40% increase compared to their 2023 performance. 14 | The company has secured $25 million in Series B funding and plans to expand internationally next year. 
15 | """ 16 | 17 | # Create a Document object from text 18 | doc = Document(raw_text=text_content) 19 | 20 | # Define an aspect to extract from the document 21 | financial_aspect = Aspect( 22 | name="Financial Performance", 23 | description="Revenue, growth metrics, and financial indicators", 24 | ) 25 | 26 | # Add concepts to the aspect 27 | financial_aspect.concepts = [ 28 | StringConcept( 29 | name="Annual Revenue", 30 | description="Total revenue reported for the year", 31 | ), 32 | NumericalConcept( 33 | name="Growth Rate", 34 | description="Percentage growth rate compared to previous year", 35 | numeric_type="float", 36 | ), 37 | NumericalConcept( 38 | name="Revenue Year", 39 | description="The year for which revenue is reported", 40 | ), 41 | ] 42 | 43 | # Attach the aspect to the document 44 | doc.aspects = [financial_aspect] 45 | 46 | # Configure DocumentLLM with your API parameters 47 | llm = DocumentLLM( 48 | model="azure/gpt-4.1", 49 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 50 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 51 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 52 | ) 53 | 54 | # First, extract the aspect from the document (required before concept extraction) 55 | extracted_aspects = llm.extract_aspects_from_document(doc) 56 | financial_aspect = extracted_aspects[0] 57 | 58 | # Extract concepts from the specific aspect 59 | extracted_concepts = llm.extract_concepts_from_aspect(financial_aspect, doc) 60 | 61 | # Access extracted concepts for the aspect 62 | print(f"Aspect: {financial_aspect.name}") 63 | print(f"Extracted items: {[item.value for item in financial_aspect.extracted_items]}") 64 | print("\nConcepts extracted from this aspect:") 65 | for concept in extracted_concepts: 66 | print(f" {concept.name}: {[item.value for item in concept.extracted_items]}") 67 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_extraction_methods/extract_concepts_from_document.py: -------------------------------------------------------------------------------- 1 | # ContextGem: Extracting Concepts Directly from Documents 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, NumericalConcept, StringConcept 6 | 7 | # Sample text content 8 | text_content = """ 9 | GreenTech Solutions is an environmental technology company founded in 2018 in Portland, Oregon. 10 | The company develops sustainable energy solutions and has 75 employees working remotely across the United States. 11 | Their primary product, EcoMonitor, helps businesses track carbon emissions and has been adopted by 2,500 organizations. 12 | GreenTech Solutions reported strong financial performance with $8.5 million in revenue for 2024. 13 | The company's CEO, Sarah Johnson, announced plans to achieve carbon neutrality by 2025. 14 | They recently opened a new research facility in Seattle and hired 20 additional engineers. 
15 | """ 16 | 17 | # Create a Document object from text 18 | doc = Document(raw_text=text_content) 19 | 20 | # Define concepts to extract from the document 21 | doc.concepts = [ 22 | StringConcept( 23 | name="Company Name", 24 | description="Full name of the company", 25 | ), 26 | StringConcept( 27 | name="CEO Name", 28 | description="Full name of the company's CEO", 29 | ), 30 | NumericalConcept( 31 | name="Employee Count", 32 | description="Total number of employees at the company", 33 | numeric_type="int", 34 | ), 35 | StringConcept( 36 | name="Annual Revenue", 37 | description="Company's total revenue for the year", 38 | ), 39 | ] 40 | 41 | # Configure DocumentLLM with your API parameters 42 | llm = DocumentLLM( 43 | model="azure/gpt-4.1", 44 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 45 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 46 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 47 | ) 48 | 49 | # Extract concepts from the document 50 | extracted_concepts = llm.extract_concepts_from_document(doc) 51 | 52 | # Access extracted concept information 53 | print("Concepts extracted from document:") 54 | for concept in extracted_concepts: 55 | print(f" {concept.name}: {[item.value for item in concept.extracted_items]}") 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/llms/llm_init/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/llm_api.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | # Pattern for using any cloud LLM provider 4 | llm = DocumentLLM( 5 | model="<provider>/<model_name>", 6 | api_key="", 7 | ) 8 | 9 | # Example - Using OpenAI LLM 10 | llm_openai = DocumentLLM( 11 | model="openai/gpt-4.1-mini", 12 | api_key="", 13 | # see DocumentLLM API reference for all configuration options 14 | ) 15 | 16 | # Example - Using Azure OpenAI LLM 17 | llm_azure_openai = DocumentLLM( 18 | model="azure/o4-mini", 19 | api_key="", 20 | api_version="", 21 | api_base="", 22 | # see DocumentLLM API reference for all configuration options 23 | ) 24 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/llm_local.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM 2 | 3 | local_llm = DocumentLLM( 4 | model="ollama/<model_name>", 5 | api_base="http://localhost:11434", # Default Ollama endpoint 6 | ) 7 | 8 | # Example - Using Llama 3.1 LLM via Ollama 9 | llm_llama = DocumentLLM( 10 | model="ollama/llama3.1:8b", 11 | api_base="http://localhost:11434", 12 | # see DocumentLLM API reference for all configuration options 13 | ) 14 | 15 | # Example - Using DeepSeek R1 reasoning model via Ollama 16 | llm_deepseek = DocumentLLM( 17 | model="ollama/deepseek-r1:32b", 18 | api_base="http://localhost:11434", 19 | # see DocumentLLM API reference for all configuration options 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/llms/llm_init/lm_studio_connection_error_fix.py: -------------------------------------------------------------------------------- 1 |
from contextgem import DocumentLLM 2 | 3 | llm = DocumentLLM( 4 | model="lm_studio/meta-llama-3.1-8b-instruct", 5 | api_base="http://localhost:1234/v1", 6 | api_key="dummy-key", # dummy key to avoid connection error 7 | ) 8 | 9 | # This is a known issue with calling LM Studio API in litellm: 10 | # https://github.com/openai/openai-python/issues/961 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/optimizations/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_accuracy.py: -------------------------------------------------------------------------------- 1 | # Example of optimizing extraction for accuracy 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept, StringExample 6 | 7 | # Define document 8 | doc = Document( 9 | raw_text="Non-Disclosure Agreement...", 10 | sat_model_id="sat-6l-sm", # default is "sat-3l-sm" 11 | paragraph_segmentation_mode="sat", # default is "newlines" 12 | # sentence segmentation mode is always "sat", as other approaches proved to be less accurate 13 | ) 14 | 15 | # Define document concepts 16 | doc.concepts = [ 17 | StringConcept( 18 | name="Title", # A very simple concept, just an example for testing purposes 19 | description="Title of the document", 20 | add_justifications=True, # enable justifications 21 | justification_depth="brief", # default 22 | examples=[ 23 | StringExample( 24 | content="Supplier Agreement", 25 | ) 26 | ], 27 | ), 28 | # ... add other concepts ... 29 | ] 30 | 31 | # ... attach other aspects/concepts to the document ... 32 | 33 | # Define and configure LLM 34 | llm = DocumentLLM( 35 | model="openai/gpt-4o", 36 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 37 | fallback_llm=DocumentLLM( 38 | model="openai/gpt-4-turbo", 39 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 40 | is_fallback=True, 41 | ), # configure a fallback LLM 42 | ) 43 | 44 | # Extract data from document with specific configuration options 45 | doc = llm.extract_all( 46 | doc, 47 | max_paragraphs_to_analyze_per_call=30, # limit the number of paragraphs to analyze in an individual LLM call 48 | max_items_per_call=1, # limit the number of aspects/concepts to analyze in an individual LLM call 49 | use_concurrency=True, # optional: enable concurrent extractions 50 | ) 51 | 52 | # ... use the extracted data ... 
53 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_choosing_llm.py: -------------------------------------------------------------------------------- 1 | # Example of selecting different LLMs for different tasks 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, DocumentLLMGroup, StringConcept 6 | 7 | # Define LLMs 8 | base_llm = DocumentLLM( 9 | model="openai/gpt-4o-mini", 10 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 11 | role="extractor_text", # default 12 | ) 13 | 14 | # Optional - attach a fallback LLM 15 | base_llm_fallback = DocumentLLM( 16 | model="openai/gpt-3.5-turbo", 17 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 18 | role="extractor_text", # must have the same role as the parent LLM 19 | is_fallback=True, 20 | ) 21 | base_llm.fallback_llm = base_llm_fallback 22 | 23 | advanced_llm = DocumentLLM( 24 | model="openai/gpt-4o", # can be a larger model (reasoning or non-reasoning) 25 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 26 | role="reasoner_text", 27 | ) 28 | 29 | # You can organize LLMs in a group to use them in a pipeline 30 | llm_group = DocumentLLMGroup( 31 | llms=[base_llm, advanced_llm], 32 | ) 33 | 34 | # Assign the existing LLMs to aspects/concepts 35 | document = Document( 36 | raw_text="document_text", 37 | aspects=[ 38 | Aspect( 39 | name="aspect_name", 40 | description="aspect_description", 41 | llm_role="extractor_text", 42 | concepts=[ 43 | StringConcept( 44 | name="concept_name", 45 | description="concept_description", 46 | llm_role="reasoner_text", 47 | ) 48 | ], 49 | ) 50 | ], 51 | ) 52 | 53 | # Then use the LLM group to extract all information from the document 54 | # This will use different LLMs for different aspects/concepts under the hood 55 | # document = llm_group.extract_all(document) 56 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_cost.py: -------------------------------------------------------------------------------- 1 | # Example of optimizing extraction for cost 2 | 3 | import os 4 | 5 | from contextgem import DocumentLLM, LLMPricing 6 | 7 | llm = DocumentLLM( 8 | model="openai/gpt-4o-mini", 9 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 10 | pricing_details=LLMPricing( 11 | input_per_1m_tokens=0.150, 12 | output_per_1m_tokens=0.600, 13 | ), # add pricing details to track costs 14 | ) 15 | 16 | # ... use the LLM for extraction ... 17 | 18 | # ... monitor usage and cost ... 19 | usage = llm.get_usage() # get the usage details, including tokens and calls' details. 20 | cost = llm.get_cost() # get the cost details, including input, output, and total costs. 21 | print(usage) 22 | print(cost) 23 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_long_docs.py: -------------------------------------------------------------------------------- 1 | # Example of configuring LLM extraction to process long documents 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM 6 | 7 | # Define document 8 | long_doc = Document( 9 | raw_text="long_document_text", 10 | ) 11 | 12 | # ... attach aspects/concepts to the document ...
13 | 14 | # Define and configure LLM 15 | llm = DocumentLLM( 16 | model="openai/gpt-4o-mini", 17 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 18 | ) 19 | 20 | # Extract data from document with specific configuration options 21 | long_doc = llm.extract_all( 22 | long_doc, 23 | max_paragraphs_to_analyze_per_call=50, # limit the number of paragraphs to analyze in an individual LLM call 24 | max_items_per_call=2, # limit the number of aspects/concepts to analyze in an individual LLM call 25 | use_concurrency=True, # optional: enable concurrent extractions 26 | ) 27 | 28 | # ... use the extracted data ... 29 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/optimizations/optimization_speed.py: -------------------------------------------------------------------------------- 1 | # Example of optimizing extraction for speed 2 | 3 | import os 4 | 5 | from aiolimiter import AsyncLimiter 6 | 7 | from contextgem import Document, DocumentLLM 8 | 9 | # Define document 10 | document = Document( 11 | raw_text="document_text", 12 | # aspects=[Aspect(...), ...], 13 | # concepts=[Concept(...), ...], 14 | ) 15 | 16 | # Define LLM with a fallback model 17 | llm = DocumentLLM( 18 | model="openai/gpt-4o-mini", 19 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 20 | async_limiter=AsyncLimiter( 21 | 10, 5 22 | ), # e.g. 10 acquisitions per 5-second period; adjust to your LLM API setup 23 | fallback_llm=DocumentLLM( 24 | model="openai/gpt-3.5-turbo", 25 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), 26 | is_fallback=True, 27 | async_limiter=AsyncLimiter( 28 | 20, 5 29 | ), # e.g. 20 acquisitions per 5-second period; adjust to your LLM API setup 30 | ), 31 | ) 32 | 33 | # Use the LLM for extraction with concurrency enabled 34 | llm.extract_all(document, use_concurrency=True) 35 | 36 | # ... use the extracted data ... 
37 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/quickstart/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_aspect.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting aspect from a document 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Example document instance 8 | # Document content is shortened for brevity 9 | doc = Document( 10 | raw_text=( 11 | "Consultancy Agreement\n" 12 | "This agreement between Company A (Supplier) and Company B (Customer)...\n" 13 | "The term of the agreement is 1 year from the Effective Date...\n" 14 | "The Supplier shall provide consultancy services as described in Annex 2...\n" 15 | "The Customer shall pay the Supplier within 30 calendar days of receiving an invoice...\n" 16 | "This agreement is governed by the laws of Norway...\n" 17 | ), 18 | ) 19 | 20 | # Define an aspect with optional concept(s), using natural language 21 | doc_aspect = Aspect( 22 | name="Governing law", 23 | description="Clauses defining the governing law of the agreement", 24 | reference_depth="sentences", 25 | ) 26 | 27 | # Add aspects to the document 28 | doc.add_aspects([doc_aspect]) 29 | # (add more aspects to the document, if needed) 30 | 31 | # Create an LLM for extraction 32 | llm = DocumentLLM( 33 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 34 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 35 | ) 36 | 37 | # Extract information from the document 38 | extracted_aspects = llm.extract_aspects_from_document(doc) 39 | # or use async version llm.extract_aspects_from_document_async(doc) 40 | 41 | # Access extracted information 42 | print("Governing law aspect:") 43 | print( 44 | extracted_aspects[0].extracted_items 45 | ) # extracted aspect items with references to sentences 46 | # or doc.get_aspect_by_name("Governing law").extracted_items 47 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_concept_aspect.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting a concept from an aspect 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM, StringConcept, StringExample 6 | 7 | # Example document instance 8 | # Document content is shortened for brevity 9 | doc = Document( 10 | raw_text=( 11 | "Employment Agreement\n" 12 | "This agreement between TechCorp Inc. 
(Employer) and Jane Smith (Employee)...\n" 13 | "The employment shall commence on January 15, 2023 and continue until terminated...\n" 14 | "The Employee shall work as a Senior Software Engineer reporting to the CTO...\n" 15 | "The Employee shall receive an annual salary of $120,000 paid monthly...\n" 16 | "The Employee is entitled to 20 days of paid vacation per year...\n" 17 | "The Employee agrees to a notice period of 30 days for resignation...\n" 18 | "This agreement is governed by the laws of California...\n" 19 | ), 20 | ) 21 | 22 | # Define an aspect with a specific concept, using natural language 23 | doc_aspect = Aspect( 24 | name="Compensation", 25 | description="Clauses defining the compensation and benefits for the employee", 26 | reference_depth="sentences", 27 | ) 28 | 29 | # Define a concept within the aspect 30 | aspect_concept = StringConcept( 31 | name="Annual Salary", 32 | description="The annual base salary amount specified in the employment agreement", 33 | examples=[ # optional 34 | StringExample( 35 | content="$X per year", # guidance regarding format 36 | ) 37 | ], 38 | add_references=True, 39 | reference_depth="sentences", 40 | ) 41 | 42 | # Add the concept to the aspect 43 | doc_aspect.add_concepts([aspect_concept]) 44 | # (add more concepts to the aspect, if needed) 45 | 46 | # Add the aspect to the document 47 | doc.add_aspects([doc_aspect]) 48 | # (add more aspects to the document, if needed) 49 | 50 | # Create an LLM for extraction 51 | llm = DocumentLLM( 52 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 53 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 54 | ) 55 | 56 | # Extract information from the document 57 | doc = llm.extract_all(doc) 58 | # or use async version llm.extract_all_async(doc) 59 | 60 | # Access extracted information in the document object 61 | print("Compensation aspect:") 62 | print( 63 | doc.get_aspect_by_name("Compensation").extracted_items 64 | ) # extracted aspect items with references to sentences 65 | print("Annual Salary concept:") 66 | print( 67 | doc.get_aspect_by_name("Compensation") 68 | .get_concept_by_name("Annual Salary") 69 | .extracted_items 70 | ) # extracted concept items with references to sentences 71 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_concept_document_text.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting a concept from a document 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, JsonObjectConcept, JsonObjectExample 6 | 7 | # Example document instance 8 | # Document content is shortened for brevity 9 | doc = Document( 10 | raw_text=( 11 | "Statement of Work\n" 12 | "Project: Cloud Migration Initiative\n" 13 | "Client: Acme Corporation\n" 14 | "Contractor: TechSolutions Inc.\n\n" 15 | "Project Timeline:\n" 16 | "Start Date: March 1, 2025\n" 17 | "End Date: August 31, 2025\n\n" 18 | "Deliverables:\n" 19 | "1. Infrastructure assessment report (Due: March 15, 2025)\n" 20 | "2. Migration strategy document (Due: April 10, 2025)\n" 21 | "3. Test environment setup (Due: May 20, 2025)\n" 22 | "4. Production migration (Due: July 15, 2025)\n" 23 | "5. Post-migration support (Due: August 31, 2025)\n\n" 24 | "Budget: $250,000\n" 25 | "Payment Schedule: 20% upfront, 30% at midpoint, 50% upon completion\n" 26 | ), 27 | ) 28 | 29 | # Define a document-level concept using e.g. 
JsonObjectConcept 30 | # This will extract structured data from the entire document 31 | doc_concept = JsonObjectConcept( 32 | name="Project Details", 33 | description="Key project information including timeline, deliverables, and budget", 34 | structure={ 35 | "project_name": str, 36 | "client": str, 37 | "contractor": str, 38 | "budget": str, 39 | "payment_terms": str, 40 | }, # simply use a dictionary with type hints (including generic aliases and union types) 41 | add_references=True, 42 | reference_depth="paragraphs", 43 | ) 44 | 45 | # Add the concept to the document 46 | doc.add_concepts([doc_concept]) 47 | # (add more concepts to the document, if needed) 48 | 49 | # Create an LLM for extraction 50 | llm = DocumentLLM( 51 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 52 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 53 | ) 54 | 55 | # Extract information from the document 56 | extracted_concepts = llm.extract_concepts_from_document(doc) 57 | # or use async version llm.extract_concepts_from_document_async(doc) 58 | 59 | # Access extracted information 60 | print("Project Details:") 61 | print( 62 | extracted_concepts[0].extracted_items 63 | ) # extracted concept items with references to paragraphs 64 | # Or doc.get_concept_by_name("Project Details").extracted_items 65 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_concept_document_vision.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting concept from a document with an image 2 | 3 | import os 4 | from pathlib import Path 5 | 6 | from contextgem import Document, DocumentLLM, Image, NumericalConcept, image_to_base64 7 | 8 | # Path adapted for testing 9 | current_file = Path(__file__).resolve() 10 | root_path = current_file.parents[4] 11 | image_path = root_path / "tests" / "invoices" / "invoice.jpg" 12 | 13 | # Create an image instance 14 | doc_image = Image(mime_type="image/jpg", base64_data=image_to_base64(image_path)) 15 | 16 | # Example document instance holding only the image 17 | doc = Document( 18 | images=[doc_image], # may contain multiple images 19 | ) 20 | 21 | # Define a concept to extract the invoice total amount 22 | doc_concept = NumericalConcept( 23 | name="Invoice Total", 24 | description="The total amount to be paid as shown on the invoice", 25 | numeric_type="float", 26 | llm_role="extractor_vision", # use vision model 27 | ) 28 | 29 | # Add concept to the document 30 | doc.add_concepts([doc_concept]) 31 | # (add more concepts to the document, if needed) 32 | 33 | # Create an LLM for extraction 34 | llm = DocumentLLM( 35 | model="openai/gpt-4o-mini", # Using a model with vision capabilities 36 | api_key=os.environ.get("CONTEXTGEM_OPENAI_API_KEY"), # your API key 37 | role="extractor_vision", # mark LLM as vision model 38 | ) 39 | 40 | # Extract information from the document 41 | extracted_concepts = llm.extract_concepts_from_document(doc) 42 | # or use async version: await llm.extract_concepts_from_document_async(doc) 43 | 44 | # Access extracted information 45 | print("Invoice Total:") 46 | print(extracted_concepts[0].extracted_items) # extracted concept items 47 | # or doc.get_concept_by_name("Invoice Total").extracted_items 48 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/quickstart/quickstart_sub_aspect.py: 
-------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting an aspect with sub-aspects 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Sample document (content shortened for brevity) 8 | contract_text = """ 9 | EMPLOYMENT AGREEMENT 10 | ... 11 | 8. TERMINATION 12 | 8.1 Termination by the Company. The Company may terminate the Employee's employment for Cause at any time upon written notice. 13 | "Cause" shall mean: (i) Employee's material breach of this Agreement; (ii) Employee's conviction of a felony; or 14 | (iii) Employee's willful misconduct that causes material harm to the Company. 15 | 8.2 Termination by the Employee. The Employee may terminate employment for Good Reason upon 30 days' written notice to the Company. 16 | "Good Reason" shall mean a material reduction in Employee's base salary or a material diminution in Employee's duties. 17 | 8.3 Severance. If the Employee's employment is terminated by the Company without Cause or by the Employee for Good Reason, 18 | the Employee shall be entitled to receive severance pay equal to six (6) months of the Employee's base salary. 19 | ... 20 | """ 21 | 22 | doc = Document(raw_text=contract_text) 23 | 24 | # Define termination aspect with practical sub-aspects 25 | termination_aspect = Aspect( 26 | name="Termination", 27 | description="Provisions related to the termination of employment", 28 | aspects=[ # assign sub-aspects (optional) 29 | Aspect( 30 | name="Company Termination Rights", 31 | description="Conditions under which the company can terminate employment", 32 | ), 33 | Aspect( 34 | name="Employee Termination Rights", 35 | description="Conditions under which the employee can terminate employment", 36 | ), 37 | Aspect( 38 | name="Severance Terms", 39 | description="Compensation or benefits provided upon termination", 40 | ), 41 | ], 42 | ) 43 | 44 | # Add the aspect to the document. Sub-aspects are added with the parent aspect. 45 | doc.add_aspects([termination_aspect]) 46 | # (add more aspects to the document, if needed) 47 | 48 | # Create an LLM for extraction 49 | llm = DocumentLLM( 50 | model="openai/gpt-4o-mini", # or any other LLM from e.g. Anthropic, etc. 51 | api_key=os.environ.get( 52 | "CONTEXTGEM_OPENAI_API_KEY" 53 | ), # your API key of the LLM provider 54 | ) 55 | 56 | # Extract all information from the document 57 | doc = llm.extract_all(doc) 58 | 59 | # Get results with references in the document object 60 | print("\nTermination aspect:\n") 61 | termination_aspect = doc.get_aspect_by_name("Termination") 62 | for sub_aspect in termination_aspect.aspects: 63 | print(sub_aspect.name) 64 | for item in sub_aspect.extracted_items: 65 | print(item.value) 66 | print("\n") 67 | -------------------------------------------------------------------------------- /dev/usage_examples/docs/serialization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docs/serialization/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docs/serialization/serialization.py: -------------------------------------------------------------------------------- 1 | # Example of serializing and deserializing ContextGem document, 2 | # document pipeline, and LLM config. 
3 | 4 | import os 5 | from pathlib import Path 6 | 7 | from contextgem import ( 8 | Aspect, 9 | BooleanConcept, 10 | Document, 11 | DocumentLLM, 12 | DocumentPipeline, 13 | DocxConverter, 14 | StringConcept, 15 | ) 16 | 17 | # Create a document object 18 | converter = DocxConverter() 19 | docx_path = str( 20 | Path(__file__).resolve().parents[4] 21 | / "tests" 22 | / "docx_files" 23 | / "en_nda_with_anomalies.docx" 24 | ) # your file path here (Path adapted for testing) 25 | doc = converter.convert(docx_path, strict_mode=True) 26 | 27 | # Create a document pipeline 28 | document_pipeline = DocumentPipeline( 29 | aspects=[ 30 | Aspect( 31 | name="Categories of confidential information", 32 | description="Clauses describing confidential information covered by the NDA", 33 | concepts=[ 34 | StringConcept( 35 | name="Types of disclosure", 36 | description="Types of disclosure of confidential information", 37 | ), 38 | # ... 39 | ], 40 | ), 41 | # ... 42 | ], 43 | concepts=[ 44 | BooleanConcept( 45 | name="Is mutual", 46 | description="Whether the NDA is mutual (both parties act as discloser/recipient)", 47 | add_justifications=True, 48 | ), 49 | # ... 50 | ], 51 | ) 52 | 53 | # Attach the pipeline to the document 54 | doc.assign_pipeline(document_pipeline) 55 | 56 | # Configure a document LLM with your API parameters 57 | llm = DocumentLLM( 58 | model="azure/gpt-4.1-mini", 59 | api_key=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_KEY"), 60 | api_version=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_VERSION"), 61 | api_base=os.getenv("CONTEXTGEM_AZURE_OPENAI_API_BASE"), 62 | ) 63 | 64 | # Extract data from the document 65 | doc = llm.extract_all(doc) 66 | 67 | # Serialize the LLM config, pipeline and document 68 | llm_config_json = llm.to_json() # or to_dict() / to_disk() 69 | document_pipeline_json = document_pipeline.to_json() # or to_dict() / to_disk() 70 | processed_doc_json = doc.to_json() # or to_dict() / to_disk() 71 | 72 | # Deserialize the LLM config, pipeline and document 73 | llm_deserialized = DocumentLLM.from_json( 74 | llm_config_json 75 | ) # or from_dict() / from_disk() 76 | document_pipeline_deserialized = DocumentPipeline.from_json( 77 | document_pipeline_json 78 | ) # or from_dict() / from_disk() 79 | processed_doc_deserialized = Document.from_json( 80 | processed_doc_json 81 | ) # or from_dict() / from_disk() 82 | 83 | # All extracted data is preserved! 
84 | assert processed_doc_deserialized.aspects[0].concepts[0].extracted_items 85 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/aspects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/aspects/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/aspects/def_aspect.py: -------------------------------------------------------------------------------- 1 | from contextgem import Aspect 2 | 3 | # Define an aspect focused on termination clauses 4 | termination_aspect = Aspect( 5 | name="Termination provisions", 6 | description="Contract termination conditions, notice requirements, and severance terms.", 7 | reference_depth="sentences", 8 | add_justifications=True, 9 | justification_depth="comprehensive", 10 | ) 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/concepts/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_boolean_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import BooleanConcept 2 | 3 | # Create the concept with specific configuration 4 | has_confidentiality = BooleanConcept( 5 | name="Contains confidentiality clause", 6 | description="Determines whether the contract includes provisions requiring parties to maintain confidentiality", 7 | llm_role="reasoner_text", 8 | singular_occurrence=True, 9 | add_justifications=True, 10 | justification_depth="brief", 11 | ) 12 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_date_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import DateConcept 2 | 3 | # Create a date concept to extract the effective date of the contract 4 | effective_date = DateConcept( 5 | name="Effective date", 6 | description="The effective date as specified in the contract", 7 | add_references=True, # Include references to where dates were found 8 | singular_occurrence=True, # Only extract one effective date per document 9 | ) 10 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_json_object_concept.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from contextgem import JsonObjectConcept 4 | 5 | # Define a JSON object concept for capturing address information 6 | address_info_concept = JsonObjectConcept( 7 | name="Address information", 8 | description=( 9 | "Structured address data from text
including street, " 10 | "city, state, postal code, and country." 11 | ), 12 | structure={ 13 | "street": str | None, 14 | "city": str | None, 15 | "state": str | None, 16 | "postal_code": str | None, 17 | "country": str | None, 18 | "address_type": Literal["residential", "business"] | None, 19 | }, 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_label_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import LabelConcept 2 | 3 | # Multi-class classification: single label selection 4 | document_type_concept = LabelConcept( 5 | name="Document Type", 6 | description="Classify the type of legal document", 7 | labels=["NDA", "Consultancy Agreement", "Privacy Policy", "Other"], 8 | classification_type="multi_class", 9 | singular_occurrence=True, 10 | ) 11 | 12 | # Multi-label classification: multiple label selection 13 | content_topics_concept = LabelConcept( 14 | name="Content Topics", 15 | description="Identify all relevant topics covered in the document", 16 | labels=["Finance", "Legal", "Technology", "HR", "Operations", "Marketing"], 17 | classification_type="multi_label", 18 | add_justifications=True, 19 | justification_depth="brief", # add justifications for the selected labels 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_numerical_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import NumericalConcept 2 | 3 | # Create concepts for different numerical values in the contract 4 | payment_amount = NumericalConcept( 5 | name="Payment amount", 6 | description="The monetary value to be paid according to the contract terms", 7 | numeric_type="float", 8 | llm_role="extractor_text", 9 | add_references=True, 10 | reference_depth="sentences", 11 | ) 12 | 13 | payment_days = NumericalConcept( 14 | name="Payment term days", 15 | description="The number of days within which payment must be made", 16 | numeric_type="int", 17 | llm_role="extractor_text", 18 | add_justifications=True, 19 | justification_depth="balanced", 20 | ) 21 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_rating_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import RatingConcept, RatingScale 2 | 3 | # Create a rating scale for contract fairness evaluation 4 | fairness_scale = RatingScale(start=1, end=5) 5 | 6 | # Create a concept to rate the fairness of contract terms 7 | fairness_rating = RatingConcept( 8 | name="Contract fairness rating", 9 | description="Evaluation of how balanced and fair the contract terms are for all parties", 10 | rating_scale=fairness_scale, 11 | llm_role="reasoner_text", 12 | add_justifications=True, 13 | justification_depth="comprehensive", 14 | justification_max_sents=10, 15 | ) 16 | 17 | # Create a clarity scale for contract language evaluation 18 | clarity_scale = RatingScale(start=1, end=10) 19 | 20 | # Create a concept to rate the clarity of contract language 21 | clarity_rating = RatingConcept( 22 | name="Language clarity rating", 23 | description="Assessment of how clear and unambiguous the contract language is", 24 | rating_scale=clarity_scale, 25 | llm_role="reasoner_text", 26 | add_justifications=True, 27 | justification_depth="balanced", 28 | 
justification_max_sents=3, 29 | ) 30 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/concepts/def_string_concept.py: -------------------------------------------------------------------------------- 1 | from contextgem import StringConcept, StringExample 2 | 3 | # Define a string concept for identifying contract party names 4 | # and their roles in the contract 5 | party_names_and_roles_concept = StringConcept( 6 | name="Party names and roles", 7 | description=( 8 | "Names of all parties entering into the agreement " 9 | "and their contractual roles" 10 | ), 11 | examples=[ 12 | StringExample( 13 | content="X (Client)", # guidance regarding format 14 | ) 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/data_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/data_models/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/data_models/def_llm_pricing.py: -------------------------------------------------------------------------------- 1 | from contextgem import LLMPricing 2 | 3 | # Create a pricing model for an LLM (openai/o3-mini example) 4 | pricing = LLMPricing( 5 | input_per_1m_tokens=1.10, # $1.10 per million input tokens 6 | output_per_1m_tokens=4.40, # $4.40 per million output tokens 7 | ) 8 | 9 | # LLMPricing objects are immutable 10 | try: 11 | pricing.input_per_1m_tokens = 0.7 12 | except ValueError as e: 13 | print(f"Error when trying to modify pricing: {e}") 14 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/data_models/def_rating_scale.py: -------------------------------------------------------------------------------- 1 | from contextgem import RatingScale 2 | 3 | # Create a rating scale with default values (0 to 10) 4 | default_scale = RatingScale() 5 | 6 | # Create a custom rating scale (1 to 5) 7 | custom_scale = RatingScale( 8 | start=1, 9 | end=5, 10 | ) 11 | 12 | # RatingScale objects are immutable 13 | try: 14 | custom_scale.end = 7 15 | except ValueError as e: 16 | print(f"Error when trying to modify rating scale: {e}") 17 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/documents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/documents/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/documents/def_document.py: -------------------------------------------------------------------------------- 1 | from contextgem import Document 2 | 3 | # Create a document with raw text content 4 | contract_document = Document( 5 | raw_text=( 6 | "...This agreement is effective as of January 1, 2025.\n\n" 7 | "All parties must comply with the terms outlined herein. The terms include " 8 | "monthly reporting requirements and quarterly performance reviews.\n\n" 9 | "Failure to adhere to these terms may result in termination of the agreement. 
" 10 | "Additionally, any breach of confidentiality will be subject to penalties as " 11 | "described in this agreement.\n\n" 12 | "This agreement shall remain in force for a period of three (3) years unless " 13 | "otherwise terminated according to the provisions stated above..." 14 | ), 15 | paragraph_segmentation_mode="newlines", # Default mode, splits on newlines 16 | ) 17 | 18 | # Create a document with more advanced paragraph segmentation using a SaT model 19 | report_document = Document( 20 | raw_text=( 21 | "Executive Summary " 22 | "This report outlines our quarterly performance. " 23 | "Revenue increased by [15%] compared to the previous quarter.\n\n" 24 | "Customer satisfaction metrics show positive trends across all regions..." 25 | ), 26 | paragraph_segmentation_mode="sat", # Use SaT model for intelligent paragraph segmentation 27 | sat_model_id="sat-3l-sm", # Specify which SaT model to use 28 | ) 29 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/examples/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/examples/def_example_json_object.py: -------------------------------------------------------------------------------- 1 | from contextgem import JsonObjectConcept, JsonObjectExample 2 | 3 | # Create a JSON object example 4 | json_example = JsonObjectExample( 5 | content={ 6 | "name": "John Doe", 7 | "education": "Bachelor's degree in Computer Science", 8 | "skills": ["Python", "Machine Learning", "Data Analysis"], 9 | "hobbies": ["Reading", "Traveling", "Gaming"], 10 | } 11 | ) 12 | 13 | 14 | # Define a structure for JSON object concept 15 | class PersonInfo: 16 | name: str 17 | education: str 18 | skills: list[str] 19 | hobbies: list[str] 20 | 21 | 22 | # Also works as a dict with type hints, e.g. 
23 | # PersonInfo = { 24 | # "name": str, 25 | # "education": str, 26 | # "skills": list[str], 27 | # "hobbies": list[str], 28 | # } 29 | 30 | # Attach JSON example to a JsonObjectConcept 31 | json_concept = JsonObjectConcept( 32 | name="Candidate info", 33 | description="Structured information about a job candidate", 34 | structure=PersonInfo, # Define the expected structure 35 | examples=[json_example], # Attach the example to the concept (optional) 36 | ) 37 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/examples/def_example_string.py: -------------------------------------------------------------------------------- 1 | from contextgem import StringConcept, StringExample 2 | 3 | # Create string examples 4 | string_examples = [ 5 | StringExample(content="X (Client)"), 6 | StringExample(content="Y (Supplier)"), 7 | ] 8 | 9 | # Attach string examples to a StringConcept 10 | string_concept = StringConcept( 11 | name="Contract party name and role", 12 | description="The name and role of the contract party", 13 | examples=string_examples, # Attach the example to the concept (optional) 14 | ) 15 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/images/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/images/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/images/def_image.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from contextgem import Document, Image, image_to_base64 4 | 5 | # Path is adapted for doc tests 6 | current_file = Path(__file__).resolve() 7 | root_path = current_file.parents[4] 8 | 9 | # Using the utility function to convert an image file to base64 10 | image_path = root_path / "tests" / "invoices" / "invoice.jpg" 11 | base64_data = image_to_base64(image_path) 12 | 13 | # Create an image instance with the base64-encoded data 14 | jpg_image = Image(mime_type="image/jpg", base64_data=base64_data) 15 | 16 | # Using pre-encoded base64 data directly 17 | png_image = Image( 18 | mime_type="image/png", base64_data="base64-string" # image as a base64 string 19 | ) 20 | 21 | # Using a different supported image format 22 | webp_image = Image( 23 | mime_type="image/webp", 24 | base64_data=image_to_base64(root_path / "tests" / "invoices" / "invoice.webp"), 25 | ) 26 | 27 | # Attaching an image to a document 28 | # Documents can contain both text and multiple images, or just images 29 | 30 | # Create a document with text content 31 | text_document = Document( 32 | raw_text="This is a document with an attached image that shows an invoice.", 33 | images=[jpg_image], 34 | ) 35 | 36 | # Create a document with only image content (no text) 37 | image_only_document = Document(images=[jpg_image]) 38 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/llms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/llms/__init__.py -------------------------------------------------------------------------------- 
/dev/usage_examples/docstrings/llms/def_llm.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, LLMPricing 2 | 3 | # Create a single LLM for text extraction 4 | text_extractor = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="your-api-key", # Replace with your actual API key 7 | role="extractor_text", # Role for text extraction 8 | pricing_details=LLMPricing( # optional 9 | input_per_1m_tokens=0.150, output_per_1m_tokens=0.600 10 | ), 11 | ) 12 | 13 | # Create a fallback LLM in case the primary model fails 14 | fallback_text_extractor = DocumentLLM( 15 | model="anthropic/claude-3-7-sonnet", 16 | api_key="your-anthropic-api-key", # Replace with your actual API key 17 | role="extractor_text", # must be the same as the role of the primary LLM 18 | is_fallback=True, 19 | pricing_details=LLMPricing( # optional 20 | input_per_1m_tokens=3.00, output_per_1m_tokens=15.00 21 | ), 22 | ) 23 | # Assign the fallback LLM to the primary LLM 24 | text_extractor.fallback_llm = fallback_text_extractor 25 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/llms/def_llm_group.py: -------------------------------------------------------------------------------- 1 | from contextgem import DocumentLLM, DocumentLLMGroup 2 | 3 | # Create a text extractor LLM with a fallback 4 | text_extractor = DocumentLLM( 5 | model="openai/gpt-4o-mini", 6 | api_key="your-openai-api-key", # Replace with your actual API key 7 | role="extractor_text", 8 | ) 9 | 10 | # Create a fallback LLM for the text extractor 11 | text_extractor_fallback = DocumentLLM( 12 | model="anthropic/claude-3-5-haiku", 13 | api_key="your-anthropic-api-key", # Replace with your actual API key 14 | role="extractor_text", # Must have the same role as the primary LLM 15 | is_fallback=True, 16 | ) 17 | 18 | # Assign the fallback LLM to the primary text extractor 19 | text_extractor.fallback_llm = text_extractor_fallback 20 | 21 | # Create a text reasoner LLM 22 | text_reasoner = DocumentLLM( 23 | model="openai/o3-mini", 24 | api_key="your-openai-api-key", # Replace with your actual API key 25 | role="reasoner_text", # For more complex tasks that require reasoning 26 | ) 27 | 28 | # Create a vision extractor LLM 29 | vision_extractor = DocumentLLM( 30 | model="openai/gpt-4o-mini", 31 | api_key="your-openai-api-key", # Replace with your actual API key 32 | role="extractor_vision", # For handling images 33 | ) 34 | 35 | # Create a vision reasoner LLM 36 | vision_reasoner = DocumentLLM( 37 | model="openai/gpt-4o", 38 | api_key="your-openai-api-key", 39 | role="reasoner_vision", # For more complex vision tasks that require reasoning 40 | ) 41 | 42 | # Create a DocumentLLMGroup with all four LLMs 43 | llm_group = DocumentLLMGroup( 44 | llms=[text_extractor, text_reasoner, vision_extractor, vision_reasoner], 45 | output_language="en", # All LLMs must have the same output language ("en" is default) 46 | ) 47 | # This group will have 5 LLMs: four main ones, with different roles, 48 | # and one fallback LLM for a specific LLM. Each LLM can have a fallback LLM. 
49 | 50 | # Get usage statistics for the whole group or for a specific role 51 | group_usage = llm_group.get_usage() 52 | text_extractor_usage = llm_group.get_usage(llm_role="extractor_text") 53 | 54 | # Get cost statistics for the whole group or for a specific role 55 | all_costs = llm_group.get_cost() 56 | text_extractor_cost = llm_group.get_cost(llm_role="extractor_text") 57 | 58 | # Reset usage and cost statistics for the whole group or for a specific role 59 | llm_group.reset_usage_and_cost() 60 | llm_group.reset_usage_and_cost(llm_role="extractor_text") 61 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/paragraphs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/paragraphs/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/paragraphs/def_paragraph.py: -------------------------------------------------------------------------------- 1 | from contextgem import Paragraph 2 | 3 | # Create a paragraph with raw text content 4 | contract_paragraph = Paragraph( 5 | raw_text=( 6 | "This agreement is effective as of January 1, 2025. " 7 | "All parties must comply with the terms outlined herein. " 8 | "Failure to adhere to these terms may result in termination of the agreement." 9 | ) 10 | ) 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/pipelines/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/pipelines/def_pipeline.py: -------------------------------------------------------------------------------- 1 | from contextgem import ( 2 | Aspect, 3 | BooleanConcept, 4 | DateConcept, 5 | Document, 6 | DocumentPipeline, 7 | StringConcept, 8 | ) 9 | 10 | # Create a pipeline for NDA (Non-Disclosure Agreement) review 11 | nda_pipeline = DocumentPipeline( 12 | aspects=[ 13 | Aspect( 14 | name="Confidential information", 15 | description="Clauses defining the confidential information", 16 | ), 17 | Aspect( 18 | name="Exclusions", 19 | description="Clauses defining exclusions from confidential information", 20 | ), 21 | Aspect( 22 | name="Obligations", 23 | description="Clauses defining confidentiality obligations", 24 | ), 25 | Aspect( 26 | name="Liability", 27 | description="Clauses defining liability for breach of the agreement", 28 | ), 29 | # ... Add more aspects as needed 30 | ], 31 | concepts=[ 32 | StringConcept( 33 | name="Anomaly", 34 | description="Anomaly in the contract, e.g. 
out-of-context or nonsensical clauses", 35 | llm_role="reasoner_text", 36 | add_references=True, # Add references to the source text 37 | reference_depth="sentences", # Reference to the sentence level 38 | add_justifications=True, # Add justifications for the anomaly 39 | justification_depth="balanced", # Justification at the sentence level 40 | justification_max_sents=5, # Maximum number of sentences in the justification 41 | ), 42 | BooleanConcept( 43 | name="Is mutual", 44 | description="Whether the NDA is mutual (bidirectional) or one-way", 45 | singular_occurrence=True, 46 | llm_role="reasoner_text", # Use the reasoner role for this concept 47 | ), 48 | DateConcept( 49 | name="Effective date", 50 | description="The date when the NDA agreement becomes effective", 51 | singular_occurrence=True, 52 | ), 53 | StringConcept( 54 | name="Term", 55 | description="The term of the NDA", 56 | ), 57 | StringConcept( 58 | name="Governing law", 59 | description="The governing law of the agreement", 60 | singular_occurrence=True, 61 | ), 62 | # ... Add more concepts as needed 63 | ], 64 | ) 65 | 66 | # Assign the pipeline to the NDA document 67 | nda_document = Document(raw_text="[NDA text]") 68 | nda_document.assign_pipeline(nda_pipeline) 69 | 70 | # Now the document is ready for processing with the NDA review pipeline! 71 | # The document can be processed to extract the defined aspects and concepts 72 | 73 | # Extract all aspects and concepts from the NDA using an LLM group 74 | # with LLMs with roles "extractor_text" and "reasoner_text". 75 | # llm_group.extract_all(nda_document) 76 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/sentences/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/sentences/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/sentences/def_sentence.py: -------------------------------------------------------------------------------- 1 | from contextgem import Sentence 2 | 3 | # Create a sentence with raw text content 4 | sentence = Sentence(raw_text="This is a simple sentence.") 5 | 6 | # Sentences are immutable - their content cannot be changed after creation 7 | try: 8 | sentence.raw_text = "Attempting to modify the sentence." 
9 | except ValueError as e: 10 | print(f"Error when trying to modify sentence: {e}") 11 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/docstrings/utils/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/utils/json_object_cls_struct.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from contextgem import JsonObjectClassStruct, JsonObjectConcept 4 | 5 | 6 | @dataclass 7 | class Address(JsonObjectClassStruct): 8 | street: str 9 | city: str 10 | country: str 11 | 12 | 13 | @dataclass 14 | class Contact(JsonObjectClassStruct): 15 | email: str 16 | phone: str 17 | address: Address 18 | 19 | 20 | @dataclass 21 | class Person(JsonObjectClassStruct): 22 | name: str 23 | age: int 24 | contact: Contact 25 | 26 | 27 | # Use the class structure with JsonObjectConcept 28 | # JsonObjectClassStruct enables automatic conversion of typed class hierarchies 29 | # into the dictionary structure required by JsonObjectConcept, preserving the 30 | # type information and nested relationships between classes. 31 | JsonObjectConcept(name="person", description="Person information", structure=Person) 32 | -------------------------------------------------------------------------------- /dev/usage_examples/docstrings/utils/reload_logger_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from contextgem import reload_logger_settings 4 | 5 | # Initial logger settings are loaded from environment variables at import time 6 | 7 | # Change logger level to WARNING 8 | os.environ["CONTEXTGEM_LOGGER_LEVEL"] = "WARNING" 9 | print("Setting logger level to WARNING") 10 | reload_logger_settings() 11 | # Now the logger will only show WARNING level and above messages 12 | 13 | # Disable the logger completely 14 | os.environ["CONTEXTGEM_DISABLE_LOGGER"] = "True" 15 | print("Disabling the logger") 16 | reload_logger_settings() 17 | # Now the logger is disabled and won't show any messages 18 | 19 | # You can re-enable the logger by setting CONTEXTGEM_DISABLE_LOGGER to "False" 20 | # os.environ["CONTEXTGEM_DISABLE_LOGGER"] = "False" 21 | # reload_logger_settings() 22 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/readme/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/readme/docx_converter.py: -------------------------------------------------------------------------------- 1 | # Using ContextGem's DocxConverter 2 | 3 | from contextgem import DocxConverter 4 | 5 | converter = DocxConverter() 6 | 7 | # Convert a DOCX file to an LLM-ready ContextGem Document 8 | # from path 9 | document = converter.convert("path/to/document.docx") 10 | # or from file object 11 | with open("path/to/document.docx", "rb") as docx_file_object: 12 | document = converter.convert(docx_file_object) 13 | 14 | # You can also use it as a 
standalone text extractor 15 | docx_text = converter.convert_to_text_format( 16 | "path/to/document.docx", 17 | output_format="markdown", # or "raw" 18 | ) 19 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/llm_chat.py: -------------------------------------------------------------------------------- 1 | # Using LLMs for chat (text + vision), with fallback LLM support 2 | 3 | import os 4 | 5 | from contextgem import DocumentLLM 6 | 7 | # from contextgem import Image 8 | 9 | main_model = DocumentLLM( 10 | model="openai/gpt-4o", # or another provider/model 11 | api_key=os.getenv("CONTEXTGEM_OPENAI_API_KEY"), # your API key for the LLM provider 12 | ) 13 | 14 | # Optional: fallback LLM 15 | fallback_model = DocumentLLM( 16 | model="openai/gpt-4o-mini", # or another provider/model 17 | api_key=os.getenv("CONTEXTGEM_OPENAI_API_KEY"), # your API key for the LLM provider 18 | is_fallback=True, 19 | ) 20 | main_model.fallback_llm = fallback_model 21 | 22 | response = main_model.chat( 23 | "Hello", 24 | # images=[Image(...)] 25 | ) 26 | # or `response = await main_model.chat_async(...)` 27 | 28 | print(response) 29 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/quickstart_aspect.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting payment terms from a document 2 | 3 | import os 4 | 5 | from contextgem import Aspect, Document, DocumentLLM 6 | 7 | # Sample document text (shortened for brevity) 8 | doc = Document( 9 | raw_text=( 10 | "SERVICE AGREEMENT\n" 11 | "SERVICES. Provider agrees to provide the following services to Client: " 12 | "Cloud-based data analytics platform access and maintenance...\n" 13 | "PAYMENT. Client agrees to pay $5,000 per month for the services. " 14 | "Payment is due on the 1st of each month. Late payments will incur a 2% fee per month...\n" 15 | "CONFIDENTIALITY. Both parties agree to keep all proprietary information confidential " 16 | "for a period of 5 years following termination of this Agreement..." 17 | ), 18 | ) 19 | 20 | # Define the aspects to extract 21 | doc.aspects = [ 22 | Aspect( 23 | name="Payment Terms", 24 | description="Payment terms and conditions in the contract", 25 | # see the docs for more configuration options, e.g. sub-aspects, concepts, etc. 26 | ), 27 | # Add more aspects as needed 28 | ] 29 | # Or use `doc.add_aspects([...])` 30 | 31 | # Define an LLM for extracting information from the document 32 | llm = DocumentLLM( 33 | model="openai/gpt-4o-mini", # or another provider/LLM 34 | api_key=os.environ.get( 35 | "CONTEXTGEM_OPENAI_API_KEY" 36 | ), # your API key for the LLM provider 37 | # see the docs for more configuration options 38 | ) 39 | 40 | # Extract information from the document 41 | doc = llm.extract_all(doc) # or use async version `await llm.extract_all_async(doc)` 42 | 43 | # Access extracted information in the document object 44 | for item in doc.aspects[0].extracted_items: 45 | print(f"• {item.value}") 46 | # or `doc.get_aspect_by_name("Payment Terms").extracted_items` 47 | 48 | # Output (exact paragraphs from the document): 49 | # • PAYMENT. Client agrees to pay $5,000 per month for the services. Payment is due on the 1st of each month. Late payments will incur a 2% fee per month... 
50 | -------------------------------------------------------------------------------- /dev/usage_examples/readme/quickstart_concept.py: -------------------------------------------------------------------------------- 1 | # Quick Start Example - Extracting anomalies from a document, with source references and justifications 2 | 3 | import os 4 | 5 | from contextgem import Document, DocumentLLM, StringConcept 6 | 7 | # Sample document text (shortened for brevity) 8 | doc = Document( 9 | raw_text=( 10 | "Consultancy Agreement\n" 11 | "This agreement between Company A (Supplier) and Company B (Customer)...\n" 12 | "The term of the agreement is 1 year from the Effective Date...\n" 13 | "The Supplier shall provide consultancy services as described in Annex 2...\n" 14 | "The Customer shall pay the Supplier within 30 calendar days of receiving an invoice...\n" 15 | "The purple elephant danced gracefully on the moon while eating ice cream.\n" # 💎 anomaly 16 | "Time-traveling dinosaurs will review all deliverables before acceptance.\n" # 💎 another anomaly 17 | "This agreement is governed by the laws of Norway...\n" 18 | ), 19 | ) 20 | 21 | # Attach a document-level concept 22 | doc.concepts = [ 23 | StringConcept( 24 | name="Anomalies", # in longer contexts, this concept is hard to capture with RAG 25 | description="Anomalies in the document", 26 | add_references=True, 27 | reference_depth="sentences", 28 | add_justifications=True, 29 | justification_depth="brief", 30 | # see the docs for more configuration options 31 | ) 32 | # add more concepts to the document, if needed 33 | # see the docs for available concepts: StringConcept, JsonObjectConcept, etc. 34 | ] 35 | # Or use `doc.add_concepts([...])` 36 | 37 | # Define an LLM for extracting information from the document 38 | llm = DocumentLLM( 39 | model="openai/gpt-4o-mini", # or another provider/LLM 40 | api_key=os.environ.get( 41 | "CONTEXTGEM_OPENAI_API_KEY" 42 | ), # your API key for the LLM provider 43 | # see the docs for more configuration options 44 | ) 45 | 46 | # Extract information from the document 47 | doc = llm.extract_all(doc) # or use async version `await llm.extract_all_async(doc)` 48 | 49 | # Access extracted information in the document object 50 | anomalies_concept = doc.concepts[0] 51 | # or `doc.get_concept_by_name("Anomalies")` 52 | for item in anomalies_concept.extracted_items: 53 | print(f"Anomaly:") 54 | print(f" {item.value}") 55 | print(f"Justification:") 56 | print(f" {item.justification}") 57 | print("Reference paragraphs:") 58 | for p in item.reference_paragraphs: 59 | print(f" - {p.raw_text}") 60 | print("Reference sentences:") 61 | for s in item.reference_sentences: 62 | print(f" - {s.raw_text}") 63 | print() 64 | -------------------------------------------------------------------------------- /dev/usage_examples/vs_other_frameworks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/vs_other_frameworks/__init__.py -------------------------------------------------------------------------------- /dev/usage_examples/vs_other_frameworks/advanced/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/vs_other_frameworks/advanced/__init__.py 
-------------------------------------------------------------------------------- /dev/usage_examples/vs_other_frameworks/basic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/dev/usage_examples/vs_other_frameworks/basic/__init__.py -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/contextgem_component_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_component_examples.png -------------------------------------------------------------------------------- /docs/source/_static/contextgem_how_it_works_infographics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_how_it_works_infographics.png -------------------------------------------------------------------------------- /docs/source/_static/contextgem_readme_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_readme_header.png -------------------------------------------------------------------------------- /docs/source/_static/contextgem_website_preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/contextgem_website_preview.png -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* Theme-specific image display */ 2 | html[data-theme="light"] .only-dark { 3 | display: none !important; 4 | } 5 | 6 | html[data-theme="dark"] .only-light { 7 | display: none !important; 8 | } 9 | -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_aspects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_aspects.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_boolean_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_boolean_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_date_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_date_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_json_object_concept.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_json_object_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_label_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_label_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_numerical_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_numerical_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_rating_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_rating_concept.png -------------------------------------------------------------------------------- /docs/source/_static/docs_preview_image_string_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/docs_preview_image_string_concept.png -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/readme_code_snippet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/readme_code_snippet.png -------------------------------------------------------------------------------- /docs/source/_static/tab_solid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/docs/source/_static/tab_solid.png -------------------------------------------------------------------------------- /docs/source/api/aspects.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Aspects API 19 | 20 | Aspects 21 | ======== 22 | 23 | .. automodule:: contextgem.public.aspects 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/concepts.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Concepts API 19 | 20 | Concepts 21 | ========= 22 | 23 | .. automodule:: contextgem.public.concepts 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/converters.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Converters API 19 | 20 | Converters 21 | =========== 22 | 23 | .. automodule:: contextgem.public.converters 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | -------------------------------------------------------------------------------- /docs/source/api/data_models.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | :og:description: ContextGem: Data Models API 19 | 20 | Data models 21 | ============ 22 | 23 | .. automodule:: contextgem.public.data_models 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/documents.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Documents API 19 | 20 | Documents 21 | ========== 22 | 23 | .. automodule:: contextgem.public.documents 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/examples.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Examples API 19 | 20 | Examples 21 | ========= 22 | 23 | .. automodule:: contextgem.public.examples 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/images.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | :og:description: ContextGem: Images API 19 | 20 | Images 21 | ======= 22 | 23 | .. automodule:: contextgem.public.images 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/llms.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: LLMs API 19 | 20 | LLMs 21 | ===== 22 | 23 | .. automodule:: contextgem.public.llms 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init 29 | :private-members: _update_default_prompt, _eq_deserialized_llm_config -------------------------------------------------------------------------------- /docs/source/api/paragraphs.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Paragraphs API 19 | 20 | Paragraphs 21 | =========== 22 | 23 | .. automodule:: contextgem.public.paragraphs 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/pipelines.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | :og:description: ContextGem: Pipelines API 19 | 20 | Pipelines 21 | ========== 22 | 23 | .. automodule:: contextgem.public.pipelines 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init 29 | -------------------------------------------------------------------------------- /docs/source/api/sentences.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Sentences API 19 | 20 | Sentences 21 | ========== 22 | 23 | .. automodule:: contextgem.public.sentences 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/api/utils.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Utility functions and classes 19 | 20 | Utility functions and classes 21 | ============================== 22 | 23 | .. automodule:: contextgem.public.utils 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :inherited-members: 28 | :exclude-members: model_config, model_post_init -------------------------------------------------------------------------------- /docs/source/concepts/supported_concepts.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | 18 | Supported Concepts 19 | =================== 20 | 21 | In ContextGem, Concepts are building blocks for defining the structured data you want to extract from documents. 22 | Each concept type is designed for different kinds of information, allowing you to build complex extraction schemas. 23 | 24 | Available Concept Types 25 | ------------------------ 26 | 27 | ContextGem provides several types of concepts, each tailored for specific extraction needs: 28 | 29 | - 📝 :doc:`StringConcept <string_concept>`: For extracting text values 30 | - ✅ :doc:`BooleanConcept <boolean_concept>`: For extracting boolean (True/False) values 31 | - 🔢 :doc:`NumericalConcept <numerical_concept>`: For extracting numerical values (integers or floats) 32 | - 📅 :doc:`DateConcept <date_concept>`: For extracting date objects 33 | - ⭐ :doc:`RatingConcept <rating_concept>`: For extracting numerical ratings within a defined scale 34 | - 📊 :doc:`JsonObjectConcept <json_object_concept>`: For extracting structured data with multiple fields 35 | - 🏷️ :doc:`LabelConcept <label_concept>`: For classification using predefined labels (multi-class or multi-label) 36 | 37 | This section provides detailed documentation for each concept type, including usage examples and best practices. 38 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Installation 19 | 20 | Installation 21 | ============ 22 | 23 | 🔧 Prerequisites 24 | ----------------- 25 | 26 | Before installing ContextGem, ensure you have: 27 | 28 | * Python 3.10-3.13 29 | * pip (Python package installer) 30 | 31 | 📦 Installation Methods 32 | ------------------------ 33 | 34 | From PyPI 35 | ~~~~~~~~~ 36 | 37 | The simplest way to install ContextGem is via pip: 38 | 39 | .. code-block:: bash 40 | 41 | pip install -U contextgem 42 | 43 | From Source 44 | ~~~~~~~~~~~ 45 | 46 | To install from source: 47 | 48 | .. code-block:: bash 49 | 50 | git clone https://github.com/shcherbak-ai/contextgem.git 51 | cd contextgem 52 | pip install -e . 53 | 54 | Development Installation 55 | ~~~~~~~~~~~~~~~~~~~~~~~~ 56 | 57 | For development, we use Poetry: 58 | 59 | .. code-block:: bash 60 | 61 | # Install poetry if you don't have it 62 | pip install poetry 63 | 64 | # Install dependencies including development extras 65 | poetry install --with dev 66 | 67 | # Activate the virtual environment 68 | poetry shell 69 | 70 | ✅ Verifying Installation 71 | -------------------------- 72 | 73 | To verify that ContextGem is installed correctly, run: 74 | 75 | .. code-block:: bash 76 | 77 | python -c "import contextgem; print(contextgem.__version__)" -------------------------------------------------------------------------------- /docs/source/llms/supported_llms.rst: -------------------------------------------------------------------------------- 1 | ..
2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Supported LLM Providers and Models 19 | 20 | 21 | Supported LLMs 22 | =============== 23 | 24 | ContextGem supports all LLM providers and models available through the LiteLLM integration. This means you can use models from major cloud providers like OpenAI, Anthropic, Google, and Azure, as well as run local models through providers like Ollama and LM Studio. 25 | 26 | ContextGem works with both types of LLM architectures: 27 | 28 | * Reasoning/CoT-capable models (e.g., ``openai/o4-mini``, ``ollama/deepseek-r1:32b``) 29 | * Non-reasoning models (e.g., ``openai/gpt-4.1``, ``ollama/llama3.1:8b``) 30 | 31 | For a complete list of supported providers, see the `LiteLLM Providers documentation <https://docs.litellm.ai/docs/providers>`_. 32 | 33 | 34 | ☁️ Cloud-based LLMs 35 | --------------------- 36 | 37 | You can initialize cloud-based LLMs by specifying the provider and model name in the format ``<provider>/<model_name>``: 38 | 39 | .. literalinclude:: ../../../dev/usage_examples/docs/llms/llm_init/llm_api.py 40 | :language: python 41 | :caption: Using cloud LLM providers 42 | 43 | 44 | 💻 Local LLMs 45 | --------------- 46 | 47 | For local LLMs, you'll need to specify the provider, model name, and the appropriate API base URL: 48 | 49 | .. literalinclude:: ../../../dev/usage_examples/docs/llms/llm_init/llm_local.py 50 | :language: python 51 | :caption: Using local LLM providers 52 | 53 | .. note:: 54 | **LM Studio Connection Error**: If you encounter a connection error (``litellm.APIError: APIError: Lm_studioException - Connection error``) when using LM Studio, check that you have provided a dummy API key. While API keys are usually not expected for local models, this is a specific case where LM Studio requires one: 55 | 56 | .. literalinclude:: ../../../dev/usage_examples/docs/llms/llm_init/lm_studio_connection_error_fix.py 57 | :language: python 58 | :caption: LM Studio with dummy API key 59 | 60 | This is a known issue with calling LM Studio API in litellm: https://github.com/openai/openai-python/issues/961 61 | 62 | 63 | For a complete list of configuration options available when initializing DocumentLLM instances, see the next section :doc:`llm_config`. 64 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_accuracy.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Optimizing for Accuracy 19 | 20 | Optimizing for Accuracy 21 | ======================== 22 | 23 | When accuracy is paramount, ContextGem offers several techniques to improve extraction quality, some of which are pretty obvious: 24 | 25 | - **🚀 Use a Capable LLM**: Choose a powerful LLM model for extraction. 26 | - **🪄 Use Larger Segmentation Models**: Select a larger SaT model for intelligent segmentation of paragraphs or sentences, to ensure the highest segmentation accuracy in complex documents (e.g. contracts). 27 | - **💡 Provide Examples**: For most complex concepts, add examples to guide the LLM's extraction format and style. 28 | - **🧠 Request Justifications**: For most complex aspects/concepts, enable justifications to understand the LLM's reasoning and instruct the LLM to "think" when giving an answer. 29 | - **📏 Limit Paragraphs Per Call**: This will reduce each prompt's length and ensure a more focused analysis. 30 | - **🔢 Limit Aspects/Concepts Per Call**: Process a smaller number of aspects or concepts in each LLM call, preventing prompt overloading. 31 | - **🔄 Use a Fallback LLM**: Configure a fallback LLM to retry failed extractions with a different model. 32 | 33 | 34 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_accuracy.py 35 | :language: python 36 | :caption: Example of optimizing extraction for accuracy 37 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_cost.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Optimizing for Cost 19 | 20 | Optimizing for Cost 21 | ==================== 22 | 23 | ContextGem offers several strategies to optimize for cost efficiency while maintaining extraction quality: 24 | 25 | - **💸 Select Cost-Efficient Models**: Use smaller/distilled non-reasoning LLMs for extracting aspects and basic concepts (e.g. titles, payment amounts, dates). 26 | - **⚙️ Use Default Parameters**: All the extractions will be processed in as few LLM calls as possible. 27 | - **📉 Enable Justifications Only When Necessary**: Do not use justifications for simple aspects or concepts. This will reduce the number of tokens generated. 28 | - **📊 Monitor Usage and Cost**: Track LLM calls, token consumption, and cost to identify optimization opportunities. 
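For instance, usage and cost monitoring might look like the following minimal sketch (assumptions: a configured ``DocumentLLM`` instance named ``llm`` with ``pricing_details`` set, and a document that has already been processed; the method names mirror the group-level ``get_usage()``, ``get_cost()`` and ``reset_usage_and_cost()`` calls shown elsewhere in these docs):

.. code-block:: python

    # Inspect call and token consumption accumulated so far
    usage = llm.get_usage()
    print(usage)

    # Inspect accumulated cost (meaningful only if pricing_details is configured)
    cost = llm.get_cost()
    print(cost)

    # Optionally reset the counters, e.g. before processing the next batch of documents
    llm.reset_usage_and_cost()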
29 | 30 | 31 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_cost.py 32 | :language: python 33 | :caption: Example of optimizing extraction for cost 34 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_long_docs.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Dealing with Long Documents 19 | 20 | Dealing with Long Documents 21 | ============================ 22 | 23 | ContextGem offers specialized configuration options for efficiently processing lengthy documents. 24 | 25 | ✂️ Segmentation Approach 26 | -------------------------- 27 | 28 | Unlike many systems that rely on chunking (e.g. RAG), ContextGem intelligently segments documents into natural semantic units like paragraphs and sentences. This preserves the contextual integrity of the content while allowing you to configure: 29 | 30 | - Maximum number of paragraphs per LLM call 31 | - Maximum number of aspects/concepts to analyze per LLM call 32 | - Maximum number of images per LLM call (if the document contains images) 33 | 34 | ⚙️ Effective Optimization Strategies 35 | -------------------------------------- 36 | 37 | - **🔄 Use Long-Context Models**: Select models with large context windows. (See :doc:`optimization_choosing_llm` for guidance on choosing the right model.) 38 | - **📏 Limit Paragraphs Per Call**: This will reduce each prompt's length and ensure a more focused analysis. 39 | - **🔢 Limit Aspects/Concepts Per Call**: Process a smaller number of aspects or concepts in each LLM call, preventing prompt overloading. 40 | - **⚡ Optional: Enable Concurrency**: Enable running extractions concurrently if your API setup permits. This will reduce the overall processing time. (See :doc:`optimization_speed` for guidance on configuring concurrency.) 41 | 42 | Since each use case has unique requirements, experiment with different configurations to find your optimal setup. 43 | 44 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_long_docs.py 45 | :language: python 46 | :caption: Example of configuring LLM extraction for long documents 47 | -------------------------------------------------------------------------------- /docs/source/optimizations/optimization_speed.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Optimizing for Speed 19 | 20 | Optimizing for Speed 21 | ===================== 22 | 23 | For large-scale processing or time-sensitive applications, optimize your pipeline for speed: 24 | 25 | - **🚀 Enable and Configure Concurrency**: Process multiple extractions concurrently. Adjust the async limiter to adapt to your LLM API setup. 26 | - **📦 Use Smaller Models**: Select smaller/distilled LLMs that perform faster. (See :doc:`optimization_choosing_llm` for guidance on choosing the right model.) 27 | - **🔄 Use a Fallback LLM**: Configure a fallback LLM to retry extractions that failed due to rate limits. 28 | - **⚙️ Use Default Parameters**: All the extractions will be processed in as few LLM calls as possible. 29 | - **📉 Enable Justifications Only When Necessary**: Do not use justifications for simple aspects or concepts. This will reduce the number of tokens generated. 30 | 31 | 32 | .. literalinclude:: ../../../dev/usage_examples/docs/optimizations/optimization_speed.py 33 | :language: python 34 | :caption: Example of optimizing extraction for speed 35 | -------------------------------------------------------------------------------- /docs/source/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | 4 | Sitemap: https://contextgem.dev/sitemap.xml -------------------------------------------------------------------------------- /docs/source/serialization.rst: -------------------------------------------------------------------------------- 1 | .. 2 | ContextGem 3 | 4 | Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | :og:description: ContextGem: Serialization 19 | 20 | Serializing objects and results 21 | ================================ 22 | 23 | ContextGem provides multiple serialization methods to preserve your document processing pipeline components and results. These methods enable you to save your work, transfer data between systems, or integrate with other applications. 24 | 25 | When using serialization, all extracted data is preserved in the serialized objects. 
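For instance, a typical round trip with the methods described below, saving a processed document to disk and later restoring it with its extracted data intact, might look like this minimal sketch (it assumes ``doc`` is a ``Document`` instance that has already been processed and that the target path is writable):

.. code-block:: python

    from contextgem import Document

    # Persist the processed document, including extracted aspects/concepts
    doc.to_disk("processed_document.json")

    # ... later, or in another process, restore it
    restored_doc = Document.from_disk("processed_document.json")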
26 | 27 | 💾 Serialization Methods 28 | ------------------------- 29 | 30 | The following ContextGem objects support serialization: 31 | 32 | * :class:`~contextgem.public.documents.Document` - Contains document content and extracted information 33 | * :class:`~contextgem.public.pipelines.DocumentPipeline` - Defines extraction structure and logic 34 | * :class:`~contextgem.public.llms.DocumentLLM` - Stores LLM configuration for document processing 35 | 36 | Each object supports three serialization methods: 37 | 38 | * ``to_json()`` - Converts the object to a JSON string for cross-platform compatibility 39 | * ``to_dict()`` - Converts the object to a Python dictionary for in-memory operations 40 | * ``to_disk(file_path)`` - Saves the object directly to disk at the specified path 41 | 42 | 🔄 Deserialization Methods 43 | --------------------------- 44 | 45 | To reconstruct objects from their serialized forms, use the corresponding class methods: 46 | 47 | * ``from_json(json_string)`` - Creates an object from a JSON string 48 | * ``from_dict(dict_object)`` - Creates an object from a Python dictionary 49 | * ``from_disk(file_path)`` - Loads an object from a file on disk 50 | 51 | 📝 Example Usage 52 | ----------------- 53 | 54 | .. literalinclude:: ../../dev/usage_examples/docs/serialization/serialization.py 55 | :language: python 56 | 57 | 🚀 Use Cases 58 | ------------- 59 | 60 | * **Caching Results**: Save processed documents to avoid repeating expensive LLM calls 61 | * **Transfer Between Systems**: Export results from one environment and import in another 62 | * **API Integration**: Convert objects to JSON for API responses 63 | * **Workflow Persistence**: Save pipeline configurations for later reuse 64 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # ContextGem 3 | # 4 | # Copyright 2025 Shcherbak AI AS. All rights reserved. Developed by Sergii Shcherbak. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import os 20 | 21 | from contextgem.public.utils import reload_logger_settings 22 | 23 | 24 | def pytest_configure(config): 25 | # Set contextgem logger level to DEBUG 26 | os.environ["CONTEXTGEM_LOGGER_LEVEL"] = "DEBUG" 27 | reload_logger_settings() 28 | -------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_aspects_no_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for aspects extraction, without Jinja2 tags. 
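The ``conftest.py`` above switches the ContextGem logger to DEBUG for the whole test session by setting an environment variable and then reloading the logger settings. As a minimal sketch of the same mechanism in user code (the ``WARNING`` level value is an assumption for illustration; only ``DEBUG`` appears in the conftest above):

    import os

    from contextgem.public.utils import reload_logger_settings

    # Reduce log verbosity at runtime: set the env variable, then re-apply it.
    os.environ["CONTEXTGEM_LOGGER_LEVEL"] = "WARNING"
    reload_logger_settings()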
-------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_aspects_with_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for aspects extraction, with Jinja2 tags. 2 | 3 | {% for aspect in aspects %} 4 | {{ aspect.name }} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_concepts_no_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for concepts extraction, without Jinja2 tags. -------------------------------------------------------------------------------- /tests/custom_prompts/custom_prompt_concepts_with_tags.j2: -------------------------------------------------------------------------------- 1 | This is a test custom prompt template for concepts extraction, with Jinja2 tags. 2 | 3 | {% for concept in concepts %} 4 | {{ concept.name }} 5 | {% endfor %} 6 | -------------------------------------------------------------------------------- /tests/docx_files/badly_formatted.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/docx_files/badly_formatted.docx -------------------------------------------------------------------------------- /tests/docx_files/en_nda_with_anomalies.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/docx_files/en_nda_with_anomalies.docx -------------------------------------------------------------------------------- /tests/invoices/invoice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice.jpg -------------------------------------------------------------------------------- /tests/invoices/invoice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice.png -------------------------------------------------------------------------------- /tests/invoices/invoice.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice.webp -------------------------------------------------------------------------------- /tests/invoices/invoice2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shcherbak-ai/contextgem/32a096dd1ed0c10e080488150668985666577095/tests/invoices/invoice2.jpg -------------------------------------------------------------------------------- /tests/ndas/zh_nda_with_anomalies.txt: -------------------------------------------------------------------------------- 1 | 2 | 5,000 / 5,000 3 | 保密协议 (NDA) 4 | 5 | 本保密协议 (以下简称“协议”) 于 2024 年 7 月 16 日由以下各方签署并生效: 6 | 7 | 甲方: 8 | 9 | _____ 10 | _____ 11 | _____ 12 | 联系人:_____ 13 | 电话:_____ 14 | 电子邮件:_____ 15 | 16 | 乙方: 17 | 18 | Example.Com LLC 19 | 5678 Random Drive 20 | New York, NY 00000 21 | 联系人:Jane Doe,执行合伙人 22 | 电话:(012) 
345-6789 23 | 电子邮件:jane.doe@example.com 24 | 25 | 1. 目的: 26 | 27 | 本协议旨在保护双方在有关潜在业务合作和伙伴关系的讨论和接触中可能披露的机密信息。 28 | 29 | 2. 机密信息的定义: 30 | 31 | 就本协议而言,“机密信息”是指以各种形式传达的任何和所有非公开信息,无论是口头、书面、电子或其他形式或媒体,无论是否标记、指定或以其他方式标识为“机密”,这些信息已经或可能被披露给接收方、接收方访问或观察到,并且 (i) 具有实际或潜在的经济价值,因为其他能够从其披露或使用中获得经济价值的人通常不知道这些信息,并且无法通过适当方式轻易确定这些信息,或者 (ii) 披露方必须尽合理努力来保持其秘密性和保密性。 32 | 33 | 保密信息包括但不限于以下类别:(1) 商业信息(商业计划、战略和模式);(2) 技术信息(技术规范和文档、研发活动);(3) 客户和市场信息(客户名单和联系信息、销售历史和模式);(4) 人员和内部信息(员工名单和联系信息、雇佣协议和人事档案);(5) 知识产权(商业秘密和专有技术、专利和专利申请、版权和版权申请、商标和商标申请);(6) 战略信息(并购计划、竞争分析和战略)。软件代码也应包括在保密信息中。 34 | 35 | 保密信息不包括以下信息:(a) 除接收方或其代表披露外,已普遍为公众所知的信息;(b) 在披露方披露前已以非保密方式提供给接收方的信息; (c) 以非保密方式从披露方以外的其他人处提供给接收方,而该人不受披露方的保密义务约束;或 (d) 由接收方独立开发,且未参考或使用任何保密信息。接收方的狗名叫鲍勃。 36 | 37 | 信息被指定为保密信息并不决定该信息是否构成适用法律规定的商业秘密。所有保密信息仍为披露方的专有财产,除本文明确规定外,接收方无权通过许可或其他方式使用保密信息。 38 | 39 | 3. 接收方的义务: 40 | 41 | 接收保密信息的一方(“接收方”)应: 42 | 43 | a. 保持保密信息的保密性,未经披露方事先书面同意,不得向任何第三方披露。 44 | 45 | b.仅将保密信息用于评估或参与潜在业务合作。 46 | 47 | c. 采取一切必要的预防措施保护保密信息的机密性,这些预防措施不得低于保护其自身保密信息所采取的预防措施。 48 | 49 | 4. 保密信息的排除: 50 | 51 | 保密信息不包括以下信息: 52 | 53 | a. 并非通过接收方的过错行为而为公众所知或为公众所知的信息。 54 | 55 | b. 在披露时接收方已经知道的信息。 56 | 57 | c. 合法从第三方获得且不违反本协议的信息。 58 | 59 | d. 由接收方独立开发且未使用或参考披露方的保密信息。 60 | 61 | 6. 材料返还: 62 | 63 | 本协议终止或经要求后,接收方应根据披露方的判断,将所有包含保密信息的材料返还给披露方或销毁。 64 | 65 | 7. 无许可: 66 | 67 | 本协议中的任何内容均不得解释为通过许可或其他方式向接收方授予任何披露方机密信息的权利,除非本协议明确规定。 68 | 69 | 8. 适用法律和争议: 70 | 71 | 本协议受加利福尼亚州法律管辖并依其解释,不考虑其法律冲突原则。 72 | 73 | 所有争议均由加利福尼亚州法院解决。本协议自签署之日起有效期为五 (5) 年。 74 | 75 | 9. 其他事项: 76 | 77 | a. 本协议构成双方就本协议标的物达成的完整协议,并取代所有之前或同期就该标的物达成的口头或书面协议。 78 | 79 | b. 对本协议的任何修订或修改必须以书面形式进行并由双方签署。 80 | 81 | c. 如果本协议的任何条款被发现不可执行,则其余条款应尽可能全面执行,不可执行的条款应视为在允许以最接近双方意图的方式执行所需的有限范围内进行了修改。 82 | 83 | d. 披露方应是执行本合同的合适人选。 84 | 85 | 特此证明,本协议各方已于上述日期签署本保密协议。 86 | 87 | _____ 88 | 签署人:___________________________ 89 | 姓名:_____ 90 | 职务:_____ 91 | 日期:___________________________ 92 | 93 | Example.Com LLC 94 | 签署人:___________________________ 95 | 姓名:Jane Doe 96 | 职务:执行合伙人 97 | 日期:___________________________ -------------------------------------------------------------------------------- /tests/other_files/complex_user_profile.txt: -------------------------------------------------------------------------------- 1 | System Profile Information 2 | =========================== 3 | 4 | USER PROFILE 5 | Name: Charlie Davis 6 | Age: 42 7 | Contact Information (primary contact): 8 | Email: charlie@example.com 9 | Phone: 111-222-3333 10 | Address: 101 Maple Dr, Nowhere, Australia 11 | 12 | ACCOUNT STATUS: active 13 | 14 | SYSTEM PERMISSIONS: 15 | 1. Resource: files 16 | Access Level: read 17 | 2. Resource: users 18 | Access Level: admin 19 | 3. Resource: settings 20 | Access Level: write 21 | 22 | PREFERENCE SETTINGS 23 | Theme: dark 24 | Notification Preferences: 25 | - Email notifications: Enabled 26 | - SMS notifications: Disabled 27 | - Notification frequency: weekly 28 | 29 | Login History: 30 | - Last login: 2023-12-15 31 | - Sessions active: 2 32 | - Device: Mobile, Desktop 33 | 34 | RELATED ITEMS: 35 | 1. Document Review (ID: 1) 36 | Description: Annual document review task 37 | Status: Active 38 | 39 | 2. Access Audit (ID: 2) 40 | Description: System access verification 41 | Status: Inactive 42 | 43 | Security Level: Advanced 44 | 2FA: Enabled --------------------------------------------------------------------------------