├── .commitlintrc ├── .dockerignore ├── .env.example ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ └── feature_request.yml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── auto-bump-and-release.yaml │ ├── build-push-docker.yaml │ ├── pr-lint.yaml │ ├── style-check.yaml │ └── unit-test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE.txt ├── README.md ├── app.py ├── doc_env_reqs.txt ├── docs ├── about.md ├── development │ ├── contributing.md │ ├── create-a-component.md │ ├── data-components.md │ ├── index.md │ └── utilities.md ├── extra │ └── css │ │ └── code_select.css ├── images │ ├── 269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png │ ├── 269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png │ ├── 271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png │ ├── 274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png │ ├── change_space_params.png │ ├── chat-demo.gif │ ├── chat-tab-demo.png │ ├── chat-tab.png │ ├── close_logs_space.png │ ├── cohere_api_key.png │ ├── duplicate_space.png │ ├── file-index-tab.png │ ├── index-embedding.png │ ├── info-panel-scores.png │ ├── initial_startup.png │ ├── llm-default.png │ ├── models.png │ ├── pdf-viewer-setup.png │ ├── preview-graph.png │ ├── preview.png │ ├── resources-tab.png │ ├── retrieval-setting.png │ ├── set_api_key_space.png │ └── space_build.png ├── index.md ├── local_model.md ├── online_install.md ├── pages │ └── app │ │ ├── customize-flows.md │ │ ├── ext │ │ └── user-management.md │ │ ├── features.md │ │ ├── functional-description.md │ │ ├── index │ │ └── file.md │ │ └── settings │ │ ├── overview.md │ │ └── user-settings.md ├── scripts │ ├── generate_examples_docs.py │ └── generate_reference_docs.py ├── theme │ ├── assets │ │ └── pymdownx-extras │ │ │ ├── extra-fb5a2a1c86.css │ │ │ ├── extra-fb5a2a1c86.css.map │ │ │ ├── extra-loader-MCFnu0Wd.js │ │ │ ├── extra-loader-MCFnu0Wd.js.map │ │ │ ├── 
material-extra-3rdparty-E-i8w1WA.js │ │ │ ├── material-extra-3rdparty-E-i8w1WA.js.map │ │ │ ├── material-extra-theme-TVq-kNRT.js │ │ │ └── material-extra-theme-TVq-kNRT.js.map │ ├── main.html │ └── partials │ │ ├── footer.html │ │ ├── header.html │ │ └── libs.html └── usage.md ├── flowsettings.py ├── fly.toml ├── launch.sh ├── libs ├── kotaemon │ ├── README.md │ ├── kotaemon │ │ ├── __init__.py │ │ ├── agents │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── io │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ ├── langchain_based.py │ │ │ ├── react │ │ │ │ ├── __init__.py │ │ │ │ ├── agent.py │ │ │ │ └── prompt.py │ │ │ ├── rewoo │ │ │ │ ├── __init__.py │ │ │ │ ├── agent.py │ │ │ │ ├── planner.py │ │ │ │ ├── prompt.py │ │ │ │ └── solver.py │ │ │ ├── tools │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── google.py │ │ │ │ ├── llm.py │ │ │ │ └── wikipedia.py │ │ │ └── utils.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── component.py │ │ │ └── schema.py │ │ ├── chatbot │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── simple_respondent.py │ │ ├── cli.py │ │ ├── contribs │ │ │ ├── __init__.py │ │ │ ├── docs.py │ │ │ └── promptui │ │ │ │ ├── .gitignore │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cli.py │ │ │ │ ├── config.py │ │ │ │ ├── export.py │ │ │ │ ├── logs.py │ │ │ │ ├── themes.py │ │ │ │ ├── tunnel.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── blocks.py │ │ │ │ ├── chat.py │ │ │ │ └── pipeline.py │ │ ├── embeddings │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── endpoint_based.py │ │ │ ├── fastembed.py │ │ │ ├── langchain_based.py │ │ │ ├── openai.py │ │ │ ├── tei_endpoint_embed.py │ │ │ └── voyageai.py │ │ ├── indices │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── extractors │ │ │ │ ├── __init__.py │ │ │ │ └── doc_parsers.py │ │ │ ├── ingests │ │ │ │ ├── __init__.py │ │ │ │ └── files.py │ │ │ ├── qa │ │ │ │ ├── __init__.py │ │ │ │ ├── citation.py │ │ │ │ ├── citation_qa.py │ │ │ │ ├── citation_qa_inline.py │ │ │ │ ├── 
format_context.py │ │ │ │ └── utils.py │ │ │ ├── rankings │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cohere.py │ │ │ │ ├── llm.py │ │ │ │ ├── llm_scoring.py │ │ │ │ └── llm_trulens.py │ │ │ ├── retrievers │ │ │ │ ├── __init__.py │ │ │ │ ├── jina_web_search.py │ │ │ │ └── tavily_web_search.py │ │ │ ├── splitters │ │ │ │ └── __init__.py │ │ │ └── vectorindex.py │ │ ├── llms │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── branching.py │ │ │ ├── chats │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── endpoint_based.py │ │ │ │ ├── langchain_based.py │ │ │ │ ├── llamacpp.py │ │ │ │ └── openai.py │ │ │ ├── completions │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── langchain_based.py │ │ │ ├── cot.py │ │ │ ├── linear.py │ │ │ └── prompts │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── template.py │ │ ├── loaders │ │ │ ├── __init__.py │ │ │ ├── adobe_loader.py │ │ │ ├── azureai_document_intelligence_loader.py │ │ │ ├── base.py │ │ │ ├── composite_loader.py │ │ │ ├── docling_loader.py │ │ │ ├── docx_loader.py │ │ │ ├── excel_loader.py │ │ │ ├── html_loader.py │ │ │ ├── mathpix_loader.py │ │ │ ├── ocr_loader.py │ │ │ ├── pdf_loader.py │ │ │ ├── txt_loader.py │ │ │ ├── unstructured_loader.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── adobe.py │ │ │ │ ├── box.py │ │ │ │ ├── gpt4v.py │ │ │ │ ├── pdf_ocr.py │ │ │ │ └── table.py │ │ │ └── web_loader.py │ │ ├── parsers │ │ │ ├── __init__.py │ │ │ └── regex_extractor.py │ │ ├── rerankings │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── cohere.py │ │ │ ├── tei_fast_rerank.py │ │ │ └── voyageai.py │ │ └── storages │ │ │ ├── __init__.py │ │ │ ├── docstores │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── elasticsearch.py │ │ │ ├── in_memory.py │ │ │ ├── lancedb.py │ │ │ └── simple_file.py │ │ │ └── vectorstores │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── chroma.py │ │ │ ├── in_memory.py │ │ │ ├── lancedb.py │ │ │ ├── milvus.py │ │ │ ├── qdrant.py │ │ │ └── simple_file.py │ ├── 
pyproject.toml │ ├── pytest.ini │ └── tests │ │ ├── __init__.py │ │ ├── _test_multimodal_reader.py │ │ ├── conftest.py │ │ ├── resources │ │ ├── 7810d908b0ff4ce381dcab873196d133.jpg │ │ ├── dummy.docx │ │ ├── dummy.mhtml │ │ ├── dummy.pdf │ │ ├── dummy.xlsx │ │ ├── embedding_openai.json │ │ ├── embedding_openai_batch.json │ │ ├── fullocr_sample_output.json │ │ ├── ggml-vocab-llama.gguf │ │ ├── html │ │ │ ├── dummy.html │ │ │ └── dummy_image.png │ │ ├── multimodal.pdf │ │ ├── policy.md │ │ └── table.pdf │ │ ├── simple_pipeline.py │ │ ├── test_agent.py │ │ ├── test_composite.py │ │ ├── test_cot.py │ │ ├── test_docstores.py │ │ ├── test_documents.py │ │ ├── test_embedding_models.py │ │ ├── test_indexing_retrieval.py │ │ ├── test_ingestor.py │ │ ├── test_llms_chat_models.py │ │ ├── test_llms_completion_models.py │ │ ├── test_post_processing.py │ │ ├── test_prompt.py │ │ ├── test_promptui.py │ │ ├── test_reader.py │ │ ├── test_reranking.py │ │ ├── test_splitter.py │ │ ├── test_table_reader.py │ │ ├── test_telemetry.py │ │ ├── test_template.py │ │ ├── test_tools.py │ │ └── test_vectorstore.py └── ktem │ ├── .gitignore │ ├── MANIFEST.in │ ├── alembic.ini │ ├── ktem │ ├── __init__.py │ ├── app.py │ ├── assets │ │ ├── __init__.py │ │ ├── css │ │ │ └── main.css │ │ ├── icons │ │ │ ├── dark_mode.svg │ │ │ ├── delete.svg │ │ │ ├── expand.svg │ │ │ ├── new.svg │ │ │ ├── rename.svg │ │ │ └── sidebar.svg │ │ ├── img │ │ │ └── favicon.svg │ │ ├── js │ │ │ ├── main.js │ │ │ ├── pdf_viewer.js │ │ │ └── svg-pan-zoom.min.js │ │ ├── md │ │ │ ├── about.md │ │ │ ├── changelogs.md │ │ │ └── usage.md │ │ └── theme.py │ ├── components.py │ ├── db │ │ ├── __init__.py │ │ ├── base_models.py │ │ ├── engine.py │ │ └── models.py │ ├── embeddings │ │ ├── __init__.py │ │ ├── db.py │ │ ├── manager.py │ │ └── ui.py │ ├── exceptions.py │ ├── extension_protocol.py │ ├── index │ │ ├── __init__.py │ │ ├── base.py │ │ ├── file │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── exceptions.py │ │ │ ├── 
graph │ │ │ │ ├── __init__.py │ │ │ │ ├── graph_index.py │ │ │ │ ├── light_graph_index.py │ │ │ │ ├── lightrag_pipelines.py │ │ │ │ ├── nano_graph_index.py │ │ │ │ ├── nano_pipelines.py │ │ │ │ ├── pipelines.py │ │ │ │ └── visualize.py │ │ │ ├── index.py │ │ │ ├── knet │ │ │ │ ├── __init__.py │ │ │ │ ├── knet_index.py │ │ │ │ └── pipelines.py │ │ │ ├── pipelines.py │ │ │ ├── ui.py │ │ │ └── utils.py │ │ ├── manager.py │ │ ├── models.py │ │ └── ui.py │ ├── llms │ │ ├── __init__.py │ │ ├── db.py │ │ ├── manager.py │ │ └── ui.py │ ├── main.py │ ├── pages │ │ ├── __init__.py │ │ ├── chat │ │ │ ├── __init__.py │ │ │ ├── chat_panel.py │ │ │ ├── chat_suggestion.py │ │ │ ├── common.py │ │ │ ├── control.py │ │ │ ├── demo_hint.py │ │ │ ├── paper_list.py │ │ │ └── report.py │ │ ├── help.py │ │ ├── login.py │ │ ├── resources │ │ │ ├── __init__.py │ │ │ └── user.py │ │ ├── settings.py │ │ └── setup.py │ ├── reasoning │ │ ├── __init__.py │ │ ├── base.py │ │ ├── prompt_optimization │ │ │ ├── __init__.py │ │ │ ├── decompose_question.py │ │ │ ├── fewshot_rewrite_question.py │ │ │ ├── mindmap.py │ │ │ ├── rephrase_question_train.json │ │ │ ├── rewrite_question.py │ │ │ ├── suggest_conversation_name.py │ │ │ └── suggest_followup_chat.py │ │ ├── react.py │ │ ├── rewoo.py │ │ └── simple.py │ ├── rerankings │ │ ├── __init__.py │ │ ├── db.py │ │ ├── manager.py │ │ └── ui.py │ ├── settings.py │ └── utils │ │ ├── __init__.py │ │ ├── commands.py │ │ ├── conversation.py │ │ ├── file.py │ │ ├── generator.py │ │ ├── hf_papers.py │ │ ├── lang.py │ │ ├── plantuml.py │ │ ├── rate_limit.py │ │ ├── render.py │ │ └── visualize_cited.py │ ├── ktem_tests │ ├── __init__.py │ ├── resources │ │ └── embedding_openai.json │ └── test_qa.py │ ├── migrations │ ├── README │ ├── env.py │ ├── script.py.mako │ └── versions │ │ └── .keep │ ├── pyproject.toml │ └── requirements.txt ├── mkdocs.yml ├── pyproject.toml ├── scripts ├── download_pdfjs.sh ├── migrate │ ├── __init__.py │ └── migrate_chroma_db.py ├── 
run_linux.sh ├── run_macos.sh ├── run_windows.bat ├── serve_local.py ├── server_llamacpp_linux.sh ├── server_llamacpp_macos.sh ├── server_llamacpp_windows.bat ├── update_linux.sh ├── update_macos.sh └── update_windows.bat ├── settings.yaml.example ├── sso_app.py ├── sso_app_demo.py └── templates ├── component-default └── README.md └── project-default ├── cookiecutter.json └── {{cookiecutter.project_name}} ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── setup.py ├── tests └── __init__.py └── {{cookiecutter.project_name}} ├── __init__.py └── pipeline.py /.commitlintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["@commitlint/config-conventional"], 3 | "defaultIgnores": true, 4 | "rules": { 5 | "body-leading-blank": [1, "always"], 6 | "body-max-line-length": [2, "always", 100], 7 | "footer-leading-blank": [1, "always"], 8 | "footer-max-line-length": [2, "always", 10000], 9 | "header-max-length": [2, "always", 200], 10 | "subject-case": [ 11 | 2, 12 | "never", 13 | [] 14 | ], 15 | "subject-empty": [2, "never"], 16 | "subject-full-stop": [2, "never", "."], 17 | "type-case": [2, "always", "lower-case"], 18 | "type-empty": [2, "never"], 19 | "type-enum": [ 20 | 2, 21 | "always", 22 | [ 23 | "build", 24 | "chore", 25 | "ci", 26 | "docs", 27 | "feat", 28 | "fix", 29 | "perf", 30 | "refactor", 31 | "revert", 32 | "style", 33 | "test" 34 | ] 35 | ] 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/ 2 | .git/ 3 | .mypy_cache/ 4 | __pycache__/ 5 | ktem_app_data/ 6 | env/ 7 | .pre-commit-config.yaml 8 | .commitlintrc 9 | .gitignore 10 | .gitattributes 11 | README.md 12 | *.zip 13 | *.sh 14 | 15 | !/launch.sh 16 | -------------------------------------------------------------------------------- /.env.example: 
-------------------------------------------------------------------------------- 1 | # this is an example .env file, use it to create your own .env file and place it in the root of the project 2 | 3 | # settings for OpenAI 4 | OPENAI_API_BASE=https://api.openai.com/v1 5 | OPENAI_API_KEY= 6 | OPENAI_CHAT_MODEL=gpt-4o-mini 7 | OPENAI_EMBEDDINGS_MODEL=text-embedding-3-large 8 | 9 | # settings for Azure OpenAI 10 | AZURE_OPENAI_ENDPOINT= 11 | AZURE_OPENAI_API_KEY= 12 | OPENAI_API_VERSION=2024-02-15-preview 13 | AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo 14 | AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 15 | 16 | # settings for Cohere 17 | COHERE_API_KEY= 18 | 19 | # settings for Mistral 20 | # MISTRAL_API_KEY=placeholder 21 | 22 | # settings for VoyageAI 23 | VOYAGE_API_KEY= 24 | 25 | # settings for local models 26 | LOCAL_MODEL=qwen2.5:7b 27 | LOCAL_MODEL_EMBEDDINGS=nomic-embed-text 28 | 29 | # settings for GraphRAG 30 | GRAPHRAG_API_KEY= 31 | GRAPHRAG_LLM_MODEL=gpt-4o-mini 32 | GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small 33 | 34 | # set to true if you want to use customized GraphRAG config file 35 | USE_CUSTOMIZED_GRAPHRAG_SETTING=false 36 | 37 | # settings for Azure DI 38 | AZURE_DI_ENDPOINT= 39 | AZURE_DI_CREDENTIAL= 40 | 41 | # settings for Adobe API 42 | # get free credential at https://acrobatservices.adobe.com/dc-integration-creation-app-cdn/main.html?api=pdf-extract-api 43 | # also install pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements" 44 | PDF_SERVICES_CLIENT_ID= 45 | PDF_SERVICES_CLIENT_SECRET= 46 | 47 | # settings for PDF.js 48 | PDFJS_VERSION_DIST="pdfjs-4.0.379-dist" 49 | 50 | # variable for authentication method selection 51 | # for authentication with google leave empty 52 | # for authentication with keycloak : 53 | # AUTHENTICATION_METHOD="KEYCLOAK" 54 | 55 | AUTHENTICATION_METHOD= 56 | 57 | # settings for keycloak 58 | KEYCLOAK_SERVER_URL= 59 | KEYCLOAK_CLIENT_ID= 
60 | KEYCLOAK_REALM= 61 | KEYCLOAK_CLIENT_SECRET= 62 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bat text eol=crlf 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: "Bug Report" 2 | description: Report something that is not working as expected 3 | title: "[BUG] " 4 | labels: ["bug"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | *Please fill this form with as much information as possible.* 10 | - type: textarea 11 | id: description 12 | attributes: 13 | label: "Description" 14 | description: Please enter an explicit description of your issue 15 | placeholder: Short and explicit description of your incident... 16 | validations: 17 | required: true 18 | - type: textarea 19 | id: reprod 20 | attributes: 21 | label: "Reproduction steps" 22 | description: Please enter an explicit description of your issue 23 | value: | 24 | 1. Go to '...' 25 | 2. Click on '....' 26 | 3. Scroll down to '....' 27 | 4. See error 28 | render: bash 29 | validations: 30 | required: true 31 | - type: textarea 32 | id: screenshot 33 | attributes: 34 | label: "Screenshots" 35 | description: If applicable, add screenshots to help explain your problem. 36 | value: | 37 | ![DESCRIPTION](LINK.png) 38 | render: bash 39 | validations: 40 | required: false 41 | - type: textarea 42 | id: logs 43 | attributes: 44 | label: "Logs" 45 | description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 46 | render: bash 47 | validations: 48 | required: false 49 | - type: dropdown 50 | id: browsers 51 | attributes: 52 | label: "Browsers" 53 | description: What browsers are you seeing the problem on ? 
54 | multiple: true 55 | options: 56 | - Firefox 57 | - Chrome 58 | - Safari 59 | - Microsoft Edge 60 | - Opera 61 | - Brave 62 | - Other 63 | validations: 64 | required: false 65 | - type: dropdown 66 | id: os 67 | attributes: 68 | label: "OS" 69 | description: What is the impacted environment ? 70 | multiple: true 71 | options: 72 | - Windows 73 | - MacOS 74 | - Linux 75 | - Other 76 | validations: 77 | required: false 78 | - type: textarea 79 | id: additional_information 80 | attributes: 81 | label: "Additional information" 82 | description: Add any relevant information or context. 83 | placeholder: 84 | validations: 85 | required: false 86 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: "Feature Request" 2 | description: Brainstorm and propose new features for the project 3 | title: "[REQUEST] " 4 | labels: ["enhancement"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | *Please fill this form with as much information as possible.* 10 | - type: textarea 11 | id: reference_issues 12 | attributes: 13 | label: "Reference Issues" 14 | description: Common issues 15 | placeholder: "#Issues IDs" 16 | validations: 17 | required: false 18 | - type: textarea 19 | id: summary 20 | attributes: 21 | label: "Summary" 22 | description: Provide a brief explanation of the feature 23 | placeholder: Describe in a few lines your feature request 24 | validations: 25 | required: true 26 | - type: textarea 27 | id: basic_example 28 | attributes: 29 | label: "Basic Example" 30 | description: Indicate here some basic examples of your feature. 
31 | placeholder: A few specific words about your feature request. 32 | validations: 33 | required: true 34 | - type: textarea 35 | id: drawbacks 36 | attributes: 37 | label: "Drawbacks" 38 | description: What are the drawbacks/impacts of your feature request ? 39 | placeholder: Identify the drawbacks and impacts while being neutral on your feature request 40 | validations: 41 | required: true 42 | - type: textarea 43 | id: additional_information 44 | attributes: 45 | label: "Additional information" 46 | description: Add any additional information that you think is important for your feature request 47 | placeholder: 48 | validations: 49 | required: false 50 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | - Please include a summary of the changes and the related issue. 4 | - Fixes # (issue) 5 | 6 | ## Type of change 7 | 8 | - [ ] New features (non-breaking change). 9 | - [ ] Bug fix (non-breaking change). 10 | - [ ] Breaking change (fix or feature that would cause existing functionality not to work as expected). 11 | 12 | ## Checklist 13 | 14 | - [ ] I have performed a self-review of my code. 15 | - [ ] I have added thorough tests if it is a core feature. 16 | - [ ] There is a reference to the original bug report and related work. 17 | - [ ] I have commented on my code, particularly in hard-to-understand areas. 18 | - [ ] The feature is well documented. 
19 | -------------------------------------------------------------------------------- /.github/workflows/auto-bump-and-release.yaml: -------------------------------------------------------------------------------- 1 | name: Auto Bump and Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | auto-bump-and-release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Clone the repo 13 | uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | - name: Update Application Version 17 | id: update-version 18 | uses: anothrNick/github-tag-action@v1 19 | env: 20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 21 | WITH_V: true 22 | DEFAULT_BUMP: patch 23 | MAJOR_STRING_TOKEN: "bump:major" 24 | MINOR_STRING_TOKEN: "bump:minor" 25 | PATCH_STRING_TOKEN: "bump:patch" 26 | - name: Create release for ${{ steps.update-version.outputs.new_tag }} 27 | # need to repeat this if statement because Github Action doesn't support early 28 | # stopping for steps 29 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 30 | run: | 31 | echo Create release folder 32 | mkdir kotaemon-app 33 | echo ${{ steps.update-version.outputs.new_tag }} > kotaemon-app/VERSION 34 | cp LICENSE.txt kotaemon-app/ 35 | cp flowsettings.py kotaemon-app/ 36 | cp app.py kotaemon-app/ 37 | cp .env.example kotaemon-app/.env 38 | cp -r scripts kotaemon-app/ 39 | mkdir -p kotaemon-app/libs/ktem/ktem/ 40 | cp -r libs/ktem/ktem/assets kotaemon-app/libs/ktem/ktem/ 41 | 42 | tree kotaemon-app 43 | zip -r kotaemon-app.zip kotaemon-app 44 | - name: Release ${{ steps.update-version.outputs.new_tag }} 45 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 46 | uses: softprops/action-gh-release@v2 47 | with: 48 | files: kotaemon-app.zip 49 | fail_on_unmatched_files: true 50 | token: ${{ secrets.GITHUB_TOKEN }} 51 | generate_release_notes: true 52 | tag_name: ${{ steps.update-version.outputs.new_tag }} 53 | make_latest: true 54 | - 
name: Setup latest branch locally without switching current branch 55 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 56 | run: git fetch origin latest:latest 57 | - name: Update latest branch 58 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 59 | run: | 60 | git branch -f latest tags/${{ steps.update-version.outputs.new_tag }} 61 | git checkout latest 62 | git push -f -u origin latest 63 | -------------------------------------------------------------------------------- /.github/workflows/style-check.yaml: -------------------------------------------------------------------------------- 1 | name: style-check 2 | 3 | on: 4 | pull_request: 5 | branches: [main, develop] 6 | push: 7 | branches: [main, develop] 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Clone the repo 14 | uses: actions/checkout@v4 15 | - name: Setup python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: "3.10" 19 | - name: run pre-commit 20 | uses: pre-commit/action@v3.0.0 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.3.0 4 | hooks: 5 | - id: check-yaml 6 | args: ["--unsafe"] 7 | - id: check-toml 8 | - id: end-of-file-fixer 9 | - id: trailing-whitespace 10 | - id: mixed-line-ending 11 | - id: detect-aws-credentials 12 | args: ["--allow-missing-credentials"] 13 | - id: detect-private-key 14 | - id: check-added-large-files 15 | args: ["--maxkb=750"] 16 | - id: debug-statements 17 | - repo: https://github.com/ambv/black 18 | rev: 22.3.0 19 | hooks: 20 | - id: black 21 | language_version: python3 22 | - repo: https://github.com/pycqa/isort 23 | rev: 5.12.0 24 | hooks: 25 | - id: isort 26 | args: ["--profile", "black"] 27 | language_version: 
python3.10 28 | - repo: https://github.com/pycqa/flake8 29 | rev: 4.0.1 30 | hooks: 31 | - id: flake8 32 | args: ["--max-line-length", "88", "--extend-ignore", "E203"] 33 | - repo: https://github.com/myint/autoflake 34 | rev: v1.4 35 | hooks: 36 | - id: autoflake 37 | args: 38 | [ 39 | "--in-place", 40 | "--remove-unused-variables", 41 | "--remove-all-unused-imports", 42 | "--ignore-init-module-imports", 43 | "--exclude=tests/*", 44 | ] 45 | - repo: https://github.com/pre-commit/mirrors-prettier 46 | rev: v2.7.1 47 | hooks: 48 | - id: prettier 49 | types_or: [markdown, yaml] 50 | - repo: https://github.com/pre-commit/mirrors-mypy 51 | rev: "v1.7.1" 52 | hooks: 53 | - id: mypy 54 | additional_dependencies: 55 | [ 56 | types-PyYAML==6.0.12.11, 57 | "types-requests", 58 | "sqlmodel", 59 | "types-Markdown", 60 | "types-cachetools", 61 | types-tzlocal, 62 | ] 63 | args: ["--check-untyped-defs", "--ignore-missing-imports"] 64 | exclude: "^templates/" 65 | - repo: https://github.com/codespell-project/codespell 66 | rev: v2.2.4 67 | hooks: 68 | - id: codespell 69 | additional_dependencies: 70 | - tomli 71 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from theflow.settings import settings as flowsettings 4 | 5 | KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".") 6 | KH_GRADIO_SHARE = getattr(flowsettings, "KH_GRADIO_SHARE", False) 7 | GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None) 8 | # override GRADIO_TEMP_DIR if it's not set 9 | if GRADIO_TEMP_DIR is None: 10 | GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp") 11 | os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR 12 | 13 | 14 | from ktem.main import App # 
noqa 15 | 16 | app = App() 17 | demo = app.make() 18 | demo.queue().launch( 19 | favicon_path=app._favicon, 20 | inbrowser=True, 21 | allowed_paths=[ 22 | "libs/ktem/ktem/assets", 23 | GRADIO_TEMP_DIR, 24 | ], 25 | share=KH_GRADIO_SHARE, 26 | ) 27 | -------------------------------------------------------------------------------- /doc_env_reqs.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocstrings[python] 3 | mkdocs-material 4 | mkdocs-gen-files 5 | mkdocs-literate-nav 6 | mkdocs-git-revision-date-localized-plugin 7 | mkdocs-section-index 8 | mkdocs-include-markdown-plugin[cache] 9 | mdx_truly_sane_lists 10 | -------------------------------------------------------------------------------- /docs/about.md: -------------------------------------------------------------------------------- 1 | # About Kotaemon 2 | 3 | An open-source tool for chatting with your documents. Built with both end users and 4 | developers in mind. 5 | 6 | [Source Code](https://github.com/Cinnamon/kotaemon) | 7 | [HF Space](https://huggingface.co/spaces/cin-model/kotaemon-demo) 8 | 9 | [Installation Guide](https://cinnamon.github.io/kotaemon/) | 10 | [Developer Guide](https://cinnamon.github.io/kotaemon/development/) | 11 | [Feedback](https://github.com/Cinnamon/kotaemon/issues) 12 | -------------------------------------------------------------------------------- /docs/development/create-a-component.md: -------------------------------------------------------------------------------- 1 | # Creating a component 2 | 3 | A fundamental concept in kotaemon is "component". 4 | 5 | Anything that isn't data or data structure is a "component". A component can be 6 | thought of as a step within a pipeline. It takes in some input, processes it, 7 | and returns an output, just the same as a Python function! The output will then 8 | become an input for the next component in a pipeline. In fact, a pipeline is just 9 | a component. 
More appropriately, a nested component: a component that makes use of one or more other components in 10 | the processing step. So in reality, there isn't a difference between a pipeline 11 | and a component! Because of that, in kotaemon, we will consider them the 12 | same as "component". 13 | 14 | To define a component, you will: 15 | 16 | 1. Create a class that subclasses from `kotaemon.base.BaseComponent` 17 | 2. Declare init params with type annotation 18 | 3. Declare nodes (nodes are just other components!) with type annotation 19 | 4. Implement the processing logic in `run`. 20 | 21 | The syntax of a component is as follow: 22 | 23 | ```python 24 | from kotaemon.base import BaseComponent 25 | from kotaemon.llms import LCAzureChatOpenAI 26 | from kotaemon.parsers import RegexExtractor 27 | 28 | 29 | class FancyPipeline(BaseComponent): 30 | param1: str = "This is param1" 31 | param2: int = 10 32 | param3: float 33 | 34 | node1: BaseComponent # this is a node because of BaseComponent type annotation 35 | node2: LCAzureChatOpenAI # this is also a node because LCAzureChatOpenAI subclasses BaseComponent 36 | node3: RegexExtractor # this is also a node bceause RegexExtractor subclasses BaseComponent 37 | 38 | def run(self, some_text: str): 39 | prompt = (self.param1 + some_text) * int(self.param2 + self.param3) 40 | llm_pred = self.node2(prompt).text 41 | matches = self.node3(llm_pred) 42 | return matches 43 | ``` 44 | 45 | Then this component can be used as follow: 46 | 47 | ```python 48 | llm = LCAzureChatOpenAI(endpoint="some-endpont") 49 | extractor = RegexExtractor(pattern=["yes", "Yes"]) 50 | 51 | component = FancyPipeline( 52 | param1="Hello" 53 | param3=1.5 54 | node1=llm, 55 | node2=llm, 56 | node3=extractor 57 | ) 58 | component("goodbye") 59 | ``` 60 | 61 | This way, we can define each operation as a reusable component, and use them to 62 | compose larger reusable components! 
63 | 64 | ## Benefits of component 65 | 66 | By defining a component as above, we formally encapsulate all the necessary 67 | information inside a single class. This introduces several benefits: 68 | 69 | 1. Allow tools like promptui to inspect the inner working of a component in 70 | order to automatically generate the promptui. 71 | 2. Allow visualizing a pipeline for debugging purpose. 72 | -------------------------------------------------------------------------------- /docs/development/data-components.md: -------------------------------------------------------------------------------- 1 | # Data & Data Structure Components 2 | 3 | The data & data structure components include: 4 | 5 | - The `Document` class. 6 | - The document store. 7 | - The vector store. 8 | 9 | ## Data Loader 10 | 11 | - PdfLoader 12 | - Layout-aware with table parsing PdfLoader 13 | 14 | - MathPixLoader: To use this loader, you need MathPix API key, refer to [mathpix docs](https://docs.mathpix.com/#introduction) for more information 15 | - OCRLoader: This loader uses lib-table and Flax pipeline to perform OCR and read table structure from PDF file (TODO: add more info about deployment of this module). 
16 | - Output: 17 | 18 | - Document: text + metadata to identify whether it is table or not 19 | 20 | ``` 21 | - "source": source file name 22 | - "type": "table" or "text" 23 | - "table_origin": original table in markdown format (to be feed to LLM or visualize using external tools) 24 | - "page_label": page number in the original PDF document 25 | ``` 26 | 27 | ## Document Store 28 | 29 | - InMemoryDocumentStore 30 | 31 | ## Vector Store 32 | 33 | - ChromaVectorStore 34 | - InMemoryVectorStore 35 | -------------------------------------------------------------------------------- /docs/development/index.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../../README.md" 3 | start="" 4 | end="" 5 | %} 6 | -------------------------------------------------------------------------------- /docs/extra/css/code_select.css: -------------------------------------------------------------------------------- 1 | .language-pycon .gp, 2 | .language-pycon .go { 3 | /* Generic.Prompt, Generic.Output */ 4 | user-select: none; 5 | } 6 | -------------------------------------------------------------------------------- /docs/images/269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png -------------------------------------------------------------------------------- /docs/images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png -------------------------------------------------------------------------------- 
/docs/images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png -------------------------------------------------------------------------------- /docs/images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png -------------------------------------------------------------------------------- /docs/images/change_space_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/change_space_params.png -------------------------------------------------------------------------------- /docs/images/chat-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/chat-demo.gif -------------------------------------------------------------------------------- /docs/images/chat-tab-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/chat-tab-demo.png -------------------------------------------------------------------------------- /docs/images/chat-tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/chat-tab.png 
-------------------------------------------------------------------------------- /docs/images/close_logs_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/close_logs_space.png -------------------------------------------------------------------------------- /docs/images/cohere_api_key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/cohere_api_key.png -------------------------------------------------------------------------------- /docs/images/duplicate_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/duplicate_space.png -------------------------------------------------------------------------------- /docs/images/file-index-tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/file-index-tab.png -------------------------------------------------------------------------------- /docs/images/index-embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/index-embedding.png -------------------------------------------------------------------------------- /docs/images/info-panel-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/info-panel-scores.png 
-------------------------------------------------------------------------------- /docs/images/initial_startup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/initial_startup.png -------------------------------------------------------------------------------- /docs/images/llm-default.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/llm-default.png -------------------------------------------------------------------------------- /docs/images/models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/models.png -------------------------------------------------------------------------------- /docs/images/pdf-viewer-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/pdf-viewer-setup.png -------------------------------------------------------------------------------- /docs/images/preview-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/preview-graph.png -------------------------------------------------------------------------------- /docs/images/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/preview.png -------------------------------------------------------------------------------- /docs/images/resources-tab.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/resources-tab.png -------------------------------------------------------------------------------- /docs/images/retrieval-setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/retrieval-setting.png -------------------------------------------------------------------------------- /docs/images/set_api_key_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/set_api_key_space.png -------------------------------------------------------------------------------- /docs/images/space_build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/space_build.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Kotaemon 2 | 3 | ![type:video](https://github.com/Cinnamon/kotaemon/assets/25688648/815ecf68-3a02-4914-a0dd-3f8ec7e75cd9) 4 | 5 | This page is intended for **end users** who want to use the `kotaemon` tool for Question 6 | Answering on local documents. If you are a **developer** who wants contribute to the project, please visit the [development](development/index.md) page. 7 | 8 | ## Installation (Online HuggingFace Space) - easy (10 mins) 9 | 10 | Visit this [guide](online_install.md). 
11 | 12 | ## Installation (Offline) - intermediate (20 mins) 13 | 14 | ### Download 15 | 16 | Download the `kotaemon-app.zip` file from the [latest release](https://github.com/Cinnamon/kotaemon/releases/latest/). 17 | 18 | ### Run setup script 19 | 20 | 0. Unzip the downloaded file. 21 | 1. Navigate to the `scripts` folder and start an installer that matches your OS: 22 | - Windows: `run_windows.bat`. Just double click the file. 23 | - macOS: `run_macos.sh` 24 | 1. Right click on your file and select Open with and Other. 25 | 2. Enable All Applications and choose Terminal. 26 | 3. NOTE: If you always want to open that file with Terminal, then check Always Open With. 27 | 4. From now on, double click on your file and it should work. 28 | - Linux: `run_linux.sh`. Please run the script using `bash run_linux.sh` in your terminal. 29 | 2. After the installation, the installer will ask to launch the ktem's UI, answer to continue. 30 | 3. If launched, the application will be open automatically in your browser. 31 | 4. Default login information is: `username: admin / password: admin`. You should change this credential right after the first login on the UI. 32 | 33 | ## Launch 34 | 35 | To launch the app after initial setup or any change, simply run the `run_*` script again. 36 | 37 | A browser window will be opened and greets you with this screen: 38 | 39 | ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png) 40 | 41 | ## Usage 42 | 43 | For how to use the application, see [Usage](usage.md). This page will also be available to 44 | you within the application. 45 | 46 | ## Feedback 47 | 48 | Feel free to create a bug report or a feature request on our [repo](https://github.com/Cinnamon/kotaemon/issues). 
3. Wait for the build to complete and start up (approx. 10 mins).
kotaemon focuses on question answering over a corpus of data. Below
is a gentle introduction to the chat functionality.
9 | -------------------------------------------------------------------------------- /docs/pages/app/settings/user-settings.md: -------------------------------------------------------------------------------- 1 | # User settings 2 | 3 | `ktem` allows developers to extend the index and the reasoning pipeline. In 4 | many cases, these components can have settings that should be modified by 5 | users at run-time, (e.g. `topk`, `chunksize`...). These are the user settings. 6 | 7 | `ktem` allows developers to declare such user settings in their code. Once 8 | declared, `ktem` will render them in a Settings page. 9 | 10 | There are 2 places that `ktem` looks for declared user settings. You can 11 | refer to the respective pages. 12 | 13 | - In the index. 14 | - In the reasoning pipeline. 15 | 16 | ## Syntax of a settings 17 | 18 | A collection of settings is a dictionary of type `dict[str, dict]`, where the 19 | key is a setting id, and the value is the description of the setting. 20 | 21 | ```python 22 | settings = { 23 | "topk": { 24 | "name": "Top-k chunks", 25 | "value": 10, 26 | "component": "number", 27 | }, 28 | "lang": { 29 | "name": "Languages", 30 | "value": "en", 31 | "component": "dropdown", 32 | "choices": [("en", "English"), ("cn", "Chinese")], 33 | } 34 | } 35 | ``` 36 | 37 | Each setting description must have: 38 | 39 | - name: the human-understandable name of the settings. 40 | - value: the default value of the settings. 41 | - component: the UI component to render such setting on the UI. Available: 42 | 43 | - "text": single-value 44 | - "number": single-value 45 | - "checkbox": single-value 46 | - "dropdown": choices 47 | - "radio": choices 48 | - "checkboxgroup": choices 49 | 50 | - choices: the list of choices, if the component type allows. 
51 | 52 | ## Settings page structure 53 | -------------------------------------------------------------------------------- /docs/scripts/generate_examples_docs.py: -------------------------------------------------------------------------------- 1 | # import shutil 2 | from pathlib import Path 3 | from typing import Any, Iterable 4 | 5 | import mkdocs_gen_files 6 | 7 | # get the root source code directory 8 | doc_dir_name = "docs" 9 | doc_dir = Path(__file__) 10 | while doc_dir.name != doc_dir_name and doc_dir != doc_dir.parent: 11 | doc_dir = doc_dir.parent 12 | 13 | if doc_dir == doc_dir.parent: 14 | raise ValueError(f"root_name ({doc_dir_name}) not in path ({str(Path(__file__))}).") 15 | 16 | 17 | def generate_docs_for_examples_readme( 18 | examples_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = [] 19 | ): 20 | if not examples_dir.is_dir(): 21 | raise ModuleNotFoundError(str(examples_dir)) 22 | 23 | nav = mkdocs_gen_files.Nav() 24 | 25 | for path in sorted(examples_dir.rglob("*README.md")): 26 | # ignore modules with name starts with underscore (i.e. 
def generate_docs_for_examples_readme(
    examples_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = ()
):
    """Render each example's README.md into the generated docs tree.

    Args:
        examples_dir: directory containing the example sub-folders; each
            example is expected to ship a ``*README.md`` file.
        target_doc_folder: folder name (relative to the docs root) that
            receives the generated pages and the literate-nav file.
        ignored_modules: dotted module-path prefixes to skip entirely.
            Defaults to an empty tuple (immutable, unlike the previous
            mutable ``[]`` default).

    Raises:
        ModuleNotFoundError: if ``examples_dir`` is not a directory.
    """
    if not examples_dir.is_dir():
        raise ModuleNotFoundError(str(examples_dir))

    nav = mkdocs_gen_files.Nav()

    for path in sorted(examples_dir.rglob("*README.md")):
        # ignore private files (name starts with underscore) and test files
        if path.name.startswith("_") or path.name.startswith("test"):
            continue

        module_path = path.parent.relative_to(examples_dir).with_suffix("")
        doc_path = path.parent.relative_to(examples_dir).with_suffix(".md")
        full_doc_path = Path(target_doc_folder, doc_path)

        parts = list(module_path.parts)
        identifier = ".".join(parts)

        if "tests" in parts:
            continue

        # honour the caller-provided ignore list (prefix match on module path)
        if any(identifier.startswith(each_module) for each_module in ignored_modules):
            continue

        nav_titles = [name.replace("_", " ").title() for name in parts]
        nav[nav_titles] = doc_path.as_posix()

        # embed the README content via the pymdownx snippets include syntax
        with mkdocs_gen_files.open(full_doc_path, "w") as f:
            f.write(f'--8<-- "{path.relative_to(examples_dir.parent)}"')

        mkdocs_gen_files.set_edit_path(
            full_doc_path, Path("..") / path.relative_to(examples_dir.parent)
        )

    with mkdocs_gen_files.open(f"{target_doc_folder}/NAV.md", "w") as nav_file:
        nav_file.writelines(nav.build_literate_nav())


generate_docs_for_examples_readme(
    examples_dir=doc_dir.parent / "examples",
    target_doc_folder="examples",
)
def generate_docs_for_src_code(
    code_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = ()
):
    """Generate one mkdocstrings reference page per Python module in `code_dir`.

    Args:
        code_dir: root of the Python package to document.
        target_doc_folder: folder name (relative to the docs root) that
            receives the generated pages and the literate-nav summary.
        ignored_modules: dotted module-path prefixes to skip entirely.
            Defaults to an empty tuple (immutable, unlike the previous
            mutable ``[]`` default).

    Raises:
        ModuleNotFoundError: if ``code_dir`` is not a directory.
    """
    if not code_dir.is_dir():
        raise ModuleNotFoundError(str(code_dir))

    nav = mkdocs_gen_files.Nav()

    for path in sorted(code_dir.rglob("*.py")):
        module_path = path.relative_to(code_dir).with_suffix("")
        doc_path = path.relative_to(code_dir).with_suffix(".md")
        full_doc_path = Path(target_doc_folder, doc_path)

        parts = list(module_path.parts)

        if parts[-1] == "__init__":
            # a package __init__ becomes the package's index page
            doc_path = doc_path.with_name("index.md")
            full_doc_path = full_doc_path.with_name("index.md")
            parts.pop()

        if not parts:
            continue

        if "tests" in parts:
            continue

        identifier = ".".join(parts)
        # honour the caller-provided ignore list (prefix match on module path)
        if any(identifier.startswith(each_module) for each_module in ignored_modules):
            continue

        nav_titles = [
            nav_title_map.get(name, name.replace("_", " ").title()) for name in parts
        ]
        nav[nav_titles] = doc_path.as_posix()

        # mkdocstrings directive: render the API docs of this identifier
        with mkdocs_gen_files.open(full_doc_path, "w") as f:
            f.write(f"::: {identifier}")

        # this method works in docs folder
        mkdocs_gen_files.set_edit_path(
            full_doc_path, Path("..") / path.relative_to(code_dir.parent)
        )

    with mkdocs_gen_files.open(f"{target_doc_folder}/Summary.md", "w") as nav_file:
        nav_file.writelines(nav.build_literate_nav())


generate_docs_for_src_code(
    code_dir=doc_dir.parent / "libs" / "kotaemon" / "kotaemon",
    target_doc_folder="reference",
    ignored_modules={"contribs"},
)
-------------------------------------------------------------------------------- 1 | !function(){"use strict";var e;e=function(e){"true"===localStorage.getItem("data-md-prefers-color-scheme")&&document.querySelector("body").setAttribute("data-md-color-scheme",e.matches?"dracula":"default")},new MutationObserver((function(t){t.forEach((function(t){if("childList"===t.type&&t.addedNodes.length)for(var a=0;a 4 | {% if page.previous_page or page.next_page %} 5 | 48 | {% endif %} 49 | 50 | -------------------------------------------------------------------------------- /docs/theme/partials/libs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml app configuration file generated for kotaemon on 2024-12-24T20:56:32+07:00 2 | # 3 | # See https://fly.io/docs/reference/configuration/ for information about how to use this file. 
4 | # 5 | 6 | app = 'kotaemon' 7 | primary_region = 'sin' 8 | 9 | [build] 10 | 11 | [mounts] 12 | destination = "/app/ktem_app_data" 13 | source = "ktem_volume" 14 | 15 | [http_service] 16 | internal_port = 7860 17 | force_https = true 18 | auto_stop_machines = 'suspend' 19 | auto_start_machines = true 20 | min_machines_running = 0 21 | processes = ['app'] 22 | 23 | [[vm]] 24 | memory = '4gb' 25 | cpu_kind = 'shared' 26 | cpus = 4 27 | -------------------------------------------------------------------------------- /launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$GRADIO_SERVER_NAME" ]; then 4 | export GRADIO_SERVER_NAME="0.0.0.0" 5 | fi 6 | if [ -z "$GRADIO_SERVER_PORT" ]; then 7 | export GRADIO_SERVER_PORT="7860" 8 | fi 9 | 10 | # Check if environment variable KH_DEMO_MODE is set to true 11 | if [ "$KH_DEMO_MODE" = "true" ]; then 12 | echo "KH_DEMO_MODE is true. Launching in demo mode..." 13 | # Command to launch in demo mode 14 | GR_FILE_ROOT_PATH="/app" KH_FEATURE_USER_MANAGEMENT=false USE_LIGHTRAG=false uvicorn sso_app_demo:app --host "$GRADIO_SERVER_NAME" --port "$GRADIO_SERVER_PORT" 15 | else 16 | if [ "$KH_SSO_ENABLED" = "true" ]; then 17 | echo "KH_SSO_ENABLED is true. Launching in SSO mode..." 
# Disable telemetry with monkey patching
import logging

logger = logging.getLogger(__name__)
try:
    import posthog

    # Replacement for posthog.capture: log the call locally instead of
    # sending any event to PostHog's servers.
    def capture(*args, **kwargs):
        logger.info("posthog.capture called with args: %s, kwargs: %s", args, kwargs)

    # Monkey-patch so every library that calls posthog.capture is silenced.
    posthog.capture = capture
except ImportError:
    # posthog not installed: nothing to patch
    pass

try:
    import os

    # Disable Haystack telemetry via its documented env var, set *before*
    # importing haystack so the setting is picked up at import time.
    os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"
    import haystack.telemetry

    # Belt-and-braces: also null out the telemetry object itself.
    haystack.telemetry.telemetry = None
except ImportError:
    # haystack not installed: nothing to patch
    pass
class BaseAgent(BaseComponent):
    """Define base agent interface.

    Subclasses (e.g. ReactAgent, RewooAgent) implement `run` and may wrap it
    with `safeguard_run` to convert exceptions into failed AgentOutput
    objects instead of propagating them.
    """

    # Declarations below use theflow's Param/Node descriptors, not plain
    # class attributes; `help` text is surfaced by the framework.
    name: str = Param(help="Name of the agent.")
    agent_type: AgentType = Param(help="Agent type, must be one of AgentType")
    description: str = Param(
        help=(
            "Description used to tell the model how/when/why to use the agent. You can"
            " provide few-shot examples as a part of the description. This will be"
            " input to the prompt of LLM."
        )
    )
    llm: Optional[BaseLLM] = Node(
        help=(
            "LLM to be used for the agent (optional). LLM must implement BaseLLM"
            " interface."
        )
    )
    prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(
        help="A prompt template or a dict to supply different prompt to the agent"
    )
    plugins: list[BaseTool] = Param(
        # default_callback gives each instance a fresh list (avoids a shared
        # mutable default across agents)
        default_callback=lambda _: [],
        help="List of plugins / tools to be used in the agent",
    )

    @staticmethod
    def safeguard_run(run_func, *args, **kwargs):
        # Decorator: wraps a `run`-style method so any exception is returned
        # as a failed AgentOutput rather than raised to the caller.
        # NOTE(review): the *args/**kwargs on this outer function appear
        # unused (only run_func matters) -- confirm against call sites.
        def wrapper(self, *args, **kwargs):
            try:
                return run_func(self, *args, **kwargs)
            except Exception as e:
                # Surface the failure as data so pipelines can keep going.
                return AgentOutput(
                    text="",
                    agent_type=self.agent_type,
                    status="failed",
                    error=str(e),
                )

        return wrapper

    def add_tools(self, tools: list[BaseTool]) -> None:
        """Helper method to add tools and update agent state if needed"""
        self.plugins.extend(tools)

    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:
        """Run the component."""
        # Abstract by convention: every concrete agent must override this.
        raise NotImplementedError()
-------------------------------------------------------------------------------- 1 | from .base import AgentAction, AgentFinish, AgentOutput, AgentType, BaseScratchPad 2 | 3 | __all__ = ["AgentOutput", "AgentFinish", "BaseScratchPad", "AgentType", "AgentAction"] 4 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/agents/react/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import ReactAgent 2 | 3 | __all__ = ["ReactAgent"] 4 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/agents/react/prompt.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from kotaemon.llms import PromptTemplate 4 | 5 | zero_shot_react_prompt = PromptTemplate( 6 | template="""Answer the following questions as best you can. Give answer in {lang}. You have access to the following tools: 7 | {tool_description} 8 | Use the following format: 9 | 10 | Question: the input question you must answer 11 | Thought: you should always think about what to do 12 | 13 | Action: the action to take, should be one of [{tool_names}] 14 | 15 | Action Input: the input to the action, should be different from the action input of the same action in previous steps. 16 | 17 | Observation: the result of the action 18 | 19 | ... (this Thought/Action/Action Input/Observation can repeat N times) 20 | #Thought: I now know the final answer 21 | Final Answer: the final answer to the original input question 22 | 23 | Begin! After each Action Input. 
class GoogleSearchArgs(BaseModel):
    """Argument schema shared by the Google search tools."""

    query: str = Field(..., description="a search query")


class GoogleSearchTool(BaseTool):
    """Scrape top Google results via the `googlesearch` package (no API key)."""

    name: str = "google_search"
    description: str = (
        "A search engine retrieving top search results as snippets from Google. "
        "Input should be a search query."
    )
    args_schema: Optional[Type[BaseModel]] = GoogleSearchArgs

    def _run_tool(self, query: AnyStr) -> str:
        """Return newline-joined "title description" snippets for `query`.

        Raises:
            ImportError: if the optional `googlesearch` dependency is missing.
        """
        try:
            from googlesearch import search
        except ImportError:
            raise ImportError(
                "install googlesearch using `pip3 install googlesearch-python` to "
                "use this tool"
            )

        try:
            output = ""
            search_results = search(query, advanced=True)
            if search_results:
                output = "\n".join(
                    "{} {}".format(item.title, item.description)
                    for item in search_results
                )
        except HTTPError:
            # rate-limited or blocked by Google: degrade gracefully
            output = "No evidence found."

        return output


class SerpTool(BaseTool):
    """Search Google through SerpAPI (requires a SerpAPI key in the env)."""

    # Annotated like GoogleSearchTool above: without annotations, pydantic v2
    # treats plain class attributes as non-fields and ignores them.
    name: str = "google_search"
    description: str = (
        "Worker that searches results from Google. Useful when you need to find short "
        "and succinct answers about a specific topic. Input should be a search query."
    )
    args_schema: Optional[Type[BaseModel]] = GoogleSearchArgs

    def _run_tool(self, query: AnyStr) -> str:
        """Return the SerpAPI evidence string for `query`."""
        tool = SerpAPIWrapper()
        evidence = tool.run(query)

        return evidence
class LLMTool(BaseTool):
    """Tool that forwards the query verbatim to a wrapped LLM.

    With `dummy_mode` enabled (the default) the LLM is never called and the
    placeholder string "<->" is returned instead.
    """

    name: str = "llm"
    description: str = (
        "A pretrained LLM like yourself. Useful when you need to act with "
        "general world knowledge and common sense. Prioritize it when you "
        "are confident in solving the problem "
        "yourself. Input can be any instruction."
    )
    llm: BaseLLM
    args_schema: Optional[Type[BaseModel]] = LLMArgs
    # When True, skip the LLM call entirely and return the placeholder.
    dummy_mode: bool = True

    def _run_tool(self, query: AnyStr) -> str:
        """Run `query` through the LLM and return its text (or "<->").

        Raises:
            ToolException: if the LLM call fails with a ValueError.
        """
        try:
            # only hit the model when dummy mode is off
            response = self.llm(query) if not self.dummy_mode else None
        except ValueError as e:
            # re-raise through the tool-exception channel, keeping the
            # original cause attached for debugging
            raise ToolException("LLM Tool call failed") from e
        return response.text if response else "<->"
Similar: {wikipedia.search(search)}" 41 | return result 42 | 43 | 44 | class WikipediaArgs(BaseModel): 45 | query: str = Field(..., description="a search query as input to wkipedia") 46 | 47 | 48 | class WikipediaTool(BaseTool): 49 | """Tool that adds the capability to query the Wikipedia API.""" 50 | 51 | name: str = "wikipedia" 52 | description: str = ( 53 | "Search engine from Wikipedia, retrieving relevant wiki page. " 54 | "Useful when you need to get holistic knowledge about people, " 55 | "places, companies, historical events, or other subjects. " 56 | "Input should be a search query." 57 | ) 58 | args_schema: Optional[Type[BaseModel]] = WikipediaArgs 59 | doc_store: Any = None 60 | 61 | def _run_tool(self, query: AnyStr) -> AnyStr: 62 | if not self.doc_store: 63 | self.doc_store = Wiki() 64 | tool = self.doc_store 65 | evidence = tool.search(query) 66 | return evidence 67 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/agents/utils.py: -------------------------------------------------------------------------------- 1 | from kotaemon.base import Document 2 | 3 | 4 | def get_plugin_response_content(output) -> str: 5 | """ 6 | Wrapper for AgentOutput content return 7 | """ 8 | if isinstance(output, Document): 9 | return output.text 10 | else: 11 | return str(output) 12 | 13 | 14 | def calculate_cost(model_name: str, prompt_token: int, completion_token: int) -> float: 15 | """ 16 | Calculate the cost of a prompt and completion. 
class BaseComponent(Function):
    """A component is a class that can be used to compose a pipeline.

    !!! tip "Benefits of component"
        - Auto caching, logging
        - Allow deployment

    !!! tip "For each component, the spirit is"
        - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
        - Enforce single output type. Hence, the output type of a component should be
        as generic as possible.
    """

    # Upstream component whose output feeds this one; assigned externally
    # before calling flow().
    inflow = None

    def flow(self):
        # Recursively run the upstream chain first, then feed its result into
        # this component via __call__.
        if self.inflow is None:
            raise ValueError("No inflow provided.")

        if not isinstance(self.inflow, BaseComponent):
            raise ValueError(
                f"inflow must be a BaseComponent, found {type(self.inflow)}"
            )

        return self.__call__(self.inflow.flow())

    def set_output_queue(self, queue):
        # Attach `queue` to this component and propagate it to every child
        # node so intermediate outputs can be streamed from anywhere in the
        # pipeline. `_ff_nodes` is maintained by the theflow.Function base.
        self._queue = queue
        for name in self._ff_nodes:
            node = getattr(self, name)
            if isinstance(node, BaseComponent):
                node.set_output_queue(queue)

    def report_output(self, output: Optional[Document]):
        # Push an intermediate output onto the attached queue (non-blocking).
        # NOTE(review): assumes set_output_queue() was called beforehand —
        # `_queue` is not initialized anywhere else in this class; confirm
        # callers always attach a queue (or None) first.
        if self._queue is not None:
            self._queue.put_nowait(output)

    # The four entry points below are optional overrides; subclasses implement
    # the ones matching their sync/async and batch/streaming capabilities.
    def invoke(self, *args, **kwargs) -> Document | list[Document] | None:
        ...

    async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:
        ...

    def stream(self, *args, **kwargs) -> Iterator[Document] | None:
        ...

    def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:
        ...

    @abstractmethod
    def run(
        self, *args, **kwargs
    ) -> Document | list[Document] | Iterator[Document] | None | Any:
        """Run the component."""
        ...
def from_definition_to_markdown(definition: dict) -> str:
    """Render a component definition dict as a markdown fragment.

    The definition carries a "desc" string plus "params" and "nodes"
    mappings; an empty mapping renders its table as " N/A".
    """

    # Build the params table (falls back to N/A when there are no params)
    params = " N/A\n"
    if definition["params"]:
        rows = [
            "\n| Name | Description | Type | Default |\n",
            "| --- | --- | --- | --- |\n",
        ]
        for pname, spec in definition["params"].items():
            ptype = spec["type"].__name__ if inspect.isclass(spec["type"]) else spec["type"]
            rows.append(f"| {pname} | {spec['desc']} | {ptype} | {spec['default']} |\n")
        params = "".join(rows)

    # Build the nodes table (falls back to N/A when there are no nodes)
    nodes = " N/A\n"
    if definition["nodes"]:
        rows = [
            "\n| Name | Description | Type | Input | Output |\n",
            "| --- | --- | --- | --- | --- |\n",
        ]
        for nname, spec in definition["nodes"].items():
            ntype = spec["type"].__name__ if inspect.isclass(spec["type"]) else str(spec["type"])
            nin = spec["input"].__name__ if inspect.isclass(spec["input"]) else str(spec["input"])
            nout = (
                spec["output"].__name__
                if inspect.isclass(spec["output"])
                else str(spec["output"])
            )
            rows.append(f"|{nname}|{spec['desc']}|{ntype}|{nin}|{nout}|\n")
        nodes = "".join(rows)

    description = inspect.cleandoc(definition["desc"])
    return f"{description}\n\n_**Params:**_{params}\n_**Nodes:**_{nodes}"
def get_component(component_def: dict) -> gr.components.Component:
    """Instantiate a gradio component from its definition dict.

    The definition must name one of the supported components under the
    "component" key; optional constructor kwargs go under "params".
    """
    # Guard clause: a missing "component" key is unrecoverable.
    if "component" not in component_def:
        raise ValueError(
            f"Cannot decide the component from {component_def}. "
            "Please specify `component` with 1 of the following "
            f"values: {SUPPORTED_COMPONENTS}"
        )

    component = component_def["component"]
    if component not in SUPPORTED_COMPONENTS:
        raise ValueError(
            f"Unsupported UI component: {component}. "
            f"Must be one of {SUPPORTED_COMPONENTS}"
        )

    component_cls = COMPONENTS_CLASS[component]
    return component_cls(**component_def.get("params", {}))
import_dotted_string(key, safe=False) 28 | if value["ui-type"] == "chat": 29 | demos.append(build_chat_ui(value, pipeline_def).queue()) 30 | else: 31 | demos.append(build_pipeline_ui(value, pipeline_def).queue()) 32 | if len(demos) == 1: 33 | demo = demos[0] 34 | else: 35 | demo = gr.TabbedInterface( 36 | demos, 37 | tab_names=list(config_dict.keys()), 38 | title="PromptUI from kotaemon", 39 | analytics_enabled=False, 40 | theme=John(), 41 | ) 42 | 43 | demo.queue() 44 | 45 | return demo 46 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseEmbeddings 2 | from .endpoint_based import EndpointEmbeddings 3 | from .fastembed import FastEmbedEmbeddings 4 | from .langchain_based import ( 5 | LCAzureOpenAIEmbeddings, 6 | LCCohereEmbeddings, 7 | LCGoogleEmbeddings, 8 | LCHuggingFaceEmbeddings, 9 | LCMistralEmbeddings, 10 | LCOpenAIEmbeddings, 11 | ) 12 | from .openai import AzureOpenAIEmbeddings, OpenAIEmbeddings 13 | from .tei_endpoint_embed import TeiEndpointEmbeddings 14 | from .voyageai import VoyageAIEmbeddings 15 | 16 | __all__ = [ 17 | "BaseEmbeddings", 18 | "EndpointEmbeddings", 19 | "TeiEndpointEmbeddings", 20 | "LCOpenAIEmbeddings", 21 | "LCAzureOpenAIEmbeddings", 22 | "LCCohereEmbeddings", 23 | "LCHuggingFaceEmbeddings", 24 | "LCGoogleEmbeddings", 25 | "LCMistralEmbeddings", 26 | "OpenAIEmbeddings", 27 | "AzureOpenAIEmbeddings", 28 | "FastEmbedEmbeddings", 29 | "VoyageAIEmbeddings", 30 | ] 31 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/embeddings/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from kotaemon.base import BaseComponent, Document, DocumentWithEmbedding 4 | 5 | 6 | class BaseEmbeddings(BaseComponent): 7 | def run( 8 | 
class EndpointEmbeddings(BaseEmbeddings):
    """
    An Embeddings component that uses an OpenAI API compatible endpoint.

    Attributes:
        endpoint_url (str): The url of an OpenAI API compatible endpoint.
    """

    endpoint_url: str

    def run(
        self, text: str | list[str] | Document | list[Document]
    ) -> list[DocumentWithEmbedding]:
        """Generate embeddings from text.

        Args:
            text (str | list[str] | Document | list[Document]): text to
                generate embeddings from

        Returns:
            list[DocumentWithEmbedding]: embeddings

        Raises:
            requests.HTTPError: if the endpoint responds with an error status.
        """
        if not isinstance(text, list):
            text = [text]

        outputs = []

        # One request per item; the endpoint is assumed to accept a single
        # string under "input" — TODO confirm it does not support batching.
        for item in text:
            response = requests.post(self.endpoint_url, json={"input": str(item)})
            # ROBUSTNESS FIX: fail fast with a clear HTTPError instead of the
            # original's opaque KeyError when the endpoint returns an error
            # body without "data"/"usage" keys.
            response.raise_for_status()
            payload = response.json()
            outputs.append(
                DocumentWithEmbedding(
                    text=str(item),
                    embedding=payload["data"][0]["embedding"],
                    total_tokens=payload["usage"]["total_tokens"],
                    prompt_tokens=payload["usage"]["prompt_tokens"],
                )
            )

        return outputs
Higher values use more memory, but are faster", 30 | ) 31 | parallel: Optional[int] = Param( 32 | None, 33 | help=( 34 | "Number of threads to use for embeddings. " 35 | "If > 1, data-parallel encoding will be used. " 36 | "If 0, use all available CPUs. " 37 | "If None, use default onnxruntime threading. " 38 | "Defaults to None." 39 | ), 40 | ) 41 | 42 | @Param.auto() 43 | def client_(self) -> "TextEmbedding": 44 | try: 45 | from fastembed import TextEmbedding 46 | except ImportError: 47 | raise ImportError("Please install FastEmbed: `pip install fastembed`") 48 | 49 | return TextEmbedding(model_name=self.model_name) 50 | 51 | def invoke( 52 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 53 | ) -> list[DocumentWithEmbedding]: 54 | input_ = self.prepare_input(text) 55 | embeddings = self.client_.embed( 56 | [_.content for _ in input_], 57 | batch_size=self.batch_size, 58 | parallel=self.parallel, 59 | ) 60 | return [ 61 | DocumentWithEmbedding( 62 | content=doc, 63 | embedding=list(embedding), 64 | ) 65 | for doc, embedding in zip(input_, embeddings) 66 | ] 67 | 68 | async def ainvoke( 69 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 70 | ) -> list[DocumentWithEmbedding]: 71 | """Fastembed does not support async API.""" 72 | return self.invoke(text, *args, **kwargs) 73 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/embeddings/voyageai.py: -------------------------------------------------------------------------------- 1 | """Implements embeddings from [Voyage AI](https://voyageai.com). 
2 | """ 3 | 4 | import importlib 5 | 6 | from kotaemon.base import Document, DocumentWithEmbedding, Param 7 | 8 | from .base import BaseEmbeddings 9 | 10 | vo = None 11 | 12 | 13 | def _import_voyageai(): 14 | global vo 15 | if not vo: 16 | vo = importlib.import_module("voyageai") 17 | return vo 18 | 19 | 20 | def _format_output(texts: list[str], embeddings: list[list]): 21 | """Formats the output of all `.embed` calls. 22 | Args: 23 | texts: List of original documents 24 | embeddings: Embeddings corresponding to each document 25 | """ 26 | return [ 27 | DocumentWithEmbedding(content=text, embedding=embedding) 28 | for text, embedding in zip(texts, embeddings) 29 | ] 30 | 31 | 32 | class VoyageAIEmbeddings(BaseEmbeddings): 33 | """Voyage AI provides best-in-class embedding models and rerankers.""" 34 | 35 | api_key: str = Param(None, help="Voyage API key", required=False) 36 | model: str = Param( 37 | "voyage-3", 38 | help=( 39 | "Model name to use. The Voyage " 40 | "[documentation](https://docs.voyageai.com/docs/embeddings) " 41 | "provides a list of all available embedding models." 
42 | ), 43 | required=True, 44 | ) 45 | 46 | def __init__(self, *args, **kwargs): 47 | super().__init__(*args, **kwargs) 48 | if not self.api_key: 49 | raise ValueError("API key must be provided for VoyageAIEmbeddings.") 50 | 51 | self._client = _import_voyageai().Client(api_key=self.api_key) 52 | self._aclient = _import_voyageai().AsyncClient(api_key=self.api_key) 53 | 54 | def invoke( 55 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 56 | ) -> list[DocumentWithEmbedding]: 57 | texts = [t.content for t in self.prepare_input(text)] 58 | embeddings = self._client.embed(texts, model=self.model).embeddings 59 | return _format_output(texts, embeddings) 60 | 61 | async def ainvoke( 62 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 63 | ) -> list[DocumentWithEmbedding]: 64 | texts = [t.content for t in self.prepare_input(text)] 65 | embeddings = await self._aclient.embed(texts, model=self.model).embeddings 66 | return _format_output(texts, embeddings) 67 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/__init__.py: -------------------------------------------------------------------------------- 1 | from .vectorindex import VectorIndexing, VectorRetrieval 2 | 3 | __all__ = ["VectorIndexing", "VectorRetrieval"] 4 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .doc_parsers import BaseDocParser, SummaryExtractor, TitleExtractor 2 | 3 | __all__ = [ 4 | "BaseDocParser", 5 | "TitleExtractor", 6 | "SummaryExtractor", 7 | ] 8 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/extractors/doc_parsers.py: -------------------------------------------------------------------------------- 1 | from ..base import 
class SummaryExtractor(LlamaIndexDocTransformerMixin, BaseDocParser):
    """Extract summaries from documents via llama-index's SummaryExtractor."""

    def __init__(
        self,
        llm=None,
        summaries: "list[str] | None" = None,
        **params,
    ):
        """
        Args:
            llm: LLM used by the underlying llama-index extractor.
            summaries: which summaries to build (e.g. ["self"]). Defaults to
                ["self"] when not given.
            **params: forwarded to the llama-index SummaryExtractor.
        """
        # BUG FIX: the original used a mutable default argument
        # (`summaries: list[str] = ["self"]`); use a None sentinel instead.
        # Callers that passed `summaries` explicitly see identical behavior.
        if summaries is None:
            summaries = ["self"]
        super().__init__(llm=llm, summaries=summaries, **params)

    def _get_li_class(self):
        # Imported lazily so llama-index is only required when actually used.
        from llama_index.core.extractors import SummaryExtractor

        return SummaryExtractor
class CohereReranking(BaseReranking):
    # Cohere rerank model to call.
    # NOTE(review): "rerank-multilingual-v2.0" is an older model id — confirm
    # it is still served before relying on it.
    model_name: str = "rerank-multilingual-v2.0"
    # API key, read from the COHERE_API_KEY env var by default.
    cohere_api_key: str = config("COHERE_API_KEY", "")
    # When True and no key is set, fall back to the key configured in ktem.
    use_key_from_ktem: bool = False

    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score"""
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Please install Cohere `pip install cohere` to use Cohere Reranking"
            )

        # try to get COHERE_API_KEY from embeddings
        # Best-effort fallback: the broad except is deliberate — any failure
        # here (ktem not installed, no "cohere" model, missing kwarg) just
        # leaves the key empty and is reported below.
        if not self.cohere_api_key and self.use_key_from_ktem:
            try:
                from ktem.embeddings.manager import (
                    embedding_models_manager as embeddings,
                )

                cohere_model = embeddings.get("cohere")
                ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore
                    "cohere_api_key"
                )
                # "your-key" is the ktem placeholder value, not a real key.
                if ktem_cohere_api_key != "your-key":
                    self.cohere_api_key = ktem_cohere_api_key
            except Exception as e:
                print("Cannot get Cohere API key from `ktem`", e)

        # Without a key, reranking is silently skipped and the input order
        # is preserved (graceful degradation rather than an error).
        if not self.cohere_api_key:
            print("Cohere API key not found. Skipping rerankings.")
            return documents

        cohere_client = cohere.Client(self.cohere_api_key)
        compressed_docs: list[Document] = []

        if not documents:  # to avoid empty api call
            return compressed_docs

        _docs = [d.content for d in documents]
        response = cohere_client.rerank(
            model=self.model_name, query=query, documents=_docs
        )
        # Results come back ordered by relevance; r.index maps each result
        # back to its position in the original `documents` list.
        for r in response.results:
            doc = documents[r.index]
            doc.metadata["reranking_score"] = r.relevance_score
            compressed_docs.append(doc)

        return compressed_docs
class LLMReranking(BaseReranking):
    """Filter documents by asking an LLM a YES/NO relevance question per doc."""

    llm: BaseLLM
    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)
    # Fallback size: if the LLM marks nothing relevant, return the first
    # top_k documents instead of an empty list.
    top_k: int = 3
    # Run the per-document LLM calls in a thread pool when True.
    concurrent: bool = True

    def run(
        self,
        documents: list[Document],
        query: str,
    ) -> list[Document]:
        """Filter down documents based on their relevance to the query."""
        filtered_docs = []
        output_parser = BooleanOutputParser()

        if self.concurrent:
            with ThreadPoolExecutor() as executor:
                futures = []
                for doc in documents:
                    _prompt = self.prompt_template.populate(
                        question=query, context=doc.get_content()
                    )
                    # BUG FIX: the original submitted
                    # `lambda: self.llm(_prompt).text`, a late-binding closure —
                    # by the time a worker ran, `_prompt` could already have
                    # been rebound to a later document's prompt. Pass the
                    # prompt as an explicit argument instead.
                    futures.append(executor.submit(self.llm, _prompt))

                results = [future.result().text for future in futures]
        else:
            results = []
            for doc in documents:
                _prompt = self.prompt_template.populate(
                    question=query, context=doc.get_content()
                )
                results.append(self.llm(_prompt).text)

        # use Boolean parser to extract relevancy output from LLM
        results = [output_parser.parse(result) for result in results]
        for include_doc, doc in zip(results, documents):
            if include_doc:
                filtered_docs.append(doc)

        # prevent returning empty result
        if len(filtered_docs) == 0:
            filtered_docs = documents[: self.top_k]

        return filtered_docs
class LLMScoring(LLMReranking):
    """Variant of LLMReranking that records a per-document relevance score."""

    def run(
        self,
        documents: list[Document],
        query: str,
    ) -> list[Document]:
        """Filter down documents based on their relevance to the query."""
        filtered_docs: list[Document] = []
        output_parser = BooleanOutputParser()

        if self.concurrent:
            with ThreadPoolExecutor() as executor:
                futures = []
                for doc in documents:
                    _prompt = self.prompt_template.populate(
                        question=query, context=doc.get_content()
                    )
                    # BUG FIX: the original submitted `lambda: self.llm(_prompt)`,
                    # a late-binding closure — workers could all evaluate the
                    # last document's prompt. Pass the prompt as an explicit
                    # argument instead.
                    futures.append(executor.submit(self.llm, _prompt))

                results = [future.result() for future in futures]
        else:
            results = []
            for doc in documents:
                _prompt = self.prompt_template.populate(
                    question=query, context=doc.get_content()
                )
                results.append(self.llm(_prompt))

        for result, doc in zip(results, documents):
            # Probability of the parsed answer, from the mean token logprob;
            # flipped (1 - score) when the LLM answered "NO".
            score = np.exp(np.average(result.logprobs))
            include_doc = output_parser.parse(result.text)
            if include_doc:
                doc.metadata["llm_reranking_score"] = score
            else:
                doc.metadata["llm_reranking_score"] = 1 - score
            # NOTE(review): every document is appended regardless of the
            # YES/NO answer — this scores rather than filters, despite the
            # docstring. Preserved as-is; confirm intended.
            filtered_docs.append(doc)

        # prevent returning empty result
        if len(filtered_docs) == 0:
            filtered_docs = documents[: self.top_k]

        return filtered_docs
class WebSearch(BaseComponent):
    """WebSearch component for fetching data from the web
    using Jina API
    """

    def run(
        self,
        text: str,
        *args,
        **kwargs,
    ) -> list[RetrievedDocument]:
        """Search the web for ``text`` and return each hit as a document.

        Raises:
            ValueError: if no JINA_API_KEY is configured.
            requests.HTTPError: if the Jina endpoint returns an error status.
        """
        from urllib.parse import quote

        if JINA_API_KEY == "":
            raise ValueError(
                "This feature requires JINA_API_KEY "
                "(get free one from https://jina.ai/reader)"
            )

        # setup the request; percent-encode the query so spaces and reserved
        # characters (?, #, &) survive being embedded in the URL path
        api_url = f"https://s.jina.ai/{quote(text)}"
        headers = {
            "X-With-Generated-Alt": "true",
            "Accept": "application/json",
            # the key is guaranteed non-empty by the guard above
            "Authorization": f"Bearer {JINA_API_KEY}",
        }

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        response_dict = response.json()

        return [
            RetrievedDocument(
                text=(
                    "###URL: [{url}]({url})\n\n"
                    "####{title}\n\n"
                    "{description}\n"
                    "{content}"
                ).format(
                    url=item["url"],
                    title=item["title"],
                    description=item["description"],
                    content=item["content"],
                ),
                metadata={
                    "file_name": "Web search",
                    "type": "table",
                    "llm_trulens_score": 1.0,
                },
            )
            for item in response_dict["data"]
        ]

    def generate_relevant_scores(self, text, documents: list[RetrievedDocument]):
        """Web results carry a fixed score in metadata; return them unchanged."""
        return documents
""" 12 | 13 | def run( 14 | self, 15 | text: str, 16 | *args, 17 | **kwargs, 18 | ) -> list[RetrievedDocument]: 19 | if TAVILY_API_KEY == "": 20 | raise ValueError( 21 | "This feature requires TAVILY_API_KEY " 22 | "(get free one from https://app.tavily.com/)" 23 | ) 24 | 25 | try: 26 | from tavily import TavilyClient 27 | except ImportError: 28 | raise ImportError( 29 | "Please install `pip install tavily-python` to use this feature" 30 | ) 31 | 32 | tavily_client = TavilyClient(api_key=TAVILY_API_KEY) 33 | results = tavily_client.search( 34 | query=text, 35 | search_depth="advanced", 36 | )["results"] 37 | context = "\n\n".join( 38 | "###URL: [{url}]({url})\n\n{content}".format( 39 | url=result["url"], 40 | content=result["content"], 41 | ) 42 | for result in results 43 | ) 44 | 45 | return [ 46 | RetrievedDocument( 47 | text=context, 48 | metadata={ 49 | "file_name": "Web search", 50 | "type": "table", 51 | "llm_trulens_score": 1.0, 52 | }, 53 | ) 54 | ] 55 | 56 | def generate_relevant_scores(self, text, documents: list[RetrievedDocument]): 57 | return documents 58 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/splitters/__init__.py: -------------------------------------------------------------------------------- 1 | from ..base import DocTransformer, LlamaIndexDocTransformerMixin 2 | 3 | 4 | class BaseSplitter(DocTransformer): 5 | """Represent base splitter class""" 6 | 7 | ... 
class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    """Split documents into fixed-size token chunks.

    Thin wrapper around llama-index's ``TokenTextSplitter`` via
    ``LlamaIndexDocTransformerMixin``.
    """

    def __init__(
        self,
        chunk_size: int = 1024,
        chunk_overlap: int = 20,
        separator: str = " ",
        **params,
    ):
        # All arguments are forwarded verbatim to the wrapped
        # llama-index splitter.
        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator=separator,
            **params,
        )

    def _get_li_class(self):
        # Imported lazily so llama-index is only required when this
        # splitter is actually used.
        from llama_index.core.text_splitter import TokenTextSplitter

        return TokenTextSplitter
class BaseLLM(BaseComponent):
    """Abstract interface every LLM (chat or completion) must implement.

    Subclasses override the sync/async/streaming entry points; ``run``
    delegates to ``invoke`` so the component framework has one uniform
    call path.
    """

    def to_langchain_format(self) -> BaseLanguageModel:
        """Return an equivalent langchain model object for interop."""
        raise NotImplementedError

    def invoke(self, *args, **kwargs) -> LLMInterface:
        """Synchronously generate a single response."""
        raise NotImplementedError

    async def ainvoke(self, *args, **kwargs) -> LLMInterface:
        """Asynchronously generate a single response."""
        raise NotImplementedError

    def stream(self, *args, **kwargs) -> Iterator[LLMInterface]:
        """Synchronously generate a response as a stream of chunks."""
        raise NotImplementedError

    def astream(self, *args, **kwargs) -> AsyncGenerator[LLMInterface, None]:
        """Asynchronously generate a response as a stream of chunks."""
        raise NotImplementedError

    def run(self, *args, **kwargs):
        """Component entry point; delegates to ``invoke``."""
        return self.invoke(*args, **kwargs)
class ChatLLM(BaseLLM):
    def flow(self):
        """Run the connected inflow component and feed its text output
        into this chat model.

        Returns:
            The chat model's response to the inflow's text.

        Raises:
            ValueError: if no inflow is connected, or the inflow is not a
                BaseComponent.
        """
        if self.inflow is None:
            raise ValueError("No inflow provided.")

        if not isinstance(self.inflow, BaseComponent):
            raise ValueError(
                f"inflow must be a BaseComponent, found {type(self.inflow)}"
            )

        # run the upstream component and forward only its text payload
        text = self.inflow.flow().text
        return self.__call__(text)
"LCCompletionMixin", "LlamaCpp"] 5 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/llms/completions/base.py: -------------------------------------------------------------------------------- 1 | from kotaemon.llms.base import BaseLLM 2 | 3 | 4 | class LLM(BaseLLM): 5 | pass 6 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/llms/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BasePromptComponent 2 | from .template import PromptTemplate 3 | 4 | __all__ = ["BasePromptComponent", "PromptTemplate"] 5 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .adobe_loader import AdobeReader 2 | from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader 3 | from .base import AutoReader, BaseReader 4 | from .composite_loader import DirectoryReader 5 | from .docling_loader import DoclingReader 6 | from .docx_loader import DocxReader 7 | from .excel_loader import ExcelReader, PandasExcelReader 8 | from .html_loader import HtmlReader, MhtmlReader 9 | from .mathpix_loader import MathpixPDFReader 10 | from .ocr_loader import ImageReader, OCRReader 11 | from .pdf_loader import PDFThumbnailReader 12 | from .txt_loader import TxtReader 13 | from .unstructured_loader import UnstructuredReader 14 | from .web_loader import WebReader 15 | 16 | __all__ = [ 17 | "AutoReader", 18 | "AzureAIDocumentIntelligenceLoader", 19 | "BaseReader", 20 | "PandasExcelReader", 21 | "ExcelReader", 22 | "MathpixPDFReader", 23 | "ImageReader", 24 | "OCRReader", 25 | "DirectoryReader", 26 | "UnstructuredReader", 27 | "DocxReader", 28 | "HtmlReader", 29 | "MhtmlReader", 30 | "AdobeReader", 31 | "TxtReader", 32 | "PDFThumbnailReader", 33 
| "WebReader", 34 | "DoclingReader", 35 | ] 36 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/composite_loader.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Optional, Type 2 | 3 | from llama_index.core.readers.base import BaseReader as LIBaseReader 4 | 5 | from .base import BaseReader, LIReaderMixin 6 | 7 | 8 | class DirectoryReader(LIReaderMixin, BaseReader): 9 | """Wrap around llama-index SimpleDirectoryReader 10 | 11 | Args: 12 | input_dir (str): Path to the directory. 13 | input_files (List): List of file paths to read 14 | (Optional; overrides input_dir, exclude) 15 | exclude (List): glob of python file paths to exclude (Optional) 16 | exclude_hidden (bool): Whether to exclude hidden files (dotfiles). 17 | encoding (str): Encoding of the files. 18 | Default is utf-8. 19 | errors (str): how encoding and decoding errors are to be handled, 20 | see https://docs.python.org/3/library/functions.html#open 21 | recursive (bool): Whether to recursively search in subdirectories. 22 | False by default. 23 | filename_as_id (bool): Whether to use the filename as the document id. 24 | False by default. 25 | required_exts (Optional[List[str]]): List of required extensions. 26 | Default is None. 27 | file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file 28 | extension to a BaseReader class that specifies how to convert that file 29 | to text. If not specified, use default from DEFAULT_FILE_READER_CLS. 30 | num_files_limit (Optional[int]): Maximum number of files to read. 31 | Default is None. 32 | file_metadata (Optional[Callable[str, Dict]]): A function that takes 33 | in a filename and returns a Dict of metadata for the Document. 34 | Default is None. 
35 | """ 36 | 37 | input_dir: Optional[str] = None 38 | input_files: Optional[List] = None 39 | exclude: Optional[List] = None 40 | exclude_hidden: bool = True 41 | errors: str = "ignore" 42 | recursive: bool = False 43 | encoding: str = "utf-8" 44 | filename_as_id: bool = False 45 | required_exts: Optional[list[str]] = None 46 | file_extractor: Optional[dict[str, "LIBaseReader"]] = None 47 | num_files_limit: Optional[int] = None 48 | file_metadata: Optional[Callable[[str], dict]] = None 49 | 50 | def _get_wrapped_class(self) -> Type["LIBaseReader"]: 51 | from llama_index.core import SimpleDirectoryReader 52 | 53 | return SimpleDirectoryReader 54 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/txt_loader.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from kotaemon.base import Document 5 | 6 | from .base import BaseReader 7 | 8 | 9 | class TxtReader(BaseReader): 10 | def run( 11 | self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs 12 | ) -> list[Document]: 13 | return self.load_data(Path(file_path), extra_info=extra_info, **kwargs) 14 | 15 | def load_data( 16 | self, file_path: Path, extra_info: Optional[dict] = None, **kwargs 17 | ) -> list[Document]: 18 | with open(file_path, "r", encoding="utf-8") as f: 19 | text = f.read() 20 | 21 | metadata = extra_info or {} 22 | return [Document(text=text, metadata=metadata)] 23 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/kotaemon/loaders/utils/__init__.py -------------------------------------------------------------------------------- 
class WebReader(BaseReader):
    """Load a web page as text via the Jina reader API."""

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Component entry point; ``file_path`` is actually a URL.

        Pass the URL through as-is: wrapping it in ``Path`` would collapse
        the ``//`` after the scheme ("https://" -> "https:/").
        """
        return self.load_data(file_path, extra_info=extra_info, **kwargs)

    def fetch_url(self, url: str):
        """Fetch ``url`` through the Jina reader endpoint and return its text.

        Raises:
            requests.HTTPError: if the reader endpoint returns an error status.
        """
        # setup the request; use the configurable JINA_URL endpoint defined
        # above instead of hard-coding it
        api_url = f"{JINA_URL}{url}"
        headers = {
            "X-With-Links-Summary": "true",
        }
        if JINA_API_KEY:
            headers["Authorization"] = f"Bearer {JINA_API_KEY}"

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()

        data = response.text
        return data

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Fetch the URL given as ``file_path`` and wrap it in a Document."""
        file_path = str(file_path)
        output = self.fetch_url(file_path)
        metadata = extra_info or {}

        return [Document(text=output, metadata=metadata)]
CohereReranking 3 | from .tei_fast_rerank import TeiFastReranking 4 | from .voyageai import VoyageAIReranking 5 | 6 | __all__ = ["BaseReranking", "TeiFastReranking", "CohereReranking", "VoyageAIReranking"] 7 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/rerankings/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import abstractmethod 4 | 5 | from kotaemon.base import BaseComponent, Document 6 | 7 | 8 | class BaseReranking(BaseComponent): 9 | @abstractmethod 10 | def run(self, documents: list[Document], query: str) -> list[Document]: 11 | """Main method to transform list of documents 12 | (re-ranking, filtering, etc)""" 13 | ... 14 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/rerankings/cohere.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from decouple import config 4 | 5 | from kotaemon.base import Document, Param 6 | 7 | from .base import BaseReranking 8 | 9 | 10 | class CohereReranking(BaseReranking): 11 | """Cohere Reranking model""" 12 | 13 | model_name: str = Param( 14 | "rerank-multilingual-v2.0", 15 | help=( 16 | "ID of the model to use. 
    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score"""
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Please install Cohere " "`pip install cohere` to use Cohere Reranking"
            )

        # Skip (rather than crash) when the key is empty or still contains the
        # unfilled "COHERE_API_KEY" placeholder text from the env template.
        if not self.cohere_api_key or "COHERE_API_KEY" in self.cohere_api_key:
            print("Cohere API key not found. Skipping rerankings.")
            return documents

        cohere_client = cohere.Client(self.cohere_api_key)
        compressed_docs: list[Document] = []

        if not documents:  # to avoid empty api call
            return compressed_docs

        _docs = [d.content for d in documents]
        response = cohere_client.rerank(
            model=self.model_name, query=query, documents=_docs
        )
        # results come back ordered most- to least-relevant; r.index points
        # back into the original `documents` list
        for r in response.results:
            doc = documents[r.index]
            doc.metadata["reranking_score"] = r.relevance_score
            compressed_docs.append(doc)

        return compressed_docs
27 | "ID of the model to use. You can go to [Supported Models]" 28 | "(https://docs.voyageai.com/docs/reranker) to see the supported models" 29 | ), 30 | required=True, 31 | ) 32 | api_key: str = Param( 33 | config("VOYAGE_API_KEY", ""), 34 | help="VoyageAI API key", 35 | required=True, 36 | ) 37 | 38 | def __init__(self, *args, **kwargs): 39 | super().__init__(*args, **kwargs) 40 | if not self.api_key: 41 | raise ValueError("API key must be provided for VoyageAIEmbeddings.") 42 | 43 | self._client = _import_voyageai().Client(api_key=self.api_key) 44 | self._aclient = _import_voyageai().AsyncClient(api_key=self.api_key) 45 | 46 | def run(self, documents: list[Document], query: str) -> list[Document]: 47 | """Use VoyageAI Reranker model to re-order documents 48 | with their relevance score""" 49 | compressed_docs: list[Document] = [] 50 | 51 | if not documents: # to avoid empty api call 52 | return compressed_docs 53 | 54 | _docs = [d.content for d in documents] 55 | response = self._client.rerank( 56 | model=self.model_name, query=query, documents=_docs 57 | ) 58 | for r in response.results: 59 | doc = documents[r.index] 60 | doc.metadata["reranking_score"] = r.relevance_score 61 | compressed_docs.append(doc) 62 | 63 | return compressed_docs 64 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/__init__.py: -------------------------------------------------------------------------------- 1 | from .docstores import ( 2 | BaseDocumentStore, 3 | ElasticsearchDocumentStore, 4 | InMemoryDocumentStore, 5 | LanceDBDocumentStore, 6 | SimpleFileDocumentStore, 7 | ) 8 | from .vectorstores import ( 9 | BaseVectorStore, 10 | ChromaVectorStore, 11 | InMemoryVectorStore, 12 | LanceDBVectorStore, 13 | MilvusVectorStore, 14 | QdrantVectorStore, 15 | SimpleFileVectorStore, 16 | ) 17 | 18 | __all__ = [ 19 | # Document stores 20 | "BaseDocumentStore", 21 | "InMemoryDocumentStore", 22 | "ElasticsearchDocumentStore", 23 | 
"SimpleFileDocumentStore", 24 | "LanceDBDocumentStore", 25 | # Vector stores 26 | "BaseVectorStore", 27 | "ChromaVectorStore", 28 | "InMemoryVectorStore", 29 | "SimpleFileVectorStore", 30 | "LanceDBVectorStore", 31 | "MilvusVectorStore", 32 | "QdrantVectorStore", 33 | ] 34 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/docstores/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseDocumentStore 2 | from .elasticsearch import ElasticsearchDocumentStore 3 | from .in_memory import InMemoryDocumentStore 4 | from .lancedb import LanceDBDocumentStore 5 | from .simple_file import SimpleFileDocumentStore 6 | 7 | __all__ = [ 8 | "BaseDocumentStore", 9 | "InMemoryDocumentStore", 10 | "ElasticsearchDocumentStore", 11 | "SimpleFileDocumentStore", 12 | "LanceDBDocumentStore", 13 | ] 14 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/docstores/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Optional, Union 3 | 4 | from kotaemon.base import Document 5 | 6 | 7 | class BaseDocumentStore(ABC): 8 | """A document store is in charged of storing and managing documents""" 9 | 10 | @abstractmethod 11 | def __init__(self, *args, **kwargs): 12 | ... 13 | 14 | @abstractmethod 15 | def add( 16 | self, 17 | docs: Union[Document, List[Document]], 18 | ids: Optional[Union[List[str], str]] = None, 19 | **kwargs, 20 | ): 21 | """Add document into document store 22 | 23 | Args: 24 | docs: Document or list of documents 25 | ids: List of ids of the documents. Optional, if not set will use doc.doc_id 26 | """ 27 | ... 28 | 29 | @abstractmethod 30 | def get(self, ids: Union[List[str], str]) -> List[Document]: 31 | """Get document by id""" 32 | ... 
33 | 34 | @abstractmethod 35 | def get_all(self) -> List[Document]: 36 | """Get all documents""" 37 | ... 38 | 39 | @abstractmethod 40 | def count(self) -> int: 41 | """Count number of documents""" 42 | ... 43 | 44 | @abstractmethod 45 | def query( 46 | self, query: str, top_k: int = 10, doc_ids: Optional[list] = None 47 | ) -> List[Document]: 48 | """Search document store using search query""" 49 | ... 50 | 51 | @abstractmethod 52 | def delete(self, ids: Union[List[str], str]): 53 | """Delete document by id""" 54 | ... 55 | 56 | @abstractmethod 57 | def drop(self): 58 | """Drop the document store""" 59 | ... 60 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/docstores/simple_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List, Optional, Union 3 | 4 | from kotaemon.base import Document 5 | 6 | from .in_memory import InMemoryDocumentStore 7 | 8 | 9 | class SimpleFileDocumentStore(InMemoryDocumentStore): 10 | """Improve InMemoryDocumentStore by auto saving whenever the corpus is changed""" 11 | 12 | def __init__(self, path: str | Path, collection_name: str = "default"): 13 | super().__init__() 14 | self._path = path 15 | self._collection_name = collection_name 16 | 17 | Path(path).mkdir(parents=True, exist_ok=True) 18 | self._save_path = Path(path) / f"{collection_name}.json" 19 | if self._save_path.is_file(): 20 | self.load(self._save_path) 21 | 22 | def get(self, ids: Union[List[str], str]) -> List[Document]: 23 | """Get document by id""" 24 | if not isinstance(ids, list): 25 | ids = [ids] 26 | 27 | for doc_id in ids: 28 | if doc_id not in self._store: 29 | self.load(self._save_path) 30 | break 31 | 32 | return [self._store[doc_id] for doc_id in ids] 33 | 34 | def add( 35 | self, 36 | docs: Union[Document, List[Document]], 37 | ids: Optional[Union[List[str], str]] = None, 38 | **kwargs, 39 | ): 40 | 
"""Add document into document store 41 | 42 | Args: 43 | docs: list of documents to add 44 | ids: specify the ids of documents to add or 45 | use existing doc.doc_id 46 | exist_ok: raise error when duplicate doc-id 47 | found in the docstore (default to False) 48 | """ 49 | super().add(docs=docs, ids=ids, **kwargs) 50 | self.save(self._save_path) 51 | 52 | def delete(self, ids: Union[List[str], str]): 53 | """Delete document by id""" 54 | super().delete(ids=ids) 55 | self.save(self._save_path) 56 | 57 | def drop(self): 58 | """Drop the document store""" 59 | super().drop() 60 | self._save_path.unlink(missing_ok=True) 61 | 62 | def __persist_flow__(self): 63 | from theflow.utils.modules import serialize 64 | 65 | return { 66 | "path": serialize(self._path), 67 | "collection_name": self._collection_name, 68 | } 69 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseVectorStore 2 | from .chroma import ChromaVectorStore 3 | from .in_memory import InMemoryVectorStore 4 | from .lancedb import LanceDBVectorStore 5 | from .milvus import MilvusVectorStore 6 | from .qdrant import QdrantVectorStore 7 | from .simple_file import SimpleFileVectorStore 8 | 9 | __all__ = [ 10 | "BaseVectorStore", 11 | "ChromaVectorStore", 12 | "InMemoryVectorStore", 13 | "SimpleFileVectorStore", 14 | "LanceDBVectorStore", 15 | "MilvusVectorStore", 16 | "QdrantVectorStore", 17 | ] 18 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py: -------------------------------------------------------------------------------- 1 | """Simple vector store index.""" 2 | from typing import Any, Optional, Type 3 | 4 | import fsspec 5 | from llama_index.core.vector_stores import SimpleVectorStore as LISimpleVectorStore 6 | from 
llama_index.core.vector_stores.simple import SimpleVectorStoreData 7 | 8 | from .base import LlamaIndexVectorStore 9 | 10 | 11 | class InMemoryVectorStore(LlamaIndexVectorStore): 12 | _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore 13 | store_text: bool = False 14 | 15 | def __init__( 16 | self, 17 | data: Optional[SimpleVectorStoreData] = None, 18 | fs: Optional[fsspec.AbstractFileSystem] = None, 19 | **kwargs: Any, 20 | ) -> None: 21 | """Initialize params.""" 22 | self._data = data or SimpleVectorStoreData() 23 | self._fs = fs or fsspec.filesystem("file") 24 | 25 | super().__init__( 26 | data=data, 27 | fs=fs, 28 | **kwargs, 29 | ) 30 | 31 | def save( 32 | self, 33 | save_path: str, 34 | fs: Optional[fsspec.AbstractFileSystem] = None, 35 | **kwargs, 36 | ): 37 | 38 | """save a simpleVectorStore to a dictionary. 39 | 40 | Args: 41 | save_path: Path of saving vector to disk. 42 | fs: An abstract super-class for pythonic file-systems 43 | """ 44 | self._client.persist(persist_path=save_path, fs=fs) 45 | 46 | def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None): 47 | 48 | """Create a SimpleKVStore from a load directory. 49 | 50 | Args: 51 | load_path: Path of loading vector. 
52 | fs: An abstract super-class for pythonic file-systems 53 | """ 54 | self._client = self._client.from_persist_path(persist_path=load_path, fs=fs) 55 | 56 | def drop(self): 57 | """Clear the old data""" 58 | self._data = SimpleVectorStoreData() 59 | 60 | def __persist_flow__(self): 61 | d = self._data.to_dict() 62 | d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}" 63 | return { 64 | "data": d, 65 | # "fs": self._fs, 66 | } 67 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, cast 2 | 3 | from .base import LlamaIndexVectorStore 4 | 5 | 6 | class QdrantVectorStore(LlamaIndexVectorStore): 7 | _li_class = None 8 | 9 | def _get_li_class(self): 10 | try: 11 | from llama_index.vector_stores.qdrant import ( 12 | QdrantVectorStore as LIQdrantVectorStore, 13 | ) 14 | except ImportError: 15 | raise ImportError( 16 | "Please install missing package: " 17 | "'pip install llama-index-vector-stores-qdrant'" 18 | ) 19 | 20 | return LIQdrantVectorStore 21 | 22 | def __init__( 23 | self, 24 | collection_name, 25 | url: Optional[str] = None, 26 | api_key: Optional[str] = None, 27 | client_kwargs: Optional[dict] = None, 28 | **kwargs: Any, 29 | ): 30 | self._collection_name = collection_name 31 | self._url = url 32 | self._api_key = api_key 33 | self._client_kwargs = client_kwargs 34 | self._kwargs = kwargs 35 | 36 | super().__init__( 37 | collection_name=collection_name, 38 | url=url, 39 | api_key=api_key, 40 | client_kwargs=client_kwargs, 41 | **kwargs, 42 | ) 43 | from llama_index.vector_stores.qdrant import ( 44 | QdrantVectorStore as LIQdrantVectorStore, 45 | ) 46 | 47 | self._client = cast(LIQdrantVectorStore, self._client) 48 | 49 | def delete(self, ids: List[str], **kwargs): 50 | """Delete vector embeddings from vector stores 51 | 52 
| Args: 53 | ids: List of ids of the embeddings to be deleted 54 | kwargs: meant for vectorstore-specific parameters 55 | """ 56 | from qdrant_client import models 57 | 58 | self._client.client.delete( 59 | collection_name=self._collection_name, 60 | points_selector=models.PointIdsList( 61 | points=ids, 62 | ), 63 | **kwargs, 64 | ) 65 | 66 | def drop(self): 67 | """Delete entire collection from vector stores""" 68 | self._client.client.delete_collection(self._collection_name) 69 | 70 | def count(self) -> int: 71 | return self._client.client.count( 72 | collection_name=self._collection_name, exact=True 73 | ).count 74 | 75 | def __persist_flow__(self): 76 | return { 77 | "collection_name": self._collection_name, 78 | "url": self._url, 79 | "api_key": self._api_key, 80 | "client_kwargs": self._client_kwargs, 81 | **self._kwargs, 82 | } 83 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py: -------------------------------------------------------------------------------- 1 | """Simple file vector store index.""" 2 | from pathlib import Path 3 | from typing import Any, Optional, Type 4 | 5 | import fsspec 6 | from llama_index.core.vector_stores import SimpleVectorStore as LISimpleVectorStore 7 | from llama_index.core.vector_stores.simple import SimpleVectorStoreData 8 | 9 | from kotaemon.base import DocumentWithEmbedding 10 | 11 | from .base import LlamaIndexVectorStore 12 | 13 | 14 | class SimpleFileVectorStore(LlamaIndexVectorStore): 15 | """Similar to InMemoryVectorStore but is backed by file by default""" 16 | 17 | _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore 18 | store_text: bool = False 19 | 20 | def __init__( 21 | self, 22 | path: str | Path, 23 | collection_name: str = "default", 24 | data: Optional[SimpleVectorStoreData] = None, 25 | fs: Optional[fsspec.AbstractFileSystem] = None, 26 | **kwargs: Any, 27 | ) -> None: 28 | """Initialize params.""" 29 | 
self._data = data or SimpleVectorStoreData() 30 | self._fs = fs or fsspec.filesystem("file") 31 | self._collection_name = collection_name 32 | self._path = path 33 | self._save_path = Path(path) / collection_name 34 | 35 | super().__init__( 36 | data=data, 37 | fs=fs, 38 | **kwargs, 39 | ) 40 | 41 | if self._save_path.is_file(): 42 | self._client = self._li_class.from_persist_path( 43 | persist_path=str(self._save_path), fs=self._fs 44 | ) 45 | 46 | def add( 47 | self, 48 | embeddings: list[list[float]] | list[DocumentWithEmbedding], 49 | metadatas: Optional[list[dict]] = None, 50 | ids: Optional[list[str]] = None, 51 | ): 52 | r = super().add(embeddings, metadatas, ids) 53 | self._client.persist(str(self._save_path), self._fs) 54 | return r 55 | 56 | def delete(self, ids: list[str], **kwargs): 57 | r = super().delete(ids, **kwargs) 58 | self._client.persist(str(self._save_path), self._fs) 59 | return r 60 | 61 | def drop(self): 62 | self._data = SimpleVectorStoreData() 63 | self._save_path.unlink(missing_ok=True) 64 | 65 | def __persist_flow__(self): 66 | d = self._data.to_dict() 67 | d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}" 68 | return { 69 | "data": d, 70 | "collection_name": self._collection_name, 71 | "path": str(self._path), 72 | # "fs": self._fs, 73 | } 74 | -------------------------------------------------------------------------------- /libs/kotaemon/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 7.4.0 3 | testpaths = tests 4 | addopts = -ra -q 5 | log_cli=true 6 | log_level=WARNING 7 | log_format = %(asctime)s %(levelname)s %(message)s 8 | log_date_format = %Y-%m-%d %H:%M:%S 9 | log_file = logs/pytest-logs.txt 10 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/__init__.py -------------------------------------------------------------------------------- /libs/kotaemon/tests/_test_multimodal_reader.py: -------------------------------------------------------------------------------- 1 | # TODO: This test is broken and should be rewritten 2 | from pathlib import Path 3 | 4 | from kotaemon.loaders import AdobeReader 5 | 6 | # from dotenv import load_dotenv 7 | 8 | 9 | input_file = Path(__file__).parent / "resources" / "multimodal.pdf" 10 | 11 | # load_dotenv() 12 | 13 | 14 | def test_adobe_reader(): 15 | reader = AdobeReader() 16 | documents = reader.load_data(input_file) 17 | table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] 18 | assert len(table_docs) == 2 19 | 20 | figure_docs = [doc for doc in documents if doc.metadata.get("type", "") == "image"] 21 | assert len(figure_docs) == 2 22 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/dummy.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/dummy.docx -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/dummy.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/dummy.pdf -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/dummy.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/dummy.xlsx -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/ggml-vocab-llama.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/ggml-vocab-llama.gguf -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/html/dummy_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/html/dummy_image.png -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/multimodal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/multimodal.pdf -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/table.pdf -------------------------------------------------------------------------------- /libs/kotaemon/tests/simple_pipeline.py: 
-------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import List 3 | 4 | from kotaemon.base import BaseComponent, LLMInterface, lazy 5 | from kotaemon.embeddings import LCAzureOpenAIEmbeddings 6 | from kotaemon.indices import VectorRetrieval 7 | from kotaemon.llms import AzureOpenAI 8 | from kotaemon.storages import ChromaVectorStore 9 | 10 | 11 | class Pipeline(BaseComponent): 12 | llm: AzureOpenAI = AzureOpenAI.withx( 13 | azure_endpoint="https://test.openai.azure.com/", 14 | openai_api_key="some-key", 15 | openai_api_version="2023-03-15-preview", 16 | deployment_name="gpt35turbo", 17 | temperature=0, 18 | request_timeout=60, 19 | ) 20 | 21 | retrieving_pipeline: VectorRetrieval = VectorRetrieval.withx( 22 | vector_store=lazy(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())), 23 | embedding=LCAzureOpenAIEmbeddings.withx( 24 | model="text-embedding-ada-002", 25 | deployment="embedding-deployment", 26 | azure_endpoint="https://test.openai.azure.com/", 27 | openai_api_key="some-key", 28 | ), 29 | ) 30 | 31 | def run(self, text: str) -> LLMInterface: 32 | matched_texts: List[str] = self.retrieving_pipeline(text) 33 | return self.llm("\n".join(matched_texts)) 34 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_documents.py: -------------------------------------------------------------------------------- 1 | from kotaemon.base.schema import Document, RetrievedDocument 2 | 3 | from .conftest import skip_when_haystack_not_installed 4 | 5 | 6 | def test_document_constructor_with_builtin_types(): 7 | for value in ["str", 1, {}, set(), [], tuple, None]: 8 | doc = Document(value) 9 | assert doc.text == (str(value) if value else "") 10 | assert doc.content == value 11 | assert bool(doc) == bool(value) 12 | 13 | 14 | def test_document_constructor_with_document(): 15 | text = "Sample text" 16 | doc1 = Document(text) 17 | doc2 = Document(doc1) 18 | 
assert doc2.text == doc1.text 19 | assert doc2.content == doc1.content 20 | 21 | 22 | @skip_when_haystack_not_installed 23 | def test_document_to_haystack_format(): 24 | from haystack.schema import Document as HaystackDocument 25 | 26 | text = "Sample text" 27 | metadata = {"filename": "sample.txt"} 28 | doc = Document(text, metadata=metadata) 29 | haystack_doc = doc.to_haystack_format() 30 | assert isinstance(haystack_doc, HaystackDocument) 31 | assert haystack_doc.content == doc.text 32 | assert haystack_doc.meta == metadata 33 | 34 | 35 | def test_retrieved_document_default_values(): 36 | sample_text = "text" 37 | retrieved_doc = RetrievedDocument(text=sample_text) 38 | assert retrieved_doc.text == sample_text 39 | assert retrieved_doc.score == 0.0 40 | assert retrieved_doc.retrieval_metadata == {} 41 | 42 | 43 | def test_retrieved_document_attributes(): 44 | sample_text = "text" 45 | score = 0.8 46 | metadata = {"source": "retrieval_system"} 47 | retrieved_doc = RetrievedDocument( 48 | text=sample_text, score=score, retrieval_metadata=metadata 49 | ) 50 | assert retrieved_doc.text == sample_text 51 | assert retrieved_doc.score == score 52 | assert retrieved_doc.retrieval_metadata == metadata 53 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_indexing_retrieval.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import cast 4 | from unittest.mock import patch 5 | 6 | from openai.types.create_embedding_response import CreateEmbeddingResponse 7 | 8 | from kotaemon.base import Document 9 | from kotaemon.embeddings import AzureOpenAIEmbeddings 10 | from kotaemon.indices import VectorIndexing, VectorRetrieval 11 | from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore 12 | 13 | with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: 14 | openai_embedding = 
CreateEmbeddingResponse.model_validate(json.load(f))


@patch(
    "openai.resources.embeddings.Embeddings.create",
    side_effect=lambda *args, **kwargs: openai_embedding,
)
def test_indexing(_mock_embedding_create, tmp_path):
    """Indexing one document must add exactly one entry to both stores.

    NOTE(fix): ``@patch`` prepends the mock as the first positional argument,
    and pytest excludes that leading slot from fixture resolution. Without the
    extra parameter, ``tmp_path`` was bound to the MagicMock instead of the
    pytest tmp directory (same convention as test_reranking.py).
    """
    db = ChromaVectorStore(path=str(tmp_path))
    doc_store = InMemoryDocumentStore()
    embedding = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-ada-002",
        azure_endpoint="https://test.openai.azure.com/",
        api_key="some-key",
        api_version="version",
    )

    pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
    # casts are for the type checker only; runtime objects are unchanged
    pipeline.doc_store = cast(InMemoryDocumentStore, pipeline.doc_store)
    pipeline.vector_store = cast(ChromaVectorStore, pipeline.vector_store)
    assert pipeline.vector_store._collection.count() == 0, "Expected empty collection"
    assert len(pipeline.doc_store._store) == 0, "Expected empty doc store"
    pipeline(text=Document(text="Hello world"))
    assert pipeline.vector_store._collection.count() == 1, "Index 1 item"
    assert len(pipeline.doc_store._store) == 1, "Expected 1 document"


@patch(
    "openai.resources.embeddings.Embeddings.create",
    side_effect=lambda *args, **kwargs: openai_embedding,
)
def test_retrieving(_mock_embedding_create, tmp_path):
    """Retrieving over a one-document index must be deterministic.

    Same mock-argument fix as test_indexing above.
    """
    db = ChromaVectorStore(path=str(tmp_path))
    doc_store = InMemoryDocumentStore()
    embedding = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-ada-002",
        azure_endpoint="https://test.openai.azure.com/",
        api_key="some-key",
        api_version="version",
    )

    index_pipeline = VectorIndexing(
        vector_store=db, embedding=embedding, doc_store=doc_store
    )
    retrieval_pipeline = VectorRetrieval(
        vector_store=db, doc_store=doc_store, embedding=embedding
    )

    index_pipeline(text=Document(text="Hello world"))
    output = retrieval_pipeline(text="Hello world")
    output1 = retrieval_pipeline(text="Hello world")
assert len(output) == 1, "Expect 1 results" 67 | assert output == output1, "Expect identical results" 68 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_ingestor.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from kotaemon.indices.ingests import DocumentIngestor 4 | from kotaemon.indices.splitters import TokenSplitter 5 | 6 | 7 | def test_ingestor_include_src(): 8 | dirpath = Path(__file__).parent 9 | ingestor = DocumentIngestor( 10 | pdf_mode="normal", 11 | text_splitter=TokenSplitter(chunk_size=200, chunk_overlap=10), 12 | ) 13 | nodes = ingestor(dirpath / "resources" / "table.pdf") 14 | assert type(nodes) is list 15 | assert nodes[0].relationships 16 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_post_processing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from kotaemon.base import Document 4 | from kotaemon.parsers import RegexExtractor 5 | 6 | 7 | @pytest.fixture 8 | def regex_extractor(): 9 | return RegexExtractor( 10 | pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} 11 | ) 12 | 13 | 14 | def test_run_document(regex_extractor): 15 | document = Document(text="This is a test. 1 2 3") 16 | extracted_document = regex_extractor(document)[0] 17 | assert extracted_document.text == "One" 18 | assert extracted_document.matches == ["One", "Two", "Three"] 19 | 20 | 21 | def test_run_raw(regex_extractor): 22 | output = regex_extractor("This is a test. 123")[0] 23 | assert output.text == "123" 24 | assert output.matches == ["123"] 25 | 26 | 27 | def test_run_batch_raw(regex_extractor): 28 | output = regex_extractor(["This is a test. 
123", "456"]) 29 | extracted_text = [each.text for each in output] 30 | extracted_matches = [each.matches for each in output] 31 | assert extracted_text == ["123", "456"] 32 | assert extracted_matches == [["123"], ["456"]] 33 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_prompt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from kotaemon.base import Document 4 | from kotaemon.llms import BasePromptComponent, PromptTemplate 5 | from kotaemon.parsers import RegexExtractor 6 | 7 | 8 | def test_set_attributes(): 9 | template = PromptTemplate("str = {s}, int = {i}, doc = {doc}, comp = {comp}") 10 | doc = Document(text="Helloo, Alice!") 11 | comp = RegexExtractor( 12 | pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} 13 | ) 14 | comp.set_run(kwargs={"text": "This is a test. 1 2 3"}, temp=True) 15 | 16 | prompt = BasePromptComponent(template=template, s="Alice", i=30, doc=doc, comp=comp) 17 | assert prompt.s == "Alice" 18 | assert prompt.i == 30 19 | assert prompt.doc == doc 20 | assert prompt.comp == comp 21 | 22 | 23 | def test_check_redundant_kwargs(): 24 | template = PromptTemplate("Hello, {name}!") 25 | prompt = BasePromptComponent(template=template, name="Alice") 26 | with pytest.warns(UserWarning, match="Keys provided but not in template: age"): 27 | prompt._BasePromptComponent__check_redundant_kwargs(name="Alice", age=30) 28 | 29 | 30 | def test_check_unset_placeholders(): 31 | template = PromptTemplate("Hello, {name}! 
I'm {age} years old.") 32 | prompt = BasePromptComponent(template=template, name="Alice") 33 | with pytest.raises(ValueError): 34 | prompt._BasePromptComponent__check_unset_placeholders() 35 | 36 | 37 | def test_validate_value_type(): 38 | template = PromptTemplate("Hello, {name}!") 39 | prompt = BasePromptComponent(template=template) 40 | with pytest.raises(ValueError): 41 | prompt._BasePromptComponent__validate_value_type(name={}) 42 | 43 | 44 | def test_run(): 45 | template = PromptTemplate("str = {s}, int = {i}, doc = {doc}, comp = {comp}") 46 | doc = Document(text="Helloo, Alice!") 47 | comp = RegexExtractor( 48 | pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} 49 | ) 50 | comp.set_run(kwargs={"text": "This is a test. 1 2 3"}, temp=True) 51 | 52 | prompt = BasePromptComponent(template=template, s="Alice", i=30, doc=doc, comp=comp) 53 | 54 | result = prompt() 55 | 56 | assert result.text == "str = Alice, int = 30, doc = Helloo, Alice!, comp = ['One']" 57 | 58 | 59 | def test_set_method(): 60 | template = PromptTemplate("Hello, {name}!") 61 | prompt = BasePromptComponent(template=template) 62 | prompt.set_value(name="Alice") 63 | assert prompt.name == "Alice" 64 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_promptui.py: -------------------------------------------------------------------------------- 1 | from kotaemon.contribs.promptui.config import export_pipeline_to_config 2 | from kotaemon.contribs.promptui.export import export_from_dict 3 | from kotaemon.contribs.promptui.ui import build_from_dict 4 | 5 | from .simple_pipeline import Pipeline 6 | 7 | 8 | class TestPromptConfig: 9 | def test_export_prompt_config(self): 10 | """Test if the prompt config is exported correctly""" 11 | pipeline = Pipeline() 12 | config_dict = export_pipeline_to_config(pipeline) 13 | config = list(config_dict.values())[0] 14 | 15 | assert "inputs" in config, "inputs should be in config" 16 | assert 
"text" in config["inputs"], "inputs should have config" 17 | 18 | assert "params" in config, "params should be in config" 19 | assert "llm.deployment_name" in config["params"] 20 | assert "llm.azure_endpoint" in config["params"] 21 | assert "llm.openai_api_key" in config["params"] 22 | assert "llm.openai_api_version" in config["params"] 23 | assert "llm.request_timeout" in config["params"] 24 | assert "llm.temperature" in config["params"] 25 | 26 | 27 | class TestPromptUI: 28 | def test_uigeneration(self): 29 | """Test if the gradio UI is exposed without any problem""" 30 | pipeline = Pipeline() 31 | config = export_pipeline_to_config(pipeline) 32 | 33 | build_from_dict(config) 34 | 35 | 36 | class TestExport: 37 | def test_export(self, tmp_path): 38 | """Test if the export functionality works without error""" 39 | from pathlib import Path 40 | 41 | import yaml 42 | from theflow.storage import storage 43 | 44 | config_path = tmp_path / "config.yaml" 45 | pipeline = Pipeline() 46 | Path(storage.url(pipeline.config.store_result)).mkdir( 47 | parents=True, exist_ok=True 48 | ) 49 | 50 | config_dict = export_pipeline_to_config(pipeline) 51 | pipeline_name = list(config_dict.keys())[0] 52 | 53 | config_dict[pipeline_name]["logs"] = { 54 | "sheet1": { 55 | "inputs": [{"name": "text", "step": ".", "variable": "text"}], 56 | "outputs": [{"name": "answer", "step": "."}], 57 | }, 58 | } 59 | with open(config_path, "w") as f: 60 | yaml.safe_dump(config_dict, f) 61 | 62 | export_from_dict( 63 | config=str(config_path), 64 | pipeline=pipeline_name, 65 | output_path=str(tmp_path / "exported.xlsx"), 66 | ) 67 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_reranking.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | from openai.types.chat.chat_completion import ChatCompletion 5 | 6 | from kotaemon.base import Document 7 | 
from kotaemon.indices.rankings import LLMReranking 8 | from kotaemon.llms import AzureChatOpenAI 9 | 10 | _openai_chat_completion_responses = [ 11 | ChatCompletion.parse_obj( 12 | { 13 | "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", 14 | "object": "chat.completion", 15 | "created": 1692338378, 16 | "model": "gpt-35-turbo", 17 | "system_fingerprint": None, 18 | "choices": [ 19 | { 20 | "index": 0, 21 | "finish_reason": "stop", 22 | "message": { 23 | "role": "assistant", 24 | "content": text, 25 | "function_call": None, 26 | "tool_calls": None, 27 | }, 28 | "logprobs": None, 29 | } 30 | ], 31 | "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19}, 32 | } 33 | ) 34 | for text in [ 35 | "YES", 36 | "NO", 37 | "YES", 38 | ] 39 | ] 40 | 41 | 42 | @pytest.fixture 43 | def llm(): 44 | return AzureChatOpenAI( 45 | api_key="dummy", 46 | api_version="2024-05-01-preview", 47 | azure_deployment="gpt-4o", 48 | azure_endpoint="https://test.openai.azure.com/", 49 | ) 50 | 51 | 52 | @patch( 53 | "openai.resources.chat.completions.Completions.create", 54 | side_effect=_openai_chat_completion_responses, 55 | ) 56 | def test_reranking(openai_completion, llm): 57 | documents = [Document(text=f"test {idx}") for idx in range(3)] 58 | query = "test query" 59 | 60 | reranker = LLMReranking(llm=llm, concurrent=False) 61 | rerank_docs = reranker(documents, query=query) 62 | 63 | assert len(rerank_docs) == 2 64 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_table_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader 7 | 8 | from .conftest import skip_when_unstructured_pdf_not_installed 9 | 10 | input_file = Path(__file__).parent / "resources" / "table.pdf" 11 | input_file_excel = Path(__file__).parent / "resources" / 
"dummy.xlsx" 12 | 13 | 14 | @pytest.fixture 15 | def fullocr_output(): 16 | with open( 17 | Path(__file__).parent / "resources" / "fullocr_sample_output.json", 18 | encoding="utf-8", 19 | ) as f: 20 | fullocr = json.load(f) 21 | return fullocr 22 | 23 | 24 | @pytest.fixture 25 | def mathpix_output(): 26 | with open(Path(__file__).parent / "resources" / "policy.md", encoding="utf-8") as f: 27 | content = f.read() 28 | return content 29 | 30 | 31 | @skip_when_unstructured_pdf_not_installed 32 | def test_ocr_reader(fullocr_output): 33 | reader = OCRReader() 34 | documents = reader.load_data(input_file, response_content=fullocr_output) 35 | table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] 36 | assert len(table_docs) == 2 37 | 38 | 39 | def test_mathpix_reader(mathpix_output): 40 | reader = MathpixPDFReader() 41 | documents = reader.load_data(input_file, response_content=mathpix_output) 42 | table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] 43 | assert len(table_docs) == 4 44 | 45 | 46 | def test_excel_reader(): 47 | reader = PandasExcelReader() 48 | documents = reader.load_data( 49 | input_file_excel, 50 | ) 51 | assert len(documents) == 1 52 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_telemetry.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | 6 | from .conftest import skip_when_haystack_not_installed 7 | 8 | 9 | @pytest.fixture 10 | def clean_artifacts_for_telemetry(): 11 | try: 12 | del sys.modules["kotaemon"] 13 | except KeyError: 14 | pass 15 | 16 | try: 17 | del sys.modules["haystack"] 18 | except KeyError: 19 | pass 20 | 21 | try: 22 | del sys.modules["haystack.telemetry"] 23 | except KeyError: 24 | pass 25 | 26 | if "HAYSTACK_TELEMETRY_ENABLED" in os.environ: 27 | del os.environ["HAYSTACK_TELEMETRY_ENABLED"] 28 | 29 | 30 | 
@pytest.mark.usefixtures("clean_artifacts_for_telemetry") 31 | @skip_when_haystack_not_installed 32 | def test_disable_telemetry_import_haystack_first(): 33 | """Test that telemetry is disabled when kotaemon lib is initiated after""" 34 | import os 35 | 36 | import haystack.telemetry 37 | 38 | assert haystack.telemetry.telemetry is not None 39 | assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") != "False" 40 | 41 | import kotaemon # noqa: F401 42 | 43 | assert haystack.telemetry.telemetry is None 44 | assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" 45 | 46 | 47 | @pytest.mark.usefixtures("clean_artifacts_for_telemetry") 48 | @skip_when_haystack_not_installed 49 | def test_disable_telemetry_import_haystack_after_kotaemon(): 50 | """Test that telemetry is disabled when kotaemon lib is initiated before""" 51 | import os 52 | 53 | import haystack.telemetry 54 | 55 | import kotaemon # noqa: F401 56 | 57 | assert haystack.telemetry.telemetry is None 58 | assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" 59 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_tools.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from unittest.mock import patch 4 | 5 | from openai.types.create_embedding_response import CreateEmbeddingResponse 6 | 7 | from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool 8 | from kotaemon.base import Document 9 | from kotaemon.embeddings import AzureOpenAIEmbeddings 10 | from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval 11 | from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore 12 | 13 | with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: 14 | openai_embedding = CreateEmbeddingResponse.model_validate(json.load(f)) 15 | 16 | 17 | def test_google_tool(mock_google_search): 18 
| tool = GoogleSearchTool() 19 | assert tool.name 20 | assert tool.description 21 | output = tool("What is Cinnamon AI") 22 | assert output 23 | 24 | 25 | def test_wikipedia_tool(): 26 | tool = WikipediaTool() 27 | assert tool.name 28 | assert tool.description 29 | output = tool("Cinnamon") 30 | assert output 31 | 32 | 33 | @patch( 34 | "openai.resources.embeddings.Embeddings.create", 35 | side_effect=lambda *args, **kwargs: openai_embedding, 36 | ) 37 | def test_pipeline_tool(tmp_path): 38 | db = ChromaVectorStore(path=str(tmp_path)) 39 | doc_store = InMemoryDocumentStore() 40 | embedding = AzureOpenAIEmbeddings( 41 | azure_deployment="embedding-deployment", 42 | azure_endpoint="https://test.openai.azure.com/", 43 | api_key="some-key", 44 | api_version="version", 45 | ) 46 | 47 | index_pipeline = VectorIndexing( 48 | vector_store=db, embedding=embedding, doc_store=doc_store 49 | ) 50 | retrieval_pipeline = VectorRetrieval( 51 | vector_store=db, doc_store=doc_store, embedding=embedding 52 | ) 53 | 54 | index_tool = ComponentTool( 55 | name="index_document", 56 | description="A tool to use to index a document to be searched later", 57 | component=index_pipeline, 58 | ) 59 | output = index_tool({"text": Document(text="Cinnamon AI")}) 60 | 61 | retrieval_tool = ComponentTool( 62 | name="search_document", 63 | description="A tool to use to search a document in a vectorstore", 64 | component=retrieval_pipeline, 65 | ) 66 | output = retrieval_tool("Cinnamon AI") 67 | assert output 68 | -------------------------------------------------------------------------------- /libs/ktem/.gitignore: -------------------------------------------------------------------------------- 1 | 14-1_抜粋-1.pdf 2 | _example_.db 3 | ktem/assets/prebuilt/ 4 | -------------------------------------------------------------------------------- /libs/ktem/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ktem/assets/css/*.css 2 | include 
ktem/assets/img/*.svg 3 | include ktem/assets/js/*.js 4 | include ktem/assets/md/*.md 5 | -------------------------------------------------------------------------------- /libs/ktem/ktem/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/__init__.py -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from decouple import config 4 | 5 | from .theme import Kotaemon as KotaemonTheme 6 | 7 | PDFJS_VERSION_DIST: str = config("PDFJS_VERSION_DIST", "pdfjs-4.0.379-dist") 8 | PDFJS_PREBUILT_DIR: Path = config( 9 | "PDFJS_PREBUILT_DIR", Path(__file__).parent / "prebuilt" / PDFJS_VERSION_DIST 10 | ) 11 | 12 | __all__ = ["KotaemonTheme", "PDFJS_VERSION_DIST", "PDFJS_PREBUILT_DIR"] 13 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/dark_mode.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ic_fluent_dark_theme_24_regular 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/delete.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/expand.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/new.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/rename.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/sidebar.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/md/about.md: -------------------------------------------------------------------------------- 1 | # About Kotaemon 2 | 3 | An open-source tool for you to chat with your documents. 
4 | 5 | [Source Code](https://github.com/Cinnamon/kotaemon) | 6 | [Demo](https://huggingface.co/spaces/cin-model/kotaemon-demo) 7 | 8 | [User Guide](https://cinnamon.github.io/kotaemon/) | 9 | [Developer Guide](https://cinnamon.github.io/kotaemon/development/) | 10 | [Feedback](https://github.com/Cinnamon/kotaemon/issues) 11 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/md/changelogs.md: -------------------------------------------------------------------------------- 1 | # Changelogs 2 | 3 | ## v0.0.1 4 | 5 | - Chat: interact with chatbot with simple pipeline, rewoo and react agents 6 | - Chat: conversation management: create, delete, rename conversations 7 | - Files: upload files 8 | - Files: select files as context for chatbot 9 | - User management: create, sign-in, sign-out, change password 10 | - Setting: common settings and pipeline-based settings 11 | - Info panel: show Cinnamon AI and Kotaemon information 12 | -------------------------------------------------------------------------------- /libs/ktem/ktem/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/db/__init__.py -------------------------------------------------------------------------------- /libs/ktem/ktem/db/engine.py: -------------------------------------------------------------------------------- 1 | from sqlmodel import create_engine 2 | from theflow.settings import settings 3 | 4 | engine = create_engine(settings.KH_DATABASE) 5 | -------------------------------------------------------------------------------- /libs/ktem/ktem/db/models.py: -------------------------------------------------------------------------------- 1 | import ktem.db.base_models as base_models 2 | from ktem.db.engine import engine 3 | from sqlmodel import SQLModel 4 | from theflow.settings import settings 5 | from 
from typing import Type

from ktem.db.engine import engine
from sqlalchemy import JSON, Boolean, Column, String
from sqlalchemy.orm import DeclarativeBase
from theflow.settings import settings as flowsettings
from theflow.utils.modules import import_dotted_string


class Base(DeclarativeBase):
    pass


class BaseEmbeddingTable(Base):
    """Base table to store embedding models (name, spec, default flag)."""

    __abstract__ = True

    # unique human-readable identifier of the embedding model entry
    name = Column(String, primary_key=True, unique=True)
    # JSON specification used to re-instantiate the embedding model object
    spec = Column(JSON, default={})
    # whether this entry is the application-wide default embedding model
    default = Column(Boolean, default=False)


# Deployments may override the table class via settings.
# NOTE: renamed from `_base_llm` — this module stores embedding models,
# not LLMs (the old name was copied from llms/db.py).
_base_embedding: Type[BaseEmbeddingTable] = (
    import_dotted_string(flowsettings.KH_EMBEDDING_LLM, safe=False)
    if hasattr(flowsettings, "KH_EMBEDDING_LLM")
    else BaseEmbeddingTable
)


class EmbeddingTable(_base_embedding):  # type: ignore
    """Concrete table holding the registered embedding models."""

    __tablename__ = "embedding"


# When Alembic manages migrations, leave table creation to it.
if not getattr(flowsettings, "KH_ENABLE_ALEMBIC", False):
    EmbeddingTable.metadata.create_all(engine)
class GraphRAGIndex(FileIndex):
    """File index variant backed by GraphRAG (vector store disabled)."""

    def _setup_indexing_cls(self):
        # index documents through the GraphRAG indexing pipeline
        self._indexing_pipeline_cls = GraphRAGIndexingPipeline

    def _setup_retriever_cls(self):
        self._retriever_pipeline_cls = [GraphRAGRetrieverPipeline]

    def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
        """Build the indexing pipeline with the vector store turned off."""
        pipeline = super().get_indexing_pipeline(settings, user_id)
        # GraphRAG keeps its own graph storage, so no vector store is needed
        pipeline.VS = None
        return pipeline

    def get_retriever_pipelines(
        self, settings: dict, user_id: int, selected: Any = None
    ) -> list["BaseFileIndexRetriever"]:
        """Create one GraphRAG retriever scoped to the selected files."""
        selected_file_ids = self._selector_ui.get_selected_ids(selected)
        retriever = GraphRAGRetrieverPipeline(
            file_ids=selected_file_ids,
            Index=self._resources["Index"],
        )
        return [retriever]
import os

# regex patterns for Arxiv URL
ARXIV_URL_PATTERNS = [
    "https://arxiv.org/abs/",
    "https://arxiv.org/pdf/",
]

ILLEGAL_NAME_CHARS = ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]

# avoid hanging indefinitely on unresponsive servers
REQUEST_TIMEOUT = 30


def clean_name(name):
    """Replace characters that are illegal in file names with underscores."""
    for char in ILLEGAL_NAME_CHARS:
        name = name.replace(char, "_")
    return name


def is_arxiv_url(url):
    """Return True if the URL points to an Arxiv abstract or PDF page."""
    return any(url.startswith(pattern) for pattern in ARXIV_URL_PATTERNS)


# download PDF from Arxiv URL
def download_arxiv_pdf(url, output_path):
    """Download the PDF of an Arxiv paper into `output_path`.

    Accepts either an abstract URL (``/abs/``) or a PDF URL (``/pdf/``).
    The file is named after the paper title; the download is skipped when
    the file already exists.

    Returns:
        The path of the downloaded (or pre-existing) PDF file.

    Raises:
        ValueError: if the URL is not an Arxiv URL or the paper title
            cannot be extracted from the abstract page.
        requests.HTTPError: if Arxiv responds with an error status.
    """
    if not is_arxiv_url(url):
        raise ValueError("Invalid Arxiv URL")

    is_abstract_url = "abs" in url
    if is_abstract_url:
        pdf_url = url.replace("abs", "pdf")
        abstract_url = url
    else:
        pdf_url = url
        abstract_url = url.replace("pdf", "abs")

    # lazy imports: keep the module importable without network deps
    import requests
    from bs4 import BeautifulSoup

    # get paper name from abstract url; fail loudly on HTTP errors
    response = requests.get(abstract_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # parse HTML response and get h1.title
    soup = BeautifulSoup(response.content, "html.parser")
    title_tag = soup.find("h1", class_="title")
    if title_tag is None:
        raise ValueError("Failed to get paper name")
    name = clean_name(title_tag.text.strip().replace("Title:", ""))
    if not name:
        raise ValueError("Failed to get paper name")

    output_file_path = os.path.join(output_path, name + ".pdf")
    # prevent downloading if file already exists
    if not os.path.exists(output_file_path):
        response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        with open(output_file_path, "wb") as f:
            f.write(response.content)

    return output_file_path
from typing import Type

from ktem.db.engine import engine
from sqlalchemy import JSON, Boolean, Column, String
from sqlalchemy.orm import DeclarativeBase
from theflow.settings import settings as flowsettings
from theflow.utils.modules import import_dotted_string


class Base(DeclarativeBase):
    pass


class BaseLLMTable(Base):
    """Abstract table describing a registered language model."""

    __abstract__ = True

    # unique identifier of the model entry
    name = Column(String, primary_key=True, unique=True)
    # serialized specification used to rebuild the model object
    spec = Column(JSON, default={})
    # marks the application-wide default model
    default = Column(Boolean, default=False)


# Deployments may swap in a custom table class through settings.
if hasattr(flowsettings, "KH_TABLE_LLM"):
    _base_llm: Type[BaseLLMTable] = import_dotted_string(
        flowsettings.KH_TABLE_LLM, safe=False
    )
else:
    _base_llm = BaseLLMTable


class LLMTable(_base_llm):  # type: ignore
    """Concrete table holding the registered language models."""

    __tablename__ = "llm_table"


# When Alembic manages migrations, leave table creation to it.
if not getattr(flowsettings, "KH_ENABLE_ALEMBIC", False):
    LLMTable.metadata.create_all(engine)
class ChatPanel(BasePage):
    """Main chat area: the chatbot display plus the multimodal input box."""

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Build the chatbot widget and the message input row."""
        self.chatbot = gr.Chatbot(
            label=self._app.app_name,
            placeholder=PLACEHOLDER_TEXT,
            elem_id="main-chat-bot",
            show_label=False,
            show_copy_button=True,
            likeable=True,
            bubble_full_width=False,
        )
        with gr.Row():
            self.text_input = gr.MultimodalTextbox(
                placeholder=(
                    "Type a message, search the @web, or tag a file with @filename"
                ),
                elem_id="chat-input",
                interactive=True,
                scale=20,
                file_count="multiple",
                container=False,
                show_label=False,
            )

    def submit_msg(self, chat_input, chat_history):
        """Submit a message to the chatbot"""
        # append the new user turn (bot reply still pending) and clear the box
        updated_history = chat_history + [(chat_input, None)]
        return "", updated_history
| 21 | def on_building_ui(self): 22 | self.chat_samples = [[each] for each in self.CHAT_SAMPLES] 23 | with gr.Accordion( 24 | label="Chat Suggestion", 25 | visible=getattr(flowsettings, "KH_FEATURE_CHAT_SUGGESTION", False), 26 | ) as self.accordion: 27 | self.default_example = gr.State( 28 | value=self.chat_samples, 29 | ) 30 | self.examples = gr.DataFrame( 31 | value=self.chat_samples, 32 | headers=["Next Question"], 33 | interactive=False, 34 | elem_id="chat-suggestion", 35 | wrap=True, 36 | ) 37 | 38 | def as_gradio_component(self): 39 | return self.examples 40 | 41 | def select_example(self, ev: gr.SelectData): 42 | return {"text": ev.value} 43 | -------------------------------------------------------------------------------- /libs/ktem/ktem/pages/chat/common.py: -------------------------------------------------------------------------------- 1 | DEFAULT_APPLICATION_STATE = {"regen": False} 2 | STATE = { 3 | "app": DEFAULT_APPLICATION_STATE, 4 | } 5 | -------------------------------------------------------------------------------- /libs/ktem/ktem/pages/chat/demo_hint.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | import gradio as gr 4 | from ktem.app import BasePage 5 | 6 | 7 | class HintPage(BasePage): 8 | def __init__(self, app): 9 | self._app = app 10 | self.on_building_ui() 11 | 12 | def on_building_ui(self): 13 | with gr.Accordion(label="Hint", open=False): 14 | gr.Markdown( 15 | dedent( 16 | """ 17 | - You can select any text from the chat answer to **highlight relevant citation(s)** on the right panel. 18 | - **Citations** can be viewed on both PDF viewer and raw text. 19 | - You can tweak the citation format and use advance (CoT) reasoning in **Chat settings** menu. 20 | - Want to **explore more**? Check out the **Help** section to create your private space. 
class PaperListPage(BasePage):
    """Accordion listing popular daily papers fetched from Hugging Face."""

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Build the paper table; raw records are kept in `papers_state`."""
        self.papers_state = gr.State(None)
        with gr.Accordion(
            label="Browse popular daily papers",
            open=True,
        ) as self.accordion:
            self.examples = gr.DataFrame(
                value=[],
                headers=["title", "url", "upvotes"],
                column_widths=[60, 30, 10],
                elem_id="paper-suggestion",
                interactive=False,
                wrap=True,
            )
        return self.examples

    def load(self):
        """Fetch the top papers; return (table dataframe, raw records)."""
        records = fetch_papers(top_n=5)
        return DataFrame(records), records

    def _on_app_created(self):
        # populate the list once the app UI has finished loading
        self._app.app.load(
            self.load,
            outputs=[self.examples, self.papers_state],
        )

    def select_example(self, state, ev: gr.SelectData):
        """Return the URL of the paper on the clicked row."""
        clicked_row = ev.index[0]
        return state[clicked_row]["url"]
17 | 18 | def on_building_ui(self): 19 | with gr.Tab("Index Collections") as self.index_management_tab: 20 | self.index_management = IndexManagement(self._app) 21 | 22 | with gr.Tab("LLMs") as self.llm_management_tab: 23 | self.llm_management = LLMManagement(self._app) 24 | 25 | with gr.Tab("Embeddings") as self.emb_management_tab: 26 | self.emb_management = EmbeddingManagement(self._app) 27 | 28 | with gr.Tab("Rerankings") as self.rerank_management_tab: 29 | self.rerank_management = RerankingManagement(self._app) 30 | 31 | if self._app.f_user_management: 32 | with gr.Tab("Users", visible=False) as self.user_management_tab: 33 | self.user_management = UserManagement(self._app) 34 | 35 | def on_subscribe_public_events(self): 36 | if self._app.f_user_management: 37 | self._app.subscribe_event( 38 | name="onSignIn", 39 | definition={ 40 | "fn": self.toggle_user_management, 41 | "inputs": [self._app.user_id], 42 | "outputs": [self.user_management_tab], 43 | "show_progress": "hidden", 44 | }, 45 | ) 46 | 47 | self._app.subscribe_event( 48 | name="onSignOut", 49 | definition={ 50 | "fn": self.toggle_user_management, 51 | "inputs": [self._app.user_id], 52 | "outputs": [self.user_management_tab], 53 | "show_progress": "hidden", 54 | }, 55 | ) 56 | 57 | def toggle_user_management(self, user_id): 58 | """Show/hide the user management, depending on the user's role""" 59 | with Session(engine) as session: 60 | user = session.exec(select(User).where(User.id == user_id)).first() 61 | if user and user.admin: 62 | return gr.update(visible=True) 63 | 64 | return gr.update(visible=False) 65 | -------------------------------------------------------------------------------- /libs/ktem/ktem/reasoning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/reasoning/__init__.py 
class BaseReasoning(BaseComponent):
    """Reasoning pipeline invoked for each user chat message.

    Concrete implementations have access to the retrievers, the user
    settings, the incoming message, the conversation id and the full
    message history.
    """

    @classmethod
    def get_info(cls) -> dict:
        """Describe the pipeline so the app can organize and display it.

        Returns:
            a dictionary with the keys:
                - "id": unique identifier of the pipeline
                - "name": human-friendly name of the pipeline
                - "description": short overview of what the pipeline does,
                  for the user to grasp its purpose
        """
        raise NotImplementedError

    @classmethod
    def get_user_settings(cls) -> dict:
        """Return the default user settings for this pipeline."""
        return {}

    @classmethod
    def get_pipeline(
        cls,
        user_settings: dict,
        state: dict,
        retrievers: Optional[list["BaseComponent"]] = None,
    ) -> "BaseReasoning":
        """Instantiate the reasoning pipeline for the app to execute.

        Args:
            user_settings: settings chosen by the user
            state: current conversation state
            retrievers: list of retrievers available to the pipeline
        """
        return cls()

    def run(self, message: str, conv_id: str, history: list, **kwargs):  # type: ignore
        """Execute the reasoning pipeline on one user message."""
        raise NotImplementedError
from ktem.llms.manager import llms

from kotaemon.base import BaseComponent, Document, HumanMessage, Node, SystemMessage
from kotaemon.llms import ChatLLM, PromptTemplate

DEFAULT_REWRITE_PROMPT = (
    "Given the following question, rephrase and expand it "
    "to help you do better answering. Maintain all information "
    "in the original question. Keep the question as concise as possible. "
    "Only output the rephrased question without additional information. "
    "Give answer in {lang}\n"
    "Original question: {question}\n"
    "Rephrased question: "
)


class RewriteQuestionPipeline(BaseComponent):
    """Rephrase the user question before answering.

    Attributes:
        llm: the language model used to rewrite the question
        rewrite_template: prompt template the llm uses to paraphrase input
        lang: language of the answer (currently English and Japanese)
    """

    llm: ChatLLM = Node(default_callback=lambda _: llms.get_default())
    rewrite_template: str = DEFAULT_REWRITE_PROMPT

    lang: str = "English"

    def run(self, question: str) -> Document:  # type: ignore
        """Ask the LLM to rewrite `question` and return its response."""
        rendered_prompt = PromptTemplate(self.rewrite_template).populate(
            question=question, lang=self.lang
        )
        conversation = [
            SystemMessage(content="You are a helpful assistant"),
            HumanMessage(content=rendered_prompt),
        ]
        return self.llm(conversation)
21 | ) 22 | prompt_template: str = SUGGEST_NAME_PROMPT_TEMPLATE 23 | lang: str = "English" 24 | 25 | def run(self, chat_history: list[tuple[str, str]]) -> Document: # type: ignore 26 | prompt_template = PromptTemplate(self.prompt_template) 27 | prompt = prompt_template.populate(lang=self.lang) 28 | 29 | messages = [] 30 | for human, ai in chat_history: 31 | messages.append(HumanMessage(content=human)) 32 | messages.append(AIMessage(content=ai)) 33 | 34 | messages.append(HumanMessage(content=prompt)) 35 | 36 | return self.llm(messages) 37 | -------------------------------------------------------------------------------- /libs/ktem/ktem/reasoning/prompt_optimization/suggest_followup_chat.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ktem.llms.manager import llms 4 | 5 | from kotaemon.base import AIMessage, BaseComponent, Document, HumanMessage, Node 6 | from kotaemon.llms import ChatLLM, PromptTemplate 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class SuggestFollowupQuesPipeline(BaseComponent): 12 | """Suggest a list of follow-up questions based on the chat history.""" 13 | 14 | llm: ChatLLM = Node(default_callback=lambda _: llms.get_default()) 15 | SUGGEST_QUESTIONS_PROMPT_TEMPLATE = ( 16 | "Based on the chat history above. " 17 | "your task is to generate 3 to 5 relevant follow-up questions. " 18 | "These questions should be simple, very concise, " 19 | "and designed to guide the conversation further. " 20 | "Respond in JSON format with 'questions' key. " 21 | "Answer using the language {lang} same as the question. 
" 22 | ) 23 | prompt_template: str = SUGGEST_QUESTIONS_PROMPT_TEMPLATE 24 | extra_prompt: str = """Example of valid response: 25 | ```json 26 | { 27 | "questions": ["the weather is good", "what's your favorite city"] 28 | } 29 | ```""" 30 | lang: str = "English" 31 | 32 | def run(self, chat_history: list[tuple[str, str]]) -> Document: 33 | prompt_template = PromptTemplate(self.prompt_template) 34 | prompt = prompt_template.populate(lang=self.lang) + self.extra_prompt 35 | 36 | messages = [] 37 | for human, ai in chat_history[-3:]: 38 | messages.append(HumanMessage(content=human)) 39 | messages.append(AIMessage(content=ai)) 40 | 41 | messages.append(HumanMessage(content=prompt)) 42 | 43 | return self.llm(messages) 44 | -------------------------------------------------------------------------------- /libs/ktem/ktem/rerankings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/rerankings/__init__.py -------------------------------------------------------------------------------- /libs/ktem/ktem/rerankings/db.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from ktem.db.engine import engine 4 | from sqlalchemy import JSON, Boolean, Column, String 5 | from sqlalchemy.orm import DeclarativeBase 6 | from theflow.settings import settings as flowsettings 7 | from theflow.utils.modules import import_dotted_string 8 | 9 | 10 | class Base(DeclarativeBase): 11 | pass 12 | 13 | 14 | class BaseRerankingTable(Base): 15 | """Base table to store rerankings model""" 16 | 17 | __abstract__ = True 18 | 19 | name = Column(String, primary_key=True, unique=True) 20 | spec = Column(JSON, default={}) 21 | default = Column(Boolean, default=False) 22 | 23 | 24 | __base_reranking: Type[BaseRerankingTable] = ( 25 | import_dotted_string(flowsettings.KH_TABLE_RERANKING, 
import re


def sync_retrieval_n_message(
    messages: list[list[str]],
    retrievals: list[str],
) -> list[str]:
    """Make the retrieval history exactly as long as the message history.

    Extra retrieval entries are truncated; missing ones are padded with
    empty strings.
    """
    n_message = len(messages)  # include previous history
    n_keep = min(n_message, len(retrievals))

    padding = [""] * (n_message - n_keep)
    synced = retrievals[:n_keep] + padding

    assert len(synced) == n_message

    return synced


def get_file_names_regex(input_str: str) -> tuple[list[str], str]:
    """Extract every @"file name" tag and strip the tags from the text.

    Returns:
        (list of file names, input text without the tags)
    """
    pattern = r'@"([^"]*)"'
    file_names = re.findall(pattern, input_str)
    remaining = re.sub(pattern, "", input_str).strip()

    return file_names, remaining


def get_urls(input_str: str) -> tuple[list[str], str]:
    """Extract every http(s) URL and strip the URLs from the text.

    Returns:
        (list of URLs, input text without the URLs)
    """
    pattern = r"https?://[^\s]+"
    urls = re.findall(pattern, input_str)
    remaining = re.sub(pattern, "", input_str).strip()

    return urls, remaining


if __name__ == "__main__":
    print(sync_retrieval_n_message([[""], [""], [""]], []))
def check_rate_limit(limit_type: str, request: gr.Request):
    """Enforce a per-user request quota for ``limit_type``.

    Resolves the signed-in user's email via gradiologin, then counts this
    request against an in-memory, per-process store (``rate_limit_store``).
    Raises ``ValueError`` when the request is missing, the user is not
    signed in, or the user is over ``RATE_LIMIT`` requests per
    ``RATE_LIMIT_PERIOD``.  Returns the user id (email) on success.

    NOTE(review): the store is process-local and unsynchronized — counts
    are not shared across workers and concurrent requests may race;
    confirm this is acceptable for the deployment.
    """
    if request is None:
        raise ValueError("This feature is not available")

    user_id = None
    try:
        # gradiologin is an optional dependency; fall through to the
        # "please sign in" error below when it is missing or when its
        # session lookup fails an internal assertion.
        import gradiologin as grlogin

        user = grlogin.get_user(request)
        if user:
            user_id = user.get("email")
    except (ImportError, AssertionError):
        pass

    if not user_id:
        raise ValueError("Please sign-in to use this feature")

    # Naive local time; window comparisons assume a stable local clock.
    now = datetime.now()
    user_data = rate_limit_store[limit_type].get(
        user_id, {"count": 0, "reset_time": now + RATE_LIMIT_PERIOD}
    )

    if now >= user_data["reset_time"]:
        # The previous window expired: start a fresh one for the user
        user_data = {"count": 0, "reset_time": now + RATE_LIMIT_PERIOD}

    if user_data["count"] >= RATE_LIMIT:
        raise ValueError("Rate limit exceeded. Please try again later.")

    # Increment the request count and persist it back to the store
    user_data["count"] += 1
    rate_limit_store[limit_type][user_id] = user_data

    return user_id
@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
)
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
    """End-to-end smoke test: index a PDF, retrieve, then run QA.

    OpenAI chat and embedding endpoints are mocked (decorator above and
    the ``mock_openai_embedding`` fixture), so no network access happens.
    """
    indexing_pipeline = ReaderIndexingPipeline(
        storage_path=tmp_path,
    )
    # dummy key; the embedding call itself is monkeypatched
    indexing_pipeline.indexing_vector_pipeline.embedding.openai_api_key = "some-key"
    input_file_path = Path(__file__).parent / "resources/dummy.pdf"

    # call ingestion pipeline (force_reindex so a prior run can't short-circuit)
    indexing_pipeline(input_file_path, force_reindex=True)
    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()

    results = retrieving_pipeline("This is a query")
    # dummy.pdf is expected to yield exactly one retrieved result here
    assert len(results) == 1

    # create llm (credentials are dummies; the completion call is patched)
    llm = AzureChatOpenAI(
        api_key="dummy",
        api_version="2024-05-01-preview",
        azure_deployment="gpt-4o",
        azure_endpoint="https://test.openai.azure.com/",
    )
    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
    response = qa_pipeline("Summarize this document.")
    assert response
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    context.configure(
        url=settings.KH_DATABASE,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.

    """
    # Copy the alembic .ini section and override the DB URL from the
    # application settings so both modes target KH_DATABASE.
    configuration = config.get_section(config.config_ini_section, {})
    configuration["sqlalchemy.url"] = settings.KH_DATABASE
    # NullPool: migrations are one-shot; no need to keep connections pooled
    connectable = engine_from_config(
        configuration, prefix="sqlalchemy.", poolclass=pool.NullPool
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


# Alembic selects the mode based on how the command was invoked
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
16 | revision: str = ${repr(up_revision)} 17 | down_revision: Union[str, None] = ${repr(down_revision)} 18 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} 19 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} 20 | 21 | 22 | def upgrade() -> None: 23 | ${upgrades if upgrades else "pass"} 24 | 25 | 26 | def downgrade() -> None: 27 | ${downgrades if downgrades else "pass"} 28 | -------------------------------------------------------------------------------- /libs/ktem/migrations/versions/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/migrations/versions/.keep -------------------------------------------------------------------------------- /libs/ktem/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0", "wheel", "setuptools-git-versioning>=2.0,<3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | include-package-data = true 7 | packages.find.exclude = ["ktem_tests*", "env*"] 8 | packages.find.include = ["ktem*"] 9 | 10 | [tool.setuptools-git-versioning] 11 | enabled = true 12 | dev_template = "{tag}" 13 | dirty_template = "{tag}" 14 | tag_filter = "v?\\d+(\\.\\d+)*.*" 15 | 16 | [project] 17 | name = "ktem" 18 | dynamic = ["version"] 19 | requires-python = ">= 3.10" 20 | description = "RAG-based Question and Answering Application" 21 | dependencies = [ 22 | "click>=8.1.7,<9", 23 | "platformdirs>=4.2.1,<5", 24 | "pluggy>=1.5.0,<2", 25 | "python-decouple>=3.8,<4", 26 | "SQLAlchemy>=2.0.29,<3", 27 | "sqlmodel>=0.0.16,<0.1", 28 | "tiktoken>=0.6.0,<1", 29 | "gradio>=4.31.0,<5", 30 | "gradiologin", 31 | "python-multipart==0.0.12", # required for gradio, pinning to avoid yanking issues with micropip (fixed in gradio >= 5.4.0) 32 | "markdown>=3.6,<4", 33 | "tzlocal>=5.0", 
34 | ] 35 | authors = [ 36 | { name = "@trducng", email = "john@cinnamon.is" }, 37 | { name = "@lone17", email = "ian@cinnamon.is" }, 38 | { name = "@taprosoft", email = "tadashi@cinnamon.is" }, 39 | { name = "@cin-albert", email = "albert@cinnamon.is" }, 40 | ] 41 | classifiers = [ 42 | "Programming Language :: Python :: 3", 43 | "Operating System :: OS Independent", 44 | ] 45 | -------------------------------------------------------------------------------- /libs/ktem/requirements.txt: -------------------------------------------------------------------------------- 1 | platformdirs 2 | tzlocal 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0", "wheel", "setuptools-git-versioning>=2.0,<3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | include-package-data = false 7 | packages.find.include = [] 8 | 9 | [tool.setuptools-git-versioning] 10 | enabled = true 11 | dev_template = "{tag}" 12 | dirty_template = "{tag}" 13 | tag_filter = "v?\\d+(\\.\\d+)*.*" 14 | 15 | [project] 16 | name = "kotaemon-app" 17 | dynamic = ["version"] 18 | requires-python = ">= 3.10" 19 | description = "Kotaemon App" 20 | dependencies = [ 21 | "kotaemon @ git+https://github.com/Cinnamon/kotaemon.git@main#subdirectory=libs/kotaemon", 22 | "ktem @ git+https://github.com/Cinnamon/kotaemon.git@main#subdirectory=libs/ktem" 23 | ] 24 | authors = [ 25 | { name = "@trducng", email = "john@cinnamon.is" }, 26 | { name = "@lone17", email = "ian@cinnamon.is" }, 27 | { name = "@taprosoft", email = "tadashi@cinnamon.is" }, 28 | { name = "@cin-albert", email = "albert@cinnamon.is" }, 29 | ] 30 | classifiers = [ 31 | "Programming Language :: Python :: 3", 32 | "Operating System :: OS Independent", 33 | ] 34 | 35 | [project.urls] 36 | Homepage = "https://cinnamon.github.io/kotaemon/" 37 | 
# Download a zip archive from $1 and extract it into directory $2.
# Skips entirely when the destination directory already exists, so
# re-running the script is idempotent.
function download_and_unzip() {
    local url=$1
    local dest_dir=$2

    if [ -d "$dest_dir" ]; then
        echo "Destination directory $dest_dir already exists. Skipping download."
        return
    fi

    mkdir -p "$dest_dir"

    local zip_file="${dest_dir}/downloaded.zip"
    echo "Downloading $url to $zip_file"
    # -L follows the GitHub release redirect
    curl -L -o "$zip_file" "$url"

    echo "Unzipping $zip_file to $dest_dir"
    # -o overwrites existing files without prompting
    unzip -o "$zip_file" -d "$dest_dir"

    # keep only the extracted files, drop the archive
    rm "$zip_file"
    echo "Download and unzip completed successfully."
}
def serve_llamacpp_python(local_model_file: Path, **kwargs):
    """Launch a llama-cpp-python server for ``local_model_file``.

    Extra ``kwargs`` are forwarded to the platform launcher script as
    ``--key value`` CLI flags; the port defaults to 31415 when not given.
    """

    def guess_chat_format(local_model_file):
        # Best-effort chat-format override derived from the model file name.
        model_name = local_model_file.stem

        # handle known cases that the server backends handle incorrectly
        # this is highly heuristic and should be expanded later;
        # server backends usually have logic for this but they could still be wrong
        if "qwen" in model_name:
            return "qwen"

        return None

    # default port
    if "port" not in kwargs:
        kwargs["port"] = 31415

    chat_format = guess_chat_format(local_model_file)
    if chat_format:
        kwargs = {**kwargs, "chat_format": chat_format}

    # these scripts create a separate conda env and run the server
    if system_name == "Windows":
        script_file = this_dir / "server_llamacpp_windows.bat"
    elif system_name == "Linux":
        script_file = this_dir / "server_llamacpp_linux.sh"
    elif system_name == "Darwin":
        script_file = this_dir / "server_llamacpp_macos.sh"
    else:
        raise ValueError(f"Unsupported system: {system_name}")

    args = " ".join(f"--{k} {v}" for k, v in kwargs.items())

    # NOTE(review): shell=True with an interpolated path — acceptable for a
    # local dev helper with trusted input, but do not pass untrusted paths.
    cmd = f"{script_file} --model {local_model_file} {args}"
    subprocess.Popen(cmd, shell=True)
(y/n): ") 67 | 68 | if will_start_server.lower().strip() not in ["y", "yes"]: 69 | return 70 | 71 | print("Starting the local server...") 72 | if local_model_file.suffix == ".gguf": 73 | serve_llamacpp_python(local_model_file) 74 | else: 75 | raise ValueError(f"Unsupported model file type: {local_model_file.suffix}") 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /sso_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import gradiologin as grlogin 4 | from decouple import config 5 | from fastapi import FastAPI 6 | from fastapi.responses import FileResponse 7 | from theflow.settings import settings as flowsettings 8 | 9 | KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".") 10 | GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None) 11 | AUTHENTICATION_METHOD = config("AUTHENTICATION_METHOD", "GOOGLE") 12 | 13 | # override GRADIO_TEMP_DIR if it's not set 14 | if GRADIO_TEMP_DIR is None: 15 | GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp") 16 | os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR 17 | 18 | # for authentication with Google 19 | GOOGLE_CLIENT_ID = config("GOOGLE_CLIENT_ID", default="") 20 | GOOGLE_CLIENT_SECRET = config("GOOGLE_CLIENT_SECRET", default="") 21 | 22 | # for authentication with Open ID by keycloak 23 | KEYCLOAK_SERVER_URL = config("KEYCLOAK_SERVER_URL", default="") 24 | KEYCLOAK_REALM = config("KEYCLOAK_REALM", default="") 25 | KEYCLOAK_CLIENT_ID = config("KEYCLOAK_CLIENT_ID", default="") 26 | KEYCLOAK_CLIENT_SECRET = config("KEYCLOAK_CLIENT_SECRET", default="") 27 | 28 | from ktem.main import App # noqa 29 | 30 | gradio_app = App() 31 | demo = gradio_app.make() 32 | 33 | app = FastAPI() 34 | 35 | if AUTHENTICATION_METHOD == "KEYCLOAK": 36 | # for authentication with Open ID by keycloak 37 | grlogin.register( 38 | name="keycloak", 39 | server_metadata_url=( 40 | 
f"{KEYCLOAK_SERVER_URL}/realms/{KEYCLOAK_REALM}/" 41 | ".well-known/openid-configuration" 42 | ), 43 | client_id=KEYCLOAK_CLIENT_ID, 44 | client_secret=KEYCLOAK_CLIENT_SECRET, 45 | client_kwargs={ 46 | "scope": "openid email profile", 47 | }, 48 | ) 49 | 50 | else: 51 | # for authentication with Google 52 | grlogin.register( 53 | name="google", 54 | server_metadata_url=( 55 | "https://accounts.google.com/.well-known/openid-configuration" 56 | ), 57 | client_id=GOOGLE_CLIENT_ID, 58 | client_secret=GOOGLE_CLIENT_SECRET, 59 | client_kwargs={ 60 | "scope": "openid email profile", 61 | }, 62 | ) 63 | 64 | 65 | @app.get("/favicon.ico", include_in_schema=False) 66 | async def favicon(): 67 | return FileResponse(gradio_app._favicon) 68 | 69 | 70 | grlogin.mount_gradio_app( 71 | app, 72 | demo, 73 | "/app", 74 | allowed_paths=[ 75 | "libs/ktem/ktem/assets", 76 | GRADIO_TEMP_DIR, 77 | ], 78 | ) 79 | -------------------------------------------------------------------------------- /templates/component-default/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/templates/component-default/README.md -------------------------------------------------------------------------------- /templates/project-default/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "prj_kotaemon", 3 | "ptl": "john" 4 | } 5 | -------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/.gitattributes: -------------------------------------------------------------------------------- 1 | .gitattributes text eol=lf 2 | .gitignore text eol=lf 3 | *.build text eol=lf 4 | *.c text eol=lf 5 | *.cmake text eol=lf 6 | *.cpp text eol=lf 7 | *.csv text eol=lf 8 | *.f text eol=lf 9 | *.f90 text eol=lf 10 | *.for text eol=lf 11 | *.grc text 
eol=lf 12 | *.h text eol=lf 13 | *.ipynb text eol=lf 14 | *.m text eol=lf 15 | *.md text eol=lf 16 | *.pas text eol=lf 17 | *.py text eol=lf 18 | *.rst text eol=lf 19 | *.sh text eol=lf 20 | *.txt text eol=lf 21 | *.yml text eol=lf 22 | Makefile text eol=lf 23 | *.html linguist-documentation 24 | -------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: check-toml 7 | - id: end-of-file-fixer 8 | - id: trailing-whitespace 9 | - id: detect-aws-credentials 10 | args: ["--allow-missing-credentials"] 11 | - id: detect-private-key 12 | - id: check-added-large-files 13 | - repo: https://github.com/ambv/black 14 | rev: 22.3.0 15 | hooks: 16 | - id: black 17 | language_version: python3 18 | - repo: https://github.com/pycqa/isort 19 | rev: 5.12.0 20 | hooks: 21 | - id: isort 22 | args: ["--profile", "black"] 23 | language_version: python3.10 24 | - repo: https://github.com/pycqa/flake8 25 | rev: 4.0.1 26 | hooks: 27 | - id: flake8 28 | args: ["--max-line-length", "88", "--extend-ignore", "E203"] 29 | - repo: https://github.com/myint/autoflake 30 | rev: v1.4 31 | hooks: 32 | - id: autoflake 33 | args: 34 | [ 35 | "--in-place", 36 | "--remove-unused-variables", 37 | "--remove-all-unused-imports", 38 | "--ignore-init-module-imports", 39 | "--exclude=tests/*", 40 | ] 41 | - repo: https://github.com/pre-commit/mirrors-prettier 42 | rev: v2.7.1 43 | hooks: 44 | - id: prettier 45 | types_or: [markdown, yaml] 46 | - repo: https://github.com/pre-commit/mirrors-mypy 47 | rev: "v1.5.1" 48 | hooks: 49 | - id: mypy 50 | additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"] 51 | args: ["--check-untyped-defs", "--ignore-missing-imports"] 52 | 
-------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Project {{ cookiecutter.project_name }} 4 | 5 | [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/Cinnamon/kotaemon) 6 | 7 |
- For project issues and errors, please report them in this repository's issue tracker.
/templates/project-default/{{cookiecutter.project_name}}/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/templates/project-default/{{cookiecutter.project_name}}/tests/__init__.py -------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py --------------------------------------------------------------------------------