├── .commitlintrc ├── .dockerignore ├── .env.example ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ └── feature_request.yml ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── auto-bump-and-release.yaml │ ├── build-push-docker.yaml │ ├── pr-lint.yaml │ ├── style-check.yaml │ └── unit-test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE.txt ├── README.md ├── app.py ├── doc_env_reqs.txt ├── docs ├── about.md ├── development │ ├── contributing.md │ ├── create-a-component.md │ ├── data-components.md │ ├── index.md │ └── utilities.md ├── extra │ └── css │ │ └── code_select.css ├── images │ ├── 269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png │ ├── 269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png │ ├── 271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png │ ├── 274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png │ ├── change_space_params.png │ ├── chat-demo.gif │ ├── chat-tab-demo.png │ ├── chat-tab.png │ ├── close_logs_space.png │ ├── cohere_api_key.png │ ├── duplicate_space.png │ ├── file-index-tab.png │ ├── index-embedding.png │ ├── info-panel-scores.png │ ├── initial_startup.png │ ├── llm-default.png │ ├── models.png │ ├── pdf-viewer-setup.png │ ├── preview-graph.png │ ├── preview.png │ ├── resources-tab.png │ ├── retrieval-setting.png │ ├── set_api_key_space.png │ └── space_build.png ├── index.md ├── local_model.md ├── online_install.md ├── pages │ └── app │ │ ├── customize-flows.md │ │ ├── ext │ │ └── user-management.md │ │ ├── features.md │ │ ├── functional-description.md │ │ ├── index │ │ └── file.md │ │ └── settings │ │ ├── overview.md │ │ └── user-settings.md ├── scripts │ ├── generate_examples_docs.py │ └── generate_reference_docs.py ├── theme │ ├── assets │ │ └── pymdownx-extras │ │ │ ├── extra-fb5a2a1c86.css │ │ │ ├── extra-fb5a2a1c86.css.map │ │ │ ├── extra-loader-MCFnu0Wd.js │ │ │ ├── extra-loader-MCFnu0Wd.js.map │ │ │ ├── 
material-extra-3rdparty-E-i8w1WA.js │ │ │ ├── material-extra-3rdparty-E-i8w1WA.js.map │ │ │ ├── material-extra-theme-TVq-kNRT.js │ │ │ └── material-extra-theme-TVq-kNRT.js.map │ ├── main.html │ └── partials │ │ ├── footer.html │ │ ├── header.html │ │ └── libs.html └── usage.md ├── flowsettings.py ├── fly.toml ├── launch.sh ├── libs ├── kotaemon │ ├── README.md │ ├── kotaemon │ │ ├── __init__.py │ │ ├── agents │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── io │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ ├── langchain_based.py │ │ │ ├── react │ │ │ │ ├── __init__.py │ │ │ │ ├── agent.py │ │ │ │ └── prompt.py │ │ │ ├── rewoo │ │ │ │ ├── __init__.py │ │ │ │ ├── agent.py │ │ │ │ ├── planner.py │ │ │ │ ├── prompt.py │ │ │ │ └── solver.py │ │ │ ├── tools │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── google.py │ │ │ │ ├── llm.py │ │ │ │ └── wikipedia.py │ │ │ └── utils.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── component.py │ │ │ └── schema.py │ │ ├── chatbot │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── simple_respondent.py │ │ ├── cli.py │ │ ├── contribs │ │ │ ├── __init__.py │ │ │ ├── docs.py │ │ │ └── promptui │ │ │ │ ├── .gitignore │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cli.py │ │ │ │ ├── config.py │ │ │ │ ├── export.py │ │ │ │ ├── logs.py │ │ │ │ ├── themes.py │ │ │ │ ├── tunnel.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── blocks.py │ │ │ │ ├── chat.py │ │ │ │ └── pipeline.py │ │ ├── embeddings │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── endpoint_based.py │ │ │ ├── fastembed.py │ │ │ ├── langchain_based.py │ │ │ ├── openai.py │ │ │ ├── tei_endpoint_embed.py │ │ │ └── voyageai.py │ │ ├── indices │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── extractors │ │ │ │ ├── __init__.py │ │ │ │ └── doc_parsers.py │ │ │ ├── ingests │ │ │ │ ├── __init__.py │ │ │ │ └── files.py │ │ │ ├── qa │ │ │ │ ├── __init__.py │ │ │ │ ├── citation.py │ │ │ │ ├── citation_qa.py │ │ │ │ ├── citation_qa_inline.py │ │ │ │ ├── 
format_context.py │ │ │ │ └── utils.py │ │ │ ├── rankings │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cohere.py │ │ │ │ ├── llm.py │ │ │ │ ├── llm_scoring.py │ │ │ │ └── llm_trulens.py │ │ │ ├── retrievers │ │ │ │ ├── __init__.py │ │ │ │ ├── jina_web_search.py │ │ │ │ └── tavily_web_search.py │ │ │ ├── splitters │ │ │ │ └── __init__.py │ │ │ └── vectorindex.py │ │ ├── llms │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── branching.py │ │ │ ├── chats │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── endpoint_based.py │ │ │ │ ├── langchain_based.py │ │ │ │ ├── llamacpp.py │ │ │ │ └── openai.py │ │ │ ├── completions │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── langchain_based.py │ │ │ ├── cot.py │ │ │ ├── linear.py │ │ │ └── prompts │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── template.py │ │ ├── loaders │ │ │ ├── __init__.py │ │ │ ├── adobe_loader.py │ │ │ ├── azureai_document_intelligence_loader.py │ │ │ ├── base.py │ │ │ ├── composite_loader.py │ │ │ ├── docling_loader.py │ │ │ ├── docx_loader.py │ │ │ ├── excel_loader.py │ │ │ ├── html_loader.py │ │ │ ├── mathpix_loader.py │ │ │ ├── ocr_loader.py │ │ │ ├── pdf_loader.py │ │ │ ├── txt_loader.py │ │ │ ├── unstructured_loader.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── adobe.py │ │ │ │ ├── box.py │ │ │ │ ├── gpt4v.py │ │ │ │ ├── pdf_ocr.py │ │ │ │ └── table.py │ │ │ └── web_loader.py │ │ ├── parsers │ │ │ ├── __init__.py │ │ │ └── regex_extractor.py │ │ ├── rerankings │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── cohere.py │ │ │ ├── tei_fast_rerank.py │ │ │ └── voyageai.py │ │ └── storages │ │ │ ├── __init__.py │ │ │ ├── docstores │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── elasticsearch.py │ │ │ ├── in_memory.py │ │ │ ├── lancedb.py │ │ │ └── simple_file.py │ │ │ └── vectorstores │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── chroma.py │ │ │ ├── in_memory.py │ │ │ ├── lancedb.py │ │ │ ├── milvus.py │ │ │ ├── qdrant.py │ │ │ └── simple_file.py │ ├── 
pyproject.toml │ ├── pytest.ini │ └── tests │ │ ├── __init__.py │ │ ├── _test_multimodal_reader.py │ │ ├── conftest.py │ │ ├── resources │ │ ├── 7810d908b0ff4ce381dcab873196d133.jpg │ │ ├── dummy.docx │ │ ├── dummy.mhtml │ │ ├── dummy.pdf │ │ ├── dummy.xlsx │ │ ├── embedding_openai.json │ │ ├── embedding_openai_batch.json │ │ ├── fullocr_sample_output.json │ │ ├── ggml-vocab-llama.gguf │ │ ├── html │ │ │ ├── dummy.html │ │ │ └── dummy_image.png │ │ ├── multimodal.pdf │ │ ├── policy.md │ │ └── table.pdf │ │ ├── simple_pipeline.py │ │ ├── test_agent.py │ │ ├── test_composite.py │ │ ├── test_cot.py │ │ ├── test_docstores.py │ │ ├── test_documents.py │ │ ├── test_embedding_models.py │ │ ├── test_indexing_retrieval.py │ │ ├── test_ingestor.py │ │ ├── test_llms_chat_models.py │ │ ├── test_llms_completion_models.py │ │ ├── test_post_processing.py │ │ ├── test_prompt.py │ │ ├── test_promptui.py │ │ ├── test_reader.py │ │ ├── test_reranking.py │ │ ├── test_splitter.py │ │ ├── test_table_reader.py │ │ ├── test_telemetry.py │ │ ├── test_template.py │ │ ├── test_tools.py │ │ └── test_vectorstore.py └── ktem │ ├── .gitignore │ ├── MANIFEST.in │ ├── alembic.ini │ ├── ktem │ ├── __init__.py │ ├── app.py │ ├── assets │ │ ├── __init__.py │ │ ├── css │ │ │ └── main.css │ │ ├── icons │ │ │ ├── dark_mode.svg │ │ │ ├── delete.svg │ │ │ ├── expand.svg │ │ │ ├── new.svg │ │ │ ├── rename.svg │ │ │ └── sidebar.svg │ │ ├── img │ │ │ └── favicon.svg │ │ ├── js │ │ │ ├── main.js │ │ │ ├── pdf_viewer.js │ │ │ └── svg-pan-zoom.min.js │ │ ├── md │ │ │ ├── about.md │ │ │ ├── changelogs.md │ │ │ └── usage.md │ │ └── theme.py │ ├── components.py │ ├── db │ │ ├── __init__.py │ │ ├── base_models.py │ │ ├── engine.py │ │ └── models.py │ ├── embeddings │ │ ├── __init__.py │ │ ├── db.py │ │ ├── manager.py │ │ └── ui.py │ ├── exceptions.py │ ├── extension_protocol.py │ ├── index │ │ ├── __init__.py │ │ ├── base.py │ │ ├── file │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── exceptions.py │ │ │ ├── 
graph │ │ │ │ ├── __init__.py │ │ │ │ ├── graph_index.py │ │ │ │ ├── light_graph_index.py │ │ │ │ ├── lightrag_pipelines.py │ │ │ │ ├── nano_graph_index.py │ │ │ │ ├── nano_pipelines.py │ │ │ │ ├── pipelines.py │ │ │ │ └── visualize.py │ │ │ ├── index.py │ │ │ ├── knet │ │ │ │ ├── __init__.py │ │ │ │ ├── knet_index.py │ │ │ │ └── pipelines.py │ │ │ ├── pipelines.py │ │ │ ├── ui.py │ │ │ └── utils.py │ │ ├── manager.py │ │ ├── models.py │ │ └── ui.py │ ├── llms │ │ ├── __init__.py │ │ ├── db.py │ │ ├── manager.py │ │ └── ui.py │ ├── main.py │ ├── pages │ │ ├── __init__.py │ │ ├── chat │ │ │ ├── __init__.py │ │ │ ├── chat_panel.py │ │ │ ├── chat_suggestion.py │ │ │ ├── common.py │ │ │ ├── control.py │ │ │ ├── demo_hint.py │ │ │ ├── paper_list.py │ │ │ └── report.py │ │ ├── help.py │ │ ├── login.py │ │ ├── resources │ │ │ ├── __init__.py │ │ │ └── user.py │ │ ├── settings.py │ │ └── setup.py │ ├── reasoning │ │ ├── __init__.py │ │ ├── base.py │ │ ├── prompt_optimization │ │ │ ├── __init__.py │ │ │ ├── decompose_question.py │ │ │ ├── fewshot_rewrite_question.py │ │ │ ├── mindmap.py │ │ │ ├── rephrase_question_train.json │ │ │ ├── rewrite_question.py │ │ │ ├── suggest_conversation_name.py │ │ │ └── suggest_followup_chat.py │ │ ├── react.py │ │ ├── rewoo.py │ │ └── simple.py │ ├── rerankings │ │ ├── __init__.py │ │ ├── db.py │ │ ├── manager.py │ │ └── ui.py │ ├── settings.py │ └── utils │ │ ├── __init__.py │ │ ├── commands.py │ │ ├── conversation.py │ │ ├── file.py │ │ ├── generator.py │ │ ├── hf_papers.py │ │ ├── lang.py │ │ ├── plantuml.py │ │ ├── rate_limit.py │ │ ├── render.py │ │ └── visualize_cited.py │ ├── ktem_tests │ ├── __init__.py │ ├── resources │ │ └── embedding_openai.json │ └── test_qa.py │ ├── migrations │ ├── README │ ├── env.py │ ├── script.py.mako │ └── versions │ │ └── .keep │ ├── pyproject.toml │ └── requirements.txt ├── mkdocs.yml ├── pyproject.toml ├── scripts ├── download_pdfjs.sh ├── migrate │ ├── __init__.py │ └── migrate_chroma_db.py ├── 
run_linux.sh ├── run_macos.sh ├── run_windows.bat ├── serve_local.py ├── server_llamacpp_linux.sh ├── server_llamacpp_macos.sh ├── server_llamacpp_windows.bat ├── update_linux.sh ├── update_macos.sh └── update_windows.bat ├── settings.yaml.example ├── sso_app.py ├── sso_app_demo.py └── templates ├── component-default └── README.md └── project-default ├── cookiecutter.json └── {{cookiecutter.project_name}} ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── setup.py ├── tests └── __init__.py └── {{cookiecutter.project_name}} ├── __init__.py └── pipeline.py /.commitlintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["@commitlint/config-conventional"], 3 | "defaultIgnores": true, 4 | "rules": { 5 | "body-leading-blank": [1, "always"], 6 | "body-max-line-length": [2, "always", 100], 7 | "footer-leading-blank": [1, "always"], 8 | "footer-max-line-length": [2, "always", 10000], 9 | "header-max-length": [2, "always", 200], 10 | "subject-case": [ 11 | 2, 12 | "never", 13 | [] 14 | ], 15 | "subject-empty": [2, "never"], 16 | "subject-full-stop": [2, "never", "."], 17 | "type-case": [2, "always", "lower-case"], 18 | "type-empty": [2, "never"], 19 | "type-enum": [ 20 | 2, 21 | "always", 22 | [ 23 | "build", 24 | "chore", 25 | "ci", 26 | "docs", 27 | "feat", 28 | "fix", 29 | "perf", 30 | "refactor", 31 | "revert", 32 | "style", 33 | "test" 34 | ] 35 | ] 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/ 2 | .git/ 3 | .mypy_cache/ 4 | __pycache__/ 5 | ktem_app_data/ 6 | env/ 7 | .pre-commit-config.yaml 8 | .commitlintrc 9 | .gitignore 10 | .gitattributes 11 | README.md 12 | *.zip 13 | *.sh 14 | 15 | !/launch.sh 16 | -------------------------------------------------------------------------------- /.env.example: 
-------------------------------------------------------------------------------- 1 | # this is an example .env file, use it to create your own .env file and place it in the root of the project 2 | 3 | # settings for OpenAI 4 | OPENAI_API_BASE=https://api.openai.com/v1 5 | OPENAI_API_KEY= 6 | OPENAI_CHAT_MODEL=gpt-4o-mini 7 | OPENAI_EMBEDDINGS_MODEL=text-embedding-3-large 8 | 9 | # settings for Azure OpenAI 10 | AZURE_OPENAI_ENDPOINT= 11 | AZURE_OPENAI_API_KEY= 12 | OPENAI_API_VERSION=2024-02-15-preview 13 | AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo 14 | AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 15 | 16 | # settings for Cohere 17 | COHERE_API_KEY= 18 | 19 | # settings for Mistral 20 | # MISTRAL_API_KEY=placeholder 21 | 22 | # settings for VoyageAI 23 | VOYAGE_API_KEY= 24 | 25 | # settings for local models 26 | LOCAL_MODEL=qwen2.5:7b 27 | LOCAL_MODEL_EMBEDDINGS=nomic-embed-text 28 | 29 | # settings for GraphRAG 30 | GRAPHRAG_API_KEY= 31 | GRAPHRAG_LLM_MODEL=gpt-4o-mini 32 | GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small 33 | 34 | # set to true if you want to use customized GraphRAG config file 35 | USE_CUSTOMIZED_GRAPHRAG_SETTING=false 36 | 37 | # settings for Azure DI 38 | AZURE_DI_ENDPOINT= 39 | AZURE_DI_CREDENTIAL= 40 | 41 | # settings for Adobe API 42 | # get free credential at https://acrobatservices.adobe.com/dc-integration-creation-app-cdn/main.html?api=pdf-extract-api 43 | # also install pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements" 44 | PDF_SERVICES_CLIENT_ID= 45 | PDF_SERVICES_CLIENT_SECRET= 46 | 47 | # settings for PDF.js 48 | PDFJS_VERSION_DIST="pdfjs-4.0.379-dist" 49 | 50 | # variable for authentication method selection 51 | # for authentication with google leave empty 52 | # for authentication with keycloak : 53 | # AUTHENTICATION_METHOD="KEYCLOAK" 54 | 55 | AUTHENTICATION_METHOD= 56 | 57 | # settings for keycloak 58 | KEYCLOAK_SERVER_URL= 59 | KEYCLOAK_CLIENT_ID= 
60 | KEYCLOAK_REALM= 61 | KEYCLOAK_CLIENT_SECRET= 62 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bat text eol=crlf 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: "Bug Report" 2 | description: Report something that is not working as expected 3 | title: "[BUG] " 4 | labels: ["bug"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | *Please fill this form with as much information as possible.* 10 | - type: textarea 11 | id: description 12 | attributes: 13 | label: "Description" 14 | description: Please enter an explicit description of your issue 15 | placeholder: Short and explicit description of your incident... 16 | validations: 17 | required: true 18 | - type: textarea 19 | id: reprod 20 | attributes: 21 | label: "Reproduction steps" 22 | description: Please enter an explicit description of your issue 23 | value: | 24 | 1. Go to '...' 25 | 2. Click on '....' 26 | 3. Scroll down to '....' 27 | 4. See error 28 | render: bash 29 | validations: 30 | required: true 31 | - type: textarea 32 | id: screenshot 33 | attributes: 34 | label: "Screenshots" 35 | description: If applicable, add screenshots to help explain your problem. 36 | value: | 37 | ![DESCRIPTION](LINK.png) 38 | render: bash 39 | validations: 40 | required: false 41 | - type: textarea 42 | id: logs 43 | attributes: 44 | label: "Logs" 45 | description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 46 | render: bash 47 | validations: 48 | required: false 49 | - type: dropdown 50 | id: browsers 51 | attributes: 52 | label: "Browsers" 53 | description: What browsers are you seeing the problem on ? 
54 | multiple: true 55 | options: 56 | - Firefox 57 | - Chrome 58 | - Safari 59 | - Microsoft Edge 60 | - Opera 61 | - Brave 62 | - Other 63 | validations: 64 | required: false 65 | - type: dropdown 66 | id: os 67 | attributes: 68 | label: "OS" 69 | description: What is the impacted environment ? 70 | multiple: true 71 | options: 72 | - Windows 73 | - MacOS 74 | - Linux 75 | - Other 76 | validations: 77 | required: false 78 | - type: textarea 79 | id: additional_information 80 | attributes: 81 | label: "Additional information" 82 | description: Add any relevant information or context. 83 | placeholder: 84 | validations: 85 | required: false 86 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: "Feature Request" 2 | description: Brainstorm and propose new features for the project 3 | title: "[REQUEST] " 4 | labels: ["enhancement"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | *Please fill this form with as much information as possible.* 10 | - type: textarea 11 | id: reference_issues 12 | attributes: 13 | label: "Reference Issues" 14 | description: Common issues 15 | placeholder: "#Issues IDs" 16 | validations: 17 | required: false 18 | - type: textarea 19 | id: summary 20 | attributes: 21 | label: "Summary" 22 | description: Provide a brief explanation of the feature 23 | placeholder: Describe in a few lines your feature request 24 | validations: 25 | required: true 26 | - type: textarea 27 | id: basic_example 28 | attributes: 29 | label: "Basic Example" 30 | description: Indicate here some basic examples of your feature. 
31 | placeholder: A few specific words about your feature request. 32 | validations: 33 | required: true 34 | - type: textarea 35 | id: drawbacks 36 | attributes: 37 | label: "Drawbacks" 38 | description: What are the drawbacks/impacts of your feature request ? 39 | placeholder: Identify the drawbacks and impacts while being neutral on your feature request 40 | validations: 41 | required: true 42 | - type: textarea 43 | id: additional_information 44 | attributes: 45 | label: "Additional information" 46 | description: Add any additional information that you think is important for your feature request 47 | placeholder: 48 | validations: 49 | required: false 50 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | - Please include a summary of the changes and the related issue. 4 | - Fixes # (issue) 5 | 6 | ## Type of change 7 | 8 | - [ ] New features (non-breaking change). 9 | - [ ] Bug fix (non-breaking change). 10 | - [ ] Breaking change (fix or feature that would cause existing functionality not to work as expected). 11 | 12 | ## Checklist 13 | 14 | - [ ] I have performed a self-review of my code. 15 | - [ ] I have added thorough tests if it is a core feature. 16 | - [ ] There is a reference to the original bug report and related work. 17 | - [ ] I have commented on my code, particularly in hard-to-understand areas. 18 | - [ ] The feature is well documented. 
19 | -------------------------------------------------------------------------------- /.github/workflows/auto-bump-and-release.yaml: -------------------------------------------------------------------------------- 1 | name: Auto Bump and Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | auto-bump-and-release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Clone the repo 13 | uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | - name: Update Application Version 17 | id: update-version 18 | uses: anothrNick/github-tag-action@v1 19 | env: 20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 21 | WITH_V: true 22 | DEFAULT_BUMP: patch 23 | MAJOR_STRING_TOKEN: "bump:major" 24 | MINOR_STRING_TOKEN: "bump:minor" 25 | PATCH_STRING_TOKEN: "bump:patch" 26 | - name: Create release for ${{ steps.update-version.outputs.new_tag }} 27 | # need to repeat this if statement because Github Action doesn't support early 28 | # stopping for steps 29 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 30 | run: | 31 | echo Create release folder 32 | mkdir kotaemon-app 33 | echo ${{ steps.update-version.outputs.new_tag }} > kotaemon-app/VERSION 34 | cp LICENSE.txt kotaemon-app/ 35 | cp flowsettings.py kotaemon-app/ 36 | cp app.py kotaemon-app/ 37 | cp .env.example kotaemon-app/.env 38 | cp -r scripts kotaemon-app/ 39 | mkdir -p kotaemon-app/libs/ktem/ktem/ 40 | cp -r libs/ktem/ktem/assets kotaemon-app/libs/ktem/ktem/ 41 | 42 | tree kotaemon-app 43 | zip -r kotaemon-app.zip kotaemon-app 44 | - name: Release ${{ steps.update-version.outputs.new_tag }} 45 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 46 | uses: softprops/action-gh-release@v2 47 | with: 48 | files: kotaemon-app.zip 49 | fail_on_unmatched_files: true 50 | token: ${{ secrets.GITHUB_TOKEN }} 51 | generate_release_notes: true 52 | tag_name: ${{ steps.update-version.outputs.new_tag }} 53 | make_latest: true 54 | - 
name: Setup latest branch locally without switching current branch 55 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 56 | run: git fetch origin latest:latest 57 | - name: Update latest branch 58 | if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} 59 | run: | 60 | git branch -f latest tags/${{ steps.update-version.outputs.new_tag }} 61 | git checkout latest 62 | git push -f -u origin latest 63 | -------------------------------------------------------------------------------- /.github/workflows/style-check.yaml: -------------------------------------------------------------------------------- 1 | name: style-check 2 | 3 | on: 4 | pull_request: 5 | branches: [main, develop] 6 | push: 7 | branches: [main, develop] 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Clone the repo 14 | uses: actions/checkout@v4 15 | - name: Setup python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: "3.10" 19 | - name: run pre-commit 20 | uses: pre-commit/action@v3.0.0 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.3.0 4 | hooks: 5 | - id: check-yaml 6 | args: ["--unsafe"] 7 | - id: check-toml 8 | - id: end-of-file-fixer 9 | - id: trailing-whitespace 10 | - id: mixed-line-ending 11 | - id: detect-aws-credentials 12 | args: ["--allow-missing-credentials"] 13 | - id: detect-private-key 14 | - id: check-added-large-files 15 | args: ["--maxkb=750"] 16 | - id: debug-statements 17 | - repo: https://github.com/ambv/black 18 | rev: 22.3.0 19 | hooks: 20 | - id: black 21 | language_version: python3 22 | - repo: https://github.com/pycqa/isort 23 | rev: 5.12.0 24 | hooks: 25 | - id: isort 26 | args: ["--profile", "black"] 27 | language_version: 
python3.10 28 | - repo: https://github.com/pycqa/flake8 29 | rev: 4.0.1 30 | hooks: 31 | - id: flake8 32 | args: ["--max-line-length", "88", "--extend-ignore", "E203"] 33 | - repo: https://github.com/myint/autoflake 34 | rev: v1.4 35 | hooks: 36 | - id: autoflake 37 | args: 38 | [ 39 | "--in-place", 40 | "--remove-unused-variables", 41 | "--remove-all-unused-imports", 42 | "--ignore-init-module-imports", 43 | "--exclude=tests/*", 44 | ] 45 | - repo: https://github.com/pre-commit/mirrors-prettier 46 | rev: v2.7.1 47 | hooks: 48 | - id: prettier 49 | types_or: [markdown, yaml] 50 | - repo: https://github.com/pre-commit/mirrors-mypy 51 | rev: "v1.7.1" 52 | hooks: 53 | - id: mypy 54 | additional_dependencies: 55 | [ 56 | types-PyYAML==6.0.12.11, 57 | "types-requests", 58 | "sqlmodel", 59 | "types-Markdown", 60 | "types-cachetools", 61 | types-tzlocal, 62 | ] 63 | args: ["--check-untyped-defs", "--ignore-missing-imports"] 64 | exclude: "^templates/" 65 | - repo: https://github.com/codespell-project/codespell 66 | rev: v2.2.4 67 | hooks: 68 | - id: codespell 69 | additional_dependencies: 70 | - tomli 71 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from theflow.settings import settings as flowsettings 4 | 5 | KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".") 6 | KH_GRADIO_SHARE = getattr(flowsettings, "KH_GRADIO_SHARE", False) 7 | GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None) 8 | # override GRADIO_TEMP_DIR if it's not set 9 | if GRADIO_TEMP_DIR is None: 10 | GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp") 11 | os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR 12 | 13 | 14 | from ktem.main import App # 
noqa 15 | 16 | app = App() 17 | demo = app.make() 18 | demo.queue().launch( 19 | favicon_path=app._favicon, 20 | inbrowser=True, 21 | allowed_paths=[ 22 | "libs/ktem/ktem/assets", 23 | GRADIO_TEMP_DIR, 24 | ], 25 | share=KH_GRADIO_SHARE, 26 | ) 27 | -------------------------------------------------------------------------------- /doc_env_reqs.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocstrings[python] 3 | mkdocs-material 4 | mkdocs-gen-files 5 | mkdocs-literate-nav 6 | mkdocs-git-revision-date-localized-plugin 7 | mkdocs-section-index 8 | mkdocs-include-markdown-plugin[cache] 9 | mdx_truly_sane_lists 10 | -------------------------------------------------------------------------------- /docs/about.md: -------------------------------------------------------------------------------- 1 | # About Kotaemon 2 | 3 | An open-source tool for chatting with your documents. Built with both end users and 4 | developers in mind. 5 | 6 | [Source Code](https://github.com/Cinnamon/kotaemon) | 7 | [HF Space](https://huggingface.co/spaces/cin-model/kotaemon-demo) 8 | 9 | [Installation Guide](https://cinnamon.github.io/kotaemon/) | 10 | [Developer Guide](https://cinnamon.github.io/kotaemon/development/) | 11 | [Feedback](https://github.com/Cinnamon/kotaemon/issues) 12 | -------------------------------------------------------------------------------- /docs/development/create-a-component.md: -------------------------------------------------------------------------------- 1 | # Creating a component 2 | 3 | A fundamental concept in kotaemon is "component". 4 | 5 | Anything that isn't data or data structure is a "component". A component can be 6 | thought of as a step within a pipeline. It takes in some input, processes it, 7 | and returns an output, just the same as a Python function! The output will then 8 | become an input for the next component in a pipeline. In fact, a pipeline is just 9 | a component. 
More appropriately, a nested component: a component that makes use of one or more other components in 10 | the processing step. So in reality, there isn't a difference between a pipeline 11 | and a component! Because of that, in kotaemon, we will consider them the 12 | same as "component". 13 | 14 | To define a component, you will: 15 | 16 | 1. Create a class that subclasses from `kotaemon.base.BaseComponent` 17 | 2. Declare init params with type annotation 18 | 3. Declare nodes (nodes are just other components!) with type annotation 19 | 4. Implement the processing logic in `run`. 20 | 21 | The syntax of a component is as follow: 22 | 23 | ```python 24 | from kotaemon.base import BaseComponent 25 | from kotaemon.llms import LCAzureChatOpenAI 26 | from kotaemon.parsers import RegexExtractor 27 | 28 | 29 | class FancyPipeline(BaseComponent): 30 | param1: str = "This is param1" 31 | param2: int = 10 32 | param3: float 33 | 34 | node1: BaseComponent # this is a node because of BaseComponent type annotation 35 | node2: LCAzureChatOpenAI # this is also a node because LCAzureChatOpenAI subclasses BaseComponent 36 | node3: RegexExtractor # this is also a node bceause RegexExtractor subclasses BaseComponent 37 | 38 | def run(self, some_text: str): 39 | prompt = (self.param1 + some_text) * int(self.param2 + self.param3) 40 | llm_pred = self.node2(prompt).text 41 | matches = self.node3(llm_pred) 42 | return matches 43 | ``` 44 | 45 | Then this component can be used as follow: 46 | 47 | ```python 48 | llm = LCAzureChatOpenAI(endpoint="some-endpont") 49 | extractor = RegexExtractor(pattern=["yes", "Yes"]) 50 | 51 | component = FancyPipeline( 52 | param1="Hello" 53 | param3=1.5 54 | node1=llm, 55 | node2=llm, 56 | node3=extractor 57 | ) 58 | component("goodbye") 59 | ``` 60 | 61 | This way, we can define each operation as a reusable component, and use them to 62 | compose larger reusable components! 
63 | 64 | ## Benefits of component 65 | 66 | By defining a component as above, we formally encapsulate all the necessary 67 | information inside a single class. This introduces several benefits: 68 | 69 | 1. Allow tools like promptui to inspect the inner working of a component in 70 | order to automatically generate the promptui. 71 | 2. Allow visualizing a pipeline for debugging purpose. 72 | -------------------------------------------------------------------------------- /docs/development/data-components.md: -------------------------------------------------------------------------------- 1 | # Data & Data Structure Components 2 | 3 | The data & data structure components include: 4 | 5 | - The `Document` class. 6 | - The document store. 7 | - The vector store. 8 | 9 | ## Data Loader 10 | 11 | - PdfLoader 12 | - Layout-aware with table parsing PdfLoader 13 | 14 | - MathPixLoader: To use this loader, you need MathPix API key, refer to [mathpix docs](https://docs.mathpix.com/#introduction) for more information 15 | - OCRLoader: This loader uses lib-table and Flax pipeline to perform OCR and read table structure from PDF file (TODO: add more info about deployment of this module). 
16 | - Output: 17 | 18 | - Document: text + metadata to identify whether it is table or not 19 | 20 | ``` 21 | - "source": source file name 22 | - "type": "table" or "text" 23 | - "table_origin": original table in markdown format (to be feed to LLM or visualize using external tools) 24 | - "page_label": page number in the original PDF document 25 | ``` 26 | 27 | ## Document Store 28 | 29 | - InMemoryDocumentStore 30 | 31 | ## Vector Store 32 | 33 | - ChromaVectorStore 34 | - InMemoryVectorStore 35 | -------------------------------------------------------------------------------- /docs/development/index.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../../README.md" 3 | start="" 4 | end="" 5 | %} 6 | -------------------------------------------------------------------------------- /docs/extra/css/code_select.css: -------------------------------------------------------------------------------- 1 | .language-pycon .gp, 2 | .language-pycon .go { 3 | /* Generic.Prompt, Generic.Output */ 4 | user-select: none; 5 | } 6 | -------------------------------------------------------------------------------- /docs/images/269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png -------------------------------------------------------------------------------- /docs/images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png -------------------------------------------------------------------------------- 
/docs/images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png -------------------------------------------------------------------------------- /docs/images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png -------------------------------------------------------------------------------- /docs/images/change_space_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/change_space_params.png -------------------------------------------------------------------------------- /docs/images/chat-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/chat-demo.gif -------------------------------------------------------------------------------- /docs/images/chat-tab-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/chat-tab-demo.png -------------------------------------------------------------------------------- /docs/images/chat-tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/chat-tab.png 
-------------------------------------------------------------------------------- /docs/images/close_logs_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/close_logs_space.png -------------------------------------------------------------------------------- /docs/images/cohere_api_key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/cohere_api_key.png -------------------------------------------------------------------------------- /docs/images/duplicate_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/duplicate_space.png -------------------------------------------------------------------------------- /docs/images/file-index-tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/file-index-tab.png -------------------------------------------------------------------------------- /docs/images/index-embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/index-embedding.png -------------------------------------------------------------------------------- /docs/images/info-panel-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/info-panel-scores.png 
-------------------------------------------------------------------------------- /docs/images/initial_startup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/initial_startup.png -------------------------------------------------------------------------------- /docs/images/llm-default.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/llm-default.png -------------------------------------------------------------------------------- /docs/images/models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/models.png -------------------------------------------------------------------------------- /docs/images/pdf-viewer-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/pdf-viewer-setup.png -------------------------------------------------------------------------------- /docs/images/preview-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/preview-graph.png -------------------------------------------------------------------------------- /docs/images/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/preview.png -------------------------------------------------------------------------------- /docs/images/resources-tab.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/resources-tab.png -------------------------------------------------------------------------------- /docs/images/retrieval-setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/retrieval-setting.png -------------------------------------------------------------------------------- /docs/images/set_api_key_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/set_api_key_space.png -------------------------------------------------------------------------------- /docs/images/space_build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/docs/images/space_build.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Kotaemon 2 | 3 | ![type:video](https://github.com/Cinnamon/kotaemon/assets/25688648/815ecf68-3a02-4914-a0dd-3f8ec7e75cd9) 4 | 5 | This page is intended for **end users** who want to use the `kotaemon` tool for Question 6 | Answering on local documents. If you are a **developer** who wants contribute to the project, please visit the [development](development/index.md) page. 7 | 8 | ## Installation (Online HuggingFace Space) - easy (10 mins) 9 | 10 | Visit this [guide](online_install.md). 
11 | 12 | ## Installation (Offline) - intermediate (20 mins) 13 | 14 | ### Download 15 | 16 | Download the `kotaemon-app.zip` file from the [latest release](https://github.com/Cinnamon/kotaemon/releases/latest/). 17 | 18 | ### Run setup script 19 | 20 | 0. Unzip the downloaded file. 21 | 1. Navigate to the `scripts` folder and start an installer that matches your OS: 22 | - Windows: `run_windows.bat`. Just double click the file. 23 | - macOS: `run_macos.sh` 24 | 1. Right click on your file and select Open with and Other. 25 | 2. Enable All Applications and choose Terminal. 26 | 3. NOTE: If you always want to open that file with Terminal, then check Always Open With. 27 | 4. From now on, double click on your file and it should work. 28 | - Linux: `run_linux.sh`. Please run the script using `bash run_linux.sh` in your terminal. 29 | 2. After the installation, the installer will ask to launch the ktem's UI, answer to continue. 30 | 3. If launched, the application will be open automatically in your browser. 31 | 4. Default login information is: `username: admin / password: admin`. You should change this credential right after the first login on the UI. 32 | 33 | ## Launch 34 | 35 | To launch the app after initial setup or any change, simply run the `run_*` script again. 36 | 37 | A browser window will be opened and greets you with this screen: 38 | 39 | ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png) 40 | 41 | ## Usage 42 | 43 | For how to use the application, see [Usage](usage.md). This page will also be available to 44 | you within the application. 45 | 46 | ## Feedback 47 | 48 | Feel free to create a bug report or a feature request on our [repo](https://github.com/Cinnamon/kotaemon/issues). 
3. Wait for the build to complete and start up (approx. 10 mins).
kotaemon focuses on question answering over a corpus of data. Below
is a gentle introduction to the chat functionality.
9 | -------------------------------------------------------------------------------- /docs/pages/app/settings/user-settings.md: -------------------------------------------------------------------------------- 1 | # User settings 2 | 3 | `ktem` allows developers to extend the index and the reasoning pipeline. In 4 | many cases, these components can have settings that should be modified by 5 | users at run-time, (e.g. `topk`, `chunksize`...). These are the user settings. 6 | 7 | `ktem` allows developers to declare such user settings in their code. Once 8 | declared, `ktem` will render them in a Settings page. 9 | 10 | There are 2 places that `ktem` looks for declared user settings. You can 11 | refer to the respective pages. 12 | 13 | - In the index. 14 | - In the reasoning pipeline. 15 | 16 | ## Syntax of a settings 17 | 18 | A collection of settings is a dictionary of type `dict[str, dict]`, where the 19 | key is a setting id, and the value is the description of the setting. 20 | 21 | ```python 22 | settings = { 23 | "topk": { 24 | "name": "Top-k chunks", 25 | "value": 10, 26 | "component": "number", 27 | }, 28 | "lang": { 29 | "name": "Languages", 30 | "value": "en", 31 | "component": "dropdown", 32 | "choices": [("en", "English"), ("cn", "Chinese")], 33 | } 34 | } 35 | ``` 36 | 37 | Each setting description must have: 38 | 39 | - name: the human-understandable name of the settings. 40 | - value: the default value of the settings. 41 | - component: the UI component to render such setting on the UI. Available: 42 | 43 | - "text": single-value 44 | - "number": single-value 45 | - "checkbox": single-value 46 | - "dropdown": choices 47 | - "radio": choices 48 | - "checkboxgroup": choices 49 | 50 | - choices: the list of choices, if the component type allows. 
51 | 52 | ## Settings page structure 53 | -------------------------------------------------------------------------------- /docs/scripts/generate_examples_docs.py: -------------------------------------------------------------------------------- 1 | # import shutil 2 | from pathlib import Path 3 | from typing import Any, Iterable 4 | 5 | import mkdocs_gen_files 6 | 7 | # get the root source code directory 8 | doc_dir_name = "docs" 9 | doc_dir = Path(__file__) 10 | while doc_dir.name != doc_dir_name and doc_dir != doc_dir.parent: 11 | doc_dir = doc_dir.parent 12 | 13 | if doc_dir == doc_dir.parent: 14 | raise ValueError(f"root_name ({doc_dir_name}) not in path ({str(Path(__file__))}).") 15 | 16 | 17 | def generate_docs_for_examples_readme( 18 | examples_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = [] 19 | ): 20 | if not examples_dir.is_dir(): 21 | raise ModuleNotFoundError(str(examples_dir)) 22 | 23 | nav = mkdocs_gen_files.Nav() 24 | 25 | for path in sorted(examples_dir.rglob("*README.md")): 26 | # ignore modules with name starts with underscore (i.e. 
def generate_docs_for_examples_readme(
    examples_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = ()
):
    """Render each example's README.md into the generated docs tree.

    Args:
        examples_dir: directory containing the example sub-folders; each
            example is expected to ship a ``*README.md`` file.
        target_doc_folder: folder name (relative to the docs root) that
            receives the generated pages and the literate-nav file.
        ignored_modules: dotted module-path prefixes to skip entirely.
            Defaults to an empty tuple (immutable, unlike the previous
            mutable ``[]`` default).

    Raises:
        ModuleNotFoundError: if ``examples_dir`` is not a directory.
    """
    if not examples_dir.is_dir():
        raise ModuleNotFoundError(str(examples_dir))

    nav = mkdocs_gen_files.Nav()

    for path in sorted(examples_dir.rglob("*README.md")):
        # ignore private files (name starts with underscore) and test files
        if path.name.startswith("_") or path.name.startswith("test"):
            continue

        module_path = path.parent.relative_to(examples_dir).with_suffix("")
        doc_path = path.parent.relative_to(examples_dir).with_suffix(".md")
        full_doc_path = Path(target_doc_folder, doc_path)

        parts = list(module_path.parts)
        identifier = ".".join(parts)

        if "tests" in parts:
            continue

        # honour the caller-provided ignore list (prefix match on module path)
        if any(identifier.startswith(each_module) for each_module in ignored_modules):
            continue

        nav_titles = [name.replace("_", " ").title() for name in parts]
        nav[nav_titles] = doc_path.as_posix()

        # embed the README content via the pymdownx snippets include syntax
        with mkdocs_gen_files.open(full_doc_path, "w") as f:
            f.write(f'--8<-- "{path.relative_to(examples_dir.parent)}"')

        mkdocs_gen_files.set_edit_path(
            full_doc_path, Path("..") / path.relative_to(examples_dir.parent)
        )

    with mkdocs_gen_files.open(f"{target_doc_folder}/NAV.md", "w") as nav_file:
        nav_file.writelines(nav.build_literate_nav())


generate_docs_for_examples_readme(
    examples_dir=doc_dir.parent / "examples",
    target_doc_folder="examples",
)
def generate_docs_for_src_code(
    code_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = ()
):
    """Generate one mkdocstrings reference page per Python module in `code_dir`.

    Args:
        code_dir: root of the Python package to document.
        target_doc_folder: folder name (relative to the docs root) that
            receives the generated pages and the literate-nav summary.
        ignored_modules: dotted module-path prefixes to skip entirely.
            Defaults to an empty tuple (immutable, unlike the previous
            mutable ``[]`` default).

    Raises:
        ModuleNotFoundError: if ``code_dir`` is not a directory.
    """
    if not code_dir.is_dir():
        raise ModuleNotFoundError(str(code_dir))

    nav = mkdocs_gen_files.Nav()

    for path in sorted(code_dir.rglob("*.py")):
        module_path = path.relative_to(code_dir).with_suffix("")
        doc_path = path.relative_to(code_dir).with_suffix(".md")
        full_doc_path = Path(target_doc_folder, doc_path)

        parts = list(module_path.parts)

        if parts[-1] == "__init__":
            # a package __init__ becomes the package's index page
            doc_path = doc_path.with_name("index.md")
            full_doc_path = full_doc_path.with_name("index.md")
            parts.pop()

        if not parts:
            continue

        if "tests" in parts:
            continue

        identifier = ".".join(parts)
        # honour the caller-provided ignore list (prefix match on module path)
        if any(identifier.startswith(each_module) for each_module in ignored_modules):
            continue

        nav_titles = [
            nav_title_map.get(name, name.replace("_", " ").title()) for name in parts
        ]
        nav[nav_titles] = doc_path.as_posix()

        # mkdocstrings directive: render the API docs of this identifier
        with mkdocs_gen_files.open(full_doc_path, "w") as f:
            f.write(f"::: {identifier}")

        # this method works in docs folder
        mkdocs_gen_files.set_edit_path(
            full_doc_path, Path("..") / path.relative_to(code_dir.parent)
        )

    with mkdocs_gen_files.open(f"{target_doc_folder}/Summary.md", "w") as nav_file:
        nav_file.writelines(nav.build_literate_nav())


generate_docs_for_src_code(
    code_dir=doc_dir.parent / "libs" / "kotaemon" / "kotaemon",
    target_doc_folder="reference",
    ignored_modules={"contribs"},
)
-------------------------------------------------------------------------------- 1 | !function(){"use strict";var e;e=function(e){"true"===localStorage.getItem("data-md-prefers-color-scheme")&&document.querySelector("body").setAttribute("data-md-color-scheme",e.matches?"dracula":"default")},new MutationObserver((function(t){t.forEach((function(t){if("childList"===t.type&&t.addedNodes.length)for(var a=0;a 4 | {% if page.previous_page or page.next_page %} 5 | 48 | {% endif %} 49 | 50 | -------------------------------------------------------------------------------- /docs/theme/partials/libs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml app configuration file generated for kotaemon on 2024-12-24T20:56:32+07:00 2 | # 3 | # See https://fly.io/docs/reference/configuration/ for information about how to use this file. 
4 | # 5 | 6 | app = 'kotaemon' 7 | primary_region = 'sin' 8 | 9 | [build] 10 | 11 | [mounts] 12 | destination = "/app/ktem_app_data" 13 | source = "ktem_volume" 14 | 15 | [http_service] 16 | internal_port = 7860 17 | force_https = true 18 | auto_stop_machines = 'suspend' 19 | auto_start_machines = true 20 | min_machines_running = 0 21 | processes = ['app'] 22 | 23 | [[vm]] 24 | memory = '4gb' 25 | cpu_kind = 'shared' 26 | cpus = 4 27 | -------------------------------------------------------------------------------- /launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$GRADIO_SERVER_NAME" ]; then 4 | export GRADIO_SERVER_NAME="0.0.0.0" 5 | fi 6 | if [ -z "$GRADIO_SERVER_PORT" ]; then 7 | export GRADIO_SERVER_PORT="7860" 8 | fi 9 | 10 | # Check if environment variable KH_DEMO_MODE is set to true 11 | if [ "$KH_DEMO_MODE" = "true" ]; then 12 | echo "KH_DEMO_MODE is true. Launching in demo mode..." 13 | # Command to launch in demo mode 14 | GR_FILE_ROOT_PATH="/app" KH_FEATURE_USER_MANAGEMENT=false USE_LIGHTRAG=false uvicorn sso_app_demo:app --host "$GRADIO_SERVER_NAME" --port "$GRADIO_SERVER_PORT" 15 | else 16 | if [ "$KH_SSO_ENABLED" = "true" ]; then 17 | echo "KH_SSO_ENABLED is true. Launching in SSO mode..." 
# Disable telemetry with monkey patching
import logging

logger = logging.getLogger(__name__)
try:
    import posthog

    # Replacement for posthog.capture: log the call locally instead of
    # sending any event to PostHog's servers.
    def capture(*args, **kwargs):
        logger.info("posthog.capture called with args: %s, kwargs: %s", args, kwargs)

    # Monkey-patch so every library that calls posthog.capture is silenced.
    posthog.capture = capture
except ImportError:
    # posthog not installed: nothing to patch
    pass

try:
    import os

    # Disable Haystack telemetry via its documented env var, set *before*
    # importing haystack so the setting is picked up at import time.
    os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"
    import haystack.telemetry

    # Belt-and-braces: also null out the telemetry object itself.
    haystack.telemetry.telemetry = None
except ImportError:
    # haystack not installed: nothing to patch
    pass
class BaseAgent(BaseComponent):
    """Define base agent interface.

    Subclasses (e.g. ReactAgent, RewooAgent) implement `run` and may wrap it
    with `safeguard_run` to convert exceptions into failed AgentOutput
    objects instead of propagating them.
    """

    # Declarations below use theflow's Param/Node descriptors, not plain
    # class attributes; `help` text is surfaced by the framework.
    name: str = Param(help="Name of the agent.")
    agent_type: AgentType = Param(help="Agent type, must be one of AgentType")
    description: str = Param(
        help=(
            "Description used to tell the model how/when/why to use the agent. You can"
            " provide few-shot examples as a part of the description. This will be"
            " input to the prompt of LLM."
        )
    )
    llm: Optional[BaseLLM] = Node(
        help=(
            "LLM to be used for the agent (optional). LLM must implement BaseLLM"
            " interface."
        )
    )
    prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(
        help="A prompt template or a dict to supply different prompt to the agent"
    )
    plugins: list[BaseTool] = Param(
        # default_callback gives each instance a fresh list (avoids a shared
        # mutable default across agents)
        default_callback=lambda _: [],
        help="List of plugins / tools to be used in the agent",
    )

    @staticmethod
    def safeguard_run(run_func, *args, **kwargs):
        # Decorator: wraps a `run`-style method so any exception is returned
        # as a failed AgentOutput rather than raised to the caller.
        # NOTE(review): the *args/**kwargs on this outer function appear
        # unused (only run_func matters) -- confirm against call sites.
        def wrapper(self, *args, **kwargs):
            try:
                return run_func(self, *args, **kwargs)
            except Exception as e:
                # Surface the failure as data so pipelines can keep going.
                return AgentOutput(
                    text="",
                    agent_type=self.agent_type,
                    status="failed",
                    error=str(e),
                )

        return wrapper

    def add_tools(self, tools: list[BaseTool]) -> None:
        """Helper method to add tools and update agent state if needed"""
        self.plugins.extend(tools)

    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:
        """Run the component."""
        # Abstract by convention: every concrete agent must override this.
        raise NotImplementedError()
-------------------------------------------------------------------------------- 1 | from .base import AgentAction, AgentFinish, AgentOutput, AgentType, BaseScratchPad 2 | 3 | __all__ = ["AgentOutput", "AgentFinish", "BaseScratchPad", "AgentType", "AgentAction"] 4 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/agents/react/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import ReactAgent 2 | 3 | __all__ = ["ReactAgent"] 4 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/agents/react/prompt.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from kotaemon.llms import PromptTemplate 4 | 5 | zero_shot_react_prompt = PromptTemplate( 6 | template="""Answer the following questions as best you can. Give answer in {lang}. You have access to the following tools: 7 | {tool_description} 8 | Use the following format: 9 | 10 | Question: the input question you must answer 11 | Thought: you should always think about what to do 12 | 13 | Action: the action to take, should be one of [{tool_names}] 14 | 15 | Action Input: the input to the action, should be different from the action input of the same action in previous steps. 16 | 17 | Observation: the result of the action 18 | 19 | ... (this Thought/Action/Action Input/Observation can repeat N times) 20 | #Thought: I now know the final answer 21 | Final Answer: the final answer to the original input question 22 | 23 | Begin! After each Action Input. 
class GoogleSearchArgs(BaseModel):
    """Argument schema shared by the Google search tools."""

    query: str = Field(..., description="a search query")


class GoogleSearchTool(BaseTool):
    """Scrape top Google results via the `googlesearch` package (no API key)."""

    name: str = "google_search"
    description: str = (
        "A search engine retrieving top search results as snippets from Google. "
        "Input should be a search query."
    )
    args_schema: Optional[Type[BaseModel]] = GoogleSearchArgs

    def _run_tool(self, query: AnyStr) -> str:
        """Return newline-joined "title description" snippets for `query`.

        Raises:
            ImportError: if the optional `googlesearch` dependency is missing.
        """
        try:
            from googlesearch import search
        except ImportError:
            raise ImportError(
                "install googlesearch using `pip3 install googlesearch-python` to "
                "use this tool"
            )

        try:
            output = ""
            search_results = search(query, advanced=True)
            if search_results:
                output = "\n".join(
                    "{} {}".format(item.title, item.description)
                    for item in search_results
                )
        except HTTPError:
            # rate-limited or blocked by Google: degrade gracefully
            output = "No evidence found."

        return output


class SerpTool(BaseTool):
    """Search Google through SerpAPI (requires a SerpAPI key in the env)."""

    # Annotated like GoogleSearchTool above: without annotations, pydantic v2
    # treats plain class attributes as non-fields and ignores them.
    name: str = "google_search"
    description: str = (
        "Worker that searches results from Google. Useful when you need to find short "
        "and succinct answers about a specific topic. Input should be a search query."
    )
    args_schema: Optional[Type[BaseModel]] = GoogleSearchArgs

    def _run_tool(self, query: AnyStr) -> str:
        """Return the SerpAPI evidence string for `query`."""
        tool = SerpAPIWrapper()
        evidence = tool.run(query)

        return evidence
class LLMTool(BaseTool):
    """Tool that forwards the query verbatim to a wrapped LLM.

    With `dummy_mode` enabled (the default) the LLM is never called and the
    placeholder string "<->" is returned instead.
    """

    name: str = "llm"
    description: str = (
        "A pretrained LLM like yourself. Useful when you need to act with "
        "general world knowledge and common sense. Prioritize it when you "
        "are confident in solving the problem "
        "yourself. Input can be any instruction."
    )
    llm: BaseLLM
    args_schema: Optional[Type[BaseModel]] = LLMArgs
    # When True, skip the LLM call entirely and return the placeholder.
    dummy_mode: bool = True

    def _run_tool(self, query: AnyStr) -> str:
        """Run `query` through the LLM and return its text (or "<->").

        Raises:
            ToolException: if the LLM call fails with a ValueError.
        """
        try:
            # only hit the model when dummy mode is off
            response = self.llm(query) if not self.dummy_mode else None
        except ValueError as e:
            # re-raise through the tool-exception channel, keeping the
            # original cause attached for debugging
            raise ToolException("LLM Tool call failed") from e
        return response.text if response else "<->"
Similar: {wikipedia.search(search)}" 41 | return result 42 | 43 | 44 | class WikipediaArgs(BaseModel): 45 | query: str = Field(..., description="a search query as input to wkipedia") 46 | 47 | 48 | class WikipediaTool(BaseTool): 49 | """Tool that adds the capability to query the Wikipedia API.""" 50 | 51 | name: str = "wikipedia" 52 | description: str = ( 53 | "Search engine from Wikipedia, retrieving relevant wiki page. " 54 | "Useful when you need to get holistic knowledge about people, " 55 | "places, companies, historical events, or other subjects. " 56 | "Input should be a search query." 57 | ) 58 | args_schema: Optional[Type[BaseModel]] = WikipediaArgs 59 | doc_store: Any = None 60 | 61 | def _run_tool(self, query: AnyStr) -> AnyStr: 62 | if not self.doc_store: 63 | self.doc_store = Wiki() 64 | tool = self.doc_store 65 | evidence = tool.search(query) 66 | return evidence 67 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/agents/utils.py: -------------------------------------------------------------------------------- 1 | from kotaemon.base import Document 2 | 3 | 4 | def get_plugin_response_content(output) -> str: 5 | """ 6 | Wrapper for AgentOutput content return 7 | """ 8 | if isinstance(output, Document): 9 | return output.text 10 | else: 11 | return str(output) 12 | 13 | 14 | def calculate_cost(model_name: str, prompt_token: int, completion_token: int) -> float: 15 | """ 16 | Calculate the cost of a prompt and completion. 
class BaseComponent(Function):
    """A component is a class that can be used to compose a pipeline.

    !!! tip "Benefits of component"
        - Auto caching, logging
        - Allow deployment

    !!! tip "For each component, the spirit is"
        - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
        - Enforce single output type. Hence, the output type of a component should be
        as generic as possible.
    """

    # Upstream component whose output feeds this one; assigned externally
    # before calling flow().
    inflow = None

    def flow(self):
        # Recursively run the upstream chain first, then feed its result into
        # this component via __call__.
        if self.inflow is None:
            raise ValueError("No inflow provided.")

        if not isinstance(self.inflow, BaseComponent):
            raise ValueError(
                f"inflow must be a BaseComponent, found {type(self.inflow)}"
            )

        return self.__call__(self.inflow.flow())

    def set_output_queue(self, queue):
        # Attach `queue` to this component and propagate it to every child
        # node so intermediate outputs can be streamed from anywhere in the
        # pipeline. `_ff_nodes` is maintained by the theflow.Function base.
        self._queue = queue
        for name in self._ff_nodes:
            node = getattr(self, name)
            if isinstance(node, BaseComponent):
                node.set_output_queue(queue)

    def report_output(self, output: Optional[Document]):
        # Push an intermediate output onto the attached queue (non-blocking).
        # NOTE(review): assumes set_output_queue() was called beforehand —
        # `_queue` is not initialized anywhere else in this class; confirm
        # callers always attach a queue (or None) first.
        if self._queue is not None:
            self._queue.put_nowait(output)

    # The four entry points below are optional overrides; subclasses implement
    # the ones matching their sync/async and batch/streaming capabilities.
    def invoke(self, *args, **kwargs) -> Document | list[Document] | None:
        ...

    async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:
        ...

    def stream(self, *args, **kwargs) -> Iterator[Document] | None:
        ...

    def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:
        ...

    @abstractmethod
    def run(
        self, *args, **kwargs
    ) -> Document | list[Document] | Iterator[Document] | None | Any:
        """Run the component."""
        ...
def from_definition_to_markdown(definition: dict) -> str:
    """Render a component definition dict as a markdown fragment.

    The definition carries a "desc" string plus "params" and "nodes"
    mappings; an empty mapping renders its table as " N/A".
    """

    # Build the params table (falls back to N/A when there are no params)
    params = " N/A\n"
    if definition["params"]:
        rows = [
            "\n| Name | Description | Type | Default |\n",
            "| --- | --- | --- | --- |\n",
        ]
        for pname, spec in definition["params"].items():
            ptype = spec["type"].__name__ if inspect.isclass(spec["type"]) else spec["type"]
            rows.append(f"| {pname} | {spec['desc']} | {ptype} | {spec['default']} |\n")
        params = "".join(rows)

    # Build the nodes table (falls back to N/A when there are no nodes)
    nodes = " N/A\n"
    if definition["nodes"]:
        rows = [
            "\n| Name | Description | Type | Input | Output |\n",
            "| --- | --- | --- | --- | --- |\n",
        ]
        for nname, spec in definition["nodes"].items():
            ntype = spec["type"].__name__ if inspect.isclass(spec["type"]) else str(spec["type"])
            nin = spec["input"].__name__ if inspect.isclass(spec["input"]) else str(spec["input"])
            nout = (
                spec["output"].__name__
                if inspect.isclass(spec["output"])
                else str(spec["output"])
            )
            rows.append(f"|{nname}|{spec['desc']}|{ntype}|{nin}|{nout}|\n")
        nodes = "".join(rows)

    description = inspect.cleandoc(definition["desc"])
    return f"{description}\n\n_**Params:**_{params}\n_**Nodes:**_{nodes}"
def get_component(component_def: dict) -> gr.components.Component:
    """Instantiate a gradio component from its definition dict.

    The definition must name one of the supported components under the
    "component" key; optional constructor kwargs go under "params".
    """
    # Guard clause: a missing "component" key is unrecoverable.
    if "component" not in component_def:
        raise ValueError(
            f"Cannot decide the component from {component_def}. "
            "Please specify `component` with 1 of the following "
            f"values: {SUPPORTED_COMPONENTS}"
        )

    component = component_def["component"]
    if component not in SUPPORTED_COMPONENTS:
        raise ValueError(
            f"Unsupported UI component: {component}. "
            f"Must be one of {SUPPORTED_COMPONENTS}"
        )

    component_cls = COMPONENTS_CLASS[component]
    return component_cls(**component_def.get("params", {}))
import_dotted_string(key, safe=False) 28 | if value["ui-type"] == "chat": 29 | demos.append(build_chat_ui(value, pipeline_def).queue()) 30 | else: 31 | demos.append(build_pipeline_ui(value, pipeline_def).queue()) 32 | if len(demos) == 1: 33 | demo = demos[0] 34 | else: 35 | demo = gr.TabbedInterface( 36 | demos, 37 | tab_names=list(config_dict.keys()), 38 | title="PromptUI from kotaemon", 39 | analytics_enabled=False, 40 | theme=John(), 41 | ) 42 | 43 | demo.queue() 44 | 45 | return demo 46 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseEmbeddings 2 | from .endpoint_based import EndpointEmbeddings 3 | from .fastembed import FastEmbedEmbeddings 4 | from .langchain_based import ( 5 | LCAzureOpenAIEmbeddings, 6 | LCCohereEmbeddings, 7 | LCGoogleEmbeddings, 8 | LCHuggingFaceEmbeddings, 9 | LCMistralEmbeddings, 10 | LCOpenAIEmbeddings, 11 | ) 12 | from .openai import AzureOpenAIEmbeddings, OpenAIEmbeddings 13 | from .tei_endpoint_embed import TeiEndpointEmbeddings 14 | from .voyageai import VoyageAIEmbeddings 15 | 16 | __all__ = [ 17 | "BaseEmbeddings", 18 | "EndpointEmbeddings", 19 | "TeiEndpointEmbeddings", 20 | "LCOpenAIEmbeddings", 21 | "LCAzureOpenAIEmbeddings", 22 | "LCCohereEmbeddings", 23 | "LCHuggingFaceEmbeddings", 24 | "LCGoogleEmbeddings", 25 | "LCMistralEmbeddings", 26 | "OpenAIEmbeddings", 27 | "AzureOpenAIEmbeddings", 28 | "FastEmbedEmbeddings", 29 | "VoyageAIEmbeddings", 30 | ] 31 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/embeddings/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from kotaemon.base import BaseComponent, Document, DocumentWithEmbedding 4 | 5 | 6 | class BaseEmbeddings(BaseComponent): 7 | def run( 8 | 
class EndpointEmbeddings(BaseEmbeddings):
    """
    An Embeddings component that uses an OpenAI API compatible endpoint.

    Attributes:
        endpoint_url (str): The url of an OpenAI API compatible endpoint.
    """

    endpoint_url: str

    def run(
        self, text: str | list[str] | Document | list[Document]
    ) -> list[DocumentWithEmbedding]:
        """Generate embeddings from text.

        Args:
            text (str | list[str] | Document | list[Document]): text to
                generate embeddings from

        Returns:
            list[DocumentWithEmbedding]: embeddings

        Raises:
            requests.HTTPError: if the endpoint responds with an error status.
        """
        if not isinstance(text, list):
            text = [text]

        outputs = []

        # One request per item; the endpoint is assumed to accept a single
        # string under "input" — TODO confirm it does not support batching.
        for item in text:
            response = requests.post(self.endpoint_url, json={"input": str(item)})
            # ROBUSTNESS FIX: fail fast with a clear HTTPError instead of the
            # original's opaque KeyError when the endpoint returns an error
            # body without "data"/"usage" keys.
            response.raise_for_status()
            payload = response.json()
            outputs.append(
                DocumentWithEmbedding(
                    text=str(item),
                    embedding=payload["data"][0]["embedding"],
                    total_tokens=payload["usage"]["total_tokens"],
                    prompt_tokens=payload["usage"]["prompt_tokens"],
                )
            )

        return outputs
Higher values use more memory, but are faster", 30 | ) 31 | parallel: Optional[int] = Param( 32 | None, 33 | help=( 34 | "Number of threads to use for embeddings. " 35 | "If > 1, data-parallel encoding will be used. " 36 | "If 0, use all available CPUs. " 37 | "If None, use default onnxruntime threading. " 38 | "Defaults to None." 39 | ), 40 | ) 41 | 42 | @Param.auto() 43 | def client_(self) -> "TextEmbedding": 44 | try: 45 | from fastembed import TextEmbedding 46 | except ImportError: 47 | raise ImportError("Please install FastEmbed: `pip install fastembed`") 48 | 49 | return TextEmbedding(model_name=self.model_name) 50 | 51 | def invoke( 52 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 53 | ) -> list[DocumentWithEmbedding]: 54 | input_ = self.prepare_input(text) 55 | embeddings = self.client_.embed( 56 | [_.content for _ in input_], 57 | batch_size=self.batch_size, 58 | parallel=self.parallel, 59 | ) 60 | return [ 61 | DocumentWithEmbedding( 62 | content=doc, 63 | embedding=list(embedding), 64 | ) 65 | for doc, embedding in zip(input_, embeddings) 66 | ] 67 | 68 | async def ainvoke( 69 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 70 | ) -> list[DocumentWithEmbedding]: 71 | """Fastembed does not support async API.""" 72 | return self.invoke(text, *args, **kwargs) 73 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/embeddings/voyageai.py: -------------------------------------------------------------------------------- 1 | """Implements embeddings from [Voyage AI](https://voyageai.com). 
2 | """ 3 | 4 | import importlib 5 | 6 | from kotaemon.base import Document, DocumentWithEmbedding, Param 7 | 8 | from .base import BaseEmbeddings 9 | 10 | vo = None 11 | 12 | 13 | def _import_voyageai(): 14 | global vo 15 | if not vo: 16 | vo = importlib.import_module("voyageai") 17 | return vo 18 | 19 | 20 | def _format_output(texts: list[str], embeddings: list[list]): 21 | """Formats the output of all `.embed` calls. 22 | Args: 23 | texts: List of original documents 24 | embeddings: Embeddings corresponding to each document 25 | """ 26 | return [ 27 | DocumentWithEmbedding(content=text, embedding=embedding) 28 | for text, embedding in zip(texts, embeddings) 29 | ] 30 | 31 | 32 | class VoyageAIEmbeddings(BaseEmbeddings): 33 | """Voyage AI provides best-in-class embedding models and rerankers.""" 34 | 35 | api_key: str = Param(None, help="Voyage API key", required=False) 36 | model: str = Param( 37 | "voyage-3", 38 | help=( 39 | "Model name to use. The Voyage " 40 | "[documentation](https://docs.voyageai.com/docs/embeddings) " 41 | "provides a list of all available embedding models." 
42 | ), 43 | required=True, 44 | ) 45 | 46 | def __init__(self, *args, **kwargs): 47 | super().__init__(*args, **kwargs) 48 | if not self.api_key: 49 | raise ValueError("API key must be provided for VoyageAIEmbeddings.") 50 | 51 | self._client = _import_voyageai().Client(api_key=self.api_key) 52 | self._aclient = _import_voyageai().AsyncClient(api_key=self.api_key) 53 | 54 | def invoke( 55 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 56 | ) -> list[DocumentWithEmbedding]: 57 | texts = [t.content for t in self.prepare_input(text)] 58 | embeddings = self._client.embed(texts, model=self.model).embeddings 59 | return _format_output(texts, embeddings) 60 | 61 | async def ainvoke( 62 | self, text: str | list[str] | Document | list[Document], *args, **kwargs 63 | ) -> list[DocumentWithEmbedding]: 64 | texts = [t.content for t in self.prepare_input(text)] 65 | embeddings = await self._aclient.embed(texts, model=self.model).embeddings 66 | return _format_output(texts, embeddings) 67 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/__init__.py: -------------------------------------------------------------------------------- 1 | from .vectorindex import VectorIndexing, VectorRetrieval 2 | 3 | __all__ = ["VectorIndexing", "VectorRetrieval"] 4 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .doc_parsers import BaseDocParser, SummaryExtractor, TitleExtractor 2 | 3 | __all__ = [ 4 | "BaseDocParser", 5 | "TitleExtractor", 6 | "SummaryExtractor", 7 | ] 8 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/extractors/doc_parsers.py: -------------------------------------------------------------------------------- 1 | from ..base import 
class SummaryExtractor(LlamaIndexDocTransformerMixin, BaseDocParser):
    """Extract summaries from documents via llama-index's SummaryExtractor."""

    def __init__(
        self,
        llm=None,
        summaries: "list[str] | None" = None,
        **params,
    ):
        """
        Args:
            llm: LLM used by the underlying llama-index extractor.
            summaries: which summaries to build (e.g. ["self"]). Defaults to
                ["self"] when not given.
            **params: forwarded to the llama-index SummaryExtractor.
        """
        # BUG FIX: the original used a mutable default argument
        # (`summaries: list[str] = ["self"]`); use a None sentinel instead.
        # Callers that passed `summaries` explicitly see identical behavior.
        if summaries is None:
            summaries = ["self"]
        super().__init__(llm=llm, summaries=summaries, **params)

    def _get_li_class(self):
        # Imported lazily so llama-index is only required when actually used.
        from llama_index.core.extractors import SummaryExtractor

        return SummaryExtractor
class CohereReranking(BaseReranking):
    # Cohere rerank model to call.
    # NOTE(review): "rerank-multilingual-v2.0" is an older model id — confirm
    # it is still served before relying on it.
    model_name: str = "rerank-multilingual-v2.0"
    # API key, read from the COHERE_API_KEY env var by default.
    cohere_api_key: str = config("COHERE_API_KEY", "")
    # When True and no key is set, fall back to the key configured in ktem.
    use_key_from_ktem: bool = False

    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score"""
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Please install Cohere `pip install cohere` to use Cohere Reranking"
            )

        # try to get COHERE_API_KEY from embeddings
        # Best-effort fallback: the broad except is deliberate — any failure
        # here (ktem not installed, no "cohere" model, missing kwarg) just
        # leaves the key empty and is reported below.
        if not self.cohere_api_key and self.use_key_from_ktem:
            try:
                from ktem.embeddings.manager import (
                    embedding_models_manager as embeddings,
                )

                cohere_model = embeddings.get("cohere")
                ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore
                    "cohere_api_key"
                )
                # "your-key" is the ktem placeholder value, not a real key.
                if ktem_cohere_api_key != "your-key":
                    self.cohere_api_key = ktem_cohere_api_key
            except Exception as e:
                print("Cannot get Cohere API key from `ktem`", e)

        # Without a key, reranking is silently skipped and the input order
        # is preserved (graceful degradation rather than an error).
        if not self.cohere_api_key:
            print("Cohere API key not found. Skipping rerankings.")
            return documents

        cohere_client = cohere.Client(self.cohere_api_key)
        compressed_docs: list[Document] = []

        if not documents:  # to avoid empty api call
            return compressed_docs

        _docs = [d.content for d in documents]
        response = cohere_client.rerank(
            model=self.model_name, query=query, documents=_docs
        )
        # Results come back ordered by relevance; r.index maps each result
        # back to its position in the original `documents` list.
        for r in response.results:
            doc = documents[r.index]
            doc.metadata["reranking_score"] = r.relevance_score
            compressed_docs.append(doc)

        return compressed_docs
class LLMReranking(BaseReranking):
    """Filter documents by asking an LLM a YES/NO relevance question per doc."""

    llm: BaseLLM
    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)
    # Fallback size: if the LLM marks nothing relevant, return the first
    # top_k documents instead of an empty list.
    top_k: int = 3
    # Run the per-document LLM calls in a thread pool when True.
    concurrent: bool = True

    def run(
        self,
        documents: list[Document],
        query: str,
    ) -> list[Document]:
        """Filter down documents based on their relevance to the query."""
        filtered_docs = []
        output_parser = BooleanOutputParser()

        if self.concurrent:
            with ThreadPoolExecutor() as executor:
                futures = []
                for doc in documents:
                    _prompt = self.prompt_template.populate(
                        question=query, context=doc.get_content()
                    )
                    # BUG FIX: the original submitted
                    # `lambda: self.llm(_prompt).text`, a late-binding closure —
                    # by the time a worker ran, `_prompt` could already have
                    # been rebound to a later document's prompt. Pass the
                    # prompt as an explicit argument instead.
                    futures.append(executor.submit(self.llm, _prompt))

                results = [future.result().text for future in futures]
        else:
            results = []
            for doc in documents:
                _prompt = self.prompt_template.populate(
                    question=query, context=doc.get_content()
                )
                results.append(self.llm(_prompt).text)

        # use Boolean parser to extract relevancy output from LLM
        results = [output_parser.parse(result) for result in results]
        for include_doc, doc in zip(results, documents):
            if include_doc:
                filtered_docs.append(doc)

        # prevent returning empty result
        if len(filtered_docs) == 0:
            filtered_docs = documents[: self.top_k]

        return filtered_docs
class LLMScoring(LLMReranking):
    """Variant of LLMReranking that records a per-document relevance score."""

    def run(
        self,
        documents: list[Document],
        query: str,
    ) -> list[Document]:
        """Filter down documents based on their relevance to the query."""
        filtered_docs: list[Document] = []
        output_parser = BooleanOutputParser()

        if self.concurrent:
            with ThreadPoolExecutor() as executor:
                futures = []
                for doc in documents:
                    _prompt = self.prompt_template.populate(
                        question=query, context=doc.get_content()
                    )
                    # BUG FIX: the original submitted `lambda: self.llm(_prompt)`,
                    # a late-binding closure — workers could all evaluate the
                    # last document's prompt. Pass the prompt as an explicit
                    # argument instead.
                    futures.append(executor.submit(self.llm, _prompt))

                results = [future.result() for future in futures]
        else:
            results = []
            for doc in documents:
                _prompt = self.prompt_template.populate(
                    question=query, context=doc.get_content()
                )
                results.append(self.llm(_prompt))

        for result, doc in zip(results, documents):
            # Probability of the parsed answer, from the mean token logprob;
            # flipped (1 - score) when the LLM answered "NO".
            score = np.exp(np.average(result.logprobs))
            include_doc = output_parser.parse(result.text)
            if include_doc:
                doc.metadata["llm_reranking_score"] = score
            else:
                doc.metadata["llm_reranking_score"] = 1 - score
            # NOTE(review): every document is appended regardless of the
            # YES/NO answer — this scores rather than filters, despite the
            # docstring. Preserved as-is; confirm intended.
            filtered_docs.append(doc)

        # prevent returning empty result
        if len(filtered_docs) == 0:
            filtered_docs = documents[: self.top_k]

        return filtered_docs
class WebSearch(BaseComponent):
    """WebSearch component for fetching data from the web
    using Jina API
    """

    def run(
        self,
        text: str,
        *args,
        **kwargs,
    ) -> list[RetrievedDocument]:
        """Search the web for ``text`` and return each hit as a document.

        Raises:
            ValueError: if no JINA_API_KEY is configured.
            requests.HTTPError: if the Jina endpoint returns an error status.
        """
        from urllib.parse import quote

        if JINA_API_KEY == "":
            raise ValueError(
                "This feature requires JINA_API_KEY "
                "(get free one from https://jina.ai/reader)"
            )

        # setup the request; percent-encode the query so spaces and reserved
        # characters (?, #, &) survive being embedded in the URL path
        api_url = f"https://s.jina.ai/{quote(text)}"
        headers = {
            "X-With-Generated-Alt": "true",
            "Accept": "application/json",
            # the key is guaranteed non-empty by the guard above
            "Authorization": f"Bearer {JINA_API_KEY}",
        }

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        response_dict = response.json()

        return [
            RetrievedDocument(
                text=(
                    "###URL: [{url}]({url})\n\n"
                    "####{title}\n\n"
                    "{description}\n"
                    "{content}"
                ).format(
                    url=item["url"],
                    title=item["title"],
                    description=item["description"],
                    content=item["content"],
                ),
                metadata={
                    "file_name": "Web search",
                    "type": "table",
                    "llm_trulens_score": 1.0,
                },
            )
            for item in response_dict["data"]
        ]

    def generate_relevant_scores(self, text, documents: list[RetrievedDocument]):
        """Web results carry a fixed score in metadata; return them unchanged."""
        return documents
""" 12 | 13 | def run( 14 | self, 15 | text: str, 16 | *args, 17 | **kwargs, 18 | ) -> list[RetrievedDocument]: 19 | if TAVILY_API_KEY == "": 20 | raise ValueError( 21 | "This feature requires TAVILY_API_KEY " 22 | "(get free one from https://app.tavily.com/)" 23 | ) 24 | 25 | try: 26 | from tavily import TavilyClient 27 | except ImportError: 28 | raise ImportError( 29 | "Please install `pip install tavily-python` to use this feature" 30 | ) 31 | 32 | tavily_client = TavilyClient(api_key=TAVILY_API_KEY) 33 | results = tavily_client.search( 34 | query=text, 35 | search_depth="advanced", 36 | )["results"] 37 | context = "\n\n".join( 38 | "###URL: [{url}]({url})\n\n{content}".format( 39 | url=result["url"], 40 | content=result["content"], 41 | ) 42 | for result in results 43 | ) 44 | 45 | return [ 46 | RetrievedDocument( 47 | text=context, 48 | metadata={ 49 | "file_name": "Web search", 50 | "type": "table", 51 | "llm_trulens_score": 1.0, 52 | }, 53 | ) 54 | ] 55 | 56 | def generate_relevant_scores(self, text, documents: list[RetrievedDocument]): 57 | return documents 58 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/indices/splitters/__init__.py: -------------------------------------------------------------------------------- 1 | from ..base import DocTransformer, LlamaIndexDocTransformerMixin 2 | 3 | 4 | class BaseSplitter(DocTransformer): 5 | """Represent base splitter class""" 6 | 7 | ... 
class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    """Split documents into fixed-size token chunks.

    Thin wrapper around llama-index's ``TokenTextSplitter`` via
    ``LlamaIndexDocTransformerMixin``.
    """

    def __init__(
        self,
        chunk_size: int = 1024,
        chunk_overlap: int = 20,
        separator: str = " ",
        **params,
    ):
        # All arguments are forwarded verbatim to the wrapped
        # llama-index splitter.
        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator=separator,
            **params,
        )

    def _get_li_class(self):
        # Imported lazily so llama-index is only required when this
        # splitter is actually used.
        from llama_index.core.text_splitter import TokenTextSplitter

        return TokenTextSplitter
class BaseLLM(BaseComponent):
    """Abstract interface every LLM (chat or completion) must implement.

    Subclasses override the sync/async/streaming entry points; ``run``
    delegates to ``invoke`` so the component framework has one uniform
    call path.
    """

    def to_langchain_format(self) -> BaseLanguageModel:
        """Return an equivalent langchain model object for interop."""
        raise NotImplementedError

    def invoke(self, *args, **kwargs) -> LLMInterface:
        """Synchronously generate a single response."""
        raise NotImplementedError

    async def ainvoke(self, *args, **kwargs) -> LLMInterface:
        """Asynchronously generate a single response."""
        raise NotImplementedError

    def stream(self, *args, **kwargs) -> Iterator[LLMInterface]:
        """Synchronously generate a response as a stream of chunks."""
        raise NotImplementedError

    def astream(self, *args, **kwargs) -> AsyncGenerator[LLMInterface, None]:
        """Asynchronously generate a response as a stream of chunks."""
        raise NotImplementedError

    def run(self, *args, **kwargs):
        """Component entry point; delegates to ``invoke``."""
        return self.invoke(*args, **kwargs)
class ChatLLM(BaseLLM):
    def flow(self):
        """Run the connected inflow component and feed its text output
        into this chat model.

        Returns:
            The chat model's response to the inflow's text.

        Raises:
            ValueError: if no inflow is connected, or the inflow is not a
                BaseComponent.
        """
        if self.inflow is None:
            raise ValueError("No inflow provided.")

        if not isinstance(self.inflow, BaseComponent):
            raise ValueError(
                f"inflow must be a BaseComponent, found {type(self.inflow)}"
            )

        # run the upstream component and forward only its text payload
        text = self.inflow.flow().text
        return self.__call__(text)
"LCCompletionMixin", "LlamaCpp"] 5 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/llms/completions/base.py: -------------------------------------------------------------------------------- 1 | from kotaemon.llms.base import BaseLLM 2 | 3 | 4 | class LLM(BaseLLM): 5 | pass 6 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/llms/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BasePromptComponent 2 | from .template import PromptTemplate 3 | 4 | __all__ = ["BasePromptComponent", "PromptTemplate"] 5 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .adobe_loader import AdobeReader 2 | from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader 3 | from .base import AutoReader, BaseReader 4 | from .composite_loader import DirectoryReader 5 | from .docling_loader import DoclingReader 6 | from .docx_loader import DocxReader 7 | from .excel_loader import ExcelReader, PandasExcelReader 8 | from .html_loader import HtmlReader, MhtmlReader 9 | from .mathpix_loader import MathpixPDFReader 10 | from .ocr_loader import ImageReader, OCRReader 11 | from .pdf_loader import PDFThumbnailReader 12 | from .txt_loader import TxtReader 13 | from .unstructured_loader import UnstructuredReader 14 | from .web_loader import WebReader 15 | 16 | __all__ = [ 17 | "AutoReader", 18 | "AzureAIDocumentIntelligenceLoader", 19 | "BaseReader", 20 | "PandasExcelReader", 21 | "ExcelReader", 22 | "MathpixPDFReader", 23 | "ImageReader", 24 | "OCRReader", 25 | "DirectoryReader", 26 | "UnstructuredReader", 27 | "DocxReader", 28 | "HtmlReader", 29 | "MhtmlReader", 30 | "AdobeReader", 31 | "TxtReader", 32 | "PDFThumbnailReader", 33 
| "WebReader", 34 | "DoclingReader", 35 | ] 36 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/composite_loader.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Optional, Type 2 | 3 | from llama_index.core.readers.base import BaseReader as LIBaseReader 4 | 5 | from .base import BaseReader, LIReaderMixin 6 | 7 | 8 | class DirectoryReader(LIReaderMixin, BaseReader): 9 | """Wrap around llama-index SimpleDirectoryReader 10 | 11 | Args: 12 | input_dir (str): Path to the directory. 13 | input_files (List): List of file paths to read 14 | (Optional; overrides input_dir, exclude) 15 | exclude (List): glob of python file paths to exclude (Optional) 16 | exclude_hidden (bool): Whether to exclude hidden files (dotfiles). 17 | encoding (str): Encoding of the files. 18 | Default is utf-8. 19 | errors (str): how encoding and decoding errors are to be handled, 20 | see https://docs.python.org/3/library/functions.html#open 21 | recursive (bool): Whether to recursively search in subdirectories. 22 | False by default. 23 | filename_as_id (bool): Whether to use the filename as the document id. 24 | False by default. 25 | required_exts (Optional[List[str]]): List of required extensions. 26 | Default is None. 27 | file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file 28 | extension to a BaseReader class that specifies how to convert that file 29 | to text. If not specified, use default from DEFAULT_FILE_READER_CLS. 30 | num_files_limit (Optional[int]): Maximum number of files to read. 31 | Default is None. 32 | file_metadata (Optional[Callable[str, Dict]]): A function that takes 33 | in a filename and returns a Dict of metadata for the Document. 34 | Default is None. 
35 | """ 36 | 37 | input_dir: Optional[str] = None 38 | input_files: Optional[List] = None 39 | exclude: Optional[List] = None 40 | exclude_hidden: bool = True 41 | errors: str = "ignore" 42 | recursive: bool = False 43 | encoding: str = "utf-8" 44 | filename_as_id: bool = False 45 | required_exts: Optional[list[str]] = None 46 | file_extractor: Optional[dict[str, "LIBaseReader"]] = None 47 | num_files_limit: Optional[int] = None 48 | file_metadata: Optional[Callable[[str], dict]] = None 49 | 50 | def _get_wrapped_class(self) -> Type["LIBaseReader"]: 51 | from llama_index.core import SimpleDirectoryReader 52 | 53 | return SimpleDirectoryReader 54 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/txt_loader.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from kotaemon.base import Document 5 | 6 | from .base import BaseReader 7 | 8 | 9 | class TxtReader(BaseReader): 10 | def run( 11 | self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs 12 | ) -> list[Document]: 13 | return self.load_data(Path(file_path), extra_info=extra_info, **kwargs) 14 | 15 | def load_data( 16 | self, file_path: Path, extra_info: Optional[dict] = None, **kwargs 17 | ) -> list[Document]: 18 | with open(file_path, "r", encoding="utf-8") as f: 19 | text = f.read() 20 | 21 | metadata = extra_info or {} 22 | return [Document(text=text, metadata=metadata)] 23 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/loaders/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/kotaemon/loaders/utils/__init__.py -------------------------------------------------------------------------------- 
class WebReader(BaseReader):
    """Load a web page as text via the Jina reader API."""

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Component entry point; ``file_path`` is actually a URL.

        Pass the URL through as-is: wrapping it in ``Path`` would collapse
        the ``//`` after the scheme ("https://" -> "https:/").
        """
        return self.load_data(file_path, extra_info=extra_info, **kwargs)

    def fetch_url(self, url: str):
        """Fetch ``url`` through the Jina reader endpoint and return its text.

        Raises:
            requests.HTTPError: if the reader endpoint returns an error status.
        """
        # setup the request; use the configurable JINA_URL endpoint defined
        # above instead of hard-coding it
        api_url = f"{JINA_URL}{url}"
        headers = {
            "X-With-Links-Summary": "true",
        }
        if JINA_API_KEY:
            headers["Authorization"] = f"Bearer {JINA_API_KEY}"

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()

        data = response.text
        return data

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Fetch the URL given as ``file_path`` and wrap it in a Document."""
        file_path = str(file_path)
        output = self.fetch_url(file_path)
        metadata = extra_info or {}

        return [Document(text=output, metadata=metadata)]
CohereReranking 3 | from .tei_fast_rerank import TeiFastReranking 4 | from .voyageai import VoyageAIReranking 5 | 6 | __all__ = ["BaseReranking", "TeiFastReranking", "CohereReranking", "VoyageAIReranking"] 7 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/rerankings/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import abstractmethod 4 | 5 | from kotaemon.base import BaseComponent, Document 6 | 7 | 8 | class BaseReranking(BaseComponent): 9 | @abstractmethod 10 | def run(self, documents: list[Document], query: str) -> list[Document]: 11 | """Main method to transform list of documents 12 | (re-ranking, filtering, etc)""" 13 | ... 14 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/rerankings/cohere.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from decouple import config 4 | 5 | from kotaemon.base import Document, Param 6 | 7 | from .base import BaseReranking 8 | 9 | 10 | class CohereReranking(BaseReranking): 11 | """Cohere Reranking model""" 12 | 13 | model_name: str = Param( 14 | "rerank-multilingual-v2.0", 15 | help=( 16 | "ID of the model to use. 
    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score"""
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Please install Cohere " "`pip install cohere` to use Cohere Reranking"
            )

        # Skip (rather than crash) when the key is empty or still contains the
        # unfilled "COHERE_API_KEY" placeholder text from the env template.
        if not self.cohere_api_key or "COHERE_API_KEY" in self.cohere_api_key:
            print("Cohere API key not found. Skipping rerankings.")
            return documents

        cohere_client = cohere.Client(self.cohere_api_key)
        compressed_docs: list[Document] = []

        if not documents:  # to avoid empty api call
            return compressed_docs

        _docs = [d.content for d in documents]
        response = cohere_client.rerank(
            model=self.model_name, query=query, documents=_docs
        )
        # results come back ordered most- to least-relevant; r.index points
        # back into the original `documents` list
        for r in response.results:
            doc = documents[r.index]
            doc.metadata["reranking_score"] = r.relevance_score
            compressed_docs.append(doc)

        return compressed_docs
27 | "ID of the model to use. You can go to [Supported Models]" 28 | "(https://docs.voyageai.com/docs/reranker) to see the supported models" 29 | ), 30 | required=True, 31 | ) 32 | api_key: str = Param( 33 | config("VOYAGE_API_KEY", ""), 34 | help="VoyageAI API key", 35 | required=True, 36 | ) 37 | 38 | def __init__(self, *args, **kwargs): 39 | super().__init__(*args, **kwargs) 40 | if not self.api_key: 41 | raise ValueError("API key must be provided for VoyageAIEmbeddings.") 42 | 43 | self._client = _import_voyageai().Client(api_key=self.api_key) 44 | self._aclient = _import_voyageai().AsyncClient(api_key=self.api_key) 45 | 46 | def run(self, documents: list[Document], query: str) -> list[Document]: 47 | """Use VoyageAI Reranker model to re-order documents 48 | with their relevance score""" 49 | compressed_docs: list[Document] = [] 50 | 51 | if not documents: # to avoid empty api call 52 | return compressed_docs 53 | 54 | _docs = [d.content for d in documents] 55 | response = self._client.rerank( 56 | model=self.model_name, query=query, documents=_docs 57 | ) 58 | for r in response.results: 59 | doc = documents[r.index] 60 | doc.metadata["reranking_score"] = r.relevance_score 61 | compressed_docs.append(doc) 62 | 63 | return compressed_docs 64 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/__init__.py: -------------------------------------------------------------------------------- 1 | from .docstores import ( 2 | BaseDocumentStore, 3 | ElasticsearchDocumentStore, 4 | InMemoryDocumentStore, 5 | LanceDBDocumentStore, 6 | SimpleFileDocumentStore, 7 | ) 8 | from .vectorstores import ( 9 | BaseVectorStore, 10 | ChromaVectorStore, 11 | InMemoryVectorStore, 12 | LanceDBVectorStore, 13 | MilvusVectorStore, 14 | QdrantVectorStore, 15 | SimpleFileVectorStore, 16 | ) 17 | 18 | __all__ = [ 19 | # Document stores 20 | "BaseDocumentStore", 21 | "InMemoryDocumentStore", 22 | "ElasticsearchDocumentStore", 23 | 
"SimpleFileDocumentStore", 24 | "LanceDBDocumentStore", 25 | # Vector stores 26 | "BaseVectorStore", 27 | "ChromaVectorStore", 28 | "InMemoryVectorStore", 29 | "SimpleFileVectorStore", 30 | "LanceDBVectorStore", 31 | "MilvusVectorStore", 32 | "QdrantVectorStore", 33 | ] 34 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/docstores/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseDocumentStore 2 | from .elasticsearch import ElasticsearchDocumentStore 3 | from .in_memory import InMemoryDocumentStore 4 | from .lancedb import LanceDBDocumentStore 5 | from .simple_file import SimpleFileDocumentStore 6 | 7 | __all__ = [ 8 | "BaseDocumentStore", 9 | "InMemoryDocumentStore", 10 | "ElasticsearchDocumentStore", 11 | "SimpleFileDocumentStore", 12 | "LanceDBDocumentStore", 13 | ] 14 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/docstores/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Optional, Union 3 | 4 | from kotaemon.base import Document 5 | 6 | 7 | class BaseDocumentStore(ABC): 8 | """A document store is in charged of storing and managing documents""" 9 | 10 | @abstractmethod 11 | def __init__(self, *args, **kwargs): 12 | ... 13 | 14 | @abstractmethod 15 | def add( 16 | self, 17 | docs: Union[Document, List[Document]], 18 | ids: Optional[Union[List[str], str]] = None, 19 | **kwargs, 20 | ): 21 | """Add document into document store 22 | 23 | Args: 24 | docs: Document or list of documents 25 | ids: List of ids of the documents. Optional, if not set will use doc.doc_id 26 | """ 27 | ... 28 | 29 | @abstractmethod 30 | def get(self, ids: Union[List[str], str]) -> List[Document]: 31 | """Get document by id""" 32 | ... 
33 | 34 | @abstractmethod 35 | def get_all(self) -> List[Document]: 36 | """Get all documents""" 37 | ... 38 | 39 | @abstractmethod 40 | def count(self) -> int: 41 | """Count number of documents""" 42 | ... 43 | 44 | @abstractmethod 45 | def query( 46 | self, query: str, top_k: int = 10, doc_ids: Optional[list] = None 47 | ) -> List[Document]: 48 | """Search document store using search query""" 49 | ... 50 | 51 | @abstractmethod 52 | def delete(self, ids: Union[List[str], str]): 53 | """Delete document by id""" 54 | ... 55 | 56 | @abstractmethod 57 | def drop(self): 58 | """Drop the document store""" 59 | ... 60 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/docstores/simple_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List, Optional, Union 3 | 4 | from kotaemon.base import Document 5 | 6 | from .in_memory import InMemoryDocumentStore 7 | 8 | 9 | class SimpleFileDocumentStore(InMemoryDocumentStore): 10 | """Improve InMemoryDocumentStore by auto saving whenever the corpus is changed""" 11 | 12 | def __init__(self, path: str | Path, collection_name: str = "default"): 13 | super().__init__() 14 | self._path = path 15 | self._collection_name = collection_name 16 | 17 | Path(path).mkdir(parents=True, exist_ok=True) 18 | self._save_path = Path(path) / f"{collection_name}.json" 19 | if self._save_path.is_file(): 20 | self.load(self._save_path) 21 | 22 | def get(self, ids: Union[List[str], str]) -> List[Document]: 23 | """Get document by id""" 24 | if not isinstance(ids, list): 25 | ids = [ids] 26 | 27 | for doc_id in ids: 28 | if doc_id not in self._store: 29 | self.load(self._save_path) 30 | break 31 | 32 | return [self._store[doc_id] for doc_id in ids] 33 | 34 | def add( 35 | self, 36 | docs: Union[Document, List[Document]], 37 | ids: Optional[Union[List[str], str]] = None, 38 | **kwargs, 39 | ): 40 | 
"""Add document into document store 41 | 42 | Args: 43 | docs: list of documents to add 44 | ids: specify the ids of documents to add or 45 | use existing doc.doc_id 46 | exist_ok: raise error when duplicate doc-id 47 | found in the docstore (default to False) 48 | """ 49 | super().add(docs=docs, ids=ids, **kwargs) 50 | self.save(self._save_path) 51 | 52 | def delete(self, ids: Union[List[str], str]): 53 | """Delete document by id""" 54 | super().delete(ids=ids) 55 | self.save(self._save_path) 56 | 57 | def drop(self): 58 | """Drop the document store""" 59 | super().drop() 60 | self._save_path.unlink(missing_ok=True) 61 | 62 | def __persist_flow__(self): 63 | from theflow.utils.modules import serialize 64 | 65 | return { 66 | "path": serialize(self._path), 67 | "collection_name": self._collection_name, 68 | } 69 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseVectorStore 2 | from .chroma import ChromaVectorStore 3 | from .in_memory import InMemoryVectorStore 4 | from .lancedb import LanceDBVectorStore 5 | from .milvus import MilvusVectorStore 6 | from .qdrant import QdrantVectorStore 7 | from .simple_file import SimpleFileVectorStore 8 | 9 | __all__ = [ 10 | "BaseVectorStore", 11 | "ChromaVectorStore", 12 | "InMemoryVectorStore", 13 | "SimpleFileVectorStore", 14 | "LanceDBVectorStore", 15 | "MilvusVectorStore", 16 | "QdrantVectorStore", 17 | ] 18 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py: -------------------------------------------------------------------------------- 1 | """Simple vector store index.""" 2 | from typing import Any, Optional, Type 3 | 4 | import fsspec 5 | from llama_index.core.vector_stores import SimpleVectorStore as LISimpleVectorStore 6 | from 
llama_index.core.vector_stores.simple import SimpleVectorStoreData 7 | 8 | from .base import LlamaIndexVectorStore 9 | 10 | 11 | class InMemoryVectorStore(LlamaIndexVectorStore): 12 | _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore 13 | store_text: bool = False 14 | 15 | def __init__( 16 | self, 17 | data: Optional[SimpleVectorStoreData] = None, 18 | fs: Optional[fsspec.AbstractFileSystem] = None, 19 | **kwargs: Any, 20 | ) -> None: 21 | """Initialize params.""" 22 | self._data = data or SimpleVectorStoreData() 23 | self._fs = fs or fsspec.filesystem("file") 24 | 25 | super().__init__( 26 | data=data, 27 | fs=fs, 28 | **kwargs, 29 | ) 30 | 31 | def save( 32 | self, 33 | save_path: str, 34 | fs: Optional[fsspec.AbstractFileSystem] = None, 35 | **kwargs, 36 | ): 37 | 38 | """save a simpleVectorStore to a dictionary. 39 | 40 | Args: 41 | save_path: Path of saving vector to disk. 42 | fs: An abstract super-class for pythonic file-systems 43 | """ 44 | self._client.persist(persist_path=save_path, fs=fs) 45 | 46 | def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None): 47 | 48 | """Create a SimpleKVStore from a load directory. 49 | 50 | Args: 51 | load_path: Path of loading vector. 
52 | fs: An abstract super-class for pythonic file-systems 53 | """ 54 | self._client = self._client.from_persist_path(persist_path=load_path, fs=fs) 55 | 56 | def drop(self): 57 | """Clear the old data""" 58 | self._data = SimpleVectorStoreData() 59 | 60 | def __persist_flow__(self): 61 | d = self._data.to_dict() 62 | d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}" 63 | return { 64 | "data": d, 65 | # "fs": self._fs, 66 | } 67 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, cast 2 | 3 | from .base import LlamaIndexVectorStore 4 | 5 | 6 | class QdrantVectorStore(LlamaIndexVectorStore): 7 | _li_class = None 8 | 9 | def _get_li_class(self): 10 | try: 11 | from llama_index.vector_stores.qdrant import ( 12 | QdrantVectorStore as LIQdrantVectorStore, 13 | ) 14 | except ImportError: 15 | raise ImportError( 16 | "Please install missing package: " 17 | "'pip install llama-index-vector-stores-qdrant'" 18 | ) 19 | 20 | return LIQdrantVectorStore 21 | 22 | def __init__( 23 | self, 24 | collection_name, 25 | url: Optional[str] = None, 26 | api_key: Optional[str] = None, 27 | client_kwargs: Optional[dict] = None, 28 | **kwargs: Any, 29 | ): 30 | self._collection_name = collection_name 31 | self._url = url 32 | self._api_key = api_key 33 | self._client_kwargs = client_kwargs 34 | self._kwargs = kwargs 35 | 36 | super().__init__( 37 | collection_name=collection_name, 38 | url=url, 39 | api_key=api_key, 40 | client_kwargs=client_kwargs, 41 | **kwargs, 42 | ) 43 | from llama_index.vector_stores.qdrant import ( 44 | QdrantVectorStore as LIQdrantVectorStore, 45 | ) 46 | 47 | self._client = cast(LIQdrantVectorStore, self._client) 48 | 49 | def delete(self, ids: List[str], **kwargs): 50 | """Delete vector embeddings from vector stores 51 | 52 
| Args: 53 | ids: List of ids of the embeddings to be deleted 54 | kwargs: meant for vectorstore-specific parameters 55 | """ 56 | from qdrant_client import models 57 | 58 | self._client.client.delete( 59 | collection_name=self._collection_name, 60 | points_selector=models.PointIdsList( 61 | points=ids, 62 | ), 63 | **kwargs, 64 | ) 65 | 66 | def drop(self): 67 | """Delete entire collection from vector stores""" 68 | self._client.client.delete_collection(self._collection_name) 69 | 70 | def count(self) -> int: 71 | return self._client.client.count( 72 | collection_name=self._collection_name, exact=True 73 | ).count 74 | 75 | def __persist_flow__(self): 76 | return { 77 | "collection_name": self._collection_name, 78 | "url": self._url, 79 | "api_key": self._api_key, 80 | "client_kwargs": self._client_kwargs, 81 | **self._kwargs, 82 | } 83 | -------------------------------------------------------------------------------- /libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py: -------------------------------------------------------------------------------- 1 | """Simple file vector store index.""" 2 | from pathlib import Path 3 | from typing import Any, Optional, Type 4 | 5 | import fsspec 6 | from llama_index.core.vector_stores import SimpleVectorStore as LISimpleVectorStore 7 | from llama_index.core.vector_stores.simple import SimpleVectorStoreData 8 | 9 | from kotaemon.base import DocumentWithEmbedding 10 | 11 | from .base import LlamaIndexVectorStore 12 | 13 | 14 | class SimpleFileVectorStore(LlamaIndexVectorStore): 15 | """Similar to InMemoryVectorStore but is backed by file by default""" 16 | 17 | _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore 18 | store_text: bool = False 19 | 20 | def __init__( 21 | self, 22 | path: str | Path, 23 | collection_name: str = "default", 24 | data: Optional[SimpleVectorStoreData] = None, 25 | fs: Optional[fsspec.AbstractFileSystem] = None, 26 | **kwargs: Any, 27 | ) -> None: 28 | """Initialize params.""" 29 | 
self._data = data or SimpleVectorStoreData() 30 | self._fs = fs or fsspec.filesystem("file") 31 | self._collection_name = collection_name 32 | self._path = path 33 | self._save_path = Path(path) / collection_name 34 | 35 | super().__init__( 36 | data=data, 37 | fs=fs, 38 | **kwargs, 39 | ) 40 | 41 | if self._save_path.is_file(): 42 | self._client = self._li_class.from_persist_path( 43 | persist_path=str(self._save_path), fs=self._fs 44 | ) 45 | 46 | def add( 47 | self, 48 | embeddings: list[list[float]] | list[DocumentWithEmbedding], 49 | metadatas: Optional[list[dict]] = None, 50 | ids: Optional[list[str]] = None, 51 | ): 52 | r = super().add(embeddings, metadatas, ids) 53 | self._client.persist(str(self._save_path), self._fs) 54 | return r 55 | 56 | def delete(self, ids: list[str], **kwargs): 57 | r = super().delete(ids, **kwargs) 58 | self._client.persist(str(self._save_path), self._fs) 59 | return r 60 | 61 | def drop(self): 62 | self._data = SimpleVectorStoreData() 63 | self._save_path.unlink(missing_ok=True) 64 | 65 | def __persist_flow__(self): 66 | d = self._data.to_dict() 67 | d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}" 68 | return { 69 | "data": d, 70 | "collection_name": self._collection_name, 71 | "path": str(self._path), 72 | # "fs": self._fs, 73 | } 74 | -------------------------------------------------------------------------------- /libs/kotaemon/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 7.4.0 3 | testpaths = tests 4 | addopts = -ra -q 5 | log_cli=true 6 | log_level=WARNING 7 | log_format = %(asctime)s %(levelname)s %(message)s 8 | log_date_format = %Y-%m-%d %H:%M:%S 9 | log_file = logs/pytest-logs.txt 10 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/__init__.py -------------------------------------------------------------------------------- /libs/kotaemon/tests/_test_multimodal_reader.py: -------------------------------------------------------------------------------- 1 | # TODO: This test is broken and should be rewritten 2 | from pathlib import Path 3 | 4 | from kotaemon.loaders import AdobeReader 5 | 6 | # from dotenv import load_dotenv 7 | 8 | 9 | input_file = Path(__file__).parent / "resources" / "multimodal.pdf" 10 | 11 | # load_dotenv() 12 | 13 | 14 | def test_adobe_reader(): 15 | reader = AdobeReader() 16 | documents = reader.load_data(input_file) 17 | table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] 18 | assert len(table_docs) == 2 19 | 20 | figure_docs = [doc for doc in documents if doc.metadata.get("type", "") == "image"] 21 | assert len(figure_docs) == 2 22 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/dummy.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/dummy.docx -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/dummy.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/dummy.pdf -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/dummy.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/dummy.xlsx -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/ggml-vocab-llama.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/ggml-vocab-llama.gguf -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/html/dummy_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/html/dummy_image.png -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/multimodal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/multimodal.pdf -------------------------------------------------------------------------------- /libs/kotaemon/tests/resources/table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/kotaemon/tests/resources/table.pdf -------------------------------------------------------------------------------- /libs/kotaemon/tests/simple_pipeline.py: 
-------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import List 3 | 4 | from kotaemon.base import BaseComponent, LLMInterface, lazy 5 | from kotaemon.embeddings import LCAzureOpenAIEmbeddings 6 | from kotaemon.indices import VectorRetrieval 7 | from kotaemon.llms import AzureOpenAI 8 | from kotaemon.storages import ChromaVectorStore 9 | 10 | 11 | class Pipeline(BaseComponent): 12 | llm: AzureOpenAI = AzureOpenAI.withx( 13 | azure_endpoint="https://test.openai.azure.com/", 14 | openai_api_key="some-key", 15 | openai_api_version="2023-03-15-preview", 16 | deployment_name="gpt35turbo", 17 | temperature=0, 18 | request_timeout=60, 19 | ) 20 | 21 | retrieving_pipeline: VectorRetrieval = VectorRetrieval.withx( 22 | vector_store=lazy(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())), 23 | embedding=LCAzureOpenAIEmbeddings.withx( 24 | model="text-embedding-ada-002", 25 | deployment="embedding-deployment", 26 | azure_endpoint="https://test.openai.azure.com/", 27 | openai_api_key="some-key", 28 | ), 29 | ) 30 | 31 | def run(self, text: str) -> LLMInterface: 32 | matched_texts: List[str] = self.retrieving_pipeline(text) 33 | return self.llm("\n".join(matched_texts)) 34 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_documents.py: -------------------------------------------------------------------------------- 1 | from kotaemon.base.schema import Document, RetrievedDocument 2 | 3 | from .conftest import skip_when_haystack_not_installed 4 | 5 | 6 | def test_document_constructor_with_builtin_types(): 7 | for value in ["str", 1, {}, set(), [], tuple, None]: 8 | doc = Document(value) 9 | assert doc.text == (str(value) if value else "") 10 | assert doc.content == value 11 | assert bool(doc) == bool(value) 12 | 13 | 14 | def test_document_constructor_with_document(): 15 | text = "Sample text" 16 | doc1 = Document(text) 17 | doc2 = Document(doc1) 18 | 
assert doc2.text == doc1.text 19 | assert doc2.content == doc1.content 20 | 21 | 22 | @skip_when_haystack_not_installed 23 | def test_document_to_haystack_format(): 24 | from haystack.schema import Document as HaystackDocument 25 | 26 | text = "Sample text" 27 | metadata = {"filename": "sample.txt"} 28 | doc = Document(text, metadata=metadata) 29 | haystack_doc = doc.to_haystack_format() 30 | assert isinstance(haystack_doc, HaystackDocument) 31 | assert haystack_doc.content == doc.text 32 | assert haystack_doc.meta == metadata 33 | 34 | 35 | def test_retrieved_document_default_values(): 36 | sample_text = "text" 37 | retrieved_doc = RetrievedDocument(text=sample_text) 38 | assert retrieved_doc.text == sample_text 39 | assert retrieved_doc.score == 0.0 40 | assert retrieved_doc.retrieval_metadata == {} 41 | 42 | 43 | def test_retrieved_document_attributes(): 44 | sample_text = "text" 45 | score = 0.8 46 | metadata = {"source": "retrieval_system"} 47 | retrieved_doc = RetrievedDocument( 48 | text=sample_text, score=score, retrieval_metadata=metadata 49 | ) 50 | assert retrieved_doc.text == sample_text 51 | assert retrieved_doc.score == score 52 | assert retrieved_doc.retrieval_metadata == metadata 53 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_indexing_retrieval.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import cast 4 | from unittest.mock import patch 5 | 6 | from openai.types.create_embedding_response import CreateEmbeddingResponse 7 | 8 | from kotaemon.base import Document 9 | from kotaemon.embeddings import AzureOpenAIEmbeddings 10 | from kotaemon.indices import VectorIndexing, VectorRetrieval 11 | from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore 12 | 13 | with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: 14 | openai_embedding = 
CreateEmbeddingResponse.model_validate(json.load(f))


@patch(
    "openai.resources.embeddings.Embeddings.create",
    side_effect=lambda *args, **kwargs: openai_embedding,
)
def test_indexing(_mock_embedding_create, tmp_path):
    """Indexing one document must add exactly one entry to both stores.

    NOTE(fix): ``@patch`` prepends the mock as the first positional argument,
    and pytest excludes that leading slot from fixture resolution. Without the
    extra parameter, ``tmp_path`` was bound to the MagicMock instead of the
    pytest tmp directory (same convention as test_reranking.py).
    """
    db = ChromaVectorStore(path=str(tmp_path))
    doc_store = InMemoryDocumentStore()
    embedding = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-ada-002",
        azure_endpoint="https://test.openai.azure.com/",
        api_key="some-key",
        api_version="version",
    )

    pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
    # casts are for the type checker only; runtime objects are unchanged
    pipeline.doc_store = cast(InMemoryDocumentStore, pipeline.doc_store)
    pipeline.vector_store = cast(ChromaVectorStore, pipeline.vector_store)
    assert pipeline.vector_store._collection.count() == 0, "Expected empty collection"
    assert len(pipeline.doc_store._store) == 0, "Expected empty doc store"
    pipeline(text=Document(text="Hello world"))
    assert pipeline.vector_store._collection.count() == 1, "Index 1 item"
    assert len(pipeline.doc_store._store) == 1, "Expected 1 document"


@patch(
    "openai.resources.embeddings.Embeddings.create",
    side_effect=lambda *args, **kwargs: openai_embedding,
)
def test_retrieving(_mock_embedding_create, tmp_path):
    """Retrieving over a one-document index must be deterministic.

    Same mock-argument fix as test_indexing above.
    """
    db = ChromaVectorStore(path=str(tmp_path))
    doc_store = InMemoryDocumentStore()
    embedding = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-ada-002",
        azure_endpoint="https://test.openai.azure.com/",
        api_key="some-key",
        api_version="version",
    )

    index_pipeline = VectorIndexing(
        vector_store=db, embedding=embedding, doc_store=doc_store
    )
    retrieval_pipeline = VectorRetrieval(
        vector_store=db, doc_store=doc_store, embedding=embedding
    )

    index_pipeline(text=Document(text="Hello world"))
    output = retrieval_pipeline(text="Hello world")
    output1 = retrieval_pipeline(text="Hello world")
assert len(output) == 1, "Expect 1 results" 67 | assert output == output1, "Expect identical results" 68 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_ingestor.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from kotaemon.indices.ingests import DocumentIngestor 4 | from kotaemon.indices.splitters import TokenSplitter 5 | 6 | 7 | def test_ingestor_include_src(): 8 | dirpath = Path(__file__).parent 9 | ingestor = DocumentIngestor( 10 | pdf_mode="normal", 11 | text_splitter=TokenSplitter(chunk_size=200, chunk_overlap=10), 12 | ) 13 | nodes = ingestor(dirpath / "resources" / "table.pdf") 14 | assert type(nodes) is list 15 | assert nodes[0].relationships 16 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_post_processing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from kotaemon.base import Document 4 | from kotaemon.parsers import RegexExtractor 5 | 6 | 7 | @pytest.fixture 8 | def regex_extractor(): 9 | return RegexExtractor( 10 | pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} 11 | ) 12 | 13 | 14 | def test_run_document(regex_extractor): 15 | document = Document(text="This is a test. 1 2 3") 16 | extracted_document = regex_extractor(document)[0] 17 | assert extracted_document.text == "One" 18 | assert extracted_document.matches == ["One", "Two", "Three"] 19 | 20 | 21 | def test_run_raw(regex_extractor): 22 | output = regex_extractor("This is a test. 123")[0] 23 | assert output.text == "123" 24 | assert output.matches == ["123"] 25 | 26 | 27 | def test_run_batch_raw(regex_extractor): 28 | output = regex_extractor(["This is a test. 
123", "456"]) 29 | extracted_text = [each.text for each in output] 30 | extracted_matches = [each.matches for each in output] 31 | assert extracted_text == ["123", "456"] 32 | assert extracted_matches == [["123"], ["456"]] 33 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_prompt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from kotaemon.base import Document 4 | from kotaemon.llms import BasePromptComponent, PromptTemplate 5 | from kotaemon.parsers import RegexExtractor 6 | 7 | 8 | def test_set_attributes(): 9 | template = PromptTemplate("str = {s}, int = {i}, doc = {doc}, comp = {comp}") 10 | doc = Document(text="Helloo, Alice!") 11 | comp = RegexExtractor( 12 | pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} 13 | ) 14 | comp.set_run(kwargs={"text": "This is a test. 1 2 3"}, temp=True) 15 | 16 | prompt = BasePromptComponent(template=template, s="Alice", i=30, doc=doc, comp=comp) 17 | assert prompt.s == "Alice" 18 | assert prompt.i == 30 19 | assert prompt.doc == doc 20 | assert prompt.comp == comp 21 | 22 | 23 | def test_check_redundant_kwargs(): 24 | template = PromptTemplate("Hello, {name}!") 25 | prompt = BasePromptComponent(template=template, name="Alice") 26 | with pytest.warns(UserWarning, match="Keys provided but not in template: age"): 27 | prompt._BasePromptComponent__check_redundant_kwargs(name="Alice", age=30) 28 | 29 | 30 | def test_check_unset_placeholders(): 31 | template = PromptTemplate("Hello, {name}! 
I'm {age} years old.") 32 | prompt = BasePromptComponent(template=template, name="Alice") 33 | with pytest.raises(ValueError): 34 | prompt._BasePromptComponent__check_unset_placeholders() 35 | 36 | 37 | def test_validate_value_type(): 38 | template = PromptTemplate("Hello, {name}!") 39 | prompt = BasePromptComponent(template=template) 40 | with pytest.raises(ValueError): 41 | prompt._BasePromptComponent__validate_value_type(name={}) 42 | 43 | 44 | def test_run(): 45 | template = PromptTemplate("str = {s}, int = {i}, doc = {doc}, comp = {comp}") 46 | doc = Document(text="Helloo, Alice!") 47 | comp = RegexExtractor( 48 | pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} 49 | ) 50 | comp.set_run(kwargs={"text": "This is a test. 1 2 3"}, temp=True) 51 | 52 | prompt = BasePromptComponent(template=template, s="Alice", i=30, doc=doc, comp=comp) 53 | 54 | result = prompt() 55 | 56 | assert result.text == "str = Alice, int = 30, doc = Helloo, Alice!, comp = ['One']" 57 | 58 | 59 | def test_set_method(): 60 | template = PromptTemplate("Hello, {name}!") 61 | prompt = BasePromptComponent(template=template) 62 | prompt.set_value(name="Alice") 63 | assert prompt.name == "Alice" 64 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_promptui.py: -------------------------------------------------------------------------------- 1 | from kotaemon.contribs.promptui.config import export_pipeline_to_config 2 | from kotaemon.contribs.promptui.export import export_from_dict 3 | from kotaemon.contribs.promptui.ui import build_from_dict 4 | 5 | from .simple_pipeline import Pipeline 6 | 7 | 8 | class TestPromptConfig: 9 | def test_export_prompt_config(self): 10 | """Test if the prompt config is exported correctly""" 11 | pipeline = Pipeline() 12 | config_dict = export_pipeline_to_config(pipeline) 13 | config = list(config_dict.values())[0] 14 | 15 | assert "inputs" in config, "inputs should be in config" 16 | assert 
"text" in config["inputs"], "inputs should have config" 17 | 18 | assert "params" in config, "params should be in config" 19 | assert "llm.deployment_name" in config["params"] 20 | assert "llm.azure_endpoint" in config["params"] 21 | assert "llm.openai_api_key" in config["params"] 22 | assert "llm.openai_api_version" in config["params"] 23 | assert "llm.request_timeout" in config["params"] 24 | assert "llm.temperature" in config["params"] 25 | 26 | 27 | class TestPromptUI: 28 | def test_uigeneration(self): 29 | """Test if the gradio UI is exposed without any problem""" 30 | pipeline = Pipeline() 31 | config = export_pipeline_to_config(pipeline) 32 | 33 | build_from_dict(config) 34 | 35 | 36 | class TestExport: 37 | def test_export(self, tmp_path): 38 | """Test if the export functionality works without error""" 39 | from pathlib import Path 40 | 41 | import yaml 42 | from theflow.storage import storage 43 | 44 | config_path = tmp_path / "config.yaml" 45 | pipeline = Pipeline() 46 | Path(storage.url(pipeline.config.store_result)).mkdir( 47 | parents=True, exist_ok=True 48 | ) 49 | 50 | config_dict = export_pipeline_to_config(pipeline) 51 | pipeline_name = list(config_dict.keys())[0] 52 | 53 | config_dict[pipeline_name]["logs"] = { 54 | "sheet1": { 55 | "inputs": [{"name": "text", "step": ".", "variable": "text"}], 56 | "outputs": [{"name": "answer", "step": "."}], 57 | }, 58 | } 59 | with open(config_path, "w") as f: 60 | yaml.safe_dump(config_dict, f) 61 | 62 | export_from_dict( 63 | config=str(config_path), 64 | pipeline=pipeline_name, 65 | output_path=str(tmp_path / "exported.xlsx"), 66 | ) 67 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_reranking.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | from openai.types.chat.chat_completion import ChatCompletion 5 | 6 | from kotaemon.base import Document 7 | 
from kotaemon.indices.rankings import LLMReranking 8 | from kotaemon.llms import AzureChatOpenAI 9 | 10 | _openai_chat_completion_responses = [ 11 | ChatCompletion.parse_obj( 12 | { 13 | "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", 14 | "object": "chat.completion", 15 | "created": 1692338378, 16 | "model": "gpt-35-turbo", 17 | "system_fingerprint": None, 18 | "choices": [ 19 | { 20 | "index": 0, 21 | "finish_reason": "stop", 22 | "message": { 23 | "role": "assistant", 24 | "content": text, 25 | "function_call": None, 26 | "tool_calls": None, 27 | }, 28 | "logprobs": None, 29 | } 30 | ], 31 | "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19}, 32 | } 33 | ) 34 | for text in [ 35 | "YES", 36 | "NO", 37 | "YES", 38 | ] 39 | ] 40 | 41 | 42 | @pytest.fixture 43 | def llm(): 44 | return AzureChatOpenAI( 45 | api_key="dummy", 46 | api_version="2024-05-01-preview", 47 | azure_deployment="gpt-4o", 48 | azure_endpoint="https://test.openai.azure.com/", 49 | ) 50 | 51 | 52 | @patch( 53 | "openai.resources.chat.completions.Completions.create", 54 | side_effect=_openai_chat_completion_responses, 55 | ) 56 | def test_reranking(openai_completion, llm): 57 | documents = [Document(text=f"test {idx}") for idx in range(3)] 58 | query = "test query" 59 | 60 | reranker = LLMReranking(llm=llm, concurrent=False) 61 | rerank_docs = reranker(documents, query=query) 62 | 63 | assert len(rerank_docs) == 2 64 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_table_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader 7 | 8 | from .conftest import skip_when_unstructured_pdf_not_installed 9 | 10 | input_file = Path(__file__).parent / "resources" / "table.pdf" 11 | input_file_excel = Path(__file__).parent / "resources" / 
"dummy.xlsx" 12 | 13 | 14 | @pytest.fixture 15 | def fullocr_output(): 16 | with open( 17 | Path(__file__).parent / "resources" / "fullocr_sample_output.json", 18 | encoding="utf-8", 19 | ) as f: 20 | fullocr = json.load(f) 21 | return fullocr 22 | 23 | 24 | @pytest.fixture 25 | def mathpix_output(): 26 | with open(Path(__file__).parent / "resources" / "policy.md", encoding="utf-8") as f: 27 | content = f.read() 28 | return content 29 | 30 | 31 | @skip_when_unstructured_pdf_not_installed 32 | def test_ocr_reader(fullocr_output): 33 | reader = OCRReader() 34 | documents = reader.load_data(input_file, response_content=fullocr_output) 35 | table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] 36 | assert len(table_docs) == 2 37 | 38 | 39 | def test_mathpix_reader(mathpix_output): 40 | reader = MathpixPDFReader() 41 | documents = reader.load_data(input_file, response_content=mathpix_output) 42 | table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] 43 | assert len(table_docs) == 4 44 | 45 | 46 | def test_excel_reader(): 47 | reader = PandasExcelReader() 48 | documents = reader.load_data( 49 | input_file_excel, 50 | ) 51 | assert len(documents) == 1 52 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_telemetry.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | 6 | from .conftest import skip_when_haystack_not_installed 7 | 8 | 9 | @pytest.fixture 10 | def clean_artifacts_for_telemetry(): 11 | try: 12 | del sys.modules["kotaemon"] 13 | except KeyError: 14 | pass 15 | 16 | try: 17 | del sys.modules["haystack"] 18 | except KeyError: 19 | pass 20 | 21 | try: 22 | del sys.modules["haystack.telemetry"] 23 | except KeyError: 24 | pass 25 | 26 | if "HAYSTACK_TELEMETRY_ENABLED" in os.environ: 27 | del os.environ["HAYSTACK_TELEMETRY_ENABLED"] 28 | 29 | 30 | 
@pytest.mark.usefixtures("clean_artifacts_for_telemetry") 31 | @skip_when_haystack_not_installed 32 | def test_disable_telemetry_import_haystack_first(): 33 | """Test that telemetry is disabled when kotaemon lib is initiated after""" 34 | import os 35 | 36 | import haystack.telemetry 37 | 38 | assert haystack.telemetry.telemetry is not None 39 | assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") != "False" 40 | 41 | import kotaemon # noqa: F401 42 | 43 | assert haystack.telemetry.telemetry is None 44 | assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" 45 | 46 | 47 | @pytest.mark.usefixtures("clean_artifacts_for_telemetry") 48 | @skip_when_haystack_not_installed 49 | def test_disable_telemetry_import_haystack_after_kotaemon(): 50 | """Test that telemetry is disabled when kotaemon lib is initiated before""" 51 | import os 52 | 53 | import haystack.telemetry 54 | 55 | import kotaemon # noqa: F401 56 | 57 | assert haystack.telemetry.telemetry is None 58 | assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" 59 | -------------------------------------------------------------------------------- /libs/kotaemon/tests/test_tools.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from unittest.mock import patch 4 | 5 | from openai.types.create_embedding_response import CreateEmbeddingResponse 6 | 7 | from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool 8 | from kotaemon.base import Document 9 | from kotaemon.embeddings import AzureOpenAIEmbeddings 10 | from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval 11 | from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore 12 | 13 | with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: 14 | openai_embedding = CreateEmbeddingResponse.model_validate(json.load(f)) 15 | 16 | 17 | def test_google_tool(mock_google_search): 18 
| tool = GoogleSearchTool() 19 | assert tool.name 20 | assert tool.description 21 | output = tool("What is Cinnamon AI") 22 | assert output 23 | 24 | 25 | def test_wikipedia_tool(): 26 | tool = WikipediaTool() 27 | assert tool.name 28 | assert tool.description 29 | output = tool("Cinnamon") 30 | assert output 31 | 32 | 33 | @patch( 34 | "openai.resources.embeddings.Embeddings.create", 35 | side_effect=lambda *args, **kwargs: openai_embedding, 36 | ) 37 | def test_pipeline_tool(tmp_path): 38 | db = ChromaVectorStore(path=str(tmp_path)) 39 | doc_store = InMemoryDocumentStore() 40 | embedding = AzureOpenAIEmbeddings( 41 | azure_deployment="embedding-deployment", 42 | azure_endpoint="https://test.openai.azure.com/", 43 | api_key="some-key", 44 | api_version="version", 45 | ) 46 | 47 | index_pipeline = VectorIndexing( 48 | vector_store=db, embedding=embedding, doc_store=doc_store 49 | ) 50 | retrieval_pipeline = VectorRetrieval( 51 | vector_store=db, doc_store=doc_store, embedding=embedding 52 | ) 53 | 54 | index_tool = ComponentTool( 55 | name="index_document", 56 | description="A tool to use to index a document to be searched later", 57 | component=index_pipeline, 58 | ) 59 | output = index_tool({"text": Document(text="Cinnamon AI")}) 60 | 61 | retrieval_tool = ComponentTool( 62 | name="search_document", 63 | description="A tool to use to search a document in a vectorstore", 64 | component=retrieval_pipeline, 65 | ) 66 | output = retrieval_tool("Cinnamon AI") 67 | assert output 68 | -------------------------------------------------------------------------------- /libs/ktem/.gitignore: -------------------------------------------------------------------------------- 1 | 14-1_抜粋-1.pdf 2 | _example_.db 3 | ktem/assets/prebuilt/ 4 | -------------------------------------------------------------------------------- /libs/ktem/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ktem/assets/css/*.css 2 | include 
ktem/assets/img/*.svg 3 | include ktem/assets/js/*.js 4 | include ktem/assets/md/*.md 5 | -------------------------------------------------------------------------------- /libs/ktem/ktem/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/__init__.py -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from decouple import config 4 | 5 | from .theme import Kotaemon as KotaemonTheme 6 | 7 | PDFJS_VERSION_DIST: str = config("PDFJS_VERSION_DIST", "pdfjs-4.0.379-dist") 8 | PDFJS_PREBUILT_DIR: Path = config( 9 | "PDFJS_PREBUILT_DIR", Path(__file__).parent / "prebuilt" / PDFJS_VERSION_DIST 10 | ) 11 | 12 | __all__ = ["KotaemonTheme", "PDFJS_VERSION_DIST", "PDFJS_PREBUILT_DIR"] 13 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/dark_mode.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ic_fluent_dark_theme_24_regular 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/delete.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/expand.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/new.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/rename.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/icons/sidebar.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/md/about.md: -------------------------------------------------------------------------------- 1 | # About Kotaemon 2 | 3 | An open-source tool for you to chat with your documents. 
4 | 5 | [Source Code](https://github.com/Cinnamon/kotaemon) | 6 | [Demo](https://huggingface.co/spaces/cin-model/kotaemon-demo) 7 | 8 | [User Guide](https://cinnamon.github.io/kotaemon/) | 9 | [Developer Guide](https://cinnamon.github.io/kotaemon/development/) | 10 | [Feedback](https://github.com/Cinnamon/kotaemon/issues) 11 | -------------------------------------------------------------------------------- /libs/ktem/ktem/assets/md/changelogs.md: -------------------------------------------------------------------------------- 1 | # Changelogs 2 | 3 | ## v0.0.1 4 | 5 | - Chat: interact with chatbot with simple pipeline, rewoo and react agents 6 | - Chat: conversation management: create, delete, rename conversations 7 | - Files: upload files 8 | - Files: select files as context for chatbot 9 | - User management: create, sign-in, sign-out, change password 10 | - Setting: common settings and pipeline-based settings 11 | - Info panel: show Cinnamon AI and Kotaemon information 12 | -------------------------------------------------------------------------------- /libs/ktem/ktem/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/db/__init__.py -------------------------------------------------------------------------------- /libs/ktem/ktem/db/engine.py: -------------------------------------------------------------------------------- 1 | from sqlmodel import create_engine 2 | from theflow.settings import settings 3 | 4 | engine = create_engine(settings.KH_DATABASE) 5 | -------------------------------------------------------------------------------- /libs/ktem/ktem/db/models.py: -------------------------------------------------------------------------------- 1 | import ktem.db.base_models as base_models 2 | from ktem.db.engine import engine 3 | from sqlmodel import SQLModel 4 | from theflow.settings import settings 5 | from 
from typing import Type

from ktem.db.engine import engine
from sqlalchemy import JSON, Boolean, Column, String
from sqlalchemy.orm import DeclarativeBase
from theflow.settings import settings as flowsettings
from theflow.utils.modules import import_dotted_string


class Base(DeclarativeBase):
    pass


class BaseEmbeddingTable(Base):
    """Base table to store embedding models (name, spec, default flag)."""

    __abstract__ = True

    # unique human-readable identifier of the embedding model entry
    name = Column(String, primary_key=True, unique=True)
    # JSON specification used to re-instantiate the embedding model object
    spec = Column(JSON, default={})
    # whether this entry is the application-wide default embedding model
    default = Column(Boolean, default=False)


# Deployments may override the table class via settings.
# NOTE: renamed from `_base_llm` — this module stores embedding models,
# not LLMs (the old name was copied from llms/db.py).
_base_embedding: Type[BaseEmbeddingTable] = (
    import_dotted_string(flowsettings.KH_EMBEDDING_LLM, safe=False)
    if hasattr(flowsettings, "KH_EMBEDDING_LLM")
    else BaseEmbeddingTable
)


class EmbeddingTable(_base_embedding):  # type: ignore
    """Concrete table holding the registered embedding models."""

    __tablename__ = "embedding"


# When Alembic manages migrations, leave table creation to it.
if not getattr(flowsettings, "KH_ENABLE_ALEMBIC", False):
    EmbeddingTable.metadata.create_all(engine)
class GraphRAGIndex(FileIndex):
    """File index variant backed by GraphRAG (vector store disabled)."""

    def _setup_indexing_cls(self):
        # index documents through the GraphRAG indexing pipeline
        self._indexing_pipeline_cls = GraphRAGIndexingPipeline

    def _setup_retriever_cls(self):
        self._retriever_pipeline_cls = [GraphRAGRetrieverPipeline]

    def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
        """Build the indexing pipeline with the vector store turned off."""
        pipeline = super().get_indexing_pipeline(settings, user_id)
        # GraphRAG keeps its own graph storage, so no vector store is needed
        pipeline.VS = None
        return pipeline

    def get_retriever_pipelines(
        self, settings: dict, user_id: int, selected: Any = None
    ) -> list["BaseFileIndexRetriever"]:
        """Create one GraphRAG retriever scoped to the selected files."""
        selected_file_ids = self._selector_ui.get_selected_ids(selected)
        retriever = GraphRAGRetrieverPipeline(
            file_ids=selected_file_ids,
            Index=self._resources["Index"],
        )
        return [retriever]
import os

# regex patterns for Arxiv URL
ARXIV_URL_PATTERNS = [
    "https://arxiv.org/abs/",
    "https://arxiv.org/pdf/",
]

ILLEGAL_NAME_CHARS = ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]

# avoid hanging indefinitely on unresponsive servers
REQUEST_TIMEOUT = 30


def clean_name(name):
    """Replace characters that are illegal in file names with underscores."""
    for char in ILLEGAL_NAME_CHARS:
        name = name.replace(char, "_")
    return name


def is_arxiv_url(url):
    """Return True if the URL points to an Arxiv abstract or PDF page."""
    return any(url.startswith(pattern) for pattern in ARXIV_URL_PATTERNS)


# download PDF from Arxiv URL
def download_arxiv_pdf(url, output_path):
    """Download the PDF of an Arxiv paper into `output_path`.

    Accepts either an abstract URL (``/abs/``) or a PDF URL (``/pdf/``).
    The file is named after the paper title; the download is skipped when
    the file already exists.

    Returns:
        The path of the downloaded (or pre-existing) PDF file.

    Raises:
        ValueError: if the URL is not an Arxiv URL or the paper title
            cannot be extracted from the abstract page.
        requests.HTTPError: if Arxiv responds with an error status.
    """
    if not is_arxiv_url(url):
        raise ValueError("Invalid Arxiv URL")

    is_abstract_url = "abs" in url
    if is_abstract_url:
        pdf_url = url.replace("abs", "pdf")
        abstract_url = url
    else:
        pdf_url = url
        abstract_url = url.replace("pdf", "abs")

    # lazy imports: keep the module importable without network deps
    import requests
    from bs4 import BeautifulSoup

    # get paper name from abstract url; fail loudly on HTTP errors
    response = requests.get(abstract_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # parse HTML response and get h1.title
    soup = BeautifulSoup(response.content, "html.parser")
    title_tag = soup.find("h1", class_="title")
    if title_tag is None:
        raise ValueError("Failed to get paper name")
    name = clean_name(title_tag.text.strip().replace("Title:", ""))
    if not name:
        raise ValueError("Failed to get paper name")

    output_file_path = os.path.join(output_path, name + ".pdf")
    # prevent downloading if file already exists
    if not os.path.exists(output_file_path):
        response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        with open(output_file_path, "wb") as f:
            f.write(response.content)

    return output_file_path
from typing import Type

from ktem.db.engine import engine
from sqlalchemy import JSON, Boolean, Column, String
from sqlalchemy.orm import DeclarativeBase
from theflow.settings import settings as flowsettings
from theflow.utils.modules import import_dotted_string


class Base(DeclarativeBase):
    pass


class BaseLLMTable(Base):
    """Abstract table describing a registered language model."""

    __abstract__ = True

    # unique identifier of the model entry
    name = Column(String, primary_key=True, unique=True)
    # serialized specification used to rebuild the model object
    spec = Column(JSON, default={})
    # marks the application-wide default model
    default = Column(Boolean, default=False)


# Deployments may swap in a custom table class through settings.
if hasattr(flowsettings, "KH_TABLE_LLM"):
    _base_llm: Type[BaseLLMTable] = import_dotted_string(
        flowsettings.KH_TABLE_LLM, safe=False
    )
else:
    _base_llm = BaseLLMTable


class LLMTable(_base_llm):  # type: ignore
    """Concrete table holding the registered language models."""

    __tablename__ = "llm_table"


# When Alembic manages migrations, leave table creation to it.
if not getattr(flowsettings, "KH_ENABLE_ALEMBIC", False):
    LLMTable.metadata.create_all(engine)
class ChatPanel(BasePage):
    """Main chat area: the chatbot display plus the multimodal input box."""

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Build the chatbot widget and the message input row."""
        self.chatbot = gr.Chatbot(
            label=self._app.app_name,
            placeholder=PLACEHOLDER_TEXT,
            elem_id="main-chat-bot",
            show_label=False,
            show_copy_button=True,
            likeable=True,
            bubble_full_width=False,
        )
        with gr.Row():
            self.text_input = gr.MultimodalTextbox(
                placeholder=(
                    "Type a message, search the @web, or tag a file with @filename"
                ),
                elem_id="chat-input",
                interactive=True,
                scale=20,
                file_count="multiple",
                container=False,
                show_label=False,
            )

    def submit_msg(self, chat_input, chat_history):
        """Submit a message to the chatbot"""
        # append the new user turn (bot reply still pending) and clear the box
        updated_history = chat_history + [(chat_input, None)]
        return "", updated_history
| 21 | def on_building_ui(self): 22 | self.chat_samples = [[each] for each in self.CHAT_SAMPLES] 23 | with gr.Accordion( 24 | label="Chat Suggestion", 25 | visible=getattr(flowsettings, "KH_FEATURE_CHAT_SUGGESTION", False), 26 | ) as self.accordion: 27 | self.default_example = gr.State( 28 | value=self.chat_samples, 29 | ) 30 | self.examples = gr.DataFrame( 31 | value=self.chat_samples, 32 | headers=["Next Question"], 33 | interactive=False, 34 | elem_id="chat-suggestion", 35 | wrap=True, 36 | ) 37 | 38 | def as_gradio_component(self): 39 | return self.examples 40 | 41 | def select_example(self, ev: gr.SelectData): 42 | return {"text": ev.value} 43 | -------------------------------------------------------------------------------- /libs/ktem/ktem/pages/chat/common.py: -------------------------------------------------------------------------------- 1 | DEFAULT_APPLICATION_STATE = {"regen": False} 2 | STATE = { 3 | "app": DEFAULT_APPLICATION_STATE, 4 | } 5 | -------------------------------------------------------------------------------- /libs/ktem/ktem/pages/chat/demo_hint.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | import gradio as gr 4 | from ktem.app import BasePage 5 | 6 | 7 | class HintPage(BasePage): 8 | def __init__(self, app): 9 | self._app = app 10 | self.on_building_ui() 11 | 12 | def on_building_ui(self): 13 | with gr.Accordion(label="Hint", open=False): 14 | gr.Markdown( 15 | dedent( 16 | """ 17 | - You can select any text from the chat answer to **highlight relevant citation(s)** on the right panel. 18 | - **Citations** can be viewed on both PDF viewer and raw text. 19 | - You can tweak the citation format and use advance (CoT) reasoning in **Chat settings** menu. 20 | - Want to **explore more**? Check out the **Help** section to create your private space. 
class PaperListPage(BasePage):
    """Accordion listing popular daily papers fetched from Hugging Face."""

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Build the paper table; raw records are kept in `papers_state`."""
        self.papers_state = gr.State(None)
        with gr.Accordion(
            label="Browse popular daily papers",
            open=True,
        ) as self.accordion:
            self.examples = gr.DataFrame(
                value=[],
                headers=["title", "url", "upvotes"],
                column_widths=[60, 30, 10],
                elem_id="paper-suggestion",
                interactive=False,
                wrap=True,
            )
        return self.examples

    def load(self):
        """Fetch the top papers; return (table dataframe, raw records)."""
        records = fetch_papers(top_n=5)
        return DataFrame(records), records

    def _on_app_created(self):
        # populate the list once the app UI has finished loading
        self._app.app.load(
            self.load,
            outputs=[self.examples, self.papers_state],
        )

    def select_example(self, state, ev: gr.SelectData):
        """Return the URL of the paper on the clicked row."""
        clicked_row = ev.index[0]
        return state[clicked_row]["url"]
17 | 18 | def on_building_ui(self): 19 | with gr.Tab("Index Collections") as self.index_management_tab: 20 | self.index_management = IndexManagement(self._app) 21 | 22 | with gr.Tab("LLMs") as self.llm_management_tab: 23 | self.llm_management = LLMManagement(self._app) 24 | 25 | with gr.Tab("Embeddings") as self.emb_management_tab: 26 | self.emb_management = EmbeddingManagement(self._app) 27 | 28 | with gr.Tab("Rerankings") as self.rerank_management_tab: 29 | self.rerank_management = RerankingManagement(self._app) 30 | 31 | if self._app.f_user_management: 32 | with gr.Tab("Users", visible=False) as self.user_management_tab: 33 | self.user_management = UserManagement(self._app) 34 | 35 | def on_subscribe_public_events(self): 36 | if self._app.f_user_management: 37 | self._app.subscribe_event( 38 | name="onSignIn", 39 | definition={ 40 | "fn": self.toggle_user_management, 41 | "inputs": [self._app.user_id], 42 | "outputs": [self.user_management_tab], 43 | "show_progress": "hidden", 44 | }, 45 | ) 46 | 47 | self._app.subscribe_event( 48 | name="onSignOut", 49 | definition={ 50 | "fn": self.toggle_user_management, 51 | "inputs": [self._app.user_id], 52 | "outputs": [self.user_management_tab], 53 | "show_progress": "hidden", 54 | }, 55 | ) 56 | 57 | def toggle_user_management(self, user_id): 58 | """Show/hide the user management, depending on the user's role""" 59 | with Session(engine) as session: 60 | user = session.exec(select(User).where(User.id == user_id)).first() 61 | if user and user.admin: 62 | return gr.update(visible=True) 63 | 64 | return gr.update(visible=False) 65 | -------------------------------------------------------------------------------- /libs/ktem/ktem/reasoning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/reasoning/__init__.py 
class BaseReasoning(BaseComponent):
    """Reasoning pipeline invoked for each user chat message.

    Concrete implementations have access to the retrievers, the user
    settings, the incoming message, the conversation id and the full
    message history.
    """

    @classmethod
    def get_info(cls) -> dict:
        """Describe the pipeline so the app can organize and display it.

        Returns:
            a dictionary with the keys:
                - "id": unique identifier of the pipeline
                - "name": human-friendly name of the pipeline
                - "description": short overview of what the pipeline does,
                  for the user to grasp its purpose
        """
        raise NotImplementedError

    @classmethod
    def get_user_settings(cls) -> dict:
        """Return the default user settings for this pipeline."""
        return {}

    @classmethod
    def get_pipeline(
        cls,
        user_settings: dict,
        state: dict,
        retrievers: Optional[list["BaseComponent"]] = None,
    ) -> "BaseReasoning":
        """Instantiate the reasoning pipeline for the app to execute.

        Args:
            user_settings: settings chosen by the user
            state: current conversation state
            retrievers: list of retrievers available to the pipeline
        """
        return cls()

    def run(self, message: str, conv_id: str, history: list, **kwargs):  # type: ignore
        """Execute the reasoning pipeline on one user message."""
        raise NotImplementedError
from ktem.llms.manager import llms

from kotaemon.base import BaseComponent, Document, HumanMessage, Node, SystemMessage
from kotaemon.llms import ChatLLM, PromptTemplate

DEFAULT_REWRITE_PROMPT = (
    "Given the following question, rephrase and expand it "
    "to help you do better answering. Maintain all information "
    "in the original question. Keep the question as concise as possible. "
    "Only output the rephrased question without additional information. "
    "Give answer in {lang}\n"
    "Original question: {question}\n"
    "Rephrased question: "
)


class RewriteQuestionPipeline(BaseComponent):
    """Rephrase the user question before answering.

    Attributes:
        llm: the language model used to rewrite the question
        rewrite_template: prompt template the llm uses to paraphrase input
        lang: language of the answer (currently English and Japanese)
    """

    llm: ChatLLM = Node(default_callback=lambda _: llms.get_default())
    rewrite_template: str = DEFAULT_REWRITE_PROMPT

    lang: str = "English"

    def run(self, question: str) -> Document:  # type: ignore
        """Ask the LLM to rewrite `question` and return its response."""
        rendered_prompt = PromptTemplate(self.rewrite_template).populate(
            question=question, lang=self.lang
        )
        conversation = [
            SystemMessage(content="You are a helpful assistant"),
            HumanMessage(content=rendered_prompt),
        ]
        return self.llm(conversation)
21 | ) 22 | prompt_template: str = SUGGEST_NAME_PROMPT_TEMPLATE 23 | lang: str = "English" 24 | 25 | def run(self, chat_history: list[tuple[str, str]]) -> Document: # type: ignore 26 | prompt_template = PromptTemplate(self.prompt_template) 27 | prompt = prompt_template.populate(lang=self.lang) 28 | 29 | messages = [] 30 | for human, ai in chat_history: 31 | messages.append(HumanMessage(content=human)) 32 | messages.append(AIMessage(content=ai)) 33 | 34 | messages.append(HumanMessage(content=prompt)) 35 | 36 | return self.llm(messages) 37 | -------------------------------------------------------------------------------- /libs/ktem/ktem/reasoning/prompt_optimization/suggest_followup_chat.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from ktem.llms.manager import llms 4 | 5 | from kotaemon.base import AIMessage, BaseComponent, Document, HumanMessage, Node 6 | from kotaemon.llms import ChatLLM, PromptTemplate 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class SuggestFollowupQuesPipeline(BaseComponent): 12 | """Suggest a list of follow-up questions based on the chat history.""" 13 | 14 | llm: ChatLLM = Node(default_callback=lambda _: llms.get_default()) 15 | SUGGEST_QUESTIONS_PROMPT_TEMPLATE = ( 16 | "Based on the chat history above. " 17 | "your task is to generate 3 to 5 relevant follow-up questions. " 18 | "These questions should be simple, very concise, " 19 | "and designed to guide the conversation further. " 20 | "Respond in JSON format with 'questions' key. " 21 | "Answer using the language {lang} same as the question. 
" 22 | ) 23 | prompt_template: str = SUGGEST_QUESTIONS_PROMPT_TEMPLATE 24 | extra_prompt: str = """Example of valid response: 25 | ```json 26 | { 27 | "questions": ["the weather is good", "what's your favorite city"] 28 | } 29 | ```""" 30 | lang: str = "English" 31 | 32 | def run(self, chat_history: list[tuple[str, str]]) -> Document: 33 | prompt_template = PromptTemplate(self.prompt_template) 34 | prompt = prompt_template.populate(lang=self.lang) + self.extra_prompt 35 | 36 | messages = [] 37 | for human, ai in chat_history[-3:]: 38 | messages.append(HumanMessage(content=human)) 39 | messages.append(AIMessage(content=ai)) 40 | 41 | messages.append(HumanMessage(content=prompt)) 42 | 43 | return self.llm(messages) 44 | -------------------------------------------------------------------------------- /libs/ktem/ktem/rerankings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/ktem/rerankings/__init__.py -------------------------------------------------------------------------------- /libs/ktem/ktem/rerankings/db.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from ktem.db.engine import engine 4 | from sqlalchemy import JSON, Boolean, Column, String 5 | from sqlalchemy.orm import DeclarativeBase 6 | from theflow.settings import settings as flowsettings 7 | from theflow.utils.modules import import_dotted_string 8 | 9 | 10 | class Base(DeclarativeBase): 11 | pass 12 | 13 | 14 | class BaseRerankingTable(Base): 15 | """Base table to store rerankings model""" 16 | 17 | __abstract__ = True 18 | 19 | name = Column(String, primary_key=True, unique=True) 20 | spec = Column(JSON, default={}) 21 | default = Column(Boolean, default=False) 22 | 23 | 24 | __base_reranking: Type[BaseRerankingTable] = ( 25 | import_dotted_string(flowsettings.KH_TABLE_RERANKING, 
import re


def sync_retrieval_n_message(
    messages: list[list[str]],
    retrievals: list[str],
) -> list[str]:
    """Make the retrieval history exactly as long as the message history.

    Extra retrieval entries are truncated; missing ones are padded with
    empty strings.
    """
    n_message = len(messages)  # include previous history
    n_keep = min(n_message, len(retrievals))

    padding = [""] * (n_message - n_keep)
    synced = retrievals[:n_keep] + padding

    assert len(synced) == n_message

    return synced


def get_file_names_regex(input_str: str) -> tuple[list[str], str]:
    """Extract every @"file name" tag and strip the tags from the text.

    Returns:
        (list of file names, input text without the tags)
    """
    pattern = r'@"([^"]*)"'
    file_names = re.findall(pattern, input_str)
    remaining = re.sub(pattern, "", input_str).strip()

    return file_names, remaining


def get_urls(input_str: str) -> tuple[list[str], str]:
    """Extract every http(s) URL and strip the URLs from the text.

    Returns:
        (list of URLs, input text without the URLs)
    """
    pattern = r"https?://[^\s]+"
    urls = re.findall(pattern, input_str)
    remaining = re.sub(pattern, "", input_str).strip()

    return urls, remaining


if __name__ == "__main__":
    print(sync_retrieval_n_message([[""], [""], [""]], []))
def check_rate_limit(limit_type: str, request: gr.Request):
    """Enforce a per-user request quota for ``limit_type``.

    Resolves the signed-in user's email via gradiologin, then counts this
    request against an in-memory, per-process store (``rate_limit_store``).
    Raises ``ValueError`` when the request is missing, the user is not
    signed in, or the user is over ``RATE_LIMIT`` requests per
    ``RATE_LIMIT_PERIOD``.  Returns the user id (email) on success.

    NOTE(review): the store is process-local and unsynchronized — counts
    are not shared across workers and concurrent requests may race;
    confirm this is acceptable for the deployment.
    """
    if request is None:
        raise ValueError("This feature is not available")

    user_id = None
    try:
        # gradiologin is an optional dependency; fall through to the
        # "please sign in" error below when it is missing or when its
        # session lookup fails an internal assertion.
        import gradiologin as grlogin

        user = grlogin.get_user(request)
        if user:
            user_id = user.get("email")
    except (ImportError, AssertionError):
        pass

    if not user_id:
        raise ValueError("Please sign-in to use this feature")

    # Naive local time; window comparisons assume a stable local clock.
    now = datetime.now()
    user_data = rate_limit_store[limit_type].get(
        user_id, {"count": 0, "reset_time": now + RATE_LIMIT_PERIOD}
    )

    if now >= user_data["reset_time"]:
        # The previous window expired: start a fresh one for the user
        user_data = {"count": 0, "reset_time": now + RATE_LIMIT_PERIOD}

    if user_data["count"] >= RATE_LIMIT:
        raise ValueError("Rate limit exceeded. Please try again later.")

    # Increment the request count and persist it back to the store
    user_data["count"] += 1
    rate_limit_store[limit_type][user_id] = user_data

    return user_id
@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
)
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
    """End-to-end smoke test: index a PDF, retrieve, then run QA.

    OpenAI chat and embedding endpoints are mocked (decorator above and
    the ``mock_openai_embedding`` fixture), so no network access happens.
    """
    indexing_pipeline = ReaderIndexingPipeline(
        storage_path=tmp_path,
    )
    # dummy key; the embedding call itself is monkeypatched
    indexing_pipeline.indexing_vector_pipeline.embedding.openai_api_key = "some-key"
    input_file_path = Path(__file__).parent / "resources/dummy.pdf"

    # call ingestion pipeline (force_reindex so a prior run can't short-circuit)
    indexing_pipeline(input_file_path, force_reindex=True)
    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()

    results = retrieving_pipeline("This is a query")
    # dummy.pdf is expected to yield exactly one retrieved result here
    assert len(results) == 1

    # create llm (credentials are dummies; the completion call is patched)
    llm = AzureChatOpenAI(
        api_key="dummy",
        api_version="2024-05-01-preview",
        azure_deployment="gpt-4o",
        azure_endpoint="https://test.openai.azure.com/",
    )
    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
    response = qa_pipeline("Summarize this document.")
    assert response
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    context.configure(
        url=settings.KH_DATABASE,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.

    """
    # Copy the alembic .ini section and override the DB URL from the
    # application settings so both modes target KH_DATABASE.
    configuration = config.get_section(config.config_ini_section, {})
    configuration["sqlalchemy.url"] = settings.KH_DATABASE
    # NullPool: migrations are one-shot; no need to keep connections pooled
    connectable = engine_from_config(
        configuration, prefix="sqlalchemy.", poolclass=pool.NullPool
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


# Alembic selects the mode based on how the command was invoked
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
16 | revision: str = ${repr(up_revision)} 17 | down_revision: Union[str, None] = ${repr(down_revision)} 18 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} 19 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} 20 | 21 | 22 | def upgrade() -> None: 23 | ${upgrades if upgrades else "pass"} 24 | 25 | 26 | def downgrade() -> None: 27 | ${downgrades if downgrades else "pass"} 28 | -------------------------------------------------------------------------------- /libs/ktem/migrations/versions/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/libs/ktem/migrations/versions/.keep -------------------------------------------------------------------------------- /libs/ktem/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0", "wheel", "setuptools-git-versioning>=2.0,<3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | include-package-data = true 7 | packages.find.exclude = ["ktem_tests*", "env*"] 8 | packages.find.include = ["ktem*"] 9 | 10 | [tool.setuptools-git-versioning] 11 | enabled = true 12 | dev_template = "{tag}" 13 | dirty_template = "{tag}" 14 | tag_filter = "v?\\d+(\\.\\d+)*.*" 15 | 16 | [project] 17 | name = "ktem" 18 | dynamic = ["version"] 19 | requires-python = ">= 3.10" 20 | description = "RAG-based Question and Answering Application" 21 | dependencies = [ 22 | "click>=8.1.7,<9", 23 | "platformdirs>=4.2.1,<5", 24 | "pluggy>=1.5.0,<2", 25 | "python-decouple>=3.8,<4", 26 | "SQLAlchemy>=2.0.29,<3", 27 | "sqlmodel>=0.0.16,<0.1", 28 | "tiktoken>=0.6.0,<1", 29 | "gradio>=4.31.0,<5", 30 | "gradiologin", 31 | "python-multipart==0.0.12", # required for gradio, pinning to avoid yanking issues with micropip (fixed in gradio >= 5.4.0) 32 | "markdown>=3.6,<4", 33 | "tzlocal>=5.0", 
34 | ] 35 | authors = [ 36 | { name = "@trducng", email = "john@cinnamon.is" }, 37 | { name = "@lone17", email = "ian@cinnamon.is" }, 38 | { name = "@taprosoft", email = "tadashi@cinnamon.is" }, 39 | { name = "@cin-albert", email = "albert@cinnamon.is" }, 40 | ] 41 | classifiers = [ 42 | "Programming Language :: Python :: 3", 43 | "Operating System :: OS Independent", 44 | ] 45 | -------------------------------------------------------------------------------- /libs/ktem/requirements.txt: -------------------------------------------------------------------------------- 1 | platformdirs 2 | tzlocal 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 61.0", "wheel", "setuptools-git-versioning>=2.0,<3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools] 6 | include-package-data = false 7 | packages.find.include = [] 8 | 9 | [tool.setuptools-git-versioning] 10 | enabled = true 11 | dev_template = "{tag}" 12 | dirty_template = "{tag}" 13 | tag_filter = "v?\\d+(\\.\\d+)*.*" 14 | 15 | [project] 16 | name = "kotaemon-app" 17 | dynamic = ["version"] 18 | requires-python = ">= 3.10" 19 | description = "Kotaemon App" 20 | dependencies = [ 21 | "kotaemon @ git+https://github.com/Cinnamon/kotaemon.git@main#subdirectory=libs/kotaemon", 22 | "ktem @ git+https://github.com/Cinnamon/kotaemon.git@main#subdirectory=libs/ktem" 23 | ] 24 | authors = [ 25 | { name = "@trducng", email = "john@cinnamon.is" }, 26 | { name = "@lone17", email = "ian@cinnamon.is" }, 27 | { name = "@taprosoft", email = "tadashi@cinnamon.is" }, 28 | { name = "@cin-albert", email = "albert@cinnamon.is" }, 29 | ] 30 | classifiers = [ 31 | "Programming Language :: Python :: 3", 32 | "Operating System :: OS Independent", 33 | ] 34 | 35 | [project.urls] 36 | Homepage = "https://cinnamon.github.io/kotaemon/" 37 | 
# Download a zip archive from $1 and extract it into directory $2.
# Skips entirely when the destination directory already exists, so
# re-running the script is idempotent.
function download_and_unzip() {
    local url=$1
    local dest_dir=$2

    if [ -d "$dest_dir" ]; then
        echo "Destination directory $dest_dir already exists. Skipping download."
        return
    fi

    mkdir -p "$dest_dir"

    local zip_file="${dest_dir}/downloaded.zip"
    echo "Downloading $url to $zip_file"
    # -L follows the GitHub release redirect
    curl -L -o "$zip_file" "$url"

    echo "Unzipping $zip_file to $dest_dir"
    # -o overwrites existing files without prompting
    unzip -o "$zip_file" -d "$dest_dir"

    # keep only the extracted files, drop the archive
    rm "$zip_file"
    echo "Download and unzip completed successfully."
}
def serve_llamacpp_python(local_model_file: Path, **kwargs):
    """Launch a llama-cpp-python server for ``local_model_file``.

    Extra ``kwargs`` are forwarded to the platform launcher script as
    ``--key value`` CLI flags; the port defaults to 31415 when not given.
    """

    def guess_chat_format(local_model_file):
        # Best-effort chat-format override derived from the model file name.
        model_name = local_model_file.stem

        # handle known cases that the server backends handle incorrectly
        # this is highly heuristic and should be expanded later;
        # server backends usually have logic for this but they could still be wrong
        if "qwen" in model_name:
            return "qwen"

        return None

    # default port
    if "port" not in kwargs:
        kwargs["port"] = 31415

    chat_format = guess_chat_format(local_model_file)
    if chat_format:
        kwargs = {**kwargs, "chat_format": chat_format}

    # these scripts create a separate conda env and run the server
    if system_name == "Windows":
        script_file = this_dir / "server_llamacpp_windows.bat"
    elif system_name == "Linux":
        script_file = this_dir / "server_llamacpp_linux.sh"
    elif system_name == "Darwin":
        script_file = this_dir / "server_llamacpp_macos.sh"
    else:
        raise ValueError(f"Unsupported system: {system_name}")

    args = " ".join(f"--{k} {v}" for k, v in kwargs.items())

    # NOTE(review): shell=True with an interpolated path — acceptable for a
    # local dev helper with trusted input, but do not pass untrusted paths.
    cmd = f"{script_file} --model {local_model_file} {args}"
    subprocess.Popen(cmd, shell=True)
(y/n): ") 67 | 68 | if will_start_server.lower().strip() not in ["y", "yes"]: 69 | return 70 | 71 | print("Starting the local server...") 72 | if local_model_file.suffix == ".gguf": 73 | serve_llamacpp_python(local_model_file) 74 | else: 75 | raise ValueError(f"Unsupported model file type: {local_model_file.suffix}") 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /sso_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import gradiologin as grlogin 4 | from decouple import config 5 | from fastapi import FastAPI 6 | from fastapi.responses import FileResponse 7 | from theflow.settings import settings as flowsettings 8 | 9 | KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".") 10 | GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None) 11 | AUTHENTICATION_METHOD = config("AUTHENTICATION_METHOD", "GOOGLE") 12 | 13 | # override GRADIO_TEMP_DIR if it's not set 14 | if GRADIO_TEMP_DIR is None: 15 | GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp") 16 | os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR 17 | 18 | # for authentication with Google 19 | GOOGLE_CLIENT_ID = config("GOOGLE_CLIENT_ID", default="") 20 | GOOGLE_CLIENT_SECRET = config("GOOGLE_CLIENT_SECRET", default="") 21 | 22 | # for authentication with Open ID by keycloak 23 | KEYCLOAK_SERVER_URL = config("KEYCLOAK_SERVER_URL", default="") 24 | KEYCLOAK_REALM = config("KEYCLOAK_REALM", default="") 25 | KEYCLOAK_CLIENT_ID = config("KEYCLOAK_CLIENT_ID", default="") 26 | KEYCLOAK_CLIENT_SECRET = config("KEYCLOAK_CLIENT_SECRET", default="") 27 | 28 | from ktem.main import App # noqa 29 | 30 | gradio_app = App() 31 | demo = gradio_app.make() 32 | 33 | app = FastAPI() 34 | 35 | if AUTHENTICATION_METHOD == "KEYCLOAK": 36 | # for authentication with Open ID by keycloak 37 | grlogin.register( 38 | name="keycloak", 39 | server_metadata_url=( 40 | 
f"{KEYCLOAK_SERVER_URL}/realms/{KEYCLOAK_REALM}/" 41 | ".well-known/openid-configuration" 42 | ), 43 | client_id=KEYCLOAK_CLIENT_ID, 44 | client_secret=KEYCLOAK_CLIENT_SECRET, 45 | client_kwargs={ 46 | "scope": "openid email profile", 47 | }, 48 | ) 49 | 50 | else: 51 | # for authentication with Google 52 | grlogin.register( 53 | name="google", 54 | server_metadata_url=( 55 | "https://accounts.google.com/.well-known/openid-configuration" 56 | ), 57 | client_id=GOOGLE_CLIENT_ID, 58 | client_secret=GOOGLE_CLIENT_SECRET, 59 | client_kwargs={ 60 | "scope": "openid email profile", 61 | }, 62 | ) 63 | 64 | 65 | @app.get("/favicon.ico", include_in_schema=False) 66 | async def favicon(): 67 | return FileResponse(gradio_app._favicon) 68 | 69 | 70 | grlogin.mount_gradio_app( 71 | app, 72 | demo, 73 | "/app", 74 | allowed_paths=[ 75 | "libs/ktem/ktem/assets", 76 | GRADIO_TEMP_DIR, 77 | ], 78 | ) 79 | -------------------------------------------------------------------------------- /templates/component-default/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/templates/component-default/README.md -------------------------------------------------------------------------------- /templates/project-default/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "project_name": "prj_kotaemon", 3 | "ptl": "john" 4 | } 5 | -------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/.gitattributes: -------------------------------------------------------------------------------- 1 | .gitattributes text eol=lf 2 | .gitignore text eol=lf 3 | *.build text eol=lf 4 | *.c text eol=lf 5 | *.cmake text eol=lf 6 | *.cpp text eol=lf 7 | *.csv text eol=lf 8 | *.f text eol=lf 9 | *.f90 text eol=lf 10 | *.for text eol=lf 11 | *.grc text 
eol=lf 12 | *.h text eol=lf 13 | *.ipynb text eol=lf 14 | *.m text eol=lf 15 | *.md text eol=lf 16 | *.pas text eol=lf 17 | *.py text eol=lf 18 | *.rst text eol=lf 19 | *.sh text eol=lf 20 | *.txt text eol=lf 21 | *.yml text eol=lf 22 | Makefile text eol=lf 23 | *.html linguist-documentation 24 | -------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: check-toml 7 | - id: end-of-file-fixer 8 | - id: trailing-whitespace 9 | - id: detect-aws-credentials 10 | args: ["--allow-missing-credentials"] 11 | - id: detect-private-key 12 | - id: check-added-large-files 13 | - repo: https://github.com/ambv/black 14 | rev: 22.3.0 15 | hooks: 16 | - id: black 17 | language_version: python3 18 | - repo: https://github.com/pycqa/isort 19 | rev: 5.12.0 20 | hooks: 21 | - id: isort 22 | args: ["--profile", "black"] 23 | language_version: python3.10 24 | - repo: https://github.com/pycqa/flake8 25 | rev: 4.0.1 26 | hooks: 27 | - id: flake8 28 | args: ["--max-line-length", "88", "--extend-ignore", "E203"] 29 | - repo: https://github.com/myint/autoflake 30 | rev: v1.4 31 | hooks: 32 | - id: autoflake 33 | args: 34 | [ 35 | "--in-place", 36 | "--remove-unused-variables", 37 | "--remove-all-unused-imports", 38 | "--ignore-init-module-imports", 39 | "--exclude=tests/*", 40 | ] 41 | - repo: https://github.com/pre-commit/mirrors-prettier 42 | rev: v2.7.1 43 | hooks: 44 | - id: prettier 45 | types_or: [markdown, yaml] 46 | - repo: https://github.com/pre-commit/mirrors-mypy 47 | rev: "v1.5.1" 48 | hooks: 49 | - id: mypy 50 | additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"] 51 | args: ["--check-untyped-defs", "--ignore-missing-imports"] 52 | 
-------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Project {{ cookiecutter.project_name }} 4 | 5 | [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/Cinnamon/kotaemon) 6 | 7 |
- For project issues and errors, please report them in this repository's issue tracker.
/templates/project-default/{{cookiecutter.project_name}}/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/templates/project-default/{{cookiecutter.project_name}}/tests/__init__.py -------------------------------------------------------------------------------- /templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cinnamon/kotaemon/5132288386c0f544b00952eacb896a67ad842037/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py --------------------------------------------------------------------------------