├── .github ├── actions │ └── poetry_setup │ │ └── action.yml └── workflows │ ├── _lint.yml │ ├── _release.yml │ ├── _test.yml │ ├── ci.yml │ ├── doc_publish.yaml │ ├── release.yml │ └── tool_benchmarks.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── archived ├── csv-qa │ ├── README.md │ ├── custom_agent.py │ ├── data.csv │ ├── pandas_agent_gpt_35.py │ ├── pandas_agent_gpt_4.py │ ├── pandas_agent_instruct.py │ ├── pandas_ai.py │ ├── requirements.txt │ ├── result_35.png │ ├── results_4.png │ ├── results_custom.png │ ├── results_pandasai.png │ ├── streamlit_app.py │ ├── titanic.csv │ ├── titanic_data │ │ ├── index.faiss │ │ └── index.pkl │ └── upload_data.py ├── extraction │ ├── oppenheimer.txt │ ├── oppenheimer_short.txt │ ├── requirements.txt │ └── streamlit_app.py ├── langchain-docs-benchmarking │ ├── README.md │ ├── app │ │ ├── __init__.py │ │ └── server.py │ ├── example_custom_config.json │ ├── packages │ │ ├── README.md │ │ ├── anthropic-iterative-search │ │ │ ├── README.md │ │ │ ├── anthropic_iterative_search │ │ │ │ ├── __init__.py │ │ │ │ ├── agent_scratchpad.py │ │ │ │ ├── chain.py │ │ │ │ ├── output_parser.py │ │ │ │ ├── prompts.py │ │ │ │ ├── retriever.py │ │ │ │ └── retriever_agent.py │ │ │ ├── main.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ │ └── __init__.py │ │ ├── chat-langchain │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── chat_langchain │ │ │ │ ├── __init__.py │ │ │ │ └── chain.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ │ └── __init__.py │ │ ├── example │ │ │ └── custom_example │ │ │ │ └── example_custom_chain.py │ │ ├── langchain-docs-retriever │ │ │ ├── README.md │ │ │ ├── ingest_docs.py │ │ │ ├── langchain_docs_retriever │ │ │ │ ├── __init__.py │ │ │ │ ├── download_db.py │ │ │ │ └── retriever.py │ │ │ ├── poetry.lock │ │ │ └── pyproject.toml │ │ ├── oai-assistant │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── oai_assistant │ │ │ │ ├── __init__.py │ │ │ │ └── chain.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ │ └── __init__.py │ │ └── openai-functions-agent │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── main.py │ │ │ ├── openai_functions_agent │ │ │ ├── __init__.py │ │ │ └── agent.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ └── __init__.py │ ├── poetry.lock │ ├── prepare_dataset.py │ ├── pyproject.toml │ ├── run_evals.py │ └── run_experiments.py └── meta-evals │ ├── README.md │ └── correctness │ ├── README.md │ ├── __init__.py │ ├── _upload_dataset.py │ ├── data │ ├── Opus100-incorrect.json │ ├── Opus100.json │ ├── Web-Q&A-Dataset-Correct.json │ ├── Web-Q&A-Dataset-Incorrect.json │ ├── carb-IE-correct.json │ └── carb-IE-incorrect.json │ └── test_correctness_evaluator.py ├── docs ├── Makefile ├── make.bat └── source │ ├── .gitignore │ ├── _static │ └── parrot.png │ ├── conf.py │ ├── notebooks │ ├── datasets.ipynb │ ├── extraction │ │ ├── chat_extraction.ipynb │ │ ├── email.ipynb │ │ ├── high_cardinality.ipynb │ │ └── intro.ipynb │ ├── getting_started.ipynb │ ├── models.ipynb │ ├── retrieval │ │ ├── comparing_techniques.ipynb │ │ ├── intro.ipynb │ │ ├── langchain_docs_qa.ipynb │ │ ├── multi_modal_benchmarking │ │ │ ├── experiments │ │ │ │ └── gemini.ipynb │ │ │ ├── multi_modal_eval.ipynb │ │ │ └── multi_modal_eval_baseline.ipynb │ │ └── semi_structured_benchmarking │ │ │ ├── semi_structured.ipynb │ │ │ ├── ss_eval_chunk_sizes.ipynb │ │ │ ├── ss_eval_long_context.ipynb │ │ │ └── ss_eval_multi_vector.ipynb │ ├── run_without_langsmith.ipynb │ └── 
tool_usage │ │ ├── benchmark_all_tasks.ipynb │ │ ├── intro.ipynb │ │ ├── multiverse_math.ipynb │ │ ├── multiverse_math_benchmark.ipynb │ │ ├── oss_experiments │ │ └── mixtral_experiments.ipynb │ │ ├── query_analysis.ipynb │ │ ├── relational_data.ipynb │ │ ├── typewriter_1.ipynb │ │ └── typewriter_26.ipynb │ └── toc.segment ├── langchain_benchmarks ├── .gitignore ├── __init__.py ├── extraction │ ├── __init__.py │ ├── evaluators.py │ ├── implementations.py │ └── tasks │ │ ├── __init__.py │ │ ├── chat_extraction │ │ ├── __init__.py │ │ ├── evaluators.py │ │ └── schema.py │ │ ├── email_task.py │ │ └── high_cardinality │ │ ├── __init__.py │ │ └── name_correction.py ├── model_registration.py ├── rag │ ├── .gitignore │ ├── __init__.py │ ├── evaluators.py │ ├── tasks │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── langchain_docs │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _ingest_docs.py │ │ │ ├── architectures │ │ │ │ ├── __init__.py │ │ │ │ ├── chain_registry.py │ │ │ │ └── crqa.py │ │ │ ├── indexing │ │ │ │ ├── .gitignore │ │ │ │ ├── __init__.py │ │ │ │ └── retriever_registry.py │ │ │ └── task.py │ │ ├── multi_modal_slide_decks │ │ │ ├── __init__.py │ │ │ ├── indexing │ │ │ │ ├── __init__.py │ │ │ │ └── retriever_registry.py │ │ │ └── task.py │ │ └── semi_structured_reports │ │ │ ├── __init__.py │ │ │ ├── indexing │ │ │ ├── .gitignore │ │ │ ├── __init__.py │ │ │ └── retriever_registry.py │ │ │ └── task.py │ └── utils │ │ ├── __init__.py │ │ ├── _downloading.py │ │ └── indexing.py ├── rate_limiting.py ├── registration.py ├── schema.py ├── tool_usage │ ├── README.md │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── adapters.py │ │ ├── base.py │ │ ├── runnable_agent.py │ │ └── tool_using_agent.py │ ├── evaluators.py │ ├── prompts.py │ └── tasks │ │ ├── __init__.py │ │ ├── multiverse_math.py │ │ ├── query_analysis.py │ │ ├── relational_data.py │ │ ├── type_writer.py │ │ └── type_writer_26_funcs.py └── utils │ ├── __init__.py │ └── _langsmith.py ├── poetry.lock ├── pyproject.toml ├── scripts ├── check_datasets.py ├── multiverse_math_benchmark.py └── query_analysis_benchmark.py ├── security.md └── tests ├── __init__.py └── unit_tests ├── __init__.py ├── extraction ├── __init__.py ├── test_email_extraction.py └── test_import_stuff.py ├── rag ├── __init__.py └── test_langchain_docs.py ├── test_model_registry.py ├── test_public_api.py ├── test_rate_limiting.py ├── test_utils.py └── tool_usage ├── __init__.py ├── test_evaluator.py ├── test_multiverse_math.py ├── test_public_api.py └── test_tool_usage.py /.github/actions/poetry_setup/action.yml: -------------------------------------------------------------------------------- 1 | # An action for setting up poetry install with caching. 2 | # Using a custom action since the default action does not 3 | # take poetry install groups into account. 4 | # Action code from: 5 | # https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 6 | name: poetry-install-with-caching 7 | description: Poetry install with support for caching of dependency groups. 
8 | 9 | inputs: 10 | python-version: 11 | description: Python version, supporting MAJOR.MINOR only 12 | required: true 13 | 14 | poetry-version: 15 | description: Poetry version 16 | required: true 17 | 18 | cache-key: 19 | description: Cache key to use for manual handling of caching 20 | required: true 21 | 22 | working-directory: 23 | description: Directory whose poetry.lock file should be cached 24 | required: true 25 | 26 | runs: 27 | using: composite 28 | steps: 29 | - uses: actions/setup-python@v4 30 | name: Setup python ${{ inputs.python-version }} 31 | with: 32 | python-version: ${{ inputs.python-version }} 33 | 34 | - uses: actions/cache@v3 35 | id: cache-bin-poetry 36 | name: Cache Poetry binary - Python ${{ inputs.python-version }} 37 | env: 38 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "1" 39 | with: 40 | path: | 41 | /opt/pipx/venvs/poetry 42 | # This step caches the poetry installation, so make sure it's keyed on the poetry version as well. 43 | key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }} 44 | 45 | - name: Refresh shell hashtable and fixup softlinks 46 | if: steps.cache-bin-poetry.outputs.cache-hit == 'true' 47 | shell: bash 48 | env: 49 | POETRY_VERSION: ${{ inputs.poetry-version }} 50 | PYTHON_VERSION: ${{ inputs.python-version }} 51 | run: | 52 | set -eux 53 | 54 | # Refresh the shell hashtable, to ensure correct `which` output. 55 | hash -r 56 | 57 | # `actions/cache@v3` doesn't always seem able to correctly unpack softlinks. 58 | # Delete and recreate the softlinks pipx expects to have. 59 | rm /opt/pipx/venvs/poetry/bin/python 60 | cd /opt/pipx/venvs/poetry/bin 61 | ln -s "$(which "python$PYTHON_VERSION")" python 62 | chmod +x python 63 | cd /opt/pipx_bin/ 64 | ln -s /opt/pipx/venvs/poetry/bin/poetry poetry 65 | chmod +x poetry 66 | 67 | # Ensure everything got set up correctly. 68 | /opt/pipx/venvs/poetry/bin/python --version 69 | /opt/pipx_bin/poetry --version 70 | 71 | - name: Install poetry 72 | if: steps.cache-bin-poetry.outputs.cache-hit != 'true' 73 | shell: bash 74 | env: 75 | POETRY_VERSION: ${{ inputs.poetry-version }} 76 | PYTHON_VERSION: ${{ inputs.python-version }} 77 | run: pipx install "poetry==$POETRY_VERSION" --python "python$PYTHON_VERSION" --verbose 78 | 79 | - name: Restore pip and poetry cached dependencies 80 | uses: actions/cache@v3 81 | env: 82 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "4" 83 | WORKDIR: ${{ inputs.working-directory == '' && '.' || inputs.working-directory }} 84 | with: 85 | path: | 86 | ~/.cache/pip 87 | ~/.cache/pypoetry/virtualenvs 88 | ~/.cache/pypoetry/cache 89 | ~/.cache/pypoetry/artifacts 90 | ${{ env.WORKDIR }}/.venv 91 | key: py-deps-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ inputs.cache-key }}-${{ hashFiles(format('{0}/**/poetry.lock', env.WORKDIR)) }} 92 | -------------------------------------------------------------------------------- /.github/workflows/_lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | working-directory: 7 | required: true 8 | type: string 9 | description: "From which folder this pipeline executes" 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | WORKDIR: ${{ inputs.working-directory == '' && '.' 
|| inputs.working-directory }} 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | env: 19 | # This number is set "by eye": we want it to be big enough 20 | # so that it's bigger than the number of commits in any reasonable PR, 21 | # and also as small as possible since increasing the number makes 22 | # the initial `git fetch` slower. 23 | FETCH_DEPTH: 50 24 | strategy: 25 | matrix: 26 | # Only lint on the min and max supported Python versions. 27 | # It's extremely unlikely that there's a lint issue on any version in between 28 | # that doesn't show up on the min or max versions. 29 | # 30 | # GitHub rate-limits how many jobs can be running at any one time. 31 | # Starting new jobs is also relatively slow, 32 | # so linting on fewer versions makes CI faster. 33 | python-version: 34 | - "3.8" 35 | - "3.11" 36 | steps: 37 | - uses: actions/checkout@v3 38 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 39 | uses: "./.github/actions/poetry_setup" 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | poetry-version: ${{ env.POETRY_VERSION }} 43 | working-directory: ${{ inputs.working-directory }} 44 | cache-key: lint-with-extras 45 | 46 | - name: Check Poetry File 47 | shell: bash 48 | working-directory: ${{ inputs.working-directory }} 49 | run: | 50 | poetry check 51 | 52 | - name: Check lock file 53 | shell: bash 54 | working-directory: ${{ inputs.working-directory }} 55 | run: | 56 | poetry lock --check 57 | 58 | - name: Install dependencies 59 | # Also installs dev/lint/test/typing dependencies, to ensure we have 60 | # type hints for as many of our libraries as possible. 61 | # This helps catch errors that require dependencies to be spotted, for example: 62 | # https://github.com/langchain-ai/langchain/pull/10249/files#diff-935185cd488d015f026dcd9e19616ff62863e8cde8c0bee70318d3ccbca98341 63 | # 64 | # If you change this configuration, make sure to change the `cache-key` 65 | # in the `poetry_setup` action above to stop using the old cache. 66 | # It doesn't matter how you change it, any change will cause a cache-bust. 67 | working-directory: ${{ inputs.working-directory }} 68 | run: | 69 | poetry install --with dev,lint,test,typing 70 | 71 | - name: Get .mypy_cache to speed up mypy 72 | uses: actions/cache@v3 73 | env: 74 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "2" 75 | with: 76 | path: | 77 | ${{ env.WORKDIR }}/.mypy_cache 78 | key: mypy-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }}-${{ inputs.working-directory }}-${{ hashFiles(format('{0}/poetry.lock', env.WORKDIR)) }} 79 | 80 | - name: Analysing the code with our lint 81 | working-directory: ${{ inputs.working-directory }} 82 | run: | 83 | make lint 84 | -------------------------------------------------------------------------------- /.github/workflows/_release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | working-directory: 7 | required: true 8 | type: string 9 | description: "From which folder this pipeline executes" 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | 14 | jobs: 15 | if_release: 16 | # Disallow publishing from branches that aren't `main`. 
17 | if: github.ref == 'refs/heads/main' 18 | runs-on: ubuntu-latest 19 | permissions: 20 | # This permission is used for trusted publishing: 21 | # https://blog.pypi.org/posts/2023-04-20-introducing-trusted-publishers/ 22 | # 23 | # Trusted publishing has to also be configured on PyPI for each package: 24 | # https://docs.pypi.org/trusted-publishers/adding-a-publisher/ 25 | id-token: write 26 | 27 | # This permission is needed by `ncipollo/release-action` to create the GitHub release. 28 | contents: write 29 | defaults: 30 | run: 31 | working-directory: ${{ inputs.working-directory }} 32 | steps: 33 | - uses: actions/checkout@v3 34 | 35 | - name: Set up Python + Poetry ${{ env.POETRY_VERSION }} 36 | uses: "./.github/actions/poetry_setup" 37 | with: 38 | python-version: "3.10" 39 | poetry-version: ${{ env.POETRY_VERSION }} 40 | working-directory: ${{ inputs.working-directory }} 41 | cache-key: release 42 | 43 | - name: Build project for distribution 44 | run: poetry build 45 | - name: Check Version 46 | id: check-version 47 | run: | 48 | echo version=$(poetry version --short) >> $GITHUB_OUTPUT 49 | - name: Create Release 50 | uses: ncipollo/release-action@v1 51 | with: 52 | artifacts: "dist/*" 53 | token: ${{ secrets.GITHUB_TOKEN }} 54 | draft: false 55 | generateReleaseNotes: true 56 | tag: v${{ steps.check-version.outputs.version }} 57 | commit: main 58 | - name: Publish package distributions to PyPI 59 | uses: pypa/gh-action-pypi-publish@release/v1 60 | with: 61 | packages-dir: ${{ inputs.working-directory }}/dist/ 62 | verbose: true 63 | print-hash: true 64 | -------------------------------------------------------------------------------- /.github/workflows/_test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | working-directory: 7 | required: true 8 | type: string 9 | description: "From which folder this pipeline executes" 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | 14 | jobs: 15 | build: 16 | defaults: 17 | run: 18 | working-directory: ${{ inputs.working-directory }} 19 | runs-on: ubuntu-latest 20 | strategy: 21 | matrix: 22 | python-version: 23 | - "3.8" 24 | - "3.9" 25 | - "3.10" 26 | - "3.11" 27 | name: Python ${{ matrix.python-version }} 28 | steps: 29 | - uses: actions/checkout@v3 30 | 31 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 32 | uses: "./.github/actions/poetry_setup" 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | poetry-version: ${{ env.POETRY_VERSION }} 36 | working-directory: ${{ inputs.working-directory }} 37 | cache-key: core 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: poetry install 42 | 43 | - name: Run core tests 44 | shell: bash 45 | run: make test 46 | 47 | - name: Ensure the tests did not create any additional files 48 | shell: bash 49 | run: | 50 | set -eu 51 | 52 | STATUS="$(git status)" 53 | echo "$STATUS" 54 | 55 | # grep will exit non-zero if the target message isn't found, 56 | # and `set -e` above will cause the step to fail. 
57 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 58 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Run CI Tests 3 | 4 | on: 5 | push: 6 | branches: [ main ] 7 | pull_request: 8 | paths-ignore: 9 | - 'README.md' 10 | workflow_dispatch: # Allows to trigger the workflow manually in GitHub UI 11 | 12 | # If another push to the same PR or branch happens while this workflow is still running, 13 | # cancel the earlier run in favor of the next run. 14 | # 15 | # There's no point in testing an outdated version of the code. GitHub only allows 16 | # a limited number of job runners to be active at the same time, so it's better to cancel 17 | # pointless jobs early so that more useful jobs can run sooner. 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.ref }} 20 | cancel-in-progress: true 21 | 22 | env: 23 | POETRY_VERSION: "1.5.1" 24 | WORKDIR: "." 25 | 26 | jobs: 27 | lint: 28 | uses: 29 | ./.github/workflows/_lint.yml 30 | with: 31 | working-directory: . 32 | secrets: inherit 33 | 34 | test: 35 | timeout-minutes: 5 36 | runs-on: ubuntu-latest 37 | defaults: 38 | run: 39 | working-directory: ${{ env.WORKDIR }} 40 | strategy: 41 | matrix: 42 | python-version: 43 | - "3.8" 44 | - "3.9" 45 | - "3.10" 46 | - "3.11" 47 | name: Python ${{ matrix.python-version }} tests 48 | steps: 49 | - uses: actions/checkout@v3 50 | 51 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 52 | uses: "./.github/actions/poetry_setup" 53 | with: 54 | python-version: ${{ matrix.python-version }} 55 | poetry-version: ${{ env.POETRY_VERSION }} 56 | working-directory: . 57 | cache-key: benchmarks-all 58 | 59 | - name: Install dependencies 60 | shell: bash 61 | run: | 62 | echo "Running tests, installing dependencies with poetry..." 63 | poetry install --with test,lint,typing,docs 64 | 65 | - name: Run tests 66 | run: make test 67 | 68 | - name: Ensure the tests did not create any additional files 69 | shell: bash 70 | run: | 71 | set -eu 72 | 73 | STATUS="$(git status)" 74 | echo "$STATUS" 75 | 76 | # grep will exit non-zero if the target message isn't found, 77 | # and `set -e` above will cause the step to fail. 78 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 79 | test_docs: 80 | timeout-minutes: 5 81 | runs-on: ubuntu-latest 82 | defaults: 83 | run: 84 | working-directory: ${{ env.WORKDIR }} 85 | strategy: 86 | matrix: 87 | python-version: 88 | - "3.11" 89 | name: Documentation Build for Python ${{ matrix.python-version }} 90 | steps: 91 | - uses: actions/checkout@v3 92 | 93 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 94 | uses: "./.github/actions/poetry_setup" 95 | with: 96 | python-version: ${{ matrix.python-version }} 97 | poetry-version: ${{ env.POETRY_VERSION }} 98 | working-directory: . 99 | cache-key: benchmarks-all 100 | 101 | - name: Install dependencies 102 | shell: bash 103 | run: | 104 | echo "Running tests, installing dependencies with poetry..." 105 | poetry install --with test,lint,typing,docs 106 | 107 | - name: Test Sphinx Docs 108 | shell: bash 109 | run: | 110 | echo "Attempting to build docs..." 
111 | make docs_build 112 | test_datasets: 113 | timeout-minutes: 5 114 | runs-on: ubuntu-latest 115 | defaults: 116 | run: 117 | working-directory: ${{ env.WORKDIR }} 118 | strategy: 119 | matrix: 120 | python-version: 121 | - "3.11" 122 | name: Validate Public Datasets 123 | steps: 124 | - uses: actions/checkout@v3 125 | 126 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 127 | uses: "./.github/actions/poetry_setup" 128 | with: 129 | python-version: ${{ matrix.python-version }} 130 | poetry-version: ${{ env.POETRY_VERSION }} 131 | working-directory: . 132 | cache-key: benchmarks-all 133 | 134 | - name: Install dependencies 135 | shell: bash 136 | run: | 137 | echo "Running tests, installing dependencies with poetry..." 138 | poetry install --with test,lint,typing,docs 139 | 140 | - name: Request datasets 141 | shell: bash 142 | run: | 143 | echo "Attempting to build docs..." 144 | poetry run python -m scripts.check_datasets -------------------------------------------------------------------------------- /.github/workflows/doc_publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docs 2 | on: [workflow_dispatch] 3 | permissions: 4 | contents: write 5 | 6 | env: 7 | POETRY_VERSION: "1.6.1" 8 | 9 | jobs: 10 | docs: 11 | strategy: 12 | matrix: 13 | python-version: 14 | - "3.11" 15 | runs-on: ubuntu-latest 16 | name: Documentation Publish 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 21 | uses: "./.github/actions/poetry_setup" 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | poetry-version: ${{ env.POETRY_VERSION }} 25 | working-directory: . 26 | cache-key: benchmarks-all 27 | 28 | - name: Install dependencies 29 | shell: bash 30 | run: | 31 | echo "Running tests, installing dependencies with poetry..." 32 | poetry install --with test,lint,typing,docs 33 | 34 | - name: Sphinx build 35 | shell: bash 36 | run: | 37 | make docs_build 38 | - name: Publish Docs 39 | uses: peaceiris/actions-gh-pages@v3 40 | with: 41 | publish_branch: gh-pages 42 | github_token: ${{ secrets.GITHUB_TOKEN }} 43 | publish_dir: ./docs/build 44 | force_orphan: true 45 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Publish Package to PyPi 3 | 4 | on: 5 | workflow_dispatch: # Allows to trigger the workflow manually in GitHub UI 6 | 7 | jobs: 8 | release: 9 | uses: 10 | ./.github/workflows/_release.yml 11 | permissions: write-all 12 | with: 13 | working-directory: . 
14 | secrets: inherit 15 | -------------------------------------------------------------------------------- /.github/workflows/tool_benchmarks.yml: -------------------------------------------------------------------------------- 1 | name: Weekly Tool Benchmarks 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '0 0 * * 0' # Runs at midnight (00:00) every Sunday (UTC time) 7 | 8 | env: 9 | POETRY_VERSION: "1.6.1" 10 | LANGCHAIN_API_KEY: ${{ secrets.LANGCHAIN_API_KEY }} 11 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 12 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 13 | 14 | jobs: 15 | run_tool_benchmarks: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Python 3.12 + Poetry ${{ env.POETRY_VERSION }} 22 | uses: "./.github/actions/poetry_setup" 23 | with: 24 | python-version: '3.12' 25 | poetry-version: ${{ env.POETRY_VERSION }} 26 | working-directory: . 27 | cache-key: benchmarks-all 28 | 29 | - name: Install dependencies 30 | shell: bash 31 | run: | 32 | echo "Running tests, installing dependencies with poetry..." 33 | poetry install --with test,lint,typing,docs 34 | 35 | - name: Multiverse math benchmark 36 | 37 | run: | 38 | cd scripts 39 | poetry run python multiverse_math_benchmark.py 40 | 41 | - name: Query analysis benchmark 42 | run: | 43 | cd scripts 44 | poetry run python query_analysis_benchmark.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | .idea/ 162 | .DS_Store 163 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Langchain AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all lint format test help 2 | 3 | # Default target executed when no arguments are given to make. 4 | all: help 5 | 6 | # LINTING AND FORMATTING: 7 | 8 | # Define a variable for Python and notebook files. 9 | lint format: PYTHON_FILES=. 10 | lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=. --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$') 11 | 12 | lint lint_diff: 13 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff 14 | # [ "$(PYTHON_FILES)" = "" ] || poetry run mypy $(PYTHON_FILES) 15 | 16 | format format_diff: 17 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) 18 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES) 19 | 20 | spell_check: 21 | poetry run codespell --toml pyproject.toml 22 | 23 | spell_fix: 24 | poetry run codespell --toml pyproject.toml -w 25 | 26 | 27 | # TESTING AND COVERAGE: 28 | 29 | # Define a variable for the test file path. 30 | TEST_FILE ?= tests/unit_tests/ 31 | 32 | test: 33 | poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE) 34 | 35 | test_watch: 36 | poetry run ptw . -- $(TEST_FILE) 37 | 38 | 39 | # DOCUMENTATION: 40 | 41 | docs_clean: 42 | rm -rf ./docs/build 43 | 44 | docs_build: 45 | # Copy README.md to docs/index.md 46 | cp README.md ./docs/source/index.md 47 | # Append to the table of contents the contents of the file 48 | cat ./docs/source/toc.segment >> ./docs/source/index.md 49 | poetry run sphinx-build "./docs/source" "./docs/build" 50 | 51 | 52 | # HELP: 53 | help: 54 | @echo '' 55 | @echo 'LINTING:' 56 | @echo ' format - run code formatters' 57 | @echo ' lint - run linters' 58 | @echo ' spell_check - run codespell' 59 | @echo ' spell_fix - run codespell and fix the errors' 60 | @echo 'TESTS:' 61 | @echo ' test - run unit tests' 62 | @echo ' test TEST_FILE= - run tests in ' 63 | @echo ' coverage - run unit tests and generate coverage report' 64 | @echo 'DOCUMENTATION:' 65 | @echo ' docs_clean - delete the docs/build directory' 66 | @echo ' docs_build - build the documentation' 67 | @echo '' 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🦜💯 LangChain Benchmarks 2 | 3 | [![Release Notes](https://img.shields.io/github/release/langchain-ai/langchain-benchmarks)](https://github.com/langchain-ai/langchain-benchmarks/releases) 4 | [![CI](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml/badge.svg)](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/langchainai.svg?style=social&label=Follow%20%40LangChainAI)](https://twitter.com/langchainai) 7 | [![](https://dcbadge.vercel.app/api/server/6adMQxSpJS?compact=true&style=flat)](https://discord.gg/6adMQxSpJS) 8 | [![Open Issues](https://img.shields.io/github/issues-raw/langchain-ai/langchain-benchmarks)](https://github.com/langchain-ai/langchain-benchmarks/issues) 9 | 10 | 11 | [📖 Documentation](https://langchain-ai.github.io/langchain-benchmarks/index.html) 12 | 13 | A package to help benchmark various LLM related tasks. 
14 | 15 | The benchmarks are organized by end-to-end use cases, and 16 | utilize [LangSmith](https://smith.langchain.com/) heavily. 17 | 18 | We have several goals in open sourcing this: 19 | 20 | - Showing how we collect our benchmark datasets for each task 21 | - Showing which benchmark datasets we use for each task 22 | - Showing how we evaluate each task 23 | - Encouraging others to benchmark their solutions on these tasks (we are always looking for better ways of doing things!) 24 | 25 | ## Benchmarking Results 26 | 27 | Read some of the articles about benchmarking results on our blog. 28 | 29 | * [Agent Tool Use](https://blog.langchain.dev/benchmarking-agent-tool-use/) 30 | * [Query Analysis in High Cardinality Situations](https://blog.langchain.dev/high-cardinality/) 31 | * [RAG on Tables](https://blog.langchain.dev/benchmarking-rag-on-tables/) 32 | * [Q&A over CSV data](https://blog.langchain.dev/benchmarking-question-answering-over-csv-data/) 33 | 34 | 35 | ### Tool Usage (2024-04-18) 36 | 37 | See [tool usage docs](https://langchain-ai.github.io/langchain-benchmarks/notebooks/tool_usage/benchmark_all_tasks.html) to recreate! 38 | 39 | ![download](https://github.com/langchain-ai/langchain-benchmarks/assets/3205522/0da33de8-e03f-49cf-bd48-e9ff945828a9) 40 | 41 | Explore Agent Traces on LangSmith: 42 | 43 | * [Relational Data](https://smith.langchain.com/public/22721064-dcf6-4e42-be65-e7c46e6835e7/d) 44 | * [Tool Usage (1-tool)](https://smith.langchain.com/public/ac23cb40-e392-471f-b129-a893a77b6f62/d) 45 | * [Tool Usage (26-tools)](https://smith.langchain.com/public/366bddca-62b3-4b6e-849b-a478abab73db/d) 46 | * [Multiverse Math](https://smith.langchain.com/public/983faff2-54b9-4875-9bf2-c16913e7d489/d) 47 | 48 | ## Installation 49 | 50 | To install the packages, run the following command: 51 | 52 | ```bash 53 | pip install -U langchain-benchmarks 54 | ``` 55 | 56 | All the benchmarks come with an associated benchmark dataset stored in [LangSmith](https://smith.langchain.com). To take advantage of the eval and debugging experience, [sign up](https://smith.langchain.com), and set your API key in your environment: 57 | 58 | ```bash 59 | export LANGCHAIN_API_KEY=ls-... 60 | ``` 61 | 62 | ## Repo Structure 63 | 64 | The package is located within [langchain_benchmarks](./langchain_benchmarks/). Check out the [docs](https://langchain-ai.github.io/langchain-benchmarks/index.html) for information on how to get started. 65 | 66 | The other directories are legacy and may be moved in the future. 67 | 68 | 69 | ## Archived 70 | 71 | Below are archived benchmarks that require cloning this repo to run. 
72 | 73 | - [CSV Question Answering](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/csv-qa) 74 | - [Extraction](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/extraction) 75 | - [Q&A over the LangChain docs](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/langchain-docs-benchmarking) 76 | - [Meta-evaluation of 'correctness' evaluators](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/meta-evals) 77 | 78 | 79 | ## Related 80 | 81 | - For cookbooks on other ways to test, debug, monitor, and improve your LLM applications, check out the [LangSmith docs](https://docs.smith.langchain.com/) 82 | - For information on building with LangChain, check out the [python documentation](https://python.langchain.com/docs/get_started/introduction) or [JS documentation](https://js.langchain.com/docs/get_started/introduction) 83 | 84 | -------------------------------------------------------------------------------- /archived/csv-qa/README.md: -------------------------------------------------------------------------------- 1 | # CSV Question Answering 2 | 3 | This module shows how we benchmark question answering over CSV data. 4 | There are several components: 5 | 6 | ## Setup 7 | 8 | To set up, you should install all required packages: 9 | 10 | ```shell 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | You then need to set environment variables. 15 | This heavily uses [LangSmith](https://smith.langchain.com/), so you need to set those environment variables: 16 | 17 | ```shell 18 | export LANGCHAIN_TRACING_V2="true" 19 | export LANGCHAIN_ENDPOINT=https://api.langchain.plus 20 | export LANGCHAIN_API_KEY=... 21 | ``` 22 | 23 | This also uses OpenAI, so you need to set that environment variable: 24 | 25 | ````shell 26 | export OPENAI_API_KEY=... 27 | ```` 28 | 29 | ## How we collected data 30 | 31 | To do this, we set up a simple streamlit app that was logging questions, answers, and feedback to LangSmith. 32 | We then annotated examples in [LangSmith](https://smith.langchain.com/) and added them to a dataset we were creating. 33 | For more details on how to do this generally, see [this cookbook](https://github.com/langchain-ai/langsmith-cookbook/tree/main/feedback-examples/streamlit). 34 | 35 | When doing this, you probably want to specify a project for all runs to be logged to: 36 | 37 | ```shell 38 | export LANGCHAIN_PROJECT="Titanic CSV" 39 | ``` 40 | 41 | The [`streamlit_app.py`](streamlit_app.py) file contains the exact code used to run the application. 42 | You can run this with `streamlit run streamlit_app.py`. 43 | 44 | ## What the data is 45 | 46 | See [`data.csv`](data.csv) for the data points we labeled. 47 | 48 | ## How we evaluate 49 | 50 | In order to evaluate, we first upload our data to [LangSmith](https://smith.langchain.com/), with dataset name `Titanic CSV Data`. 51 | This is done in [`upload_data.py`](upload_data.py). You can run this with: 52 | 53 | ```shell 54 | python upload_data.py 55 | ``` 56 | 57 | This allows us to track different evaluation runs against this dataset. 58 | We then use a standard `qa` evaluator to evaluate whether the generated answers are correct or not. 
59 | 60 | We include scripts for evaluating a few different methods: 61 | 62 | ## [Pandas Agent, GPT-3.5](pandas_agent_gpt_35.py) 63 | 64 | Run with `python pandas_agent_gpt_35.py` 65 | 66 | Results: 67 | 68 | ![results_35.png](result_35.png) 69 | 70 | ## [Pandas Agent, GPT-4](pandas_agent_gpt_4.py) 71 | 72 | Run with `python pandas_agent_gpt_4.py` 73 | 74 | Results: 75 | 76 | ![results_4.png](results_4.png) 77 | 78 | ## [PandasAI](pandas_ai.py) 79 | 80 | You need to install more packages: 81 | 82 | ```shell 83 | pip install beautifulsoup4 pandasai 84 | ``` 85 | Then you can run with `python pandas_ai.py` 86 | 87 | Results (note token tracking is off because not using LangChain): 88 | 89 | ![results_pandasai.png](results_pandasai.png) 90 | 91 | ## [Custom Agent](custom_agent.py) 92 | 93 | A custom agent equipped with a custom prompt and some custom tools (Python REPL and vectorstore). 94 | 95 | Run with `python custom_agent.py` 96 | 97 | Results: 98 | 99 | ![results_custom.png](results_custom.png) 100 | -------------------------------------------------------------------------------- /archived/csv-qa/custom_agent.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents import AgentExecutor, OpenAIFunctionsAgent 3 | from langchain.agents.agent_toolkits.conversational_retrieval.tool import ( 4 | create_retriever_tool, 5 | ) 6 | from langchain.embeddings import OpenAIEmbeddings 7 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 8 | from langchain.smith import RunEvalConfig, run_on_dataset 9 | from langchain.tools import PythonAstREPLTool 10 | from langchain.vectorstores import FAISS 11 | from langchain_openai import ChatOpenAI 12 | from langsmith import Client 13 | from pydantic import BaseModel, Field 14 | 15 | pd.set_option("display.max_rows", 20) 16 | pd.set_option("display.max_columns", 20) 17 | 18 | embedding_model = OpenAIEmbeddings() 19 | vectorstore = FAISS.load_local("titanic_data", embedding_model) 20 | retriever_tool = create_retriever_tool( 21 | vectorstore.as_retriever(), "person_name_search", "Search for a person by name" 22 | ) 23 | 24 | 25 | TEMPLATE = """You are working with a pandas dataframe in Python. The name of the dataframe is `df`. 26 | It is important to understand the attributes of the dataframe before working with it. This is the result of running `df.head().to_markdown()` 27 | 28 | 29 | {dhead} 30 | 31 | 32 | You are not meant to use only these rows to answer questions - they are meant as a way of telling you about the shape and schema of the dataframe. 33 | You also do not have use only the information here to answer questions - you can run intermediate queries to do exporatory data analysis to give you more information as needed. 34 | 35 | You have a tool called `person_name_search` through which you can lookup a person by name and find the records corresponding to people with similar name as the query. 36 | You should only really use this if your search term contains a persons name. Otherwise, try to solve it with code. 37 | 38 | For example: 39 | 40 | How old is Jane? 41 | Use `person_name_search` since you can use the query `Jane` 42 | 43 | Who has id 320 44 | Use `python_repl` since even though the question is about a person, you don't know their name so you can't include it. 
45 | """ 46 | 47 | 48 | class PythonInputs(BaseModel): 49 | query: str = Field(description="code snippet to run") 50 | 51 | 52 | if __name__ == "__main__": 53 | df = pd.read_csv("titanic.csv") 54 | template = TEMPLATE.format(dhead=df.head().to_markdown()) 55 | 56 | prompt = ChatPromptTemplate.from_messages( 57 | [ 58 | ("system", template), 59 | MessagesPlaceholder(variable_name="agent_scratchpad"), 60 | ("human", "{input}"), 61 | ] 62 | ) 63 | 64 | def get_chain(): 65 | repl = PythonAstREPLTool( 66 | locals={"df": df}, 67 | name="python_repl", 68 | description="Runs code and returns the output of the final line", 69 | args_schema=PythonInputs, 70 | ) 71 | tools = [repl, retriever_tool] 72 | agent = OpenAIFunctionsAgent( 73 | llm=ChatOpenAI(temperature=0, model="gpt-4"), prompt=prompt, tools=tools 74 | ) 75 | agent_executor = AgentExecutor( 76 | agent=agent, tools=tools, max_iterations=5, early_stopping_method="generate" 77 | ) 78 | return agent_executor 79 | 80 | client = Client() 81 | eval_config = RunEvalConfig( 82 | evaluators=["qa"], 83 | ) 84 | chain_results = run_on_dataset( 85 | client, 86 | dataset_name="Titanic CSV Data", 87 | llm_or_chain_factory=get_chain, 88 | evaluation=eval_config, 89 | ) 90 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_agent_gpt_35.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents.agent_toolkits import create_pandas_dataframe_agent 3 | from langchain.agents.agent_types import AgentType 4 | from langchain.smith import RunEvalConfig, run_on_dataset 5 | from langchain_openai import ChatOpenAI 6 | from langsmith import Client 7 | 8 | if __name__ == "__main__": 9 | df = pd.read_csv("titanic.csv") 10 | 11 | def get_chain(): 12 | llm = ChatOpenAI(temperature=0) 13 | agent_executor_kwargs = { 14 | "handle_parsing_errors": True, 15 | } 16 | agent = create_pandas_dataframe_agent( 17 | llm, 18 | df, 19 | agent_type=AgentType.OPENAI_FUNCTIONS, 20 | agent_executor_kwargs=agent_executor_kwargs, 21 | max_iterations=5, 22 | ) 23 | return agent 24 | 25 | client = Client() 26 | eval_config = RunEvalConfig( 27 | evaluators=["qa"], 28 | ) 29 | chain_results = run_on_dataset( 30 | client, 31 | dataset_name="Titanic CSV Data", 32 | llm_or_chain_factory=get_chain, 33 | evaluation=eval_config, 34 | ) 35 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_agent_gpt_4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents.agent_toolkits import create_pandas_dataframe_agent 3 | from langchain.agents.agent_types import AgentType 4 | from langchain.smith import RunEvalConfig, run_on_dataset 5 | from langchain_openai import ChatOpenAI 6 | from langsmith import Client 7 | 8 | if __name__ == "__main__": 9 | df = pd.read_csv("titanic.csv") 10 | 11 | def get_chain(): 12 | llm = ChatOpenAI(temperature=0, model="gpt-4") 13 | agent_executor_kwargs = { 14 | "handle_parsing_errors": True, 15 | } 16 | agent = create_pandas_dataframe_agent( 17 | llm, 18 | df, 19 | agent_type=AgentType.OPENAI_FUNCTIONS, 20 | agent_executor_kwargs=agent_executor_kwargs, 21 | max_iterations=5, 22 | ) 23 | return agent 24 | 25 | client = Client() 26 | eval_config = RunEvalConfig( 27 | evaluators=["qa"], 28 | ) 29 | chain_results = run_on_dataset( 30 | client, 31 | dataset_name="Titanic CSV Data", 32 | llm_or_chain_factory=get_chain, 33 | 
evaluation=eval_config, 34 | ) 35 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_agent_instruct.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents import AgentExecutor, ZeroShotAgent 3 | from langchain.agents.agent_toolkits.conversational_retrieval.tool import ( 4 | create_retriever_tool, 5 | ) 6 | from langchain.embeddings import OpenAIEmbeddings 7 | from langchain.llms import OpenAI 8 | from langchain.smith import RunEvalConfig, run_on_dataset 9 | from langchain.tools import PythonAstREPLTool 10 | from langchain.vectorstores import FAISS 11 | from langsmith import Client 12 | from pydantic import BaseModel, Field 13 | 14 | pd.set_option("display.max_rows", 20) 15 | pd.set_option("display.max_columns", 20) 16 | 17 | embedding_model = OpenAIEmbeddings() 18 | vectorstore = FAISS.load_local("titanic_data", embedding_model) 19 | retriever_tool = create_retriever_tool( 20 | vectorstore.as_retriever(), "person_name_search", "Search for a person by name" 21 | ) 22 | 23 | 24 | TEMPLATE = """You are working with a pandas dataframe in Python. The name of the dataframe is `df`. 25 | It is important to understand the attributes of the dataframe before working with it. This is the result of running `df.head().to_markdown()` 26 | 27 | 28 | {dhead} 29 | 30 | 31 | You are not meant to use only these rows to answer questions - they are meant as a way of telling you about the shape and schema of the dataframe. 32 | You also do not have use only the information here to answer questions - you can run intermediate queries to do exporatory data analysis to give you more information as needed. 33 | 34 | You have a tool called `person_name_search` through which you can lookup a person by name and find the records corresponding to people with similar name as the query. 35 | You should only really use this if your search term contains a persons name. Otherwise, try to solve it with code. 36 | 37 | For example: 38 | 39 | How old is Jane? 
40 | Use `person_name_search` since you can use the query `Jane` 41 | 42 | Who has id 320 43 | Use `python_repl` since even though the question is about a person, you don't know their name so you can't include it.""" 44 | 45 | 46 | class PythonInputs(BaseModel): 47 | query: str = Field(description="code snippet to run") 48 | 49 | 50 | if __name__ == "__main__": 51 | df = pd.read_csv("titanic.csv") 52 | template = TEMPLATE.format(dhead=df.head().to_markdown()) 53 | 54 | def get_chain(): 55 | repl = PythonAstREPLTool( 56 | locals={"df": df}, 57 | name="python_repl", 58 | description="Runs code and returns the output of the final line", 59 | args_schema=PythonInputs, 60 | ) 61 | tools = [repl, retriever_tool] 62 | agent = ZeroShotAgent.from_llm_and_tools( 63 | llm=OpenAI(temperature=0, model="gpt-3.5-turbo-instruct"), 64 | tools=tools, 65 | prefix=template, 66 | ) 67 | agent_executor = AgentExecutor( 68 | agent=agent, tools=tools, max_iterations=5, early_stopping_method="generate" 69 | ) 70 | return agent_executor 71 | 72 | client = Client() 73 | eval_config = RunEvalConfig( 74 | evaluators=["qa"], 75 | ) 76 | chain_results = run_on_dataset( 77 | client, 78 | dataset_name="Titanic CSV Data", 79 | llm_or_chain_factory=get_chain, 80 | evaluation=eval_config, 81 | ) 82 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_ai.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.prompts import ChatPromptTemplate 3 | from langchain.schema.output_parser import StrOutputParser 4 | from langchain.smith import RunEvalConfig, run_on_dataset 5 | from langchain_openai import ChatOpenAI 6 | from langsmith import Client 7 | from pandasai import PandasAI 8 | 9 | if __name__ == "__main__": 10 | df = pd.read_csv("titanic.csv") 11 | 12 | pandas_ai = PandasAI(ChatOpenAI(temperature=0, model="gpt-4"), enable_cache=False) 13 | prompt = ChatPromptTemplate.from_messages( 14 | [ 15 | ( 16 | "system", 17 | "Answer the users question about some data. 
A data scientist will run some code and the results will be returned to you to use in your answer", 18 | ), 19 | ("human", "Question: {input}"), 20 | ("human", "Data Scientist Result: {result}"), 21 | ] 22 | ) 23 | 24 | def get_chain(): 25 | chain = ( 26 | { 27 | "input": lambda x: x["input_question"], 28 | "result": lambda x: pandas_ai(df, prompt=x["input_question"]), 29 | } 30 | | prompt 31 | | ChatOpenAI(temperature=0, model="gpt-4") 32 | | StrOutputParser() 33 | ) 34 | return chain 35 | 36 | client = Client() 37 | eval_config = RunEvalConfig( 38 | evaluators=["qa"], 39 | ) 40 | chain_results = run_on_dataset( 41 | client, 42 | dataset_name="Titanic CSV Data", 43 | llm_or_chain_factory=get_chain, 44 | evaluation=eval_config, 45 | ) 46 | -------------------------------------------------------------------------------- /archived/csv-qa/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain 2 | openai 3 | streamlit 4 | tiktoken 5 | pandas 6 | tabulate 7 | -------------------------------------------------------------------------------- /archived/csv-qa/result_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/result_35.png -------------------------------------------------------------------------------- /archived/csv-qa/results_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/results_4.png -------------------------------------------------------------------------------- /archived/csv-qa/results_custom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/results_custom.png -------------------------------------------------------------------------------- /archived/csv-qa/results_pandasai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/results_pandasai.png -------------------------------------------------------------------------------- /archived/csv-qa/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | from langchain.agents.agent_toolkits import create_pandas_dataframe_agent 4 | from langchain.agents.agent_types import AgentType 5 | from langchain_openai import ChatOpenAI 6 | 7 | df = pd.read_csv("titanic.csv") 8 | 9 | 10 | llm = ChatOpenAI(temperature=0) 11 | agent = create_pandas_dataframe_agent(llm, df, agent_type=AgentType.OPENAI_FUNCTIONS) 12 | 13 | 14 | from langsmith import Client 15 | 16 | client = Client() 17 | 18 | 19 | def send_feedback(run_id, score): 20 | client.create_feedback(run_id, "user_score", score=score) 21 | 22 | 23 | st.set_page_config(page_title="🦜🔗 Ask the CSV App") 24 | st.title("🦜🔗 Ask the CSV App") 25 | st.info( 26 | "Most 'question answering' applications run over unstructured text data. But a lot of the data in the world is tabular data! 
This is an attempt to create an application using [LangChain](https://github.com/langchain-ai/langchain) to let you ask questions of data in tabular format. For this demo application, we will use the Titanic Dataset. Please explore it [here](https://github.com/datasciencedojo/datasets/blob/master/titanic.csv) to get a sense for what questions you can ask. Please leave feedback on well the question is answered, and we will use that improve the application!" 27 | ) 28 | 29 | query_text = st.text_input("Enter your question:", placeholder="Who was in cabin C128?") 30 | # Form input and query 31 | result = None 32 | with st.form("myform", clear_on_submit=True): 33 | submitted = st.form_submit_button("Submit") 34 | if submitted: 35 | with st.spinner("Calculating..."): 36 | response = agent({"input": query_text}, include_run_info=True) 37 | result = response["output"] 38 | run_id = response["__run"].run_id 39 | if result is not None: 40 | st.info(result) 41 | col_blank, col_text, col1, col2 = st.columns([10, 2, 1, 1]) 42 | with col_text: 43 | st.text("Feedback:") 44 | with col1: 45 | st.button("👍", on_click=send_feedback, args=(run_id, 1)) 46 | with col2: 47 | st.button("👎", on_click=send_feedback, args=(run_id, 0)) 48 | -------------------------------------------------------------------------------- /archived/csv-qa/titanic_data/index.faiss: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/titanic_data/index.faiss -------------------------------------------------------------------------------- /archived/csv-qa/titanic_data/index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/titanic_data/index.pkl -------------------------------------------------------------------------------- /archived/csv-qa/upload_data.py: -------------------------------------------------------------------------------- 1 | from langsmith import Client 2 | 3 | if __name__ == "__main__": 4 | client = Client() 5 | dataset = client.upload_csv( 6 | csv_file="data.csv", 7 | input_keys=["input_question"], 8 | output_keys=["output_text"], 9 | name="Titanic CSV Data", 10 | description="QA over titanic data", 11 | data_type="kv", 12 | ) 13 | -------------------------------------------------------------------------------- /archived/extraction/oppenheimer_short.txt: -------------------------------------------------------------------------------- 1 | 'Julius Robert Oppenheimer, often known as Robert or "Oppie", is heralded as the father of the atomic bomb. Emerging from a non-practicing Jewish family in New York, he made several breakthroughs, such as the early black hole theory, before the monumental Manhattan Project. His wife, Katherine “Kitty” Oppenheimer, was a German-born woman with a complex past, including connections to the Communist Party. Oppenheimer\'s journey was beset by political adversaries, notably Lewis Strauss, chairman of the U.S. Atomic Energy Commission, and William Borden, an executive director with hawkish nuclear ambitions. These tensions culminated in the famous 1954 security hearing. Influential figures like lieutenant general Leslie Groves, who had also overseen the Pentagon\'s creation, stood by Oppenheimer\'s side, having earlier chosen him for the Manhattan Project and the Los Alamos location. 
Intimate relationships, like that with Jean Tatlock, a Communist and the possible muse behind the Trinity test\'s name, and colleagues like Frank, Oppenheimer\'s physicist brother, intertwined with his professional life. Scientists such as Ernest Lawrence, Edward Teller, David Hill, Richard Feynman, and Hans Bethe were some of Oppenheimer\'s contemporaries, each contributing to and contesting the atomic age\'s directions. Boris Pash\'s investigations, and the perspectives of figures like Leo Szilard, Niels Bohr, Harry Truman, and others, framed the broader sociopolitical context. Meanwhile, individuals like Robert Serber, Enrico Fermi, Albert Einstein, and Isidor Isaac Rabi, among many others, each played their parts in this narrative, from naming the atomic bombs to pivotal scientific contributions and advisory roles. All these figures, together with the backdrop of World War II, McCarthyism, and the dawn of the nuclear age, presented a complex mosaic of ambitions, loyalties, betrayals, and ideologies. -------------------------------------------------------------------------------- /archived/extraction/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain 2 | openai 3 | streamlit 4 | -------------------------------------------------------------------------------- /archived/extraction/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from langchain.chains import create_extraction_chain 3 | from langchain_openai import ChatOpenAI 4 | from langsmith import Client 5 | 6 | st.set_page_config(page_title="🦜🔗 Text-to-graph extraction") 7 | client = Client() 8 | 9 | 10 | def send_feedback(run_id, score): 11 | client.create_feedback(run_id, "user_score", score=score) 12 | 13 | 14 | st.title("🦜🔗 Text-to-graph playground") 15 | st.info( 16 | "This playground explores the use of [OpenAI functions](https://openai.com/blog/function-calling-and-other-api-updates) and [LangChain](https://github.com/langchain-ai/langchain) to build knowledge graphs from user-input text. It breaks down the user input text into knowledge graph triples of subject (primary entities or concepts in a sentence), predicate (actions or relationships that connect subjects to objects), and object (entities or concepts that interact with or are acted upon by the subjects)." 17 | ) 18 | 19 | # Input text (optional default) 20 | oppenheimer_text = """'Julius Robert Oppenheimer, often known as Robert or "Oppie", is heralded as the father of the atomic bomb. Emerging from a non-practicing Jewish family in New York, he made several breakthroughs, such as the early black hole theory, before the monumental Manhattan Project. His wife, Katherine “Kitty” Oppenheimer, was a German-born woman with a complex past, including connections to the Communist Party. Oppenheimer\'s journey was beset by political adversaries, notably Lewis Strauss, chairman of the U.S. Atomic Energy Commission, and William Borden, an executive director with hawkish nuclear ambitions. These tensions culminated in the famous 1954 security hearing. Influential figures like lieutenant general Leslie Groves, who had also overseen the Pentagon\'s creation, stood by Oppenheimer\'s side, having earlier chosen him for the Manhattan Project and the Los Alamos location. 
Intimate relationships, like that with Jean Tatlock, a Communist and the possible muse behind the Trinity test\'s name, and colleagues like Frank, Oppenheimer\'s physicist brother, intertwined with his professional life. Scientists such as Ernest Lawrence, Edward Teller, David Hill, Richard Feynman, and Hans Bethe were some of Oppenheimer\'s contemporaries, each contributing to and contesting the atomic age\'s directions. Boris Pash\'s investigations, and the perspectives of figures like Leo Szilard, Niels Bohr, Harry Truman, and others, framed the broader sociopolitical context. Meanwhile, individuals like Robert Serber, Enrico Fermi, Albert Einstein, and Isidor Isaac Rabi, among many others, each played their parts in this narrative, from naming the atomic bombs to pivotal scientific contributions and advisory roles. All these figures, together with the backdrop of World War II, McCarthyism, and the dawn of the nuclear age, presented a complex mosaic of ambitions, loyalties, betrayals, and ideologies.oppenheimer_short.txt""" 21 | 22 | # Knowledge triplet schema 23 | default_schema = { 24 | "properties": { 25 | "subject": {"type": "string"}, 26 | "predicate": {"type": "string"}, 27 | "object": {"type": "string"}, 28 | }, 29 | "required": ["subject", "predicate", "object"], 30 | } 31 | 32 | # Create a text_area, set the default value to oppenheimer_text 33 | MAX_CHARS = 2000 # Maximum number of characters 34 | user_input_text = st.text_area("Enter your text (<2000 characters):", height=200) 35 | if len(user_input_text) > MAX_CHARS: 36 | st.warning(f"Text is too long. Processing only the first {MAX_CHARS} characters") 37 | user_input_text = user_input_text[:MAX_CHARS] 38 | 39 | 40 | # Output formatting of triples 41 | def json_to_markdown_table(json_list): 42 | if not json_list: 43 | return "No data available." 
44 | 45 | # Extract headers 46 | headers = json_list[0].keys() 47 | markdown_table = " | ".join(headers) + "\n" 48 | markdown_table += " | ".join(["---"] * len(headers)) + "\n" 49 | 50 | # Extract rows 51 | for item in json_list: 52 | row = " | ".join([str(item[header]) for header in headers]) 53 | markdown_table += row + "\n" 54 | 55 | return markdown_table 56 | 57 | 58 | # Form input and query 59 | markdown_output = None 60 | with st.form("myform", clear_on_submit=True): 61 | submitted = st.form_submit_button("Submit") 62 | if submitted: 63 | with st.spinner("Calculating..."): 64 | llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo") 65 | chain = create_extraction_chain(default_schema, llm) 66 | extraction_output = chain(user_input_text, include_run_info=True) 67 | markdown_output = json_to_markdown_table(extraction_output["text"]) 68 | run_id = extraction_output["__run"].run_id 69 | 70 | # Feedback 71 | if markdown_output is not None: 72 | st.markdown(markdown_output) 73 | col_blank, col_text, col1, col2 = st.columns([10, 2, 1, 1]) 74 | with col_text: 75 | st.text("Feedback:") 76 | with col1: 77 | st.button("👍", on_click=send_feedback, args=(run_id, 1)) 78 | with col2: 79 | st.button("👎", on_click=send_feedback, args=(run_id, 0)) 80 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking on LangChain Docs 2 | 3 | This directory contains code to benchmark your cognitive architecture on the public [LangChain Q&A docs evaluation benchmark](https://smith.langchain.com/public/e1bfd348-494a-4df5-899a-7c6c09233cc4/d). 4 | 5 | To run one of the existing configurations, activate your poetry environment, configure your LangSmith API key, and run the experiments. 6 | 7 | **Note:** this will benchmark chains on a _copy_ of the dataset and will not update the public leaderboard. 8 | 9 | ## Running the published experiments 10 | 11 | The following steps will let you run pre-configured experiments: 12 | 13 | ### 1. Install requirements 14 | 15 | ```bash 16 | pip install poetry 17 | poetry shell 18 | poetry install 19 | ``` 20 | 21 | ### 2. Configure API keys 22 | 23 | Create a [LangSmith account](https://smith.langchain.com/) and set your API key: 24 | 25 | ```bash 26 | export LANGCHAIN_API_KEY=ls_your-api-key 27 | ``` 28 | 29 | The various cognitive architectures already implemented use Anthropic, [Fireworks.AI](https://www.fireworks.ai/), and OpenAI. Set the required API keys: 30 | 31 | ``` 32 | export OPENAI_API_KEY=your-api-key 33 | export ANTHROPIC_API_KEY=your-api-key 34 | export FIREWORKS_API_KEY=your-api-key 35 | ``` 36 | 37 | ### 3. Run Experiments 38 | 39 | To run all experiments, run: 40 | 41 | ```bash 42 | python run_experiments.py 43 | ``` 44 | 45 | If you want to run only certain experiments in the `run_experiments.py` file, use `--include` or `--exclude`. 46 | 47 | Example: 48 | 49 | ```bash 50 | python run_experiments.py --include mistral-7b-instruct-4k llama-v2-34b-code-instruct-w8a16 51 | ``` 52 | 53 | ## Evaluating your custom cognitive architecture 54 | 55 | You can also evaluate your own custom cognitive architecture. To do so: 56 | 57 | 1. Create a python file defining your architecture: 58 | 59 | ```python 60 | # example_custom_chain.py 61 | 62 | ... 63 | def load_runnable(config: dict) -> "Runnable": 64 | # Load based on the config provided 65 | return my_chain 66 | ``` 67 | 68 | 2. 
Call `run_experiments.py` with a custom `--config my_config.json` 69 | 70 | ```js 71 | { 72 | // This specifies the path to your custom entrypoint followed by the loader function 73 | "arch": "path/to/example_custom_chain.py::load_runnable", 74 | "model_config": { 75 | // This is passed to load_runnable() in example_custom_chain.py() 76 | "chat_cls": "ChatOpenAI", 77 | "model": "gpt-4" 78 | }, 79 | "project_name": "example-custom-code" // This is the resulting test project name 80 | } 81 | ``` 82 | 83 | We have provided an example in [example_custom_chain.py](./packages/example/custom_example/example_custom_chain.py), which can be run by pointing `run_experiments` to the [example_custom_config.json](./example_custom_config.json) config file: 84 | 85 | ```bash 86 | python run_experiments.py --config ./example_custom_config.json 87 | ``` 88 | 89 | Whenever you provide 1 or more `--config` files, the `--include` and `--exclude` arguments are ignored. 90 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/app/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/app/server.py: -------------------------------------------------------------------------------- 1 | from chat_langchain.chain import chain 2 | from fastapi import FastAPI 3 | from langserve import add_routes 4 | from openai_functions_agent import agent_executor as openai_functions_agent_chain 5 | 6 | app = FastAPI() 7 | 8 | # Edit this to add the chain you want to add 9 | add_routes( 10 | app, 11 | chain, 12 | path="/chat", 13 | # include_callback_events=True, # TODO: Include when fixed 14 | ) 15 | 16 | add_routes(app, openai_functions_agent_chain, path="/openai-functions-agent") 17 | 18 | 19 | def run_server(port: int = 1983): 20 | import uvicorn 21 | 22 | uvicorn.run(app, host="0.0.0.0", port=port) 23 | 24 | 25 | if __name__ == "__main__": 26 | run_server() 27 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/example_custom_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "arch": "packages/example/custom_example/example_custom_chain.py::create_runnable", 3 | "model_config": { 4 | "chat_cls": "ChatOpenAI", 5 | "model": "gpt-4" 6 | }, 7 | "project_name": "example-custom-code" 8 | } 9 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/README.md -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | # anthropic-iterative-search 3 | 4 | This template will create a virtual research assistant with the ability to search Wikipedia to find answers to your questions. 
5 | 6 | It is heavily inspired by [this notebook](https://github.com/anthropics/anthropic-cookbook/blob/main/long_context/wikipedia-search-cookbook.ipynb). 7 | 8 | ## Environment Setup 9 | 10 | Set the `ANTHROPIC_API_KEY` environment variable to access the Anthropic models. 11 | 12 | ## Usage 13 | 14 | To use this package, you should first have the LangChain CLI installed: 15 | 16 | ```shell 17 | pip install -U "langchain-cli[serve]" 18 | ``` 19 | 20 | To create a new LangChain project and install this as the only package, you can do: 21 | 22 | ```shell 23 | langchain app new my-app --package anthropic-iterative-search 24 | ``` 25 | 26 | If you want to add this to an existing project, you can just run: 27 | 28 | ```shell 29 | langchain app add anthropic-iterative-search 30 | ``` 31 | 32 | And add the following code to your `server.py` file: 33 | ```python 34 | from anthropic_iterative_search import chain as anthropic_iterative_search_chain 35 | 36 | add_routes(app, anthropic_iterative_search_chain, path="/anthropic-iterative-search") 37 | ``` 38 | 39 | (Optional) Let's now configure LangSmith. 40 | LangSmith will help us trace, monitor and debug LangChain applications. 41 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 42 | If you don't have access, you can skip this section 43 | 44 | 45 | ```shell 46 | export LANGCHAIN_TRACING_V2=true 47 | export LANGCHAIN_API_KEY= 48 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 49 | ``` 50 | 51 | If you are inside this directory, then you can spin up a LangServe instance directly by: 52 | 53 | ```shell 54 | langchain serve 55 | ``` 56 | 57 | This will start the FastAPI app with a server is running locally at 58 | [http://localhost:8000](http://localhost:8000) 59 | 60 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 61 | We can access the playground at [http://127.0.0.1:8000/anthropic-iterative-search/playground](http://127.0.0.1:8000/anthropic-iterative-search/playground) 62 | 63 | We can access the template from code with: 64 | 65 | ```python 66 | from langserve.client import RemoteRunnable 67 | 68 | runnable = RemoteRunnable("http://localhost:8000/anthropic-iterative-search") 69 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain.schema.runnable import ConfigurableField 2 | 3 | from .chain import chain 4 | from .retriever_agent import executor 5 | 6 | final_chain = chain.configurable_alternatives( 7 | ConfigurableField(id="chain"), 8 | default_key="response", 9 | # This adds a new option, with name `openai` that is equal to `ChatOpenAI()` 10 | retrieve=executor, 11 | ) 12 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/agent_scratchpad.py: -------------------------------------------------------------------------------- 1 | def _format_docs(docs): 2 | result = "\n".join( 3 | [ 4 | f'\n\n{r}\n\n' 5 | for i, r in enumerate(docs) 6 | ] 7 | ) 8 | return result 9 | 10 | 11 | def format_agent_scratchpad(intermediate_steps): 12 | thoughts = "" 13 | for action, observation in intermediate_steps: 14 | thoughts += action.log 15 | thoughts += "" + _format_docs(observation) 16 | 
return thoughts 17 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/chain.py: -------------------------------------------------------------------------------- 1 | from langchain.chat_models import ChatAnthropic 2 | from langchain.prompts import ChatPromptTemplate 3 | from langchain.schema.output_parser import StrOutputParser 4 | from langchain.schema.runnable import RunnableLambda 5 | from pydantic import BaseModel 6 | 7 | from .prompts import answer_prompt 8 | from .retriever_agent import executor 9 | 10 | prompt = ChatPromptTemplate.from_template(answer_prompt) 11 | 12 | model = ChatAnthropic(model="claude-2", temperature=0, max_tokens_to_sample=1000) 13 | 14 | chain = ( 15 | RunnableLambda(lambda x: {"query": x["question"]}) 16 | | {"query": lambda x: x["query"], "information": executor | (lambda x: x["output"])} 17 | | prompt 18 | | model 19 | | StrOutputParser() 20 | ) 21 | 22 | # Add typing for the inputs to be used in the playground 23 | 24 | 25 | class Inputs(BaseModel): 26 | question: str 27 | 28 | 29 | chain = chain.with_types(input_type=Inputs) 30 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/output_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from langchain.schema.agent import AgentAction, AgentFinish 4 | 5 | from .agent_scratchpad import _format_docs 6 | 7 | 8 | def extract_between_tags(tag: str, string: str, strip: bool = True) -> str: 9 | ext_list = re.findall(f"<{tag}\s?>(.+?)", string, re.DOTALL) 10 | if strip: 11 | ext_list = [e.strip() for e in ext_list] 12 | if ext_list: 13 | if len(ext_list) != 1: 14 | raise ValueError 15 | # Only return the first one 16 | return ext_list[0] 17 | 18 | 19 | def parse_output(outputs): 20 | partial_completion = outputs["partial_completion"] 21 | steps = outputs["intermediate_steps"] 22 | search_query = extract_between_tags( 23 | "search_query", partial_completion + "" 24 | ) 25 | if search_query is None: 26 | docs = [] 27 | str_output = "" 28 | for action, observation in steps: 29 | docs.extend(observation) 30 | str_output += action.log 31 | str_output += "" + _format_docs(observation) 32 | str_output += partial_completion 33 | return AgentFinish({"docs": docs, "output": str_output}, log=partial_completion) 34 | else: 35 | return AgentAction( 36 | tool="search", tool_input=search_query, log=partial_completion 37 | ) 38 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/prompts.py: -------------------------------------------------------------------------------- 1 | retrieval_prompt = """{retriever_description} Before beginning to research the user's question, first think for a moment inside tags about what information is necessary for a well-informed answer. If the user's question is complex, you may need to decompose the query into multiple subqueries and execute them individually. Sometimes the search engine will return empty search results, or the search results may not contain the information you need. In such cases, feel free to try again with a different query. 
2 | 3 | After each call to the Search Engine Tool, reflect briefly inside tags about whether you now have enough information to answer, or whether more information is needed. If you have all the relevant information, write it in tags, WITHOUT actually answering the question. Otherwise, issue a new search. 4 | 5 | Here is the user's question: {query} Remind yourself to make short queries in your scratchpad as you plan out your strategy.""" # noqa: E501 6 | 7 | answer_prompt = "Here is a user query: {query}. Here is some relevant information: {information}. Please answer the question using the relevant information." # noqa: E501 8 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/retriever.py: -------------------------------------------------------------------------------- 1 | from langchain.tools import tool 2 | from langchain_docs_retriever.retriever import get_retriever 3 | 4 | # This is used to tell the model how to best use the retriever. 5 | 6 | retriever_description = """You will be asked a question by a human user. You have access to the following tool to help answer the question. Search Engine Tool * The search engine will exclusively search over the LangChain documentation for pages similar to your query. It returns for each page its title and full page content. Use this tool if you want to get up-to-date and comprehensive information on a topic to help answer queries. Queries should be as atomic as possible -- they only need to address one part of the user's question. For example, if the user's query is "what is the color of a basketball?", your search query should be "basketball". Here's another example: if the user's question is "Who created the first neural network?", your first query should be "neural network". As you can see, these queries are quite short. Think keywords, not phrases. * At any time, you can make a call to the search engine using the following syntax: query_word. 
* You'll then get results back in tags.""" # noqa: E501 7 | 8 | retriever = get_retriever() 9 | 10 | # This should be the same as the function name below 11 | RETRIEVER_TOOL_NAME = "search" 12 | 13 | 14 | @tool 15 | def search(query, callbacks=None): 16 | """Search the LangChain docs with the retriever.""" 17 | return retriever.get_relevant_documents(query, callbacks=callbacks) 18 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/retriever_agent.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import AgentExecutor 2 | from langchain.chat_models import ChatAnthropic 3 | from langchain.prompts import ChatPromptTemplate 4 | from langchain.schema.output_parser import StrOutputParser 5 | from langchain.schema.runnable import RunnableMap, RunnablePassthrough 6 | 7 | from .agent_scratchpad import format_agent_scratchpad 8 | from .output_parser import parse_output 9 | from .prompts import retrieval_prompt 10 | from .retriever import retriever_description, search 11 | 12 | prompt = ChatPromptTemplate.from_messages( 13 | [ 14 | ("user", retrieval_prompt), 15 | ("ai", "{agent_scratchpad}"), 16 | ] 17 | ) 18 | prompt = prompt.partial(retriever_description=retriever_description) 19 | 20 | model = ChatAnthropic(model="claude-2", temperature=0, max_tokens_to_sample=1000) 21 | 22 | chain = ( 23 | RunnablePassthrough.assign( 24 | agent_scratchpad=lambda x: format_agent_scratchpad(x["intermediate_steps"]) 25 | ) 26 | | prompt 27 | | model.bind(stop_sequences=[""]) 28 | | StrOutputParser() 29 | ) 30 | 31 | agent_chain = ( 32 | RunnableMap( 33 | { 34 | "partial_completion": chain, 35 | "intermediate_steps": lambda x: x["intermediate_steps"], 36 | } 37 | ) 38 | | parse_output 39 | ) 40 | 41 | executor = AgentExecutor(agent=agent_chain, tools=[search]) 42 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/main.py: -------------------------------------------------------------------------------- 1 | from anthropic_iterative_search import final_chain 2 | 3 | if __name__ == "__main__": 4 | query = ( 5 | "Which movie came out first: Oppenheimer, or " 6 | "Are You There God It's Me Margaret?" 
7 | ) 8 | print( 9 | final_chain.with_config(configurable={"chain": "retrieve"}).invoke( 10 | {"query": query} 11 | ) 12 | ) 13 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "anthropic-iterative-search" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.8.1,<4.0" 10 | langchain = ">=0.0.331,<0.1.0" 11 | anthropic = "^0.5.0" 12 | wikipedia = "^1.4.0" 13 | 14 | [tool.langserve] 15 | export_module = "anthropic_iterative_search" 16 | export_attr = "final_chain" 17 | 18 | [build-system] 19 | requires = [ 20 | "poetry-core", 21 | ] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 LangChain, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/README.md: -------------------------------------------------------------------------------- 1 | # chat-langchain 2 | 3 | TODO: What does this package do 4 | 5 | ## Environment Setup 6 | 7 | TODO: What environment variables need to be set (if any) 8 | 9 | ## Usage 10 | 11 | To use this package, you should first have the LangChain CLI installed: 12 | 13 | ```shell 14 | pip install -U "langchain-cli[serve]" 15 | ``` 16 | 17 | To create a new LangChain project and install this as the only package, you can do: 18 | 19 | ```shell 20 | langchain app new my-app --package chat-langchain 21 | ``` 22 | 23 | If you want to add this to an existing project, you can just run: 24 | 25 | ```shell 26 | langchain app add chat-langchain 27 | ``` 28 | 29 | And add the following code to your `server.py` file: 30 | ```python 31 | from chat_langchain import chain as chat_langchain_chain 32 | 33 | add_routes(app, chat_langchain_chain, path="/chat-langchain") 34 | ``` 35 | 36 | (Optional) Let's now configure LangSmith. 37 | LangSmith will help us trace, monitor and debug LangChain applications. 38 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 39 | If you don't have access, you can skip this section 40 | 41 | 42 | ```shell 43 | export LANGCHAIN_TRACING_V2=true 44 | export LANGCHAIN_API_KEY= 45 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 46 | ``` 47 | 48 | If you are inside this directory, then you can spin up a LangServe instance directly by: 49 | 50 | ```shell 51 | langchain serve 52 | ``` 53 | 54 | This will start the FastAPI app with a server is running locally at 55 | [http://localhost:8000](http://localhost:8000) 56 | 57 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 58 | We can access the playground at [http://127.0.0.1:8000/chat-langchain/playground](http://127.0.0.1:8000/chat-langchain/playground) 59 | 60 | We can access the template from code with: 61 | 62 | ```python 63 | from langserve.client import RemoteRunnable 64 | 65 | runnable = RemoteRunnable("http://localhost:8000/chat-langchain") 66 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/chat_langchain/__init__.py: -------------------------------------------------------------------------------- 1 | from chat_langchain.chain import chain 2 | 3 | __all__ = ["chain"] 4 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "chat-langchain" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | openai = ">1,<2" 10 | python = "^3.10" 11 | fastapi = "^0.104.1" 12 | pydantic = "1.10" 13 | langchain = ">=0.0.327,<0.1.0" 14 | uvicorn = "^0.23.2" 15 | beautifulsoup4 = "^4.12.2" 16 | tiktoken = "^0.4.0" 17 | weaviate-client = "^3.23.2" 18 | psycopg2 = "^2.9.7" 19 | lxml = "^4.9.3" 20 | langserve = {extras = ["server"], version = ">=0.0.21,<0.1.0"} 21 | anthropic = "^0.5.0" 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | langchain-cli = ">=0.0.4" 25 | fastapi = "^0.104.0" 26 | sse-starlette = "^1.6.5" 27 | 28 | 
[tool.langserve] 29 | export_module = "chat_langchain" 30 | export_attr = "chain" 31 | 32 | [build-system] 33 | requires = ["poetry-core"] 34 | build-backend = "poetry.core.masonry.api" 35 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/chat-langchain/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/example/custom_example/example_custom_chain.py: -------------------------------------------------------------------------------- 1 | from langchain.chat_models import ChatAnthropic, ChatOpenAI 2 | from langchain.prompts import ChatPromptTemplate 3 | from langchain.schema.output_parser import StrOutputParser 4 | from langchain_docs_retriever.retriever import get_retriever 5 | 6 | 7 | def create_runnable(config: dict): 8 | config_copy = config.copy() 9 | chat_cls_name = config_copy.pop("chat_cls", "ChatOpenAI") 10 | 11 | assert chat_cls_name in {"ChatOpenAI", "ChatAnthropic"} 12 | chat_cls = { 13 | "ChatOpenAI": ChatOpenAI, 14 | "ChatAnthropic": ChatAnthropic, 15 | }[chat_cls_name] 16 | model = chat_cls(**config_copy) 17 | retriever = get_retriever(config.get("retriever_config", {})) 18 | prompt = ChatPromptTemplate.from_messages( 19 | [ 20 | ("system", "Answer the Q using the following docs\n{docs}"), 21 | ("user", "Q: {question}"), 22 | ] 23 | ) 24 | return ( 25 | { 26 | "question": lambda x: x["question"], 27 | "docs": (lambda x: x["question"]) | retriever, 28 | } 29 | | prompt 30 | | model 31 | | StrOutputParser() 32 | ) 33 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/README.md: -------------------------------------------------------------------------------- 1 | # LangChain Docs Retriever 2 | 3 | 4 | A simple vector store retriever over the LangChain python docs. Indexed 5 | simply using [ingest_docs.py](./ingest_docs.py). 
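A minimal usage sketch (illustrative only — it assumes this package is installed, `OPENAI_API_KEY` is available for the default embeddings, and the query string is made up):

```python
# Illustrative sketch of querying the docs retriever; not part of the package.
# The first call downloads the prebuilt Chroma DB from GCS if it is missing.
from langchain_docs_retriever.retriever import get_retriever

retriever = get_retriever()  # defaults to search_kwargs={"k": 6}
docs = retriever.get_relevant_documents("How do I stream output from a runnable?")
for doc in docs:
    # Each result is a LangChain Document with page_content and metadata.
    print(doc.metadata.get("source"), doc.page_content[:80])
```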
6 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/download_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | import requests 5 | 6 | remote_url = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/chroma_db.zip" 7 | directory = os.path.dirname(os.path.realpath(__file__)) 8 | db_directory = os.path.join(directory, "db") 9 | 10 | 11 | def is_folder_populated(folder): 12 | if os.path.exists(folder): 13 | return any(os.scandir(folder)) 14 | return False 15 | 16 | 17 | def download_folder_from_gcs(): 18 | r = requests.get(remote_url, allow_redirects=True) 19 | open("chroma_db.zip", "wb").write(r.content) 20 | 21 | with zipfile.ZipFile("chroma_db.zip", "r") as zip_ref: 22 | zip_ref.extractall(directory) 23 | 24 | os.remove("chroma_db.zip") 25 | 26 | 27 | def fetch_langchain_docs_db(): 28 | if not is_folder_populated(db_directory): 29 | print(f"Folder {db_directory} is not populated. Downloading from GCS...") 30 | download_folder_from_gcs() 31 | 32 | 33 | if __name__ == "__main__": 34 | fetch_langchain_docs_db() 35 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/retriever.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | from langchain.embeddings import OpenAIEmbeddings 5 | 6 | # from langchain_docs_retriever.voyage import VoyageEmbeddings 7 | from langchain.embeddings.voyageai import VoyageEmbeddings 8 | from langchain.schema.embeddings import Embeddings 9 | from langchain.schema.retriever import BaseRetriever 10 | from langchain.vectorstores.chroma import Chroma 11 | 12 | from .download_db import fetch_langchain_docs_db 13 | 14 | WEAVIATE_DOCS_INDEX_NAME = "LangChain_agent_docs" 15 | _DIRECTORY = os.path.dirname(os.path.abspath(__file__)) 16 | CHROMA_COLLECTION_NAME = "langchain-docs" 17 | _DB_DIRECTORY = os.path.join(_DIRECTORY, "db") 18 | 19 | 20 | def get_embeddings_model() -> Embeddings: 21 | if os.environ.get("VOYAGE_AI_MODEL"): 22 | return VoyageEmbeddings(model=os.environ["VOYAGE_AI_MODEL"], max_retries=20) 23 | return OpenAIEmbeddings(chunk_size=200) 24 | 25 | 26 | def get_retriever(search_kwargs: Optional[dict] = None) -> BaseRetriever: 27 | embedding_model = get_embeddings_model() 28 | fetch_langchain_docs_db() 29 | vectorstore = Chroma( 30 | collection_name=CHROMA_COLLECTION_NAME, 31 | embedding_function=embedding_model, 32 | persist_directory=_DB_DIRECTORY, 33 | ) 34 | search_kwargs = search_kwargs or dict(k=6) 35 | return vectorstore.as_retriever(search_kwargs=search_kwargs) 36 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langchain-docs-retriever" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | fastapi = "^0.104.1" 11 | pydantic = "1.10" 12 | langchain = ">=0.0.331,<0.1.0" 13 | uvicorn = "^0.23.2" 14 | openai = ">1,<2" 15 | psycopg2 = "^2.9.7" 16 | lxml = "^4.9.3" 17 | langserve = {extras = ["server"], version = ">=0.0.23,<0.1.0"} 18 | chromadb = "^0.4.15" 19 | 20 | [tool.poetry.group.dev.dependencies] 21 | langchain-cli = ">=0.0.4" 22 | fastapi = "^0.104.0" 23 | sse-starlette = "^1.6.5" 24 | 25 | [tool.langserve] 26 | export_module = "chat_langchain" 27 | export_attr = "chain" 28 | 29 | [build-system] 30 | requires = ["poetry-core"] 31 | build-backend = "poetry.core.masonry.api" 32 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 LangChain, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/README.md: -------------------------------------------------------------------------------- 1 | # oai-assistant 2 | 3 | TODO: What does this package do 4 | 5 | ## Environment Setup 6 | 7 | TODO: What environment variables need to be set (if any) 8 | 9 | ## Usage 10 | 11 | To use this package, you should first have the LangChain CLI installed: 12 | 13 | ```shell 14 | pip install -U "langchain-cli[serve]" 15 | ``` 16 | 17 | To create a new LangChain project and install this as the only package, you can do: 18 | 19 | ```shell 20 | langchain app new my-app --package oai-assistant 21 | ``` 22 | 23 | If you want to add this to an existing project, you can just run: 24 | 25 | ```shell 26 | langchain app add oai-assistant 27 | ``` 28 | 29 | And add the following code to your `server.py` file: 30 | ```python 31 | from oai_assistant import chain as oai_assistant_chain 32 | 33 | add_routes(app, oai_assistant_chain, path="/oai-assistant") 34 | ``` 35 | 36 | (Optional) Let's now configure LangSmith. 37 | LangSmith will help us trace, monitor and debug LangChain applications. 
38 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 39 | If you don't have access, you can skip this section 40 | 41 | 42 | ```shell 43 | export LANGCHAIN_TRACING_V2=true 44 | export LANGCHAIN_API_KEY= 45 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 46 | ``` 47 | 48 | If you are inside this directory, then you can spin up a LangServe instance directly by: 49 | 50 | ```shell 51 | langchain serve 52 | ``` 53 | 54 | This will start the FastAPI app with a server is running locally at 55 | [http://localhost:8000](http://localhost:8000) 56 | 57 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 58 | We can access the playground at [http://127.0.0.1:8000/oai-assistant/playground](http://127.0.0.1:8000/oai-assistant/playground) 59 | 60 | We can access the template from code with: 61 | 62 | ```python 63 | from langserve.client import RemoteRunnable 64 | 65 | runnable = RemoteRunnable("http://localhost:8000/oai-assistant") 66 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/oai_assistant/__init__.py: -------------------------------------------------------------------------------- 1 | from oai_assistant.chain import agent_executor 2 | 3 | __all__ = ["agent_executor"] 4 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/oai_assistant/chain.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from langchain.agents import AgentExecutor 4 | from langchain.tools import tool 5 | from langchain_docs_retriever.retriever import get_retriever 6 | from langchain_experimental.openai_assistant import OpenAIAssistantRunnable 7 | 8 | # This is used to tell the model how to best use the retriever. 
9 | 10 | 11 | _RETRIEVER = get_retriever() 12 | 13 | 14 | @tool 15 | def search(query, callbacks=None) -> str: 16 | """Search the LangChain docs with the retriever.""" 17 | docs = _RETRIEVER.get_relevant_documents(query, callbacks=callbacks) 18 | return json.dumps([doc.dict() for doc in docs]) 19 | 20 | 21 | tools = [search] 22 | 23 | agent = OpenAIAssistantRunnable.create_assistant( 24 | name="langchain docs assistant", 25 | instructions="You are a helpful assistant tasked with answering technical questions about LangChain.", 26 | tools=tools, 27 | model="gpt-4-1106-preview", 28 | as_agent=True, 29 | ) 30 | 31 | 32 | agent_executor = ( 33 | (lambda x: {"content": x["question"]}) 34 | | AgentExecutor(agent=agent, tools=tools) 35 | | (lambda x: x["output"]) 36 | ) 37 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "oai-assistant" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.8.1,<4.0" 10 | langchain = ">=0.0.332,<0.1.0" 11 | openai = ">1,<2" 12 | langchain-experimental = "^0.0.38" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | langchain-cli = ">=0.0.4" 16 | fastapi = "^0.104.0" 17 | sse-starlette = "^1.6.5" 18 | 19 | [tool.langserve] 20 | export_module = "oai_assistant" 21 | export_attr = "agent_executor" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/oai-assistant/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 LangChain, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/README.md: -------------------------------------------------------------------------------- 1 | 2 | # openai-functions-agent 3 | 4 | This template creates an agent that uses OpenAI function calling to communicate its decisions on what actions to take. 5 | 6 | This example creates an agent that can optionally look up information on the internet using Tavily's search engine. 7 | 8 | ## Environment Setup 9 | 10 | The following environment variables need to be set: 11 | 12 | Set the `OPENAI_API_KEY` environment variable to access the OpenAI models. 13 | 14 | Set the `TAVILY_API_KEY` environment variable to access Tavily. 15 | 16 | ## Usage 17 | 18 | To use this package, you should first have the LangChain CLI installed: 19 | 20 | ```shell 21 | pip install -U "langchain-cli[serve]" 22 | ``` 23 | 24 | To create a new LangChain project and install this as the only package, you can do: 25 | 26 | ```shell 27 | langchain app new my-app --package openai-functions-agent 28 | ``` 29 | 30 | If you want to add this to an existing project, you can just run: 31 | 32 | ```shell 33 | langchain app add openai-functions-agent 34 | ``` 35 | 36 | And add the following code to your `server.py` file: 37 | ```python 38 | from openai_functions_agent import chain as openai_functions_agent_chain 39 | 40 | add_routes(app, openai_functions_agent_chain, path="/openai-functions-agent") 41 | ``` 42 | 43 | (Optional) Let's now configure LangSmith. 44 | LangSmith will help us trace, monitor and debug LangChain applications. 45 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 46 | If you don't have access, you can skip this section 47 | 48 | ```shell 49 | export LANGCHAIN_TRACING_V2=true 50 | export LANGCHAIN_API_KEY= 51 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 52 | ``` 53 | 54 | If you are inside this directory, then you can spin up a LangServe instance directly by: 55 | 56 | ```shell 57 | langchain serve 58 | ``` 59 | 60 | This will start the FastAPI app with a server is running locally at 61 | [http://localhost:8000](http://localhost:8000) 62 | 63 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 64 | We can access the playground at [http://127.0.0.1:8000/openai-functions-agent/playground](http://127.0.0.1:8000/openai-functions-agent/playground) 65 | 66 | We can access the template from code with: 67 | 68 | ```python 69 | from langserve.client import RemoteRunnable 70 | 71 | runnable = RemoteRunnable("http://localhost:8000/openai-functions-agent") 72 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/main.py: -------------------------------------------------------------------------------- 1 | from openai_functions_agent.agent import agent_executor 2 | 3 | if __name__ == "__main__": 4 | question = "who won the womens world cup in 2023?" 
5 | print(agent_executor.invoke({"input": question, "chat_history": []})) 6 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from openai_functions_agent.agent import agent_executor 2 | 3 | __all__ = ["agent_executor"] 4 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/agent.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from langchain.agents import AgentExecutor 4 | from langchain.agents.format_scratchpad import format_to_openai_functions 5 | from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser 6 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 7 | from langchain.schema.messages import AIMessage, HumanMessage 8 | from langchain.tools import tool 9 | from langchain.tools.render import format_tool_to_openai_function 10 | from langchain_docs_retriever.retriever import get_retriever 11 | from langchain_openai import ChatOpenAI 12 | from pydantic import BaseModel, Field 13 | 14 | # This is used to tell the model how to best use the retriever. 15 | 16 | 17 | _RETRIEVER = get_retriever() 18 | 19 | 20 | @tool 21 | def search(query, callbacks=None): 22 | """Search the LangChain docs with the retriever.""" 23 | return _RETRIEVER.get_relevant_documents(query, callbacks=callbacks) 24 | 25 | 26 | tools = [search] 27 | 28 | llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0) 29 | assistant_system_message = """You are a helpful assistant tasked with answering technical questions about LangChain. \ 30 | Use tools (only if necessary) to best answer the users questions. 
Do not make up information if you cannot find the answer using your tools.""" 31 | prompt = ChatPromptTemplate.from_messages( 32 | [ 33 | ("system", assistant_system_message), 34 | MessagesPlaceholder(variable_name="chat_history"), 35 | ("user", "{input}"), 36 | MessagesPlaceholder(variable_name="agent_scratchpad"), 37 | ] 38 | ) 39 | 40 | llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools]) 41 | 42 | 43 | def _format_chat_history(chat_history: List[Tuple[str, str]]): 44 | buffer = [] 45 | for human, ai in chat_history: 46 | buffer.append(HumanMessage(content=human)) 47 | buffer.append(AIMessage(content=ai)) 48 | return buffer 49 | 50 | 51 | agent = ( 52 | { 53 | "input": lambda x: x["input"], 54 | "chat_history": lambda x: _format_chat_history(x["chat_history"]), 55 | "agent_scratchpad": lambda x: format_to_openai_functions( 56 | x["intermediate_steps"] 57 | ), 58 | } 59 | | prompt 60 | | llm_with_tools 61 | | OpenAIFunctionsAgentOutputParser() 62 | ) 63 | 64 | 65 | class AgentInput(BaseModel): 66 | input: str 67 | chat_history: List[Tuple[str, str]] = Field(..., extra={"widget": {"type": "chat"}}) 68 | 69 | 70 | agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False).with_types( 71 | input_type=AgentInput 72 | ) 73 | 74 | 75 | class ChainInput(BaseModel): 76 | question: str 77 | 78 | 79 | def mapper(input: dict): 80 | return {"input": input["question"], "chat_history": []} 81 | 82 | 83 | agent_executor = (mapper | agent_executor | (lambda x: x["output"])).with_types( 84 | input_type=ChainInput 85 | ) 86 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "openai-functions-agent" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [ 6 | "Lance Martin ", 7 | ] 8 | readme = "README.md" 9 | 10 | [tool.poetry.dependencies] 11 | python = ">=3.8.1,<4.0" 12 | langchain = ">=0.0.327,<0.1.0" 13 | openai = ">=0.5.0" 14 | tavily-python = "^0.1.9" 15 | 16 | [tool.langserve] 17 | export_module = "openai_functions_agent" 18 | export_attr = "agent_executor" 19 | 20 | [build-system] 21 | requires = [ 22 | "poetry-core", 23 | ] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/openai-functions-agent/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | """Copy the public dataset to your own langsmith tenant.""" 2 | from typing import Optional 3 | 4 | from langsmith import Client 5 | 6 | DATASET_NAME = "LangChain Docs Q&A" 7 | PUBLIC_DATASET_TOKEN = "452ccafc-18e1-4314-885b-edd735f17b9d" 8 | 9 | 10 | def create_langchain_docs_dataset( 11 | dataset_name: str = DATASET_NAME, 12 | public_dataset_token: str = PUBLIC_DATASET_TOKEN, 13 | client: Optional[Client] = None, 14 | ): 15 | shared_client = Client( 16 | 
api_url="https://api.smith.langchain.com", api_key="placeholder" 17 | ) 18 | examples = list(shared_client.list_shared_examples(public_dataset_token)) 19 | client = client or Client() 20 | if client.has_dataset(dataset_name=dataset_name): 21 | loaded_examples = list(client.list_examples(dataset_name=dataset_name)) 22 | if len(loaded_examples) == len(examples): 23 | return 24 | else: 25 | ds = client.read_dataset(dataset_name=dataset_name) 26 | else: 27 | ds = client.create_dataset(dataset_name=dataset_name) 28 | client.create_examples( 29 | inputs=[e.inputs for e in examples], 30 | outputs=[e.outputs for e in examples], 31 | dataset_id=ds.id, 32 | ) 33 | print("Done creating dataset.") 34 | 35 | 36 | if __name__ == "__main__": 37 | import argparse 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--target-api-key", type=str, required=False) 41 | parser.add_argument("--target-endpoint", type=str, required=False) 42 | parser.add_argument("--dataset-name", type=str, default=DATASET_NAME) 43 | parser.add_argument( 44 | "--public-dataset-token", type=str, default=PUBLIC_DATASET_TOKEN 45 | ) 46 | args = parser.parse_args() 47 | client = None 48 | if args.target_api_key or args.target_endpoint: 49 | client = Client( 50 | api_key=args.target_api_key, 51 | api_url=args.target_endpoint, 52 | ) 53 | create_langchain_docs_dataset( 54 | dataset_name=args.dataset_name, 55 | public_dataset_token=args.public_dataset_token, 56 | client=client, 57 | ) 58 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langservehub-template" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Your Name "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | langsmith = ">=0.0.64,<0.1.0" 11 | sse-starlette = "^1.6.5" 12 | tomli-w = "^1.0.0" 13 | uvicorn = "^0.23.2" 14 | fastapi = "^0.104" 15 | langserve = ">=0.0.16" 16 | chat-langchain = {path = "packages/chat-langchain", develop = true} 17 | langchain-docs-retriever = {path = "packages/langchain-docs-retriever", develop = true} 18 | anthropic-iterative-search = {path = "packages/anthropic-iterative-search", develop = true} 19 | oai-assistant = {path = "packages/oai-assistant", develop = true} 20 | openai-functions-agent = {path = "packages/openai-functions-agent", develop = true} 21 | 22 | [tool.poetry.group.dev.dependencies] 23 | uvicorn = "^0.23.2" 24 | pygithub = "^2.1.1" 25 | 26 | 27 | [build-system] 28 | requires = ["poetry-core"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/run_evals.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib.util 3 | import sys 4 | import uuid 5 | from functools import partial 6 | from typing import Callable, Optional 7 | 8 | from anthropic_iterative_search.chain import chain as anthropic_agent_chain 9 | from chat_langchain.chain import create_chain 10 | from langchain.schema.runnable import Runnable 11 | from langchain.smith import RunEvalConfig, run_on_dataset 12 | from langchain_openai import ChatOpenAI 13 | from langsmith import Client 14 | from oai_assistant.chain import agent_executor as openai_assistant_chain 15 | from openai_functions_agent import agent_executor as 
openai_functions_agent_chain 16 | 17 | ls_client = Client() 18 | 19 | 20 | def import_from_path(path_name: str): 21 | func_name = "create_chain" 22 | if "::" in path_name: 23 | path_name, func_name = path_name.split("::") 24 | spec = importlib.util.spec_from_file_location("module_name", path_name) 25 | module = importlib.util.module_from_spec(spec) 26 | sys.modules["module_name"] = module 27 | spec.loader.exec_module(module) 28 | return getattr(module, func_name) 29 | 30 | 31 | def _get_chain_factory(arch: str) -> Callable: 32 | _map = { 33 | "chat": create_chain, 34 | "anthropic-iterative-search": lambda _: anthropic_agent_chain, 35 | "openai-functions-agent": lambda _: openai_functions_agent_chain, 36 | "openai-assistant": lambda _: openai_assistant_chain, 37 | } 38 | if arch in _map: 39 | return _map[arch] 40 | else: 41 | return import_from_path(arch) 42 | 43 | 44 | def create_runnable( 45 | arch: str, model_config: Optional[dict], retry_config: Optional[dict] = None 46 | ): 47 | factory = _get_chain_factory(arch) 48 | chain: Runnable = factory(model_config) 49 | if retry_config: 50 | return chain.with_retry(**retry_config) 51 | return chain 52 | 53 | 54 | def get_eval_config(): 55 | accuracy_criteria = { 56 | "accuracy": """ 57 | Score 1: The answer is incorrect and unrelated to the question or reference document. 58 | Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect. 59 | Score 5: The answer is partially correct but has significant errors or omissions. 60 | Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document. 61 | Score 10: The answer is correct, complete, and perfectly aligns with the reference document. 62 | 63 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 64 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct. 65 | """ # noqa 66 | } 67 | 68 | eval_llm = ChatOpenAI(model="gpt-4", temperature=0.0) 69 | return RunEvalConfig( 70 | evaluators=[ 71 | RunEvalConfig.LabeledScoreString( 72 | criteria=accuracy_criteria, llm=eval_llm, normalize_by=10.0 73 | ), 74 | # Mainly to compare with the above 75 | # Suspected to be less reliable. 
76 | RunEvalConfig.EmbeddingDistance(), 77 | ] 78 | ) 79 | 80 | 81 | def main( 82 | arch: str, 83 | dataset_name: str, 84 | model_config: Optional[dict] = None, 85 | max_concurrency: int = 5, 86 | project_name: Optional[str] = None, 87 | retry_config: Optional[dict] = None, 88 | ): 89 | eval_config = get_eval_config() 90 | project_name = project_name or arch 91 | project_name += f" {uuid.uuid4().hex[:4]}" 92 | run_on_dataset( 93 | client=ls_client, 94 | dataset_name=dataset_name, 95 | llm_or_chain_factory=partial( 96 | create_runnable, 97 | arch=arch, 98 | model_config=model_config, 99 | retry_config=retry_config, 100 | ), 101 | evaluation=eval_config, 102 | concurrency_level=max_concurrency, 103 | project_name=project_name, 104 | project_metadata={"arch": arch, "model_config": model_config}, 105 | ) 106 | 107 | 108 | if __name__ == "__main__": 109 | parser = argparse.ArgumentParser() 110 | parser.add_argument("--url", type=str) 111 | parser.add_argument("--dataset-name", type=str, default="Chat Langchain Pub") 112 | parser.add_argument("--project-name", type=Optional[str], default=None) 113 | parser.add_argument("--max-concurrency", type=int, default=5) 114 | args = parser.parse_args() 115 | main( 116 | args.url, 117 | args.dataset_name, 118 | max_concurrency=args.max_concurrency, 119 | project_name=args.project_name, 120 | ) 121 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/run_experiments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from prepare_dataset import create_langchain_docs_dataset 5 | from run_evals import main 6 | 7 | experiments = [ 8 | { 9 | # "server_url": "http://localhost:1983/openai-functions-agent", 10 | "arch": "openai-functions-agent", 11 | "project_name": "openai-functions-agent", 12 | }, 13 | { 14 | # "server_url": "http://localhost:1983/anthropic_chat", 15 | "arch": "chat", 16 | "model_config": { 17 | "chat_cls": "ChatAnthropic", 18 | "model": "claude-2", 19 | "temperature": 1.0, 20 | }, 21 | "project_name": "anthropic-chat", 22 | }, 23 | { 24 | "arch": "chat", 25 | "model_config": { 26 | "chat_cls": "ChatOpenAI", 27 | "model": "gpt-3.5-turbo-16k", 28 | }, 29 | # "server_url": "http://localhost:1983/chat", 30 | "project_name": "chat-gpt-3.5", 31 | }, 32 | { 33 | "arch": "chat", 34 | "model_config": { 35 | "chat_cls": "ChatFireworks", 36 | "model": "accounts/fireworks/models/mistral-7b-instruct-4k", 37 | }, 38 | "project_name": "mistral-7b-instruct-4k", 39 | }, 40 | { 41 | "arch": "chat", 42 | "model_config": { 43 | "chat_cls": "ChatFireworks", 44 | "model": "accounts/fireworks/models/llama-v2-34b-code-instruct-w8a16", 45 | }, 46 | "project_name": "llama-v2-34b-code-instruct-w8a16", 47 | }, 48 | { 49 | "arch": "chat", 50 | "model_config": { 51 | "chat_cls": "ChatFireworks", 52 | "model": "accounts/fireworks/models/zephyr-7b-beta", 53 | }, 54 | "project_name": "zephyr-7b-beta", 55 | }, 56 | { 57 | "arch": "chat", 58 | "model_config": { 59 | "chat_cls": "ChatOpenAI", 60 | "model": "gpt-4", 61 | }, 62 | "project_name": "gpt-4-chat", 63 | }, 64 | { 65 | "arch": "openai-assistant", 66 | "model_config": {}, 67 | "project_name": "openai-assistant", 68 | "max_concurrency": 2, # Rate limit is VERY low right now. 
69 | "retry_config": { 70 | "stop_after_attempt": 10, 71 | }, 72 | }, 73 | # Not worth our time it's so bad and slow 74 | { 75 | # "server_url": "http://localhost:1983/anthropic_iterative_search", 76 | "arch": "anthropic-iterative-search", 77 | "max_concurrency": 2, 78 | "project_name": "anthropic-iterative-search", 79 | }, 80 | ] 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument("--dataset-name", type=str, default="LangChain Docs Q&A") 85 | parser.add_argument( 86 | "--config", 87 | type=str, 88 | default=None, 89 | nargs="*", 90 | help="Path to a JSON file with experiment config." 91 | " If specified, the include and exclude args are ignored", 92 | ) 93 | parser.add_argument("--include", type=str, nargs="+", default=None) 94 | parser.add_argument( 95 | "--exclude", 96 | type=str, 97 | nargs="+", 98 | ) 99 | args = parser.parse_args() 100 | create_langchain_docs_dataset(dataset_name=args.dataset_name) 101 | selected_experiments = experiments 102 | if args.config: 103 | selected_experiments = [] 104 | for config_path in args.config: 105 | with open(config_path) as f: 106 | selected_experiments.append(json.load(f)) 107 | elif args.include: 108 | selected_experiments = [ 109 | e for e in selected_experiments if e["project_name"] in args.include 110 | ] 111 | to_exclude = args.exclude or [] 112 | if args.include and not to_exclude: 113 | to_exclude = [ 114 | "anthropic-iterative-search", 115 | "openai-assistant", 116 | ] 117 | if args.exclude: 118 | selected_experiments = [ 119 | e for e in selected_experiments if e["project_name"] not in args.exclude 120 | ] 121 | 122 | for experiment in selected_experiments: 123 | print("Running experiment:", experiment) 124 | main( 125 | **experiment, 126 | dataset_name=args.dataset_name, 127 | ) 128 | -------------------------------------------------------------------------------- /archived/meta-evals/README.md: -------------------------------------------------------------------------------- 1 | # Meta-Evaluations 2 | 3 | 4 | This folder holds some scripts/tests for evaluating some of LangChain's default evaluators. -------------------------------------------------------------------------------- /archived/meta-evals/correctness/README.md: -------------------------------------------------------------------------------- 1 | # Correctness Meta-Evals 2 | 3 | This folder contains a test script to check the aggregate performance of the "correctness"-related evaluators. 4 | 5 | To upload the dataset to LangSmith, run: 6 | 7 | ```bash 8 | python meta-evals/correctness/_upload_dataset.py 9 | ``` 10 | 11 | To test, run: 12 | 13 | ```bash 14 | pytest --capture=no meta-evals/correctness/test_correctness_evaluator.py 15 | ``` 16 | 17 | Then navigate to the Web Q&A dataset to review the results. 
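The dataset files under `data/` share one top-level layout, which `_upload_dataset.py` reads directly: a dataset `name` plus a list of `examples`, each holding an `inputs` dict and an `outputs` dict. The sketch below shows that layout as a Python literal; the dataset name and the keys inside `inputs`/`outputs` are illustrative assumptions, not values copied from the real data files.

```python
# Top-level shape consumed by _upload_dataset.py; inner keys/values are illustrative only.
dataset_file = {
    "name": "My Correctness Dataset",
    "examples": [
        {
            "inputs": {"question": "What is 2 + 2?"},
            "outputs": {"answer": "4"},
        },
    ],
}
```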
-------------------------------------------------------------------------------- /archived/meta-evals/correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/meta-evals/correctness/__init__.py -------------------------------------------------------------------------------- /archived/meta-evals/correctness/_upload_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | from langsmith import Client 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | 9 | # Synthetic dataset adapted from https://aclanthology.org/D13-1160/ 10 | 11 | _DATA_REPO = Path(__file__).parent / "data" 12 | _CLIENT = Client() 13 | 14 | 15 | def _upload_dataset(path: str): 16 | with open(path, "r") as f: 17 | data = json.load(f) 18 | dataset_name = data["name"] 19 | examples = data["examples"] 20 | try: 21 | dataset = _CLIENT.create_dataset(dataset_name) 22 | except Exception: 23 | logging.warning(f"Skipping {dataset_name}") 24 | return 25 | logging.info(f"Uploading dataset: {dataset_name}") 26 | _CLIENT.create_examples( 27 | inputs=[example["inputs"] for example in examples], 28 | outputs=[example["outputs"] for example in examples], 29 | dataset_id=dataset.id, 30 | ) 31 | 32 | 33 | if __name__ == "__main__": 34 | for dataset in _DATA_REPO.glob("*.json"): 35 | print("Uploading dataset:", dataset) 36 | _upload_dataset(dataset) 37 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/.gitignore: -------------------------------------------------------------------------------- 1 | chromadb/ 2 | index.md 3 | Untitled.ipynb 4 | -------------------------------------------------------------------------------- /docs/source/_static/parrot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/docs/source/_static/parrot.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | import pathlib 20 | import sys 21 | from typing import List 22 | 23 | import toml 24 | 25 | ROOT_FOLDER = str(pathlib.Path(__file__).parent.parent.parent) 26 | 27 | # Add the project root to the path 28 | sys.path.insert(0, ROOT_FOLDER) 29 | 30 | with open("../../pyproject.toml") as f: 31 | data = toml.load(f) 32 | 33 | project = "LangChain Benchmarks" 34 | copyright = "2023, Langchain AI" 35 | author = "Langchain AI" 36 | 37 | version = data["tool"]["poetry"]["version"] 38 | release = version 39 | 40 | html_title = project + " " + version 41 | 42 | 43 | # -- General configuration --------------------------------------------------- 44 | 45 | # Add any Sphinx extension module names here, as strings. They can be 46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 47 | # ones. 48 | extensions = [ 49 | "sphinx.ext.autodoc", 50 | "sphinx.ext.autodoc.typehints", 51 | "sphinx.ext.autosummary", 52 | "sphinx.ext.napoleon", 53 | "sphinx.ext.viewcode", 54 | "myst_nb", 55 | "sphinx_copybutton", 56 | "IPython.sphinxext.ipython_console_highlighting", 57 | ] 58 | source_suffix = [".ipynb", ".html", ".md", ".rst"] 59 | 60 | # Add any paths that contain templates here, relative to this directory. 61 | templates_path = ["_templates"] 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path. 66 | exclude_patterns: List[str] = [] 67 | 68 | 69 | # -- Options for HTML output ------------------------------------------------- 70 | 71 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 72 | # a list of builtin themes. 73 | # 74 | html_theme = "sphinx_book_theme" 75 | 76 | html_theme_options = { 77 | "path_to_docs": "docs/source", 78 | "repository_url": "https://github.com/langchain-ai/langchain-benchmarks", 79 | "home_page_in_toc": True, 80 | "show_navbar_depth": 2, 81 | "use_sidenotes": True, 82 | "use_repository_button": True, 83 | "use_issues_button": True, 84 | "use_source_button": True, 85 | "use_fullscreen_button": True, 86 | "repository_branch": "main", 87 | "launch_buttons": { 88 | "notebook_interface": "jupyterlab", 89 | "colab_url": "https://colab.research.google.com", 90 | }, 91 | } 92 | 93 | html_context = { 94 | "display_github": True, # Integrate GitHub 95 | "github_user": "langchain-ai", # Username 96 | "github_repo": "langchain-benchmarks", # Repo name 97 | "github_version": "main", # Version 98 | "conf_py_path": "/docs/", # Path in the checkout to the docs root 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # These paths are either relative to html_static_path 107 | # or fully qualified paths (eg. https://...) 108 | html_css_files = [ 109 | "css/custom.css", 110 | ] 111 | 112 | nb_execution_mode = "off" 113 | autosummary_generate = True 114 | -------------------------------------------------------------------------------- /docs/source/toc.segment: -------------------------------------------------------------------------------- 1 | ```{toctree} 2 | :maxdepth: 2 3 | :caption: Introduction 4 | 5 | ./notebooks/getting_started 6 | ./notebooks/models 7 | ./notebooks/datasets 8 | ``` 9 | 10 | 11 | ```{toctree} 12 | :maxdepth: 0 13 | :caption: Tool Usage 14 | 15 | ./notebooks/tool_usage/intro 16 | ./notebooks/tool_usage/relational_data 17 | ./notebooks/tool_usage/multiverse_math 18 | ./notebooks/tool_usage/typewriter_1 19 | ./notebooks/tool_usage/typewriter_26 20 | ./notebooks/tool_usage/benchmark_all_tasks 21 | ``` 22 | 23 | ```{toctree} 24 | :maxdepth: 0 25 | :caption: Extraction 26 | 27 | ./notebooks/extraction/intro 28 | ./notebooks/extraction/email 29 | ./notebooks/extraction/chat_extraction 30 | ./notebooks/extraction/high_cardinality 31 | ``` 32 | 33 | ```{toctree} 34 | :maxdepth: 2 35 | :caption: RAG 36 | 37 | ./notebooks/retrieval/intro 38 | ./notebooks/retrieval/langchain_docs_qa 39 | ./notebooks/retrieval/semi_structured_benchmarking/semi_structured 40 | ./notebooks/retrieval/semi_structured_benchmarking/ss_eval_chunk_sizes 41 | ./notebooks/retrieval/semi_structured_benchmarking/ss_eval_long_context 42 | ./notebooks/retrieval/semi_structured_benchmarking/ss_eval_multi_vector 43 | ./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval_baseline 44 | ./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval 45 | ./notebooks/retrieval/comparing_techniques 46 | ``` 47 | 48 | ```{toctree} 49 | :maxdepth: 2 50 | :caption: Benchmarking Without LangSmith 51 | ./notebooks/run_without_langsmith 52 | ``` 53 | -------------------------------------------------------------------------------- /langchain_benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | .sql 2 | -------------------------------------------------------------------------------- /langchain_benchmarks/__init__.py: 
-------------------------------------------------------------------------------- 1 | from importlib import metadata 2 | 3 | from langchain_benchmarks.model_registration import model_registry 4 | from langchain_benchmarks.rate_limiting import RateLimiter 5 | from langchain_benchmarks.registration import registry 6 | from langchain_benchmarks.utils._langsmith import ( 7 | clone_public_dataset, 8 | download_public_dataset, 9 | ) 10 | 11 | try: 12 | __version__ = metadata.version(__package__) 13 | except metadata.PackageNotFoundError: 14 | # Case where package metadata is not available. 15 | __version__ = "" 16 | del metadata # optional, avoids polluting the results of dir(__package__) 17 | 18 | 19 | # Please keep this list sorted! 20 | __all__ = [ 21 | "__version__", 22 | "clone_public_dataset", 23 | "download_public_dataset", 24 | "model_registry", 25 | "RateLimiter", 26 | "registry", 27 | ] 28 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.extraction.evaluators import get_eval_config 2 | from langchain_benchmarks.extraction.implementations import ( 3 | create_openai_function_based_extractor, 4 | ) 5 | 6 | # Keep this sorted 7 | __all__ = [ 8 | "get_eval_config", 9 | "create_openai_function_based_extractor", 10 | ] 11 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/evaluators.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.chat_models.base import BaseChatModel 4 | from langchain.smith import RunEvalConfig 5 | from langchain_openai import ChatOpenAI 6 | 7 | 8 | def get_eval_config(eval_llm: Optional[BaseChatModel] = None) -> RunEvalConfig: 9 | eval_llm = eval_llm or ChatOpenAI( 10 | model="gpt-4", 11 | temperature=0, 12 | model_kwargs={"seed": 42}, 13 | max_retries=1, 14 | request_timeout=60, 15 | ) 16 | """Get the evaluation configuration for the email task.""" 17 | return RunEvalConfig( 18 | evaluators=[ 19 | "json_edit_distance", 20 | RunEvalConfig.LabeledScoreString( 21 | criteria={ 22 | "accuracy": """ 23 | Score 1: The answer is incorrect and unrelated to the question or reference document. 24 | Score 3: The answer is partially correct but has more than one omission or major errors. 25 | Score 5: The answer is mostly correct but has more than one omission or major error. 26 | Score 7: The answer is mostly correct but has at most one omission or major error. 27 | Score 9: The answer is mostly correct with no omissions and only minor errors, and aligns with the reference document. 28 | Score 10: The answer is correct, complete, and aligns with the reference document. Extra information is acceptable if it is sensible. 29 | 30 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 31 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct and not be penalized. 
32 | """ # noqa 33 | }, 34 | llm=eval_llm, 35 | normalize_by=10.0, 36 | ), 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/implementations.py: -------------------------------------------------------------------------------- 1 | """Default implementations of LLMs that can be used for extraction.""" 2 | from typing import Any, Dict, List, Optional, Type 3 | 4 | from langchain.chains.openai_functions import convert_to_openai_function 5 | from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser 6 | from langchain.prompts import ChatPromptTemplate 7 | from langchain.schema.runnable import Runnable 8 | from langchain_openai import ChatOpenAI 9 | from langsmith.client import Client 10 | from pydantic import BaseModel 11 | 12 | from langchain_benchmarks.extraction.evaluators import get_eval_config 13 | from langchain_benchmarks.schema import ExtractionTask 14 | 15 | # PUBLIC API 16 | 17 | 18 | def create_openai_function_based_extractor( 19 | prompt: ChatPromptTemplate, 20 | llm: Runnable, 21 | schema: Type[BaseModel], 22 | ) -> Runnable[dict, dict]: 23 | """Create an extraction chain that uses an LLM to extract a schema. 24 | 25 | The underlying functionality is exclusively for LLMs that support 26 | extraction using openai functions format. 27 | 28 | Args: 29 | prompt: The prompt to use for extraction. 30 | llm: The LLM to use for extraction. 31 | schema: The schema to extract. 32 | 33 | Returns: 34 | An llm that will extract the schema 35 | """ 36 | openai_functions = [convert_to_openai_function(schema)] 37 | llm_kwargs = { 38 | "functions": openai_functions, 39 | "function_call": {"name": openai_functions[0]["name"]}, 40 | } 41 | output_parser = JsonOutputFunctionsParser() 42 | extraction_chain = ( 43 | prompt | llm.bind(**llm_kwargs) | output_parser | (lambda x: {"output": x}) 44 | ) 45 | return extraction_chain 46 | 47 | 48 | def run_on_dataset( 49 | task: ExtractionTask, 50 | llm: Runnable, 51 | *, 52 | tags: Optional[List[str]] = None, 53 | **kwargs: Any, 54 | ) -> Dict[str, Any]: 55 | """Run an LLM on a dataset. 56 | 57 | Args: 58 | task: The task to run on. 59 | llm: The LLM to run. 60 | tags: The tags to use for the run. 61 | kwargs: Additional arguments to pass to the client. 
62 | """ 63 | client = Client() 64 | eval_llm = ChatOpenAI( 65 | model="gpt-4", 66 | temperature=0.0, 67 | model_kwargs={"seed": 42}, 68 | max_retries=1, 69 | request_timeout=60, 70 | ) 71 | return client.run_on_dataset( 72 | dataset_name=task.name, 73 | llm_or_chain_factory=create_openai_function_based_extractor( 74 | task.instructions, llm, task.schema 75 | ), 76 | evaluation=get_eval_config(eval_llm), 77 | tags=tags, 78 | **kwargs, 79 | ) 80 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/langchain_benchmarks/extraction/tasks/__init__.py -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/chat_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import ChatPromptTemplate 2 | 3 | from langchain_benchmarks.extraction.tasks.chat_extraction.evaluators import ( 4 | get_eval_config, 5 | ) 6 | from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket 7 | from langchain_benchmarks.schema import ExtractionTask 8 | 9 | # This is a default prompt that works reasonably for OpenAI models. 10 | 11 | DEFAULT_CHAT_MODEL_PROMPT = ChatPromptTemplate.from_messages( 12 | [ 13 | ( 14 | "system", 15 | "You are a helpdesk assistant responsible with extracting information" 16 | " and generating tickets. Dialogues are between a user and" 17 | " a support engineer.", 18 | ), 19 | ( 20 | "user", 21 | "Generate a ticket for the following question-response pair:\n" 22 | "\n{dialogue}\n", 23 | ), 24 | ] 25 | ) 26 | 27 | 28 | CHAT_EXTRACTION_TASK = ExtractionTask( 29 | name="Chat Extraction", 30 | dataset_id="https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d", 31 | schema=GenerateTicket, 32 | description="""A dataset meant to test the ability of an LLM to extract and infer 33 | structured information from a dialogue. The dialogue is between a user and a support 34 | engineer. 
Outputs should be structured as a JSON object and test both the ability 35 | of the LLM to correctly structure the information and its ability to perform simple 36 | classification tasks.""", 37 | instructions=DEFAULT_CHAT_MODEL_PROMPT, 38 | ) 39 | 40 | 41 | __all__ = ["CHAT_EXTRACTION_TASK", "get_eval_config"] 42 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/chat_extraction/schema.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | class QuestionCategory(str, Enum): 8 | IMPLEMENTATION_ISSUES = "Implementation Issues" # about existing implementation 9 | FEATURE_REQUESTS = "Feature Requests" 10 | CONCEPT_EXPLANATIONS = "Concept Explanations" 11 | CODE_OPTIMIZATION = "Code Optimization" 12 | SECURITY_AND_PRIVACY_CONCERNS = "Security and Privacy Concerns" 13 | MODEL_TRAINING_AND_FINE_TUNING = "Model Training and Fine-tuning" 14 | DATA_HANDLING_AND_MANIPULATION = "Data Handling and Manipulation" 15 | USER_INTERACTION_FLOW = "User Interaction Flow" 16 | TECHNICAL_INTEGRATION = "Technical Integration" 17 | ERROR_HANDLING_AND_LOGGING = "Error Handling and Logging" 18 | CUSTOMIZATION_AND_CONFIGURATION = "Customization and Configuration" 19 | EXTERNAL_API_AND_DATA_SOURCE_INTEGRATION = ( 20 | "External API and Data Source Integration" 21 | ) 22 | LANGUAGE_AND_LOCALIZATION = "Language and Localization" 23 | STREAMING_AND_REAL_TIME_PROCESSING = "Streaming and Real-time Processing" 24 | TOOL_DEVELOPMENT = "Tool Development" 25 | FUNCTION_CALLING = "Function Calling" 26 | LLM_INTEGRATIONS = "LLM Integrations" 27 | GENERAL_AGENT_QUESTIONS = "General Agent Question" 28 | GENERAL_CHIT_CHAT = "General Chit Chat" 29 | MEMORY = "Memory" 30 | DEBUGGING_HELP = "Debugging Help" 31 | APPLICATION_DESIGN = "Application Design" 32 | PROMPT_TEMPLATES = "Prompt Templates" 33 | COST_TRACKING = "Cost Tracking" 34 | OTHER = "Other" 35 | 36 | 37 | class Sentiment(str, Enum): 38 | NEGATIVE = "Negative" 39 | NEUTRAL = "Neutral" 40 | POSITIVE = "Positive" 41 | 42 | 43 | class ProgrammingLanguage(str, Enum): 44 | PYTHON = "python" 45 | JAVASCRIPT = "javascript" 46 | TYPESCRIPT = "typescript" 47 | UNKNOWN = "unknown" 48 | OTHER = "other" 49 | 50 | 51 | class QuestionCategorization(BaseModel): 52 | question_category: QuestionCategory 53 | category_if_other: Optional[str] = Field( 54 | default=None, description="question category if the category above is 'other'" 55 | ) 56 | is_off_topic: bool = Field( 57 | description="If the input is general chit chat or does not pertain to technical inqueries about LangChain or building/debugging applications with LLMs/AI, it is off topic. For context, LangChain is a library and framework designed" 58 | " to assist in building applications with LLMs. Questions may also be about similar packages like LangServe, LangSmith, OpenAI, Anthropic, vectorstores, agents, etc." 
59 | ) 60 | toxicity: int = Field( 61 | ge=0, lt=6, description="Whether or not the input question is toxic" 62 | ) 63 | sentiment: Sentiment 64 | programming_language: ProgrammingLanguage 65 | 66 | 67 | # resolve the issue, provide guidance, or ask for more information 68 | class ResponseType(str, Enum): 69 | RESOLVE_ISSUE = "resolve issue" 70 | PROVIDE_GUIDANCE = "provide guidance" 71 | REQUEST_INFORMATION = "request information" 72 | GIVE_UP = "give up" 73 | NONE = "none" 74 | OTHER = "other" 75 | 76 | 77 | class ResponseCategorization(BaseModel): 78 | response_type: ResponseType 79 | response_type_if_other: Optional[str] = None 80 | confidence_level: int = Field( 81 | ge=0, lt=6, description="The confidence of the assistant in its answer." 82 | ) 83 | followup_actions: Optional[List[str]] = Field( 84 | description="Actions the assistant recommended the user take." 85 | ) 86 | 87 | 88 | class GenerateTicket(BaseModel): 89 | """Generate a ticket containing all the extracted information.""" 90 | 91 | issue_summary: str = Field( 92 | description="short (<10 word) summary of the issue or question" 93 | ) 94 | question: QuestionCategorization = Field( 95 | description="Information inferred from the the question." 96 | ) 97 | response: ResponseCategorization = Field( 98 | description="Information inferred from the the response." 99 | ) 100 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/email_task.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional 3 | 4 | from langchain.prompts import ChatPromptTemplate 5 | from pydantic import BaseModel, Field 6 | 7 | from langchain_benchmarks.schema import ExtractionTask 8 | 9 | 10 | class ToneEnum(str, Enum): 11 | """The tone of the email.""" 12 | 13 | positive = "positive" 14 | negative = "negative" 15 | 16 | 17 | class Email(BaseModel): 18 | """Relevant information about an email.""" 19 | 20 | sender: Optional[str] = Field(None, description="The sender's name, if available") 21 | sender_phone_number: Optional[str] = Field( 22 | None, description="The sender's phone number, if available" 23 | ) 24 | sender_address: Optional[str] = Field( 25 | None, description="The sender's address, if available" 26 | ) 27 | action_items: List[str] = Field( 28 | ..., description="A list of action items requested by the email" 29 | ) 30 | topic: str = Field( 31 | ..., description="High level description of what the email is about" 32 | ) 33 | tone: ToneEnum = Field(..., description="The tone of the email.") 34 | 35 | 36 | # This is a default prompt that works for chat models. 37 | DEFAULT_CHAT_MODEL_PROMPT = ChatPromptTemplate.from_messages( 38 | [ 39 | ("system", "You are an expert researcher."), 40 | ( 41 | "human", 42 | "What can you tell me about the following email? Make sure to " 43 | "extract the question in the correct format. " 44 | "Here is the email:\n ```\n{input}\n```", 45 | ), 46 | ] 47 | ) 48 | 49 | EMAIL_EXTRACTION_TASK = ExtractionTask( 50 | name="Email Extraction", 51 | dataset_id="https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d", 52 | schema=Email, 53 | description="""\ 54 | A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, \ 55 | as well as a script for initial extraction and formatting of other emails from \ 56 | an arbitrary .mbox file like the one exported by Gmail. 
57 | 58 | Some additional cleanup of the data was done by hand after the initial pass. 59 | 60 | See https://github.com/jacoblee93/oss-model-extraction-evals. 61 | """, 62 | instructions=DEFAULT_CHAT_MODEL_PROMPT, 63 | ) 64 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/high_cardinality/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.extraction.tasks.high_cardinality.name_correction import ( 2 | NAME_CORRECTION_TASK, 3 | ) 4 | 5 | __all__ = ["NAME_CORRECTION_TASK"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/high_cardinality/name_correction.py: -------------------------------------------------------------------------------- 1 | from langchain.smith import RunEvalConfig 2 | from langsmith.evaluation import EvaluationResult, run_evaluator 3 | from langsmith.schemas import Example, Run 4 | from pydantic import BaseModel, Field 5 | 6 | from langchain_benchmarks.schema import ExtractionTask 7 | 8 | 9 | @run_evaluator 10 | def correct_name(run: Run, example: Example) -> EvaluationResult: 11 | if "name" in run.outputs: 12 | prediction = run.outputs["name"] 13 | else: 14 | prediction = run.outputs["output"]["name"] 15 | name = example.outputs["name"] 16 | score = int(name == prediction) 17 | return EvaluationResult(key="correct", score=score) 18 | 19 | 20 | class Person(BaseModel): 21 | """Information about a person.""" 22 | 23 | name: str = Field(..., description="The person's name") 24 | 25 | 26 | NAME_CORRECTION_TASK = ExtractionTask( 27 | name="Name Correction", 28 | dataset_id="https://smith.langchain.com/public/78df83ee-ba7f-41c6-832c-2b23327d4cf7/d", 29 | schema=Person, 30 | description="""A dataset of 23 misspelled full names and their correct spellings.""", 31 | dataset_url="https://smith.langchain.com/public/78df83ee-ba7f-41c6-832c-2b23327d4cf7/d", 32 | dataset_name="Extracting Corrected Names", 33 | eval_config=RunEvalConfig( 34 | custom_evaluators=[correct_name], 35 | ), 36 | ) 37 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/.gitignore: -------------------------------------------------------------------------------- 1 | *.sql 2 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.evaluators import get_eval_config 2 | from langchain_benchmarks.rag.tasks import LANGCHAIN_DOCS_TASK 3 | 4 | # Please keep this sorted 5 | __all__ = ["get_eval_config", "LANGCHAIN_DOCS_TASK"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/evaluators.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.evaluation import load_evaluator 4 | from langchain.smith import RunEvalConfig 5 | from langchain_openai import ChatOpenAI 6 | 7 | try: 8 | from langchain.schema.language_model import BaseLanguageModel 9 | except ImportError: 10 | from langchain_core.language_models import BaseLanguageModel 11 | from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator 12 | from langsmith.schemas import Example, Run 13 | 14 | 15 | # TODO: Split this into an assertion-by-assertion evaluator 16 | # 
TODO: Combine with a document relevance evaluator (to report retriever performance) 17 | class FaithfulnessEvaluator(RunEvaluator): 18 | def __init__(self, llm: Optional[BaseLanguageModel] = None): 19 | self.evaluator = load_evaluator( 20 | "labeled_score_string", 21 | criteria={ 22 | "faithfulness": """ 23 | Score 1: The answer directly contradicts the information provided in the reference docs. 24 | Score 3: The answer contains a mix of correct information from the reference docs and incorrect or unverifiable information not found in the docs. 25 | Score 5: The answer is mostly aligned with the reference docs but includes extra information that, while not contradictory, is not verified by the docs. 26 | Score 7: The answer aligns well with the reference docs but includes minor, commonly accepted facts not found in the docs. 27 | Score 10: The answer perfectly aligns with and is fully entailed by the reference docs, with no extra information.""" 28 | }, 29 | llm=llm, 30 | normalize_by=10, 31 | ) 32 | 33 | @staticmethod 34 | def _get_retrieved_docs(run: Run) -> str: 35 | # This assumes there is only one retriever in your chain. 36 | # To select more precisely, name your retrieval chain 37 | # using with_config(name="my_unique_name") and look up 38 | # by run.name 39 | runs = [run] 40 | while runs: 41 | run = runs.pop() 42 | if run.run_type == "retriever": 43 | return str(run.outputs["documents"]) 44 | if run.child_runs: 45 | runs.extend(run.child_runs[::-1]) 46 | return "" 47 | 48 | def evaluate_run( 49 | self, run: Run, example: Optional[Example] = None 50 | ) -> EvaluationResult: 51 | try: 52 | docs_string = self._get_retrieved_docs(run) 53 | docs_string = f"Reference docs:\n\n{docs_string}\n\n\n" 54 | input_query = run.inputs["question"] 55 | if run.outputs is not None and len(run.outputs) == 1: 56 | prediction = next(iter(run.outputs.values())) 57 | else: 58 | prediction = run.outputs["output"] 59 | result = self.evaluator.evaluate_strings( 60 | input=input_query, 61 | prediction=prediction, 62 | reference=docs_string, 63 | ) 64 | return EvaluationResult( 65 | **{"key": "faithfulness", "comment": result.get("reasoning"), **result} 66 | ) 67 | except Exception as e: 68 | return EvaluationResult(key="faithfulness", score=None, comment=repr(e)) 69 | 70 | 71 | _ACCURACY_CRITERION = { 72 | "accuracy": """ 73 | Score 1: The answer is incorrect and unrelated to the question or reference document. 74 | Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect. 75 | Score 5: The answer is partially correct but has significant errors or omissions. 76 | Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document. 77 | Score 10: The answer is correct, complete, and perfectly aligns with the reference document. 78 | 79 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 80 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct. 
81 | """  # noqa
82 | }
83 | 
84 | 
85 | def get_eval_config() -> RunEvalConfig:
86 |     """Returns the evaluator for the environment."""
87 |     eval_llm = ChatOpenAI(
88 |         model="gpt-4",
89 |         temperature=0.0,
90 |         model_kwargs={"seed": 42},
91 |         max_retries=1,
92 |         request_timeout=60,
93 |     )
94 |     # Use a longer-context LLM to check documents
95 |     faithfulness_eval_llm = ChatOpenAI(
96 |         model="gpt-4-1106-preview",
97 |         temperature=0.0,
98 |         model_kwargs={"seed": 42},
99 |         max_retries=1,
100 |         request_timeout=60,
101 |     )
102 | 
103 |     return RunEvalConfig(
104 |         evaluators=[
105 |             RunEvalConfig.LabeledScoreString(
106 |                 criteria=_ACCURACY_CRITERION, llm=eval_llm, normalize_by=10.0
107 |             ),
108 |             RunEvalConfig.EmbeddingDistance(),
109 |         ],
110 |         custom_evaluators=[FaithfulnessEvaluator(llm=faithfulness_eval_llm)],
111 |     )
112 | 
--------------------------------------------------------------------------------
/langchain_benchmarks/rag/tasks/.gitignore:
--------------------------------------------------------------------------------
1 | pdfs/
2 | 
--------------------------------------------------------------------------------
/langchain_benchmarks/rag/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
2 | from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.task import (
3 |     MULTI_MODAL_SLIDE_DECKS_TASK,
4 | )
5 | from langchain_benchmarks.rag.tasks.semi_structured_reports.task import (
6 |     SEMI_STRUCTURED_REPORTS_TASK,
7 | )
8 | 
9 | # Please keep this sorted
10 | __all__ = [
11 |     "LANGCHAIN_DOCS_TASK",
12 |     "MULTI_MODAL_SLIDE_DECKS_TASK",
13 |     "SEMI_STRUCTURED_REPORTS_TASK",
14 | ]
15 | 
--------------------------------------------------------------------------------
/langchain_benchmarks/rag/tasks/langchain_docs/README.md:
--------------------------------------------------------------------------------
1 | # LangChain Docs Task
2 | 
3 | This code contains utilities to scrape the LangChain docs (already run) and index them
4 | using common techniques. The docs were scraped using the code in `_ingest_docs.py` and
5 | uploaded to GCS. To better compare retrieval techniques, we hold these constant and pull
6 | from that cache whenever generating different indices.
7 | 
8 | 
9 | The content in `indexing` composes some common indexing strategies with default parameters for
10 | benchmarking on the LangChain docs.
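As a rough usage sketch (an editor's addition, not part of the original docs), the cached documents and the registered factories can be combined roughly as follows; the embedding model and the `transformation_name` label below are assumptions, not defaults chosen by this package:

```python
# Illustrative only: build one of the benchmark retrievers over the cached docs.
from langchain_openai import OpenAIEmbeddings

from langchain_benchmarks.rag.tasks.langchain_docs.indexing.retriever_registry import (
    RETRIEVER_FACTORIES,
    load_docs_from_parquet,
)

docs = load_docs_from_parquet()  # fetches the cached parquet file on first use
retriever = RETRIEVER_FACTORIES["basic"](  # other registered options: "parent-doc", "hyde"
    OpenAIEmbeddings(),  # assumed embedding model
    docs=docs,
    transformation_name="raw",  # assumed label
)
relevant_docs = retriever.get_relevant_documents("How do I use a retriever?")
```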
-------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.langchain_docs import architectures, indexing 2 | from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK 3 | 4 | DATASET_ID = ( 5 | "452ccafc-18e1-4314-885b-edd735f17b9d" # ID of public LangChain Docs dataset 6 | ) 7 | 8 | __all__ = ["architectures", "indexing", "DATASET_ID", "LANGCHAIN_DOCS_TASK"] 9 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.langchain_docs.architectures.chain_registry import ( 2 | ARCH_FACTORIES, 3 | ) 4 | 5 | __all__ = ["ARCH_FACTORIES"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/architectures/chain_registry.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.base_language import BaseLanguageModel 4 | from langchain.schema.retriever import BaseRetriever 5 | from langchain.schema.runnable import Runnable 6 | from langchain_openai import ChatOpenAI 7 | 8 | from langchain_benchmarks.rag.tasks.langchain_docs.architectures.crqa import ( 9 | create_response_chain, 10 | get_default_response_generator, 11 | ) 12 | 13 | 14 | def default_response_chain( 15 | retriever: BaseRetriever, 16 | response_generator: Optional[Runnable] = None, 17 | llm: Optional[BaseLanguageModel] = None, 18 | ) -> None: 19 | """Get the chain responsible for generating a response based on the retrieved documents.""" 20 | response_generator = response_generator or get_default_response_generator( 21 | llm=llm or ChatOpenAI(model="gpt-3.5-turbo-16k", model_kwargs={"seed": 42}) 22 | ) 23 | return create_response_chain( 24 | response_generator=response_generator, retriever=retriever 25 | ) 26 | 27 | 28 | ARCH_FACTORIES = { 29 | "conversational-retrieval-qa": default_response_chain, 30 | } 31 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/architectures/crqa.py: -------------------------------------------------------------------------------- 1 | """Chat langchain 'engine'.""" 2 | # TODO: some simplified architectures that are 3 | # environment-agnostic 4 | from operator import itemgetter 5 | from typing import Callable, Dict, List, Optional, Sequence 6 | 7 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 8 | from langchain.schema import Document 9 | from langchain.schema.language_model import BaseLanguageModel 10 | from langchain.schema.messages import AIMessage, HumanMessage 11 | from langchain.schema.output_parser import StrOutputParser 12 | from langchain.schema.retriever import BaseRetriever 13 | from langchain.schema.runnable import ( 14 | Runnable, 15 | RunnableLambda, 16 | ) 17 | from langchain.schema.runnable.passthrough import RunnableAssign 18 | from pydantic import BaseModel 19 | 20 | RESPONSE_TEMPLATE = """\ 21 | You are an expert programmer and problem-solver, tasked with answering any question \ 22 | about Langchain. 
23 | 24 | Generate a comprehensive and informative answer of 80 words or less for the \ 25 | given question based solely on the provided search results (URL and content). You must \ 26 | only use information from the provided search results. Use an unbiased and \ 27 | journalistic tone. Combine search results together into a coherent answer. Do not \ 28 | repeat text. Cite search results using [${{number}}] notation. Only cite the most \ 29 | relevant results that answer the question accurately. Place these citations at the end \ 30 | of the sentence or paragraph that reference them - do not put them all at the end. If \ 31 | different results refer to different entities within the same name, write separate \ 32 | answers for each entity. 33 | 34 | You should use bullet points in your answer for readability. Put citations where they apply 35 | rather than putting them all at the end. 36 | 37 | If there is nothing in the context relevant to the question at hand, just say "Hmm, \ 38 | I'm not sure." Don't try to make up an answer. 39 | 40 | Anything between the following `context` html blocks is retrieved from a knowledge \ 41 | bank, not part of the conversation with the user. 42 | 43 | 44 | {context} 45 | 46 | 47 | REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \ 48 | not sure." Don't try to make up an answer. Anything between the preceding 'context' \ 49 | html blocks is retrieved from a knowledge bank, not part of the conversation with the \ 50 | user.\ 51 | """ 52 | 53 | 54 | class ChatRequest(BaseModel): 55 | question: str 56 | chat_history: Optional[List[Dict[str, str]]] 57 | 58 | 59 | def _format_docs(docs: Sequence[Document]) -> str: 60 | formatted_docs = [] 61 | for i, doc in enumerate(docs): 62 | doc_string = f"{doc.page_content}" 63 | formatted_docs.append(doc_string) 64 | return "\n".join(formatted_docs) 65 | 66 | 67 | def serialize_history(request: ChatRequest): 68 | chat_history = request.get("chat_history") or [] 69 | converted_chat_history = [] 70 | for message in chat_history: 71 | if message.get("human") is not None: 72 | converted_chat_history.append(HumanMessage(content=message["human"])) 73 | if message.get("ai") is not None: 74 | converted_chat_history.append(AIMessage(content=message["ai"])) 75 | return converted_chat_history 76 | 77 | 78 | def get_default_response_generator(llm: BaseLanguageModel) -> Runnable: 79 | prompt = ChatPromptTemplate.from_messages( 80 | [ 81 | ("system", RESPONSE_TEMPLATE), 82 | MessagesPlaceholder(variable_name="chat_history"), 83 | ("human", "{question}"), 84 | ] 85 | ) 86 | 87 | return (prompt | llm | StrOutputParser()).with_config( 88 | run_name="GenerateResponse", 89 | ) 90 | 91 | 92 | def create_response_chain( 93 | response_generator: Runnable, 94 | retriever: BaseRetriever, 95 | format_docs: Optional[Callable[[Sequence[Document]], str]] = None, 96 | format_chat_history: Optional[Callable[[ChatRequest], str]] = None, 97 | ) -> Runnable: 98 | format_docs = format_docs or _format_docs 99 | format_chat_history = format_chat_history or serialize_history 100 | return ( 101 | RunnableAssign( 102 | { 103 | "chat_history": RunnableLambda(format_chat_history).with_config( 104 | run_name="SerializeHistory" 105 | ) 106 | } 107 | ) 108 | | RunnableAssign( 109 | { 110 | "context": ( 111 | itemgetter("question") | retriever | format_docs 112 | ).with_config(run_name="FormatDocs") 113 | } 114 | ) 115 | | response_generator 116 | ) 117 | 
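# --- Illustrative usage (editor's sketch, not part of the original module) ---
# A rough example of wiring the pieces above together. Passing in an arbitrary
# retriever and the specific chat model below are assumptions for illustration,
# not choices made by this module.
def _example_usage(retriever: BaseRetriever) -> str:
    from langchain_openai import ChatOpenAI  # assumed available in the benchmark environment

    response_generator = get_default_response_generator(
        llm=ChatOpenAI(model="gpt-3.5-turbo-16k")
    )
    chain = create_response_chain(response_generator, retriever)
    # The chain expects a question plus (possibly empty) chat history and
    # returns the generated answer as a string.
    return chain.invoke({"question": "What is a retriever?", "chat_history": []})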
-------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/indexing/.gitignore: -------------------------------------------------------------------------------- 1 | db/ 2 | db_docs/ 3 | .sql 4 | .bin 5 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.langchain_docs.indexing.retriever_registry import ( 2 | RETRIEVER_FACTORIES, 3 | ) 4 | 5 | __all__ = [ 6 | "RETRIEVER_FACTORIES", 7 | ] 8 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/indexing/retriever_registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Callable, Iterable, Optional 4 | 5 | from langchain.schema.document import Document 6 | from langchain.schema.embeddings import Embeddings 7 | from langchain.schema.retriever import BaseRetriever 8 | from langchain.vectorstores.chroma import Chroma 9 | 10 | from langchain_benchmarks.rag.utils._downloading import ( 11 | fetch_remote_file, 12 | ) 13 | from langchain_benchmarks.rag.utils.indexing import ( 14 | get_hyde_retriever, 15 | get_parent_document_retriever, 16 | get_vectorstore_retriever, 17 | ) 18 | 19 | logger = logging.getLogger(__name__) 20 | _DIRECTORY = os.path.dirname(os.path.abspath(__file__)) 21 | # Stores the scraped documents from the langchain docs website, week of 2023-11-12 22 | REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/docs.parquet" 23 | DOCS_FILE = os.path.join(_DIRECTORY, "db_docs/docs.parquet") 24 | 25 | _DEFAULT_SEARCH_KWARGS = {"k": 6} 26 | 27 | 28 | def load_docs_from_parquet(filename: Optional[str] = None) -> Iterable[Document]: 29 | try: 30 | import pandas as pd 31 | except ImportError: 32 | raise ImportError( 33 | "Please install pandas to use the langchain docs benchmarking task.\n" 34 | "pip install pandas" 35 | ) 36 | if filename is None: 37 | filename = DOCS_FILE 38 | if not os.path.exists(filename): 39 | fetch_remote_file(REMOTE_DOCS_FILE, filename) 40 | df = pd.read_parquet(filename) 41 | docs_transformed = [Document(**row) for row in df.to_dict(orient="records")] 42 | for doc in docs_transformed: 43 | for k, v in doc.metadata.items(): 44 | if v is None: 45 | doc.metadata[k] = "" 46 | if not doc.page_content.strip(): 47 | continue 48 | yield doc 49 | 50 | 51 | def _chroma_retriever_factory( 52 | embedding: Embeddings, 53 | *, 54 | docs: Optional[Iterable[Document]] = None, 55 | search_kwargs: Optional[dict] = None, 56 | transform_docs: Optional[Callable] = None, 57 | transformation_name: Optional[str] = None, 58 | ) -> BaseRetriever: 59 | docs = docs or load_docs_from_parquet() 60 | embedding_name = embedding.__class__.__name__ 61 | vectorstore = Chroma( 62 | collection_name=f"lcbm-b-{embedding_name}-{transformation_name}", 63 | embedding_function=embedding, 64 | persist_directory="./chromadb", 65 | ) 66 | return get_vectorstore_retriever( 67 | docs, 68 | embedding, 69 | vectorstore, 70 | collection_name="langchain-docs", 71 | transform_docs=transform_docs, 72 | transformation_name=transformation_name, 73 | search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS, 74 | ) 75 | 76 | 77 | def _chroma_parent_document_retriever_factory( 78 | 
embedding: Embeddings, 79 | *, 80 | docs: Optional[Iterable[Document]] = None, 81 | search_kwargs: Optional[dict] = None, 82 | transformation_name: Optional[str] = None, 83 | ) -> BaseRetriever: 84 | docs = docs or load_docs_from_parquet() 85 | embedding_name = embedding.__class__.__name__ 86 | vectorstore = Chroma( 87 | collection_name=f"lcbm-b-{embedding_name}-{transformation_name}", 88 | embedding_function=embedding, 89 | persist_directory="./chromadb", 90 | ) 91 | return get_parent_document_retriever( 92 | docs, 93 | embedding, 94 | vectorstore, 95 | collection_name="langchain-docs", 96 | search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS, 97 | transformation_name=transformation_name, 98 | ) 99 | 100 | 101 | def _chroma_hyde_retriever_factory( 102 | embedding: Embeddings, 103 | *, 104 | docs: Optional[Iterable[Document]] = None, 105 | search_kwargs: Optional[dict] = None, 106 | transformation_name: Optional[str] = None, 107 | ) -> BaseRetriever: 108 | docs = docs or load_docs_from_parquet() 109 | embedding_name = embedding.__class__.__name__ 110 | vectorstore = Chroma( 111 | collection_name=f"lcbm-hd-{embedding_name}-{transformation_name}", 112 | embedding_function=embedding, 113 | persist_directory="./chromadb", 114 | ) 115 | return get_hyde_retriever( 116 | docs, 117 | embedding, 118 | vectorstore, 119 | collection_name="langchain-docs", 120 | search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS, 121 | transformation_name=transformation_name, 122 | ) 123 | 124 | 125 | RETRIEVER_FACTORIES = { 126 | "basic": _chroma_retriever_factory, 127 | "parent-doc": _chroma_parent_document_retriever_factory, 128 | "hyde": _chroma_hyde_retriever_factory, 129 | } 130 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/task.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | from langchain.schema.document import Document 4 | 5 | from langchain_benchmarks.rag.tasks.langchain_docs import architectures, indexing 6 | from langchain_benchmarks.rag.tasks.langchain_docs.indexing.retriever_registry import ( 7 | DOCS_FILE, 8 | load_docs_from_parquet, 9 | ) 10 | from langchain_benchmarks.schema import RetrievalTask 11 | 12 | # URL of public LangChain Docs dataset 13 | DATASET_ID = "https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d" 14 | 15 | 16 | def load_cached_docs() -> Iterable[Document]: 17 | """Load the docs from the cached file.""" 18 | return load_docs_from_parquet(DOCS_FILE) 19 | 20 | 21 | LANGCHAIN_DOCS_TASK = RetrievalTask( 22 | name="LangChain Docs Q&A", 23 | dataset_id=DATASET_ID, 24 | retriever_factories=indexing.RETRIEVER_FACTORIES, 25 | architecture_factories=architectures.ARCH_FACTORIES, 26 | get_docs=load_cached_docs, 27 | description=( 28 | """\ 29 | Questions and answers based on a snapshot of the LangChain python docs. 30 | 31 | The environment provides the documents and the retriever information. 32 | 33 | Each example is composed of a question and reference answer. 34 | 35 | Success is measured based on the accuracy of the answer relative to the reference answer. 36 | We also measure the faithfulness of the model's response relative to the retrieved documents (if any). 
37 | """ # noqa: E501 38 | ), 39 | ) 40 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import ( 2 | get_file_names, 3 | ) 4 | 5 | __all__ = ["get_file_names"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import ( 2 | get_file_names, 3 | ) 4 | 5 | __all__ = ["get_file_names"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/indexing/retriever_registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import zipfile 4 | from pathlib import Path 5 | from typing import Iterable, Optional 6 | 7 | from langchain_benchmarks.rag.utils._downloading import ( 8 | fetch_remote_file, 9 | is_folder_populated, 10 | ) 11 | 12 | logger = logging.getLogger(__name__) 13 | _DIRECTORY = Path(os.path.abspath(__file__)).parent 14 | # Stores the zipped pdfs for this dataset 15 | REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/multi_modal_slide_decks.zip" 16 | DOCS_DIR = _DIRECTORY / "pdfs" 17 | 18 | 19 | def fetch_raw_docs( 20 | filename: Optional[str] = None, docs_dir: Optional[str] = None 21 | ) -> None: 22 | filename = filename or _DIRECTORY / Path(REMOTE_DOCS_FILE).name 23 | docs_dir = docs_dir or DOCS_DIR 24 | if not is_folder_populated(docs_dir): 25 | fetch_remote_file(REMOTE_DOCS_FILE, filename) 26 | with zipfile.ZipFile(filename, "r") as zip_ref: 27 | zip_ref.extractall(docs_dir) 28 | 29 | os.remove(filename) 30 | 31 | 32 | def get_file_names() -> Iterable[Path]: 33 | fetch_raw_docs() 34 | # Traverse the directory and partition the pdfs 35 | for path in DOCS_DIR.rglob("*.pdf"): 36 | # Ignore __MACOSX 37 | if "__MACOSX" in str(path): 38 | continue 39 | yield path 40 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/task.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.schema import RetrievalTask 2 | 3 | # ID of public Multi Modal Slide Decks dataset 4 | DATASET_ID = "https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d" 5 | 6 | MULTI_MODAL_SLIDE_DECKS_TASK = RetrievalTask( 7 | name="Multi-modal slide decks", 8 | dataset_id=DATASET_ID, 9 | retriever_factories={}, 10 | architecture_factories={}, 11 | get_docs={}, 12 | description=( 13 | """\ 14 | This public dataset is a work-in-progress and will be extended over time. 15 | 16 | Questions and answers based on slide decks containing visual tables and charts. 17 | 18 | Each example is composed of a question and reference answer. 19 | 20 | Success is measured based on the accuracy of the answer relative to the reference answer. 
21 | """ # noqa: E501 22 | ), 23 | ) 24 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import ( 2 | get_file_names, 3 | ) 4 | from langchain_benchmarks.rag.tasks.semi_structured_reports.task import ( 5 | SEMI_STRUCTURED_REPORTS_TASK, 6 | ) 7 | 8 | # Please keep this sorted 9 | __all__ = ["get_file_names", "SEMI_STRUCTURED_REPORTS_TASK"] 10 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/indexing/.gitignore: -------------------------------------------------------------------------------- 1 | pdfs/ 2 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import ( 2 | RETRIEVER_FACTORIES, 3 | load_docs, 4 | ) 5 | 6 | __all__ = ["RETRIEVER_FACTORIES", "load_docs"] 7 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/task.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.semi_structured_reports import ( 2 | indexing, 3 | ) 4 | from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import ( 5 | load_docs, 6 | ) 7 | from langchain_benchmarks.schema import RetrievalTask 8 | 9 | # ID of public Semi-structured Earnings dataset 10 | DATASET_ID = "https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d" 11 | 12 | SEMI_STRUCTURED_REPORTS_TASK = RetrievalTask( 13 | name="Semi-structured Reports", 14 | dataset_id=DATASET_ID, 15 | retriever_factories=indexing.RETRIEVER_FACTORIES, 16 | architecture_factories={}, 17 | get_docs=load_docs, 18 | description=( 19 | """\ 20 | Questions and answers based on PDFs containing tables and charts. 21 | 22 | The task provides the raw documents as well as factory methods to easily index them 23 | and create a retriever. 24 | 25 | Each example is composed of a question and reference answer. 26 | 27 | Success is measured based on the accuracy of the answer relative to the reference answer. 28 | We also measure the faithfulness of the model's response relative to the retrieved documents (if any). 
29 | """ # noqa: E501 30 | ), 31 | ) 32 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/langchain_benchmarks/rag/utils/__init__.py -------------------------------------------------------------------------------- /langchain_benchmarks/rag/utils/_downloading.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | 6 | def is_folder_populated(folder: str): 7 | if os.path.exists(folder): 8 | return any(os.scandir(folder)) 9 | return False 10 | 11 | 12 | def fetch_remote_file(remote: str, local: str): 13 | if not os.path.exists(local): 14 | print(f"File {local} does not exist. Downloading from GCS...") 15 | if not os.path.exists(os.path.dirname(local)): 16 | os.makedirs(os.path.dirname(local)) 17 | r = requests.get(remote, allow_redirects=True) 18 | with open(local, "wb") as f: 19 | f.write(r.content) 20 | print(f"File {remote} downloaded.") 21 | -------------------------------------------------------------------------------- /langchain_benchmarks/rate_limiting.py: -------------------------------------------------------------------------------- 1 | """Implementation of a rate limiter based on a token bucket.""" 2 | import threading 3 | import time 4 | from typing import Any, Optional 5 | 6 | from langchain.schema.runnable import Runnable, RunnableLambda 7 | from langchain.schema.runnable.utils import Input, Output 8 | 9 | 10 | class RateLimiter: 11 | def __init__( 12 | self, 13 | *, 14 | requests_per_second: float = 1, 15 | check_every_n_seconds: float = 0.1, 16 | max_bucket_size: float = 1, 17 | ) -> None: 18 | """A rate limiter based on a token bucket. 19 | 20 | These *tokens* have NOTHING to do with LLM tokens. They are just 21 | a way to keep track of how many requests can be made at a given time. 22 | 23 | This rate limiter is designed to work in a threaded environment. 24 | 25 | It works by filling up a bucket with tokens at a given rate. Each 26 | request consumes a given number of tokens. If there are not enough 27 | tokens in the bucket, the request is blocked until there are enough 28 | tokens. 29 | 30 | Args: 31 | requests_per_second: The number of tokens to add per second to the bucket. 32 | Must be at least 1. The tokens represent "credit" that can be used 33 | to make requests. 34 | check_every_n_seconds: check whether the tokens are available 35 | every this many seconds. Can be a float to represent 36 | fractions of a second. 37 | max_bucket_size: The maximum number of tokens that can be in the bucket. 38 | This is used to prevent bursts of requests. 39 | """ 40 | # Number of requests that we can make per second. 41 | self.requests_per_second = requests_per_second 42 | # Number of tokens in the bucket. 43 | self.available_tokens = 0.0 44 | self.max_bucket_size = max_bucket_size 45 | # A lock to ensure that tokens can only be consumed by one thread 46 | # at a given time. 47 | self._consume_lock = threading.Lock() 48 | # The last time we tried to consume tokens. 49 | self.last: Optional[time.time] = None 50 | self.check_every_n_seconds = check_every_n_seconds 51 | 52 | def _consume(self) -> bool: 53 | """Consume the given amount of tokens if possible. 
54 | 55 | Returns: 56 | True means that the tokens were consumed, and the caller can proceed to 57 | make the request. A False means that the tokens were not consumed, and 58 | the caller should try again later. 59 | """ 60 | with self._consume_lock: 61 | now = time.time() 62 | 63 | # initialize on first call to avoid a burst 64 | if self.last is None: 65 | self.last = now 66 | 67 | elapsed = now - self.last 68 | 69 | if elapsed * self.requests_per_second >= 1: 70 | self.available_tokens += elapsed * self.requests_per_second 71 | self.last = now 72 | 73 | # Make sure that we don't exceed the bucket size. 74 | # This is used to prevent bursts of requests. 75 | self.available_tokens = min(self.available_tokens, self.max_bucket_size) 76 | 77 | # As long as we have at least one token, we can proceed. 78 | if self.available_tokens >= 1: 79 | self.available_tokens -= 1 80 | return True 81 | 82 | return False 83 | 84 | def wait(self) -> None: 85 | """Blocking call to wait until the given number of tokens are available.""" 86 | while not self._consume(): 87 | time.sleep(self.check_every_n_seconds) 88 | 89 | 90 | def with_rate_limit( 91 | runnable: Runnable[Input, Output], 92 | rate_limiter: RateLimiter, 93 | ) -> Runnable[Input, Output]: 94 | """Add a rate limiter to the runnable. 95 | 96 | Args: 97 | runnable: The runnable to throttle. 98 | rate_limiter: The throttle to use. 99 | 100 | Returns: 101 | A runnable lambda that acts as a throttled passthrough. 102 | """ 103 | 104 | def _wait(input: dict, **kwargs: Any) -> dict: 105 | """Wait for the rate limiter to allow the request to proceed.""" 106 | rate_limiter.wait() 107 | return input 108 | 109 | return RunnableLambda(_wait).with_config({"name": "Wait"}) | runnable 110 | -------------------------------------------------------------------------------- /langchain_benchmarks/registration.py: -------------------------------------------------------------------------------- 1 | """Registry of environments for ease of access.""" 2 | 3 | from langchain_benchmarks.extraction.tasks import ( 4 | chat_extraction, 5 | email_task, 6 | high_cardinality, 7 | ) 8 | from langchain_benchmarks.rag.tasks import ( 9 | LANGCHAIN_DOCS_TASK, 10 | MULTI_MODAL_SLIDE_DECKS_TASK, 11 | SEMI_STRUCTURED_REPORTS_TASK, 12 | ) 13 | from langchain_benchmarks.schema import Registry 14 | from langchain_benchmarks.tool_usage.tasks import ( 15 | multiverse_math, 16 | relational_data, 17 | type_writer, 18 | type_writer_26_funcs, 19 | ) 20 | 21 | # Using lower case naming to make a bit prettier API when used in a notebook 22 | registry = Registry( 23 | tasks=[ 24 | type_writer.TYPE_WRITER_TASK, 25 | type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK, 26 | relational_data.RELATIONAL_DATA_TASK, 27 | multiverse_math.MULTIVERSE_MATH, 28 | email_task.EMAIL_EXTRACTION_TASK, 29 | chat_extraction.CHAT_EXTRACTION_TASK, 30 | LANGCHAIN_DOCS_TASK, 31 | SEMI_STRUCTURED_REPORTS_TASK, 32 | MULTI_MODAL_SLIDE_DECKS_TASK, 33 | high_cardinality.NAME_CORRECTION_TASK, 34 | ] 35 | ) 36 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/README.md: -------------------------------------------------------------------------------- 1 | # Tool usage 2 | 3 | This sub-package includes code to help test how well tools can be used to make 4 | decisions. 
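
A sketch of how the pieces in this sub-package fit together is shown below. It is illustrative only: the model name, the registry lookup by task name, and the example input are assumptions made for the sketch, and any chat model that supports the standard LangChain tool-calling API can be substituted.

```python
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

from langchain_benchmarks import registry
from langchain_benchmarks.tool_usage import StandardAgentFactory

# Pick a registered tool usage task (name taken from registration.py).
task = registry["Tool Usage - Typewriter (1 tool)"]

# The prompt must expose the task instructions, the user input,
# and an agent_scratchpad placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{instructions}"),
        ("human", "{input}"),
        MessagesPlaceholder("agent_scratchpad"),
    ]
)

# Any tool-calling chat model works here; gpt-4o-mini is only an example choice.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

factory = StandardAgentFactory(task, model, prompt)

# Each call to the factory creates a fresh environment and AgentExecutor.
agent = factory()
result = agent.invoke({"input": "abc"})
print(result["output"], result["state"])  # "state" holds what was typed on the paper
```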
-------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/__init__.py: -------------------------------------------------------------------------------- 1 | """Package for helping to evaluate agent runs.""" 2 | from langchain_benchmarks.tool_usage.agents import ( 3 | CustomRunnableAgentFactory, 4 | StandardAgentFactory, 5 | apply_agent_executor_adapter, 6 | ) 7 | from langchain_benchmarks.tool_usage.evaluators import get_eval_config 8 | 9 | # Please keep this list sorted! 10 | __all__ = [ 11 | "apply_agent_executor_adapter", 12 | "CustomRunnableAgentFactory", 13 | "get_eval_config", 14 | "StandardAgentFactory", 15 | ] 16 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter 2 | from langchain_benchmarks.tool_usage.agents.runnable_agent import ( 3 | CustomRunnableAgentFactory, 4 | ) 5 | from langchain_benchmarks.tool_usage.agents.tool_using_agent import StandardAgentFactory 6 | 7 | __all__ = [ 8 | "apply_agent_executor_adapter", 9 | "CustomRunnableAgentFactory", 10 | "StandardAgentFactory", 11 | ] 12 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/adapters.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Optional 2 | 3 | from langchain.agents import AgentExecutor 4 | from langchain_core.runnables import Runnable, RunnableLambda, RunnablePassthrough 5 | 6 | 7 | def _ensure_output_exists(inputs: dict) -> dict: 8 | """Make sure that the output key is always present.""" 9 | if "output" not in inputs: 10 | return {"output": "", **inputs} 11 | return inputs 12 | 13 | 14 | def apply_agent_executor_adapter( 15 | agent_executor: AgentExecutor, 16 | *, 17 | state_reader: Optional[Callable[[], Any]] = None, 18 | ) -> Runnable: 19 | """An adapter for the agent executor to standardize its input and output. 20 | 21 | 1) Map `question` to `input` (`question` is used in the datasets, 22 | but `input` is used in the agent executor) 23 | 2) Ensure that `output` is always returned (will be set to "" if missing) -- 24 | note that this may be relaxed after more updates in the eval config. 25 | 3) Populate `state` key in the response of the agent with the system state 26 | if a state reader is provided. 27 | 28 | Args: 29 | agent_executor: the agent executor 30 | state_reader: A callable without parameters that if invoked will return 31 | the state of the environment. Used to populate the 'state' key. 32 | 33 | Returns: 34 | a new runnable with a standardized output. 
35 | """ 36 | 37 | def _read_state(*args: Any, **kwargs: Any) -> Any: 38 | """Read the state of the environment.""" 39 | if state_reader is not None: 40 | return state_reader() 41 | else: 42 | return None 43 | 44 | runnable = agent_executor | RunnableLambda(_ensure_output_exists).with_config( 45 | {"run_name": "Ensure Output"} 46 | ) 47 | 48 | if state_reader is not None: 49 | runnable = runnable | RunnablePassthrough.assign(state=_read_state).with_config( 50 | {"run_name": "Read Env State"} 51 | ) 52 | return runnable 53 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from langchain_core.runnables import Runnable 4 | 5 | 6 | class AgentFactory(abc.ABC): 7 | """Abstract class for agent factory""" 8 | 9 | @abc.abstractmethod 10 | def __call__(self) -> Runnable: 11 | """Create a new agent""" 12 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/runnable_agent.py: -------------------------------------------------------------------------------- 1 | """Factory for creating agents for the tool usage task.""" 2 | from typing import Union 3 | 4 | from langchain.agents.agent import ( 5 | AgentExecutor, 6 | BaseMultiActionAgent, 7 | BaseSingleActionAgent, 8 | ) 9 | from langchain_core.runnables import Runnable 10 | 11 | from langchain_benchmarks.schema import ToolUsageTask 12 | from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter 13 | from langchain_benchmarks.tool_usage.agents.base import AgentFactory 14 | 15 | 16 | class CustomRunnableAgentFactory(AgentFactory): 17 | """A factory for creating tool using agents. 18 | 19 | A factory for agents that do not leverage any special JSON mode for 20 | function usage; instead all function invocation behavior is implemented solely 21 | through prompt engineering and parsing. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | task: ToolUsageTask, 27 | agent: Union[Runnable, BaseSingleActionAgent, BaseMultiActionAgent], 28 | ) -> None: 29 | """Create an agent factory for the given tool usage task. 30 | 31 | Note: The agent should not be stateful, as it will be reused across 32 | multiple runs. 33 | 34 | Args: 35 | task: The task to create an agent factory for 36 | agent: The agent to use 37 | """ 38 | self.task = task 39 | self.agent = agent 40 | 41 | def __call__(self) -> Runnable: 42 | env = self.task.create_environment() 43 | executor = AgentExecutor( 44 | agent=self.agent, 45 | tools=env.tools, 46 | handle_parsing_errors=True, 47 | return_intermediate_steps=True, 48 | ) 49 | 50 | return apply_agent_executor_adapter( 51 | executor, state_reader=env.read_state 52 | ).with_config({"run_name": "Agent", "metadata": {"task": self.task.name}}) 53 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/tool_using_agent.py: -------------------------------------------------------------------------------- 1 | """Factory for creating agents. 2 | 3 | This is useful for agents that follow the standard LangChain tool format. 
4 | """ 5 | from typing import Optional 6 | 7 | from langchain.agents import AgentExecutor, create_tool_calling_agent 8 | from langchain_core.language_models import BaseChatModel 9 | from langchain_core.prompts import ChatPromptTemplate 10 | from langchain_core.runnables import Runnable 11 | 12 | from langchain_benchmarks.rate_limiting import RateLimiter, with_rate_limit 13 | from langchain_benchmarks.schema import ToolUsageTask 14 | from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter 15 | from langchain_benchmarks.tool_usage.agents.base import AgentFactory 16 | 17 | 18 | class StandardAgentFactory(AgentFactory): 19 | """A standard agent factory. 20 | 21 | Use this factory with chat models that support the standard LangChain tool 22 | calling API where the chat model populates the tool_calls attribute on AIMessage. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | task: ToolUsageTask, 28 | model: BaseChatModel, 29 | prompt: ChatPromptTemplate, 30 | *, 31 | rate_limiter: Optional[RateLimiter] = None, 32 | ) -> None: 33 | """Create an agent factory for the given tool usage task. 34 | 35 | Args: 36 | task: The task to create an agent factory for 37 | model: chat model to use, must support tool usage 38 | prompt: This is a chat prompt at the moment. 39 | Must include an agent_scratchpad 40 | 41 | For example, 42 | 43 | ChatPromptTemplate.from_messages( 44 | [ 45 | ("system", "{instructions}"), 46 | ("human", "{input}"), 47 | MessagesPlaceholder("agent_scratchpad"), 48 | ] 49 | ) 50 | rate_limiter: will be appended to the agent runnable 51 | """ 52 | self.task = task 53 | self.model = model 54 | self.prompt = prompt 55 | self.rate_limiter = rate_limiter 56 | 57 | def __call__(self) -> Runnable: 58 | """Call the factory to create Runnable agent.""" 59 | 60 | env = self.task.create_environment() 61 | 62 | if "instructions" in self.prompt.input_variables: 63 | finalized_prompt = self.prompt.partial(instructions=self.task.instructions) 64 | else: 65 | finalized_prompt = self.prompt 66 | 67 | agent = create_tool_calling_agent(self.model, env.tools, finalized_prompt) 68 | 69 | if self.rate_limiter: 70 | agent = with_rate_limit(agent, self.rate_limiter) 71 | 72 | executor = AgentExecutor( 73 | agent=agent, 74 | tools=env.tools, 75 | handle_parsing_errors=True, 76 | return_intermediate_steps=True, 77 | ) 78 | 79 | return apply_agent_executor_adapter( 80 | executor, state_reader=env.read_state 81 | ).with_config({"run_name": "Agent", "metadata": {"task": self.task.name}}) 82 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/prompts.py: -------------------------------------------------------------------------------- 1 | from langchain_core.prompts import PromptTemplate 2 | 3 | MATH_TEMPLATE = """\ 4 | You live in an alternate universe. Do not assume that you know anything. 5 | You are a teacher grading a quiz. 6 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT. 7 | 8 | Example Format: 9 | QUESTION: question here 10 | STUDENT ANSWER: student's answer here 11 | TRUE ANSWER: true answer here 12 | GRADE: CORRECT or INCORRECT here 13 | 14 | Given that you live in an alternate universe the TRUE answer may be different from what you expect. That's OK! 15 | 16 | Grade the student answers based ONLY on whether it matches the TRUE answer. 
Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 17 | 18 | QUESTION: {query} 19 | STUDENT ANSWER: {result} 20 | TRUE ANSWER: {answer} 21 | GRADE:""" 22 | QA_TEMPLATE_FOR_MULTIVERSE_MATH = PromptTemplate( 23 | input_variables=["result", "answer"], template=MATH_TEMPLATE 24 | ) 25 | 26 | MATH_TEMPLATE_NO_QUESTION = """\ 27 | Compare the INPUT_A and INPUT_B and determine whether the numeric result in them is the same. 28 | 29 | If the result is the same, reply with CORRECT. If the result is different, reply with INCORRECT. 30 | 31 | Example Format: 32 | INPUT_A: input_a here 33 | INPUT_B: input_b here 34 | COMPARISON: CORRECT or INCORRECT here 35 | 36 | Ignore differences in punctuation and phrasing between the student answer and true answer, please only compare the first 4 decimal digits. 37 | 38 | For instance if INPUT_A = 123.6751345 and INPUT_B = 123.6751456 you should return CORRECT, since the first 4 decimal points match. 39 | 40 | Begin! 41 | 42 | INPUT_A: {answer} 43 | INPUT_B: {result} 44 | COMPARISON:""" 45 | 46 | # Version without the query 47 | QA_TEMPLATE_FOR_MULTIVERSE_MATH_WITHOUT_QUESTION = PromptTemplate( 48 | input_variables=["result", "answer"], template=MATH_TEMPLATE_NO_QUESTION 49 | ) 50 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/langchain_benchmarks/tool_usage/tasks/__init__.py -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/tasks/type_writer.py: -------------------------------------------------------------------------------- 1 | """A task where the agent must type a given string one letter at a time. 2 | 3 | In this variation of the task, the agent is given a single function, 4 | that takes a letter as an argument. 5 | """ 6 | import dataclasses 7 | from typing import Any, Callable, List, cast 8 | 9 | from langchain.tools import BaseTool, tool 10 | 11 | from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask 12 | 13 | 14 | @dataclasses.dataclass 15 | class Paper: 16 | """A piece of paper that the agent can write on.""" 17 | 18 | content: str 19 | 20 | 21 | def create_typer(paper: Paper) -> Callable[[str], str]: 22 | """Create a function that types the given letter.""" 23 | 24 | def type_letter(letter: str) -> str: 25 | """Print the given letter on the paper.""" 26 | if len(letter) != 1: 27 | return "ERROR: The letter must be a single character." 28 | paper.content += letter 29 | return "OK" 30 | 31 | return type_letter 32 | 33 | 34 | # PUBLIC API 35 | 36 | 37 | def get_environment() -> ToolUsageEnvironment: 38 | """Create tools and state reader. 39 | 40 | Attention: this is a factory function, so it will create a new environment 41 | every time it is called. The paper contains state. 42 | 43 | Returns: 44 | A tuple of (tools, state_reader). 
45 | """ 46 | paper = Paper(content="") # Start with an empty piece of paper 47 | 48 | def _read_state() -> Any: 49 | """Read the state of the environment.""" 50 | return paper.content 51 | 52 | tools = cast(List[BaseTool], [tool(create_typer(paper))]) 53 | 54 | return ToolUsageEnvironment( 55 | tools=tools, 56 | read_state=_read_state, 57 | ) 58 | 59 | 60 | TYPE_WRITER_TASK = ToolUsageTask( 61 | name="Tool Usage - Typewriter (1 tool)", 62 | dataset_id="https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d", 63 | create_environment=get_environment, 64 | instructions=( 65 | "Repeat the given string using the provided tools. " 66 | "Do not write anything else or provide any explanations. " 67 | "For example, if the string is 'abc', you must print the letters " 68 | "'a', 'b', and 'c' one at a time and in that order. " 69 | ), 70 | description=( 71 | """\ 72 | Environment with a single tool that accepts a single letter as input, and \ 73 | prints it on a piece of virtual paper. 74 | 75 | The objective of this task is to evaluate the ability of the model to use the provided \ 76 | tools to repeat a given input string. 77 | 78 | For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \ 79 | in that order. 80 | 81 | The dataset includes examples of varying difficulty. The difficulty is measured \ 82 | by the length of the string. 83 | """ 84 | ), 85 | eval_params={ 86 | # For this task, the agent's output is irrelevant 87 | # what we care about is the final state of the environment 88 | # (i.e., what's written on the virtual paper) 89 | "output_evaluation": "none", 90 | }, 91 | ) 92 | 93 | 94 | STRINGS_TO_TYPE = [ 95 | # letter repetition 96 | "a", 97 | "aa", 98 | "aaa", 99 | "aaaa", 100 | # 3-letter words 101 | "dog", 102 | "cat", 103 | # 4-letter words 104 | "hand", 105 | "head", 106 | # 5-letter words 107 | "house", 108 | "horse", 109 | # 6-letter words 110 | "school", 111 | "church", 112 | # 7-letter words 113 | "teacher", 114 | "student", 115 | # 8-letter words 116 | "computer", 117 | "keyboard", 118 | # 9-letter words 119 | "university", 120 | "dictionary", 121 | # 10-letter words 122 | "information", 123 | "communication", 124 | ] 125 | 126 | 127 | def _create_dataset(strings: List[str]) -> List[dict]: 128 | """Create the dataset.""" 129 | dataset = [] 130 | for string in strings: 131 | dataset.append( 132 | { 133 | "question": string, 134 | "expected_steps": ["type_letter"] * len(string), 135 | "state": string, 136 | } 137 | ) 138 | return dataset 139 | 140 | 141 | DATASET = _create_dataset(STRINGS_TO_TYPE) 142 | 143 | 144 | def _create_dataset() -> None: 145 | """Create a dataset with the langsmith client.""" 146 | from langsmith.client import Client 147 | 148 | client = Client() 149 | dataset = client.create_dataset( 150 | dataset_name=TYPE_WRITER_TASK.name, 151 | description=TYPE_WRITER_TASK.description, 152 | ) 153 | 154 | for example in DATASET: 155 | client.create_example( 156 | inputs={ 157 | "question": example["question"], 158 | }, 159 | outputs={ 160 | "reference": example["state"], 161 | "expected_steps": example["expected_steps"], 162 | "state": example["state"], 163 | }, 164 | dataset_id=dataset.id, 165 | ) 166 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/tasks/type_writer_26_funcs.py: -------------------------------------------------------------------------------- 1 | """A task where the agent must type a given string one letter at a time. 
2 | 3 | In this variation of the task, the agent is given access to 26 parameterless functions, 4 | each representing a letter of the alphabet. 5 | """ 6 | import dataclasses 7 | from typing import Any, Callable, List, cast 8 | 9 | from langchain.tools import BaseTool, tool 10 | 11 | from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask 12 | 13 | 14 | @dataclasses.dataclass 15 | class Paper: 16 | """A piece of paper that the agent can write on.""" 17 | 18 | content: str 19 | 20 | 21 | def _create_typing_func(letter: str, paper: Paper) -> Callable[[], str]: 22 | """Create a function that types the given letter.""" 23 | 24 | def func() -> str: 25 | paper.content += letter 26 | return "OK" 27 | 28 | func.__doc__ = f'Run to Type the letter "{letter}".' 29 | func.__name__ = letter 30 | return func 31 | 32 | 33 | def _get_available_functions(paper: Paper) -> List[Callable]: 34 | """Get all the available functions.""" 35 | return [ 36 | _create_typing_func(letter, paper) for letter in "abcdefghijklmnopqrstuvwxyz" 37 | ] 38 | 39 | 40 | # PUBLIC API 41 | 42 | 43 | def get_environment() -> ToolUsageEnvironment: 44 | """Create tools and state reader. 45 | 46 | Attention: this is a factory function, so it will create a new environment 47 | every time it is called. The paper contains state. 48 | 49 | Returns: 50 | A ToolUsageEnvironment with the tools and a state reader. 51 | """ 52 | paper = Paper(content="") # Start with an empty piece of paper 53 | functions = _get_available_functions(paper) 54 | 55 | def _read_state() -> Any: 56 | """Read the state of the environment.""" 57 | return paper.content 58 | 59 | tools = cast(List[BaseTool], [tool(f) for f in functions]) 60 | 61 | return ToolUsageEnvironment( 62 | tools=tools, 63 | read_state=_read_state, 64 | ) 65 | 66 | 67 | TYPE_WRITER_26_FUNCS_TASK = ToolUsageTask( 68 | name="Tool Usage - Typewriter (26 tools)", 69 | dataset_id="https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d", 70 | create_environment=get_environment, 71 | instructions=( 72 | "Repeat the given string by using the provided tools. " 73 | "Do not write anything else or provide any explanations. " 74 | "For example, if the string is 'abc', you must invoke the tools " 75 | "'a', 'b', and 'c' in that order. " 76 | "Please invoke the functions without any arguments." 77 | ), 78 | description=( 79 | """\ 80 | Environment with 26 tools; each tool represents a letter of the alphabet. 81 | 82 | The objective of this task is to evaluate the model's ability to use tools 83 | for a simple repetition task. 84 | 85 | For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \ 86 | in that order. 87 | 88 | The dataset includes examples of varying difficulty. The difficulty is measured \ 89 | by the length of the string. 90 | 91 | This is a variation of the typewriter task, where 26 parameterless tools are 92 | given instead of a single tool that takes a letter as an argument.
93 | """ 94 | ), 95 | eval_params={ 96 | # For this task, the agent's output is irrelevant 97 | # what we care about is the final state of the environment 98 | # (i.e., what's written on the virtual paper) 99 | "output_evaluation": "none", 100 | }, 101 | ) 102 | 103 | STRINGS_TO_TYPE = [ 104 | # letter repetition 105 | "a", 106 | "aa", 107 | "aaa", 108 | "aaaa", 109 | # 3-letter words 110 | "dog", 111 | "cat", 112 | # 4-letter words 113 | "hand", 114 | "head", 115 | # 5-letter words 116 | "house", 117 | "horse", 118 | # 6-letter words 119 | "school", 120 | "church", 121 | # 7-letter words 122 | "teacher", 123 | "student", 124 | # 8-letter words 125 | "computer", 126 | "keyboard", 127 | # 9-letter words 128 | "university", 129 | "dictionary", 130 | # 10-letter words 131 | "information", 132 | "communication", 133 | ] 134 | 135 | 136 | def _create_dataset(strings: List[str]) -> List[dict]: 137 | """Create the dataset.""" 138 | dataset = [] 139 | for string in strings: 140 | dataset.append( 141 | { 142 | "question": string, 143 | "expected_steps": [c for c in string], 144 | "state": string, 145 | } 146 | ) 147 | return dataset 148 | 149 | 150 | DATASET = _create_dataset(STRINGS_TO_TYPE) 151 | 152 | 153 | def _create_dataset() -> None: 154 | """Create a dataset with the langsmith client.""" 155 | from langsmith.client import Client 156 | 157 | client = Client() 158 | dataset = client.create_dataset( 159 | dataset_name=TYPE_WRITER_26_FUNCS_TASK.name, 160 | description=TYPE_WRITER_26_FUNCS_TASK.description, 161 | ) 162 | 163 | for example in DATASET: 164 | client.create_example( 165 | inputs={ 166 | "question": example["question"], 167 | }, 168 | outputs={ 169 | "reference": example["state"], 170 | "expected_steps": example["expected_steps"], 171 | "state": example["state"], 172 | }, 173 | dataset_id=dataset.id, 174 | ) 175 | -------------------------------------------------------------------------------- /langchain_benchmarks/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.utils._langsmith import run_without_langsmith 2 | 3 | __all__ = ["run_without_langsmith"] 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langchain-benchmarks" 3 | version = "0.0.15" 4 | description = "🦜💪 Flex those feathers!" 
5 | authors = ["LangChain AI"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.9" 11 | langchain = "^0.3" 12 | langchain-community = "^0.3" 13 | langchain-core= "^0.3.12" 14 | langsmith = ">=0.0.70" 15 | tqdm = "^4" 16 | ipywidgets = "^8" 17 | tabulate = ">=0.8.0" 18 | langchain-openai = "^0.2" 19 | 20 | [tool.poetry.group.dev] 21 | optional = true 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | jupyter = "^1.0.0" 25 | 26 | [tool.poetry.group.typing] 27 | optional = true 28 | 29 | [tool.poetry.group.typing.dependencies] 30 | mypy = "^1.7.0" 31 | [tool.poetry.group.lint] 32 | optional = true 33 | 34 | [tool.poetry.group.lint.dependencies] 35 | ruff = "^0.1.5" 36 | 37 | [tool.poetry.group.docs] 38 | optional = true 39 | 40 | [tool.poetry.group.docs.dependencies] 41 | nbsphinx = ">=0.8.9" 42 | sphinx = ">=5.2.0" 43 | sphinx-autobuild = "^2021.3.14" 44 | sphinx_book_theme = "^1.0.0" 45 | myst-nb = { version = "^1.0.0", python = "^3.9" } 46 | toml = "^0.10.2" 47 | sphinx-copybutton = ">=0.5.1" 48 | 49 | [tool.poetry.group.test] 50 | optional = true 51 | 52 | [tool.poetry.group.test.dependencies] 53 | pytest = "^7.2.1" 54 | pytest-cov = "^4.0.0" 55 | pytest-asyncio = "^0.21.1" 56 | pytest-mock = "^3.11.1" 57 | pytest-socket = "^0.6.0" 58 | pytest-watch = "^4.2.0" 59 | pytest-timeout = "^2.2.0" 60 | freezegun = "^1.3.1" 61 | langchain-anthropic = "^0.2" 62 | langchain-fireworks = "^0.2" 63 | langchain-mistralai = "^0.2" 64 | langchain-groq = "^0.2" 65 | langchain-core = "^0.3.12" 66 | faiss-cpu = ">=1.8.0" 67 | 68 | [tool.ruff] 69 | select = [ 70 | "E", # pycodestyle 71 | "F", # pyflakes 72 | "I", # isort 73 | ] 74 | extend-include = ["*.ipynb"] 75 | 76 | # Same as Black. 77 | line-length = 88 78 | 79 | [tool.ruff.isort] 80 | known-first-party = ["langchain-benchmarks"] 81 | 82 | [tool.mypy] 83 | disallow_untyped_defs = "True" 84 | ignore_missing_imports = "True" 85 | 86 | [tool.coverage.run] 87 | omit = [ 88 | "tests/*", 89 | ] 90 | 91 | 92 | [build-system] 93 | requires = ["poetry-core"] 94 | build-backend = "poetry.core.masonry.api" 95 | 96 | [tool.pytest.ini_options] 97 | # --strict-markers will raise errors on unknown marks. 98 | # https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks 99 | # 100 | # https://docs.pytest.org/en/7.1.x/reference/reference.html 101 | # --strict-config any warnings encountered while parsing the `pytest` 102 | # section of the configuration file raise errors. 103 | addopts = "--strict-markers --strict-config --durations=5 -vv" 104 | # Global timeout for all tests. 
There should be a good reason for a test to 105 | # take more than 5 seconds 106 | timeout = 5 107 | -------------------------------------------------------------------------------- /scripts/check_datasets.py: -------------------------------------------------------------------------------- 1 | """Script to check that all registered datasets can be downloaded.""" 2 | from langchain_benchmarks import registry 3 | from langchain_benchmarks.utils._langsmith import exists_public_dataset 4 | 5 | 6 | def check_datasets() -> bool: 7 | """Check that all tasks can be downloaded.""" 8 | ok = True 9 | for task in registry.tasks: 10 | print(f"Checking {task.name}...") 11 | if exists_public_dataset(task.dataset_id): 12 | print(" OK") 13 | else: 14 | ok = False 15 | print(" ERROR: Dataset not found") 16 | return ok 17 | 18 | 19 | if __name__ == "__main__": 20 | ok = check_datasets() 21 | if not ok: 22 | exit(1) 23 | -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting OSS Vulnerabilities 4 | 5 | LangChain is partnered with [huntr by Protect AI](https://huntr.com/) to provide 6 | a bounty program for our open source projects. 7 | 8 | Please report security vulnerabilities associated with the LangChain 9 | open source projects by visiting the following link: 10 | 11 | [https://huntr.com/bounties/disclose/](https://huntr.com/bounties/disclose/?target=https%3A%2F%2Fgithub.com%2Flangchain-ai%2Flangchain&validSearch=true) 12 | 13 | Before reporting a vulnerability, please review: 14 | 15 | 1) In-Scope Targets and Out-of-Scope Targets below. 16 | 2) The [langchain-ai/langchain](https://python.langchain.com/docs/contributing/repo_structure) monorepo structure. 17 | 3) LangChain [security guidelines](https://python.langchain.com/docs/security) to 18 | understand what we consider to be a security vulnerability vs. developer 19 | responsibility. 20 | 21 | ### In-Scope Targets 22 | 23 | The following packages and repositories are eligible for bug bounties: 24 | 25 | - langchain-core 26 | - langchain (see exceptions) 27 | - langchain-community (see exceptions) 28 | - langgraph 29 | - langserve 30 | 31 | ### Out of Scope Targets 32 | 33 | All out of scope targets defined by huntr as well as: 34 | 35 | - **langchain-experimental**: This repository is for experimental code and is not 36 | eligible for bug bounties; bug reports to it will be marked as interesting or a waste of 37 | time and published with no bounty attached. 38 | - **tools**: Tools in either langchain or langchain-community are not eligible for bug 39 | bounties. This includes the following directories: 40 | - langchain/tools 41 | - langchain-community/tools 42 | - Please review our [security guidelines](https://python.langchain.com/docs/security) 43 | for more details, but generally tools interact with the real world. Developers are 44 | expected to understand the security implications of their code and are responsible 45 | for the security of their tools. 46 | - Code documented with security notices. This will be decided on a case by 47 | case basis, but likely will not be eligible for a bounty as the code is already 48 | documented with guidelines for developers that should be followed for making their 49 | application secure. 50 | - Any LangSmith related repositories or APIs; see below.
51 | 52 | ## Reporting LangSmith Vulnerabilities 53 | 54 | Please report security vulnerabilities associated with LangSmith by email to `security@langchain.dev`. 55 | 56 | - LangSmith site: https://smith.langchain.com 57 | - SDK client: https://github.com/langchain-ai/langsmith-sdk 58 | 59 | ### Other Security Concerns 60 | 61 | For any other security concerns, please contact us at `security@langchain.dev`. 62 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/extraction/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/extraction/test_email_extraction.py: -------------------------------------------------------------------------------- 1 | def test_email_extraction() -> None: 2 | """Try to import the email task.""" 3 | -------------------------------------------------------------------------------- /tests/unit_tests/extraction/test_import_stuff.py: -------------------------------------------------------------------------------- 1 | def test_import_stuff() -> None: 2 | """Test that all imports work.""" 3 | from langchain_benchmarks.extraction import ( # noqa: F401 4 | evaluators, 5 | implementations, 6 | ) 7 | -------------------------------------------------------------------------------- /tests/unit_tests/rag/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/rag/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/rag/test_langchain_docs.py: -------------------------------------------------------------------------------- 1 | def test_import_rag() -> None: 2 | """Test that the rag tasks can be imported.""" 3 | from langchain_benchmarks.rag import evaluators, tasks # noqa: F401 4 | 5 | 6 | def test_import_langchain_docs() -> None: 7 | """Test that the langchain_docs tasks can be imported.""" 8 | from langchain_benchmarks.rag.tasks.langchain_docs import ( # noqa: F401 9 | DATASET_ID, 10 | LANGCHAIN_DOCS_TASK, 11 | architectures, 12 | indexing, 13 | ) 14 | -------------------------------------------------------------------------------- /tests/unit_tests/test_model_registry.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from langchain_benchmarks.schema import ModelRegistry, RegisteredModel 4 | 5 | # Create some sample RegisteredModel instances for testing 6 | SAMPLE_MODELS = [ 7 | RegisteredModel( 8 | "model1", "fireworks", "Description 1", 
{"param1": "value1"}, "chat" 9 | ), 10 | RegisteredModel("model2", "openai", "Description 2", {"param2": "value2"}, "llm"), 11 | ] 12 | 13 | 14 | @pytest.fixture 15 | def sample_registry() -> ModelRegistry: 16 | return ModelRegistry(SAMPLE_MODELS) 17 | 18 | 19 | def test_init() -> None: 20 | # Test the constructor of ModelRegistry 21 | registry = ModelRegistry(SAMPLE_MODELS) 22 | assert len(registry.registered_models) == 2 23 | 24 | 25 | def test_get_model(sample_registry: ModelRegistry) -> None: 26 | # Test the get_model method 27 | model = sample_registry.get_model("model1") 28 | assert model.name == "model1" 29 | 30 | 31 | def test_filter(sample_registry: ModelRegistry) -> None: 32 | # Test the filter method 33 | filtered_registry = sample_registry.filter(type="chat") 34 | assert len(filtered_registry.registered_models) == 1 35 | assert filtered_registry.registered_models[0].type == "chat" 36 | 37 | 38 | def test_repr_html(sample_registry: ModelRegistry) -> None: 39 | # Test the _repr_html_ method 40 | html_representation = sample_registry._repr_html_() 41 | assert "" in html_representation 42 | 43 | 44 | def test_len(sample_registry: ModelRegistry) -> None: 45 | # Test the __len__ method 46 | assert len(sample_registry) == 2 47 | 48 | 49 | def test_iter(sample_registry: ModelRegistry) -> None: 50 | # Test the __iter__ method 51 | models = list(iter(sample_registry)) 52 | assert len(models) == 2 53 | assert isinstance(models[0], RegisteredModel) 54 | 55 | 56 | def test_getitem(sample_registry: ModelRegistry) -> None: 57 | # Test the __getitem__ method for integer and string keys 58 | model = sample_registry[0] 59 | assert model.name == "model1" 60 | model = sample_registry["model2"] 61 | assert model.name == "model2" 62 | 63 | 64 | def test_getitem_slice(sample_registry: ModelRegistry) -> None: 65 | # Test the __getitem__ method for slices 66 | sliced_registry = sample_registry[:1] 67 | assert len(sliced_registry.registered_models) == 1 68 | assert sliced_registry.registered_models[0].name == "model1" 69 | -------------------------------------------------------------------------------- /tests/unit_tests/test_public_api.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks import __all__ 2 | 3 | 4 | def test_public_api() -> None: 5 | """Test that the public API is correct.""" 6 | # This test will also fail if __all__ is not sorted. 7 | # Please keep it sorted! 8 | assert __all__ == sorted( 9 | [ 10 | "__version__", 11 | "clone_public_dataset", 12 | "download_public_dataset", 13 | "model_registry", 14 | "RateLimiter", 15 | "registry", 16 | ], 17 | key=lambda x: x.lower(), 18 | ) 19 | -------------------------------------------------------------------------------- /tests/unit_tests/test_rate_limiting.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from freezegun import freeze_time 3 | 4 | from langchain_benchmarks.rate_limiting import RateLimiter 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "delta_time, requests_per_second, max_bucket_size, expected_result", 9 | [ 10 | ( 11 | 1, 12 | 1, 13 | 1, 14 | True, 15 | ), 16 | ( 17 | 0.5, 18 | 1, 19 | 1, 20 | False, 21 | ), 22 | ( 23 | 0.5, 24 | 2, 25 | 1, 26 | True, 27 | ), 28 | ], 29 | ) 30 | def test_consume( 31 | delta_time: float, 32 | requests_per_second: float, 33 | max_bucket_size: float, 34 | expected_result: bool, 35 | ) -> None: 36 | """Test the consumption of tokens over time. 
37 | 38 | Args: 39 | delta_time: The time in seconds to add to the initial time. 40 | requests_per_second: The rate at which tokens are added per second. 41 | max_bucket_size: The maximum size of the token bucket. 42 | expected_result: The expected result of the consume operation. 43 | """ 44 | rate_limiter = RateLimiter( 45 | requests_per_second=requests_per_second, max_bucket_size=max_bucket_size 46 | ) 47 | 48 | with freeze_time(auto_tick_seconds=delta_time): 49 | assert rate_limiter._consume() is False 50 | assert rate_limiter._consume() is expected_result 51 | 52 | 53 | def test_consume_count_tokens() -> None: 54 | """Test to check that the bucket size is used correctly.""" 55 | rate_limiter = RateLimiter( 56 | requests_per_second=60, 57 | max_bucket_size=10, 58 | ) 59 | 60 | with freeze_time(auto_tick_seconds=100): 61 | assert rate_limiter._consume() is False 62 | assert rate_limiter._consume() is True 63 | assert ( 64 | rate_limiter.available_tokens == 9 65 | ) # Max bucket size is 10, so 10 - 1 = 9 66 | -------------------------------------------------------------------------------- /tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import unittest.mock as mock 3 | import uuid 4 | from contextlib import contextmanager 5 | from typing import Any, Generator, List, Mapping, Optional, Sequence 6 | from uuid import UUID 7 | 8 | from langsmith.client import ID_TYPE 9 | from langsmith.schemas import Dataset, Example 10 | from langsmith.utils import LangSmithNotFoundError 11 | 12 | from langchain_benchmarks.utils._langsmith import clone_public_dataset 13 | 14 | 15 | # Define a mock Client class that overrides the required methods 16 | class MockLangSmithClient: 17 | def __init__(self) -> None: 18 | """Initialize the mock client.""" 19 | self.datasets = [] 20 | self.examples = [] 21 | 22 | def read_dataset(self, dataset_name: str) -> Dataset: 23 | for dataset in self.datasets: 24 | if dataset.name == dataset_name: 25 | return dataset 26 | raise LangSmithNotFoundError(f'Dataset "{dataset_name}" not found.') 27 | 28 | def create_dataset(self, dataset_name: str) -> Dataset: 29 | # Simulate creating a dataset and returning a mock Dataset object 30 | dataset = Dataset( 31 | id=UUID(int=3), name=dataset_name, created_at=datetime.datetime(2021, 1, 1) 32 | ) 33 | self.datasets.append(dataset) 34 | return dataset 35 | 36 | def create_examples( 37 | self, 38 | *, 39 | inputs: Sequence[Mapping[str, Any]], 40 | outputs: Optional[Sequence[Optional[Mapping[str, Any]]]] = None, 41 | dataset_id: Optional[ID_TYPE] = None, 42 | dataset_name: Optional[str] = None, 43 | max_concurrency: int = 10, 44 | ) -> None: 45 | """Create examples""" 46 | examples = [] 47 | for idx, (input, output) in enumerate(zip(inputs, outputs)): 48 | examples.append( 49 | Example( 50 | id=UUID(int=idx), 51 | inputs=input, 52 | outputs=output, 53 | created_at=datetime.datetime(2021, 1, 1), 54 | dataset_id=dataset_id, 55 | dataset_name=dataset_name, 56 | ) 57 | ) 58 | 59 | return self.examples.extend(examples) 60 | 61 | def list_shared_examples(self, public_dataset_token: str) -> List[Example]: 62 | # Simulate fetching shared examples and returning a list of Example objects 63 | example1 = Example( 64 | id=UUID(int=1), 65 | inputs={"a": 1}, 66 | outputs={}, 67 | created_at=datetime.datetime(2021, 1, 1), 68 | dataset_id=public_dataset_token, 69 | ) 70 | example2 = Example( 71 | id=UUID(int=2), 72 | inputs={"b": 2}, 73 | outputs={}, 74 | 
created_at=datetime.datetime(2021, 1, 1), 75 | dataset_id=public_dataset_token, 76 | ) 77 | return [example1, example2] 78 | 79 | def read_shared_dataset(self, public_dataset_token: str) -> Dataset: 80 | # Simulate fetching shared dataset and returning a Dataset object 81 | dataset = Dataset( 82 | id=UUID(int=3), 83 | name="my_dataset", 84 | created_at=datetime.datetime(2021, 1, 1), 85 | owner_id=public_dataset_token, 86 | ) 87 | return dataset 88 | 89 | 90 | @contextmanager 91 | def mock_langsmith_client() -> Generator[None, None, None]: 92 | """Mock the langsmith Client class.""" 93 | from langchain_benchmarks.utils import _langsmith 94 | 95 | mock_client = MockLangSmithClient() 96 | 97 | with mock.patch.object(_langsmith, "Client") as client: 98 | client.return_value = mock_client 99 | yield mock_client 100 | 101 | 102 | def test_clone_dataset() -> None: 103 | # Call the clone_dataset function with mock data 104 | public_dataset_token = str(uuid.UUID(int=3)) 105 | dataset_name = "my_dataset" 106 | 107 | with mock_langsmith_client() as mock_client: 108 | clone_public_dataset(public_dataset_token, dataset_name=dataset_name) 109 | assert mock_client.datasets[0].name == dataset_name 110 | assert len(mock_client.examples) == 2 111 | 112 | # Check idempotency 113 | clone_public_dataset(public_dataset_token, dataset_name=dataset_name) 114 | assert len(mock_client.examples) == 2 115 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/tool_usage/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_evaluator.py: -------------------------------------------------------------------------------- 1 | """Test the standard agent evaluator.""" 2 | 3 | import pytest 4 | from langchain.schema import AgentAction 5 | 6 | from langchain_benchmarks.tool_usage.evaluators import compare_outputs 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "run_outputs, example_outputs, expected_results", 11 | [ 12 | ( 13 | { 14 | "intermediate_steps": [ 15 | ( 16 | AgentAction(tool="action_1", tool_input={}, log=""), 17 | "observation1", 18 | ), 19 | ( 20 | AgentAction(tool="action_2", tool_input={}, log=""), 21 | "observation1", 22 | ), 23 | ], 24 | "state": "final_state", 25 | }, 26 | { 27 | "expected_steps": ["action_1", "action_2"], 28 | "state": "final_state", 29 | }, 30 | { 31 | "Intermediate steps correctness": True, 32 | "# steps / # expected steps": 1, 33 | "Correct Final State": 1, 34 | }, 35 | ), 36 | ( 37 | { 38 | "intermediate_steps": [ 39 | ( 40 | AgentAction(tool="action_1", tool_input={}, log=""), 41 | "observation1", 42 | ), 43 | ( 44 | AgentAction(tool="action_2", tool_input={}, log=""), 45 | "observation1", 46 | ), 47 | ], 48 | "state": "final_state", 49 | }, 50 | { 51 | "expected_steps": ["cat", "was", "here"], 52 | "state": "another_state", 53 | }, 54 | { 55 | "Intermediate steps correctness": False, 56 | "# steps / # expected steps": 2 / 3, 57 | "Correct Final State": 0, 58 | }, 59 | ), 60 | ( 61 | { 62 | "intermediate_steps": [ 63 | ( 64 | AgentAction(tool="action_2", tool_input={}, log=""), 65 | "observation1", 66 | ), 67 | ( 68 | AgentAction(tool="action_1", tool_input={}, log=""), 69 | "observation1", 70 | ), 71 | ], 72 | "state": "final_state", 73 | }, 74 | 
{ 75 | "expected_steps": ["action_1", "action_2"], 76 | "order_matters": False, 77 | "state": "different_state", 78 | }, 79 | { 80 | "Intermediate steps correctness": True, 81 | "# steps / # expected steps": 1.0, 82 | "Correct Final State": 0, 83 | }, 84 | ), 85 | # Without state 86 | ( 87 | { 88 | "intermediate_steps": [ 89 | ( 90 | AgentAction(tool="action_2", tool_input={}, log=""), 91 | "observation1", 92 | ), 93 | ( 94 | AgentAction(tool="action_1", tool_input={}, log=""), 95 | "observation1", 96 | ), 97 | ], 98 | }, 99 | { 100 | "expected_steps": ["action_1", "action_2"], 101 | "order_matters": False, 102 | }, 103 | { 104 | "Intermediate steps correctness": True, 105 | "# steps / # expected steps": 1.0, 106 | }, 107 | ), 108 | # Using actual steps 109 | # With order not mattering 110 | ( 111 | { 112 | "actual_steps": ["action_2", "action_1"], 113 | }, 114 | { 115 | "expected_steps": ["action_1", "action_2"], 116 | "order_matters": False, 117 | }, 118 | { 119 | "Intermediate steps correctness": True, 120 | "# steps / # expected steps": 1.0, 121 | }, 122 | ), 123 | # Using actual steps 124 | # With order mattering 125 | ( 126 | { 127 | "actual_steps": ["action_2", "action_1"], 128 | }, 129 | { 130 | "expected_steps": ["action_1", "action_2"], 131 | "order_matters": True, 132 | }, 133 | { 134 | "Intermediate steps correctness": False, 135 | "# steps / # expected steps": 1.0, 136 | }, 137 | ), 138 | ], 139 | ) 140 | def test_compare_outputs(run_outputs, example_outputs, expected_results): 141 | """Test compare outputs.""" 142 | evaluation_results = compare_outputs(run_outputs, example_outputs, run_inputs={}) 143 | assert { 144 | result.key: result.score for result in evaluation_results["results"] 145 | } == expected_results 146 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_multiverse_math.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.tool_usage.tasks.multiverse_math import ( 2 | add, 3 | get_environment, 4 | multiply, 5 | ) 6 | 7 | 8 | def test_get_environment() -> None: 9 | """Test the multiverse math task.""" 10 | # Create the environment 11 | env = get_environment() 12 | 13 | # Get the tools 14 | tools = env.tools 15 | 16 | assert len(tools) == 10 17 | 18 | # Get the state reader 19 | read_state = env.read_state 20 | assert read_state is None 21 | 22 | 23 | def test_operations() -> None: 24 | """Test some operations.""" 25 | # Confirm that operations are not distributive 26 | assert multiply(add(1, 2), 7) == 32.34 27 | assert add(multiply(1, 7), multiply(2, 7)) == 24.3 28 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_public_api.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.tool_usage import __all__ 2 | 3 | 4 | def test_public_api() -> None: 5 | """Test that the public API is correct.""" 6 | # This test will also fail if __all__ is not sorted. 7 | # Please keep it sorted! 
8 | assert __all__ == sorted( 9 | [ 10 | "apply_agent_executor_adapter", 11 | "get_eval_config", 12 | "CustomRunnableAgentFactory", 13 | "StandardAgentFactory", 14 | ], 15 | key=str.lower, 16 | ) 17 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_tool_usage.py: -------------------------------------------------------------------------------- 1 | def test_import_tool_usage() -> None: 2 | """Test that tool_usage can be imported""" 3 | --------------------------------------------------------------------------------
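
The token bucket rate limiter defined in `langchain_benchmarks/rate_limiting.py` and exercised in `tests/unit_tests/test_rate_limiting.py` is easiest to see in a short usage sketch. The stand-in runnable below is a hypothetical placeholder for a real chat model or chain and is not part of the repository.

```python
from langchain_core.runnables import RunnableLambda

from langchain_benchmarks.rate_limiting import RateLimiter, with_rate_limit

# Refill the bucket at two tokens per second and hold at most one token,
# so roughly two requests per second are allowed with no bursting.
rate_limiter = RateLimiter(requests_per_second=2, max_bucket_size=1)

# Trivial stand-in runnable; in practice this would be a chat model or chain.
fake_model = RunnableLambda(lambda prompt: f"echo: {prompt}")

# with_rate_limit prepends a blocking "Wait" step in front of the runnable.
throttled = with_rate_limit(fake_model, rate_limiter)

for query in ["first", "second", "third"]:
    print(throttled.invoke(query))  # each call blocks until a token is available
```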