├── .github ├── actions │ └── poetry_setup │ │ └── action.yml └── workflows │ ├── _lint.yml │ ├── _release.yml │ ├── _test.yml │ ├── ci.yml │ ├── doc_publish.yaml │ ├── release.yml │ └── tool_benchmarks.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── archived ├── csv-qa │ ├── README.md │ ├── custom_agent.py │ ├── data.csv │ ├── pandas_agent_gpt_35.py │ ├── pandas_agent_gpt_4.py │ ├── pandas_agent_instruct.py │ ├── pandas_ai.py │ ├── requirements.txt │ ├── result_35.png │ ├── results_4.png │ ├── results_custom.png │ ├── results_pandasai.png │ ├── streamlit_app.py │ ├── titanic.csv │ ├── titanic_data │ │ ├── index.faiss │ │ └── index.pkl │ └── upload_data.py ├── extraction │ ├── oppenheimer.txt │ ├── oppenheimer_short.txt │ ├── requirements.txt │ └── streamlit_app.py ├── langchain-docs-benchmarking │ ├── README.md │ ├── app │ │ ├── __init__.py │ │ └── server.py │ ├── example_custom_config.json │ ├── packages │ │ ├── README.md │ │ ├── anthropic-iterative-search │ │ │ ├── README.md │ │ │ ├── anthropic_iterative_search │ │ │ │ ├── __init__.py │ │ │ │ ├── agent_scratchpad.py │ │ │ │ ├── chain.py │ │ │ │ ├── output_parser.py │ │ │ │ ├── prompts.py │ │ │ │ ├── retriever.py │ │ │ │ └── retriever_agent.py │ │ │ ├── main.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ │ └── __init__.py │ │ ├── chat-langchain │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── chat_langchain │ │ │ │ ├── __init__.py │ │ │ │ └── chain.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ │ └── __init__.py │ │ ├── example │ │ │ └── custom_example │ │ │ │ └── example_custom_chain.py │ │ ├── langchain-docs-retriever │ │ │ ├── README.md │ │ │ ├── ingest_docs.py │ │ │ ├── langchain_docs_retriever │ │ │ │ ├── __init__.py │ │ │ │ ├── download_db.py │ │ │ │ └── retriever.py │ │ │ ├── poetry.lock │ │ │ └── pyproject.toml │ │ ├── oai-assistant │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── oai_assistant │ │ │ │ ├── __init__.py │ │ │ │ └── chain.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ │ └── __init__.py │ │ └── openai-functions-agent │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── main.py │ │ │ ├── openai_functions_agent │ │ │ ├── __init__.py │ │ │ └── agent.py │ │ │ ├── poetry.lock │ │ │ ├── pyproject.toml │ │ │ └── tests │ │ │ └── __init__.py │ ├── poetry.lock │ ├── prepare_dataset.py │ ├── pyproject.toml │ ├── run_evals.py │ └── run_experiments.py └── meta-evals │ ├── README.md │ └── correctness │ ├── README.md │ ├── __init__.py │ ├── _upload_dataset.py │ ├── data │ ├── Opus100-incorrect.json │ ├── Opus100.json │ ├── Web-Q&A-Dataset-Correct.json │ ├── Web-Q&A-Dataset-Incorrect.json │ ├── carb-IE-correct.json │ └── carb-IE-incorrect.json │ └── test_correctness_evaluator.py ├── docs ├── Makefile ├── make.bat └── source │ ├── .gitignore │ ├── _static │ └── parrot.png │ ├── conf.py │ ├── notebooks │ ├── datasets.ipynb │ ├── extraction │ │ ├── chat_extraction.ipynb │ │ ├── email.ipynb │ │ ├── high_cardinality.ipynb │ │ └── intro.ipynb │ ├── getting_started.ipynb │ ├── models.ipynb │ ├── retrieval │ │ ├── comparing_techniques.ipynb │ │ ├── intro.ipynb │ │ ├── langchain_docs_qa.ipynb │ │ ├── multi_modal_benchmarking │ │ │ ├── experiments │ │ │ │ └── gemini.ipynb │ │ │ ├── multi_modal_eval.ipynb │ │ │ └── multi_modal_eval_baseline.ipynb │ │ └── semi_structured_benchmarking │ │ │ ├── semi_structured.ipynb │ │ │ ├── ss_eval_chunk_sizes.ipynb │ │ │ ├── ss_eval_long_context.ipynb │ │ │ └── ss_eval_multi_vector.ipynb │ ├── run_without_langsmith.ipynb │ └── 
tool_usage │ │ ├── benchmark_all_tasks.ipynb │ │ ├── intro.ipynb │ │ ├── multiverse_math.ipynb │ │ ├── multiverse_math_benchmark.ipynb │ │ ├── oss_experiments │ │ └── mixtral_experiments.ipynb │ │ ├── query_analysis.ipynb │ │ ├── relational_data.ipynb │ │ ├── typewriter_1.ipynb │ │ └── typewriter_26.ipynb │ └── toc.segment ├── langchain_benchmarks ├── .gitignore ├── __init__.py ├── extraction │ ├── __init__.py │ ├── evaluators.py │ ├── implementations.py │ └── tasks │ │ ├── __init__.py │ │ ├── chat_extraction │ │ ├── __init__.py │ │ ├── evaluators.py │ │ └── schema.py │ │ ├── email_task.py │ │ └── high_cardinality │ │ ├── __init__.py │ │ └── name_correction.py ├── model_registration.py ├── rag │ ├── .gitignore │ ├── __init__.py │ ├── evaluators.py │ ├── tasks │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── langchain_docs │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _ingest_docs.py │ │ │ ├── architectures │ │ │ │ ├── __init__.py │ │ │ │ ├── chain_registry.py │ │ │ │ └── crqa.py │ │ │ ├── indexing │ │ │ │ ├── .gitignore │ │ │ │ ├── __init__.py │ │ │ │ └── retriever_registry.py │ │ │ └── task.py │ │ ├── multi_modal_slide_decks │ │ │ ├── __init__.py │ │ │ ├── indexing │ │ │ │ ├── __init__.py │ │ │ │ └── retriever_registry.py │ │ │ └── task.py │ │ └── semi_structured_reports │ │ │ ├── __init__.py │ │ │ ├── indexing │ │ │ ├── .gitignore │ │ │ ├── __init__.py │ │ │ └── retriever_registry.py │ │ │ └── task.py │ └── utils │ │ ├── __init__.py │ │ ├── _downloading.py │ │ └── indexing.py ├── rate_limiting.py ├── registration.py ├── schema.py ├── tool_usage │ ├── README.md │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── adapters.py │ │ ├── base.py │ │ ├── runnable_agent.py │ │ └── tool_using_agent.py │ ├── evaluators.py │ ├── prompts.py │ └── tasks │ │ ├── __init__.py │ │ ├── multiverse_math.py │ │ ├── query_analysis.py │ │ ├── relational_data.py │ │ ├── type_writer.py │ │ └── type_writer_26_funcs.py └── utils │ ├── __init__.py │ └── _langsmith.py ├── poetry.lock ├── pyproject.toml ├── scripts ├── check_datasets.py ├── multiverse_math_benchmark.py └── query_analysis_benchmark.py ├── security.md └── tests ├── __init__.py └── unit_tests ├── __init__.py ├── extraction ├── __init__.py ├── test_email_extraction.py └── test_import_stuff.py ├── rag ├── __init__.py └── test_langchain_docs.py ├── test_model_registry.py ├── test_public_api.py ├── test_rate_limiting.py ├── test_utils.py └── tool_usage ├── __init__.py ├── test_evaluator.py ├── test_multiverse_math.py ├── test_public_api.py └── test_tool_usage.py /.github/actions/poetry_setup/action.yml: -------------------------------------------------------------------------------- 1 | # An action for setting up poetry install with caching. 2 | # Using a custom action since the default action does not 3 | # take poetry install groups into account. 4 | # Action code from: 5 | # https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 6 | name: poetry-install-with-caching 7 | description: Poetry install with support for caching of dependency groups. 
8 | 9 | inputs: 10 | python-version: 11 | description: Python version, supporting MAJOR.MINOR only 12 | required: true 13 | 14 | poetry-version: 15 | description: Poetry version 16 | required: true 17 | 18 | cache-key: 19 | description: Cache key to use for manual handling of caching 20 | required: true 21 | 22 | working-directory: 23 | description: Directory whose poetry.lock file should be cached 24 | required: true 25 | 26 | runs: 27 | using: composite 28 | steps: 29 | - uses: actions/setup-python@v4 30 | name: Setup python ${{ inputs.python-version }} 31 | with: 32 | python-version: ${{ inputs.python-version }} 33 | 34 | - uses: actions/cache@v3 35 | id: cache-bin-poetry 36 | name: Cache Poetry binary - Python ${{ inputs.python-version }} 37 | env: 38 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "1" 39 | with: 40 | path: | 41 | /opt/pipx/venvs/poetry 42 | # This step caches the poetry installation, so make sure it's keyed on the poetry version as well. 43 | key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }} 44 | 45 | - name: Refresh shell hashtable and fixup softlinks 46 | if: steps.cache-bin-poetry.outputs.cache-hit == 'true' 47 | shell: bash 48 | env: 49 | POETRY_VERSION: ${{ inputs.poetry-version }} 50 | PYTHON_VERSION: ${{ inputs.python-version }} 51 | run: | 52 | set -eux 53 | 54 | # Refresh the shell hashtable, to ensure correct `which` output. 55 | hash -r 56 | 57 | # `actions/cache@v3` doesn't always seem able to correctly unpack softlinks. 58 | # Delete and recreate the softlinks pipx expects to have. 59 | rm /opt/pipx/venvs/poetry/bin/python 60 | cd /opt/pipx/venvs/poetry/bin 61 | ln -s "$(which "python$PYTHON_VERSION")" python 62 | chmod +x python 63 | cd /opt/pipx_bin/ 64 | ln -s /opt/pipx/venvs/poetry/bin/poetry poetry 65 | chmod +x poetry 66 | 67 | # Ensure everything got set up correctly. 68 | /opt/pipx/venvs/poetry/bin/python --version 69 | /opt/pipx_bin/poetry --version 70 | 71 | - name: Install poetry 72 | if: steps.cache-bin-poetry.outputs.cache-hit != 'true' 73 | shell: bash 74 | env: 75 | POETRY_VERSION: ${{ inputs.poetry-version }} 76 | PYTHON_VERSION: ${{ inputs.python-version }} 77 | run: pipx install "poetry==$POETRY_VERSION" --python "python$PYTHON_VERSION" --verbose 78 | 79 | - name: Restore pip and poetry cached dependencies 80 | uses: actions/cache@v3 81 | env: 82 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "4" 83 | WORKDIR: ${{ inputs.working-directory == '' && '.' || inputs.working-directory }} 84 | with: 85 | path: | 86 | ~/.cache/pip 87 | ~/.cache/pypoetry/virtualenvs 88 | ~/.cache/pypoetry/cache 89 | ~/.cache/pypoetry/artifacts 90 | ${{ env.WORKDIR }}/.venv 91 | key: py-deps-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ inputs.cache-key }}-${{ hashFiles(format('{0}/**/poetry.lock', env.WORKDIR)) }} 92 | -------------------------------------------------------------------------------- /.github/workflows/_lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | working-directory: 7 | required: true 8 | type: string 9 | description: "From which folder this pipeline executes" 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | WORKDIR: ${{ inputs.working-directory == '' && '.' 
|| inputs.working-directory }} 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | env: 19 | # This number is set "by eye": we want it to be big enough 20 | # so that it's bigger than the number of commits in any reasonable PR, 21 | # and also as small as possible since increasing the number makes 22 | # the initial `git fetch` slower. 23 | FETCH_DEPTH: 50 24 | strategy: 25 | matrix: 26 | # Only lint on the min and max supported Python versions. 27 | # It's extremely unlikely that there's a lint issue on any version in between 28 | # that doesn't show up on the min or max versions. 29 | # 30 | # GitHub rate-limits how many jobs can be running at any one time. 31 | # Starting new jobs is also relatively slow, 32 | # so linting on fewer versions makes CI faster. 33 | python-version: 34 | - "3.8" 35 | - "3.11" 36 | steps: 37 | - uses: actions/checkout@v3 38 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 39 | uses: "./.github/actions/poetry_setup" 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | poetry-version: ${{ env.POETRY_VERSION }} 43 | working-directory: ${{ inputs.working-directory }} 44 | cache-key: lint-with-extras 45 | 46 | - name: Check Poetry File 47 | shell: bash 48 | working-directory: ${{ inputs.working-directory }} 49 | run: | 50 | poetry check 51 | 52 | - name: Check lock file 53 | shell: bash 54 | working-directory: ${{ inputs.working-directory }} 55 | run: | 56 | poetry lock --check 57 | 58 | - name: Install dependencies 59 | # Also installs dev/lint/test/typing dependencies, to ensure we have 60 | # type hints for as many of our libraries as possible. 61 | # This helps catch errors that require dependencies to be spotted, for example: 62 | # https://github.com/langchain-ai/langchain/pull/10249/files#diff-935185cd488d015f026dcd9e19616ff62863e8cde8c0bee70318d3ccbca98341 63 | # 64 | # If you change this configuration, make sure to change the `cache-key` 65 | # in the `poetry_setup` action above to stop using the old cache. 66 | # It doesn't matter how you change it, any change will cause a cache-bust. 67 | working-directory: ${{ inputs.working-directory }} 68 | run: | 69 | poetry install --with dev,lint,test,typing 70 | 71 | - name: Get .mypy_cache to speed up mypy 72 | uses: actions/cache@v3 73 | env: 74 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "2" 75 | with: 76 | path: | 77 | ${{ env.WORKDIR }}/.mypy_cache 78 | key: mypy-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }}-${{ inputs.working-directory }}-${{ hashFiles(format('{0}/poetry.lock', env.WORKDIR)) }} 79 | 80 | - name: Analysing the code with our lint 81 | working-directory: ${{ inputs.working-directory }} 82 | run: | 83 | make lint 84 | -------------------------------------------------------------------------------- /.github/workflows/_release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | working-directory: 7 | required: true 8 | type: string 9 | description: "From which folder this pipeline executes" 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | 14 | jobs: 15 | if_release: 16 | # Disallow publishing from branches that aren't `main`. 
17 | if: github.ref == 'refs/heads/main' 18 | runs-on: ubuntu-latest 19 | permissions: 20 | # This permission is used for trusted publishing: 21 | # https://blog.pypi.org/posts/2023-04-20-introducing-trusted-publishers/ 22 | # 23 | # Trusted publishing has to also be configured on PyPI for each package: 24 | # https://docs.pypi.org/trusted-publishers/adding-a-publisher/ 25 | id-token: write 26 | 27 | # This permission is needed by `ncipollo/release-action` to create the GitHub release. 28 | contents: write 29 | defaults: 30 | run: 31 | working-directory: ${{ inputs.working-directory }} 32 | steps: 33 | - uses: actions/checkout@v3 34 | 35 | - name: Set up Python + Poetry ${{ env.POETRY_VERSION }} 36 | uses: "./.github/actions/poetry_setup" 37 | with: 38 | python-version: "3.10" 39 | poetry-version: ${{ env.POETRY_VERSION }} 40 | working-directory: ${{ inputs.working-directory }} 41 | cache-key: release 42 | 43 | - name: Build project for distribution 44 | run: poetry build 45 | - name: Check Version 46 | id: check-version 47 | run: | 48 | echo version=$(poetry version --short) >> $GITHUB_OUTPUT 49 | - name: Create Release 50 | uses: ncipollo/release-action@v1 51 | with: 52 | artifacts: "dist/*" 53 | token: ${{ secrets.GITHUB_TOKEN }} 54 | draft: false 55 | generateReleaseNotes: true 56 | tag: v${{ steps.check-version.outputs.version }} 57 | commit: main 58 | - name: Publish package distributions to PyPI 59 | uses: pypa/gh-action-pypi-publish@release/v1 60 | with: 61 | packages-dir: ${{ inputs.working-directory }}/dist/ 62 | verbose: true 63 | print-hash: true 64 | -------------------------------------------------------------------------------- /.github/workflows/_test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | working-directory: 7 | required: true 8 | type: string 9 | description: "From which folder this pipeline executes" 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | 14 | jobs: 15 | build: 16 | defaults: 17 | run: 18 | working-directory: ${{ inputs.working-directory }} 19 | runs-on: ubuntu-latest 20 | strategy: 21 | matrix: 22 | python-version: 23 | - "3.8" 24 | - "3.9" 25 | - "3.10" 26 | - "3.11" 27 | name: Python ${{ matrix.python-version }} 28 | steps: 29 | - uses: actions/checkout@v3 30 | 31 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 32 | uses: "./.github/actions/poetry_setup" 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | poetry-version: ${{ env.POETRY_VERSION }} 36 | working-directory: ${{ inputs.working-directory }} 37 | cache-key: core 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: poetry install 42 | 43 | - name: Run core tests 44 | shell: bash 45 | run: make test 46 | 47 | - name: Ensure the tests did not create any additional files 48 | shell: bash 49 | run: | 50 | set -eu 51 | 52 | STATUS="$(git status)" 53 | echo "$STATUS" 54 | 55 | # grep will exit non-zero if the target message isn't found, 56 | # and `set -e` above will cause the step to fail. 
57 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 58 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Run CI Tests 3 | 4 | on: 5 | push: 6 | branches: [ main ] 7 | pull_request: 8 | paths-ignore: 9 | - 'README.md' 10 | workflow_dispatch: # Allows to trigger the workflow manually in GitHub UI 11 | 12 | # If another push to the same PR or branch happens while this workflow is still running, 13 | # cancel the earlier run in favor of the next run. 14 | # 15 | # There's no point in testing an outdated version of the code. GitHub only allows 16 | # a limited number of job runners to be active at the same time, so it's better to cancel 17 | # pointless jobs early so that more useful jobs can run sooner. 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.ref }} 20 | cancel-in-progress: true 21 | 22 | env: 23 | POETRY_VERSION: "1.5.1" 24 | WORKDIR: "." 25 | 26 | jobs: 27 | lint: 28 | uses: 29 | ./.github/workflows/_lint.yml 30 | with: 31 | working-directory: . 32 | secrets: inherit 33 | 34 | test: 35 | timeout-minutes: 5 36 | runs-on: ubuntu-latest 37 | defaults: 38 | run: 39 | working-directory: ${{ env.WORKDIR }} 40 | strategy: 41 | matrix: 42 | python-version: 43 | - "3.8" 44 | - "3.9" 45 | - "3.10" 46 | - "3.11" 47 | name: Python ${{ matrix.python-version }} tests 48 | steps: 49 | - uses: actions/checkout@v3 50 | 51 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 52 | uses: "./.github/actions/poetry_setup" 53 | with: 54 | python-version: ${{ matrix.python-version }} 55 | poetry-version: ${{ env.POETRY_VERSION }} 56 | working-directory: . 57 | cache-key: benchmarks-all 58 | 59 | - name: Install dependencies 60 | shell: bash 61 | run: | 62 | echo "Running tests, installing dependencies with poetry..." 63 | poetry install --with test,lint,typing,docs 64 | 65 | - name: Run tests 66 | run: make test 67 | 68 | - name: Ensure the tests did not create any additional files 69 | shell: bash 70 | run: | 71 | set -eu 72 | 73 | STATUS="$(git status)" 74 | echo "$STATUS" 75 | 76 | # grep will exit non-zero if the target message isn't found, 77 | # and `set -e` above will cause the step to fail. 78 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 79 | test_docs: 80 | timeout-minutes: 5 81 | runs-on: ubuntu-latest 82 | defaults: 83 | run: 84 | working-directory: ${{ env.WORKDIR }} 85 | strategy: 86 | matrix: 87 | python-version: 88 | - "3.11" 89 | name: Documentation Build for Python ${{ matrix.python-version }} 90 | steps: 91 | - uses: actions/checkout@v3 92 | 93 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 94 | uses: "./.github/actions/poetry_setup" 95 | with: 96 | python-version: ${{ matrix.python-version }} 97 | poetry-version: ${{ env.POETRY_VERSION }} 98 | working-directory: . 99 | cache-key: benchmarks-all 100 | 101 | - name: Install dependencies 102 | shell: bash 103 | run: | 104 | echo "Running tests, installing dependencies with poetry..." 105 | poetry install --with test,lint,typing,docs 106 | 107 | - name: Test Sphinx Docs 108 | shell: bash 109 | run: | 110 | echo "Attempting to build docs..." 
111 | make docs_build 112 | test_datasets: 113 | timeout-minutes: 5 114 | runs-on: ubuntu-latest 115 | defaults: 116 | run: 117 | working-directory: ${{ env.WORKDIR }} 118 | strategy: 119 | matrix: 120 | python-version: 121 | - "3.11" 122 | name: Validate Public Datasets 123 | steps: 124 | - uses: actions/checkout@v3 125 | 126 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 127 | uses: "./.github/actions/poetry_setup" 128 | with: 129 | python-version: ${{ matrix.python-version }} 130 | poetry-version: ${{ env.POETRY_VERSION }} 131 | working-directory: . 132 | cache-key: benchmarks-all 133 | 134 | - name: Install dependencies 135 | shell: bash 136 | run: | 137 | echo "Running tests, installing dependencies with poetry..." 138 | poetry install --with test,lint,typing,docs 139 | 140 | - name: Request datasets 141 | shell: bash 142 | run: | 143 | echo "Attempting to build docs..." 144 | poetry run python -m scripts.check_datasets -------------------------------------------------------------------------------- /.github/workflows/doc_publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docs 2 | on: [workflow_dispatch] 3 | permissions: 4 | contents: write 5 | 6 | env: 7 | POETRY_VERSION: "1.6.1" 8 | 9 | jobs: 10 | docs: 11 | strategy: 12 | matrix: 13 | python-version: 14 | - "3.11" 15 | runs-on: ubuntu-latest 16 | name: Documentation Publish 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 21 | uses: "./.github/actions/poetry_setup" 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | poetry-version: ${{ env.POETRY_VERSION }} 25 | working-directory: . 26 | cache-key: benchmarks-all 27 | 28 | - name: Install dependencies 29 | shell: bash 30 | run: | 31 | echo "Running tests, installing dependencies with poetry..." 32 | poetry install --with test,lint,typing,docs 33 | 34 | - name: Sphinx build 35 | shell: bash 36 | run: | 37 | make docs_build 38 | - name: Publish Docs 39 | uses: peaceiris/actions-gh-pages@v3 40 | with: 41 | publish_branch: gh-pages 42 | github_token: ${{ secrets.GITHUB_TOKEN }} 43 | publish_dir: ./docs/build 44 | force_orphan: true 45 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Publish Package to PyPi 3 | 4 | on: 5 | workflow_dispatch: # Allows to trigger the workflow manually in GitHub UI 6 | 7 | jobs: 8 | release: 9 | uses: 10 | ./.github/workflows/_release.yml 11 | permissions: write-all 12 | with: 13 | working-directory: . 
14 | secrets: inherit 15 | -------------------------------------------------------------------------------- /.github/workflows/tool_benchmarks.yml: -------------------------------------------------------------------------------- 1 | name: Weekly Tool Benchmarks 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '0 0 * * 0' # Runs at midnight (00:00) every Sunday (UTC time) 7 | 8 | env: 9 | POETRY_VERSION: "1.6.1" 10 | LANGCHAIN_API_KEY: ${{ secrets.LANGCHAIN_API_KEY }} 11 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 12 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 13 | 14 | jobs: 15 | run_tool_benchmarks: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Python 3.12 + Poetry ${{ env.POETRY_VERSION }} 22 | uses: "./.github/actions/poetry_setup" 23 | with: 24 | python-version: '3.12' 25 | poetry-version: ${{ env.POETRY_VERSION }} 26 | working-directory: . 27 | cache-key: benchmarks-all 28 | 29 | - name: Install dependencies 30 | shell: bash 31 | run: | 32 | echo "Running tests, installing dependencies with poetry..." 33 | poetry install --with test,lint,typing,docs 34 | 35 | - name: Multiverse math benchmark 36 | 37 | run: | 38 | cd scripts 39 | poetry run python multiverse_math_benchmark.py 40 | 41 | - name: Query analysis benchmark 42 | run: | 43 | cd scripts 44 | poetry run python query_analysis_benchmark.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | .idea/ 162 | .DS_Store 163 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Langchain AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all lint format test help 2 | 3 | # Default target executed when no arguments are given to make. 4 | all: help 5 | 6 | # LINTING AND FORMATTING: 7 | 8 | # Define a variable for Python and notebook files. 9 | lint format: PYTHON_FILES=. 10 | lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=. --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$') 11 | 12 | lint lint_diff: 13 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff 14 | # [ "$(PYTHON_FILES)" = "" ] || poetry run mypy $(PYTHON_FILES) 15 | 16 | format format_diff: 17 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) 18 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES) 19 | 20 | spell_check: 21 | poetry run codespell --toml pyproject.toml 22 | 23 | spell_fix: 24 | poetry run codespell --toml pyproject.toml -w 25 | 26 | 27 | # TESTING AND COVERAGE: 28 | 29 | # Define a variable for the test file path. 30 | TEST_FILE ?= tests/unit_tests/ 31 | 32 | test: 33 | poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE) 34 | 35 | test_watch: 36 | poetry run ptw . -- $(TEST_FILE) 37 | 38 | 39 | # DOCUMENTATION: 40 | 41 | docs_clean: 42 | rm -rf ./docs/build 43 | 44 | docs_build: 45 | # Copy README.md to docs/index.md 46 | cp README.md ./docs/source/index.md 47 | # Append to the table of contents the contents of the file 48 | cat ./docs/source/toc.segment >> ./docs/source/index.md 49 | poetry run sphinx-build "./docs/source" "./docs/build" 50 | 51 | 52 | # HELP: 53 | help: 54 | @echo '' 55 | @echo 'LINTING:' 56 | @echo ' format - run code formatters' 57 | @echo ' lint - run linters' 58 | @echo ' spell_check - run codespell' 59 | @echo ' spell_fix - run codespell and fix the errors' 60 | @echo 'TESTS:' 61 | @echo ' test - run unit tests' 62 | @echo ' test TEST_FILE= - run tests in ' 63 | @echo ' coverage - run unit tests and generate coverage report' 64 | @echo 'DOCUMENTATION:' 65 | @echo ' docs_clean - delete the docs/build directory' 66 | @echo ' docs_build - build the documentation' 67 | @echo '' 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🦜💯 LangChain Benchmarks 2 | 3 | [![Release Notes](https://img.shields.io/github/release/langchain-ai/langchain-benchmarks)](https://github.com/langchain-ai/langchain-benchmarks/releases) 4 | [![CI](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml/badge.svg)](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/langchainai.svg?style=social&label=Follow%20%40LangChainAI)](https://twitter.com/langchainai) 7 | [![](https://dcbadge.vercel.app/api/server/6adMQxSpJS?compact=true&style=flat)](https://discord.gg/6adMQxSpJS) 8 | [![Open Issues](https://img.shields.io/github/issues-raw/langchain-ai/langchain-benchmarks)](https://github.com/langchain-ai/langchain-benchmarks/issues) 9 | 10 | 11 | [📖 Documentation](https://langchain-ai.github.io/langchain-benchmarks/index.html) 12 | 13 | A package to help benchmark various LLM related tasks. 
14 | 15 | The benchmarks are organized by end-to-end use cases, and 16 | utilize [LangSmith](https://smith.langchain.com/) heavily. 17 | 18 | We have several goals in open sourcing this: 19 | 20 | - Showing how we collect our benchmark datasets for each task 21 | - Showing which benchmark datasets we use for each task 22 | - Showing how we evaluate each task 23 | - Encouraging others to benchmark their solutions on these tasks (we are always looking for better ways of doing things!) 24 | 25 | ## Benchmarking Results 26 | 27 | Read some of the articles about benchmarking results on our blog. 28 | 29 | * [Agent Tool Use](https://blog.langchain.dev/benchmarking-agent-tool-use/) 30 | * [Query Analysis in High Cardinality Situations](https://blog.langchain.dev/high-cardinality/) 31 | * [RAG on Tables](https://blog.langchain.dev/benchmarking-rag-on-tables/) 32 | * [Q&A over CSV data](https://blog.langchain.dev/benchmarking-question-answering-over-csv-data/) 33 | 34 | 35 | ### Tool Usage (2024-04-18) 36 | 37 | See [tool usage docs](https://langchain-ai.github.io/langchain-benchmarks/notebooks/tool_usage/benchmark_all_tasks.html) to recreate! 38 | 39 | ![download](https://github.com/langchain-ai/langchain-benchmarks/assets/3205522/0da33de8-e03f-49cf-bd48-e9ff945828a9) 40 | 41 | Explore Agent Traces on LangSmith: 42 | 43 | * [Relational Data](https://smith.langchain.com/public/22721064-dcf6-4e42-be65-e7c46e6835e7/d) 44 | * [Tool Usage (1-tool)](https://smith.langchain.com/public/ac23cb40-e392-471f-b129-a893a77b6f62/d) 45 | * [Tool Usage (26-tools)](https://smith.langchain.com/public/366bddca-62b3-4b6e-849b-a478abab73db/d) 46 | * [Multiverse Math](https://smith.langchain.com/public/983faff2-54b9-4875-9bf2-c16913e7d489/d) 47 | 48 | ## Installation 49 | 50 | To install the packages, run the following command: 51 | 52 | ```bash 53 | pip install -U langchain-benchmarks 54 | ``` 55 | 56 | All the benchmarks come with an associated benchmark dataset stored in [LangSmith](https://smith.langchain.com). To take advantage of the eval and debugging experience, [sign up](https://smith.langchain.com), and set your API key in your environment: 57 | 58 | ```bash 59 | export LANGCHAIN_API_KEY=ls-... 60 | ``` 61 | 62 | ## Repo Structure 63 | 64 | The package is located within [langchain_benchmarks](./langchain_benchmarks/). Check out the [docs](https://langchain-ai.github.io/langchain-benchmarks/index.html) for information on how to get started. 65 | 66 | The other directories are legacy and may be moved in the future. 67 | 68 | 69 | ## Archived 70 | 71 | Below are archived benchmarks that require cloning this repo to run. 
72 | 73 | - [CSV Question Answering](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/csv-qa) 74 | - [Extraction](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/extraction) 75 | - [Q&A over the LangChain docs](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/langchain-docs-benchmarking) 76 | - [Meta-evaluation of 'correctness' evaluators](https://github.com/langchain-ai/langchain-benchmarks/tree/main/archived/meta-evals) 77 | 78 | 79 | ## Related 80 | 81 | - For cookbooks on other ways to test, debug, monitor, and improve your LLM applications, check out the [LangSmith docs](https://docs.smith.langchain.com/) 82 | - For information on building with LangChain, check out the [python documentation](https://python.langchain.com/docs/get_started/introduction) or [JS documentation](https://js.langchain.com/docs/get_started/introduction) 83 | 84 | -------------------------------------------------------------------------------- /archived/csv-qa/README.md: -------------------------------------------------------------------------------- 1 | # CSV Question Answering 2 | 3 | This module shows how we benchmark question answering over CSV data. 4 | There are several components: 5 | 6 | ## Setup 7 | 8 | To set up, you should install all required packages: 9 | 10 | ```shell 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | You then need to set environment variables. 15 | This heavily uses [LangSmith](https://smith.langchain.com/), so you need to set those environment variables: 16 | 17 | ```shell 18 | export LANGCHAIN_TRACING_V2="true" 19 | export LANGCHAIN_ENDPOINT=https://api.langchain.plus 20 | export LANGCHAIN_API_KEY=... 21 | ``` 22 | 23 | This also uses OpenAI, so you need to set that environment variable: 24 | 25 | ````shell 26 | export OPENAI_API_KEY=... 27 | ```` 28 | 29 | ## How we collected data 30 | 31 | To do this, we set up a simple streamlit app that was logging questions, answers, and feedback to LangSmith. 32 | We then annotated examples in [LangSmith](https://smith.langchain.com/) and added them to a dataset we were creating. 33 | For more details on how to do this generally, see [this cookbook](https://github.com/langchain-ai/langsmith-cookbook/tree/main/feedback-examples/streamlit). 34 | 35 | When doing this, you probably want to specify a project for all runs to be logged to: 36 | 37 | ```shell 38 | export LANGCHAIN_PROJECT="Titanic CSV" 39 | ``` 40 | 41 | The [`streamlit_app.py`](streamlit_app.py) file contains the exact code used to run the application. 42 | You can run this with `streamlit run streamlit_app.py`. 43 | 44 | ## What the data is 45 | 46 | See [`data.csv`](data.csv) for the data points we labeled. 47 | 48 | ## How we evaluate 49 | 50 | In order to evaluate, we first upload our data to [LangSmith](https://smith.langchain.com/), with dataset name `Titanic CSV Data`. 51 | This is done in [`upload_data.py`](upload_data.py). You can run this with: 52 | 53 | ```shell 54 | python upload_data.py 55 | ``` 56 | 57 | This allows us to track different evaluation runs against this dataset. 58 | We then use a standard `qa` evaluator to evaluate whether the generated answers are correct or not. 
59 | 60 | We include scripts for evaluating a few different methods: 61 | 62 | ## [Pandas Agent, GPT-3.5](pandas_agent_gpt_35.py) 63 | 64 | Run with `python pandas_agent_gpt_35.py` 65 | 66 | Results: 67 | 68 | ![results_35.png](result_35.png) 69 | 70 | ## [Pandas Agent, GPT-4](pandas_agent_gpt_4.py) 71 | 72 | Run with `python pandas_agent_gpt_4.py` 73 | 74 | Results: 75 | 76 | ![results_4.png](results_4.png) 77 | 78 | ## [PandasAI](pandas_ai.py) 79 | 80 | You need to install more packages: 81 | 82 | ```shell 83 | pip install beautifulsoup4 pandasai 84 | ``` 85 | Then you can run with `python pandas_ai.py` 86 | 87 | Results (note token tracking is off because not using LangChain): 88 | 89 | ![results_pandasai.png](results_pandasai.png) 90 | 91 | ## [Custom Agent](custom_agent.py) 92 | 93 | A custom agent equipped with a custom prompt and some custom tools (Python REPL and vectorstore). 94 | 95 | Run with `python custom_agent.py` 96 | 97 | Results: 98 | 99 | ![results_custom.png](results_custom.png) 100 | -------------------------------------------------------------------------------- /archived/csv-qa/custom_agent.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents import AgentExecutor, OpenAIFunctionsAgent 3 | from langchain.agents.agent_toolkits.conversational_retrieval.tool import ( 4 | create_retriever_tool, 5 | ) 6 | from langchain.embeddings import OpenAIEmbeddings 7 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 8 | from langchain.smith import RunEvalConfig, run_on_dataset 9 | from langchain.tools import PythonAstREPLTool 10 | from langchain.vectorstores import FAISS 11 | from langchain_openai import ChatOpenAI 12 | from langsmith import Client 13 | from pydantic import BaseModel, Field 14 | 15 | pd.set_option("display.max_rows", 20) 16 | pd.set_option("display.max_columns", 20) 17 | 18 | embedding_model = OpenAIEmbeddings() 19 | vectorstore = FAISS.load_local("titanic_data", embedding_model) 20 | retriever_tool = create_retriever_tool( 21 | vectorstore.as_retriever(), "person_name_search", "Search for a person by name" 22 | ) 23 | 24 | 25 | TEMPLATE = """You are working with a pandas dataframe in Python. The name of the dataframe is `df`. 26 | It is important to understand the attributes of the dataframe before working with it. This is the result of running `df.head().to_markdown()` 27 | 28 | 29 | {dhead} 30 | 31 | 32 | You are not meant to use only these rows to answer questions - they are meant as a way of telling you about the shape and schema of the dataframe. 33 | You also do not have use only the information here to answer questions - you can run intermediate queries to do exporatory data analysis to give you more information as needed. 34 | 35 | You have a tool called `person_name_search` through which you can lookup a person by name and find the records corresponding to people with similar name as the query. 36 | You should only really use this if your search term contains a persons name. Otherwise, try to solve it with code. 37 | 38 | For example: 39 | 40 | How old is Jane? 41 | Use `person_name_search` since you can use the query `Jane` 42 | 43 | Who has id 320 44 | Use `python_repl` since even though the question is about a person, you don't know their name so you can't include it. 
45 | """ 46 | 47 | 48 | class PythonInputs(BaseModel): 49 | query: str = Field(description="code snippet to run") 50 | 51 | 52 | if __name__ == "__main__": 53 | df = pd.read_csv("titanic.csv") 54 | template = TEMPLATE.format(dhead=df.head().to_markdown()) 55 | 56 | prompt = ChatPromptTemplate.from_messages( 57 | [ 58 | ("system", template), 59 | MessagesPlaceholder(variable_name="agent_scratchpad"), 60 | ("human", "{input}"), 61 | ] 62 | ) 63 | 64 | def get_chain(): 65 | repl = PythonAstREPLTool( 66 | locals={"df": df}, 67 | name="python_repl", 68 | description="Runs code and returns the output of the final line", 69 | args_schema=PythonInputs, 70 | ) 71 | tools = [repl, retriever_tool] 72 | agent = OpenAIFunctionsAgent( 73 | llm=ChatOpenAI(temperature=0, model="gpt-4"), prompt=prompt, tools=tools 74 | ) 75 | agent_executor = AgentExecutor( 76 | agent=agent, tools=tools, max_iterations=5, early_stopping_method="generate" 77 | ) 78 | return agent_executor 79 | 80 | client = Client() 81 | eval_config = RunEvalConfig( 82 | evaluators=["qa"], 83 | ) 84 | chain_results = run_on_dataset( 85 | client, 86 | dataset_name="Titanic CSV Data", 87 | llm_or_chain_factory=get_chain, 88 | evaluation=eval_config, 89 | ) 90 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_agent_gpt_35.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents.agent_toolkits import create_pandas_dataframe_agent 3 | from langchain.agents.agent_types import AgentType 4 | from langchain.smith import RunEvalConfig, run_on_dataset 5 | from langchain_openai import ChatOpenAI 6 | from langsmith import Client 7 | 8 | if __name__ == "__main__": 9 | df = pd.read_csv("titanic.csv") 10 | 11 | def get_chain(): 12 | llm = ChatOpenAI(temperature=0) 13 | agent_executor_kwargs = { 14 | "handle_parsing_errors": True, 15 | } 16 | agent = create_pandas_dataframe_agent( 17 | llm, 18 | df, 19 | agent_type=AgentType.OPENAI_FUNCTIONS, 20 | agent_executor_kwargs=agent_executor_kwargs, 21 | max_iterations=5, 22 | ) 23 | return agent 24 | 25 | client = Client() 26 | eval_config = RunEvalConfig( 27 | evaluators=["qa"], 28 | ) 29 | chain_results = run_on_dataset( 30 | client, 31 | dataset_name="Titanic CSV Data", 32 | llm_or_chain_factory=get_chain, 33 | evaluation=eval_config, 34 | ) 35 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_agent_gpt_4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents.agent_toolkits import create_pandas_dataframe_agent 3 | from langchain.agents.agent_types import AgentType 4 | from langchain.smith import RunEvalConfig, run_on_dataset 5 | from langchain_openai import ChatOpenAI 6 | from langsmith import Client 7 | 8 | if __name__ == "__main__": 9 | df = pd.read_csv("titanic.csv") 10 | 11 | def get_chain(): 12 | llm = ChatOpenAI(temperature=0, model="gpt-4") 13 | agent_executor_kwargs = { 14 | "handle_parsing_errors": True, 15 | } 16 | agent = create_pandas_dataframe_agent( 17 | llm, 18 | df, 19 | agent_type=AgentType.OPENAI_FUNCTIONS, 20 | agent_executor_kwargs=agent_executor_kwargs, 21 | max_iterations=5, 22 | ) 23 | return agent 24 | 25 | client = Client() 26 | eval_config = RunEvalConfig( 27 | evaluators=["qa"], 28 | ) 29 | chain_results = run_on_dataset( 30 | client, 31 | dataset_name="Titanic CSV Data", 32 | llm_or_chain_factory=get_chain, 33 | 
evaluation=eval_config, 34 | ) 35 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_agent_instruct.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.agents import AgentExecutor, ZeroShotAgent 3 | from langchain.agents.agent_toolkits.conversational_retrieval.tool import ( 4 | create_retriever_tool, 5 | ) 6 | from langchain.embeddings import OpenAIEmbeddings 7 | from langchain.llms import OpenAI 8 | from langchain.smith import RunEvalConfig, run_on_dataset 9 | from langchain.tools import PythonAstREPLTool 10 | from langchain.vectorstores import FAISS 11 | from langsmith import Client 12 | from pydantic import BaseModel, Field 13 | 14 | pd.set_option("display.max_rows", 20) 15 | pd.set_option("display.max_columns", 20) 16 | 17 | embedding_model = OpenAIEmbeddings() 18 | vectorstore = FAISS.load_local("titanic_data", embedding_model) 19 | retriever_tool = create_retriever_tool( 20 | vectorstore.as_retriever(), "person_name_search", "Search for a person by name" 21 | ) 22 | 23 | 24 | TEMPLATE = """You are working with a pandas dataframe in Python. The name of the dataframe is `df`. 25 | It is important to understand the attributes of the dataframe before working with it. This is the result of running `df.head().to_markdown()` 26 | 27 | 28 | {dhead} 29 | 30 | 31 | You are not meant to use only these rows to answer questions - they are meant as a way of telling you about the shape and schema of the dataframe. 32 | You also do not have use only the information here to answer questions - you can run intermediate queries to do exporatory data analysis to give you more information as needed. 33 | 34 | You have a tool called `person_name_search` through which you can lookup a person by name and find the records corresponding to people with similar name as the query. 35 | You should only really use this if your search term contains a persons name. Otherwise, try to solve it with code. 36 | 37 | For example: 38 | 39 | How old is Jane? 
40 | Use `person_name_search` since you can use the query `Jane` 41 | 42 | Who has id 320 43 | Use `python_repl` since even though the question is about a person, you don't know their name so you can't include it.""" 44 | 45 | 46 | class PythonInputs(BaseModel): 47 | query: str = Field(description="code snippet to run") 48 | 49 | 50 | if __name__ == "__main__": 51 | df = pd.read_csv("titanic.csv") 52 | template = TEMPLATE.format(dhead=df.head().to_markdown()) 53 | 54 | def get_chain(): 55 | repl = PythonAstREPLTool( 56 | locals={"df": df}, 57 | name="python_repl", 58 | description="Runs code and returns the output of the final line", 59 | args_schema=PythonInputs, 60 | ) 61 | tools = [repl, retriever_tool] 62 | agent = ZeroShotAgent.from_llm_and_tools( 63 | llm=OpenAI(temperature=0, model="gpt-3.5-turbo-instruct"), 64 | tools=tools, 65 | prefix=template, 66 | ) 67 | agent_executor = AgentExecutor( 68 | agent=agent, tools=tools, max_iterations=5, early_stopping_method="generate" 69 | ) 70 | return agent_executor 71 | 72 | client = Client() 73 | eval_config = RunEvalConfig( 74 | evaluators=["qa"], 75 | ) 76 | chain_results = run_on_dataset( 77 | client, 78 | dataset_name="Titanic CSV Data", 79 | llm_or_chain_factory=get_chain, 80 | evaluation=eval_config, 81 | ) 82 | -------------------------------------------------------------------------------- /archived/csv-qa/pandas_ai.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from langchain.prompts import ChatPromptTemplate 3 | from langchain.schema.output_parser import StrOutputParser 4 | from langchain.smith import RunEvalConfig, run_on_dataset 5 | from langchain_openai import ChatOpenAI 6 | from langsmith import Client 7 | from pandasai import PandasAI 8 | 9 | if __name__ == "__main__": 10 | df = pd.read_csv("titanic.csv") 11 | 12 | pandas_ai = PandasAI(ChatOpenAI(temperature=0, model="gpt-4"), enable_cache=False) 13 | prompt = ChatPromptTemplate.from_messages( 14 | [ 15 | ( 16 | "system", 17 | "Answer the users question about some data. 
A data scientist will run some code and the results will be returned to you to use in your answer", 18 | ), 19 | ("human", "Question: {input}"), 20 | ("human", "Data Scientist Result: {result}"), 21 | ] 22 | ) 23 | 24 | def get_chain(): 25 | chain = ( 26 | { 27 | "input": lambda x: x["input_question"], 28 | "result": lambda x: pandas_ai(df, prompt=x["input_question"]), 29 | } 30 | | prompt 31 | | ChatOpenAI(temperature=0, model="gpt-4") 32 | | StrOutputParser() 33 | ) 34 | return chain 35 | 36 | client = Client() 37 | eval_config = RunEvalConfig( 38 | evaluators=["qa"], 39 | ) 40 | chain_results = run_on_dataset( 41 | client, 42 | dataset_name="Titanic CSV Data", 43 | llm_or_chain_factory=get_chain, 44 | evaluation=eval_config, 45 | ) 46 | -------------------------------------------------------------------------------- /archived/csv-qa/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain 2 | openai 3 | streamlit 4 | tiktoken 5 | pandas 6 | tabulate 7 | -------------------------------------------------------------------------------- /archived/csv-qa/result_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/result_35.png -------------------------------------------------------------------------------- /archived/csv-qa/results_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/results_4.png -------------------------------------------------------------------------------- /archived/csv-qa/results_custom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/results_custom.png -------------------------------------------------------------------------------- /archived/csv-qa/results_pandasai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/results_pandasai.png -------------------------------------------------------------------------------- /archived/csv-qa/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | from langchain.agents.agent_toolkits import create_pandas_dataframe_agent 4 | from langchain.agents.agent_types import AgentType 5 | from langchain_openai import ChatOpenAI 6 | 7 | df = pd.read_csv("titanic.csv") 8 | 9 | 10 | llm = ChatOpenAI(temperature=0) 11 | agent = create_pandas_dataframe_agent(llm, df, agent_type=AgentType.OPENAI_FUNCTIONS) 12 | 13 | 14 | from langsmith import Client 15 | 16 | client = Client() 17 | 18 | 19 | def send_feedback(run_id, score): 20 | client.create_feedback(run_id, "user_score", score=score) 21 | 22 | 23 | st.set_page_config(page_title="🦜🔗 Ask the CSV App") 24 | st.title("🦜🔗 Ask the CSV App") 25 | st.info( 26 | "Most 'question answering' applications run over unstructured text data. But a lot of the data in the world is tabular data! 
This is an attempt to create an application using [LangChain](https://github.com/langchain-ai/langchain) to let you ask questions of data in tabular format. For this demo application, we will use the Titanic Dataset. Please explore it [here](https://github.com/datasciencedojo/datasets/blob/master/titanic.csv) to get a sense for what questions you can ask. Please leave feedback on well the question is answered, and we will use that improve the application!" 27 | ) 28 | 29 | query_text = st.text_input("Enter your question:", placeholder="Who was in cabin C128?") 30 | # Form input and query 31 | result = None 32 | with st.form("myform", clear_on_submit=True): 33 | submitted = st.form_submit_button("Submit") 34 | if submitted: 35 | with st.spinner("Calculating..."): 36 | response = agent({"input": query_text}, include_run_info=True) 37 | result = response["output"] 38 | run_id = response["__run"].run_id 39 | if result is not None: 40 | st.info(result) 41 | col_blank, col_text, col1, col2 = st.columns([10, 2, 1, 1]) 42 | with col_text: 43 | st.text("Feedback:") 44 | with col1: 45 | st.button("👍", on_click=send_feedback, args=(run_id, 1)) 46 | with col2: 47 | st.button("👎", on_click=send_feedback, args=(run_id, 0)) 48 | -------------------------------------------------------------------------------- /archived/csv-qa/titanic_data/index.faiss: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/titanic_data/index.faiss -------------------------------------------------------------------------------- /archived/csv-qa/titanic_data/index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/csv-qa/titanic_data/index.pkl -------------------------------------------------------------------------------- /archived/csv-qa/upload_data.py: -------------------------------------------------------------------------------- 1 | from langsmith import Client 2 | 3 | if __name__ == "__main__": 4 | client = Client() 5 | dataset = client.upload_csv( 6 | csv_file="data.csv", 7 | input_keys=["input_question"], 8 | output_keys=["output_text"], 9 | name="Titanic CSV Data", 10 | description="QA over titanic data", 11 | data_type="kv", 12 | ) 13 | -------------------------------------------------------------------------------- /archived/extraction/oppenheimer_short.txt: -------------------------------------------------------------------------------- 1 | 'Julius Robert Oppenheimer, often known as Robert or "Oppie", is heralded as the father of the atomic bomb. Emerging from a non-practicing Jewish family in New York, he made several breakthroughs, such as the early black hole theory, before the monumental Manhattan Project. His wife, Katherine “Kitty” Oppenheimer, was a German-born woman with a complex past, including connections to the Communist Party. Oppenheimer\'s journey was beset by political adversaries, notably Lewis Strauss, chairman of the U.S. Atomic Energy Commission, and William Borden, an executive director with hawkish nuclear ambitions. These tensions culminated in the famous 1954 security hearing. Influential figures like lieutenant general Leslie Groves, who had also overseen the Pentagon\'s creation, stood by Oppenheimer\'s side, having earlier chosen him for the Manhattan Project and the Los Alamos location. 
Intimate relationships, like that with Jean Tatlock, a Communist and the possible muse behind the Trinity test\'s name, and colleagues like Frank, Oppenheimer\'s physicist brother, intertwined with his professional life. Scientists such as Ernest Lawrence, Edward Teller, David Hill, Richard Feynman, and Hans Bethe were some of Oppenheimer\'s contemporaries, each contributing to and contesting the atomic age\'s directions. Boris Pash\'s investigations, and the perspectives of figures like Leo Szilard, Niels Bohr, Harry Truman, and others, framed the broader sociopolitical context. Meanwhile, individuals like Robert Serber, Enrico Fermi, Albert Einstein, and Isidor Isaac Rabi, among many others, each played their parts in this narrative, from naming the atomic bombs to pivotal scientific contributions and advisory roles. All these figures, together with the backdrop of World War II, McCarthyism, and the dawn of the nuclear age, presented a complex mosaic of ambitions, loyalties, betrayals, and ideologies. -------------------------------------------------------------------------------- /archived/extraction/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain 2 | openai 3 | streamlit 4 | -------------------------------------------------------------------------------- /archived/extraction/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from langchain.chains import create_extraction_chain 3 | from langchain_openai import ChatOpenAI 4 | from langsmith import Client 5 | 6 | st.set_page_config(page_title="🦜🔗 Text-to-graph extraction") 7 | client = Client() 8 | 9 | 10 | def send_feedback(run_id, score): 11 | client.create_feedback(run_id, "user_score", score=score) 12 | 13 | 14 | st.title("🦜🔗 Text-to-graph playground") 15 | st.info( 16 | "This playground explores the use of [OpenAI functions](https://openai.com/blog/function-calling-and-other-api-updates) and [LangChain](https://github.com/langchain-ai/langchain) to build knowledge graphs from user-input text. It breaks down the user input text into knowledge graph triples of subject (primary entities or concepts in a sentence), predicate (actions or relationships that connect subjects to objects), and object (entities or concepts that interact with or are acted upon by the subjects)." 17 | ) 18 | 19 | # Input text (optional default) 20 | oppenheimer_text = """'Julius Robert Oppenheimer, often known as Robert or "Oppie", is heralded as the father of the atomic bomb. Emerging from a non-practicing Jewish family in New York, he made several breakthroughs, such as the early black hole theory, before the monumental Manhattan Project. His wife, Katherine “Kitty” Oppenheimer, was a German-born woman with a complex past, including connections to the Communist Party. Oppenheimer\'s journey was beset by political adversaries, notably Lewis Strauss, chairman of the U.S. Atomic Energy Commission, and William Borden, an executive director with hawkish nuclear ambitions. These tensions culminated in the famous 1954 security hearing. Influential figures like lieutenant general Leslie Groves, who had also overseen the Pentagon\'s creation, stood by Oppenheimer\'s side, having earlier chosen him for the Manhattan Project and the Los Alamos location. 
Intimate relationships, like that with Jean Tatlock, a Communist and the possible muse behind the Trinity test\'s name, and colleagues like Frank, Oppenheimer\'s physicist brother, intertwined with his professional life. Scientists such as Ernest Lawrence, Edward Teller, David Hill, Richard Feynman, and Hans Bethe were some of Oppenheimer\'s contemporaries, each contributing to and contesting the atomic age\'s directions. Boris Pash\'s investigations, and the perspectives of figures like Leo Szilard, Niels Bohr, Harry Truman, and others, framed the broader sociopolitical context. Meanwhile, individuals like Robert Serber, Enrico Fermi, Albert Einstein, and Isidor Isaac Rabi, among many others, each played their parts in this narrative, from naming the atomic bombs to pivotal scientific contributions and advisory roles. All these figures, together with the backdrop of World War II, McCarthyism, and the dawn of the nuclear age, presented a complex mosaic of ambitions, loyalties, betrayals, and ideologies.oppenheimer_short.txt""" 21 | 22 | # Knowledge triplet schema 23 | default_schema = { 24 | "properties": { 25 | "subject": {"type": "string"}, 26 | "predicate": {"type": "string"}, 27 | "object": {"type": "string"}, 28 | }, 29 | "required": ["subject", "predicate", "object"], 30 | } 31 | 32 | # Create a text_area, set the default value to oppenheimer_text 33 | MAX_CHARS = 2000 # Maximum number of characters 34 | user_input_text = st.text_area("Enter your text (<2000 characters):", height=200) 35 | if len(user_input_text) > MAX_CHARS: 36 | st.warning(f"Text is too long. Processing only the first {MAX_CHARS} characters") 37 | user_input_text = user_input_text[:MAX_CHARS] 38 | 39 | 40 | # Output formatting of triples 41 | def json_to_markdown_table(json_list): 42 | if not json_list: 43 | return "No data available." 
44 | 45 | # Extract headers 46 | headers = json_list[0].keys() 47 | markdown_table = " | ".join(headers) + "\n" 48 | markdown_table += " | ".join(["---"] * len(headers)) + "\n" 49 | 50 | # Extract rows 51 | for item in json_list: 52 | row = " | ".join([str(item[header]) for header in headers]) 53 | markdown_table += row + "\n" 54 | 55 | return markdown_table 56 | 57 | 58 | # Form input and query 59 | markdown_output = None 60 | with st.form("myform", clear_on_submit=True): 61 | submitted = st.form_submit_button("Submit") 62 | if submitted: 63 | with st.spinner("Calculating..."): 64 | llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo") 65 | chain = create_extraction_chain(default_schema, llm) 66 | extraction_output = chain(user_input_text, include_run_info=True) 67 | markdown_output = json_to_markdown_table(extraction_output["text"]) 68 | run_id = extraction_output["__run"].run_id 69 | 70 | # Feedback 71 | if markdown_output is not None: 72 | st.markdown(markdown_output) 73 | col_blank, col_text, col1, col2 = st.columns([10, 2, 1, 1]) 74 | with col_text: 75 | st.text("Feedback:") 76 | with col1: 77 | st.button("👍", on_click=send_feedback, args=(run_id, 1)) 78 | with col2: 79 | st.button("👎", on_click=send_feedback, args=(run_id, 0)) 80 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking on LangChain Docs 2 | 3 | This directory contains code to benchmark your cognitive architecture on the public [LangChain Q&A docs evaluation benchmark](https://smith.langchain.com/public/e1bfd348-494a-4df5-899a-7c6c09233cc4/d). 4 | 5 | To run one of the existing configurations, activate your poetry environment, configure your LangSmith API key, and run the experiments. 6 | 7 | **Note:** this will benchmark chains on a _copy_ of the dataset and will not update the public leaderboard. 8 | 9 | ## Running the published experiments 10 | 11 | The following steps will let you run pre-configured experiments: 12 | 13 | ### 1. Install requirements 14 | 15 | ```bash 16 | pip install poetry 17 | poetry shell 18 | poetry install 19 | ``` 20 | 21 | ### 2. Configure API keys 22 | 23 | Create a [LangSmith account](https://smith.langchain.com/) and set your API key: 24 | 25 | ```bash 26 | export LANGCHAIN_API_KEY=ls_your-api-key 27 | ``` 28 | 29 | The various cognitive architectures already implemented use Anthropic, [Fireworks.AI](https://www.fireworks.ai/), and OpenAI. Set the required API keys: 30 | 31 | ``` 32 | export OPENAI_API_KEY=your-api-key 33 | export ANTHROPIC_API_KEY=your-api-key 34 | export FIREWORKS_API_KEY=your-api-key 35 | ``` 36 | 37 | ### 3. Run Experiments 38 | 39 | To run all experiments, run: 40 | 41 | ```bash 42 | python run_experiments.py 43 | ``` 44 | 45 | If you want to run only certain experiments in the `run_experiments.py` file, use `--include` or `--exclude`. 46 | 47 | Example: 48 | 49 | ```bash 50 | python run_experiments.py --include mistral-7b-instruct-4k llama-v2-34b-code-instruct-w8a16 51 | ``` 52 | 53 | ## Evaluating your custom cognitive architecture 54 | 55 | You can also evaluate your own custom cognitive architecture. To do so: 56 | 57 | 1. Create a python file defining your architecture: 58 | 59 | ```python 60 | # example_custom_chain.py 61 | 62 | ... 63 | def load_runnable(config: dict) -> "Runnable": 64 | # Load based on the config provided 65 | return my_chain 66 | ``` 67 | 68 | 2. 
Call `run_experiments.py` with a custom `--config my_config.json` 69 | 70 | ```js 71 | { 72 | // This specifies the path to your custom entrypoint followed by the loader function 73 | "arch": "path/to/example_custom_chain.py::load_runnable", 74 | "model_config": { 75 | // This is passed to load_runnable() in example_custom_chain.py() 76 | "chat_cls": "ChatOpenAI", 77 | "model": "gpt-4" 78 | }, 79 | "project_name": "example-custom-code" // This is the resulting test project name 80 | } 81 | ``` 82 | 83 | We have provided an example in [example_custom_chain.py](./packages/example/custom_example/example_custom_chain.py), which can be run by pointing `run_experiments` to the [example_custom_config.json](./example_custom_config.json) config file: 84 | 85 | ```bash 86 | python run_experiments.py --config ./example_custom_config.json 87 | ``` 88 | 89 | Whenever you provide 1 or more `--config` files, the `--include` and `--exclude` arguments are ignored. 90 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/app/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/app/server.py: -------------------------------------------------------------------------------- 1 | from chat_langchain.chain import chain 2 | from fastapi import FastAPI 3 | from langserve import add_routes 4 | from openai_functions_agent import agent_executor as openai_functions_agent_chain 5 | 6 | app = FastAPI() 7 | 8 | # Edit this to add the chain you want to add 9 | add_routes( 10 | app, 11 | chain, 12 | path="/chat", 13 | # include_callback_events=True, # TODO: Include when fixed 14 | ) 15 | 16 | add_routes(app, openai_functions_agent_chain, path="/openai-functions-agent") 17 | 18 | 19 | def run_server(port: int = 1983): 20 | import uvicorn 21 | 22 | uvicorn.run(app, host="0.0.0.0", port=port) 23 | 24 | 25 | if __name__ == "__main__": 26 | run_server() 27 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/example_custom_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "arch": "packages/example/custom_example/example_custom_chain.py::create_runnable", 3 | "model_config": { 4 | "chat_cls": "ChatOpenAI", 5 | "model": "gpt-4" 6 | }, 7 | "project_name": "example-custom-code" 8 | } 9 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/README.md -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/README.md: -------------------------------------------------------------------------------- 1 | 2 | # anthropic-iterative-search 3 | 4 | This template will create a virtual research assistant with the ability to search Wikipedia to find answers to your questions. 
5 | 6 | It is heavily inspired by [this notebook](https://github.com/anthropics/anthropic-cookbook/blob/main/long_context/wikipedia-search-cookbook.ipynb). 7 | 8 | ## Environment Setup 9 | 10 | Set the `ANTHROPIC_API_KEY` environment variable to access the Anthropic models. 11 | 12 | ## Usage 13 | 14 | To use this package, you should first have the LangChain CLI installed: 15 | 16 | ```shell 17 | pip install -U "langchain-cli[serve]" 18 | ``` 19 | 20 | To create a new LangChain project and install this as the only package, you can do: 21 | 22 | ```shell 23 | langchain app new my-app --package anthropic-iterative-search 24 | ``` 25 | 26 | If you want to add this to an existing project, you can just run: 27 | 28 | ```shell 29 | langchain app add anthropic-iterative-search 30 | ``` 31 | 32 | And add the following code to your `server.py` file: 33 | ```python 34 | from anthropic_iterative_search import chain as anthropic_iterative_search_chain 35 | 36 | add_routes(app, anthropic_iterative_search_chain, path="/anthropic-iterative-search") 37 | ``` 38 | 39 | (Optional) Let's now configure LangSmith. 40 | LangSmith will help us trace, monitor and debug LangChain applications. 41 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 42 | If you don't have access, you can skip this section 43 | 44 | 45 | ```shell 46 | export LANGCHAIN_TRACING_V2=true 47 | export LANGCHAIN_API_KEY= 48 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 49 | ``` 50 | 51 | If you are inside this directory, then you can spin up a LangServe instance directly by: 52 | 53 | ```shell 54 | langchain serve 55 | ``` 56 | 57 | This will start the FastAPI app with a server is running locally at 58 | [http://localhost:8000](http://localhost:8000) 59 | 60 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 61 | We can access the playground at [http://127.0.0.1:8000/anthropic-iterative-search/playground](http://127.0.0.1:8000/anthropic-iterative-search/playground) 62 | 63 | We can access the template from code with: 64 | 65 | ```python 66 | from langserve.client import RemoteRunnable 67 | 68 | runnable = RemoteRunnable("http://localhost:8000/anthropic-iterative-search") 69 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain.schema.runnable import ConfigurableField 2 | 3 | from .chain import chain 4 | from .retriever_agent import executor 5 | 6 | final_chain = chain.configurable_alternatives( 7 | ConfigurableField(id="chain"), 8 | default_key="response", 9 | # This adds a new option, with name `openai` that is equal to `ChatOpenAI()` 10 | retrieve=executor, 11 | ) 12 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/agent_scratchpad.py: -------------------------------------------------------------------------------- 1 | def _format_docs(docs): 2 | result = "\n".join( 3 | [ 4 | f'\n\n{r}\n\n' 5 | for i, r in enumerate(docs) 6 | ] 7 | ) 8 | return result 9 | 10 | 11 | def format_agent_scratchpad(intermediate_steps): 12 | thoughts = "" 13 | for action, observation in intermediate_steps: 14 | thoughts += action.log 15 | thoughts += "" + _format_docs(observation) 16 | 
return thoughts 17 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/chain.py: -------------------------------------------------------------------------------- 1 | from langchain.chat_models import ChatAnthropic 2 | from langchain.prompts import ChatPromptTemplate 3 | from langchain.schema.output_parser import StrOutputParser 4 | from langchain.schema.runnable import RunnableLambda 5 | from pydantic import BaseModel 6 | 7 | from .prompts import answer_prompt 8 | from .retriever_agent import executor 9 | 10 | prompt = ChatPromptTemplate.from_template(answer_prompt) 11 | 12 | model = ChatAnthropic(model="claude-2", temperature=0, max_tokens_to_sample=1000) 13 | 14 | chain = ( 15 | RunnableLambda(lambda x: {"query": x["question"]}) 16 | | {"query": lambda x: x["query"], "information": executor | (lambda x: x["output"])} 17 | | prompt 18 | | model 19 | | StrOutputParser() 20 | ) 21 | 22 | # Add typing for the inputs to be used in the playground 23 | 24 | 25 | class Inputs(BaseModel): 26 | question: str 27 | 28 | 29 | chain = chain.with_types(input_type=Inputs) 30 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/output_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from langchain.schema.agent import AgentAction, AgentFinish 4 | 5 | from .agent_scratchpad import _format_docs 6 | 7 | 8 | def extract_between_tags(tag: str, string: str, strip: bool = True) -> str: 9 | ext_list = re.findall(f"<{tag}\s?>(.+?)", string, re.DOTALL) 10 | if strip: 11 | ext_list = [e.strip() for e in ext_list] 12 | if ext_list: 13 | if len(ext_list) != 1: 14 | raise ValueError 15 | # Only return the first one 16 | return ext_list[0] 17 | 18 | 19 | def parse_output(outputs): 20 | partial_completion = outputs["partial_completion"] 21 | steps = outputs["intermediate_steps"] 22 | search_query = extract_between_tags( 23 | "search_query", partial_completion + "" 24 | ) 25 | if search_query is None: 26 | docs = [] 27 | str_output = "" 28 | for action, observation in steps: 29 | docs.extend(observation) 30 | str_output += action.log 31 | str_output += "" + _format_docs(observation) 32 | str_output += partial_completion 33 | return AgentFinish({"docs": docs, "output": str_output}, log=partial_completion) 34 | else: 35 | return AgentAction( 36 | tool="search", tool_input=search_query, log=partial_completion 37 | ) 38 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/prompts.py: -------------------------------------------------------------------------------- 1 | retrieval_prompt = """{retriever_description} Before beginning to research the user's question, first think for a moment inside tags about what information is necessary for a well-informed answer. If the user's question is complex, you may need to decompose the query into multiple subqueries and execute them individually. Sometimes the search engine will return empty search results, or the search results may not contain the information you need. In such cases, feel free to try again with a different query. 
2 | 3 | After each call to the Search Engine Tool, reflect briefly inside tags about whether you now have enough information to answer, or whether more information is needed. If you have all the relevant information, write it in tags, WITHOUT actually answering the question. Otherwise, issue a new search. 4 | 5 | Here is the user's question: {query} Remind yourself to make short queries in your scratchpad as you plan out your strategy.""" # noqa: E501 6 | 7 | answer_prompt = "Here is a user query: {query}. Here is some relevant information: {information}. Please answer the question using the relevant information." # noqa: E501 8 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/retriever.py: -------------------------------------------------------------------------------- 1 | from langchain.tools import tool 2 | from langchain_docs_retriever.retriever import get_retriever 3 | 4 | # This is used to tell the model how to best use the retriever. 5 | 6 | retriever_description = """You will be asked a question by a human user. You have access to the following tool to help answer the question. Search Engine Tool * The search engine will exclusively search over the LangChain documentation for pages similar to your query. It returns for each page its title and full page content. Use this tool if you want to get up-to-date and comprehensive information on a topic to help answer queries. Queries should be as atomic as possible -- they only need to address one part of the user's question. For example, if the user's query is "what is the color of a basketball?", your search query should be "basketball". Here's another example: if the user's question is "Who created the first neural network?", your first query should be "neural network". As you can see, these queries are quite short. Think keywords, not phrases. * At any time, you can make a call to the search engine using the following syntax: query_word. 
* You'll then get results back in tags.""" # noqa: E501 7 | 8 | retriever = get_retriever() 9 | 10 | # This should be the same as the function name below 11 | RETRIEVER_TOOL_NAME = "search" 12 | 13 | 14 | @tool 15 | def search(query, callbacks=None): 16 | """Search the LangChain docs with the retriever.""" 17 | return retriever.get_relevant_documents(query, callbacks=callbacks) 18 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/anthropic_iterative_search/retriever_agent.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import AgentExecutor 2 | from langchain.chat_models import ChatAnthropic 3 | from langchain.prompts import ChatPromptTemplate 4 | from langchain.schema.output_parser import StrOutputParser 5 | from langchain.schema.runnable import RunnableMap, RunnablePassthrough 6 | 7 | from .agent_scratchpad import format_agent_scratchpad 8 | from .output_parser import parse_output 9 | from .prompts import retrieval_prompt 10 | from .retriever import retriever_description, search 11 | 12 | prompt = ChatPromptTemplate.from_messages( 13 | [ 14 | ("user", retrieval_prompt), 15 | ("ai", "{agent_scratchpad}"), 16 | ] 17 | ) 18 | prompt = prompt.partial(retriever_description=retriever_description) 19 | 20 | model = ChatAnthropic(model="claude-2", temperature=0, max_tokens_to_sample=1000) 21 | 22 | chain = ( 23 | RunnablePassthrough.assign( 24 | agent_scratchpad=lambda x: format_agent_scratchpad(x["intermediate_steps"]) 25 | ) 26 | | prompt 27 | | model.bind(stop_sequences=[""]) 28 | | StrOutputParser() 29 | ) 30 | 31 | agent_chain = ( 32 | RunnableMap( 33 | { 34 | "partial_completion": chain, 35 | "intermediate_steps": lambda x: x["intermediate_steps"], 36 | } 37 | ) 38 | | parse_output 39 | ) 40 | 41 | executor = AgentExecutor(agent=agent_chain, tools=[search]) 42 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/main.py: -------------------------------------------------------------------------------- 1 | from anthropic_iterative_search import final_chain 2 | 3 | if __name__ == "__main__": 4 | query = ( 5 | "Which movie came out first: Oppenheimer, or " 6 | "Are You There God It's Me Margaret?" 
7 | ) 8 | print( 9 | final_chain.with_config(configurable={"chain": "retrieve"}).invoke( 10 | {"query": query} 11 | ) 12 | ) 13 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "anthropic-iterative-search" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.8.1,<4.0" 10 | langchain = ">=0.0.331,<0.1.0" 11 | anthropic = "^0.5.0" 12 | wikipedia = "^1.4.0" 13 | 14 | [tool.langserve] 15 | export_module = "anthropic_iterative_search" 16 | export_attr = "final_chain" 17 | 18 | [build-system] 19 | requires = [ 20 | "poetry-core", 21 | ] 22 | build-backend = "poetry.core.masonry.api" 23 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/anthropic-iterative-search/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 LangChain, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/README.md: -------------------------------------------------------------------------------- 1 | # chat-langchain 2 | 3 | TODO: What does this package do 4 | 5 | ## Environment Setup 6 | 7 | TODO: What environment variables need to be set (if any) 8 | 9 | ## Usage 10 | 11 | To use this package, you should first have the LangChain CLI installed: 12 | 13 | ```shell 14 | pip install -U "langchain-cli[serve]" 15 | ``` 16 | 17 | To create a new LangChain project and install this as the only package, you can do: 18 | 19 | ```shell 20 | langchain app new my-app --package chat-langchain 21 | ``` 22 | 23 | If you want to add this to an existing project, you can just run: 24 | 25 | ```shell 26 | langchain app add chat-langchain 27 | ``` 28 | 29 | And add the following code to your `server.py` file: 30 | ```python 31 | from chat_langchain import chain as chat_langchain_chain 32 | 33 | add_routes(app, chat_langchain_chain, path="/chat-langchain") 34 | ``` 35 | 36 | (Optional) Let's now configure LangSmith. 37 | LangSmith will help us trace, monitor and debug LangChain applications. 38 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 39 | If you don't have access, you can skip this section 40 | 41 | 42 | ```shell 43 | export LANGCHAIN_TRACING_V2=true 44 | export LANGCHAIN_API_KEY= 45 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 46 | ``` 47 | 48 | If you are inside this directory, then you can spin up a LangServe instance directly by: 49 | 50 | ```shell 51 | langchain serve 52 | ``` 53 | 54 | This will start the FastAPI app with a server is running locally at 55 | [http://localhost:8000](http://localhost:8000) 56 | 57 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 58 | We can access the playground at [http://127.0.0.1:8000/chat-langchain/playground](http://127.0.0.1:8000/chat-langchain/playground) 59 | 60 | We can access the template from code with: 61 | 62 | ```python 63 | from langserve.client import RemoteRunnable 64 | 65 | runnable = RemoteRunnable("http://localhost:8000/chat-langchain") 66 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/chat_langchain/__init__.py: -------------------------------------------------------------------------------- 1 | from chat_langchain.chain import chain 2 | 3 | __all__ = ["chain"] 4 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "chat-langchain" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | openai = ">1,<2" 10 | python = "^3.10" 11 | fastapi = "^0.104.1" 12 | pydantic = "1.10" 13 | langchain = ">=0.0.327,<0.1.0" 14 | uvicorn = "^0.23.2" 15 | beautifulsoup4 = "^4.12.2" 16 | tiktoken = "^0.4.0" 17 | weaviate-client = "^3.23.2" 18 | psycopg2 = "^2.9.7" 19 | lxml = "^4.9.3" 20 | langserve = {extras = ["server"], version = ">=0.0.21,<0.1.0"} 21 | anthropic = "^0.5.0" 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | langchain-cli = ">=0.0.4" 25 | fastapi = "^0.104.0" 26 | sse-starlette = "^1.6.5" 27 | 28 | 
[tool.langserve] 29 | export_module = "chat_langchain" 30 | export_attr = "chain" 31 | 32 | [build-system] 33 | requires = ["poetry-core"] 34 | build-backend = "poetry.core.masonry.api" 35 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/chat-langchain/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/chat-langchain/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/example/custom_example/example_custom_chain.py: -------------------------------------------------------------------------------- 1 | from langchain.chat_models import ChatAnthropic, ChatOpenAI 2 | from langchain.prompts import ChatPromptTemplate 3 | from langchain.schema.output_parser import StrOutputParser 4 | from langchain_docs_retriever.retriever import get_retriever 5 | 6 | 7 | def create_runnable(config: dict): 8 | config_copy = config.copy() 9 | chat_cls_name = config_copy.pop("chat_cls", "ChatOpenAI") 10 | 11 | assert chat_cls_name in {"ChatOpenAI", "ChatAnthropic"} 12 | chat_cls = { 13 | "ChatOpenAI": ChatOpenAI, 14 | "ChatAnthropic": ChatAnthropic, 15 | }[chat_cls_name] 16 | model = chat_cls(**config_copy) 17 | retriever = get_retriever(config.get("retriever_config", {})) 18 | prompt = ChatPromptTemplate.from_messages( 19 | [ 20 | ("system", "Answer the Q using the following docs\n{docs}"), 21 | ("user", "Q: {question}"), 22 | ] 23 | ) 24 | return ( 25 | { 26 | "question": lambda x: x["question"], 27 | "docs": (lambda x: x["question"]) | retriever, 28 | } 29 | | prompt 30 | | model 31 | | StrOutputParser() 32 | ) 33 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/README.md: -------------------------------------------------------------------------------- 1 | # LangChain Docs Retriever 2 | 3 | 4 | A simple vector store retriever over the LangChain python docs. Indexed 5 | simply using [ingest_docs.py](./ingest_docs.py). 
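A minimal usage sketch (illustrative only — it assumes this package is installed, `OPENAI_API_KEY` is available for the default embeddings, and the query string is made up):

```python
# Illustrative sketch of querying the docs retriever; not part of the package.
# The first call downloads the prebuilt Chroma DB from GCS if it is missing.
from langchain_docs_retriever.retriever import get_retriever

retriever = get_retriever()  # defaults to search_kwargs={"k": 6}
docs = retriever.get_relevant_documents("How do I stream output from a runnable?")
for doc in docs:
    # Each result is a LangChain Document with page_content and metadata.
    print(doc.metadata.get("source"), doc.page_content[:80])
```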
6 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/download_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | import requests 5 | 6 | remote_url = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/chroma_db.zip" 7 | directory = os.path.dirname(os.path.realpath(__file__)) 8 | db_directory = os.path.join(directory, "db") 9 | 10 | 11 | def is_folder_populated(folder): 12 | if os.path.exists(folder): 13 | return any(os.scandir(folder)) 14 | return False 15 | 16 | 17 | def download_folder_from_gcs(): 18 | r = requests.get(remote_url, allow_redirects=True) 19 | open("chroma_db.zip", "wb").write(r.content) 20 | 21 | with zipfile.ZipFile("chroma_db.zip", "r") as zip_ref: 22 | zip_ref.extractall(directory) 23 | 24 | os.remove("chroma_db.zip") 25 | 26 | 27 | def fetch_langchain_docs_db(): 28 | if not is_folder_populated(db_directory): 29 | print(f"Folder {db_directory} is not populated. Downloading from GCS...") 30 | download_folder_from_gcs() 31 | 32 | 33 | if __name__ == "__main__": 34 | fetch_langchain_docs_db() 35 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/langchain_docs_retriever/retriever.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | from langchain.embeddings import OpenAIEmbeddings 5 | 6 | # from langchain_docs_retriever.voyage import VoyageEmbeddings 7 | from langchain.embeddings.voyageai import VoyageEmbeddings 8 | from langchain.schema.embeddings import Embeddings 9 | from langchain.schema.retriever import BaseRetriever 10 | from langchain.vectorstores.chroma import Chroma 11 | 12 | from .download_db import fetch_langchain_docs_db 13 | 14 | WEAVIATE_DOCS_INDEX_NAME = "LangChain_agent_docs" 15 | _DIRECTORY = os.path.dirname(os.path.abspath(__file__)) 16 | CHROMA_COLLECTION_NAME = "langchain-docs" 17 | _DB_DIRECTORY = os.path.join(_DIRECTORY, "db") 18 | 19 | 20 | def get_embeddings_model() -> Embeddings: 21 | if os.environ.get("VOYAGE_AI_MODEL"): 22 | return VoyageEmbeddings(model=os.environ["VOYAGE_AI_MODEL"], max_retries=20) 23 | return OpenAIEmbeddings(chunk_size=200) 24 | 25 | 26 | def get_retriever(search_kwargs: Optional[dict] = None) -> BaseRetriever: 27 | embedding_model = get_embeddings_model() 28 | fetch_langchain_docs_db() 29 | vectorstore = Chroma( 30 | collection_name=CHROMA_COLLECTION_NAME, 31 | embedding_function=embedding_model, 32 | persist_directory=_DB_DIRECTORY, 33 | ) 34 | search_kwargs = search_kwargs or dict(k=6) 35 | return vectorstore.as_retriever(search_kwargs=search_kwargs) 36 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/langchain-docs-retriever/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langchain-docs-retriever" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | fastapi = "^0.104.1" 11 | pydantic = "1.10" 12 | langchain = ">=0.0.331,<0.1.0" 13 | uvicorn = "^0.23.2" 14 | openai = ">1,<2" 15 | psycopg2 = "^2.9.7" 16 | lxml = "^4.9.3" 17 | langserve = {extras = ["server"], version = ">=0.0.23,<0.1.0"} 18 | chromadb = "^0.4.15" 19 | 20 | [tool.poetry.group.dev.dependencies] 21 | langchain-cli = ">=0.0.4" 22 | fastapi = "^0.104.0" 23 | sse-starlette = "^1.6.5" 24 | 25 | [tool.langserve] 26 | export_module = "chat_langchain" 27 | export_attr = "chain" 28 | 29 | [build-system] 30 | requires = ["poetry-core"] 31 | build-backend = "poetry.core.masonry.api" 32 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 LangChain, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/README.md: -------------------------------------------------------------------------------- 1 | # oai-assistant 2 | 3 | TODO: What does this package do 4 | 5 | ## Environment Setup 6 | 7 | TODO: What environment variables need to be set (if any) 8 | 9 | ## Usage 10 | 11 | To use this package, you should first have the LangChain CLI installed: 12 | 13 | ```shell 14 | pip install -U "langchain-cli[serve]" 15 | ``` 16 | 17 | To create a new LangChain project and install this as the only package, you can do: 18 | 19 | ```shell 20 | langchain app new my-app --package oai-assistant 21 | ``` 22 | 23 | If you want to add this to an existing project, you can just run: 24 | 25 | ```shell 26 | langchain app add oai-assistant 27 | ``` 28 | 29 | And add the following code to your `server.py` file: 30 | ```python 31 | from oai_assistant import chain as oai_assistant_chain 32 | 33 | add_routes(app, oai_assistant_chain, path="/oai-assistant") 34 | ``` 35 | 36 | (Optional) Let's now configure LangSmith. 37 | LangSmith will help us trace, monitor and debug LangChain applications. 
38 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 39 | If you don't have access, you can skip this section 40 | 41 | 42 | ```shell 43 | export LANGCHAIN_TRACING_V2=true 44 | export LANGCHAIN_API_KEY= 45 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 46 | ``` 47 | 48 | If you are inside this directory, then you can spin up a LangServe instance directly by: 49 | 50 | ```shell 51 | langchain serve 52 | ``` 53 | 54 | This will start the FastAPI app with a server is running locally at 55 | [http://localhost:8000](http://localhost:8000) 56 | 57 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 58 | We can access the playground at [http://127.0.0.1:8000/oai-assistant/playground](http://127.0.0.1:8000/oai-assistant/playground) 59 | 60 | We can access the template from code with: 61 | 62 | ```python 63 | from langserve.client import RemoteRunnable 64 | 65 | runnable = RemoteRunnable("http://localhost:8000/oai-assistant") 66 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/oai_assistant/__init__.py: -------------------------------------------------------------------------------- 1 | from oai_assistant.chain import agent_executor 2 | 3 | __all__ = ["agent_executor"] 4 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/oai_assistant/chain.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from langchain.agents import AgentExecutor 4 | from langchain.tools import tool 5 | from langchain_docs_retriever.retriever import get_retriever 6 | from langchain_experimental.openai_assistant import OpenAIAssistantRunnable 7 | 8 | # This is used to tell the model how to best use the retriever. 
9 | 10 | 11 | _RETRIEVER = get_retriever() 12 | 13 | 14 | @tool 15 | def search(query, callbacks=None) -> str: 16 | """Search the LangChain docs with the retriever.""" 17 | docs = _RETRIEVER.get_relevant_documents(query, callbacks=callbacks) 18 | return json.dumps([doc.dict() for doc in docs]) 19 | 20 | 21 | tools = [search] 22 | 23 | agent = OpenAIAssistantRunnable.create_assistant( 24 | name="langchain docs assistant", 25 | instructions="You are a helpful assistant tasked with answering technical questions about LangChain.", 26 | tools=tools, 27 | model="gpt-4-1106-preview", 28 | as_agent=True, 29 | ) 30 | 31 | 32 | agent_executor = ( 33 | (lambda x: {"content": x["question"]}) 34 | | AgentExecutor(agent=agent, tools=tools) 35 | | (lambda x: x["output"]) 36 | ) 37 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "oai-assistant" 3 | version = "0.0.1" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.8.1,<4.0" 10 | langchain = ">=0.0.332,<0.1.0" 11 | openai = ">1,<2" 12 | langchain-experimental = "^0.0.38" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | langchain-cli = ">=0.0.4" 16 | fastapi = "^0.104.0" 17 | sse-starlette = "^1.6.5" 18 | 19 | [tool.langserve] 20 | export_module = "oai_assistant" 21 | export_attr = "agent_executor" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/oai-assistant/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/oai-assistant/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 LangChain, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/README.md: -------------------------------------------------------------------------------- 1 | 2 | # openai-functions-agent 3 | 4 | This template creates an agent that uses OpenAI function calling to communicate its decisions on what actions to take. 5 | 6 | This example creates an agent that can optionally look up information on the internet using Tavily's search engine. 7 | 8 | ## Environment Setup 9 | 10 | The following environment variables need to be set: 11 | 12 | Set the `OPENAI_API_KEY` environment variable to access the OpenAI models. 13 | 14 | Set the `TAVILY_API_KEY` environment variable to access Tavily. 15 | 16 | ## Usage 17 | 18 | To use this package, you should first have the LangChain CLI installed: 19 | 20 | ```shell 21 | pip install -U "langchain-cli[serve]" 22 | ``` 23 | 24 | To create a new LangChain project and install this as the only package, you can do: 25 | 26 | ```shell 27 | langchain app new my-app --package openai-functions-agent 28 | ``` 29 | 30 | If you want to add this to an existing project, you can just run: 31 | 32 | ```shell 33 | langchain app add openai-functions-agent 34 | ``` 35 | 36 | And add the following code to your `server.py` file: 37 | ```python 38 | from openai_functions_agent import chain as openai_functions_agent_chain 39 | 40 | add_routes(app, openai_functions_agent_chain, path="/openai-functions-agent") 41 | ``` 42 | 43 | (Optional) Let's now configure LangSmith. 44 | LangSmith will help us trace, monitor and debug LangChain applications. 45 | LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). 46 | If you don't have access, you can skip this section 47 | 48 | ```shell 49 | export LANGCHAIN_TRACING_V2=true 50 | export LANGCHAIN_API_KEY= 51 | export LANGCHAIN_PROJECT= # if not specified, defaults to "default" 52 | ``` 53 | 54 | If you are inside this directory, then you can spin up a LangServe instance directly by: 55 | 56 | ```shell 57 | langchain serve 58 | ``` 59 | 60 | This will start the FastAPI app with a server is running locally at 61 | [http://localhost:8000](http://localhost:8000) 62 | 63 | We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs) 64 | We can access the playground at [http://127.0.0.1:8000/openai-functions-agent/playground](http://127.0.0.1:8000/openai-functions-agent/playground) 65 | 66 | We can access the template from code with: 67 | 68 | ```python 69 | from langserve.client import RemoteRunnable 70 | 71 | runnable = RemoteRunnable("http://localhost:8000/openai-functions-agent") 72 | ``` -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/main.py: -------------------------------------------------------------------------------- 1 | from openai_functions_agent.agent import agent_executor 2 | 3 | if __name__ == "__main__": 4 | question = "who won the womens world cup in 2023?" 
5 | print(agent_executor.invoke({"input": question, "chat_history": []})) 6 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from openai_functions_agent.agent import agent_executor 2 | 3 | __all__ = ["agent_executor"] 4 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/openai_functions_agent/agent.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from langchain.agents import AgentExecutor 4 | from langchain.agents.format_scratchpad import format_to_openai_functions 5 | from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser 6 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 7 | from langchain.schema.messages import AIMessage, HumanMessage 8 | from langchain.tools import tool 9 | from langchain.tools.render import format_tool_to_openai_function 10 | from langchain_docs_retriever.retriever import get_retriever 11 | from langchain_openai import ChatOpenAI 12 | from pydantic import BaseModel, Field 13 | 14 | # This is used to tell the model how to best use the retriever. 15 | 16 | 17 | _RETRIEVER = get_retriever() 18 | 19 | 20 | @tool 21 | def search(query, callbacks=None): 22 | """Search the LangChain docs with the retriever.""" 23 | return _RETRIEVER.get_relevant_documents(query, callbacks=callbacks) 24 | 25 | 26 | tools = [search] 27 | 28 | llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0) 29 | assistant_system_message = """You are a helpful assistant tasked with answering technical questions about LangChain. \ 30 | Use tools (only if necessary) to best answer the users questions. 
Do not make up information if you cannot find the answer using your tools.""" 31 | prompt = ChatPromptTemplate.from_messages( 32 | [ 33 | ("system", assistant_system_message), 34 | MessagesPlaceholder(variable_name="chat_history"), 35 | ("user", "{input}"), 36 | MessagesPlaceholder(variable_name="agent_scratchpad"), 37 | ] 38 | ) 39 | 40 | llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools]) 41 | 42 | 43 | def _format_chat_history(chat_history: List[Tuple[str, str]]): 44 | buffer = [] 45 | for human, ai in chat_history: 46 | buffer.append(HumanMessage(content=human)) 47 | buffer.append(AIMessage(content=ai)) 48 | return buffer 49 | 50 | 51 | agent = ( 52 | { 53 | "input": lambda x: x["input"], 54 | "chat_history": lambda x: _format_chat_history(x["chat_history"]), 55 | "agent_scratchpad": lambda x: format_to_openai_functions( 56 | x["intermediate_steps"] 57 | ), 58 | } 59 | | prompt 60 | | llm_with_tools 61 | | OpenAIFunctionsAgentOutputParser() 62 | ) 63 | 64 | 65 | class AgentInput(BaseModel): 66 | input: str 67 | chat_history: List[Tuple[str, str]] = Field(..., extra={"widget": {"type": "chat"}}) 68 | 69 | 70 | agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False).with_types( 71 | input_type=AgentInput 72 | ) 73 | 74 | 75 | class ChainInput(BaseModel): 76 | question: str 77 | 78 | 79 | def mapper(input: dict): 80 | return {"input": input["question"], "chat_history": []} 81 | 82 | 83 | agent_executor = (mapper | agent_executor | (lambda x: x["output"])).with_types( 84 | input_type=ChainInput 85 | ) 86 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "openai-functions-agent" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [ 6 | "Lance Martin ", 7 | ] 8 | readme = "README.md" 9 | 10 | [tool.poetry.dependencies] 11 | python = ">=3.8.1,<4.0" 12 | langchain = ">=0.0.327,<0.1.0" 13 | openai = ">=0.5.0" 14 | tavily-python = "^0.1.9" 15 | 16 | [tool.langserve] 17 | export_module = "openai_functions_agent" 18 | export_attr = "agent_executor" 19 | 20 | [build-system] 21 | requires = [ 22 | "poetry-core", 23 | ] 24 | build-backend = "poetry.core.masonry.api" 25 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/packages/openai-functions-agent/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/langchain-docs-benchmarking/packages/openai-functions-agent/tests/__init__.py -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | """Copy the public dataset to your own langsmith tenant.""" 2 | from typing import Optional 3 | 4 | from langsmith import Client 5 | 6 | DATASET_NAME = "LangChain Docs Q&A" 7 | PUBLIC_DATASET_TOKEN = "452ccafc-18e1-4314-885b-edd735f17b9d" 8 | 9 | 10 | def create_langchain_docs_dataset( 11 | dataset_name: str = DATASET_NAME, 12 | public_dataset_token: str = PUBLIC_DATASET_TOKEN, 13 | client: Optional[Client] = None, 14 | ): 15 | shared_client = Client( 16 | 
api_url="https://api.smith.langchain.com", api_key="placeholder" 17 | ) 18 | examples = list(shared_client.list_shared_examples(public_dataset_token)) 19 | client = client or Client() 20 | if client.has_dataset(dataset_name=dataset_name): 21 | loaded_examples = list(client.list_examples(dataset_name=dataset_name)) 22 | if len(loaded_examples) == len(examples): 23 | return 24 | else: 25 | ds = client.read_dataset(dataset_name=dataset_name) 26 | else: 27 | ds = client.create_dataset(dataset_name=dataset_name) 28 | client.create_examples( 29 | inputs=[e.inputs for e in examples], 30 | outputs=[e.outputs for e in examples], 31 | dataset_id=ds.id, 32 | ) 33 | print("Done creating dataset.") 34 | 35 | 36 | if __name__ == "__main__": 37 | import argparse 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--target-api-key", type=str, required=False) 41 | parser.add_argument("--target-endpoint", type=str, required=False) 42 | parser.add_argument("--dataset-name", type=str, default=DATASET_NAME) 43 | parser.add_argument( 44 | "--public-dataset-token", type=str, default=PUBLIC_DATASET_TOKEN 45 | ) 46 | args = parser.parse_args() 47 | client = None 48 | if args.target_api_key or args.target_endpoint: 49 | client = Client( 50 | api_key=args.target_api_key, 51 | api_url=args.target_endpoint, 52 | ) 53 | create_langchain_docs_dataset( 54 | dataset_name=args.dataset_name, 55 | public_dataset_token=args.public_dataset_token, 56 | client=client, 57 | ) 58 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langservehub-template" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Your Name "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | langsmith = ">=0.0.64,<0.1.0" 11 | sse-starlette = "^1.6.5" 12 | tomli-w = "^1.0.0" 13 | uvicorn = "^0.23.2" 14 | fastapi = "^0.104" 15 | langserve = ">=0.0.16" 16 | chat-langchain = {path = "packages/chat-langchain", develop = true} 17 | langchain-docs-retriever = {path = "packages/langchain-docs-retriever", develop = true} 18 | anthropic-iterative-search = {path = "packages/anthropic-iterative-search", develop = true} 19 | oai-assistant = {path = "packages/oai-assistant", develop = true} 20 | openai-functions-agent = {path = "packages/openai-functions-agent", develop = true} 21 | 22 | [tool.poetry.group.dev.dependencies] 23 | uvicorn = "^0.23.2" 24 | pygithub = "^2.1.1" 25 | 26 | 27 | [build-system] 28 | requires = ["poetry-core"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/run_evals.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib.util 3 | import sys 4 | import uuid 5 | from functools import partial 6 | from typing import Callable, Optional 7 | 8 | from anthropic_iterative_search.chain import chain as anthropic_agent_chain 9 | from chat_langchain.chain import create_chain 10 | from langchain.schema.runnable import Runnable 11 | from langchain.smith import RunEvalConfig, run_on_dataset 12 | from langchain_openai import ChatOpenAI 13 | from langsmith import Client 14 | from oai_assistant.chain import agent_executor as openai_assistant_chain 15 | from openai_functions_agent import agent_executor as 
openai_functions_agent_chain 16 | 17 | ls_client = Client() 18 | 19 | 20 | def import_from_path(path_name: str): 21 | func_name = "create_chain" 22 | if "::" in path_name: 23 | path_name, func_name = path_name.split("::") 24 | spec = importlib.util.spec_from_file_location("module_name", path_name) 25 | module = importlib.util.module_from_spec(spec) 26 | sys.modules["module_name"] = module 27 | spec.loader.exec_module(module) 28 | return getattr(module, func_name) 29 | 30 | 31 | def _get_chain_factory(arch: str) -> Callable: 32 | _map = { 33 | "chat": create_chain, 34 | "anthropic-iterative-search": lambda _: anthropic_agent_chain, 35 | "openai-functions-agent": lambda _: openai_functions_agent_chain, 36 | "openai-assistant": lambda _: openai_assistant_chain, 37 | } 38 | if arch in _map: 39 | return _map[arch] 40 | else: 41 | return import_from_path(arch) 42 | 43 | 44 | def create_runnable( 45 | arch: str, model_config: Optional[dict], retry_config: Optional[dict] = None 46 | ): 47 | factory = _get_chain_factory(arch) 48 | chain: Runnable = factory(model_config) 49 | if retry_config: 50 | return chain.with_retry(**retry_config) 51 | return chain 52 | 53 | 54 | def get_eval_config(): 55 | accuracy_criteria = { 56 | "accuracy": """ 57 | Score 1: The answer is incorrect and unrelated to the question or reference document. 58 | Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect. 59 | Score 5: The answer is partially correct but has significant errors or omissions. 60 | Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document. 61 | Score 10: The answer is correct, complete, and perfectly aligns with the reference document. 62 | 63 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 64 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct. 65 | """ # noqa 66 | } 67 | 68 | eval_llm = ChatOpenAI(model="gpt-4", temperature=0.0) 69 | return RunEvalConfig( 70 | evaluators=[ 71 | RunEvalConfig.LabeledScoreString( 72 | criteria=accuracy_criteria, llm=eval_llm, normalize_by=10.0 73 | ), 74 | # Mainly to compare with the above 75 | # Suspected to be less reliable. 
76 | RunEvalConfig.EmbeddingDistance(), 77 | ] 78 | ) 79 | 80 | 81 | def main( 82 | arch: str, 83 | dataset_name: str, 84 | model_config: Optional[dict] = None, 85 | max_concurrency: int = 5, 86 | project_name: Optional[str] = None, 87 | retry_config: Optional[dict] = None, 88 | ): 89 | eval_config = get_eval_config() 90 | project_name = project_name or arch 91 | project_name += f" {uuid.uuid4().hex[:4]}" 92 | run_on_dataset( 93 | client=ls_client, 94 | dataset_name=dataset_name, 95 | llm_or_chain_factory=partial( 96 | create_runnable, 97 | arch=arch, 98 | model_config=model_config, 99 | retry_config=retry_config, 100 | ), 101 | evaluation=eval_config, 102 | concurrency_level=max_concurrency, 103 | project_name=project_name, 104 | project_metadata={"arch": arch, "model_config": model_config}, 105 | ) 106 | 107 | 108 | if __name__ == "__main__": 109 | parser = argparse.ArgumentParser() 110 | parser.add_argument("--url", type=str) 111 | parser.add_argument("--dataset-name", type=str, default="Chat Langchain Pub") 112 | parser.add_argument("--project-name", type=Optional[str], default=None) 113 | parser.add_argument("--max-concurrency", type=int, default=5) 114 | args = parser.parse_args() 115 | main( 116 | args.url, 117 | args.dataset_name, 118 | max_concurrency=args.max_concurrency, 119 | project_name=args.project_name, 120 | ) 121 | -------------------------------------------------------------------------------- /archived/langchain-docs-benchmarking/run_experiments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from prepare_dataset import create_langchain_docs_dataset 5 | from run_evals import main 6 | 7 | experiments = [ 8 | { 9 | # "server_url": "http://localhost:1983/openai-functions-agent", 10 | "arch": "openai-functions-agent", 11 | "project_name": "openai-functions-agent", 12 | }, 13 | { 14 | # "server_url": "http://localhost:1983/anthropic_chat", 15 | "arch": "chat", 16 | "model_config": { 17 | "chat_cls": "ChatAnthropic", 18 | "model": "claude-2", 19 | "temperature": 1.0, 20 | }, 21 | "project_name": "anthropic-chat", 22 | }, 23 | { 24 | "arch": "chat", 25 | "model_config": { 26 | "chat_cls": "ChatOpenAI", 27 | "model": "gpt-3.5-turbo-16k", 28 | }, 29 | # "server_url": "http://localhost:1983/chat", 30 | "project_name": "chat-gpt-3.5", 31 | }, 32 | { 33 | "arch": "chat", 34 | "model_config": { 35 | "chat_cls": "ChatFireworks", 36 | "model": "accounts/fireworks/models/mistral-7b-instruct-4k", 37 | }, 38 | "project_name": "mistral-7b-instruct-4k", 39 | }, 40 | { 41 | "arch": "chat", 42 | "model_config": { 43 | "chat_cls": "ChatFireworks", 44 | "model": "accounts/fireworks/models/llama-v2-34b-code-instruct-w8a16", 45 | }, 46 | "project_name": "llama-v2-34b-code-instruct-w8a16", 47 | }, 48 | { 49 | "arch": "chat", 50 | "model_config": { 51 | "chat_cls": "ChatFireworks", 52 | "model": "accounts/fireworks/models/zephyr-7b-beta", 53 | }, 54 | "project_name": "zephyr-7b-beta", 55 | }, 56 | { 57 | "arch": "chat", 58 | "model_config": { 59 | "chat_cls": "ChatOpenAI", 60 | "model": "gpt-4", 61 | }, 62 | "project_name": "gpt-4-chat", 63 | }, 64 | { 65 | "arch": "openai-assistant", 66 | "model_config": {}, 67 | "project_name": "openai-assistant", 68 | "max_concurrency": 2, # Rate limit is VERY low right now. 
69 | "retry_config": { 70 | "stop_after_attempt": 10, 71 | }, 72 | }, 73 | # Not worth our time it's so bad and slow 74 | { 75 | # "server_url": "http://localhost:1983/anthropic_iterative_search", 76 | "arch": "anthropic-iterative-search", 77 | "max_concurrency": 2, 78 | "project_name": "anthropic-iterative-search", 79 | }, 80 | ] 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument("--dataset-name", type=str, default="LangChain Docs Q&A") 85 | parser.add_argument( 86 | "--config", 87 | type=str, 88 | default=None, 89 | nargs="*", 90 | help="Path to a JSON file with experiment config." 91 | " If specified, the include and exclude args are ignored", 92 | ) 93 | parser.add_argument("--include", type=str, nargs="+", default=None) 94 | parser.add_argument( 95 | "--exclude", 96 | type=str, 97 | nargs="+", 98 | ) 99 | args = parser.parse_args() 100 | create_langchain_docs_dataset(dataset_name=args.dataset_name) 101 | selected_experiments = experiments 102 | if args.config: 103 | selected_experiments = [] 104 | for config_path in args.config: 105 | with open(config_path) as f: 106 | selected_experiments.append(json.load(f)) 107 | elif args.include: 108 | selected_experiments = [ 109 | e for e in selected_experiments if e["project_name"] in args.include 110 | ] 111 | to_exclude = args.exclude or [] 112 | if args.include and not to_exclude: 113 | to_exclude = [ 114 | "anthropic-iterative-search", 115 | "openai-assistant", 116 | ] 117 | if args.exclude: 118 | selected_experiments = [ 119 | e for e in selected_experiments if e["project_name"] not in args.exclude 120 | ] 121 | 122 | for experiment in selected_experiments: 123 | print("Running experiment:", experiment) 124 | main( 125 | **experiment, 126 | dataset_name=args.dataset_name, 127 | ) 128 | -------------------------------------------------------------------------------- /archived/meta-evals/README.md: -------------------------------------------------------------------------------- 1 | # Meta-Evaluations 2 | 3 | 4 | This folder holds some scripts/tests for evaluating some of LangChain's default evaluators. -------------------------------------------------------------------------------- /archived/meta-evals/correctness/README.md: -------------------------------------------------------------------------------- 1 | # Correctness Meta-Evals 2 | 3 | This folder contains a test script to check the aggregate performance of the "correctness"-related evaluators. 4 | 5 | To upload the dataset to LangSmith, run: 6 | 7 | ```bash 8 | python meta-evals/correctness/_upload_dataset.py 9 | ``` 10 | 11 | To test, run: 12 | 13 | ```bash 14 | pytest --capture=no meta-evals/correctness/test_correctness_evaluator.py 15 | ``` 16 | 17 | Then navigate to the Web Q&A dataset to review the results. 
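The dataset files under `data/` share one top-level layout, which `_upload_dataset.py` reads directly: a dataset `name` plus a list of `examples`, each holding an `inputs` dict and an `outputs` dict. The sketch below shows that layout as a Python literal; the dataset name and the keys inside `inputs`/`outputs` are illustrative assumptions, not values copied from the real data files.

```python
# Top-level shape consumed by _upload_dataset.py; inner keys/values are illustrative only.
dataset_file = {
    "name": "My Correctness Dataset",
    "examples": [
        {
            "inputs": {"question": "What is 2 + 2?"},
            "outputs": {"answer": "4"},
        },
    ],
}
```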
-------------------------------------------------------------------------------- /archived/meta-evals/correctness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/archived/meta-evals/correctness/__init__.py -------------------------------------------------------------------------------- /archived/meta-evals/correctness/_upload_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | from langsmith import Client 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | 9 | # Synthetic dataset adapted from https://aclanthology.org/D13-1160/ 10 | 11 | _DATA_REPO = Path(__file__).parent / "data" 12 | _CLIENT = Client() 13 | 14 | 15 | def _upload_dataset(path: str): 16 | with open(path, "r") as f: 17 | data = json.load(f) 18 | dataset_name = data["name"] 19 | examples = data["examples"] 20 | try: 21 | dataset = _CLIENT.create_dataset(dataset_name) 22 | except Exception: 23 | logging.warning(f"Skipping {dataset_name}") 24 | return 25 | logging.info(f"Uploading dataset: {dataset_name}") 26 | _CLIENT.create_examples( 27 | inputs=[example["inputs"] for example in examples], 28 | outputs=[example["outputs"] for example in examples], 29 | dataset_id=dataset.id, 30 | ) 31 | 32 | 33 | if __name__ == "__main__": 34 | for dataset in _DATA_REPO.glob("*.json"): 35 | print("Uploading dataset:", dataset) 36 | _upload_dataset(dataset) 37 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/.gitignore: -------------------------------------------------------------------------------- 1 | chromadb/ 2 | index.md 3 | Untitled.ipynb 4 | -------------------------------------------------------------------------------- /docs/source/_static/parrot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/docs/source/_static/parrot.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | import pathlib 20 | import sys 21 | from typing import List 22 | 23 | import toml 24 | 25 | ROOT_FOLDER = str(pathlib.Path(__file__).parent.parent.parent) 26 | 27 | # Add the project root to the path 28 | sys.path.insert(0, ROOT_FOLDER) 29 | 30 | with open("../../pyproject.toml") as f: 31 | data = toml.load(f) 32 | 33 | project = "LangChain Benchmarks" 34 | copyright = "2023, Langchain AI" 35 | author = "Langchain AI" 36 | 37 | version = data["tool"]["poetry"]["version"] 38 | release = version 39 | 40 | html_title = project + " " + version 41 | 42 | 43 | # -- General configuration --------------------------------------------------- 44 | 45 | # Add any Sphinx extension module names here, as strings. They can be 46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 47 | # ones. 48 | extensions = [ 49 | "sphinx.ext.autodoc", 50 | "sphinx.ext.autodoc.typehints", 51 | "sphinx.ext.autosummary", 52 | "sphinx.ext.napoleon", 53 | "sphinx.ext.viewcode", 54 | "myst_nb", 55 | "sphinx_copybutton", 56 | "IPython.sphinxext.ipython_console_highlighting", 57 | ] 58 | source_suffix = [".ipynb", ".html", ".md", ".rst"] 59 | 60 | # Add any paths that contain templates here, relative to this directory. 61 | templates_path = ["_templates"] 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path. 66 | exclude_patterns: List[str] = [] 67 | 68 | 69 | # -- Options for HTML output ------------------------------------------------- 70 | 71 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 72 | # a list of builtin themes. 73 | # 74 | html_theme = "sphinx_book_theme" 75 | 76 | html_theme_options = { 77 | "path_to_docs": "docs/source", 78 | "repository_url": "https://github.com/langchain-ai/langchain-benchmarks", 79 | "home_page_in_toc": True, 80 | "show_navbar_depth": 2, 81 | "use_sidenotes": True, 82 | "use_repository_button": True, 83 | "use_issues_button": True, 84 | "use_source_button": True, 85 | "use_fullscreen_button": True, 86 | "repository_branch": "main", 87 | "launch_buttons": { 88 | "notebook_interface": "jupyterlab", 89 | "colab_url": "https://colab.research.google.com", 90 | }, 91 | } 92 | 93 | html_context = { 94 | "display_github": True, # Integrate GitHub 95 | "github_user": "langchain-ai", # Username 96 | "github_repo": "langchain-benchmarks", # Repo name 97 | "github_version": "main", # Version 98 | "conf_py_path": "/docs/", # Path in the checkout to the docs root 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # These paths are either relative to html_static_path 107 | # or fully qualified paths (eg. https://...) 108 | html_css_files = [ 109 | "css/custom.css", 110 | ] 111 | 112 | nb_execution_mode = "off" 113 | autosummary_generate = True 114 | -------------------------------------------------------------------------------- /docs/source/toc.segment: -------------------------------------------------------------------------------- 1 | ```{toctree} 2 | :maxdepth: 2 3 | :caption: Introduction 4 | 5 | ./notebooks/getting_started 6 | ./notebooks/models 7 | ./notebooks/datasets 8 | ``` 9 | 10 | 11 | ```{toctree} 12 | :maxdepth: 0 13 | :caption: Tool Usage 14 | 15 | ./notebooks/tool_usage/intro 16 | ./notebooks/tool_usage/relational_data 17 | ./notebooks/tool_usage/multiverse_math 18 | ./notebooks/tool_usage/typewriter_1 19 | ./notebooks/tool_usage/typewriter_26 20 | ./notebooks/tool_usage/benchmark_all_tasks 21 | ``` 22 | 23 | ```{toctree} 24 | :maxdepth: 0 25 | :caption: Extraction 26 | 27 | ./notebooks/extraction/intro 28 | ./notebooks/extraction/email 29 | ./notebooks/extraction/chat_extraction 30 | ./notebooks/extraction/high_cardinality 31 | ``` 32 | 33 | ```{toctree} 34 | :maxdepth: 2 35 | :caption: RAG 36 | 37 | ./notebooks/retrieval/intro 38 | ./notebooks/retrieval/langchain_docs_qa 39 | ./notebooks/retrieval/semi_structured_benchmarking/semi_structured 40 | ./notebooks/retrieval/semi_structured_benchmarking/ss_eval_chunk_sizes 41 | ./notebooks/retrieval/semi_structured_benchmarking/ss_eval_long_context 42 | ./notebooks/retrieval/semi_structured_benchmarking/ss_eval_multi_vector 43 | ./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval_baseline 44 | ./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval 45 | ./notebooks/retrieval/comparing_techniques 46 | ``` 47 | 48 | ```{toctree} 49 | :maxdepth: 2 50 | :caption: Benchmarking Without LangSmith 51 | ./notebooks/run_without_langsmith 52 | ``` 53 | -------------------------------------------------------------------------------- /langchain_benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | .sql 2 | -------------------------------------------------------------------------------- /langchain_benchmarks/__init__.py: 
-------------------------------------------------------------------------------- 1 | from importlib import metadata 2 | 3 | from langchain_benchmarks.model_registration import model_registry 4 | from langchain_benchmarks.rate_limiting import RateLimiter 5 | from langchain_benchmarks.registration import registry 6 | from langchain_benchmarks.utils._langsmith import ( 7 | clone_public_dataset, 8 | download_public_dataset, 9 | ) 10 | 11 | try: 12 | __version__ = metadata.version(__package__) 13 | except metadata.PackageNotFoundError: 14 | # Case where package metadata is not available. 15 | __version__ = "" 16 | del metadata # optional, avoids polluting the results of dir(__package__) 17 | 18 | 19 | # Please keep this list sorted! 20 | __all__ = [ 21 | "__version__", 22 | "clone_public_dataset", 23 | "download_public_dataset", 24 | "model_registry", 25 | "RateLimiter", 26 | "registry", 27 | ] 28 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.extraction.evaluators import get_eval_config 2 | from langchain_benchmarks.extraction.implementations import ( 3 | create_openai_function_based_extractor, 4 | ) 5 | 6 | # Keep this sorted 7 | __all__ = [ 8 | "get_eval_config", 9 | "create_openai_function_based_extractor", 10 | ] 11 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/evaluators.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.chat_models.base import BaseChatModel 4 | from langchain.smith import RunEvalConfig 5 | from langchain_openai import ChatOpenAI 6 | 7 | 8 | def get_eval_config(eval_llm: Optional[BaseChatModel] = None) -> RunEvalConfig: 9 | eval_llm = eval_llm or ChatOpenAI( 10 | model="gpt-4", 11 | temperature=0, 12 | model_kwargs={"seed": 42}, 13 | max_retries=1, 14 | request_timeout=60, 15 | ) 16 | """Get the evaluation configuration for the email task.""" 17 | return RunEvalConfig( 18 | evaluators=[ 19 | "json_edit_distance", 20 | RunEvalConfig.LabeledScoreString( 21 | criteria={ 22 | "accuracy": """ 23 | Score 1: The answer is incorrect and unrelated to the question or reference document. 24 | Score 3: The answer is partially correct but has more than one omission or major errors. 25 | Score 5: The answer is mostly correct but has more than one omission or major error. 26 | Score 7: The answer is mostly correct but has at most one omission or major error. 27 | Score 9: The answer is mostly correct with no omissions and only minor errors, and aligns with the reference document. 28 | Score 10: The answer is correct, complete, and aligns with the reference document. Extra information is acceptable if it is sensible. 29 | 30 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 31 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct and not be penalized. 
32 | """ # noqa 33 | }, 34 | llm=eval_llm, 35 | normalize_by=10.0, 36 | ), 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/implementations.py: -------------------------------------------------------------------------------- 1 | """Default implementations of LLMs that can be used for extraction.""" 2 | from typing import Any, Dict, List, Optional, Type 3 | 4 | from langchain.chains.openai_functions import convert_to_openai_function 5 | from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser 6 | from langchain.prompts import ChatPromptTemplate 7 | from langchain.schema.runnable import Runnable 8 | from langchain_openai import ChatOpenAI 9 | from langsmith.client import Client 10 | from pydantic import BaseModel 11 | 12 | from langchain_benchmarks.extraction.evaluators import get_eval_config 13 | from langchain_benchmarks.schema import ExtractionTask 14 | 15 | # PUBLIC API 16 | 17 | 18 | def create_openai_function_based_extractor( 19 | prompt: ChatPromptTemplate, 20 | llm: Runnable, 21 | schema: Type[BaseModel], 22 | ) -> Runnable[dict, dict]: 23 | """Create an extraction chain that uses an LLM to extract a schema. 24 | 25 | The underlying functionality is exclusively for LLMs that support 26 | extraction using openai functions format. 27 | 28 | Args: 29 | prompt: The prompt to use for extraction. 30 | llm: The LLM to use for extraction. 31 | schema: The schema to extract. 32 | 33 | Returns: 34 | An llm that will extract the schema 35 | """ 36 | openai_functions = [convert_to_openai_function(schema)] 37 | llm_kwargs = { 38 | "functions": openai_functions, 39 | "function_call": {"name": openai_functions[0]["name"]}, 40 | } 41 | output_parser = JsonOutputFunctionsParser() 42 | extraction_chain = ( 43 | prompt | llm.bind(**llm_kwargs) | output_parser | (lambda x: {"output": x}) 44 | ) 45 | return extraction_chain 46 | 47 | 48 | def run_on_dataset( 49 | task: ExtractionTask, 50 | llm: Runnable, 51 | *, 52 | tags: Optional[List[str]] = None, 53 | **kwargs: Any, 54 | ) -> Dict[str, Any]: 55 | """Run an LLM on a dataset. 56 | 57 | Args: 58 | task: The task to run on. 59 | llm: The LLM to run. 60 | tags: The tags to use for the run. 61 | kwargs: Additional arguments to pass to the client. 
62 | """ 63 | client = Client() 64 | eval_llm = ChatOpenAI( 65 | model="gpt-4", 66 | temperature=0.0, 67 | model_kwargs={"seed": 42}, 68 | max_retries=1, 69 | request_timeout=60, 70 | ) 71 | return client.run_on_dataset( 72 | dataset_name=task.name, 73 | llm_or_chain_factory=create_openai_function_based_extractor( 74 | task.instructions, llm, task.schema 75 | ), 76 | evaluation=get_eval_config(eval_llm), 77 | tags=tags, 78 | **kwargs, 79 | ) 80 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/langchain_benchmarks/extraction/tasks/__init__.py -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/chat_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import ChatPromptTemplate 2 | 3 | from langchain_benchmarks.extraction.tasks.chat_extraction.evaluators import ( 4 | get_eval_config, 5 | ) 6 | from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket 7 | from langchain_benchmarks.schema import ExtractionTask 8 | 9 | # This is a default prompt that works reasonably for OpenAI models. 10 | 11 | DEFAULT_CHAT_MODEL_PROMPT = ChatPromptTemplate.from_messages( 12 | [ 13 | ( 14 | "system", 15 | "You are a helpdesk assistant responsible with extracting information" 16 | " and generating tickets. Dialogues are between a user and" 17 | " a support engineer.", 18 | ), 19 | ( 20 | "user", 21 | "Generate a ticket for the following question-response pair:\n" 22 | "\n{dialogue}\n", 23 | ), 24 | ] 25 | ) 26 | 27 | 28 | CHAT_EXTRACTION_TASK = ExtractionTask( 29 | name="Chat Extraction", 30 | dataset_id="https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d", 31 | schema=GenerateTicket, 32 | description="""A dataset meant to test the ability of an LLM to extract and infer 33 | structured information from a dialogue. The dialogue is between a user and a support 34 | engineer. 
Outputs should be structured as a JSON object and test both the ability 35 | of the LLM to correctly structure the information and its ability to perform simple 36 | classification tasks.""", 37 | instructions=DEFAULT_CHAT_MODEL_PROMPT, 38 | ) 39 | 40 | 41 | __all__ = ["CHAT_EXTRACTION_TASK", "get_eval_config"] 42 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/chat_extraction/schema.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | class QuestionCategory(str, Enum): 8 | IMPLEMENTATION_ISSUES = "Implementation Issues" # about existing implementation 9 | FEATURE_REQUESTS = "Feature Requests" 10 | CONCEPT_EXPLANATIONS = "Concept Explanations" 11 | CODE_OPTIMIZATION = "Code Optimization" 12 | SECURITY_AND_PRIVACY_CONCERNS = "Security and Privacy Concerns" 13 | MODEL_TRAINING_AND_FINE_TUNING = "Model Training and Fine-tuning" 14 | DATA_HANDLING_AND_MANIPULATION = "Data Handling and Manipulation" 15 | USER_INTERACTION_FLOW = "User Interaction Flow" 16 | TECHNICAL_INTEGRATION = "Technical Integration" 17 | ERROR_HANDLING_AND_LOGGING = "Error Handling and Logging" 18 | CUSTOMIZATION_AND_CONFIGURATION = "Customization and Configuration" 19 | EXTERNAL_API_AND_DATA_SOURCE_INTEGRATION = ( 20 | "External API and Data Source Integration" 21 | ) 22 | LANGUAGE_AND_LOCALIZATION = "Language and Localization" 23 | STREAMING_AND_REAL_TIME_PROCESSING = "Streaming and Real-time Processing" 24 | TOOL_DEVELOPMENT = "Tool Development" 25 | FUNCTION_CALLING = "Function Calling" 26 | LLM_INTEGRATIONS = "LLM Integrations" 27 | GENERAL_AGENT_QUESTIONS = "General Agent Question" 28 | GENERAL_CHIT_CHAT = "General Chit Chat" 29 | MEMORY = "Memory" 30 | DEBUGGING_HELP = "Debugging Help" 31 | APPLICATION_DESIGN = "Application Design" 32 | PROMPT_TEMPLATES = "Prompt Templates" 33 | COST_TRACKING = "Cost Tracking" 34 | OTHER = "Other" 35 | 36 | 37 | class Sentiment(str, Enum): 38 | NEGATIVE = "Negative" 39 | NEUTRAL = "Neutral" 40 | POSITIVE = "Positive" 41 | 42 | 43 | class ProgrammingLanguage(str, Enum): 44 | PYTHON = "python" 45 | JAVASCRIPT = "javascript" 46 | TYPESCRIPT = "typescript" 47 | UNKNOWN = "unknown" 48 | OTHER = "other" 49 | 50 | 51 | class QuestionCategorization(BaseModel): 52 | question_category: QuestionCategory 53 | category_if_other: Optional[str] = Field( 54 | default=None, description="question category if the category above is 'other'" 55 | ) 56 | is_off_topic: bool = Field( 57 | description="If the input is general chit chat or does not pertain to technical inqueries about LangChain or building/debugging applications with LLMs/AI, it is off topic. For context, LangChain is a library and framework designed" 58 | " to assist in building applications with LLMs. Questions may also be about similar packages like LangServe, LangSmith, OpenAI, Anthropic, vectorstores, agents, etc." 
59 | ) 60 | toxicity: int = Field( 61 | ge=0, lt=6, description="Whether or not the input question is toxic" 62 | ) 63 | sentiment: Sentiment 64 | programming_language: ProgrammingLanguage 65 | 66 | 67 | # resolve the issue, provide guidance, or ask for more information 68 | class ResponseType(str, Enum): 69 | RESOLVE_ISSUE = "resolve issue" 70 | PROVIDE_GUIDANCE = "provide guidance" 71 | REQUEST_INFORMATION = "request information" 72 | GIVE_UP = "give up" 73 | NONE = "none" 74 | OTHER = "other" 75 | 76 | 77 | class ResponseCategorization(BaseModel): 78 | response_type: ResponseType 79 | response_type_if_other: Optional[str] = None 80 | confidence_level: int = Field( 81 | ge=0, lt=6, description="The confidence of the assistant in its answer." 82 | ) 83 | followup_actions: Optional[List[str]] = Field( 84 | description="Actions the assistant recommended the user take." 85 | ) 86 | 87 | 88 | class GenerateTicket(BaseModel): 89 | """Generate a ticket containing all the extracted information.""" 90 | 91 | issue_summary: str = Field( 92 | description="short (<10 word) summary of the issue or question" 93 | ) 94 | question: QuestionCategorization = Field( 95 | description="Information inferred from the the question." 96 | ) 97 | response: ResponseCategorization = Field( 98 | description="Information inferred from the the response." 99 | ) 100 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/email_task.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional 3 | 4 | from langchain.prompts import ChatPromptTemplate 5 | from pydantic import BaseModel, Field 6 | 7 | from langchain_benchmarks.schema import ExtractionTask 8 | 9 | 10 | class ToneEnum(str, Enum): 11 | """The tone of the email.""" 12 | 13 | positive = "positive" 14 | negative = "negative" 15 | 16 | 17 | class Email(BaseModel): 18 | """Relevant information about an email.""" 19 | 20 | sender: Optional[str] = Field(None, description="The sender's name, if available") 21 | sender_phone_number: Optional[str] = Field( 22 | None, description="The sender's phone number, if available" 23 | ) 24 | sender_address: Optional[str] = Field( 25 | None, description="The sender's address, if available" 26 | ) 27 | action_items: List[str] = Field( 28 | ..., description="A list of action items requested by the email" 29 | ) 30 | topic: str = Field( 31 | ..., description="High level description of what the email is about" 32 | ) 33 | tone: ToneEnum = Field(..., description="The tone of the email.") 34 | 35 | 36 | # This is a default prompt that works for chat models. 37 | DEFAULT_CHAT_MODEL_PROMPT = ChatPromptTemplate.from_messages( 38 | [ 39 | ("system", "You are an expert researcher."), 40 | ( 41 | "human", 42 | "What can you tell me about the following email? Make sure to " 43 | "extract the question in the correct format. " 44 | "Here is the email:\n ```\n{input}\n```", 45 | ), 46 | ] 47 | ) 48 | 49 | EMAIL_EXTRACTION_TASK = ExtractionTask( 50 | name="Email Extraction", 51 | dataset_id="https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d", 52 | schema=Email, 53 | description="""\ 54 | A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, \ 55 | as well as a script for initial extraction and formatting of other emails from \ 56 | an arbitrary .mbox file like the one exported by Gmail. 
57 | 58 | Some additional cleanup of the data was done by hand after the initial pass. 59 | 60 | See https://github.com/jacoblee93/oss-model-extraction-evals. 61 | """, 62 | instructions=DEFAULT_CHAT_MODEL_PROMPT, 63 | ) 64 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/high_cardinality/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.extraction.tasks.high_cardinality.name_correction import ( 2 | NAME_CORRECTION_TASK, 3 | ) 4 | 5 | __all__ = ["NAME_CORRECTION_TASK"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/extraction/tasks/high_cardinality/name_correction.py: -------------------------------------------------------------------------------- 1 | from langchain.smith import RunEvalConfig 2 | from langsmith.evaluation import EvaluationResult, run_evaluator 3 | from langsmith.schemas import Example, Run 4 | from pydantic import BaseModel, Field 5 | 6 | from langchain_benchmarks.schema import ExtractionTask 7 | 8 | 9 | @run_evaluator 10 | def correct_name(run: Run, example: Example) -> EvaluationResult: 11 | if "name" in run.outputs: 12 | prediction = run.outputs["name"] 13 | else: 14 | prediction = run.outputs["output"]["name"] 15 | name = example.outputs["name"] 16 | score = int(name == prediction) 17 | return EvaluationResult(key="correct", score=score) 18 | 19 | 20 | class Person(BaseModel): 21 | """Information about a person.""" 22 | 23 | name: str = Field(..., description="The person's name") 24 | 25 | 26 | NAME_CORRECTION_TASK = ExtractionTask( 27 | name="Name Correction", 28 | dataset_id="https://smith.langchain.com/public/78df83ee-ba7f-41c6-832c-2b23327d4cf7/d", 29 | schema=Person, 30 | description="""A dataset of 23 misspelled full names and their correct spellings.""", 31 | dataset_url="https://smith.langchain.com/public/78df83ee-ba7f-41c6-832c-2b23327d4cf7/d", 32 | dataset_name="Extracting Corrected Names", 33 | eval_config=RunEvalConfig( 34 | custom_evaluators=[correct_name], 35 | ), 36 | ) 37 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/.gitignore: -------------------------------------------------------------------------------- 1 | *.sql 2 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.evaluators import get_eval_config 2 | from langchain_benchmarks.rag.tasks import LANGCHAIN_DOCS_TASK 3 | 4 | # Please keep this sorted 5 | __all__ = ["get_eval_config", "LANGCHAIN_DOCS_TASK"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/evaluators.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.evaluation import load_evaluator 4 | from langchain.smith import RunEvalConfig 5 | from langchain_openai import ChatOpenAI 6 | 7 | try: 8 | from langchain.schema.language_model import BaseLanguageModel 9 | except ImportError: 10 | from langchain_core.language_models import BaseLanguageModel 11 | from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator 12 | from langsmith.schemas import Example, Run 13 | 14 | 15 | # TODO: Split this into an assertion-by-assertion evaluator 16 | # 
TODO: Combine with a document relevance evaluator (to report retriever performance) 17 | class FaithfulnessEvaluator(RunEvaluator): 18 | def __init__(self, llm: Optional[BaseLanguageModel] = None): 19 | self.evaluator = load_evaluator( 20 | "labeled_score_string", 21 | criteria={ 22 | "faithfulness": """ 23 | Score 1: The answer directly contradicts the information provided in the reference docs. 24 | Score 3: The answer contains a mix of correct information from the reference docs and incorrect or unverifiable information not found in the docs. 25 | Score 5: The answer is mostly aligned with the reference docs but includes extra information that, while not contradictory, is not verified by the docs. 26 | Score 7: The answer aligns well with the reference docs but includes minor, commonly accepted facts not found in the docs. 27 | Score 10: The answer perfectly aligns with and is fully entailed by the reference docs, with no extra information.""" 28 | }, 29 | llm=llm, 30 | normalize_by=10, 31 | ) 32 | 33 | @staticmethod 34 | def _get_retrieved_docs(run: Run) -> str: 35 | # This assumes there is only one retriever in your chain. 36 | # To select more precisely, name your retrieval chain 37 | # using with_config(name="my_unique_name") and look up 38 | # by run.name 39 | runs = [run] 40 | while runs: 41 | run = runs.pop() 42 | if run.run_type == "retriever": 43 | return str(run.outputs["documents"]) 44 | if run.child_runs: 45 | runs.extend(run.child_runs[::-1]) 46 | return "" 47 | 48 | def evaluate_run( 49 | self, run: Run, example: Optional[Example] = None 50 | ) -> EvaluationResult: 51 | try: 52 | docs_string = self._get_retrieved_docs(run) 53 | docs_string = f"Reference docs:\n\n{docs_string}\n\n\n" 54 | input_query = run.inputs["question"] 55 | if run.outputs is not None and len(run.outputs) == 1: 56 | prediction = next(iter(run.outputs.values())) 57 | else: 58 | prediction = run.outputs["output"] 59 | result = self.evaluator.evaluate_strings( 60 | input=input_query, 61 | prediction=prediction, 62 | reference=docs_string, 63 | ) 64 | return EvaluationResult( 65 | **{"key": "faithfulness", "comment": result.get("reasoning"), **result} 66 | ) 67 | except Exception as e: 68 | return EvaluationResult(key="faithfulness", score=None, comment=repr(e)) 69 | 70 | 71 | _ACCURACY_CRITERION = { 72 | "accuracy": """ 73 | Score 1: The answer is incorrect and unrelated to the question or reference document. 74 | Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect. 75 | Score 5: The answer is partially correct but has significant errors or omissions. 76 | Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document. 77 | Score 10: The answer is correct, complete, and perfectly aligns with the reference document. 78 | 79 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 80 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct. 
81 | """  # noqa
82 | }
83 | 
84 | 
85 | def get_eval_config() -> RunEvalConfig:
86 |     """Returns the evaluator for the environment."""
87 |     eval_llm = ChatOpenAI(
88 |         model="gpt-4",
89 |         temperature=0.0,
90 |         model_kwargs={"seed": 42},
91 |         max_retries=1,
92 |         request_timeout=60,
93 |     )
94 |     # Use a longer-context LLM to check documents
95 |     faithfulness_eval_llm = ChatOpenAI(
96 |         model="gpt-4-1106-preview",
97 |         temperature=0.0,
98 |         model_kwargs={"seed": 42},
99 |         max_retries=1,
100 |         request_timeout=60,
101 |     )
102 | 
103 |     return RunEvalConfig(
104 |         evaluators=[
105 |             RunEvalConfig.LabeledScoreString(
106 |                 criteria=_ACCURACY_CRITERION, llm=eval_llm, normalize_by=10.0
107 |             ),
108 |             RunEvalConfig.EmbeddingDistance(),
109 |         ],
110 |         custom_evaluators=[FaithfulnessEvaluator(llm=faithfulness_eval_llm)],
111 |     )
112 | 
--------------------------------------------------------------------------------
/langchain_benchmarks/rag/tasks/.gitignore:
--------------------------------------------------------------------------------
1 | pdfs/
2 | 
--------------------------------------------------------------------------------
/langchain_benchmarks/rag/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
2 | from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.task import (
3 |     MULTI_MODAL_SLIDE_DECKS_TASK,
4 | )
5 | from langchain_benchmarks.rag.tasks.semi_structured_reports.task import (
6 |     SEMI_STRUCTURED_REPORTS_TASK,
7 | )
8 | 
9 | # Please keep this sorted
10 | __all__ = [
11 |     "LANGCHAIN_DOCS_TASK",
12 |     "MULTI_MODAL_SLIDE_DECKS_TASK",
13 |     "SEMI_STRUCTURED_REPORTS_TASK",
14 | ]
15 | 
--------------------------------------------------------------------------------
/langchain_benchmarks/rag/tasks/langchain_docs/README.md:
--------------------------------------------------------------------------------
1 | # LangChain Docs Task
2 | 
3 | This code contains utilities to scrape the LangChain docs (already run) and index them
4 | using common techniques. The docs were scraped using the code in `_ingest_docs.py` and
5 | uploaded to GCS. To better compare retrieval techniques, we hold these constant and pull
6 | from that cache whenever generating different indices.
7 | 
8 | 
9 | The content in `indexing` composes some common indexing strategies with default parameters for
10 | benchmarking on the LangChain docs.
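As a rough usage sketch (an editor's addition, not part of the original docs), the cached documents and the registered factories can be combined roughly as follows; the embedding model and the `transformation_name` label below are assumptions, not defaults chosen by this package:

```python
# Illustrative only: build one of the benchmark retrievers over the cached docs.
from langchain_openai import OpenAIEmbeddings

from langchain_benchmarks.rag.tasks.langchain_docs.indexing.retriever_registry import (
    RETRIEVER_FACTORIES,
    load_docs_from_parquet,
)

docs = load_docs_from_parquet()  # fetches the cached parquet file on first use
retriever = RETRIEVER_FACTORIES["basic"](  # other registered options: "parent-doc", "hyde"
    OpenAIEmbeddings(),  # assumed embedding model
    docs=docs,
    transformation_name="raw",  # assumed label
)
relevant_docs = retriever.get_relevant_documents("How do I use a retriever?")
```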
-------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.langchain_docs import architectures, indexing 2 | from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK 3 | 4 | DATASET_ID = ( 5 | "452ccafc-18e1-4314-885b-edd735f17b9d" # ID of public LangChain Docs dataset 6 | ) 7 | 8 | __all__ = ["architectures", "indexing", "DATASET_ID", "LANGCHAIN_DOCS_TASK"] 9 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.langchain_docs.architectures.chain_registry import ( 2 | ARCH_FACTORIES, 3 | ) 4 | 5 | __all__ = ["ARCH_FACTORIES"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/architectures/chain_registry.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.base_language import BaseLanguageModel 4 | from langchain.schema.retriever import BaseRetriever 5 | from langchain.schema.runnable import Runnable 6 | from langchain_openai import ChatOpenAI 7 | 8 | from langchain_benchmarks.rag.tasks.langchain_docs.architectures.crqa import ( 9 | create_response_chain, 10 | get_default_response_generator, 11 | ) 12 | 13 | 14 | def default_response_chain( 15 | retriever: BaseRetriever, 16 | response_generator: Optional[Runnable] = None, 17 | llm: Optional[BaseLanguageModel] = None, 18 | ) -> None: 19 | """Get the chain responsible for generating a response based on the retrieved documents.""" 20 | response_generator = response_generator or get_default_response_generator( 21 | llm=llm or ChatOpenAI(model="gpt-3.5-turbo-16k", model_kwargs={"seed": 42}) 22 | ) 23 | return create_response_chain( 24 | response_generator=response_generator, retriever=retriever 25 | ) 26 | 27 | 28 | ARCH_FACTORIES = { 29 | "conversational-retrieval-qa": default_response_chain, 30 | } 31 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/architectures/crqa.py: -------------------------------------------------------------------------------- 1 | """Chat langchain 'engine'.""" 2 | # TODO: some simplified architectures that are 3 | # environment-agnostic 4 | from operator import itemgetter 5 | from typing import Callable, Dict, List, Optional, Sequence 6 | 7 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 8 | from langchain.schema import Document 9 | from langchain.schema.language_model import BaseLanguageModel 10 | from langchain.schema.messages import AIMessage, HumanMessage 11 | from langchain.schema.output_parser import StrOutputParser 12 | from langchain.schema.retriever import BaseRetriever 13 | from langchain.schema.runnable import ( 14 | Runnable, 15 | RunnableLambda, 16 | ) 17 | from langchain.schema.runnable.passthrough import RunnableAssign 18 | from pydantic import BaseModel 19 | 20 | RESPONSE_TEMPLATE = """\ 21 | You are an expert programmer and problem-solver, tasked with answering any question \ 22 | about Langchain. 
23 | 24 | Generate a comprehensive and informative answer of 80 words or less for the \ 25 | given question based solely on the provided search results (URL and content). You must \ 26 | only use information from the provided search results. Use an unbiased and \ 27 | journalistic tone. Combine search results together into a coherent answer. Do not \ 28 | repeat text. Cite search results using [${{number}}] notation. Only cite the most \ 29 | relevant results that answer the question accurately. Place these citations at the end \ 30 | of the sentence or paragraph that reference them - do not put them all at the end. If \ 31 | different results refer to different entities within the same name, write separate \ 32 | answers for each entity. 33 | 34 | You should use bullet points in your answer for readability. Put citations where they apply 35 | rather than putting them all at the end. 36 | 37 | If there is nothing in the context relevant to the question at hand, just say "Hmm, \ 38 | I'm not sure." Don't try to make up an answer. 39 | 40 | Anything between the following `context` html blocks is retrieved from a knowledge \ 41 | bank, not part of the conversation with the user. 42 | 43 | 44 | {context} 45 | 46 | 47 | REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \ 48 | not sure." Don't try to make up an answer. Anything between the preceding 'context' \ 49 | html blocks is retrieved from a knowledge bank, not part of the conversation with the \ 50 | user.\ 51 | """ 52 | 53 | 54 | class ChatRequest(BaseModel): 55 | question: str 56 | chat_history: Optional[List[Dict[str, str]]] 57 | 58 | 59 | def _format_docs(docs: Sequence[Document]) -> str: 60 | formatted_docs = [] 61 | for i, doc in enumerate(docs): 62 | doc_string = f"{doc.page_content}" 63 | formatted_docs.append(doc_string) 64 | return "\n".join(formatted_docs) 65 | 66 | 67 | def serialize_history(request: ChatRequest): 68 | chat_history = request.get("chat_history") or [] 69 | converted_chat_history = [] 70 | for message in chat_history: 71 | if message.get("human") is not None: 72 | converted_chat_history.append(HumanMessage(content=message["human"])) 73 | if message.get("ai") is not None: 74 | converted_chat_history.append(AIMessage(content=message["ai"])) 75 | return converted_chat_history 76 | 77 | 78 | def get_default_response_generator(llm: BaseLanguageModel) -> Runnable: 79 | prompt = ChatPromptTemplate.from_messages( 80 | [ 81 | ("system", RESPONSE_TEMPLATE), 82 | MessagesPlaceholder(variable_name="chat_history"), 83 | ("human", "{question}"), 84 | ] 85 | ) 86 | 87 | return (prompt | llm | StrOutputParser()).with_config( 88 | run_name="GenerateResponse", 89 | ) 90 | 91 | 92 | def create_response_chain( 93 | response_generator: Runnable, 94 | retriever: BaseRetriever, 95 | format_docs: Optional[Callable[[Sequence[Document]], str]] = None, 96 | format_chat_history: Optional[Callable[[ChatRequest], str]] = None, 97 | ) -> Runnable: 98 | format_docs = format_docs or _format_docs 99 | format_chat_history = format_chat_history or serialize_history 100 | return ( 101 | RunnableAssign( 102 | { 103 | "chat_history": RunnableLambda(format_chat_history).with_config( 104 | run_name="SerializeHistory" 105 | ) 106 | } 107 | ) 108 | | RunnableAssign( 109 | { 110 | "context": ( 111 | itemgetter("question") | retriever | format_docs 112 | ).with_config(run_name="FormatDocs") 113 | } 114 | ) 115 | | response_generator 116 | ) 117 | 
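# --- Illustrative usage (editor's sketch, not part of the original module) ---
# A rough example of wiring the pieces above together. Passing in an arbitrary
# retriever and the specific chat model below are assumptions for illustration,
# not choices made by this module.
def _example_usage(retriever: BaseRetriever) -> str:
    from langchain_openai import ChatOpenAI  # assumed available in the benchmark environment

    response_generator = get_default_response_generator(
        llm=ChatOpenAI(model="gpt-3.5-turbo-16k")
    )
    chain = create_response_chain(response_generator, retriever)
    # The chain expects a question plus (possibly empty) chat history and
    # returns the generated answer as a string.
    return chain.invoke({"question": "What is a retriever?", "chat_history": []})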
-------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/indexing/.gitignore: -------------------------------------------------------------------------------- 1 | db/ 2 | db_docs/ 3 | .sql 4 | .bin 5 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.langchain_docs.indexing.retriever_registry import ( 2 | RETRIEVER_FACTORIES, 3 | ) 4 | 5 | __all__ = [ 6 | "RETRIEVER_FACTORIES", 7 | ] 8 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/indexing/retriever_registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Callable, Iterable, Optional 4 | 5 | from langchain.schema.document import Document 6 | from langchain.schema.embeddings import Embeddings 7 | from langchain.schema.retriever import BaseRetriever 8 | from langchain.vectorstores.chroma import Chroma 9 | 10 | from langchain_benchmarks.rag.utils._downloading import ( 11 | fetch_remote_file, 12 | ) 13 | from langchain_benchmarks.rag.utils.indexing import ( 14 | get_hyde_retriever, 15 | get_parent_document_retriever, 16 | get_vectorstore_retriever, 17 | ) 18 | 19 | logger = logging.getLogger(__name__) 20 | _DIRECTORY = os.path.dirname(os.path.abspath(__file__)) 21 | # Stores the scraped documents from the langchain docs website, week of 2023-11-12 22 | REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/docs.parquet" 23 | DOCS_FILE = os.path.join(_DIRECTORY, "db_docs/docs.parquet") 24 | 25 | _DEFAULT_SEARCH_KWARGS = {"k": 6} 26 | 27 | 28 | def load_docs_from_parquet(filename: Optional[str] = None) -> Iterable[Document]: 29 | try: 30 | import pandas as pd 31 | except ImportError: 32 | raise ImportError( 33 | "Please install pandas to use the langchain docs benchmarking task.\n" 34 | "pip install pandas" 35 | ) 36 | if filename is None: 37 | filename = DOCS_FILE 38 | if not os.path.exists(filename): 39 | fetch_remote_file(REMOTE_DOCS_FILE, filename) 40 | df = pd.read_parquet(filename) 41 | docs_transformed = [Document(**row) for row in df.to_dict(orient="records")] 42 | for doc in docs_transformed: 43 | for k, v in doc.metadata.items(): 44 | if v is None: 45 | doc.metadata[k] = "" 46 | if not doc.page_content.strip(): 47 | continue 48 | yield doc 49 | 50 | 51 | def _chroma_retriever_factory( 52 | embedding: Embeddings, 53 | *, 54 | docs: Optional[Iterable[Document]] = None, 55 | search_kwargs: Optional[dict] = None, 56 | transform_docs: Optional[Callable] = None, 57 | transformation_name: Optional[str] = None, 58 | ) -> BaseRetriever: 59 | docs = docs or load_docs_from_parquet() 60 | embedding_name = embedding.__class__.__name__ 61 | vectorstore = Chroma( 62 | collection_name=f"lcbm-b-{embedding_name}-{transformation_name}", 63 | embedding_function=embedding, 64 | persist_directory="./chromadb", 65 | ) 66 | return get_vectorstore_retriever( 67 | docs, 68 | embedding, 69 | vectorstore, 70 | collection_name="langchain-docs", 71 | transform_docs=transform_docs, 72 | transformation_name=transformation_name, 73 | search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS, 74 | ) 75 | 76 | 77 | def _chroma_parent_document_retriever_factory( 78 | 
embedding: Embeddings, 79 | *, 80 | docs: Optional[Iterable[Document]] = None, 81 | search_kwargs: Optional[dict] = None, 82 | transformation_name: Optional[str] = None, 83 | ) -> BaseRetriever: 84 | docs = docs or load_docs_from_parquet() 85 | embedding_name = embedding.__class__.__name__ 86 | vectorstore = Chroma( 87 | collection_name=f"lcbm-b-{embedding_name}-{transformation_name}", 88 | embedding_function=embedding, 89 | persist_directory="./chromadb", 90 | ) 91 | return get_parent_document_retriever( 92 | docs, 93 | embedding, 94 | vectorstore, 95 | collection_name="langchain-docs", 96 | search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS, 97 | transformation_name=transformation_name, 98 | ) 99 | 100 | 101 | def _chroma_hyde_retriever_factory( 102 | embedding: Embeddings, 103 | *, 104 | docs: Optional[Iterable[Document]] = None, 105 | search_kwargs: Optional[dict] = None, 106 | transformation_name: Optional[str] = None, 107 | ) -> BaseRetriever: 108 | docs = docs or load_docs_from_parquet() 109 | embedding_name = embedding.__class__.__name__ 110 | vectorstore = Chroma( 111 | collection_name=f"lcbm-hd-{embedding_name}-{transformation_name}", 112 | embedding_function=embedding, 113 | persist_directory="./chromadb", 114 | ) 115 | return get_hyde_retriever( 116 | docs, 117 | embedding, 118 | vectorstore, 119 | collection_name="langchain-docs", 120 | search_kwargs=search_kwargs or _DEFAULT_SEARCH_KWARGS, 121 | transformation_name=transformation_name, 122 | ) 123 | 124 | 125 | RETRIEVER_FACTORIES = { 126 | "basic": _chroma_retriever_factory, 127 | "parent-doc": _chroma_parent_document_retriever_factory, 128 | "hyde": _chroma_hyde_retriever_factory, 129 | } 130 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/langchain_docs/task.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | from langchain.schema.document import Document 4 | 5 | from langchain_benchmarks.rag.tasks.langchain_docs import architectures, indexing 6 | from langchain_benchmarks.rag.tasks.langchain_docs.indexing.retriever_registry import ( 7 | DOCS_FILE, 8 | load_docs_from_parquet, 9 | ) 10 | from langchain_benchmarks.schema import RetrievalTask 11 | 12 | # URL of public LangChain Docs dataset 13 | DATASET_ID = "https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d" 14 | 15 | 16 | def load_cached_docs() -> Iterable[Document]: 17 | """Load the docs from the cached file.""" 18 | return load_docs_from_parquet(DOCS_FILE) 19 | 20 | 21 | LANGCHAIN_DOCS_TASK = RetrievalTask( 22 | name="LangChain Docs Q&A", 23 | dataset_id=DATASET_ID, 24 | retriever_factories=indexing.RETRIEVER_FACTORIES, 25 | architecture_factories=architectures.ARCH_FACTORIES, 26 | get_docs=load_cached_docs, 27 | description=( 28 | """\ 29 | Questions and answers based on a snapshot of the LangChain python docs. 30 | 31 | The environment provides the documents and the retriever information. 32 | 33 | Each example is composed of a question and reference answer. 34 | 35 | Success is measured based on the accuracy of the answer relative to the reference answer. 36 | We also measure the faithfulness of the model's response relative to the retrieved documents (if any). 
37 | """ # noqa: E501 38 | ), 39 | ) 40 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import ( 2 | get_file_names, 3 | ) 4 | 5 | __all__ = ["get_file_names"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import ( 2 | get_file_names, 3 | ) 4 | 5 | __all__ = ["get_file_names"] 6 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/indexing/retriever_registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import zipfile 4 | from pathlib import Path 5 | from typing import Iterable, Optional 6 | 7 | from langchain_benchmarks.rag.utils._downloading import ( 8 | fetch_remote_file, 9 | is_folder_populated, 10 | ) 11 | 12 | logger = logging.getLogger(__name__) 13 | _DIRECTORY = Path(os.path.abspath(__file__)).parent 14 | # Stores the zipped pdfs for this dataset 15 | REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/multi_modal_slide_decks.zip" 16 | DOCS_DIR = _DIRECTORY / "pdfs" 17 | 18 | 19 | def fetch_raw_docs( 20 | filename: Optional[str] = None, docs_dir: Optional[str] = None 21 | ) -> None: 22 | filename = filename or _DIRECTORY / Path(REMOTE_DOCS_FILE).name 23 | docs_dir = docs_dir or DOCS_DIR 24 | if not is_folder_populated(docs_dir): 25 | fetch_remote_file(REMOTE_DOCS_FILE, filename) 26 | with zipfile.ZipFile(filename, "r") as zip_ref: 27 | zip_ref.extractall(docs_dir) 28 | 29 | os.remove(filename) 30 | 31 | 32 | def get_file_names() -> Iterable[Path]: 33 | fetch_raw_docs() 34 | # Traverse the directory and partition the pdfs 35 | for path in DOCS_DIR.rglob("*.pdf"): 36 | # Ignore __MACOSX 37 | if "__MACOSX" in str(path): 38 | continue 39 | yield path 40 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/multi_modal_slide_decks/task.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.schema import RetrievalTask 2 | 3 | # ID of public Multi Modal Slide Decks dataset 4 | DATASET_ID = "https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d" 5 | 6 | MULTI_MODAL_SLIDE_DECKS_TASK = RetrievalTask( 7 | name="Multi-modal slide decks", 8 | dataset_id=DATASET_ID, 9 | retriever_factories={}, 10 | architecture_factories={}, 11 | get_docs={}, 12 | description=( 13 | """\ 14 | This public dataset is a work-in-progress and will be extended over time. 15 | 16 | Questions and answers based on slide decks containing visual tables and charts. 17 | 18 | Each example is composed of a question and reference answer. 19 | 20 | Success is measured based on the accuracy of the answer relative to the reference answer. 
21 | """ # noqa: E501 22 | ), 23 | ) 24 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import ( 2 | get_file_names, 3 | ) 4 | from langchain_benchmarks.rag.tasks.semi_structured_reports.task import ( 5 | SEMI_STRUCTURED_REPORTS_TASK, 6 | ) 7 | 8 | # Please keep this sorted 9 | __all__ = ["get_file_names", "SEMI_STRUCTURED_REPORTS_TASK"] 10 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/indexing/.gitignore: -------------------------------------------------------------------------------- 1 | pdfs/ 2 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import ( 2 | RETRIEVER_FACTORIES, 3 | load_docs, 4 | ) 5 | 6 | __all__ = ["RETRIEVER_FACTORIES", "load_docs"] 7 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/tasks/semi_structured_reports/task.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.rag.tasks.semi_structured_reports import ( 2 | indexing, 3 | ) 4 | from langchain_benchmarks.rag.tasks.semi_structured_reports.indexing.retriever_registry import ( 5 | load_docs, 6 | ) 7 | from langchain_benchmarks.schema import RetrievalTask 8 | 9 | # ID of public Semi-structured Earnings dataset 10 | DATASET_ID = "https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d" 11 | 12 | SEMI_STRUCTURED_REPORTS_TASK = RetrievalTask( 13 | name="Semi-structured Reports", 14 | dataset_id=DATASET_ID, 15 | retriever_factories=indexing.RETRIEVER_FACTORIES, 16 | architecture_factories={}, 17 | get_docs=load_docs, 18 | description=( 19 | """\ 20 | Questions and answers based on PDFs containing tables and charts. 21 | 22 | The task provides the raw documents as well as factory methods to easily index them 23 | and create a retriever. 24 | 25 | Each example is composed of a question and reference answer. 26 | 27 | Success is measured based on the accuracy of the answer relative to the reference answer. 28 | We also measure the faithfulness of the model's response relative to the retrieved documents (if any). 
29 | """ # noqa: E501 30 | ), 31 | ) 32 | -------------------------------------------------------------------------------- /langchain_benchmarks/rag/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/langchain_benchmarks/rag/utils/__init__.py -------------------------------------------------------------------------------- /langchain_benchmarks/rag/utils/_downloading.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | 5 | 6 | def is_folder_populated(folder: str): 7 | if os.path.exists(folder): 8 | return any(os.scandir(folder)) 9 | return False 10 | 11 | 12 | def fetch_remote_file(remote: str, local: str): 13 | if not os.path.exists(local): 14 | print(f"File {local} does not exist. Downloading from GCS...") 15 | if not os.path.exists(os.path.dirname(local)): 16 | os.makedirs(os.path.dirname(local)) 17 | r = requests.get(remote, allow_redirects=True) 18 | with open(local, "wb") as f: 19 | f.write(r.content) 20 | print(f"File {remote} downloaded.") 21 | -------------------------------------------------------------------------------- /langchain_benchmarks/rate_limiting.py: -------------------------------------------------------------------------------- 1 | """Implementation of a rate limiter based on a token bucket.""" 2 | import threading 3 | import time 4 | from typing import Any, Optional 5 | 6 | from langchain.schema.runnable import Runnable, RunnableLambda 7 | from langchain.schema.runnable.utils import Input, Output 8 | 9 | 10 | class RateLimiter: 11 | def __init__( 12 | self, 13 | *, 14 | requests_per_second: float = 1, 15 | check_every_n_seconds: float = 0.1, 16 | max_bucket_size: float = 1, 17 | ) -> None: 18 | """A rate limiter based on a token bucket. 19 | 20 | These *tokens* have NOTHING to do with LLM tokens. They are just 21 | a way to keep track of how many requests can be made at a given time. 22 | 23 | This rate limiter is designed to work in a threaded environment. 24 | 25 | It works by filling up a bucket with tokens at a given rate. Each 26 | request consumes a given number of tokens. If there are not enough 27 | tokens in the bucket, the request is blocked until there are enough 28 | tokens. 29 | 30 | Args: 31 | requests_per_second: The number of tokens to add per second to the bucket. 32 | Must be at least 1. The tokens represent "credit" that can be used 33 | to make requests. 34 | check_every_n_seconds: check whether the tokens are available 35 | every this many seconds. Can be a float to represent 36 | fractions of a second. 37 | max_bucket_size: The maximum number of tokens that can be in the bucket. 38 | This is used to prevent bursts of requests. 39 | """ 40 | # Number of requests that we can make per second. 41 | self.requests_per_second = requests_per_second 42 | # Number of tokens in the bucket. 43 | self.available_tokens = 0.0 44 | self.max_bucket_size = max_bucket_size 45 | # A lock to ensure that tokens can only be consumed by one thread 46 | # at a given time. 47 | self._consume_lock = threading.Lock() 48 | # The last time we tried to consume tokens. 49 | self.last: Optional[time.time] = None 50 | self.check_every_n_seconds = check_every_n_seconds 51 | 52 | def _consume(self) -> bool: 53 | """Consume the given amount of tokens if possible. 
54 | 55 | Returns: 56 | True means that the tokens were consumed, and the caller can proceed to 57 | make the request. A False means that the tokens were not consumed, and 58 | the caller should try again later. 59 | """ 60 | with self._consume_lock: 61 | now = time.time() 62 | 63 | # initialize on first call to avoid a burst 64 | if self.last is None: 65 | self.last = now 66 | 67 | elapsed = now - self.last 68 | 69 | if elapsed * self.requests_per_second >= 1: 70 | self.available_tokens += elapsed * self.requests_per_second 71 | self.last = now 72 | 73 | # Make sure that we don't exceed the bucket size. 74 | # This is used to prevent bursts of requests. 75 | self.available_tokens = min(self.available_tokens, self.max_bucket_size) 76 | 77 | # As long as we have at least one token, we can proceed. 78 | if self.available_tokens >= 1: 79 | self.available_tokens -= 1 80 | return True 81 | 82 | return False 83 | 84 | def wait(self) -> None: 85 | """Blocking call to wait until the given number of tokens are available.""" 86 | while not self._consume(): 87 | time.sleep(self.check_every_n_seconds) 88 | 89 | 90 | def with_rate_limit( 91 | runnable: Runnable[Input, Output], 92 | rate_limiter: RateLimiter, 93 | ) -> Runnable[Input, Output]: 94 | """Add a rate limiter to the runnable. 95 | 96 | Args: 97 | runnable: The runnable to throttle. 98 | rate_limiter: The throttle to use. 99 | 100 | Returns: 101 | A runnable lambda that acts as a throttled passthrough. 102 | """ 103 | 104 | def _wait(input: dict, **kwargs: Any) -> dict: 105 | """Wait for the rate limiter to allow the request to proceed.""" 106 | rate_limiter.wait() 107 | return input 108 | 109 | return RunnableLambda(_wait).with_config({"name": "Wait"}) | runnable 110 | -------------------------------------------------------------------------------- /langchain_benchmarks/registration.py: -------------------------------------------------------------------------------- 1 | """Registry of environments for ease of access.""" 2 | 3 | from langchain_benchmarks.extraction.tasks import ( 4 | chat_extraction, 5 | email_task, 6 | high_cardinality, 7 | ) 8 | from langchain_benchmarks.rag.tasks import ( 9 | LANGCHAIN_DOCS_TASK, 10 | MULTI_MODAL_SLIDE_DECKS_TASK, 11 | SEMI_STRUCTURED_REPORTS_TASK, 12 | ) 13 | from langchain_benchmarks.schema import Registry 14 | from langchain_benchmarks.tool_usage.tasks import ( 15 | multiverse_math, 16 | relational_data, 17 | type_writer, 18 | type_writer_26_funcs, 19 | ) 20 | 21 | # Using lower case naming to make a bit prettier API when used in a notebook 22 | registry = Registry( 23 | tasks=[ 24 | type_writer.TYPE_WRITER_TASK, 25 | type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK, 26 | relational_data.RELATIONAL_DATA_TASK, 27 | multiverse_math.MULTIVERSE_MATH, 28 | email_task.EMAIL_EXTRACTION_TASK, 29 | chat_extraction.CHAT_EXTRACTION_TASK, 30 | LANGCHAIN_DOCS_TASK, 31 | SEMI_STRUCTURED_REPORTS_TASK, 32 | MULTI_MODAL_SLIDE_DECKS_TASK, 33 | high_cardinality.NAME_CORRECTION_TASK, 34 | ] 35 | ) 36 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/README.md: -------------------------------------------------------------------------------- 1 | # Tool usage 2 | 3 | This sub-package includes code to help test how well tools can be used to make 4 | decisions. 
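
A sketch of how the pieces in this sub-package fit together is shown below. It is illustrative only: the model name, the registry lookup by task name, and the example input are assumptions made for the sketch, and any chat model that supports the standard LangChain tool-calling API can be substituted.

```python
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

from langchain_benchmarks import registry
from langchain_benchmarks.tool_usage import StandardAgentFactory

# Pick a registered tool usage task (name taken from registration.py).
task = registry["Tool Usage - Typewriter (1 tool)"]

# The prompt must expose the task instructions, the user input,
# and an agent_scratchpad placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{instructions}"),
        ("human", "{input}"),
        MessagesPlaceholder("agent_scratchpad"),
    ]
)

# Any tool-calling chat model works here; gpt-4o-mini is only an example choice.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

factory = StandardAgentFactory(task, model, prompt)

# Each call to the factory creates a fresh environment and AgentExecutor.
agent = factory()
result = agent.invoke({"input": "abc"})
print(result["output"], result["state"])  # "state" holds what was typed on the paper
```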
-------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/__init__.py: -------------------------------------------------------------------------------- 1 | """Package for helping to evaluate agent runs.""" 2 | from langchain_benchmarks.tool_usage.agents import ( 3 | CustomRunnableAgentFactory, 4 | StandardAgentFactory, 5 | apply_agent_executor_adapter, 6 | ) 7 | from langchain_benchmarks.tool_usage.evaluators import get_eval_config 8 | 9 | # Please keep this list sorted! 10 | __all__ = [ 11 | "apply_agent_executor_adapter", 12 | "CustomRunnableAgentFactory", 13 | "get_eval_config", 14 | "StandardAgentFactory", 15 | ] 16 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter 2 | from langchain_benchmarks.tool_usage.agents.runnable_agent import ( 3 | CustomRunnableAgentFactory, 4 | ) 5 | from langchain_benchmarks.tool_usage.agents.tool_using_agent import StandardAgentFactory 6 | 7 | __all__ = [ 8 | "apply_agent_executor_adapter", 9 | "CustomRunnableAgentFactory", 10 | "StandardAgentFactory", 11 | ] 12 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/adapters.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Optional 2 | 3 | from langchain.agents import AgentExecutor 4 | from langchain_core.runnables import Runnable, RunnableLambda, RunnablePassthrough 5 | 6 | 7 | def _ensure_output_exists(inputs: dict) -> dict: 8 | """Make sure that the output key is always present.""" 9 | if "output" not in inputs: 10 | return {"output": "", **inputs} 11 | return inputs 12 | 13 | 14 | def apply_agent_executor_adapter( 15 | agent_executor: AgentExecutor, 16 | *, 17 | state_reader: Optional[Callable[[], Any]] = None, 18 | ) -> Runnable: 19 | """An adapter for the agent executor to standardize its input and output. 20 | 21 | 1) Map `question` to `input` (`question` is used in the datasets, 22 | but `input` is used in the agent executor) 23 | 2) Ensure that `output` is always returned (will be set to "" if missing) -- 24 | note that this may be relaxed after more updates in the eval config. 25 | 3) Populate `state` key in the response of the agent with the system state 26 | if a state reader is provided. 27 | 28 | Args: 29 | agent_executor: the agent executor 30 | state_reader: A callable without parameters that if invoked will return 31 | the state of the environment. Used to populate the 'state' key. 32 | 33 | Returns: 34 | a new runnable with a standardized output. 
35 | """ 36 | 37 | def _read_state(*args: Any, **kwargs: Any) -> Any: 38 | """Read the state of the environment.""" 39 | if state_reader is not None: 40 | return state_reader() 41 | else: 42 | return None 43 | 44 | runnable = agent_executor | RunnableLambda(_ensure_output_exists).with_config( 45 | {"run_name": "Ensure Output"} 46 | ) 47 | 48 | if state_reader is not None: 49 | runnable = runnable | RunnablePassthrough.assign(state=_read_state).with_config( 50 | {"run_name": "Read Env State"} 51 | ) 52 | return runnable 53 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from langchain_core.runnables import Runnable 4 | 5 | 6 | class AgentFactory(abc.ABC): 7 | """Abstract class for agent factory""" 8 | 9 | @abc.abstractmethod 10 | def __call__(self) -> Runnable: 11 | """Create a new agent""" 12 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/runnable_agent.py: -------------------------------------------------------------------------------- 1 | """Factory for creating agents for the tool usage task.""" 2 | from typing import Union 3 | 4 | from langchain.agents.agent import ( 5 | AgentExecutor, 6 | BaseMultiActionAgent, 7 | BaseSingleActionAgent, 8 | ) 9 | from langchain_core.runnables import Runnable 10 | 11 | from langchain_benchmarks.schema import ToolUsageTask 12 | from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter 13 | from langchain_benchmarks.tool_usage.agents.base import AgentFactory 14 | 15 | 16 | class CustomRunnableAgentFactory(AgentFactory): 17 | """A factory for creating tool using agents. 18 | 19 | A factory for agents that do not leverage any special JSON mode for 20 | function usage; instead all function invocation behavior is implemented solely 21 | through prompt engineering and parsing. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | task: ToolUsageTask, 27 | agent: Union[Runnable, BaseSingleActionAgent, BaseMultiActionAgent], 28 | ) -> None: 29 | """Create an agent factory for the given tool usage task. 30 | 31 | Note: The agent should not be stateful, as it will be reused across 32 | multiple runs. 33 | 34 | Args: 35 | task: The task to create an agent factory for 36 | agent: The agent to use 37 | """ 38 | self.task = task 39 | self.agent = agent 40 | 41 | def __call__(self) -> Runnable: 42 | env = self.task.create_environment() 43 | executor = AgentExecutor( 44 | agent=self.agent, 45 | tools=env.tools, 46 | handle_parsing_errors=True, 47 | return_intermediate_steps=True, 48 | ) 49 | 50 | return apply_agent_executor_adapter( 51 | executor, state_reader=env.read_state 52 | ).with_config({"run_name": "Agent", "metadata": {"task": self.task.name}}) 53 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/agents/tool_using_agent.py: -------------------------------------------------------------------------------- 1 | """Factory for creating agents. 2 | 3 | This is useful for agents that follow the standard LangChain tool format. 
4 | """ 5 | from typing import Optional 6 | 7 | from langchain.agents import AgentExecutor, create_tool_calling_agent 8 | from langchain_core.language_models import BaseChatModel 9 | from langchain_core.prompts import ChatPromptTemplate 10 | from langchain_core.runnables import Runnable 11 | 12 | from langchain_benchmarks.rate_limiting import RateLimiter, with_rate_limit 13 | from langchain_benchmarks.schema import ToolUsageTask 14 | from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter 15 | from langchain_benchmarks.tool_usage.agents.base import AgentFactory 16 | 17 | 18 | class StandardAgentFactory(AgentFactory): 19 | """A standard agent factory. 20 | 21 | Use this factory with chat models that support the standard LangChain tool 22 | calling API where the chat model populates the tool_calls attribute on AIMessage. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | task: ToolUsageTask, 28 | model: BaseChatModel, 29 | prompt: ChatPromptTemplate, 30 | *, 31 | rate_limiter: Optional[RateLimiter] = None, 32 | ) -> None: 33 | """Create an agent factory for the given tool usage task. 34 | 35 | Args: 36 | task: The task to create an agent factory for 37 | model: chat model to use, must support tool usage 38 | prompt: This is a chat prompt at the moment. 39 | Must include an agent_scratchpad 40 | 41 | For example, 42 | 43 | ChatPromptTemplate.from_messages( 44 | [ 45 | ("system", "{instructions}"), 46 | ("human", "{input}"), 47 | MessagesPlaceholder("agent_scratchpad"), 48 | ] 49 | ) 50 | rate_limiter: will be appended to the agent runnable 51 | """ 52 | self.task = task 53 | self.model = model 54 | self.prompt = prompt 55 | self.rate_limiter = rate_limiter 56 | 57 | def __call__(self) -> Runnable: 58 | """Call the factory to create Runnable agent.""" 59 | 60 | env = self.task.create_environment() 61 | 62 | if "instructions" in self.prompt.input_variables: 63 | finalized_prompt = self.prompt.partial(instructions=self.task.instructions) 64 | else: 65 | finalized_prompt = self.prompt 66 | 67 | agent = create_tool_calling_agent(self.model, env.tools, finalized_prompt) 68 | 69 | if self.rate_limiter: 70 | agent = with_rate_limit(agent, self.rate_limiter) 71 | 72 | executor = AgentExecutor( 73 | agent=agent, 74 | tools=env.tools, 75 | handle_parsing_errors=True, 76 | return_intermediate_steps=True, 77 | ) 78 | 79 | return apply_agent_executor_adapter( 80 | executor, state_reader=env.read_state 81 | ).with_config({"run_name": "Agent", "metadata": {"task": self.task.name}}) 82 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/prompts.py: -------------------------------------------------------------------------------- 1 | from langchain_core.prompts import PromptTemplate 2 | 3 | MATH_TEMPLATE = """\ 4 | You live in an alternate universe. Do not assume that you know anything. 5 | You are a teacher grading a quiz. 6 | You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT. 7 | 8 | Example Format: 9 | QUESTION: question here 10 | STUDENT ANSWER: student's answer here 11 | TRUE ANSWER: true answer here 12 | GRADE: CORRECT or INCORRECT here 13 | 14 | Given that you live in an alternate universe the TRUE answer may be different from what you expect. That's OK! 15 | 16 | Grade the student answers based ONLY on whether it matches the TRUE answer. 
Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 17 | 18 | QUESTION: {query} 19 | STUDENT ANSWER: {result} 20 | TRUE ANSWER: {answer} 21 | GRADE:""" 22 | QA_TEMPLATE_FOR_MULTIVERSE_MATH = PromptTemplate( 23 | input_variables=["result", "answer"], template=MATH_TEMPLATE 24 | ) 25 | 26 | MATH_TEMPLATE_NO_QUESTION = """\ 27 | Compare the INPUT_A and INPUT_B and determine whether the numeric result in them is the same. 28 | 29 | If the result is the same, reply with CORRECT. If the result is different, reply with INCORRECT. 30 | 31 | Example Format: 32 | INPUT_A: input_a here 33 | INPUT_B: input_b here 34 | COMPARISON: CORRECT or INCORRECT here 35 | 36 | Ignore differences in punctuation and phrasing between the student answer and true answer, please only compare the first 4 decimal digits. 37 | 38 | For instance if INPUT_A = 123.6751345 and INPUT_B = 123.6751456 you should return CORRECT, since the first 4 decimal points match. 39 | 40 | Begin! 41 | 42 | INPUT_A: {answer} 43 | INPUT_B: {result} 44 | COMPARISON:""" 45 | 46 | # Version without the query 47 | QA_TEMPLATE_FOR_MULTIVERSE_MATH_WITHOUT_QUESTION = PromptTemplate( 48 | input_variables=["result", "answer"], template=MATH_TEMPLATE_NO_QUESTION 49 | ) 50 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/langchain_benchmarks/tool_usage/tasks/__init__.py -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/tasks/type_writer.py: -------------------------------------------------------------------------------- 1 | """A task where the agent must type a given string one letter at a time. 2 | 3 | In this variation of the task, the agent is given a single function, 4 | that takes a letter as an argument. 5 | """ 6 | import dataclasses 7 | from typing import Any, Callable, List, cast 8 | 9 | from langchain.tools import BaseTool, tool 10 | 11 | from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask 12 | 13 | 14 | @dataclasses.dataclass 15 | class Paper: 16 | """A piece of paper that the agent can write on.""" 17 | 18 | content: str 19 | 20 | 21 | def create_typer(paper: Paper) -> Callable[[str], str]: 22 | """Create a function that types the given letter.""" 23 | 24 | def type_letter(letter: str) -> str: 25 | """Print the given letter on the paper.""" 26 | if len(letter) != 1: 27 | return "ERROR: The letter must be a single character." 28 | paper.content += letter 29 | return "OK" 30 | 31 | return type_letter 32 | 33 | 34 | # PUBLIC API 35 | 36 | 37 | def get_environment() -> ToolUsageEnvironment: 38 | """Create tools and state reader. 39 | 40 | Attention: this is a factory function, so it will create a new environment 41 | every time it is called. The paper contains state. 42 | 43 | Returns: 44 | A tuple of (tools, state_reader). 
45 | """ 46 | paper = Paper(content="") # Start with an empty piece of paper 47 | 48 | def _read_state() -> Any: 49 | """Read the state of the environment.""" 50 | return paper.content 51 | 52 | tools = cast(List[BaseTool], [tool(create_typer(paper))]) 53 | 54 | return ToolUsageEnvironment( 55 | tools=tools, 56 | read_state=_read_state, 57 | ) 58 | 59 | 60 | TYPE_WRITER_TASK = ToolUsageTask( 61 | name="Tool Usage - Typewriter (1 tool)", 62 | dataset_id="https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d", 63 | create_environment=get_environment, 64 | instructions=( 65 | "Repeat the given string using the provided tools. " 66 | "Do not write anything else or provide any explanations. " 67 | "For example, if the string is 'abc', you must print the letters " 68 | "'a', 'b', and 'c' one at a time and in that order. " 69 | ), 70 | description=( 71 | """\ 72 | Environment with a single tool that accepts a single letter as input, and \ 73 | prints it on a piece of virtual paper. 74 | 75 | The objective of this task is to evaluate the ability of the model to use the provided \ 76 | tools to repeat a given input string. 77 | 78 | For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \ 79 | in that order. 80 | 81 | The dataset includes examples of varying difficulty. The difficulty is measured \ 82 | by the length of the string. 83 | """ 84 | ), 85 | eval_params={ 86 | # For this task, the agent's output is irrelevant 87 | # what we care about is the final state of the environment 88 | # (i.e., what's written on the virtual paper) 89 | "output_evaluation": "none", 90 | }, 91 | ) 92 | 93 | 94 | STRINGS_TO_TYPE = [ 95 | # letter repetition 96 | "a", 97 | "aa", 98 | "aaa", 99 | "aaaa", 100 | # 3-letter words 101 | "dog", 102 | "cat", 103 | # 4-letter words 104 | "hand", 105 | "head", 106 | # 5-letter words 107 | "house", 108 | "horse", 109 | # 6-letter words 110 | "school", 111 | "church", 112 | # 7-letter words 113 | "teacher", 114 | "student", 115 | # 8-letter words 116 | "computer", 117 | "keyboard", 118 | # 9-letter words 119 | "university", 120 | "dictionary", 121 | # 10-letter words 122 | "information", 123 | "communication", 124 | ] 125 | 126 | 127 | def _create_dataset(strings: List[str]) -> List[dict]: 128 | """Create the dataset.""" 129 | dataset = [] 130 | for string in strings: 131 | dataset.append( 132 | { 133 | "question": string, 134 | "expected_steps": ["type_letter"] * len(string), 135 | "state": string, 136 | } 137 | ) 138 | return dataset 139 | 140 | 141 | DATASET = _create_dataset(STRINGS_TO_TYPE) 142 | 143 | 144 | def _create_dataset() -> None: 145 | """Create a dataset with the langsmith client.""" 146 | from langsmith.client import Client 147 | 148 | client = Client() 149 | dataset = client.create_dataset( 150 | dataset_name=TYPE_WRITER_TASK.name, 151 | description=TYPE_WRITER_TASK.description, 152 | ) 153 | 154 | for example in DATASET: 155 | client.create_example( 156 | inputs={ 157 | "question": example["question"], 158 | }, 159 | outputs={ 160 | "reference": example["state"], 161 | "expected_steps": example["expected_steps"], 162 | "state": example["state"], 163 | }, 164 | dataset_id=dataset.id, 165 | ) 166 | -------------------------------------------------------------------------------- /langchain_benchmarks/tool_usage/tasks/type_writer_26_funcs.py: -------------------------------------------------------------------------------- 1 | """A task where the agent must type a given string one letter at a time. 
2 | 3 | In this variation of the task, the agent is given access to 26 parameterless functions, 4 | each representing a letter of the alphabet. 5 | """ 6 | import dataclasses 7 | from typing import Any, Callable, List, cast 8 | 9 | from langchain.tools import BaseTool, tool 10 | 11 | from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask 12 | 13 | 14 | @dataclasses.dataclass 15 | class Paper: 16 | """A piece of paper that the agent can write on.""" 17 | 18 | content: str 19 | 20 | 21 | def _create_typing_func(letter: str, paper: Paper) -> Callable[[], str]: 22 | """Create a function that types the given letter.""" 23 | 24 | def func() -> str: 25 | paper.content += letter 26 | return "OK" 27 | 28 | func.__doc__ = f'Run to Type the letter "{letter}".' 29 | func.__name__ = letter 30 | return func 31 | 32 | 33 | def _get_available_functions(paper: Paper) -> List[Callable]: 34 | """Get all the available functions.""" 35 | return [ 36 | _create_typing_func(letter, paper) for letter in "abcdefghijklmnopqrstuvwxyz" 37 | ] 38 | 39 | 40 | # PUBLIC API 41 | 42 | 43 | def get_environment() -> ToolUsageEnvironment: 44 | """Create tools and state reader. 45 | 46 | Attention: this is a factory function, so it will create a new environment 47 | every time it is called. The paper contains state. 48 | 49 | Returns: 50 | A ToolUsageEnvironment with the tools and a state reader. 51 | """ 52 | paper = Paper(content="") # Start with an empty piece of paper 53 | functions = _get_available_functions(paper) 54 | 55 | def _read_state() -> Any: 56 | """Read the state of the environment.""" 57 | return paper.content 58 | 59 | tools = cast(List[BaseTool], [tool(f) for f in functions]) 60 | 61 | return ToolUsageEnvironment( 62 | tools=tools, 63 | read_state=_read_state, 64 | ) 65 | 66 | 67 | TYPE_WRITER_26_FUNCS_TASK = ToolUsageTask( 68 | name="Tool Usage - Typewriter (26 tools)", 69 | dataset_id="https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d", 70 | create_environment=get_environment, 71 | instructions=( 72 | "Repeat the given string by using the provided tools. " 73 | "Do not write anything else or provide any explanations. " 74 | "For example, if the string is 'abc', you must invoke the tools " 75 | "'a', 'b', and 'c' in that order. " 76 | "Please invoke the functions without any arguments." 77 | ), 78 | description=( 79 | """\ 80 | Environment with 26 tools; each tool represents a letter of the alphabet. 81 | 82 | The objective of this task is to evaluate the model's ability to use tools 83 | for a simple repetition task. 84 | 85 | For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \ 86 | in that order. 87 | 88 | The dataset includes examples of varying difficulty. The difficulty is measured \ 89 | by the length of the string. 90 | 91 | This is a variation of the typewriter task, where 26 parameterless tools are 92 | given instead of a single tool that takes a letter as an argument.
93 | """ 94 | ), 95 | eval_params={ 96 | # For this task, the agent's output is irrelevant 97 | # what we care about is the final state of the environment 98 | # (i.e., what's written on the virtual paper) 99 | "output_evaluation": "none", 100 | }, 101 | ) 102 | 103 | STRINGS_TO_TYPE = [ 104 | # letter repetition 105 | "a", 106 | "aa", 107 | "aaa", 108 | "aaaa", 109 | # 3-letter words 110 | "dog", 111 | "cat", 112 | # 4-letter words 113 | "hand", 114 | "head", 115 | # 5-letter words 116 | "house", 117 | "horse", 118 | # 6-letter words 119 | "school", 120 | "church", 121 | # 7-letter words 122 | "teacher", 123 | "student", 124 | # 8-letter words 125 | "computer", 126 | "keyboard", 127 | # 9-letter words 128 | "university", 129 | "dictionary", 130 | # 10-letter words 131 | "information", 132 | "communication", 133 | ] 134 | 135 | 136 | def _create_dataset(strings: List[str]) -> List[dict]: 137 | """Create the dataset.""" 138 | dataset = [] 139 | for string in strings: 140 | dataset.append( 141 | { 142 | "question": string, 143 | "expected_steps": [c for c in string], 144 | "state": string, 145 | } 146 | ) 147 | return dataset 148 | 149 | 150 | DATASET = _create_dataset(STRINGS_TO_TYPE) 151 | 152 | 153 | def _create_dataset() -> None: 154 | """Create a dataset with the langsmith client.""" 155 | from langsmith.client import Client 156 | 157 | client = Client() 158 | dataset = client.create_dataset( 159 | dataset_name=TYPE_WRITER_26_FUNCS_TASK.name, 160 | description=TYPE_WRITER_26_FUNCS_TASK.description, 161 | ) 162 | 163 | for example in DATASET: 164 | client.create_example( 165 | inputs={ 166 | "question": example["question"], 167 | }, 168 | outputs={ 169 | "reference": example["state"], 170 | "expected_steps": example["expected_steps"], 171 | "state": example["state"], 172 | }, 173 | dataset_id=dataset.id, 174 | ) 175 | -------------------------------------------------------------------------------- /langchain_benchmarks/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.utils._langsmith import run_without_langsmith 2 | 3 | __all__ = ["run_without_langsmith"] 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langchain-benchmarks" 3 | version = "0.0.15" 4 | description = "🦜💪 Flex those feathers!" 
5 | authors = ["LangChain AI"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.9" 11 | langchain = "^0.3" 12 | langchain-community = "^0.3" 13 | langchain-core= "^0.3.12" 14 | langsmith = ">=0.0.70" 15 | tqdm = "^4" 16 | ipywidgets = "^8" 17 | tabulate = ">=0.8.0" 18 | langchain-openai = "^0.2" 19 | 20 | [tool.poetry.group.dev] 21 | optional = true 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | jupyter = "^1.0.0" 25 | 26 | [tool.poetry.group.typing] 27 | optional = true 28 | 29 | [tool.poetry.group.typing.dependencies] 30 | mypy = "^1.7.0" 31 | [tool.poetry.group.lint] 32 | optional = true 33 | 34 | [tool.poetry.group.lint.dependencies] 35 | ruff = "^0.1.5" 36 | 37 | [tool.poetry.group.docs] 38 | optional = true 39 | 40 | [tool.poetry.group.docs.dependencies] 41 | nbsphinx = ">=0.8.9" 42 | sphinx = ">=5.2.0" 43 | sphinx-autobuild = "^2021.3.14" 44 | sphinx_book_theme = "^1.0.0" 45 | myst-nb = { version = "^1.0.0", python = "^3.9" } 46 | toml = "^0.10.2" 47 | sphinx-copybutton = ">=0.5.1" 48 | 49 | [tool.poetry.group.test] 50 | optional = true 51 | 52 | [tool.poetry.group.test.dependencies] 53 | pytest = "^7.2.1" 54 | pytest-cov = "^4.0.0" 55 | pytest-asyncio = "^0.21.1" 56 | pytest-mock = "^3.11.1" 57 | pytest-socket = "^0.6.0" 58 | pytest-watch = "^4.2.0" 59 | pytest-timeout = "^2.2.0" 60 | freezegun = "^1.3.1" 61 | langchain-anthropic = "^0.2" 62 | langchain-fireworks = "^0.2" 63 | langchain-mistralai = "^0.2" 64 | langchain-groq = "^0.2" 65 | langchain-core = "^0.3.12" 66 | faiss-cpu = ">=1.8.0" 67 | 68 | [tool.ruff] 69 | select = [ 70 | "E", # pycodestyle 71 | "F", # pyflakes 72 | "I", # isort 73 | ] 74 | extend-include = ["*.ipynb"] 75 | 76 | # Same as Black. 77 | line-length = 88 78 | 79 | [tool.ruff.isort] 80 | known-first-party = ["langchain-benchmarks"] 81 | 82 | [tool.mypy] 83 | disallow_untyped_defs = "True" 84 | ignore_missing_imports = "True" 85 | 86 | [tool.coverage.run] 87 | omit = [ 88 | "tests/*", 89 | ] 90 | 91 | 92 | [build-system] 93 | requires = ["poetry-core"] 94 | build-backend = "poetry.core.masonry.api" 95 | 96 | [tool.pytest.ini_options] 97 | # --strict-markers will raise errors on unknown marks. 98 | # https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks 99 | # 100 | # https://docs.pytest.org/en/7.1.x/reference/reference.html 101 | # --strict-config any warnings encountered while parsing the `pytest` 102 | # section of the configuration file raise errors. 103 | addopts = "--strict-markers --strict-config --durations=5 -vv" 104 | # Global timeout for all tests. 
There should be a good reason for a test to 105 | # take more than 5 seconds 106 | timeout = 5 107 | -------------------------------------------------------------------------------- /scripts/check_datasets.py: -------------------------------------------------------------------------------- 1 | """Script to check that all registered datasets can be downloaded.""" 2 | from langchain_benchmarks import registry 3 | from langchain_benchmarks.utils._langsmith import exists_public_dataset 4 | 5 | 6 | def check_datasets() -> bool: 7 | """Check that all tasks can be downloaded.""" 8 | ok = True 9 | for task in registry.tasks: 10 | print(f"Checking {task.name}...") 11 | if exists_public_dataset(task.dataset_id): 12 | print(" OK") 13 | else: 14 | ok = False 15 | print(" ERROR: Dataset not found") 16 | return ok 17 | 18 | 19 | if __name__ == "__main__": 20 | ok = check_datasets() 21 | if not ok: 22 | exit(1) 23 | -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting OSS Vulnerabilities 4 | 5 | LangChain is partnered with [huntr by Protect AI](https://huntr.com/) to provide 6 | a bounty program for our open source projects. 7 | 8 | Please report security vulnerabilities associated with the LangChain 9 | open source projects by visiting the following link: 10 | 11 | [https://huntr.com/bounties/disclose/](https://huntr.com/bounties/disclose/?target=https%3A%2F%2Fgithub.com%2Flangchain-ai%2Flangchain&validSearch=true) 12 | 13 | Before reporting a vulnerability, please review: 14 | 15 | 1) In-Scope Targets and Out-of-Scope Targets below. 16 | 2) The [langchain-ai/langchain](https://python.langchain.com/docs/contributing/repo_structure) monorepo structure. 17 | 3) LangChain [security guidelines](https://python.langchain.com/docs/security) to 18 | understand what we consider to be a security vulnerability vs. developer 19 | responsibility. 20 | 21 | ### In-Scope Targets 22 | 23 | The following packages and repositories are eligible for bug bounties: 24 | 25 | - langchain-core 26 | - langchain (see exceptions) 27 | - langchain-community (see exceptions) 28 | - langgraph 29 | - langserve 30 | 31 | ### Out of Scope Targets 32 | 33 | All out of scope targets defined by huntr as well as: 34 | 35 | - **langchain-experimental**: This repository is for experimental code and is not 36 | eligible for bug bounties; bug reports to it will be marked as interesting or a waste of 37 | time and published with no bounty attached. 38 | - **tools**: Tools in either langchain or langchain-community are not eligible for bug 39 | bounties. This includes the following directories: 40 | - langchain/tools 41 | - langchain-community/tools 42 | - Please review our [security guidelines](https://python.langchain.com/docs/security) 43 | for more details, but generally tools interact with the real world. Developers are 44 | expected to understand the security implications of their code and are responsible 45 | for the security of their tools. 46 | - Code documented with security notices. This will be decided on a case by 47 | case basis, but likely will not be eligible for a bounty as the code is already 48 | documented with guidelines for developers that should be followed for making their 49 | application secure. 50 | - Any LangSmith related repositories or APIs; see below.
51 | 52 | ## Reporting LangSmith Vulnerabilities 53 | 54 | Please report security vulnerabilities associated with LangSmith by email to `security@langchain.dev`. 55 | 56 | - LangSmith site: https://smith.langchain.com 57 | - SDK client: https://github.com/langchain-ai/langsmith-sdk 58 | 59 | ### Other Security Concerns 60 | 61 | For any other security concerns, please contact us at `security@langchain.dev`. 62 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/extraction/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/extraction/test_email_extraction.py: -------------------------------------------------------------------------------- 1 | def test_email_extraction() -> None: 2 | """Try to import the email task.""" 3 | -------------------------------------------------------------------------------- /tests/unit_tests/extraction/test_import_stuff.py: -------------------------------------------------------------------------------- 1 | def test_import_stuff() -> None: 2 | """Test that all imports work.""" 3 | from langchain_benchmarks.extraction import ( # noqa: F401 4 | evaluators, 5 | implementations, 6 | ) 7 | -------------------------------------------------------------------------------- /tests/unit_tests/rag/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/rag/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/rag/test_langchain_docs.py: -------------------------------------------------------------------------------- 1 | def test_import_rag() -> None: 2 | """Test that the rag tasks can be imported.""" 3 | from langchain_benchmarks.rag import evaluators, tasks # noqa: F401 4 | 5 | 6 | def test_import_langchain_docs() -> None: 7 | """Test that the langchain_docs tasks can be imported.""" 8 | from langchain_benchmarks.rag.tasks.langchain_docs import ( # noqa: F401 9 | DATASET_ID, 10 | LANGCHAIN_DOCS_TASK, 11 | architectures, 12 | indexing, 13 | ) 14 | -------------------------------------------------------------------------------- /tests/unit_tests/test_model_registry.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from langchain_benchmarks.schema import ModelRegistry, RegisteredModel 4 | 5 | # Create some sample RegisteredModel instances for testing 6 | SAMPLE_MODELS = [ 7 | RegisteredModel( 8 | "model1", "fireworks", "Description 1", 
{"param1": "value1"}, "chat" 9 | ), 10 | RegisteredModel("model2", "openai", "Description 2", {"param2": "value2"}, "llm"), 11 | ] 12 | 13 | 14 | @pytest.fixture 15 | def sample_registry() -> ModelRegistry: 16 | return ModelRegistry(SAMPLE_MODELS) 17 | 18 | 19 | def test_init() -> None: 20 | # Test the constructor of ModelRegistry 21 | registry = ModelRegistry(SAMPLE_MODELS) 22 | assert len(registry.registered_models) == 2 23 | 24 | 25 | def test_get_model(sample_registry: ModelRegistry) -> None: 26 | # Test the get_model method 27 | model = sample_registry.get_model("model1") 28 | assert model.name == "model1" 29 | 30 | 31 | def test_filter(sample_registry: ModelRegistry) -> None: 32 | # Test the filter method 33 | filtered_registry = sample_registry.filter(type="chat") 34 | assert len(filtered_registry.registered_models) == 1 35 | assert filtered_registry.registered_models[0].type == "chat" 36 | 37 | 38 | def test_repr_html(sample_registry: ModelRegistry) -> None: 39 | # Test the _repr_html_ method 40 | html_representation = sample_registry._repr_html_() 41 | assert "" in html_representation 42 | 43 | 44 | def test_len(sample_registry: ModelRegistry) -> None: 45 | # Test the __len__ method 46 | assert len(sample_registry) == 2 47 | 48 | 49 | def test_iter(sample_registry: ModelRegistry) -> None: 50 | # Test the __iter__ method 51 | models = list(iter(sample_registry)) 52 | assert len(models) == 2 53 | assert isinstance(models[0], RegisteredModel) 54 | 55 | 56 | def test_getitem(sample_registry: ModelRegistry) -> None: 57 | # Test the __getitem__ method for integer and string keys 58 | model = sample_registry[0] 59 | assert model.name == "model1" 60 | model = sample_registry["model2"] 61 | assert model.name == "model2" 62 | 63 | 64 | def test_getitem_slice(sample_registry: ModelRegistry) -> None: 65 | # Test the __getitem__ method for slices 66 | sliced_registry = sample_registry[:1] 67 | assert len(sliced_registry.registered_models) == 1 68 | assert sliced_registry.registered_models[0].name == "model1" 69 | -------------------------------------------------------------------------------- /tests/unit_tests/test_public_api.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks import __all__ 2 | 3 | 4 | def test_public_api() -> None: 5 | """Test that the public API is correct.""" 6 | # This test will also fail if __all__ is not sorted. 7 | # Please keep it sorted! 8 | assert __all__ == sorted( 9 | [ 10 | "__version__", 11 | "clone_public_dataset", 12 | "download_public_dataset", 13 | "model_registry", 14 | "RateLimiter", 15 | "registry", 16 | ], 17 | key=lambda x: x.lower(), 18 | ) 19 | -------------------------------------------------------------------------------- /tests/unit_tests/test_rate_limiting.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from freezegun import freeze_time 3 | 4 | from langchain_benchmarks.rate_limiting import RateLimiter 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "delta_time, requests_per_second, max_bucket_size, expected_result", 9 | [ 10 | ( 11 | 1, 12 | 1, 13 | 1, 14 | True, 15 | ), 16 | ( 17 | 0.5, 18 | 1, 19 | 1, 20 | False, 21 | ), 22 | ( 23 | 0.5, 24 | 2, 25 | 1, 26 | True, 27 | ), 28 | ], 29 | ) 30 | def test_consume( 31 | delta_time: float, 32 | requests_per_second: float, 33 | max_bucket_size: float, 34 | expected_result: bool, 35 | ) -> None: 36 | """Test the consumption of tokens over time. 
37 | 38 | Args: 39 | delta_time: The time in seconds to add to the initial time. 40 | requests_per_second: The rate at which tokens are added per second. 41 | max_bucket_size: The maximum size of the token bucket. 42 | expected_result: The expected result of the consume operation. 43 | """ 44 | rate_limiter = RateLimiter( 45 | requests_per_second=requests_per_second, max_bucket_size=max_bucket_size 46 | ) 47 | 48 | with freeze_time(auto_tick_seconds=delta_time): 49 | assert rate_limiter._consume() is False 50 | assert rate_limiter._consume() is expected_result 51 | 52 | 53 | def test_consume_count_tokens() -> None: 54 | """Test to check that the bucket size is used correctly.""" 55 | rate_limiter = RateLimiter( 56 | requests_per_second=60, 57 | max_bucket_size=10, 58 | ) 59 | 60 | with freeze_time(auto_tick_seconds=100): 61 | assert rate_limiter._consume() is False 62 | assert rate_limiter._consume() is True 63 | assert ( 64 | rate_limiter.available_tokens == 9 65 | ) # Max bucket size is 10, so 10 - 1 = 9 66 | -------------------------------------------------------------------------------- /tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import unittest.mock as mock 3 | import uuid 4 | from contextlib import contextmanager 5 | from typing import Any, Generator, List, Mapping, Optional, Sequence 6 | from uuid import UUID 7 | 8 | from langsmith.client import ID_TYPE 9 | from langsmith.schemas import Dataset, Example 10 | from langsmith.utils import LangSmithNotFoundError 11 | 12 | from langchain_benchmarks.utils._langsmith import clone_public_dataset 13 | 14 | 15 | # Define a mock Client class that overrides the required methods 16 | class MockLangSmithClient: 17 | def __init__(self) -> None: 18 | """Initialize the mock client.""" 19 | self.datasets = [] 20 | self.examples = [] 21 | 22 | def read_dataset(self, dataset_name: str) -> Dataset: 23 | for dataset in self.datasets: 24 | if dataset.name == dataset_name: 25 | return dataset 26 | raise LangSmithNotFoundError(f'Dataset "{dataset_name}" not found.') 27 | 28 | def create_dataset(self, dataset_name: str) -> Dataset: 29 | # Simulate creating a dataset and returning a mock Dataset object 30 | dataset = Dataset( 31 | id=UUID(int=3), name=dataset_name, created_at=datetime.datetime(2021, 1, 1) 32 | ) 33 | self.datasets.append(dataset) 34 | return dataset 35 | 36 | def create_examples( 37 | self, 38 | *, 39 | inputs: Sequence[Mapping[str, Any]], 40 | outputs: Optional[Sequence[Optional[Mapping[str, Any]]]] = None, 41 | dataset_id: Optional[ID_TYPE] = None, 42 | dataset_name: Optional[str] = None, 43 | max_concurrency: int = 10, 44 | ) -> None: 45 | """Create examples""" 46 | examples = [] 47 | for idx, (input, output) in enumerate(zip(inputs, outputs)): 48 | examples.append( 49 | Example( 50 | id=UUID(int=idx), 51 | inputs=input, 52 | outputs=output, 53 | created_at=datetime.datetime(2021, 1, 1), 54 | dataset_id=dataset_id, 55 | dataset_name=dataset_name, 56 | ) 57 | ) 58 | 59 | return self.examples.extend(examples) 60 | 61 | def list_shared_examples(self, public_dataset_token: str) -> List[Example]: 62 | # Simulate fetching shared examples and returning a list of Example objects 63 | example1 = Example( 64 | id=UUID(int=1), 65 | inputs={"a": 1}, 66 | outputs={}, 67 | created_at=datetime.datetime(2021, 1, 1), 68 | dataset_id=public_dataset_token, 69 | ) 70 | example2 = Example( 71 | id=UUID(int=2), 72 | inputs={"b": 2}, 73 | outputs={}, 74 | 
created_at=datetime.datetime(2021, 1, 1), 75 | dataset_id=public_dataset_token, 76 | ) 77 | return [example1, example2] 78 | 79 | def read_shared_dataset(self, public_dataset_token: str) -> Dataset: 80 | # Simulate fetching shared dataset and returning a Dataset object 81 | dataset = Dataset( 82 | id=UUID(int=3), 83 | name="my_dataset", 84 | created_at=datetime.datetime(2021, 1, 1), 85 | owner_id=public_dataset_token, 86 | ) 87 | return dataset 88 | 89 | 90 | @contextmanager 91 | def mock_langsmith_client() -> Generator[None, None, None]: 92 | """Mock the langsmith Client class.""" 93 | from langchain_benchmarks.utils import _langsmith 94 | 95 | mock_client = MockLangSmithClient() 96 | 97 | with mock.patch.object(_langsmith, "Client") as client: 98 | client.return_value = mock_client 99 | yield mock_client 100 | 101 | 102 | def test_clone_dataset() -> None: 103 | # Call the clone_dataset function with mock data 104 | public_dataset_token = str(uuid.UUID(int=3)) 105 | dataset_name = "my_dataset" 106 | 107 | with mock_langsmith_client() as mock_client: 108 | clone_public_dataset(public_dataset_token, dataset_name=dataset_name) 109 | assert mock_client.datasets[0].name == dataset_name 110 | assert len(mock_client.examples) == 2 111 | 112 | # Check idempotency 113 | clone_public_dataset(public_dataset_token, dataset_name=dataset_name) 114 | assert len(mock_client.examples) == 2 115 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-benchmarks/34cd281494e61821ce84f50eaaa786b9c78b98b1/tests/unit_tests/tool_usage/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_evaluator.py: -------------------------------------------------------------------------------- 1 | """Test the standard agent evaluator.""" 2 | 3 | import pytest 4 | from langchain.schema import AgentAction 5 | 6 | from langchain_benchmarks.tool_usage.evaluators import compare_outputs 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "run_outputs, example_outputs, expected_results", 11 | [ 12 | ( 13 | { 14 | "intermediate_steps": [ 15 | ( 16 | AgentAction(tool="action_1", tool_input={}, log=""), 17 | "observation1", 18 | ), 19 | ( 20 | AgentAction(tool="action_2", tool_input={}, log=""), 21 | "observation1", 22 | ), 23 | ], 24 | "state": "final_state", 25 | }, 26 | { 27 | "expected_steps": ["action_1", "action_2"], 28 | "state": "final_state", 29 | }, 30 | { 31 | "Intermediate steps correctness": True, 32 | "# steps / # expected steps": 1, 33 | "Correct Final State": 1, 34 | }, 35 | ), 36 | ( 37 | { 38 | "intermediate_steps": [ 39 | ( 40 | AgentAction(tool="action_1", tool_input={}, log=""), 41 | "observation1", 42 | ), 43 | ( 44 | AgentAction(tool="action_2", tool_input={}, log=""), 45 | "observation1", 46 | ), 47 | ], 48 | "state": "final_state", 49 | }, 50 | { 51 | "expected_steps": ["cat", "was", "here"], 52 | "state": "another_state", 53 | }, 54 | { 55 | "Intermediate steps correctness": False, 56 | "# steps / # expected steps": 2 / 3, 57 | "Correct Final State": 0, 58 | }, 59 | ), 60 | ( 61 | { 62 | "intermediate_steps": [ 63 | ( 64 | AgentAction(tool="action_2", tool_input={}, log=""), 65 | "observation1", 66 | ), 67 | ( 68 | AgentAction(tool="action_1", tool_input={}, log=""), 69 | "observation1", 70 | ), 71 | ], 72 | "state": "final_state", 73 | }, 74 | 
{ 75 | "expected_steps": ["action_1", "action_2"], 76 | "order_matters": False, 77 | "state": "different_state", 78 | }, 79 | { 80 | "Intermediate steps correctness": True, 81 | "# steps / # expected steps": 1.0, 82 | "Correct Final State": 0, 83 | }, 84 | ), 85 | # Without state 86 | ( 87 | { 88 | "intermediate_steps": [ 89 | ( 90 | AgentAction(tool="action_2", tool_input={}, log=""), 91 | "observation1", 92 | ), 93 | ( 94 | AgentAction(tool="action_1", tool_input={}, log=""), 95 | "observation1", 96 | ), 97 | ], 98 | }, 99 | { 100 | "expected_steps": ["action_1", "action_2"], 101 | "order_matters": False, 102 | }, 103 | { 104 | "Intermediate steps correctness": True, 105 | "# steps / # expected steps": 1.0, 106 | }, 107 | ), 108 | # Using actual steps 109 | # With order not mattering 110 | ( 111 | { 112 | "actual_steps": ["action_2", "action_1"], 113 | }, 114 | { 115 | "expected_steps": ["action_1", "action_2"], 116 | "order_matters": False, 117 | }, 118 | { 119 | "Intermediate steps correctness": True, 120 | "# steps / # expected steps": 1.0, 121 | }, 122 | ), 123 | # Using actual steps 124 | # With order mattering 125 | ( 126 | { 127 | "actual_steps": ["action_2", "action_1"], 128 | }, 129 | { 130 | "expected_steps": ["action_1", "action_2"], 131 | "order_matters": True, 132 | }, 133 | { 134 | "Intermediate steps correctness": False, 135 | "# steps / # expected steps": 1.0, 136 | }, 137 | ), 138 | ], 139 | ) 140 | def test_compare_outputs(run_outputs, example_outputs, expected_results): 141 | """Test compare outputs.""" 142 | evaluation_results = compare_outputs(run_outputs, example_outputs, run_inputs={}) 143 | assert { 144 | result.key: result.score for result in evaluation_results["results"] 145 | } == expected_results 146 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_multiverse_math.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.tool_usage.tasks.multiverse_math import ( 2 | add, 3 | get_environment, 4 | multiply, 5 | ) 6 | 7 | 8 | def test_get_environment() -> None: 9 | """Test the multiverse math task.""" 10 | # Create the environment 11 | env = get_environment() 12 | 13 | # Get the tools 14 | tools = env.tools 15 | 16 | assert len(tools) == 10 17 | 18 | # Get the state reader 19 | read_state = env.read_state 20 | assert read_state is None 21 | 22 | 23 | def test_operations() -> None: 24 | """Test some operations.""" 25 | # Confirm that operations are not distributive 26 | assert multiply(add(1, 2), 7) == 32.34 27 | assert add(multiply(1, 7), multiply(2, 7)) == 24.3 28 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_public_api.py: -------------------------------------------------------------------------------- 1 | from langchain_benchmarks.tool_usage import __all__ 2 | 3 | 4 | def test_public_api() -> None: 5 | """Test that the public API is correct.""" 6 | # This test will also fail if __all__ is not sorted. 7 | # Please keep it sorted! 
8 | assert __all__ == sorted( 9 | [ 10 | "apply_agent_executor_adapter", 11 | "get_eval_config", 12 | "CustomRunnableAgentFactory", 13 | "StandardAgentFactory", 14 | ], 15 | key=str.lower, 16 | ) 17 | -------------------------------------------------------------------------------- /tests/unit_tests/tool_usage/test_tool_usage.py: -------------------------------------------------------------------------------- 1 | def test_import_tool_usage() -> None: 2 | """Test that tool_usage can be imported""" 3 | --------------------------------------------------------------------------------
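
The token bucket rate limiter defined in `langchain_benchmarks/rate_limiting.py` and exercised in `tests/unit_tests/test_rate_limiting.py` is easiest to see in a short usage sketch. The stand-in runnable below is a hypothetical placeholder for a real chat model or chain and is not part of the repository.

```python
from langchain_core.runnables import RunnableLambda

from langchain_benchmarks.rate_limiting import RateLimiter, with_rate_limit

# Refill the bucket at two tokens per second and hold at most one token,
# so roughly two requests per second are allowed with no bursting.
rate_limiter = RateLimiter(requests_per_second=2, max_bucket_size=1)

# Trivial stand-in runnable; in practice this would be a chat model or chain.
fake_model = RunnableLambda(lambda prompt: f"echo: {prompt}")

# with_rate_limit prepends a blocking "Wait" step in front of the runnable.
throttled = with_rate_limit(fake_model, rate_limiter)

for query in ["first", "second", "third"]:
    print(throttled.invoke(query))  # each call blocks until a token is available
```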