├── .github ├── actions │ └── poetry_setup │ │ └── action.yml └── workflows │ ├── _lint.yml │ ├── _test.yml │ ├── ci.yml │ ├── fe_ci.yml │ └── fe_lint_format.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── backend ├── Dockerfile ├── Makefile ├── README.md ├── db │ ├── __init__.py │ └── models.py ├── extraction │ ├── __init__.py │ ├── parsing.py │ └── utils.py ├── poetry.lock ├── pyproject.toml ├── scripts │ ├── __init__.py │ ├── local_entry_point.sh │ ├── prod_entry_point.sh │ └── run_migrations.py ├── server │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── api_key.py │ │ ├── configurables.py │ │ ├── examples.py │ │ ├── extract.py │ │ ├── extractors.py │ │ ├── shared.py │ │ └── suggest.py │ ├── extraction_runnable.py │ ├── main.py │ ├── models.py │ ├── retrieval.py │ ├── settings.py │ └── validators.py └── tests │ ├── __init__.py │ ├── db.py │ ├── integration_tests │ ├── __init__.py │ └── test_extraction.py │ └── unit_tests │ ├── __init__.py │ ├── api │ ├── __init__.py │ ├── test_api_configuration.py │ ├── test_api_defining_extractors.py │ ├── test_api_examples.py │ └── test_api_extract.py │ ├── conftest.py │ ├── fake │ ├── __init__.py │ ├── chat_model.py │ └── test_fake_chat_model.py │ ├── fixtures │ ├── __init__.py │ ├── sample.docx │ ├── sample.epub │ ├── sample.html │ ├── sample.odt │ ├── sample.pdf │ ├── sample.rtf │ └── sample.txt │ ├── test_deduplication.py │ ├── test_parsing.py │ ├── test_upload.py │ ├── test_utils.py │ ├── test_validators.py │ └── utils.py ├── docker-compose.yml ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── notebooks │ ├── earnings_call_example.ipynb │ └── quick_start.ipynb │ └── toc.segment └── frontend ├── .env.example ├── .eslintrc.json ├── .gitignore ├── .prettierrc ├── .yarnrc.yml ├── Dockerfile ├── app ├── components │ ├── CreateExtractor.tsx │ ├── Extractor.tsx │ ├── Playground.tsx │ ├── ResultsTable.tsx │ ├── ShareModal.tsx │ └── Sidebar.tsx ├── e │ └── [extractorId] │ │ └── page.tsx ├── globals.css ├── layout.tsx ├── new │ └── page.tsx ├── page.tsx ├── providers.tsx ├── s │ └── [sharedExtractorId] │ │ └── page.tsx └── utils │ ├── api.tsx │ └── api_url.ts ├── next.config.js ├── package.json ├── postcss.config.js ├── public ├── favicon.ico └── images │ └── github-mark.svg ├── tailwind.config.ts ├── tsconfig.json └── yarn.lock /.github/actions/poetry_setup/action.yml: -------------------------------------------------------------------------------- 1 | # An action for setting up poetry install with caching. 2 | # Using a custom action since the default action does not 3 | # take poetry install groups into account. 4 | # Action code from: 5 | # https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 6 | name: poetry-install-with-caching 7 | description: Poetry install with support for caching of dependency groups. 
8 | 9 | inputs: 10 | python-version: 11 | description: Python version, supporting MAJOR.MINOR only 12 | required: true 13 | 14 | poetry-version: 15 | description: Poetry version 16 | required: true 17 | 18 | cache-key: 19 | description: Cache key to use for manual handling of caching 20 | required: true 21 | 22 | working-directory: 23 | description: Directory whose poetry.lock file should be cached 24 | required: true 25 | 26 | runs: 27 | using: composite 28 | steps: 29 | - uses: actions/setup-python@v4 30 | name: Setup python ${{ inputs.python-version }} 31 | with: 32 | python-version: ${{ inputs.python-version }} 33 | 34 | - uses: actions/cache@v3 35 | id: cache-bin-poetry 36 | name: Cache Poetry binary - Python ${{ inputs.python-version }} 37 | env: 38 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "1" 39 | with: 40 | path: | 41 | /opt/pipx/venvs/poetry 42 | # This step caches the poetry installation, so make sure it's keyed on the poetry version as well. 43 | key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }} 44 | 45 | - name: Refresh shell hashtable and fixup softlinks 46 | if: steps.cache-bin-poetry.outputs.cache-hit == 'true' 47 | shell: bash 48 | env: 49 | POETRY_VERSION: ${{ inputs.poetry-version }} 50 | PYTHON_VERSION: ${{ inputs.python-version }} 51 | run: | 52 | set -eux 53 | 54 | # Refresh the shell hashtable, to ensure correct `which` output. 55 | hash -r 56 | 57 | # `actions/cache@v3` doesn't always seem able to correctly unpack softlinks. 58 | # Delete and recreate the softlinks pipx expects to have. 59 | rm /opt/pipx/venvs/poetry/bin/python 60 | cd /opt/pipx/venvs/poetry/bin 61 | ln -s "$(which "python$PYTHON_VERSION")" python 62 | chmod +x python 63 | cd /opt/pipx_bin/ 64 | ln -s /opt/pipx/venvs/poetry/bin/poetry poetry 65 | chmod +x poetry 66 | 67 | # Ensure everything got set up correctly. 68 | /opt/pipx/venvs/poetry/bin/python --version 69 | /opt/pipx_bin/poetry --version 70 | 71 | - name: Install poetry 72 | if: steps.cache-bin-poetry.outputs.cache-hit != 'true' 73 | shell: bash 74 | env: 75 | POETRY_VERSION: ${{ inputs.poetry-version }} 76 | PYTHON_VERSION: ${{ inputs.python-version }} 77 | run: pipx install "poetry==$POETRY_VERSION" --python "python$PYTHON_VERSION" --verbose 78 | 79 | - name: Restore pip and poetry cached dependencies 80 | uses: actions/cache@v3 81 | env: 82 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "4" 83 | WORKDIR: ${{ inputs.working-directory == '' && '.' || inputs.working-directory }} 84 | with: 85 | path: | 86 | ~/.cache/pip 87 | ~/.cache/pypoetry/virtualenvs 88 | ~/.cache/pypoetry/cache 89 | ~/.cache/pypoetry/artifacts 90 | ${{ env.WORKDIR }}/.venv 91 | key: py-deps-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ inputs.cache-key }}-${{ hashFiles(format('{0}/**/poetry.lock', env.WORKDIR)) }} 92 | -------------------------------------------------------------------------------- /.github/workflows/_lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '!frontend/**' 7 | push: 8 | paths: 9 | - '!frontend/**' 10 | workflow_call: 11 | inputs: 12 | working-directory: 13 | required: true 14 | type: string 15 | description: "From which folder this pipeline executes" 16 | 17 | env: 18 | POETRY_VERSION: "1.7.1" 19 | WORKDIR: ${{ inputs.working-directory == '' && '.' 
|| inputs.working-directory }} 20 | 21 | jobs: 22 | build: 23 | runs-on: ubuntu-latest 24 | env: 25 | # This number is set "by eye": we want it to be big enough 26 | # so that it's bigger than the number of commits in any reasonable PR, 27 | # and also as small as possible since increasing the number makes 28 | # the initial `git fetch` slower. 29 | FETCH_DEPTH: 50 30 | strategy: 31 | matrix: 32 | # Only lint on the min and max supported Python versions. 33 | # It's extremely unlikely that there's a lint issue on any version in between 34 | # that doesn't show up on the min or max versions. 35 | # 36 | # GitHub rate-limits how many jobs can be running at any one time. 37 | # Starting new jobs is also relatively slow, 38 | # so linting on fewer versions makes CI faster. 39 | python-version: 40 | - "3.8" 41 | - "3.11" 42 | steps: 43 | - uses: actions/checkout@v3 44 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 45 | uses: "./.github/actions/poetry_setup" 46 | with: 47 | python-version: ${{ matrix.python-version }} 48 | poetry-version: ${{ env.POETRY_VERSION }} 49 | working-directory: ${{ inputs.working-directory }} 50 | cache-key: lint-with-extras 51 | 52 | - name: Check Poetry File 53 | shell: bash 54 | working-directory: ${{ inputs.working-directory }} 55 | run: | 56 | poetry check 57 | 58 | - name: Check lock file 59 | shell: bash 60 | working-directory: ${{ inputs.working-directory }} 61 | run: | 62 | poetry lock --check 63 | 64 | - name: Install dependencies 65 | # Also installs dev/lint/test/typing dependencies, to ensure we have 66 | # type hints for as many of our libraries as possible. 67 | # This helps catch errors that require dependencies to be spotted, for example: 68 | # https://github.com/langchain-ai/langchain/pull/10249/files#diff-935185cd488d015f026dcd9e19616ff62863e8cde8c0bee70318d3ccbca98341 69 | # 70 | # If you change this configuration, make sure to change the `cache-key` 71 | # in the `poetry_setup` action above to stop using the old cache. 72 | # It doesn't matter how you change it, any change will cause a cache-bust. 
73 | working-directory: ${{ inputs.working-directory }} 74 | run: | 75 | poetry install --with dev,lint,test,typing 76 | 77 | - name: Get .mypy_cache to speed up mypy 78 | uses: actions/cache@v3 79 | env: 80 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "2" 81 | with: 82 | path: | 83 | ${{ env.WORKDIR }}/.mypy_cache 84 | key: mypy-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }}-${{ inputs.working-directory }}-${{ hashFiles(format('{0}/poetry.lock', env.WORKDIR)) }} 85 | 86 | - name: Analysing the code with our lint 87 | working-directory: ${{ inputs.working-directory }} 88 | run: | 89 | make lint 90 | -------------------------------------------------------------------------------- /.github/workflows/_test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '!frontend/**' 7 | push: 8 | paths: 9 | - '!frontend/**' 10 | workflow_call: 11 | inputs: 12 | working-directory: 13 | required: true 14 | type: string 15 | description: "From which folder this pipeline executes" 16 | 17 | env: 18 | POETRY_VERSION: "1.7.1" 19 | 20 | jobs: 21 | build: 22 | defaults: 23 | run: 24 | working-directory: ${{ inputs.working-directory }} 25 | runs-on: ubuntu-latest 26 | strategy: 27 | matrix: 28 | python-version: 29 | - "3.8" 30 | - "3.9" 31 | - "3.10" 32 | - "3.11" 33 | name: Python ${{ matrix.python-version }} 34 | steps: 35 | - uses: actions/checkout@v3 36 | 37 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 38 | uses: "./.github/actions/poetry_setup" 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | poetry-version: ${{ env.POETRY_VERSION }} 42 | working-directory: ${{ inputs.working-directory }} 43 | cache-key: core 44 | 45 | - name: Install dependencies 46 | shell: bash 47 | run: poetry install 48 | 49 | - name: Run core tests 50 | shell: bash 51 | run: make test 52 | 53 | - name: Ensure the tests did not create any additional files 54 | shell: bash 55 | run: | 56 | set -eu 57 | 58 | STATUS="$(git status)" 59 | echo "$STATUS" 60 | 61 | # grep will exit non-zero if the target message isn't found, 62 | # and `set -e` above will cause the step to fail. 63 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 64 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Run CI Tests 3 | 4 | on: 5 | push: 6 | branches: [ main ] 7 | paths: 8 | - 'backend/**' 9 | pull_request: 10 | paths: 11 | - 'backend/**' 12 | workflow_dispatch: # Allows to trigger the workflow manually in GitHub UI 13 | 14 | # If another push to the same PR or branch happens while this workflow is still running, 15 | # cancel the earlier run in favor of the next run. 16 | # 17 | # There's no point in testing an outdated version of the code. GitHub only allows 18 | # a limited number of job runners to be active at the same time, so it's better to cancel 19 | # pointless jobs early so that more useful jobs can run sooner. 
20 | concurrency: 21 | group: ${{ github.workflow }}-${{ github.ref }} 22 | cancel-in-progress: true 23 | 24 | env: 25 | POETRY_VERSION: "1.7.1" 26 | WORKDIR: "./backend" 27 | 28 | jobs: 29 | lint: 30 | uses: 31 | ./.github/workflows/_lint.yml 32 | with: 33 | working-directory: ./backend 34 | secrets: inherit 35 | test: 36 | timeout-minutes: 5 37 | runs-on: ubuntu-latest 38 | defaults: 39 | run: 40 | working-directory: ${{ env.WORKDIR }} 41 | services: 42 | postgres: 43 | # ensure postgres version this stays in sync with prod database 44 | # and with postgres version used in docker compose 45 | image: postgres:16 46 | env: 47 | # optional (defaults to `postgres`) 48 | POSTGRES_DB: langchain_test 49 | # required 50 | POSTGRES_PASSWORD: langchain 51 | # optional (defaults to `5432`) 52 | POSTGRES_PORT: 5432 53 | # optional (defaults to `postgres`) 54 | POSTGRES_USER: langchain 55 | ports: 56 | # maps tcp port 5432 on service container to the host 57 | - 5432:5432 58 | # set health checks to wait until postgres has started 59 | options: >- 60 | --health-cmd pg_isready 61 | --health-interval 3s 62 | --health-timeout 5s 63 | --health-retries 10 64 | strategy: 65 | matrix: 66 | python-version: 67 | - "3.8" 68 | - "3.9" 69 | - "3.10" 70 | - "3.11" 71 | name: Python ${{ matrix.python-version }} tests 72 | steps: 73 | - uses: actions/checkout@v3 74 | 75 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 76 | uses: "./.github/actions/poetry_setup" 77 | with: 78 | python-version: ${{ matrix.python-version }} 79 | poetry-version: ${{ env.POETRY_VERSION }} 80 | working-directory: ${{ env.WORKDIR }} 81 | cache-key: langchain-extract-all 82 | - name: Test database connection 83 | run: | 84 | # Set up postgresql-client 85 | sudo apt-get install -y postgresql-client 86 | # Test psql connection 87 | psql -h localhost -p 5432 -U langchain -d langchain_test -c "SELECT 1;" 88 | env: 89 | # postgress password is required; alternatively, you can run: 90 | # `PGPASSWORD=postgres_password psql ...` 91 | PGPASSWORD: langchain 92 | 93 | - name: Install dependencies 94 | shell: bash 95 | run: | 96 | echo "Running tests, installing dependencies with poetry..." 97 | poetry install --with test,lint,typing,docs 98 | 99 | - name: Run tests 100 | run: make test 101 | 102 | - name: Ensure the tests did not create any additional files 103 | shell: bash 104 | run: | 105 | set -eu 106 | 107 | STATUS="$(git status)" 108 | echo "$STATUS" 109 | 110 | # grep will exit non-zero if the target message isn't found, 111 | # and `set -e` above will cause the step to fail. 112 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 113 | -------------------------------------------------------------------------------- /.github/workflows/fe_ci.yml: -------------------------------------------------------------------------------- 1 | # Run formatting on all PRs 2 | 3 | name: (FE) CI 4 | 5 | on: 6 | push: 7 | branches: ["main"] 8 | paths: 9 | - 'frontend/**' 10 | pull_request: 11 | paths: 12 | - 'frontend/**' 13 | workflow_dispatch: # Allows triggering the workflow manually in GitHub UI 14 | 15 | 16 | # If another push to the same PR or branch happens while this workflow is still running, 17 | # cancel the earlier run in favor of the next run. 18 | # 19 | # There's no point in testing an outdated version of the code. GitHub only allows 20 | # a limited number of job runners to be active at the same time, so it's better to cancel 21 | # pointless jobs early so that more useful jobs can run sooner. 
22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | build: 28 | name: Build frontend 29 | runs-on: ubuntu-latest 30 | env: 31 | NEXT_PUBLIC_BASE_API_URL: http://localhost:8000 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Use Node.js 18.x 35 | uses: actions/setup-node@v3 36 | with: 37 | node-version: 18.x 38 | cache: "yarn" 39 | cache-dependency-path: ./frontend/yarn.lock 40 | - name: Install dependencies 41 | run: yarn install --immutable --mode=skip-build 42 | working-directory: ./frontend 43 | - name: Build frontend 44 | run: yarn build 45 | working-directory: ./frontend 46 | -------------------------------------------------------------------------------- /.github/workflows/fe_lint_format.yml: -------------------------------------------------------------------------------- 1 | # Run formatting on all PRs 2 | 3 | name: (FE) Lint & Format 4 | 5 | on: 6 | push: 7 | branches: ["main"] 8 | paths: 9 | - 'frontend/**' 10 | pull_request: 11 | paths: 12 | - 'frontend/**' 13 | workflow_dispatch: # Allows triggering the workflow manually in GitHub UI 14 | 15 | 16 | # If another push to the same PR or branch happens while this workflow is still running, 17 | # cancel the earlier run in favor of the next run. 18 | # 19 | # There's no point in testing an outdated version of the code. GitHub only allows 20 | # a limited number of job runners to be active at the same time, so it's better to cancel 21 | # pointless jobs early so that more useful jobs can run sooner. 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | format: 28 | name: Check formatting 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: Use Node.js 18.x 33 | uses: actions/setup-node@v3 34 | with: 35 | node-version: 18.x 36 | cache: "yarn" 37 | cache-dependency-path: ./frontend/yarn.lock 38 | - name: Install dependencies 39 | run: yarn install --immutable --mode=skip-build 40 | working-directory: ./frontend 41 | - name: Check formatting 42 | run: yarn format:check 43 | working-directory: ./frontend 44 | 45 | lint: 46 | name: Check linting 47 | runs-on: ubuntu-latest 48 | steps: 49 | - uses: actions/checkout@v4 50 | - name: Use Node.js 18.x 51 | uses: actions/setup-node@v3 52 | with: 53 | node-version: 18.x 54 | cache: "yarn" 55 | cache-dependency-path: ./frontend/yarn.lock 56 | - name: Install dependencies 57 | run: yarn install --immutable --mode=skip-build 58 | working-directory: ./frontend 59 | - name: Check linting 60 | run: yarn lint 61 | working-directory: ./frontend 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ 162 | .DS_Store 163 | 164 | # Local env file 165 | .local.env 166 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # All directory paths for COPY commands are relative to the build context 2 | 3 | # Ensure this python version stays in sync with CI 4 | FROM python:3.11-slim as base 5 | WORKDIR /backend 6 | 7 | # set environment variables 8 | ENV PYTHONDONTWRITEBYTECODE 1 9 | ENV PYTHONUNBUFFERED 1 10 | ENV POETRY_HOME="/opt/poetry" 11 | ENV MYPYPATH="/app/src/stubs" 12 | 13 | # Use bash as the shell for the build 14 | # https://github.com/docker/for-linux/issues/408#issuecomment-414748815 15 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 16 | 17 | RUN set -eux && \ 18 | apt-get update && \ 19 | apt-get install -y \ 20 | build-essential \ 21 | curl \ 22 | libpq-dev \ 23 | python3-dev \ 24 | libmagic1 25 | 26 | # https://python-poetry.org/docs 27 | RUN pip install poetry 28 | 29 | # install deps before copying project files so the cache is only invalidated 30 | # when the deps change 31 | COPY ./backend/pyproject.toml ./backend/poetry.lock ./ 32 | RUN poetry config virtualenvs.create false 33 | RUN poetry install --no-root --only main 34 | 35 | COPY ./backend . 36 | 37 | EXPOSE 8080 38 | 39 | ### 40 | # development image 41 | ### 42 | FROM base as development 43 | 44 | ENTRYPOINT ["bash", "./scripts/prod_entry_point.sh"] 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024-Present Langchain AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 🚧 Under Active Development 🚧 2 | 3 | This repo is under active development. Do not use code from `main`. Instead, please check out code from [releases](https://github.com/langchain-ai/langchain-extract/releases). 4 | 5 | This repository is not a library, but a jumping-off point for your own application -- so do not be surprised to find breaking changes between releases! 6 | 7 | Check out the demo service deployed at [extract.langchain.com/](https://extract.langchain.com/). 
8 | 9 | # 🦜⛏️ LangChain Extract 10 | 11 | https://github.com/langchain-ai/langchain-extract/assets/26529506/6657280e-d05f-4c0f-9c47-07a0ef7c559d 12 | 13 | [![CI](https://github.com/langchain-ai/langchain-extract/actions/workflows/ci.yml/badge.svg)](https://github.com/langchain-ai/langchain-extract/actions/workflows/ci.yml) 14 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 15 | [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/langchainai.svg?style=social&label=Follow%20%40LangChainAI)](https://twitter.com/langchainai) 16 | [![](https://dcbadge.vercel.app/api/server/6adMQxSpJS?compact=true&style=flat)](https://discord.gg/6adMQxSpJS) 17 | [![Open Issues](https://img.shields.io/github/issues-raw/langchain-ai/langchain-extract)](https://github.com/langchain-ai/langchain-extract/issues) 18 | 19 | `langchain-extract` is a simple web server that allows you to extract information from text and files using LLMs. It is built using [FastAPI](https://fastapi.tiangolo.com/), [LangChain](https://python.langchain.com/) and [PostgreSQL](https://www.postgresql.org/). 20 | 21 | The backend closely follows the [extraction use-case documentation](https://python.langchain.com/docs/use_cases/extraction) and provides 22 | a reference implementation of an app that helps you do extraction over data using LLMs. 23 | 24 | This repository is meant to be a starting point for building your own extraction application, which 25 | may have slightly different requirements or use cases. 26 | 27 | ## Functionality 28 | 29 | - 🚀 FastAPI webserver with a REST API 30 | - 📚 OpenAPI Documentation 31 | - 📝 Use [JSON Schema](https://json-schema.org/) to define what to extract 32 | - 📊 Use examples to improve the quality of extracted results 33 | - 📦 Create and save extractors and examples in a database 34 | - 📂 Extract information from text and/or binary files 35 | - 🦜️🏓 [LangServe](https://github.com/langchain-ai/langserve) endpoint to integrate with LangChain `RemoteRunnable` 36 | 37 | ## Releases 38 | 39 | - 0.0.1: https://github.com/langchain-ai/langchain-extract/releases/tag/0.0.1 40 | - 0.0.2: https://github.com/langchain-ai/langchain-extract/releases/tag/0.0.2 41 | 42 | ## 📚 Documentation 43 | 44 | See the example notebooks in the [documentation](https://github.com/langchain-ai/langchain-extract/tree/main/docs/source/notebooks) 45 | to learn how to create examples to improve extraction results, upload files (e.g., HTML, PDF) and more. 46 | 47 | Documentation and server code are both under development! 48 | 49 | ## 🍯 Example API 50 | 51 | Below are some sample `curl` requests to demonstrate how to use the API. 52 | 53 | These only provide minimal examples of how to use the API; 54 | see the [documentation](https://github.com/langchain-ai/langchain-extract/tree/main/docs/source/notebooks) for more information 55 | about the API and the [extraction use-case documentation](https://python.langchain.com/docs/use_cases/extraction) for details on how to extract 56 | information using LangChain. 57 | 58 | First we generate a user ID for ourselves. **The application does not properly manage users or include legitimate authentication**. Access to extractors, few-shot examples, and other artifacts is controlled via this ID. Consider it secret. 
59 | 60 | ```sh 61 | USER_ID=$(uuidgen) 62 | export USER_ID 63 | ``` 64 | 65 | ### Create an extractor 66 | 67 | ```sh 68 | curl -X 'POST' \ 69 | 'http://localhost:8000/extractors' \ 70 | -H 'accept: application/json' \ 71 | -H 'Content-Type: application/json' \ 72 | -H "x-key: ${USER_ID}" \ 73 | -d '{ 74 | "name": "Personal Information", 75 | "description": "Use to extract personal information", 76 | "schema": { 77 | "type": "object", 78 | "title": "Person", 79 | "required": [ 80 | "name", 81 | "age" 82 | ], 83 | "properties": { 84 | "age": { 85 | "type": "integer", 86 | "title": "Age" 87 | }, 88 | "name": { 89 | "type": "string", 90 | "title": "Name" 91 | } 92 | } 93 | }, 94 | "instruction": "Use information about the person from the given user input." 95 | }' 96 | ``` 97 | 98 | Response: 99 | 100 | ```json 101 | { 102 | "uuid": "e07f389f-3577-4e94-bd88-6b201d1b10b9" 103 | } 104 | ``` 105 | 106 | Use the extract endpoint to extract information from the text (or a file) 107 | using an existing pre-defined extractor. 108 | 109 | ```sh 110 | curl -s -X 'POST' \ 111 | 'http://localhost:8000/extract' \ 112 | -H 'accept: application/json' \ 113 | -H 'Content-Type: multipart/form-data' \ 114 | -H "x-key: ${USER_ID}" \ 115 | -F 'extractor_id=e07f389f-3577-4e94-bd88-6b201d1b10b9' \ 116 | -F 'text=my name is chester and i am 20 years old. My name is eugene and I am 1 year older than chester.' \ 117 | -F 'mode=entire_document' \ 118 | -F 'file=' | jq . 119 | ``` 120 | 121 | Response: 122 | 123 | ```json 124 | { 125 | "data": [ 126 | { 127 | "name": "chester", 128 | "age": 20 129 | }, 130 | { 131 | "name": "eugene", 132 | "age": 21 133 | } 134 | ] 135 | } 136 | ``` 137 | 138 | Add a few shot example: 139 | 140 | ```sh 141 | curl -X POST "http://localhost:8000/examples" \ 142 | -H "Content-Type: application/json" \ 143 | -H "x-key: ${USER_ID}" \ 144 | -d '{ 145 | "extractor_id": "e07f389f-3577-4e94-bd88-6b201d1b10b9", 146 | "content": "marcos is 10.", 147 | "output": [ 148 | { 149 | "name": "MARCOS", 150 | "age": 10 151 | } 152 | ] 153 | }' | jq . 154 | ``` 155 | 156 | The response will contain a UUID for the example. Examples can be deleted with a DELETE request. This example is now persisted and associated with our extractor, and subsequent extraction runs will incorporate it. 157 | 158 | ## ✅ Running locally 159 | 160 | The easiest way to get started is to use `docker-compose` to run the server. 161 | 162 | **Configure the environment** 163 | 164 | Add `.local.env` file to the root directory with the following content: 165 | 166 | ```sh 167 | OPENAI_API_KEY=... # Your OpenAI API key 168 | ``` 169 | 170 | Adding `FIREWORKS_API_KEY` or `TOGETHER_API_KEY` to this file would enable additional models. You can access available models for the server and other information via a `GET` request to the `configuration` endpoint. 171 | 172 | Build the images: 173 | 174 | ```sh 175 | docker compose build 176 | ``` 177 | 178 | Run the services: 179 | 180 | ```sh 181 | docker compose up 182 | ``` 183 | 184 | This will launch both the extraction server and the postgres instance. 185 | 186 | Verify that the server is running: 187 | 188 | ```sh 189 | curl -X 'GET' 'http://localhost:8000/ready' 190 | ``` 191 | 192 | This should return `ok`. 193 | 194 | The UI will be available at [http://localhost:3000](http://localhost:3000). 195 | 196 | ## Contributions 197 | 198 | Feel free to develop in this project for your own needs! 
199 | For now, we are not accepting pull requests, but would love to hear [questions, ideas or issues](https://github.com/langchain-ai/langchain-extract/discussions). 200 | 201 | ## Development 202 | 203 | To set up for development, you will need to install [Poetry](https://python-poetry.org/). 204 | 205 | The backend code is located in the `backend` directory. 206 | 207 | ```sh 208 | cd backend 209 | ``` 210 | 211 | Set up the environment using poetry: 212 | 213 | ```sh 214 | poetry install --with lint,dev,test 215 | ``` 216 | 217 | Run the following script to create a database and schema: 218 | 219 | ```sh 220 | python -m scripts.run_migrations create 221 | ``` 222 | 223 | From `/backend`: 224 | 225 | ```sh 226 | OPENAI_API_KEY=[YOUR API KEY] python -m server.main 227 | ``` 228 | 229 | ### Testing 230 | 231 | Create a test database. The test database is used for running tests and is 232 | separate from the main database. It will have the same schema as the main 233 | database. 234 | 235 | ```sh 236 | python -m scripts.run_migrations create-test-db 237 | ``` 238 | 239 | Run the tests 240 | 241 | ```sh 242 | make test 243 | ``` 244 | 245 | ### Linting and format 246 | 247 | Testing and formatting is done using a Makefile inside `[root]/backend` 248 | 249 | ```sh 250 | make format 251 | ``` 252 | -------------------------------------------------------------------------------- /backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # All directory paths for COPY commands are relative to the build context 2 | 3 | # Ensure this python version stays in sync with CI 4 | FROM python:3.11-slim as base 5 | WORKDIR /backend 6 | 7 | # set environment variables 8 | ENV PYTHONDONTWRITEBYTECODE 1 9 | ENV PYTHONUNBUFFERED 1 10 | ENV POETRY_HOME="/opt/poetry" 11 | ENV MYPYPATH="/app/src/stubs" 12 | 13 | # Use bash as the shell for the build 14 | # https://github.com/docker/for-linux/issues/408#issuecomment-414748815 15 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 16 | 17 | RUN set -eux && \ 18 | apt-get update && \ 19 | apt-get install -y \ 20 | build-essential \ 21 | curl \ 22 | libpq-dev \ 23 | python3-dev \ 24 | libmagic1 25 | 26 | # https://python-poetry.org/docs 27 | RUN pip install poetry 28 | 29 | # install deps before copying project files so the cache is only invalidated 30 | # when the deps change 31 | COPY ./backend/pyproject.toml ./backend/poetry.lock . 32 | RUN poetry config virtualenvs.create false 33 | RUN poetry install --no-root --only main 34 | 35 | COPY ./backend . 36 | 37 | EXPOSE 8000 38 | 39 | ### 40 | # development image 41 | ### 42 | FROM base as development 43 | 44 | ENTRYPOINT ["bash", "./scripts/local_entry_point.sh"] 45 | -------------------------------------------------------------------------------- /backend/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all lint format test help 2 | 3 | # Default target executed when no arguments are given to make. 4 | all: help 5 | 6 | ###################### 7 | # TESTING AND COVERAGE 8 | ###################### 9 | 10 | # Define a variable for the test file path. 11 | TEST_FILE ?= tests/unit_tests/ 12 | 13 | test: 14 | poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE) 15 | 16 | test_watch: 17 | poetry run ptw . 
-- $(TEST_FILE) 18 | 19 | openapi: 20 | OPENAI_API_KEY=placeholder python -c "from server import main; import json; print(json.dumps(main.app.openapi()))" > openapi.json 21 | 22 | 23 | ###################### 24 | # LINTING AND FORMATTING 25 | ###################### 26 | 27 | # Define a variable for Python and notebook files. 28 | lint format: PYTHON_FILES=. 29 | lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=. --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$') 30 | 31 | lint lint_diff: 32 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff 33 | # [ "$(PYTHON_FILES)" = "" ] || poetry run mypy $(PYTHON_FILES) 34 | 35 | format format_diff: 36 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) 37 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --fix $(PYTHON_FILES) 38 | 39 | spell_check: 40 | poetry run codespell --toml pyproject.toml 41 | 42 | spell_fix: 43 | poetry run codespell --toml pyproject.toml -w 44 | 45 | ###################### 46 | # HELP 47 | ###################### 48 | 49 | help: 50 | @echo '====================' 51 | @echo '-- LINTING --' 52 | @echo 'format - run code formatters' 53 | @echo 'lint - run linters' 54 | @echo 'spell_check - run codespell on the project' 55 | @echo 'spell_fix - run codespell on the project and fix the errors' 56 | @echo '-- TESTS --' 57 | @echo 'coverage - run unit tests and generate coverage report' 58 | @echo 'test - run unit tests' 59 | @echo 'test TEST_FILE= - run all tests in file' 60 | @echo '-- DOCUMENTATION tasks are from the top-level Makefile --' 61 | -------------------------------------------------------------------------------- /backend/README.md: -------------------------------------------------------------------------------- 1 | See readme at repo root. 
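The `openapi` target in the Makefile above shells out to a Python one-liner to export the server's OpenAPI spec. A minimal standalone sketch of the same idea, assuming (as the one-liner does) that the app is importable as `server.main.app` from the `backend` directory and that a placeholder `OPENAI_API_KEY` is enough at import time:

```python
# Sketch: dump the FastAPI OpenAPI spec to openapi.json (run from backend/).
import json
import os

# The app reads OPENAI_API_KEY at import time; a placeholder is assumed to suffice here.
os.environ.setdefault("OPENAI_API_KEY", "placeholder")

from server import main  # imported after the environment variable is set

with open("openapi.json", "w") as f:
    json.dump(main.app.openapi(), f, indent=2)
```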
2 | -------------------------------------------------------------------------------- /backend/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/db/__init__.py -------------------------------------------------------------------------------- /backend/db/models.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime 3 | from typing import Generator 4 | 5 | from sqlalchemy import ( 6 | Column, 7 | DateTime, 8 | ForeignKey, 9 | String, 10 | Text, 11 | UniqueConstraint, 12 | create_engine, 13 | ) 14 | from sqlalchemy.dialects.postgresql import JSONB, UUID 15 | from sqlalchemy.ext.declarative import declarative_base 16 | from sqlalchemy.orm import Session, relationship, sessionmaker 17 | 18 | from server.settings import get_postgres_url 19 | 20 | ENGINE = create_engine(get_postgres_url()) 21 | SessionClass = sessionmaker(bind=ENGINE) 22 | 23 | Base = declarative_base() 24 | 25 | 26 | # TODO(Eugene): Convert to async code 27 | def get_session() -> Generator[Session, None, None]: 28 | """Create a new session.""" 29 | session = SessionClass() 30 | 31 | try: 32 | yield session 33 | except: 34 | session.rollback() 35 | raise 36 | finally: 37 | session.close() 38 | 39 | 40 | class TimestampedModel(Base): 41 | """An abstract base model that includes the timestamp fields.""" 42 | 43 | __abstract__ = True 44 | 45 | created_at = Column( 46 | DateTime, 47 | default=datetime.utcnow, 48 | comment="The time the record was created (UTC).", 49 | ) 50 | updated_at = Column( 51 | DateTime, 52 | default=datetime.utcnow, 53 | onupdate=datetime.utcnow, 54 | doc="The time the record was last updated (UTC).", 55 | ) 56 | 57 | # This is our own uuid assigned to the artifact. 58 | # By construction guaranteed to be unique no matter what. 59 | uuid = Column( 60 | UUID(as_uuid=True), 61 | primary_key=True, 62 | default=lambda: str(uuid.uuid4()), 63 | doc="Unique identifier for this model.", 64 | ) 65 | 66 | 67 | class Example(TimestampedModel): 68 | """A representation of an example. 69 | 70 | Examples consist of content together with the expected output. 71 | 72 | The output is a JSON object that is expected to be extracted from the content. 73 | 74 | The JSON object should be valid according to the schema of the associated extractor. 75 | 76 | The JSON object is defined by the schema of the associated extractor, so 77 | it's perfectly fine for a given example to represent the extraction 78 | of multiple instances of some object from the content since 79 | the JSON schema can represent a list of objects. 
80 | """ 81 | 82 | __tablename__ = "examples" 83 | 84 | content = Column( 85 | Text, 86 | nullable=False, 87 | comment="The input portion of the example.", 88 | ) 89 | output = Column( 90 | JSONB, 91 | comment="The output associated with the example.", 92 | ) 93 | extractor_id = Column( 94 | UUID(as_uuid=True), 95 | ForeignKey("extractors.uuid", ondelete="CASCADE"), 96 | nullable=False, 97 | comment="Foreign key referencing the associated extractor.", 98 | ) 99 | 100 | def __repr__(self) -> str: 101 | return f"" 102 | 103 | 104 | class SharedExtractors(TimestampedModel): 105 | """A table for managing sharing of extractors.""" 106 | 107 | __tablename__ = "shared_extractors" 108 | 109 | extractor_id = Column( 110 | UUID(as_uuid=True), 111 | ForeignKey("extractors.uuid", ondelete="CASCADE"), 112 | index=True, 113 | nullable=False, 114 | comment="The extractor that is being shared.", 115 | ) 116 | 117 | share_token = Column( 118 | UUID(as_uuid=True), 119 | index=True, 120 | nullable=False, 121 | unique=True, 122 | comment="The token that is used to access the shared extractor.", 123 | ) 124 | 125 | # Add unique constraint for (extractor_id, share_token) 126 | __table_args__ = ( 127 | UniqueConstraint("extractor_id", "share_token", name="unique_share_token"), 128 | ) 129 | 130 | def __repr__(self) -> str: 131 | """Return a string representation of the object.""" 132 | return f"" 133 | 134 | 135 | class Extractor(TimestampedModel): 136 | __tablename__ = "extractors" 137 | 138 | name = Column( 139 | String(100), 140 | nullable=False, 141 | server_default="", 142 | comment="The name of the extractor.", 143 | ) 144 | owner_id = Column( 145 | UUID(as_uuid=True), 146 | nullable=False, 147 | comment="Owner uuid.", 148 | ) 149 | schema = Column( 150 | JSONB, 151 | nullable=False, 152 | comment="JSON Schema that describes what content will be " 153 | "extracted from the document", 154 | ) 155 | description = Column( 156 | String(100), 157 | nullable=False, 158 | server_default="", 159 | comment="Surfaced via UI to the users.", 160 | ) 161 | instruction = Column( 162 | Text, nullable=False, comment="The prompt to the language model." 163 | ) # TODO: This will need to evolve 164 | 165 | examples = relationship("Example", backref="extractor") 166 | 167 | # Used for sharing the extractor with others. 
168 | share_uuid = Column( 169 | UUID(as_uuid=True), 170 | nullable=True, 171 | comment="The uuid of the shareable link.", 172 | ) 173 | 174 | def __repr__(self) -> str: 175 | return f"" 176 | 177 | 178 | def validate_extractor_owner( 179 | session: Session, extractor_id: UUID, user_id: UUID 180 | ) -> bool: 181 | """Return True if the extractor exists and is owned by the given user.""" 182 | extractor = ( 183 | session.query(Extractor).filter_by(uuid=extractor_id, owner_id=user_id).first() 184 | ) 185 | return extractor is not None 186 | -------------------------------------------------------------------------------- /backend/extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/extraction/__init__.py -------------------------------------------------------------------------------- /backend/extraction/parsing.py: -------------------------------------------------------------------------------- 1 | """Convert binary input to blobs and parse them using the appropriate parser.""" 2 | from __future__ import annotations 3 | 4 | from typing import BinaryIO, List 5 | 6 | from fastapi import HTTPException 7 | from langchain.document_loaders.parsers import BS4HTMLParser, PDFMinerParser 8 | from langchain.document_loaders.parsers.generic import MimeTypeBasedParser 9 | from langchain.document_loaders.parsers.txt import TextParser 10 | from langchain_community.document_loaders import Blob 11 | from langchain_core.documents import Document 12 | 13 | HANDLERS = { 14 | "application/pdf": PDFMinerParser(), 15 | "text/plain": TextParser(), 16 | "text/html": BS4HTMLParser(), 17 | # Disabled for now as they rely on unstructured and there's some install 18 | # issue with unstructured. 
19 | # from langchain.document_loaders.parsers.msword import MsWordParser 20 | # "application/msword": MsWordParser(), 21 | # "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ( 22 | # MsWordParser() 23 | # ), 24 | } 25 | 26 | SUPPORTED_MIMETYPES = sorted(HANDLERS.keys()) 27 | 28 | MAX_FILE_SIZE_MB = 10 # in MB 29 | 30 | 31 | def _guess_mimetype(file_bytes: bytes) -> str: 32 | """Guess the mime-type of a file.""" 33 | try: 34 | import magic 35 | except ImportError as e: 36 | raise ImportError( 37 | "magic package not found, please install it with `pip install python-magic`" 38 | ) from e 39 | 40 | mime = magic.Magic(mime=True) 41 | mime_type = mime.from_buffer(file_bytes) 42 | return mime_type 43 | 44 | 45 | def _get_file_size_in_mb(data: BinaryIO) -> float: 46 | """Get file size in MB.""" 47 | data.seek(0, 2) # Move the cursor to the end of the file 48 | file_size = data.tell() 49 | file_size_in_mb = file_size / (1024 * 1024) 50 | data.seek(0) 51 | return file_size_in_mb 52 | 53 | 54 | # PUBLIC API 55 | 56 | MIMETYPE_BASED_PARSER = MimeTypeBasedParser( 57 | handlers=HANDLERS, 58 | fallback_parser=None, 59 | ) 60 | 61 | 62 | def convert_binary_input_to_blob(data: BinaryIO) -> Blob: 63 | """Convert ingestion input to blob.""" 64 | file_size_in_mb = _get_file_size_in_mb(data) 65 | 66 | if file_size_in_mb > MAX_FILE_SIZE_MB: 67 | raise HTTPException( 68 | status_code=413, 69 | detail=f"File size exceeds the maximum limit of {MAX_FILE_SIZE_MB} MB.", 70 | ) 71 | 72 | file_data = data.read() 73 | mimetype = _guess_mimetype(file_data) 74 | file_name = data.name 75 | 76 | return Blob.from_data( 77 | data=file_data, 78 | path=file_name, 79 | mime_type=mimetype, 80 | ) 81 | 82 | 83 | def parse_binary_input(data: BinaryIO) -> List[Document]: 84 | """Parse binary input.""" 85 | blob = convert_binary_input_to_blob(data) 86 | return MIMETYPE_BASED_PARSER.parse(blob) 87 | -------------------------------------------------------------------------------- /backend/extraction/utils.py: -------------------------------------------------------------------------------- 1 | """Adapters to convert between different formats.""" 2 | from __future__ import annotations 3 | 4 | from langchain_core.utils.json_schema import dereference_refs 5 | 6 | 7 | def _rm_titles(kv: dict) -> dict: 8 | """Remove titles from a dictionary.""" 9 | new_kv = {} 10 | for k, v in kv.items(): 11 | if k == "title": 12 | continue 13 | elif isinstance(v, dict): 14 | new_kv[k] = _rm_titles(v) 15 | else: 16 | new_kv[k] = v 17 | return new_kv 18 | 19 | 20 | # PUBLIC API 21 | 22 | 23 | def update_json_schema( 24 | schema: dict, 25 | *, 26 | multi: bool = True, 27 | ) -> dict: 28 | """Add missing fields to JSON schema and add support for multiple records.""" 29 | if multi: 30 | # Wrap the schema in an object called "Root" with a property called: "data" 31 | # which will be a json array of the original schema. 32 | schema_ = { 33 | "type": "object", 34 | "properties": { 35 | "data": { 36 | "type": "array", 37 | "items": dereference_refs(schema), 38 | }, 39 | }, 40 | "required": ["data"], 41 | } 42 | else: 43 | raise NotImplementedError("Only multi is supported for now.") 44 | 45 | schema_["title"] = "extractor" 46 | schema_["description"] = "Extract information matching the given schema." 
47 | return schema_ 48 | -------------------------------------------------------------------------------- /backend/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langchain-extract" 3 | version = "0.0.1" 4 | description = "Sample extraction backend." 5 | authors = ["LangChain AI"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8.1" 11 | langchain = "~0.1" 12 | langsmith = ">=0.0.66" 13 | fastapi = "^0.109.2" 14 | langserve = "^0.0.45" 15 | uvicorn = "^0.27.1" 16 | pydantic = "^1.10" 17 | langchain-openai = "^0.1.3" 18 | jsonschema = "^4.21.1" 19 | sse-starlette = "^2.0.0" 20 | alembic = "^1.13.1" 21 | psycopg2 = "^2.9.9" 22 | python-magic = "^0.4.27" 23 | pdfminer-six = "^20231228" 24 | beautifulsoup4 = "^4.12.3" 25 | lxml = "^5.1.0" 26 | faiss-cpu = "^1.7.4" 27 | python-multipart = "^0.0.9" 28 | langchain-fireworks = "^0.1.1" 29 | langchain-anthropic = "^0.1.11" 30 | langchain-groq = "^0.1.3" 31 | 32 | [tool.poetry.group.dev.dependencies] 33 | jupyterlab = "^3.6.1" 34 | 35 | [tool.poetry.group.typing.dependencies] 36 | mypy = "^1.7.0" 37 | 38 | [tool.poetry.group.lint.dependencies] 39 | ruff = "^0.1.5" 40 | 41 | [tool.poetry.group.docs.dependencies] 42 | nbsphinx = ">=0.8.9" 43 | sphinx = ">=5.2.0" 44 | sphinx-autobuild = "^2021.3.14" 45 | sphinx_book_theme = "^1.0.0" 46 | myst-nb = { version = "^1.0.0", python = "^3.9" } 47 | toml = "^0.10.2" 48 | sphinx-copybutton = ">=0.5.1" 49 | 50 | 51 | [tool.poetry.group.test.dependencies] 52 | pytest = "^7.2.1" 53 | pytest-cov = "^4.0.0" 54 | pytest-asyncio = "^0.21.1" 55 | pytest-mock = "^3.11.1" 56 | pytest-socket = "^0.6.0" 57 | pytest-watch = "^4.2.0" 58 | pytest-timeout = "^2.2.0" 59 | 60 | 61 | [tool.ruff] 62 | select = [ 63 | "E", # pycodestyle 64 | "F", # pyflakes 65 | "I", # isort 66 | ] 67 | extend-include = ["*.ipynb"] 68 | 69 | # Same as Black. 70 | line-length = 88 71 | 72 | [tool.mypy] 73 | disallow_untyped_defs = "True" 74 | ignore_missing_imports = "True" 75 | 76 | [tool.coverage.run] 77 | omit = [ 78 | "tests/*", 79 | ] 80 | 81 | 82 | [build-system] 83 | requires = ["poetry-core"] 84 | build-backend = "poetry.core.masonry.api" 85 | 86 | [tool.pytest.ini_options] 87 | # --strict-markers will raise errors on unknown marks. 88 | # https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks 89 | # 90 | # https://docs.pytest.org/en/7.1.x/reference/reference.html 91 | # --strict-config: any warnings encountered while parsing the `pytest` 92 | # section of the configuration file raise errors. 93 | addopts = "--strict-markers --strict-config --durations=5 -vv" 94 | # Global timeout for all tests. There should be a good reason for a test to 95 | # take more than 5 seconds. 96 | timeout = 5 97 | # Registering custom markers. 
98 | # https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers 99 | markers = [ 100 | "asyncio: mark tests as requiring asyncio", 101 | ] 102 | asyncio_mode = "auto" 103 | -------------------------------------------------------------------------------- /backend/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/scripts/__init__.py -------------------------------------------------------------------------------- /backend/scripts/local_entry_point.sh: -------------------------------------------------------------------------------- 1 | # -e: fail on any nonzero exit status 2 | # -u: fail if any referenced variables are not set 3 | # -x: print commands before running them 4 | # -o pipefail: fail if a command in a pipe has a nonzero exit code 5 | set -euxo pipefail 6 | 7 | # For now just create the db if it doesn't exist 8 | python -m scripts.run_migrations create 9 | 10 | uvicorn server.main:app --host 0.0.0.0 --port 8000 --reload 11 | -------------------------------------------------------------------------------- /backend/scripts/prod_entry_point.sh: -------------------------------------------------------------------------------- 1 | # -e: fail on any nonzero exit status 2 | # -u: fail if any referenced variables are not set 3 | # -x: print commands before running them 4 | # -o pipefail: fail if a command in a pipe has a nonzero exit code 5 | set -euxo pipefail 6 | 7 | # For now just create the db if it doesn't exist 8 | python -m scripts.run_migrations create 9 | 10 | uvicorn server.main:app --host 0.0.0.0 --port 8080 --reload 11 | -------------------------------------------------------------------------------- /backend/scripts/run_migrations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Run migrations.""" 3 | import click 4 | 5 | from db.models import ENGINE, Base 6 | 7 | 8 | @click.group() 9 | def cli(): 10 | """Database migration commands.""" 11 | pass 12 | 13 | 14 | @cli.command() 15 | def create(): 16 | """Create all tables.""" 17 | Base.metadata.create_all(ENGINE) 18 | click.echo("All tables created successfully.") 19 | 20 | 21 | @cli.command() 22 | @click.confirmation_option(prompt="Are you sure you want to drop all tables?") 23 | def drop(): 24 | """Drop all tables.""" 25 | Base.metadata.drop_all(ENGINE) 26 | click.echo("All tables dropped successfully.") 27 | 28 | 29 | @cli.command() 30 | def create_test_db(): 31 | """Create a test database called langchain_test used for testing purposes.""" 32 | import psycopg2 33 | from psycopg2.errors import DuplicateDatabase 34 | 35 | # establishing the connection 36 | conn = psycopg2.connect( 37 | database="langchain", 38 | user="langchain", 39 | password="langchain", 40 | host="localhost", 41 | port="5432", 42 | ) 43 | conn.autocommit = True 44 | 45 | # Creating a cursor object using the cursor() method 46 | with conn.cursor() as cursor: 47 | # Preparing query to create a database 48 | sql = "CREATE DATABASE langchain_test;" 49 | 50 | # Creating a database 51 | try: 52 | cursor.execute(sql) 53 | print("Database created successfully.") 54 | except DuplicateDatabase: 55 | print("Database already exists") 56 | 57 | 58 | if __name__ == "__main__": 59 | cli() 60 | -------------------------------------------------------------------------------- /backend/server/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/server/__init__.py -------------------------------------------------------------------------------- /backend/server/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/server/api/__init__.py -------------------------------------------------------------------------------- /backend/server/api/api_key.py: -------------------------------------------------------------------------------- 1 | from fastapi.security import APIKeyHeader 2 | 3 | # For actual auth, you'd need to check the key against a database or some other 4 | # data store. Here, we don't need actual auth, just a key that matches 5 | # a UUID 6 | UserToken = APIKeyHeader(name="x-key") 7 | -------------------------------------------------------------------------------- /backend/server/api/configurables.py: -------------------------------------------------------------------------------- 1 | """Endpoint for listing available chat models for extraction.""" 2 | from typing import List 3 | 4 | from fastapi import APIRouter 5 | from typing_extensions import TypedDict 6 | 7 | from extraction.parsing import MAX_FILE_SIZE_MB, SUPPORTED_MIMETYPES 8 | from server.models import SUPPORTED_MODELS 9 | from server.settings import MAX_CHUNKS, MAX_CONCURRENCY 10 | 11 | router = APIRouter( 12 | prefix="/configuration", 13 | tags=["Configuration"], 14 | responses={404: {"description": "Not found"}}, 15 | ) 16 | 17 | 18 | class ConfigurationResponse(TypedDict): 19 | """Response for configuration.""" 20 | 21 | available_models: List[str] 22 | accepted_mimetypes: List[str] 23 | max_file_size_mb: int 24 | max_concurrency: int 25 | max_chunks: int 26 | models: List[dict] 27 | 28 | 29 | @router.get("") 30 | def get() -> ConfigurationResponse: 31 | """Endpoint to show server configuration.""" 32 | return { 33 | "available_models": sorted(SUPPORTED_MODELS), # Deprecate 34 | "models": [ 35 | { 36 | "name": model, 37 | "description": data["description"], 38 | } 39 | for model, data in SUPPORTED_MODELS.items() 40 | ], 41 | "accepted_mimetypes": SUPPORTED_MIMETYPES, 42 | "max_file_size_mb": MAX_FILE_SIZE_MB, 43 | "max_concurrency": MAX_CONCURRENCY, 44 | "max_chunks": MAX_CHUNKS, 45 | } 46 | -------------------------------------------------------------------------------- /backend/server/api/examples.py: -------------------------------------------------------------------------------- 1 | """Endpoints for managing definition of examples..""" 2 | from typing import Any, List 3 | from uuid import UUID 4 | 5 | from fastapi import APIRouter, Depends, HTTPException 6 | from sqlalchemy.orm import Session 7 | from typing_extensions import Annotated, TypedDict 8 | 9 | from db.models import Example, get_session, validate_extractor_owner 10 | from server.api.api_key import UserToken 11 | 12 | router = APIRouter( 13 | prefix="/examples", 14 | tags=["example definitions"], 15 | responses={404: {"description": "Not found"}}, 16 | ) 17 | 18 | 19 | class CreateExample(TypedDict): 20 | """A request to create an example.""" 21 | 22 | extractor_id: Annotated[UUID, "The extractor ID that this is an example for."] 23 | content: Annotated[str, "The input portion of the example."] 24 | output: Annotated[ 25 | List[Any], "JSON object that is 
expected to be extracted from the content." 26 | ] 27 | 28 | 29 | class CreateExampleResponse(TypedDict): 30 | """Response for creating an example.""" 31 | 32 | uuid: UUID 33 | 34 | 35 | @router.post("") 36 | def create( 37 | create_request: CreateExample, 38 | *, 39 | session: Session = Depends(get_session), 40 | user_id: UUID = Depends(UserToken), 41 | ) -> CreateExampleResponse: 42 | """Endpoint to create an example.""" 43 | if not validate_extractor_owner(session, create_request["extractor_id"], user_id): 44 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 45 | 46 | instance = Example( 47 | extractor_id=create_request["extractor_id"], 48 | content=create_request["content"], 49 | output=create_request["output"], 50 | ) 51 | session.add(instance) 52 | session.commit() 53 | return {"uuid": instance.uuid} 54 | 55 | 56 | @router.get("") 57 | def list( 58 | extractor_id: UUID, 59 | *, 60 | limit: int = 10, 61 | offset: int = 0, 62 | session=Depends(get_session), 63 | user_id: UUID = Depends(UserToken), 64 | ) -> List[Any]: 65 | """Endpoint to get all examples.""" 66 | if not validate_extractor_owner(session, extractor_id, user_id): 67 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 68 | return ( 69 | session.query(Example) 70 | .filter(Example.extractor_id == extractor_id) 71 | .order_by(Example.uuid) 72 | .limit(limit) 73 | .offset(offset) 74 | .all() 75 | ) 76 | 77 | 78 | @router.delete("/{uuid}") 79 | def delete( 80 | uuid: UUID, 81 | *, 82 | session: Session = Depends(get_session), 83 | user_id: UUID = Depends(UserToken), 84 | ) -> None: 85 | """Endpoint to delete an example.""" 86 | example = session.query(Example).filter_by(uuid=str(uuid)).first() 87 | if example is None: 88 | raise HTTPException(status_code=404, detail="Example not found.") 89 | extractor_id = example.extractor_id 90 | if not validate_extractor_owner(session, extractor_id, user_id): 91 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 92 | session.query(Example).filter_by(uuid=str(uuid)).delete() 93 | session.commit() 94 | -------------------------------------------------------------------------------- /backend/server/api/extract.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional 2 | from uuid import UUID 3 | 4 | from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile 5 | from sqlalchemy.orm import Session 6 | from typing_extensions import Annotated 7 | 8 | from db.models import Extractor, SharedExtractors, get_session 9 | from extraction.parsing import parse_binary_input 10 | from server.api.api_key import UserToken 11 | from server.extraction_runnable import ExtractResponse, extract_entire_document 12 | from server.models import DEFAULT_MODEL 13 | from server.retrieval import extract_from_content 14 | 15 | router = APIRouter( 16 | prefix="/extract", 17 | tags=["extract"], 18 | responses={404: {"description": "Not found"}}, 19 | ) 20 | 21 | 22 | @router.post("", response_model=ExtractResponse) 23 | async def extract_using_existing_extractor( 24 | *, 25 | extractor_id: Annotated[UUID, Form()], 26 | text: Optional[str] = Form(None), 27 | mode: Literal["entire_document", "retrieval"] = Form("entire_document"), 28 | file: Optional[UploadFile] = File(None), 29 | model_name: Optional[str] = Form(DEFAULT_MODEL), 30 | session: Session = Depends(get_session), 31 | user_id: UUID = Depends(UserToken), 32 | ) -> ExtractResponse: 33 | 
"""Endpoint that is used with an existing extractor. 34 | 35 | This endpoint will be expanded to support upload of binary files as well as 36 | text files. 37 | """ 38 | if text is None and file is None: 39 | raise HTTPException(status_code=422, detail="No text or file provided.") 40 | 41 | extractor = ( 42 | session.query(Extractor).filter_by(uuid=extractor_id, owner_id=user_id).scalar() 43 | ) 44 | if extractor is None: 45 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 46 | 47 | if text: 48 | text_ = text 49 | else: 50 | documents = parse_binary_input(file.file) 51 | # TODO: Add metadata like location from original file where 52 | # the text was extracted from 53 | text_ = "\n".join([document.page_content for document in documents]) 54 | 55 | if mode == "entire_document": 56 | return await extract_entire_document(text_, extractor, model_name) 57 | elif mode == "retrieval": 58 | return await extract_from_content(text_, extractor, model_name) 59 | else: 60 | raise ValueError( 61 | f"Invalid mode {mode}. Expected one of 'entire_document', 'retrieval'." 62 | ) 63 | 64 | 65 | @router.post("/shared", response_model=ExtractResponse) 66 | async def extract_using_shared_extractor( 67 | *, 68 | extractor_id: Annotated[UUID, Form()], 69 | text: Optional[str] = Form(None), 70 | mode: Literal["entire_document", "retrieval"] = Form("entire_document"), 71 | file: Optional[UploadFile] = File(None), 72 | model_name: Optional[str] = Form("default"), 73 | session: Session = Depends(get_session), 74 | ) -> ExtractResponse: 75 | """Endpoint that is used with an existing extractor. 76 | 77 | Args: 78 | extractor_id: The UUID of the shared extractor. 79 | This is the UUID that is used to share the extractor, not 80 | the UUID of the extractor itself. 81 | text: The text to extract from. 82 | mode: The mode to use for extraction. 83 | file: The file to extract from. 84 | model_name: The model to use for extraction. 85 | session: The database session. 86 | 87 | """ 88 | if text is None and file is None: 89 | raise HTTPException(status_code=422, detail="No text or file provided.") 90 | 91 | extractor = ( 92 | session.query(Extractor) 93 | .join(SharedExtractors, Extractor.uuid == SharedExtractors.extractor_id) 94 | .filter(SharedExtractors.share_token == extractor_id) 95 | .scalar() 96 | ) 97 | 98 | if not extractor: 99 | raise HTTPException(status_code=404, detail="Extractor not found.") 100 | 101 | if text: 102 | text_ = text 103 | else: 104 | documents = parse_binary_input(file.file) 105 | # TODO: Add metadata like location from original file where 106 | # the text was extracted from 107 | text_ = "\n".join([document.page_content for document in documents]) 108 | 109 | if mode == "entire_document": 110 | return await extract_entire_document(text_, extractor, model_name) 111 | elif mode == "retrieval": 112 | return await extract_from_content(text_, extractor, model_name) 113 | else: 114 | raise ValueError( 115 | f"Invalid mode {mode}. Expected one of 'entire_document', 'retrieval'." 
116 | ) 117 | -------------------------------------------------------------------------------- /backend/server/api/extractors.py: -------------------------------------------------------------------------------- 1 | """Endpoints for managing definition of extractors.""" 2 | from typing import Any, Dict, List 3 | from uuid import UUID, uuid4 4 | 5 | from fastapi import APIRouter, Depends, HTTPException 6 | from pydantic import BaseModel, Field, validator 7 | from sqlalchemy.exc import IntegrityError 8 | from sqlalchemy.orm import Session 9 | 10 | from db.models import Extractor, SharedExtractors, get_session, validate_extractor_owner 11 | from server.api.api_key import UserToken 12 | from server.validators import validate_json_schema 13 | 14 | router = APIRouter( 15 | prefix="/extractors", 16 | tags=["extractor definitions"], 17 | responses={404: {"description": "Not found"}}, 18 | ) 19 | 20 | 21 | class CreateExtractor(BaseModel): 22 | """A request to create an extractor.""" 23 | 24 | name: str = Field(default="", description="The name of the extractor.") 25 | 26 | description: str = Field( 27 | default="", description="Short description of the extractor." 28 | ) 29 | json_schema: Dict[str, Any] = Field( 30 | ..., description="The schema to use for extraction.", alias="schema" 31 | ) 32 | instruction: str = Field(..., description="The instruction to use for extraction.") 33 | 34 | @validator("json_schema") 35 | def validate_schema(cls, v: Any) -> Dict[str, Any]: 36 | """Validate the schema.""" 37 | validate_json_schema(v) 38 | return v 39 | 40 | 41 | class CreateExtractorResponse(BaseModel): 42 | """Response for creating an extractor.""" 43 | 44 | uuid: UUID = Field(..., description="The UUID of the created extractor.") 45 | 46 | 47 | class ShareExtractorRequest(BaseModel): 48 | """A request to share an extractor.""" 49 | 50 | uuid: UUID = Field(..., description="The UUID of the extractor to share.") 51 | 52 | 53 | class ShareExtractorResponse(BaseModel): 54 | """Response for sharing an extractor.""" 55 | 56 | share_uuid: UUID = Field(..., description="The UUID for the shared extractor.") 57 | 58 | 59 | @router.post("/{uuid}/share", response_model=ShareExtractorResponse) 60 | def share( 61 | uuid: UUID, 62 | *, 63 | session: Session = Depends(get_session), 64 | user_id: UUID = Depends(UserToken), 65 | ) -> ShareExtractorResponse: 66 | """Endpoint to share an extractor. 67 | 68 | Look up a shared extractor by UUID and return the share UUID if it exists. 69 | If not shared, create a new shared extractor entry and return the new share UUID. 70 | 71 | Args: 72 | uuid: The UUID of the extractor to share. 73 | session: The database session. 74 | 75 | Returns: 76 | The UUID for the shared extractor. 
77 | """ 78 | if not validate_extractor_owner(session, uuid, user_id): 79 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 80 | # Check if the extractor is already shared 81 | shared_extractor = ( 82 | session.query(SharedExtractors) 83 | .filter(SharedExtractors.extractor_id == uuid) 84 | .scalar() 85 | ) 86 | 87 | if shared_extractor: 88 | # The extractor is already shared, return the existing share_uuid 89 | return ShareExtractorResponse(share_uuid=shared_extractor.share_token) 90 | 91 | # If not shared, create a new shared extractor entry 92 | new_shared_extractor = SharedExtractors( 93 | extractor_id=uuid, 94 | # This will automatically generate a new UUID for share_token 95 | share_token=uuid4(), 96 | ) 97 | 98 | session.add(new_shared_extractor) 99 | try: 100 | session.commit() 101 | except IntegrityError: 102 | session.rollback() 103 | raise HTTPException(status_code=400, detail="Failed to share the extractor.") 104 | 105 | # Return the new share_uuid 106 | return ShareExtractorResponse(share_uuid=new_shared_extractor.share_token) 107 | 108 | 109 | @router.post("") 110 | def create( 111 | create_request: CreateExtractor, 112 | *, 113 | session: Session = Depends(get_session), 114 | user_id: UUID = Depends(UserToken), 115 | ) -> CreateExtractorResponse: 116 | """Endpoint to create an extractor.""" 117 | 118 | instance = Extractor( 119 | name=create_request.name, 120 | owner_id=user_id, 121 | schema=create_request.json_schema, 122 | description=create_request.description, 123 | instruction=create_request.instruction, 124 | ) 125 | session.add(instance) 126 | session.commit() 127 | return CreateExtractorResponse(uuid=instance.uuid) 128 | 129 | 130 | @router.get("/{uuid}") 131 | def get( 132 | uuid: UUID, 133 | *, 134 | session: Session = Depends(get_session), 135 | user_id: UUID = Depends(UserToken), 136 | ) -> Dict[str, Any]: 137 | """Endpoint to get an extractor.""" 138 | extractor = ( 139 | session.query(Extractor).filter_by(uuid=str(uuid), owner_id=user_id).scalar() 140 | ) 141 | if extractor is None: 142 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 143 | return { 144 | "uuid": extractor.uuid, 145 | "name": extractor.name, 146 | "description": extractor.description, 147 | "schema": extractor.schema, 148 | "instruction": extractor.instruction, 149 | } 150 | 151 | 152 | @router.get("") 153 | def list( 154 | *, 155 | limit: int = 10, 156 | offset: int = 0, 157 | session=Depends(get_session), 158 | user_id: UUID = Depends(UserToken), 159 | ) -> List[Any]: 160 | """Endpoint to get all extractors.""" 161 | return ( 162 | session.query(Extractor) 163 | .filter_by(owner_id=user_id) 164 | .limit(limit) 165 | .offset(offset) 166 | .all() 167 | ) 168 | 169 | 170 | @router.delete("/{uuid}") 171 | def delete( 172 | uuid: UUID, 173 | *, 174 | session: Session = Depends(get_session), 175 | user_id: UUID = Depends(UserToken), 176 | ) -> None: 177 | """Endpoint to delete an extractor.""" 178 | session.query(Extractor).filter_by(uuid=str(uuid), owner_id=user_id).delete() 179 | session.commit() 180 | -------------------------------------------------------------------------------- /backend/server/api/shared.py: -------------------------------------------------------------------------------- 1 | """Endpoints for working with shared resources.""" 2 | from typing import Any, Dict 3 | from uuid import UUID 4 | 5 | from fastapi import APIRouter, Depends, HTTPException 6 | from pydantic import BaseModel, Field 7 | from sqlalchemy.orm import 
Session 8 | 9 | from db.models import Extractor, SharedExtractors, get_session 10 | 11 | router = APIRouter( 12 | prefix="/shared/extractors", 13 | tags=["extractor definitions"], 14 | responses={404: {"description": "Not found"}}, 15 | ) 16 | 17 | 18 | class SharedExtractorResponse(BaseModel): 19 | """Response for a shared extractor.""" 20 | 21 | # UUID should not be included in the response since it is not a public identifier! 22 | name: str 23 | description: str 24 | # schema is a reserved keyword by pydantic 25 | schema_: Dict[str, Any] = Field(..., alias="schema") 26 | instruction: str 27 | 28 | 29 | @router.get("/{uuid}") 30 | def get( 31 | uuid: UUID, 32 | *, 33 | session: Session = Depends(get_session), 34 | ) -> SharedExtractorResponse: 35 | """Get a shared extractor.""" 36 | extractor = ( 37 | session.query(Extractor) 38 | .join(SharedExtractors, Extractor.uuid == SharedExtractors.extractor_id) 39 | .filter(SharedExtractors.share_token == uuid) 40 | .first() 41 | ) 42 | 43 | if not extractor: 44 | raise HTTPException(status_code=404, detail="Extractor not found.") 45 | 46 | return SharedExtractorResponse( 47 | name=extractor.name, 48 | description=extractor.description, 49 | schema=extractor.schema, 50 | instruction=extractor.instruction, 51 | ) 52 | -------------------------------------------------------------------------------- /backend/server/api/suggest.py: -------------------------------------------------------------------------------- 1 | """Module to handle the suggest API endpoint. 2 | 3 | This is logic that leverages LLMs to suggest an extractor for a given task. 4 | """ 5 | from typing import Optional 6 | 7 | from fastapi import APIRouter 8 | from langchain_core.prompts import ChatPromptTemplate 9 | from pydantic import BaseModel, Field 10 | 11 | from server.models import get_model 12 | 13 | router = APIRouter( 14 | prefix="/suggest", 15 | tags=["Suggest an extractor"], 16 | responses={404: {"description": "Not found"}}, 17 | ) 18 | 19 | 20 | model = get_model() 21 | 22 | 23 | class SuggestExtractor(BaseModel): 24 | """A request to suggest an extractor.""" 25 | 26 | description: str = Field( 27 | default="", 28 | description=( 29 | "Short description of what information the extractor is extracting." 30 | ), 31 | ) 32 | jsonSchema: Optional[str] = Field( 33 | default=None, 34 | description=( 35 | "Existing JSON Schema that describes the entity / " 36 | "information that should be extracted." 37 | ), 38 | ) 39 | 40 | 41 | class ExtractorDefinition(BaseModel): 42 | """Define an information extractor to be used in an information extraction system.""" # noqa: E501 43 | 44 | json_schema: str = Field( 45 | ..., 46 | description=( 47 | "JSON Schema that describes the entity / " 48 | "information that should be extracted. " 49 | "This schema is specified in JSON Schema format. " 50 | ), 51 | ) 52 | 53 | 54 | SUGGEST_PROMPT = ChatPromptTemplate.from_messages( 55 | [ 56 | ( 57 | "system", 58 | "You are an expert ontologist and have been asked to help a user " 59 | "define an information extractor. The user will describe an entity, " 60 | "a topic or a piece of information that they would like to extract from " 61 | "text. Based on the user input, you are to provide a schema and " 62 | "description for the extractor. The schema should be a JSON Schema that " 63 | "describes the entity or information to be extracted. " 64 | "Make sure to include title and description for all the " 65 | "attributes in the schema. The JSON Schema should describe a top level " 66 | "object. The object MUST have a title and description. Unless otherwise " 67 | "stated all entity properties in the schema should be considered optional.", 68 | ), 69 | ("human", "{input}"), 70 | ] 71 | ) 72 | 73 | suggestion_chain = SUGGEST_PROMPT | model.with_structured_output( 74 | schema=ExtractorDefinition 75 | ).with_config({"run_name": "suggest"}) 76 | 77 | UPDATE_PROMPT = ChatPromptTemplate.from_messages( 78 | [ 79 | ( 80 | "system", 81 | "You are an expert ontologist and have been asked to help a user " 82 | "define an information extractor. The existing extractor schema is " 83 | "provided.\n```\n{json_schema}\n```\nThe user will describe a desired " 84 | "modification to the schema (e.g., adding a new field, changing a field " 85 | "type, etc.). Your goal is to provide a new schema that incorporates the " 86 | "user's desired modification. The user may also request a completely new " 87 | "schema, in which case you should provide a new schema based on the " 88 | "user's input, and ignore the existing schema. The JSON Schema should " 89 | "describe a top level object. The object MUST have a title and " 90 | "description. Unless otherwise stated all entity properties in the schema " 91 | "should be considered optional.", 92 | ), 93 | ("human", "{input}"), 94 | ] 95 | ) 96 | 97 | UPDATE_CHAIN = ( 98 | UPDATE_PROMPT | model.with_structured_output(schema=ExtractorDefinition) 99 | ).with_config({"run_name": "suggest_update"}) 100 | 101 | 102 | # PUBLIC API 103 | 104 | 105 | @router.post("") 106 | async def suggest(request: SuggestExtractor) -> ExtractorDefinition: 107 | """Endpoint to suggest an extractor.""" 108 | if request.jsonSchema and len(request.jsonSchema) > 10: 109 | return await UPDATE_CHAIN.ainvoke( 110 | {"input": request.description, "json_schema": request.jsonSchema} 111 | ) 112 | return await suggestion_chain.ainvoke({"input": request.description}) 113 | -------------------------------------------------------------------------------- /backend/server/extraction_runnable.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import uuid 5 | from typing import Any, Dict, List, Optional, Sequence 6 | 7 | from fastapi import HTTPException 8 | from jsonschema import Draft202012Validator, exceptions 9 | from langchain.text_splitter import TokenTextSplitter 10 | from langchain_core.messages import AIMessage, HumanMessage, ToolMessage 11 | from langchain_core.prompts import ChatPromptTemplate 12 | from langchain_core.runnables import chain 13 | from langserve import CustomUserType 14 | from pydantic import BaseModel, Field, validator 15 | from typing_extensions import TypedDict 16 | 17 | from db.models import Example, Extractor 18 | from extraction.utils import update_json_schema 19 | from server import settings 20 | from server.models import DEFAULT_MODEL, get_chunk_size, get_model 21 | from server.validators import validate_json_schema 22 | 23 | 24 | class ExtractionExample(BaseModel): 25 | """An example extraction. 26 | 27 | This example consists of a text and the expected output of the extraction. 28 | """ 29 | 30 | text: str = Field(..., description="The input text") 31 | output: List[Dict[str, Any]] = Field( 32 | ..., description="The expected output of the example. A list of objects." 
33 | ) 34 | 35 | 36 | class ExtractRequest(CustomUserType): 37 | """Request body for the extract endpoint.""" 38 | 39 | text: str = Field(..., description="The text to extract from.") 40 | json_schema: Dict[str, Any] = Field( 41 | ..., 42 | description="JSON schema that describes what content should be extracted " 43 | "from the text.", 44 | alias="schema", 45 | ) 46 | instructions: Optional[str] = Field( 47 | None, description="Supplemental system instructions." 48 | ) 49 | examples: Optional[List[ExtractionExample]] = Field( 50 | None, description="Examples of extractions." 51 | ) 52 | model_name: Optional[str] = Field("gpt-3.5-turbo", description="Chat model to use.") 53 | 54 | @validator("json_schema") 55 | def validate_schema(cls, v: Any) -> Dict[str, Any]: 56 | """Validate the schema.""" 57 | validate_json_schema(v) 58 | return v 59 | 60 | 61 | class ExtractResponse(TypedDict, total=False): 62 | """Response body for the extract endpoint.""" 63 | 64 | data: List[Any] 65 | # content_too_long will be set to True if the content was too long 66 | # and had to be truncated 67 | content_too_long: Optional[bool] 68 | 69 | 70 | def _cast_example_to_dict(example: Example) -> Dict[str, Any]: 71 | """Cast example record to dictionary.""" 72 | return { 73 | "text": example.content, 74 | "output": example.output, 75 | } 76 | 77 | 78 | def _make_prompt_template( 79 | instructions: Optional[str], 80 | examples: Optional[Sequence[ExtractionExample]], 81 | function_name: str, 82 | ) -> ChatPromptTemplate: 83 | """Make a prompt template from instructions and examples.""" 84 | prefix = ( 85 | "You are a top-tier algorithm for extracting information from text. " 86 | "Only extract information that is relevant to the provided text. " 87 | "If no information is relevant, use the schema and output " 88 | "an empty list where appropriate." 89 | ) 90 | if instructions: 91 | system_message = ("system", f"{prefix}\n\n{instructions}") 92 | else: 93 | system_message = ("system", prefix) 94 | prompt_components = [system_message] 95 | if examples is not None: 96 | few_shot_prompt = [] 97 | for example in examples: 98 | # TODO: We'll need to refactor this at some point to 99 | # support other encoding strategies. The function calling logic here 100 | # has some hard-coded assumptions (e.g., name of parameters like `data`). 101 | _id = uuid.uuid4().hex[:] 102 | tool_call = { 103 | "args": {"data": example.output}, 104 | "name": function_name, 105 | "id": _id, 106 | } 107 | few_shot_prompt.extend( 108 | [ 109 | HumanMessage(content=example.text), 110 | AIMessage(content="", tool_calls=[tool_call]), 111 | ToolMessage( 112 | content="You have correctly called this tool.", tool_call_id=_id 113 | ), 114 | ] 115 | ) 116 | prompt_components.extend(few_shot_prompt) 117 | 118 | prompt_components.append( 119 | ( 120 | "human", 121 | "I need to extract information from " 122 | "the following text: ```\n{text}\n```\n", 123 | ), 124 | ) 125 | return ChatPromptTemplate.from_messages(prompt_components) 126 | 127 | 128 | # PUBLIC API 129 | 130 | 131 | def deduplicate( 132 | extract_responses: Sequence[ExtractResponse], 133 | ) -> ExtractResponse: 134 | """Deduplicate the results. 135 | 136 | The deduplication is done by comparing the serialized JSON of each of the results 137 | and only keeping the unique ones. 
138 | """ 139 | unique_extracted = [] 140 | seen = set() 141 | for response in extract_responses: 142 | for data_item in response["data"]: 143 | # Serialize the data item for comparison purposes 144 | serialized = json.dumps(data_item, sort_keys=True) 145 | if serialized not in seen: 146 | seen.add(serialized) 147 | unique_extracted.append(data_item) 148 | 149 | return { 150 | "data": unique_extracted, 151 | } 152 | 153 | 154 | def get_examples_from_extractor(extractor: Extractor) -> List[Dict[str, Any]]: 155 | """Get examples from an extractor.""" 156 | return [_cast_example_to_dict(example) for example in extractor.examples] 157 | 158 | 159 | @chain 160 | async def extraction_runnable(extraction_request: ExtractRequest) -> ExtractResponse: 161 | """An end point to extract content from a given text object.""" 162 | # TODO: Add validation for model context window size 163 | schema = update_json_schema(extraction_request.json_schema) 164 | try: 165 | Draft202012Validator.check_schema(schema) 166 | except exceptions.ValidationError as e: 167 | raise HTTPException(status_code=422, detail=f"Invalid schema: {e.message}") 168 | 169 | prompt = _make_prompt_template( 170 | extraction_request.instructions, 171 | extraction_request.examples, 172 | schema["title"], 173 | ) 174 | model = get_model(extraction_request.model_name) 175 | runnable = (prompt | model.with_structured_output(schema=schema)).with_config( 176 | {"run_name": "extraction"} 177 | ) 178 | 179 | return await runnable.ainvoke({"text": extraction_request.text}) 180 | 181 | 182 | async def extract_entire_document( 183 | content: str, 184 | extractor: Extractor, 185 | model_name: str, 186 | ) -> ExtractResponse: 187 | """Extract from entire document.""" 188 | 189 | json_schema = extractor.schema 190 | examples = get_examples_from_extractor(extractor) 191 | text_splitter = TokenTextSplitter( 192 | chunk_size=get_chunk_size(model_name), 193 | chunk_overlap=20, 194 | model_name=DEFAULT_MODEL, 195 | ) 196 | texts = text_splitter.split_text(content) 197 | extraction_requests = [ 198 | ExtractRequest( 199 | text=text, 200 | schema=json_schema, 201 | instructions=extractor.instruction, # TODO: consistent naming 202 | examples=examples, 203 | model_name=model_name, 204 | ) 205 | for text in texts 206 | ] 207 | 208 | # Limit the number of chunks to process 209 | if len(extraction_requests) > settings.MAX_CHUNKS and settings.MAX_CHUNKS > 0: 210 | content_too_long = True 211 | extraction_requests = extraction_requests[: settings.MAX_CHUNKS] 212 | else: 213 | content_too_long = False 214 | 215 | # Run extractions which may potentially yield duplicate results 216 | extract_responses: List[ExtractResponse] = await extraction_runnable.abatch( 217 | extraction_requests, {"max_concurrency": settings.MAX_CONCURRENCY} 218 | ) 219 | # Deduplicate the results 220 | return { 221 | "data": deduplicate(extract_responses)["data"], 222 | "content_too_long": content_too_long, 223 | } 224 | -------------------------------------------------------------------------------- /backend/server/main.py: -------------------------------------------------------------------------------- 1 | """Entry point into the server.""" 2 | import logging 3 | import os 4 | from pathlib import Path 5 | 6 | from fastapi import FastAPI 7 | from fastapi.middleware.cors import CORSMiddleware 8 | from fastapi.staticfiles import StaticFiles 9 | from langserve import add_routes 10 | 11 | from server.api import configurables, examples, extract, extractors, shared, suggest 12 | from 
server.extraction_runnable import ( 13 | ExtractRequest, 14 | ExtractResponse, 15 | extraction_runnable, 16 | ) 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | app = FastAPI( 21 | title="Extraction Powered by LangChain", 22 | description="An extraction service powered by LangChain.", 23 | version="0.0.1", 24 | openapi_tags=[ 25 | { 26 | "name": "extraction", 27 | "description": "Operations related to extracting content from text.", 28 | } 29 | ], 30 | ) 31 | 32 | 33 | ROOT = Path(__file__).parent.parent 34 | 35 | ORIGINS = os.environ.get("CORS_ORIGINS", "").split(",") 36 | 37 | if ORIGINS: 38 | app.add_middleware( 39 | CORSMiddleware, 40 | allow_origins=ORIGINS, 41 | allow_credentials=True, 42 | allow_methods=["*"], 43 | allow_headers=["*"], 44 | ) 45 | 46 | 47 | @app.get("/ready") 48 | def ready() -> str: 49 | return "ok" 50 | 51 | 52 | # Include API endpoints for extractor definitions 53 | app.include_router(extractors.router) 54 | app.include_router(examples.router) 55 | app.include_router(extract.router) 56 | app.include_router(suggest.router) 57 | app.include_router(shared.router) 58 | app.include_router(configurables.router) 59 | 60 | add_routes( 61 | app, 62 | extraction_runnable.with_types( 63 | input_type=ExtractRequest, output_type=ExtractResponse 64 | ), 65 | path="/extract_text", 66 | enabled_endpoints=["invoke", "batch"], 67 | ) 68 | 69 | 70 | # Serve the frontend 71 | UI_DIR = str(ROOT / "ui") 72 | 73 | if os.path.exists(UI_DIR): 74 | app.mount("/", StaticFiles(directory=UI_DIR, html=True), name="ui") 75 | else: 76 | logger.warning("No UI directory found, serving API only.") 77 | 78 | 79 | if __name__ == "__main__": 80 | import uvicorn 81 | 82 | uvicorn.run(app, host="localhost", port=8000) 83 | -------------------------------------------------------------------------------- /backend/server/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | from langchain_anthropic import ChatAnthropic 5 | from langchain_core.language_models.chat_models import BaseChatModel 6 | from langchain_fireworks import ChatFireworks 7 | from langchain_groq import ChatGroq 8 | from langchain_openai import ChatOpenAI 9 | 10 | 11 | def get_supported_models(): 12 | """Get models according to environment secrets.""" 13 | models = {} 14 | if "OPENAI_API_KEY" in os.environ: 15 | models["gpt-3.5-turbo"] = { 16 | "chat_model": ChatOpenAI(model="gpt-3.5-turbo", temperature=0), 17 | "description": "GPT-3.5 Turbo", 18 | } 19 | if os.environ.get("DISABLE_GPT4", "").lower() != "true": 20 | models["gpt-4-0125-preview"] = { 21 | "chat_model": ChatOpenAI(model="gpt-4-0125-preview", temperature=0), 22 | "description": "GPT-4 0125 Preview", 23 | } 24 | if "FIREWORKS_API_KEY" in os.environ: 25 | models["fireworks"] = { 26 | "chat_model": ChatFireworks( 27 | model="accounts/fireworks/models/firefunction-v1", 28 | temperature=0, 29 | ), 30 | "description": "Fireworks Firefunction-v1", 31 | } 32 | if "TOGETHER_API_KEY" in os.environ: 33 | models["together-ai-mistral-8x7b-instruct-v0.1"] = { 34 | "chat_model": ChatOpenAI( 35 | base_url="https://api.together.xyz/v1", 36 | api_key=os.environ["TOGETHER_API_KEY"], 37 | model="mistralai/Mixtral-8x7B-Instruct-v0.1", 38 | temperature=0, 39 | ), 40 | "description": "Mixtral 8x7B Instruct v0.1 (Together AI)", 41 | } 42 | if "ANTHROPIC_API_KEY" in os.environ: 43 | models["claude-3-sonnet-20240229"] = { 44 | "chat_model": ChatAnthropic( 45 | model="claude-3-sonnet-20240229", 
temperature=0 46 | ), 47 | "description": "Claude 3 Sonnet", 48 | } 49 | if "GROQ_API_KEY" in os.environ: 50 | models["groq-llama3-8b-8192"] = { 51 | "chat_model": ChatGroq( 52 | model="llama3-8b-8192", 53 | temperature=0, 54 | ), 55 | "description": "GROQ Llama 3 8B", 56 | } 57 | 58 | return models 59 | 60 | 61 | SUPPORTED_MODELS = get_supported_models() 62 | DEFAULT_MODEL = "gpt-3.5-turbo" 63 | 64 | 65 | CHUNK_SIZES = { # in tokens, defaults to int(4_096 * 0.8). Override here. 66 | "gpt-4-0125-preview": int(128_000 * 0.8), 67 | } 68 | 69 | 70 | def get_chunk_size(model_name: str) -> int: 71 | """Get the chunk size.""" 72 | return CHUNK_SIZES.get(model_name, int(4_096 * 0.8)) 73 | 74 | 75 | def get_model(model_name: Optional[str] = None) -> BaseChatModel: 76 | """Get the model.""" 77 | if model_name is None: 78 | return SUPPORTED_MODELS[DEFAULT_MODEL]["chat_model"] 79 | else: 80 | supported_model_names = list(SUPPORTED_MODELS.keys()) 81 | if model_name not in supported_model_names: 82 | raise ValueError( 83 | f"Model {model_name} not found. " 84 | f"Supported models: {supported_model_names}" 85 | ) 86 | else: 87 | return SUPPORTED_MODELS[model_name]["chat_model"] 88 | -------------------------------------------------------------------------------- /backend/server/retrieval.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | from typing import Any, Dict, List, Optional 3 | 4 | from langchain.text_splitter import CharacterTextSplitter 5 | from langchain_community.vectorstores import FAISS 6 | from langchain_core.runnables import RunnableLambda 7 | from langchain_openai import OpenAIEmbeddings 8 | 9 | from db.models import Extractor 10 | from server.extraction_runnable import ( 11 | ExtractRequest, 12 | ExtractResponse, 13 | deduplicate, 14 | extraction_runnable, 15 | get_examples_from_extractor, 16 | ) 17 | 18 | 19 | def _make_extract_requests(input_dict: Dict[str, Any]) -> List[ExtractRequest]: 20 | docs = input_dict.pop("text") 21 | return [ExtractRequest(text=doc.page_content, **input_dict) for doc in docs] 22 | 23 | 24 | async def extract_from_content( 25 | content: str, 26 | extractor: Extractor, 27 | model_name: str, 28 | *, 29 | text_splitter_kwargs: Optional[Dict[str, Any]] = None, 30 | ) -> ExtractResponse: 31 | """Extract from potentially long-form content.""" 32 | if text_splitter_kwargs is None: 33 | text_splitter_kwargs = { 34 | "separator": "\n\n", 35 | "chunk_size": 1000, 36 | "chunk_overlap": 50, 37 | } 38 | text_splitter = CharacterTextSplitter(**text_splitter_kwargs) 39 | docs = text_splitter.create_documents([content]) 40 | doc_contents = [doc.page_content for doc in docs] 41 | 42 | vectorstore = FAISS.from_texts(doc_contents, embedding=OpenAIEmbeddings()) 43 | retriever = vectorstore.as_retriever() 44 | 45 | runnable = ( 46 | { 47 | "text": itemgetter("query") | retriever, 48 | "schema": itemgetter("schema"), 49 | "instructions": lambda x: x.get("instructions"), 50 | "examples": lambda x: x.get("examples"), 51 | "model_name": lambda x: x.get("model_name"), 52 | } 53 | | RunnableLambda(_make_extract_requests) 54 | | extraction_runnable.abatch 55 | ) 56 | schema = extractor.schema 57 | examples = get_examples_from_extractor(extractor) 58 | description = extractor.description # TODO: improve this 59 | result = await runnable.ainvoke( 60 | { 61 | "query": description, 62 | "schema": schema, 63 | "examples": examples, 64 | "instructions": extractor.instruction, 65 | "model_name": model_name, 66 | } 67 | 
) 68 | return deduplicate(result) 69 | -------------------------------------------------------------------------------- /backend/server/settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from sqlalchemy.engine import URL 6 | 7 | 8 | def get_postgres_url() -> URL: 9 | if "INSTANCE_UNIX_SOCKET" in os.environ: 10 | return URL.create( 11 | drivername="postgresql+psycopg2", 12 | username=os.environ.get("PG_USER", "langchain"), 13 | password=os.environ.get("PG_PASSWORD", "langchain"), 14 | database=os.environ.get("PG_DATABASE", "langchain"), 15 | query={ 16 | "host": os.environ["INSTANCE_UNIX_SOCKET"], 17 | }, 18 | ) 19 | 20 | url = URL.create( 21 | drivername="postgresql+psycopg2", 22 | username=os.environ.get("PG_USER", "langchain"), 23 | password=os.environ.get("PG_PASSWORD", "langchain"), 24 | host=os.environ.get("PG_HOST", "localhost"), 25 | database=os.environ.get("PG_DATABASE", "langchain"), 26 | port=5432, 27 | ) 28 | return url 29 | 30 | 31 | # Max concurrency used for extracting content from documents. 32 | # A long document is broken into smaller chunks; this controls 33 | # how many chunks are processed concurrently. 34 | MAX_CONCURRENCY = int(os.environ.get("MAX_CONCURRENCY", 1)) 35 | 36 | # Max number of chunks to process per document. 37 | # When a long document is split into chunks, this controls 38 | # how many of those chunks will be processed. 39 | # Set to 0 or negative to disable the max chunks limit. 40 | MAX_CHUNKS = int(os.environ.get("MAX_CHUNKS", -1)) 41 | -------------------------------------------------------------------------------- /backend/server/validators.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from fastapi import HTTPException 4 | from jsonschema import exceptions 5 | from jsonschema.validators import Draft202012Validator 6 | 7 | 8 | def validate_json_schema(schema: Dict[str, Any]) -> None: 9 | """Validate a JSON schema.""" 10 | try: 11 | Draft202012Validator.check_schema(schema) 12 | except exceptions.ValidationError as e: 13 | raise HTTPException( 14 | status_code=422, detail=f"Not a valid JSON schema: {e.message}" 15 | ) 16 | -------------------------------------------------------------------------------- /backend/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/__init__.py -------------------------------------------------------------------------------- /backend/tests/db.py: -------------------------------------------------------------------------------- 1 | """Utility code that sets up a test database and client for tests.""" 2 | from contextlib import asynccontextmanager 3 | from typing import Generator 4 | 5 | from httpx import AsyncClient 6 | from sqlalchemy import URL, create_engine 7 | from sqlalchemy.orm import sessionmaker 8 | 9 | from db.models import Base, get_session 10 | from server.main import app 11 | 12 | url = URL.create( 13 | drivername="postgresql", 14 | username="langchain", 15 | password="langchain", 16 | host="localhost", 17 | database="langchain_test", 18 | port=5432, 19 | ) 20 | engine = create_engine(url) 21 | TestingSession = sessionmaker(bind=engine) 22 | 23 | 24 | def override_get_session() -> Generator[TestingSession, None, None]: 25 | """Override the get_session dependency 
with a test session. 26 | 27 | This fixture also re-creates the database before each test and drops it after to 28 | ensure a clean slate for each test. 29 | """ 30 | try: 31 | session = TestingSession() 32 | yield session 33 | finally: 34 | session.close() 35 | 36 | 37 | app.dependency_overrides[get_session] = override_get_session 38 | 39 | 40 | @asynccontextmanager 41 | async def get_async_client() -> AsyncClient: 42 | """Get an async client.""" 43 | # Clear the database before each test 44 | Base.metadata.drop_all(engine) 45 | Base.metadata.create_all(engine) 46 | 47 | async_client = AsyncClient(app=app, base_url="http://test") 48 | try: 49 | yield async_client 50 | finally: 51 | await async_client.aclose() 52 | -------------------------------------------------------------------------------- /backend/tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/integration_tests/__init__.py -------------------------------------------------------------------------------- /backend/tests/integration_tests/test_extraction.py: -------------------------------------------------------------------------------- 1 | """Makes it easy to run an integration test using a real chat model.""" 2 | from contextlib import asynccontextmanager 3 | from typing import Optional 4 | 5 | import httpx 6 | from fastapi import FastAPI 7 | from httpx import AsyncClient 8 | from langchain_core.pydantic_v1 import BaseModel 9 | 10 | from server.main import app 11 | 12 | 13 | @asynccontextmanager 14 | async def get_async_test_client( 15 | server: FastAPI, *, path: Optional[str] = None, raise_app_exceptions: bool = True 16 | ) -> AsyncClient: 17 | """Get an async client.""" 18 | url = "http://localhost:9999/" 19 | if path: 20 | url += path 21 | transport = httpx.ASGITransport( 22 | app=server, 23 | raise_app_exceptions=raise_app_exceptions, 24 | ) 25 | async_client = AsyncClient(app=server, base_url=url, transport=transport) 26 | try: 27 | yield async_client 28 | finally: 29 | await async_client.aclose() 30 | 31 | 32 | async def test_extraction_api() -> None: 33 | """Test the extraction API endpoint.""" 34 | 35 | class Person(BaseModel): 36 | age: Optional[int] 37 | name: Optional[str] 38 | alias: Optional[str] 39 | 40 | async with get_async_test_client(app) as client: 41 | text = """ 42 | My name is Chester. I am young. I love cats. I have two cats. My age 43 | is the number of cats I have to the power of 5. (Approximately.) 44 | I also have a friend. His name is Neo. He is older than me. He is 45 | also a cat lover. He has 3 cats. He is 25 years old. 46 | """ 47 | result = await client.post( 48 | "/extract_text/invoke", 49 | json={"input": {"text": text, "schema": Person.schema()}}, 50 | ) 51 | assert result.status_code == 200, result.text 52 | response_data = result.json() 53 | assert isinstance(response_data["output"]["data"], list) 54 | 55 | # Test with instructions 56 | result = await client.post( 57 | "/extract_text/invoke", 58 | json={ 59 | "input": { 60 | "text": text, 61 | "schema": Person.schema(), 62 | "instructions": "Very important: Chester's alias is Neo.", 63 | } 64 | }, 65 | ) 66 | response_data = result.json() 67 | assert result.status_code == 200, result.text 68 | 69 | # Test with few shot examples 70 | examples = [ 71 | { 72 | "text": "My name is Grung. 
I am 100.", 73 | "output": [Person(age=100, name="######").dict()], 74 | }, 75 | ] 76 | result = await client.post( 77 | "/extract_text/invoke", 78 | json={ 79 | "input": { 80 | "text": text, 81 | "schema": Person.schema(), 82 | "instructions": "Redact all names using the characters `######`", 83 | "examples": examples, 84 | } 85 | }, 86 | ) 87 | assert result.status_code == 200, result.text 88 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/api/__init__.py -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_configuration.py: -------------------------------------------------------------------------------- 1 | from tests.db import get_async_client 2 | 3 | 4 | async def test_configuration_api() -> None: 5 | """Test the configuration API.""" 6 | async with get_async_client() as client: 7 | response = await client.get("/configuration") 8 | assert response.status_code == 200 9 | result = response.json() 10 | assert isinstance(result, dict) 11 | assert sorted(result) == [ 12 | "accepted_mimetypes", 13 | "available_models", 14 | "max_chunks", 15 | "max_concurrency", 16 | "max_file_size_mb", 17 | "models", 18 | ] 19 | models = result["available_models"] 20 | assert all(isinstance(model_name, str) for model_name in models) 21 | assert "gpt-3.5-turbo" in models 22 | assert len(models) >= 2 23 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_defining_extractors.py: -------------------------------------------------------------------------------- 1 | """Code to test API endpoints.""" 2 | import uuid 3 | 4 | from tests.db import get_async_client 5 | 6 | 7 | async def test_extractors_api() -> None: 8 | """This will test a few of the extractors API endpoints.""" 9 | # First verify that the database is empty 10 | async with get_async_client() as client: 11 | user_id = str(uuid.uuid4()) 12 | headers = {"x-key": user_id} 13 | response = await client.get("/extractors", headers=headers) 14 | assert response.status_code == 200 15 | assert response.json() == [] 16 | 17 | # Verify that we can create an extractor 18 | create_request = { 19 | "description": "Test Description", 20 | "schema": {"type": "object"}, 21 | "instruction": "Test Instruction", 22 | } 23 | response = await client.post( 24 | "/extractors", json=create_request, headers=headers 25 | ) 26 | assert response.status_code == 200 27 | 28 | # Verify that the extractor was created 29 | response = await client.get("/extractors", headers=headers) 30 | assert response.status_code == 200 31 | get_response = response.json() 32 | assert len(get_response) == 1 33 | 34 | # Check headers 35 | bad_headers = {"x-key": str(uuid.uuid4())} 36 | bad_response = await client.get("/extractors", headers=bad_headers) 37 | assert bad_response.status_code == 200 38 | assert len(bad_response.json()) == 0 39 | 40 | # 
Check we need cookie to delete 41 | uuid_str = get_response[0]["uuid"] 42 | _ = uuid.UUID(uuid_str) # assert valid uuid 43 | await client.delete(f"/extractors/{uuid_str}", headers=bad_headers) 44 | # Check extractor was not deleted 45 | response = await client.get("/extractors", headers=headers) 46 | assert len(response.json()) == 1 47 | 48 | # Verify that we can delete an extractor 49 | _ = uuid.UUID(uuid_str) # assert valid uuid 50 | response = await client.delete(f"/extractors/{uuid_str}", headers=headers) 51 | assert response.status_code == 200 52 | 53 | get_response = await client.get("/extractors", headers=headers) 54 | assert get_response.status_code == 200 55 | assert get_response.json() == [] 56 | 57 | # Verify that we can create an extractor 58 | create_request = { 59 | "description": "Test Description", 60 | "schema": {"type": "object"}, 61 | "instruction": "Test Instruction", 62 | } 63 | response = await client.post( 64 | "/extractors", json=create_request, headers=headers 65 | ) 66 | assert response.status_code == 200 67 | 68 | # Verify that the extractor was created 69 | response = await client.get("/extractors", headers=headers) 70 | assert response.status_code == 200 71 | assert len(response.json()) == 1 72 | 73 | # Verify that we can delete an extractor 74 | get_response = response.json() 75 | uuid_str = get_response[0]["uuid"] 76 | _ = uuid.UUID(uuid_str) # assert valid uuid 77 | response = await client.delete(f"/extractors/{uuid_str}", headers=headers) 78 | assert response.status_code == 200 79 | 80 | get_response = await client.get("/extractors", headers=headers) 81 | assert get_response.status_code == 200 82 | assert get_response.json() == [] 83 | 84 | # Verify that we can create an extractor, including other properties 85 | user_id = str(uuid.uuid4()) 86 | create_request = { 87 | "name": "my extractor", 88 | "description": "Test Description", 89 | "schema": {"type": "object"}, 90 | "instruction": "Test Instruction", 91 | } 92 | response = await client.post( 93 | "/extractors", json=create_request, headers=headers 94 | ) 95 | extractor_uuid = response.json()["uuid"] 96 | assert response.status_code == 200 97 | response = await client.get(f"/extractors/{extractor_uuid}", headers=headers) 98 | response_data = response.json() 99 | assert extractor_uuid == response_data["uuid"] 100 | assert "my extractor" == response_data["name"] 101 | assert "user_id" not in response_data 102 | 103 | 104 | async def test_sharing_extractor() -> None: 105 | """Test sharing an extractor.""" 106 | async with get_async_client() as client: 107 | user_id = str(uuid.uuid4()) 108 | headers = {"x-key": user_id} 109 | response = await client.get("/extractors", headers=headers) 110 | assert response.status_code == 200 111 | assert response.json() == [] 112 | # Verify that we can create an extractor 113 | create_request = { 114 | "name": "Test Name", 115 | "description": "Test Description", 116 | "schema": {"type": "object"}, 117 | "instruction": "Test Instruction", 118 | } 119 | response = await client.post( 120 | "/extractors", json=create_request, headers=headers 121 | ) 122 | assert response.status_code == 200 123 | 124 | uuid_str = response.json()["uuid"] 125 | 126 | # Generate a share uuid 127 | response = await client.post(f"/extractors/{uuid_str}/share", headers=headers) 128 | assert response.status_code == 200 129 | assert "share_uuid" in response.json() 130 | share_uuid = response.json()["share_uuid"] 131 | 132 | # Test idempotency 133 | response = await 
client.post(f"/extractors/{uuid_str}/share", headers=headers) 134 | assert response.status_code == 200 135 | assert "share_uuid" in response.json() 136 | assert response.json()["share_uuid"] == share_uuid 137 | 138 | # Check headers 139 | bad_headers = {"x-key": str(uuid.uuid4())} 140 | response = await client.post( 141 | f"/extractors/{uuid_str}/share", headers=bad_headers 142 | ) 143 | assert response.status_code == 404 144 | 145 | # Check that we can retrieve the shared extractor 146 | response = await client.get(f"/shared/extractors/{share_uuid}") 147 | assert response.status_code == 200 148 | keys = sorted(response.json()) 149 | assert keys == ["description", "instruction", "name", "schema"] 150 | 151 | assert response.json() == { 152 | "description": "Test Description", 153 | "instruction": "Test Instruction", 154 | "name": "Test Name", 155 | "schema": {"type": "object"}, 156 | } 157 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_examples.py: -------------------------------------------------------------------------------- 1 | """Code to test API endpoints.""" 2 | import uuid 3 | 4 | from tests.db import get_async_client 5 | 6 | 7 | async def _list_extractors() -> list: 8 | async with get_async_client() as client: 9 | response = await client.get("/extractors") 10 | assert response.status_code == 200 11 | return response.json() 12 | 13 | 14 | async def test_examples_api() -> None: 15 | """Runs through a set of API calls to test the examples API.""" 16 | async with get_async_client() as client: 17 | # First create an extractor 18 | user_id = str(uuid.uuid4()) 19 | headers = {"x-key": user_id} 20 | create_request = { 21 | "description": "Test Description", 22 | "name": "Test Name", 23 | "schema": {"type": "object"}, 24 | "instruction": "Test Instruction", 25 | } 26 | response = await client.post( 27 | "/extractors", json=create_request, headers=headers 28 | ) 29 | assert response.status_code == 200 30 | # Get the extractor id 31 | extractor_id = response.json()["uuid"] 32 | 33 | # Let's verify that there are no examples 34 | response = await client.get( 35 | "/examples?extractor_id=" + extractor_id, headers=headers 36 | ) 37 | assert response.status_code == 200 38 | assert response.json() == [] 39 | 40 | # Now let's create an example 41 | create_request = { 42 | "extractor_id": extractor_id, 43 | "content": "Test Content", 44 | "output": [ 45 | { 46 | "age": 100, 47 | "name": "Grung", 48 | } 49 | ], 50 | } 51 | response = await client.post("/examples", json=create_request, headers=headers) 52 | assert response.status_code == 200 53 | example_id = response.json()["uuid"] 54 | 55 | # Check headers 56 | bad_headers = {"x-key": str(uuid.uuid4())} 57 | response = await client.post( 58 | "/examples", json=create_request, headers=bad_headers 59 | ) 60 | assert response.status_code == 404 61 | 62 | # Verify that the example was created 63 | response = await client.get( 64 | "/examples?extractor_id=" + extractor_id, headers=headers 65 | ) 66 | assert response.status_code == 200 67 | assert len(response.json()) == 1 68 | 69 | keys = ["content", "extractor_id", "output", "uuid"] 70 | projected_response = { 71 | key: record[key] for key in keys for record in response.json() 72 | } 73 | assert projected_response == { 74 | "content": "Test Content", 75 | "extractor_id": extractor_id, 76 | "output": [ 77 | { 78 | "age": 100, 79 | "name": "Grung", 80 | } 81 | ], 82 | "uuid": example_id, 83 | } 84 | 85 | # Check headers 86 | 
response = await client.get( 87 | "/examples?extractor_id=" + extractor_id, headers=bad_headers 88 | ) 89 | assert response.status_code == 404 90 | 91 | # Check we need cookie to delete 92 | response = await client.delete(f"/examples/{example_id}", headers=bad_headers) 93 | assert response.status_code == 404 94 | 95 | # Verify that we can delete an example 96 | response = await client.delete(f"/examples/{example_id}", headers=headers) 97 | assert response.status_code == 200 98 | 99 | # Verify that the example was deleted 100 | response = await client.get( 101 | "/examples?extractor_id=" + extractor_id, headers=headers 102 | ) 103 | assert response.status_code == 200 104 | assert response.json() == [] 105 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_extract.py: -------------------------------------------------------------------------------- 1 | """Code to test API endpoints.""" 2 | import tempfile 3 | from unittest.mock import patch 4 | from uuid import UUID, uuid4 5 | 6 | from langchain.text_splitter import CharacterTextSplitter 7 | from langchain_community.embeddings import FakeEmbeddings 8 | from langchain_core.runnables import RunnableLambda 9 | 10 | from tests.db import get_async_client 11 | 12 | 13 | def mock_extraction_runnable(*args, **kwargs): 14 | """Mock the extraction_runnable function.""" 15 | extract_request = args[0] 16 | return { 17 | "data": [ 18 | extract_request.text[:10], 19 | ] 20 | } 21 | 22 | 23 | def mock_text_splitter(*args, **kwargs): 24 | return CharacterTextSplitter() 25 | 26 | 27 | def mock_embeddings(*args, **kwargs): 28 | return FakeEmbeddings(size=10) 29 | 30 | 31 | @patch( 32 | "server.extraction_runnable.extraction_runnable", 33 | new=RunnableLambda(mock_extraction_runnable), 34 | ) 35 | @patch( 36 | "server.retrieval.extraction_runnable", 37 | new=RunnableLambda(mock_extraction_runnable), 38 | ) 39 | @patch("server.extraction_runnable.TokenTextSplitter", mock_text_splitter) 40 | @patch("server.retrieval.OpenAIEmbeddings", mock_embeddings) 41 | async def test_extract_from_file() -> None: 42 | """Test extract from file API.""" 43 | async with get_async_client() as client: 44 | user_id = str(uuid4()) 45 | headers = {"x-key": user_id} 46 | # Test with invalid extractor 47 | extractor_id = UUID(int=1027) # 1027 is a good number. 48 | response = await client.post( 49 | "/extract", 50 | data={ 51 | "extractor_id": str(extractor_id), 52 | "text": "Test Content", 53 | }, 54 | headers=headers, 55 | ) 56 | assert response.status_code == 404, response.text 57 | 58 | # First create an extractor 59 | create_request = { 60 | "name": "Test Name", 61 | "description": "Test Description", 62 | "schema": {"type": "object"}, 63 | "instruction": "Test Instruction", 64 | } 65 | response = await client.post( 66 | "/extractors", 67 | json=create_request, 68 | headers=headers, 69 | ) 70 | assert response.status_code == 200, response.text 71 | # Get the extractor id 72 | extractor_id = response.json()["uuid"] 73 | 74 | # Run an extraction. 75 | # We'll use multi-form data here. 
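# (Form fields rather than a JSON body, because the same request may also carry an optional file upload as multipart form data; see backend/server/api/extract.py.)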
76 | response = await client.post( 77 | "/extract", 78 | data={ 79 | "extractor_id": extractor_id, 80 | "text": "Test Content", 81 | "mode": "entire_document", 82 | }, 83 | headers=headers, 84 | ) 85 | assert response.status_code == 200 86 | assert response.json() == { 87 | "data": ["Test Conte"], 88 | "content_too_long": False, 89 | } 90 | 91 | # Vary chat model 92 | response = await client.post( 93 | "/extract", 94 | data={ 95 | "extractor_id": extractor_id, 96 | "text": "Test Content", 97 | "mode": "entire_document", 98 | "model_name": "gpt-3.5-turbo", 99 | }, 100 | headers=headers, 101 | ) 102 | assert response.status_code == 200 103 | assert response.json() == { 104 | "data": ["Test Conte"], 105 | "content_too_long": False, 106 | } 107 | 108 | # Test retrieval 109 | response = await client.post( 110 | "/extract", 111 | data={ 112 | "extractor_id": extractor_id, 113 | "text": "Test Content", 114 | "mode": "retrieval", 115 | }, 116 | headers=headers, 117 | ) 118 | assert response.status_code == 200 119 | assert response.json() == { 120 | "data": ["Test Conte"], 121 | } 122 | 123 | # We'll use multi-form data here. 124 | # Create a named temporary file 125 | with tempfile.NamedTemporaryFile(mode="w+t", delete=True) as f: 126 | f.write("This is a named temporary file.") 127 | f.seek(0) 128 | f.flush() 129 | response = await client.post( 130 | "/extract", 131 | data={ 132 | "extractor_id": extractor_id, 133 | "mode": "entire_document", 134 | }, 135 | files={"file": f}, 136 | headers=headers, 137 | ) 138 | 139 | assert response.status_code == 200, response.text 140 | assert response.json() == {"data": ["This is a "], "content_too_long": False} 141 | 142 | 143 | @patch( 144 | "server.extraction_runnable.extraction_runnable", 145 | new=RunnableLambda(mock_extraction_runnable), 146 | ) 147 | @patch("server.extraction_runnable.TokenTextSplitter", mock_text_splitter) 148 | async def test_extract_from_large_file() -> None: 149 | user_id = str(uuid4()) 150 | headers = {"x-key": user_id} 151 | async with get_async_client() as client: 152 | # First create an extractor 153 | create_request = { 154 | "name": "Test Name", 155 | "description": "Test Description", 156 | "schema": {"type": "object"}, 157 | "instruction": "Test Instruction", 158 | } 159 | response = await client.post( 160 | "/extractors", json=create_request, headers=headers 161 | ) 162 | assert response.status_code == 200, response.text 163 | # Get the extractor id 164 | extractor_id = response.json()["uuid"] 165 | 166 | # Test file size constraint 167 | with tempfile.NamedTemporaryFile(mode="w+t", delete=True) as f: 168 | f.write("This is a named temporary file.") 169 | f.seek(0) 170 | f.flush() 171 | with patch("extraction.parsing._get_file_size_in_mb", return_value=20): 172 | response = await client.post( 173 | "/extract", 174 | data={ 175 | "extractor_id": extractor_id, 176 | "mode": "entire_document", 177 | }, 178 | files={"file": f}, 179 | headers=headers, 180 | ) 181 | assert response.status_code == 413 182 | 183 | # Test chunk count constraint 184 | with tempfile.NamedTemporaryFile(mode="w+t", delete=True) as f: 185 | f.write("This is a named temporary file.") 186 | f.seek(0) 187 | f.flush() 188 | with patch("server.extraction_runnable.settings.MAX_CHUNKS", 1): 189 | with patch.object( 190 | CharacterTextSplitter, "split_text", return_value=["a", "b"] 191 | ): 192 | response = await client.post( 193 | "/extract", 194 | data={ 195 | "extractor_id": extractor_id, 196 | "mode": "entire_document", 197 | }, 198 | files={"file": f}, 199 
| headers=headers, 200 | ) 201 | assert response.status_code == 200 202 | assert response.json() == { 203 | "data": ["a"], 204 | "content_too_long": True, 205 | } 206 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OPENAI_API_KEY"] = "placeholder" 4 | os.environ["FIREWORKS_API_KEY"] = "placeholder" 5 | os.environ["TOGETHER_API_KEY"] = "placeholder" 6 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fake/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fake/__init__.py -------------------------------------------------------------------------------- /backend/tests/unit_tests/fake/chat_model.py: -------------------------------------------------------------------------------- 1 | """Fake Chat Model wrapper for testing purposes.""" 2 | from typing import Any, Iterator, List, Optional 3 | 4 | from langchain_core.callbacks.manager import ( 5 | CallbackManagerForLLMRun, 6 | ) 7 | from langchain_core.language_models.chat_models import BaseChatModel 8 | from langchain_core.messages import ( 9 | AIMessage, 10 | BaseMessage, 11 | ) 12 | from langchain_core.outputs import ChatGeneration, ChatResult 13 | 14 | 15 | class GenericFakeChatModel(BaseChatModel): 16 | """A generic fake chat model that can be used to test the chat model interface.""" 17 | 18 | messages: Iterator[AIMessage] 19 | """Get an iterator over messages. 20 | 21 | This can be expanded to accept other types like Callables / dicts / strings 22 | to make the interface more generic if needed. 23 | 24 | Note: if you want to pass a list, you can use `iter` to convert it to an iterator. 25 | 26 | Please note that streaming is not implemented yet. We should try to implement it 27 | in the future by delegating to invoke and then breaking the resulting output 28 | into message chunks. 
29 | """ 30 | 31 | def _generate( 32 | self, 33 | messages: List[BaseMessage], 34 | stop: Optional[List[str]] = None, 35 | run_manager: Optional[CallbackManagerForLLMRun] = None, 36 | **kwargs: Any, 37 | ) -> ChatResult: 38 | """Top Level call""" 39 | message = next(self.messages) 40 | generation = ChatGeneration(message=message) 41 | return ChatResult(generations=[generation]) 42 | 43 | @property 44 | def _llm_type(self) -> str: 45 | return "generic-fake-chat-model" 46 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fake/test_fake_chat_model.py: -------------------------------------------------------------------------------- 1 | """Tests for verifying that testing utility code works as expected.""" 2 | from itertools import cycle 3 | 4 | from langchain_core.messages import AIMessage 5 | 6 | from tests.unit_tests.fake.chat_model import GenericFakeChatModel 7 | 8 | 9 | class AnyStr(str): 10 | def __init__(self) -> None: 11 | super().__init__() 12 | 13 | def __eq__(self, other: object) -> bool: 14 | return isinstance(other, str) 15 | 16 | 17 | def test_generic_fake_chat_model_invoke() -> None: 18 | # Will alternate between responding with hello and goodbye 19 | infinite_cycle = cycle([AIMessage(content="hello"), AIMessage(content="goodbye")]) 20 | model = GenericFakeChatModel(messages=infinite_cycle) 21 | response = model.invoke("meow") 22 | assert response == AIMessage(content="hello", id=AnyStr()) 23 | response = model.invoke("kitty") 24 | assert response == AIMessage(content="goodbye", id=AnyStr()) 25 | response = model.invoke("meow") 26 | assert response == AIMessage(content="hello", id=AnyStr()) 27 | 28 | 29 | async def test_generic_fake_chat_model_ainvoke() -> None: 30 | # Will alternate between responding with hello and goodbye 31 | infinite_cycle = cycle([AIMessage(content="hello"), AIMessage(content="goodbye")]) 32 | model = GenericFakeChatModel(messages=infinite_cycle) 33 | response = await model.ainvoke("meow") 34 | assert response == AIMessage(content="hello", id=AnyStr()) 35 | response = await model.ainvoke("kitty") 36 | assert response == AIMessage(content="goodbye", id=AnyStr()) 37 | response = await model.ainvoke("meow") 38 | assert response == AIMessage(content="hello", id=AnyStr()) 39 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List 3 | 4 | HERE = Path(__file__).parent 5 | 6 | # PUBLIC API 7 | 8 | 9 | def get_sample_paths() -> List[Path]: 10 | """List all fixtures.""" 11 | return list(HERE.glob("sample.*")) 12 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.docx -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.epub 
-------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.html: -------------------------------------------------------------------------------- 1 |

🦜️ LangChain

Underline

Bold

Italics

Col 1

Col 2

Row 1

1

2

Row 2

3

4

Link: https://www.langchain.com/

  • Item 1
  • Item 2
  • Item 3
  • We also love cats 🐱

Image

-------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.odt -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.pdf -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.txt: -------------------------------------------------------------------------------- 1 | 🦜️ LangChain 2 | 3 | 4 | 5 | 6 | Underline 7 | 8 | 9 | Bold 10 | 11 | 12 | Italics 13 | 14 | 15 | 16 | 17 | 18 | 19 | Col 1 20 | Col 2 21 | Row 1 22 | 1 23 | 2 24 | Row 2 25 | 3 26 | 4 27 | 28 | 29 | 30 | 31 | Link: https://www.langchain.com/ 32 | 33 | 34 | 35 | 36 | * Item 1 37 | * Item 2 38 | * Item 3 39 | * We also love cats 🐱 40 | 41 | 42 | Image -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_deduplication.py: -------------------------------------------------------------------------------- 1 | from server.extraction_runnable import ExtractResponse, deduplicate 2 | 3 | 4 | async def test_deduplication_different_results() -> None: 5 | """Test deduplication of extraction results.""" 6 | result = deduplicate( 7 | [ 8 | {"data": [{"name": "Chester", "age": 42}]}, 9 | {"data": [{"name": "Jane", "age": 42}]}, 10 | ] 11 | ) 12 | expected = ExtractResponse( 13 | data=[ 14 | {"name": "Chester", "age": 42}, 15 | {"name": "Jane", "age": 42}, 16 | ] 17 | ) 18 | assert expected == result 19 | 20 | result = deduplicate( 21 | [ 22 | { 23 | "data": [ 24 | {"field_1": 1, "field_2": "a"}, 25 | {"field_1": 2, "field_2": "b"}, 26 | ] 27 | }, 28 | { 29 | "data": [ 30 | {"field_1": 1, "field_2": "a"}, 31 | {"field_1": 2, "field_2": "c"}, 32 | ] 33 | }, 34 | ] 35 | ) 36 | 37 | expected = ExtractResponse( 38 | data=[ 39 | {"field_1": 1, "field_2": "a"}, 40 | {"field_1": 2, "field_2": "b"}, 41 | {"field_1": 2, "field_2": "c"}, 42 | ] 43 | ) 44 | assert expected == result 45 | 46 | # Test with data being a list of strings 47 | result = deduplicate([{"data": ["1", "2"]}, {"data": ["1", "3"]}]) 48 | expected = ExtractResponse(data=["1", "2", "3"]) 49 | assert expected == result 50 | 51 | # Test with data being a mix of integer and string 52 | result = deduplicate([{"data": [1, "2"]}, {"data": ["1", "3"]}]) 53 | expected = ExtractResponse(data=[1, "2", "1", "3"]) 54 | assert expected == result 55 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_parsing.py: -------------------------------------------------------------------------------- 1 | """Test parsing logic.""" 2 | import mimetypes 3 | 4 | from langchain.document_loaders import Blob 5 | 6 | from extraction.parsing import ( 7 | MIMETYPE_BASED_PARSER, 8 | SUPPORTED_MIMETYPES, 9 | ) 10 | from tests.unit_tests.fixtures import get_sample_paths 11 | 12 | 13 | def test_list_of_accepted_mimetypes() -> None: 14 | """This list should generally grow! 
Protecting against typos in mimetypes.""" 15 | assert SUPPORTED_MIMETYPES == [ 16 | # Two MS Word mimetypes are disabled for now 17 | # Need to install unstructured to enable them 18 | # "application/msword", 19 | "application/pdf", 20 | # "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 21 | "text/html", 22 | "text/plain", 23 | ] 24 | 25 | 26 | def test_attempt_to_parse_each_fixture() -> None: 27 | """Attempt to parse supported fixtures.""" 28 | seen_mimetypes = set() 29 | for path in get_sample_paths(): 30 | type_, _ = mimetypes.guess_type(path) 31 | if type_ not in SUPPORTED_MIMETYPES: 32 | continue 33 | seen_mimetypes.add(type_) 34 | blob = Blob.from_path(path) 35 | documents = MIMETYPE_BASED_PARSER.parse(blob) 36 | try: 37 | assert len(documents) == 1 38 | doc = documents[0] 39 | assert "source" in doc.metadata 40 | assert doc.metadata["source"] == str(path) 41 | assert "🦜" in doc.page_content 42 | except Exception as e: 43 | raise AssertionError(f"Failed to parse {path}") from e 44 | 45 | known_missing = {"application/msword"} 46 | assert set(SUPPORTED_MIMETYPES) - known_missing == seen_mimetypes 47 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_upload.py: -------------------------------------------------------------------------------- 1 | from extraction.parsing import _guess_mimetype 2 | from tests.unit_tests.fixtures import get_sample_paths 3 | 4 | 5 | async def test_mimetype_guessing() -> None: 6 | """Verify mimetype guessing for all fixtures.""" 7 | name_to_mime = {} 8 | for file in sorted(get_sample_paths()): 9 | data = file.read_bytes() 10 | name_to_mime[file.name] = _guess_mimetype(data) 11 | 12 | assert { 13 | "sample.docx": ( 14 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 15 | ), 16 | "sample.epub": "application/epub+zip", 17 | "sample.html": "text/html", 18 | "sample.odt": "application/vnd.oasis.opendocument.text", 19 | "sample.pdf": "application/pdf", 20 | "sample.rtf": "text/rtf", 21 | "sample.txt": "text/plain", 22 | } == name_to_mime 23 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from langchain.pydantic_v1 import BaseModel, Field 2 | from langchain_core.messages import AIMessage 3 | 4 | from extraction.utils import update_json_schema 5 | from server.extraction_runnable import ExtractionExample, _make_prompt_template 6 | 7 | 8 | def test_update_json_schema() -> None: 9 | """Test updating JSON schema.""" 10 | 11 | class Person(BaseModel): 12 | name: str = Field(..., description="The name of the person.") 13 | age: int = Field(..., description="The age of the person.") 14 | 15 | schema = Person.schema() 16 | 17 | assert schema == { 18 | "properties": { 19 | "age": { 20 | "description": "The age of the person.", 21 | "title": "Age", 22 | "type": "integer", 23 | }, 24 | "name": { 25 | "description": "The name of the person.", 26 | "title": "Name", 27 | "type": "string", 28 | }, 29 | }, 30 | "required": ["name", "age"], 31 | "title": "Person", 32 | "type": "object", 33 | } 34 | 35 | updated_schema = update_json_schema(schema) 36 | assert updated_schema == { 37 | "type": "object", 38 | "properties": { 39 | "data": { 40 | "type": "array", 41 | "items": { 42 | "title": "Person", 43 | "type": "object", 44 | "properties": { 45 | "name": { 46 | "title": "Name", 47 | "description": "The name of 
the person.", 48 | "type": "string", 49 | }, 50 | "age": { 51 | "title": "Age", 52 | "description": "The age of the person.", 53 | "type": "integer", 54 | }, 55 | }, 56 | "required": ["name", "age"], 57 | }, 58 | } 59 | }, 60 | "required": ["data"], 61 | "title": "extractor", 62 | "description": "Extract information matching the given schema.", 63 | } 64 | 65 | 66 | def test_make_prompt_template() -> None: 67 | """Test making a system message from instructions and examples.""" 68 | instructions = "Test instructions." 69 | examples = [ 70 | ExtractionExample( 71 | text="Test text.", 72 | output=[ 73 | {"name": "Test Name", "age": 0}, 74 | {"name": "Test Name 2", "age": 1}, 75 | ], 76 | ) 77 | ] 78 | prefix = ( 79 | "You are a top-tier algorithm for extracting information from text. " 80 | "Only extract information that is relevant to the provided text. " 81 | "If no information is relevant, use the schema and output " 82 | "an empty list where appropriate." 83 | ) 84 | prompt = _make_prompt_template(instructions, examples, "name") 85 | messages = prompt.messages 86 | assert 5 == len(messages) 87 | system = messages[0].prompt.template 88 | assert system.startswith(prefix) 89 | assert system.endswith(instructions) 90 | 91 | example_input = messages[1] 92 | assert example_input.content == "Test text." 93 | example_output = messages[2] 94 | assert isinstance(example_output, AIMessage) 95 | assert example_output.tool_calls 96 | assert len(example_output.tool_calls) == 1 97 | assert example_output.tool_calls[0]["name"] == "name" 98 | 99 | prompt = _make_prompt_template(instructions, None, "name") 100 | assert 2 == len(prompt.messages) 101 | 102 | prompt = _make_prompt_template(None, examples, "name") 103 | assert 5 == len(prompt.messages) 104 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_validators.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from server.validators import validate_json_schema 4 | 5 | 6 | def test_validate_json_schema() -> None: 7 | """Test validate_json_schema.""" 8 | # TODO: Validate more extensively to make sure that it actually validates 9 | # the schema as expected. 
10 | with pytest.raises(Exception): 11 | validate_json_schema({"type": "meow"}) 12 | 13 | with pytest.raises(Exception): 14 | validate_json_schema({"type": "str"}) 15 | 16 | validate_json_schema({"type": "string"}) 17 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | from typing import Optional 3 | 4 | import httpx 5 | from fastapi import FastAPI 6 | from httpx import AsyncClient 7 | 8 | 9 | @asynccontextmanager 10 | async def get_async_test_client( 11 | server: FastAPI, *, path: Optional[str] = None, raise_app_exceptions: bool = True 12 | ) -> AsyncClient: 13 | """Get an async client.""" 14 | url = "http://localhost:9999/" 15 | if path: 16 | url += path 17 | transport = httpx.ASGITransport( 18 | app=server, 19 | raise_app_exceptions=raise_app_exceptions, 20 | ) 21 | async_client = AsyncClient(app=server, base_url=url, transport=transport) 22 | try: 23 | yield async_client 24 | finally: 25 | await async_client.aclose() 26 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | name: langchain-extract 2 | 3 | services: 4 | postgres: 5 | # Careful if bumping postgres version. 6 | # Make sure to keep in sync with CI 7 | # version if being tested on CI. 8 | image: postgres:16 9 | expose: 10 | - "5432" 11 | ports: 12 | - "5432:5432" 13 | environment: 14 | POSTGRES_DB: langchain 15 | POSTGRES_USER: langchain 16 | POSTGRES_PASSWORD: langchain 17 | healthcheck: 18 | test: ["CMD-SHELL", "pg_isready -U langchain -d langchain -W langchain"] 19 | interval: 10s 20 | timeout: 5s 21 | retries: 5 22 | volumes: 23 | - postgres_data:/var/lib/postgresql/data 24 | 25 | backend: 26 | build: 27 | context: . 28 | dockerfile: ./backend/Dockerfile 29 | target: development 30 | env_file: 31 | - .local.env 32 | environment: 33 | - PG_HOST=postgres 34 | # Define CORS origins for dev work on UI 35 | - CORS_ORIGINS=http://localhost:3000 36 | ports: 37 | - "8000:8000" # Backend is accessible on localhost:8000 38 | depends_on: 39 | - postgres 40 | volumes: 41 | - ./backend:/backend 42 | 43 | frontend: 44 | build: 45 | context: ./frontend 46 | dockerfile: ./Dockerfile 47 | target: development 48 | ports: 49 | - "3000:3000" 50 | environment: 51 | - NODE_ENV=development 52 | volumes: 53 | - ./frontend:/app 54 | - /app/node_modules 55 | depends_on: 56 | - backend 57 | 58 | volumes: 59 | postgres_data: 60 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | import pathlib 20 | import sys 21 | from typing import List 22 | 23 | import toml 24 | 25 | ROOT_FOLDER = str(pathlib.Path(__file__).parent.parent.parent) 26 | 27 | # Add the project root to the path 28 | sys.path.insert(0, ROOT_FOLDER) 29 | 30 | with open("../../pyproject.toml") as f: 31 | data = toml.load(f) 32 | 33 | project = "LangChain Extract" 34 | copyright = "2024, Langchain AI" 35 | author = "Langchain AI" 36 | 37 | version = data["tool"]["poetry"]["version"] 38 | release = version 39 | 40 | html_title = project + " " + version 41 | 42 | 43 | # -- General configuration --------------------------------------------------- 44 | 45 | # Add any Sphinx extension module names here, as strings. They can be 46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 47 | # ones. 48 | extensions = [ 49 | "sphinx.ext.autodoc", 50 | "sphinx.ext.autodoc.typehints", 51 | "sphinx.ext.autosummary", 52 | "sphinx.ext.napoleon", 53 | "sphinx.ext.viewcode", 54 | "myst_nb", 55 | "sphinx_copybutton", 56 | "IPython.sphinxext.ipython_console_highlighting", 57 | ] 58 | source_suffix = [".ipynb", ".html", ".md", ".rst"] 59 | 60 | # Add any paths that contain templates here, relative to this directory. 
61 | templates_path = ["_templates"] 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path. 66 | exclude_patterns: List[str] = [] 67 | 68 | 69 | # -- Options for HTML output ------------------------------------------------- 70 | 71 | # The theme to use for HTML and HTML Help pages. See the documentation for 72 | # a list of builtin themes. 73 | # 74 | html_theme = "sphinx_book_theme" 75 | 76 | html_theme_options = { 77 | "path_to_docs": "docs/source", 78 | "repository_url": "https://github.com/langchain-ai/langchain-extract", 79 | "home_page_in_toc": True, 80 | "show_navbar_depth": 2, 81 | "use_sidenotes": True, 82 | "use_repository_button": True, 83 | "use_issues_button": True, 84 | "use_source_button": True, 85 | "use_fullscreen_button": True, 86 | "repository_branch": "main", 87 | "launch_buttons": { 88 | "notebook_interface": "jupyterlab", 89 | "colab_url": "https://colab.research.google.com", 90 | }, 91 | } 92 | 93 | html_context = { 94 | "display_github": True, # Integrate GitHub 95 | "github_user": "langchain-ai", # Username 96 | "github_repo": "langchain-extract", # Repo name 97 | "github_version": "main", # Version 98 | "conf_py_path": "/docs/", # Path in the checkout to the docs root 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # These paths are either relative to html_static_path 107 | # or fully qualified paths (eg. https://...) 108 | html_css_files = [ 109 | "css/custom.css", 110 | ] 111 | 112 | nb_execution_mode = "off" 113 | autosummary_generate = True -------------------------------------------------------------------------------- /docs/source/notebooks/earnings_call_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1549a9c6-cca7-4028-9f0c-80ee3aa1d4b4", 6 | "metadata": {}, 7 | "source": [ 8 | "# Example: extracting structured data from earnings call transcripts\n", 9 | "\n", 10 | "Most public companies host earnings calls, providing their management opportunities to discuss past financial results and future plans. Natural language transcripts of these calls may contain useful information, but often this information must first be extracted from the document and arranged into a structured form so that it can be analyzed or compared across time periods and other companies.\n", 11 | "\n", 12 | "Here we demonstrate the use of an LLM-powered extraction service to extract information from Uber's Q4 2023 earnings call.
We show the importance of incorporating few-shot learning for accurate extraction in a real-world context.\n", 13 | "\n", 14 | "Uber investor relations makes the prepared remarks for the call available [online](https://s23.q4cdn.com/407969754/files/doc_earnings/2023/q4/transcript/Uber-Q4-23-Prepared-Remarks.pdf).\n", 15 | "\n", 16 | "First we start our local extraction service, as described in the [README](../../../README.md), and download the PDF document:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "id": "589ea131-e6ae-4605-8c8f-3ccb0f643477", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import requests\n", 27 | "\n", 28 | "url = \"http://localhost:8000\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "d7d935fe-4642-4c55-bba4-6dfb8191e4bd", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Uber transcripts from earnings calls and other events at https://investor.uber.com/news-events/default.aspx\n", 39 | "\n", 40 | "pdf_url = \"https://s23.q4cdn.com/407969754/files/doc_earnings/2023/q4/transcript/Uber-Q4-23-Prepared-Remarks.pdf\"" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "c704a00e-f663-4bce-b482-984278dad8f1", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Get PDF bytes\n", 51 | "\n", 52 | "pdf_response = requests.get(pdf_url)\n", 53 | "assert(pdf_response.status_code == 200)\n", 54 | "pdf_bytes = pdf_response.content" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "0a10cf8b-f05e-424b-9a72-fb4abfb9e091", 60 | "metadata": {}, 61 | "source": [ 62 | "We next specify the schema of what we intend to extract. Here we specify a record of financial data. We allow the LLM to infer various attributes, such as the time period for the record.\n", 63 | "\n", 64 | "Note that we include an `evidence` attribute, which provides context for the predictions and supports downstream verification of the results.\n", 65 | "\n", 66 | "Once we've defined our schema, we create an extractor by posting it to our database."
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "id": "3d5a5bb0-4284-4706-98e6-e622bcc3778d", 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "" 79 | ] 80 | }, 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "from uuid import uuid4\n", 88 | "\n", 89 | "from pydantic import BaseModel, Field\n", 90 | "\n", 91 | "class FinancialData(BaseModel):\n", 92 | " name: str = Field(..., description=\"Name of the financial figure, such as revenue.\")\n", 93 | " value: int = Field(..., description=\"Nominal earnings in local currency.\")\n", 94 | " scale: str = Field(..., description=\"Scale of figure, such as MM, B, or percent.\")\n", 95 | " period_start: str = Field(..., description=\"The start of the time period in ISO format.\")\n", 96 | " period_duration: int = Field(..., description=\"Duration of period, in months\")\n", 97 | " evidence: str = Field(..., description=\"Verbatim sentence of text where figure was found.\")\n", 98 | "\n", 99 | "user_id = str(uuid4())\n", 100 | "headers = {\"x-key\": user_id}\n", 101 | "\n", 102 | "data = {\n", 103 | " \"user_id\": user_id,\n", 104 | " \"description\": \"Financial revenues and other figures.\",\n", 105 | " \"schema\": FinancialData.schema(),\n", 106 | " \"instruction\": (\n", 107 | " \"Extract standard financial figures, specifically earnings and \"\n", 108 | " \"revenue figures.\"\n", 109 | " )\n", 110 | "}\n", 111 | "\n", 112 | "response = requests.post(f\"{url}/extractors\", json=data, headers=headers)\n", 113 | "response" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "id": "74b7f3a4-07c1-4cf0-8c75-34d22eb5a661", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "{'uuid': '151db8c9-ec49-4c6c-a13d-b5335ede8cbb'}\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "extractor = response.json()\n", 132 | "print(extractor)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "fc18f21c-9f73-4f9e-b63d-7d0a198208c9", 138 | "metadata": {}, 139 | "source": [ 140 | "We can now try the extractor on our PDF:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "id": "15a1c7e9-3fcd-42ca-88fb-4802fe841a8d", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "result = requests.post(\n", 162 | " f\"{url}/extract\",\n", 163 | " data={\"extractor_id\": extractor[\"uuid\"]},\n", 164 | " files={\"file\": pdf_bytes},\n", 165 | " headers=headers,\n", 166 | ")\n", 167 | "\n", 168 | "result" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 7, 174 | "id": "894ad738-5f25-4791-a2a6-365d39b583b4", 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "{'data': [{'name': 'Adjusted EBITDA',\n", 181 | " 'scale': 'million',\n", 182 | " 'value': 1300,\n", 183 | " 'evidence': 'Q4 was a standout quarter to cap off a standout year... 
translated to $1.3 billion in Adjusted EBITDA',\n", 184 | " 'period_start': '2023-10-01',\n", 185 | " 'period_duration': 3},\n", 186 | " {'name': 'GAAP operating income',\n", 187 | " 'scale': 'million',\n", 188 | " 'value': 652,\n", 189 | " 'evidence': 'translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income',\n", 190 | " 'period_start': '2023-10-01',\n", 191 | " 'period_duration': 3},\n", 192 | " {'name': 'Gross Bookings',\n", 193 | " 'scale': 'billion',\n", 194 | " 'value': 37.6,\n", 195 | " 'evidence': 'Gross Bookings of $37.6 billion',\n", 196 | " 'period_start': '2023-10-01',\n", 197 | " 'period_duration': 3},\n", 198 | " {'name': 'Revenue',\n", 199 | " 'scale': 'billion',\n", 200 | " 'value': 9.9,\n", 201 | " 'evidence': 'we grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion',\n", 202 | " 'period_start': '2023-10-01',\n", 203 | " 'period_duration': 3},\n", 204 | " {'name': 'Adjusted EBITDA',\n", 205 | " 'scale': '$',\n", 206 | " 'value': 1260000000,\n", 207 | " 'evidence': 'We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n", 208 | " 'period_start': '2023-01-01',\n", 209 | " 'period_duration': 12}]}" 210 | ] 211 | }, 212 | "execution_count": 7, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "result.json()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "cdcf8c31-bbb4-40df-b94e-dcda32b975aa", 224 | "metadata": {}, 225 | "source": [ 226 | "We've extracted several records capturing various earnings and revenue figures, and have conformed the records to the desired schema.\n", 227 | "\n", 228 | "We can convey additional instructions to the LLM efficiently via few-shot examples. For example, we can specify how the names of financial metrics should be normalized, or how scales (millions, billions, percentages, etc.) should be represented in different cases.\n", 229 | "\n", 230 | "The `examples` endpoint lets us associate few-shot examples with an extractor. 
We can specify examples by pairing text inputs with lists of `FinancialData` outputs:" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 8, 236 | "id": "b07d78e9-23d5-49e8-ae6c-e3d4aaca2d4d", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "examples = [\n", 241 | " {\n", 242 | " \"text\": \"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", 243 | " \"output\": [\n", 244 | " FinancialData(\n", 245 | " name=\"revenue\",\n", 246 | " value=1,\n", 247 | " scale=\"MM\",\n", 248 | " period_start=\"2022-01-01\",\n", 249 | " period_duration=12,\n", 250 | " evidence=\"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", 251 | " ).dict(),\n", 252 | " FinancialData(\n", 253 | " name=\"ebit\",\n", 254 | " value=2,\n", 255 | " scale=\"MM\",\n", 256 | " period_start=\"2022-01-01\",\n", 257 | " period_duration=12,\n", 258 | " evidence=\"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", 259 | " ).dict()\n", 260 | " ],\n", 261 | " },\n", 262 | "]\n", 263 | "\n", 264 | "responses = []\n", 265 | "for example in examples:\n", 266 | " create_request = {\n", 267 | " \"extractor_id\": extractor[\"uuid\"],\n", 268 | " \"content\": example[\"text\"],\n", 269 | " \"output\": example['output'],\n", 270 | " }\n", 271 | " response = requests.post(f\"{url}/examples\", json=create_request, headers=headers)\n", 272 | " responses.append(response)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "id": "271a90c0-d320-4e35-9317-1ea08c5dde15", 278 | "metadata": {}, 279 | "source": [ 280 | "Having posted the examples, we can re-run the extraction:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 9, 286 | "id": "efc8041e-e3ca-4705-8d34-7f9b93b1400c", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "" 293 | ] 294 | }, 295 | "execution_count": 9, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "result = requests.post(\n", 302 | " f\"{url}/extract\",\n", 303 | " data={\"extractor_id\": extractor[\"uuid\"]},\n", 304 | " files={\"file\": pdf_bytes},\n", 305 | " headers=headers,\n", 306 | ")\n", 307 | "\n", 308 | "result" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 10, 314 | "id": "2101f960-6abd-4fef-9945-bafb449d5435", 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "{'data': [{'name': 'adjusted ebitda',\n", 321 | " 'scale': 'MM',\n", 322 | " 'value': 1300,\n", 323 | " 'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n", 324 | " 'period_start': '2023-10-01',\n", 325 | " 'period_duration': 3},\n", 326 | " {'name': 'revenue',\n", 327 | " 'scale': 'MM',\n", 328 | " 'value': 9900,\n", 329 | " 'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n", 330 | " 'period_start': '2023-10-01',\n", 331 | " 'period_duration': 3},\n", 332 | " {'name': 'gaap operating income',\n", 333 | " 'scale': 'MM',\n", 334 | " 'value': 652,\n", 335 | " 'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n", 336 | " 'period_start': '2023-10-01',\n", 337 | " 'period_duration': 3},\n", 338 | " {'name': 'adjusted ebitda',\n", 339 | " 'scale': 'B',\n", 340 | " 'value': 1260,\n", 341 | " 'evidence': 'We expect 
Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n", 342 | " 'period_start': '2023-01-01',\n", 343 | " 'period_duration': 12}]}" 344 | ] 345 | }, 346 | "execution_count": 10, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "result.json()" 353 | ] 354 | } 355 | ], 356 | "metadata": { 357 | "kernelspec": { 358 | "display_name": "Python 3 (ipykernel)", 359 | "language": "python", 360 | "name": "python3" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.10.4" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 5 377 | } 378 | -------------------------------------------------------------------------------- /docs/source/notebooks/quick_start.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7e43ed67-9fbb-4d6c-9a5d-8c4addeb2ed5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Client Example" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "b123c960-a0b4-4d5e-b15f-729de23974f5", 15 | "metadata": { 16 | "tags": [] 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from langserve import RemoteRunnable" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "19dafdeb-63c5-4218-b0f9-fc20754369be", 27 | "metadata": { 28 | "tags": [] 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from typing import Optional, List\n", 33 | "from pydantic import BaseModel, Field\n", 34 | "\n", 35 | "class Person(BaseModel):\n", 36 | " age: Optional[int] = Field(None, description=\"The age of the person in years.\")\n", 37 | " name: Optional[str] = Field(None, description=\"The name of the person.\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "id": "bf79ef88-b816-46aa-addf-9366b7ebdcaf", 44 | "metadata": { 45 | "tags": [] 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "runnable = RemoteRunnable(\"http://localhost:8000/extract_text/\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "5f102a5c-a80c-4480-863b-30f3aaad5afe", 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "text = \"\"\"\n", 62 | "My name is Chester. I am 42 years old. 
My friend Jane is a year older than me.\n", 63 | "\"\"\"" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "id": "553d7dbc-9117-4834-83b1-11e28a513170", 70 | "metadata": { 71 | "tags": [] 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "{'data': [{'name': 'Chester', 'age': 42}]}" 78 | ] 79 | }, 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "response = runnable.invoke({\"text\": text, \"schema\": Person.schema()})\n", 87 | "response" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "id": "c70d8d7c-5f0b-4757-92b7-cdd40f351275", 93 | "metadata": {}, 94 | "source": [ 95 | "Add instructions:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "id": "97294409-6daf-418d-9cbe-f44946245e35", 102 | "metadata": { 103 | "tags": [] 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "{'data': [{'name': 'Chester', 'age': 42}]}" 110 | ] 111 | }, 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "instructions = \"Redact all names using the characters `######`\"\n", 119 | "\n", 120 | "response = runnable.invoke(\n", 121 | " {\n", 122 | " \"text\": text,\n", 123 | " \"schema\": Person.schema(),\n", 124 | " \"instructions\": instructions,\n", 125 | " }\n", 126 | ")\n", 127 | "response" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "24b4a123-7841-465b-b43b-1db439c45fa7", 133 | "metadata": {}, 134 | "source": [ 135 | "Add few-shot examples:" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "id": "bae9416d-abd4-4b41-90c2-3144c8566483", 142 | "metadata": { 143 | "tags": [] 144 | }, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "{'data': [{'name': '######', 'age': 42}, {'name': 'Jane', 'age': 43}]}" 150 | ] 151 | }, 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "instructions = \"Redact all names using the characters `######`\"\n", 159 | "examples = [\n", 160 | " {\n", 161 | " \"text\": \"My name is Grung. 
I am 100.\",\n", 162 | " \"output\": [ {\"age\": 100, \"name\": \"######\", \"hello\": \"meow\"}] ,\n", 163 | " }\n", 164 | "]\n", 165 | "\n", 166 | "response = runnable.invoke(\n", 167 | " {\n", 168 | " \"text\": text,\n", 169 | " \"schema\": Person.schema(),\n", 170 | " \"instructions\": instructions,\n", 171 | " \"examples\": examples,\n", 172 | " }\n", 173 | ")\n", 174 | "response" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "83244e1a-7d4a-489d-b88c-b4e35ac76001", 180 | "metadata": {}, 181 | "source": [ 182 | "## Persist extractors" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "id": "7fc60d58-edfd-4d2b-a71f-fc5e9c6ab58b", 189 | "metadata": { 190 | "tags": [] 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "import requests\n", 195 | "from uuid import uuid4" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "id": "b1124672-ee4b-484a-be07-16687bb229e3", 202 | "metadata": { 203 | "tags": [] 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "url = \"http://localhost:8000\"\n", 208 | "user_id = str(uuid4()) # indicates owner for extractor\n", 209 | "headers = {\"x-key\": user_id}" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 10, 215 | "id": "8a491b3e-999f-4f88-87f0-a282d582ef18", 216 | "metadata": { 217 | "tags": [] 218 | }, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "" 224 | ] 225 | }, 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "data = {\n", 233 | " \"name\": \"people_extractor\",\n", 234 | " \"description\": \"Extract references to people, having properties name and age.\",\n", 235 | " \"schema\": Person.schema(),\n", 236 | " \"instruction\": \"Redact all names using the characters `######`\",\n", 237 | "}\n", 238 | "\n", 239 | "response = requests.post(f\"{url}/extractors\", json=data, headers=headers)\n", 240 | "response" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "id": "1cf96c62-5653-4955-87ad-48ca009252d0", 247 | "metadata": { 248 | "tags": [] 249 | }, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "fa60ccce-5637-41b4-ba1a-085a56d0fa5b\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "uuid = response.json()['uuid']\n", 261 | "print(uuid)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "c7ddd7ea-31b6-49ad-a89a-8d5d7efa5f22", 267 | "metadata": {}, 268 | "source": [ 269 | "### Add examples" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "id": "d17f0283-a517-497f-9838-ade72f2e6359", 276 | "metadata": { 277 | "tags": [] 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "import json\n", 282 | "\n", 283 | "examples = [\n", 284 | " {\n", 285 | " \"text\": \"My name is Grung. 
I am 100.\",\n", 286 | " \"output\": [Person(age=100, name=\"######\").dict()],\n", 287 | " }\n", 288 | "]\n", 289 | "\n", 290 | "responses = []\n", 291 | "for example in examples:\n", 292 | " create_request = {\n", 293 | " \"extractor_id\": uuid,\n", 294 | " \"content\": example[\"text\"],\n", 295 | " \"output\": example['output'],\n", 296 | " }\n", 297 | " response = requests.post(f\"{url}/examples\", json=create_request, headers=headers)\n", 298 | " responses.append(response)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 13, 304 | "id": "eb0b90d9-090a-4b84-9e31-e6c9f92de6ea", 305 | "metadata": { 306 | "tags": [] 307 | }, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "" 313 | ] 314 | }, 315 | "execution_count": 13, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "response = requests.get(f\"{url}/examples?extractor_id={uuid}\", headers=headers)\n", 322 | "response" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 14, 328 | "id": "7b82eb80-bc44-455d-a8df-28616f94885d", 329 | "metadata": { 330 | "tags": [] 331 | }, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "[{'extractor_id': 'fa60ccce-5637-41b4-ba1a-085a56d0fa5b',\n", 337 | " 'content': 'My name is Grung. I am 100.',\n", 338 | " 'created_at': '2024-03-22T12:10:32.862261',\n", 339 | " 'updated_at': '2024-03-22T12:10:32.862265',\n", 340 | " 'output': [{'age': 100, 'name': '######'}],\n", 341 | " 'uuid': '94c8a41a-7d33-4795-a6be-23e72fe6c4e4'}]" 342 | ] 343 | }, 344 | "execution_count": 14, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "response.json()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "id": "1dcf03bf-0b71-4c1f-a88a-bce090762d2c", 356 | "metadata": {}, 357 | "source": [ 358 | "### Extract using persisted extractor" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 15, 364 | "id": "d5f84d0c-3d06-4d76-b9e5-c68e659ef930", 365 | "metadata": { 366 | "tags": [] 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "{'extractor_id': 'fa60ccce-5637-41b4-ba1a-085a56d0fa5b', 'text': '\\nMy name is Chester. I am 42 years old. 
My friend Jane is a year older than me.\\n'}\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "request_data = {\"extractor_id\": uuid, \"text\": text}\n", 379 | "print(request_data)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "id": "d2bc2481-0dca-42aa-b3a7-d193721e149e", 386 | "metadata": { 387 | "tags": [] 388 | }, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "" 394 | ] 395 | }, 396 | "execution_count": 16, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "response = requests.post(f\"{url}/extract\", data=request_data, headers=headers)\n", 403 | "response" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 17, 409 | "id": "f389da36-5b43-41f5-b0a6-a389c6303937", 410 | "metadata": { 411 | "tags": [] 412 | }, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "'{\"data\":[{\"name\":\"######\",\"age\":42},{\"name\":\"######\",\"age\":43}]}'" 418 | ] 419 | }, 420 | "execution_count": 17, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "response.text" 427 | ] 428 | } 429 | ], 430 | "metadata": { 431 | "kernelspec": { 432 | "display_name": "Python 3 (ipykernel)", 433 | "language": "python", 434 | "name": "python3" 435 | }, 436 | "language_info": { 437 | "codemirror_mode": { 438 | "name": "ipython", 439 | "version": 3 440 | }, 441 | "file_extension": ".py", 442 | "mimetype": "text/x-python", 443 | "name": "python", 444 | "nbconvert_exporter": "python", 445 | "pygments_lexer": "ipython3", 446 | "version": "3.10.4" 447 | } 448 | }, 449 | "nbformat": 4, 450 | "nbformat_minor": 5 451 | } 452 | -------------------------------------------------------------------------------- /docs/source/toc.segment: -------------------------------------------------------------------------------- 1 | ```{toctree} 2 | :maxdepth: 2 3 | :caption: Introduction 4 | 5 | ./notebooks/getting_started 6 | ``` 7 | -------------------------------------------------------------------------------- /frontend/.env.example: -------------------------------------------------------------------------------- 1 | # Only set for non development builds. 
2 | # Development builds default to `http://localhost:8000` 3 | NEXT_PUBLIC_BASE_API_URL=https://example.com -------------------------------------------------------------------------------- /frontend/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": ["react", "@typescript-eslint", "eslint-plugin-import"], 3 | "env": { 4 | "es2021": true, 5 | "commonjs": true, 6 | "es6": true, 7 | "node": true, 8 | "browser": true 9 | }, 10 | "globals": { 11 | "window": true, 12 | "process": true 13 | }, 14 | "extends": [ 15 | "plugin:react/recommended", 16 | "plugin:react/jsx-runtime", 17 | "plugin:@typescript-eslint/eslint-recommended", 18 | "plugin:@typescript-eslint/recommended", 19 | "plugin:@typescript-eslint/recommended" 20 | ], 21 | "parserOptions": { 22 | "ecmaFeatures": { 23 | "jsx": true 24 | }, 25 | "ecmaVersion": 12, 26 | "sourceType": "module" 27 | }, 28 | "rules": { 29 | "import/no-extraneous-dependencies": "error", 30 | "no-underscore-dangle": [ 31 | "error", 32 | { 33 | "allow": ["__typename"] 34 | } 35 | ], 36 | "react/display-name": "off", 37 | "react/prop-types": "off", 38 | "@typescript-eslint/class-methods-use-this": [ 39 | "error", 40 | { 41 | "ignoreOverrideMethods": true 42 | } 43 | ], 44 | "@typescript-eslint/consistent-type-assertions": [ 45 | "error", 46 | { 47 | "assertionStyle": "never" 48 | } 49 | ], 50 | "@typescript-eslint/default-param-last": "error", 51 | "@typescript-eslint/no-empty-function": "error", 52 | "@typescript-eslint/no-explicit-any": "error", 53 | "@typescript-eslint/no-shadow": "error", 54 | "@typescript-eslint/no-unused-vars": [ 55 | "error", 56 | { 57 | "argsIgnorePattern": "^_", 58 | "ignoreRestSiblings": true 59 | } 60 | ], 61 | "@typescript-eslint/no-useless-constructor": "error", 62 | "camelcase": "off", 63 | "class-methods-use-this": "off", 64 | "default-case": "off", 65 | "default-param-last": "off", 66 | "import/extensions": "off", 67 | "import/prefer-default-export": "off", 68 | "import/order": "error", 69 | "linebreak-style": ["error", "unix"], 70 | "max-len": [ 71 | "warn", 72 | { 73 | "code": 80, 74 | "tabWidth": 2, 75 | "comments": 80, 76 | "ignoreComments": false, 77 | "ignoreTrailingComments": true, 78 | "ignoreUrls": true, 79 | "ignoreStrings": true, 80 | "ignoreTemplateLiterals": true, 81 | "ignoreRegExpLiterals": true 82 | } 83 | ], 84 | "no-console": ["warn", { "allow": ["warn", "error", "debug"] }], 85 | "no-empty-function": "off", 86 | "no-plusplus": [ 87 | "error", 88 | { 89 | "allowForLoopAfterthoughts": true 90 | } 91 | ] 92 | }, 93 | "settings": { 94 | "import/resolver": { 95 | "typescript": {} 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | /.next/ 13 | /out/ 14 | 15 | # production 16 | /build 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | 27 | # local env files 28 | .env*.local 29 | 30 | # vercel 31 | .vercel 32 | 33 | # typescript 34 | *.tsbuildinfo 35 | next-env.d.ts 36 | 37 | .yarn/ 38 | 39 | # lint 40 | .eslintcache -------------------------------------------------------------------------------- /frontend/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "endOfLine": "lf" 3 | } 4 | -------------------------------------------------------------------------------- /frontend/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | -------------------------------------------------------------------------------- /frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:18-alpine AS base 2 | 3 | FROM base AS base-deps 4 | WORKDIR /app 5 | 6 | COPY --link ./yarn.lock ./package.json ./.yarnrc.yml ./ 7 | 8 | FROM base AS installer 9 | WORKDIR /app 10 | 11 | COPY --link --from=base-deps /app/package.json ./package.json 12 | COPY --link --from=base-deps /app/yarn.lock ./yarn.lock 13 | COPY --link .yarnrc.yml . 14 | RUN yarn install 15 | 16 | FROM base AS builder 17 | WORKDIR /app 18 | 19 | COPY --link --from=installer /app . 20 | COPY --link tsconfig.json tsconfig.json 21 | RUN yarn build 22 | 23 | FROM base AS development 24 | WORKDIR /app 25 | 26 | COPY --link --from=installer /app . 27 | 28 | ENV NODE_ENV=development 29 | 30 | CMD ["yarn", "dev"] 31 | -------------------------------------------------------------------------------- /frontend/app/components/CreateExtractor.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { 4 | AbsoluteCenter, 5 | Accordion, 6 | AccordionButton, 7 | AccordionIcon, 8 | AccordionItem, 9 | AccordionPanel, 10 | Badge, 11 | Box, 12 | Button, 13 | Card, 14 | CardBody, 15 | CircularProgress, 16 | Divider, 17 | FormControl, 18 | Heading, 19 | Icon, 20 | IconButton, 21 | Input, 22 | Text, 23 | } from "@chakra-ui/react"; 24 | import { json } from "@codemirror/lang-json"; 25 | import Form from "@rjsf/chakra-ui"; 26 | import validator from "@rjsf/validator-ajv8"; 27 | import CodeMirror from "@uiw/react-codemirror"; 28 | import Ajv from "ajv"; 29 | import addFormats from "ajv-formats"; 30 | import { useRouter } from "next/navigation"; 31 | import React from "react"; 32 | 33 | import { ChatBubbleBottomCenterTextIcon } from "@heroicons/react/24/outline"; 34 | import { useMutation } from "@tanstack/react-query"; 35 | import { suggestExtractor, useCreateExtractor } from "../utils/api"; 36 | 37 | const ajv = new Ajv(); 38 | // Adds support for parsing format types like "date-time" 39 | // and "email" in JSON Schema. 40 | // A lot of the JSON Schema generated by LLMS will 41 | // be generated with these formats out of the box. 
42 | addFormats(ajv); 43 | 44 | /** 45 | * Component to create a new extractor with fields 46 | * for name, description, schema, and examples 47 | */ 48 | const CreateExtractor = ({}) => { 49 | const startSchema = "{}"; 50 | // You might use a mutation hook here if you're 51 | // using something like React Query for state management 52 | const [schema, setSchema] = React.useState(startSchema); 53 | const [creatable, setCreatable] = React.useState(false); 54 | const [lastValidSchema, setLastValidSchema] = React.useState( 55 | JSON.parse(startSchema), 56 | ); 57 | const [currentSchemaValid, setCurrentSchemaValid] = React.useState(true); 58 | const [userInput, setUserInput] = React.useState(""); 59 | 60 | const suggestMutation = useMutation({ 61 | mutationFn: suggestExtractor, 62 | onSuccess: (data) => { 63 | let prettySchema = data.json_schema; 64 | 65 | try { 66 | prettySchema = JSON.stringify(JSON.parse(data.json_schema), null, 2); 67 | } catch (e) {} 68 | 69 | setSchema(prettySchema); 70 | }, 71 | }); 72 | 73 | const { push } = useRouter(); 74 | const { mutate } = useCreateExtractor({ 75 | onSuccess: (data) => { 76 | push(`/e/${data.uuid}`); 77 | }, 78 | }); 79 | 80 | React.useMemo(() => { 81 | try { 82 | const parsedSchema = JSON.parse(schema); 83 | ajv.compile(parsedSchema); 84 | setCurrentSchemaValid(true); 85 | setLastValidSchema(parsedSchema); 86 | // OK to create if schema is parseable and not empty 87 | // and contains an object at the top level 88 | setCreatable(parsedSchema.type === "object"); 89 | } catch (e) { 90 | setCurrentSchemaValid(false); 91 | setCreatable(false); 92 | } 93 | }, [schema]); 94 | 95 | const handleSubmit = (event: React.FormEvent) => { 96 | event.preventDefault(); 97 | const instruction = ""; 98 | const objectSchema = JSON.parse(schema); 99 | // Extract information from schema like name, and description 100 | const name = objectSchema.title || "Unnamed"; 101 | const description = objectSchema.description || ""; 102 | // backend uses varchar(100) for description 103 | const shortDescription = 104 | description.length > 100 105 | ? description.substring(0, 95) + "..." 106 | : description; 107 | 108 | mutate({ 109 | name, 110 | description: shortDescription, 111 | schema: objectSchema, 112 | instruction, 113 | }); 114 | }; 115 | 116 | const handleSuggest = (event: React.FormEvent) => { 117 | event.preventDefault(); 118 | const description = event.currentTarget.userInput.value; 119 | if (description === "") { 120 | return; 121 | } 122 | suggestMutation.mutate({ description, jsonSchema: schema }); 123 | setUserInput(""); 124 | }; 125 | 126 | return ( 127 |
128 | 129 | What would you like to extract today? 130 | 131 |
132 | 133 | setUserInput(event.target.value)} 141 | /> 142 | 143 | {suggestMutation.isPending ? ( 144 | 145 | ) : ( 146 | } 149 | aria-label="OK" 150 | colorScheme={userInput === "" ? "gray" : "blue"} 151 | disabled={userInput === ""} 152 | /> 153 | )} 154 | 155 |
159 | 160 | 161 | 162 | OR 163 | 164 | 165 | 166 | 167 | 168 | Edit JSON Schema 169 |
170 | {currentSchemaValid ? ( 171 | OK 172 | ) : ( 173 | Errors! 174 | )} 175 | 176 |
177 |
178 | 179 | 180 | setSchema(value)} 185 | basicSetup={{ autocompletion: true }} 186 | extensions={[json()]} 187 | minHeight="300px" 188 | className="border-4 border-slate-300 border-double" 189 | /> 190 | 191 | 192 |
193 |
194 | {Object.keys(lastValidSchema).length !== 0 && ( 195 | <> 196 | Preview 197 | {!currentSchemaValid && ( 198 | 199 | JSON Schema has errors. Showing previous valid JSON Schema. 200 | 201 | )} 202 | 203 | 204 | 209 | {true} {/* Disables the submit button */} 210 | 211 | 212 |
213 | 214 | )} 215 | 218 | 219 |
220 | ); 221 | }; 222 | 223 | export default CreateExtractor; 224 | -------------------------------------------------------------------------------- /frontend/app/components/Extractor.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { 4 | Tab, 5 | TabList, 6 | TabPanel, 7 | TabPanels, 8 | Tabs, 9 | Text, 10 | } from "@chakra-ui/react"; 11 | import Form from "@rjsf/chakra-ui"; 12 | import validator from "@rjsf/validator-ajv8"; 13 | import { docco } from "react-syntax-highlighter/dist/esm/styles/hljs"; 14 | 15 | import SyntaxHighlighter from "react-syntax-highlighter"; 16 | import { useGetExtractor } from "../utils/api"; 17 | 18 | type ExtractorProps = { 19 | extractorId: string; 20 | isShared: boolean; 21 | }; 22 | 23 | export const Extractor = ({ extractorId, isShared }: ExtractorProps) => { 24 | const { data, isLoading, isError } = useGetExtractor(extractorId, isShared); 25 | if (isLoading) { 26 | return
<div>Loading...</div>; 27 | } 28 | if (isError) { 29 | return <div>Unable to load extractor with ID: {extractorId}</div>; 30 | } 31 | 32 | if (data === undefined) { 33 | throw new Error("Data is undefined"); 34 | } 35 | 36 | return ( 37 |
38 | 39 | 40 | Form 41 | Code 42 | 43 | 44 | 45 |
46 | {true} {/* Disables the submit button */} 47 |
48 |
49 | 50 | 51 | This shows the raw JSON Schema that describes what information the 52 | extractor will be extracting from the content. 53 | 54 | 55 | {JSON.stringify(data.schema, null, 2)} 56 | 57 | 58 |
59 |
60 |
61 | ); 62 | }; 63 | -------------------------------------------------------------------------------- /frontend/app/components/Playground.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | import { 3 | Button, 4 | Heading, 5 | Tab, 6 | Box, 7 | Divider, 8 | AbsoluteCenter, 9 | TabList, 10 | TabPanel, 11 | TabPanels, 12 | Tabs, 13 | Text, 14 | Textarea, 15 | FormControl, 16 | FormLabel, 17 | HStack, 18 | Radio, 19 | RadioGroup, 20 | } from "@chakra-ui/react"; 21 | import { useMutation } from "@tanstack/react-query"; 22 | import React from "react"; 23 | import SyntaxHighlighter from "react-syntax-highlighter"; 24 | import { docco } from "react-syntax-highlighter/dist/esm/styles/hljs"; 25 | import { runExtraction, useConfiguration } from "../utils/api"; 26 | import { Extractor } from "./Extractor"; 27 | import { ResultsTable } from "./ResultsTable"; 28 | 29 | interface PlaygroundProps { 30 | /** 31 | * The playground currently supports viewing 32 | * both shared and non-shared extractors 33 | */ 34 | extractorId: string; 35 | isShared: boolean; 36 | } 37 | 38 | /** 39 | * Playground to work with an existing extractor. 40 | */ 41 | export const Playground = (props: PlaygroundProps) => { 42 | const { extractorId, isShared } = props; 43 | const { data, isPending, mutate } = useMutation({ 44 | mutationFn: runExtraction, 45 | }); 46 | 47 | const requestServerConfig = useConfiguration(); 48 | const [isDisabled, setIsDisabled] = React.useState(true); 49 | 50 | const handleSubmit = (event: React.FormEvent) => { 51 | event.preventDefault(); 52 | 53 | const request = { 54 | extractor_id: extractorId, 55 | model_name: event.currentTarget.modelId.value, 56 | }; 57 | 58 | if (event.currentTarget.text.value) { 59 | Object.assign(request, { text: event.currentTarget.text.value }); 60 | } else { 61 | Object.assign(request, { file: event.currentTarget.file.files[0] }); 62 | } 63 | 64 | mutate([request, isShared]); 65 | }; 66 | 67 | const handleKeyDown = (event: React.KeyboardEvent) => { 68 | if (event.key === "Enter" && !event.shiftKey) { 69 | event.preventDefault(); // Prevent the default Enter action 70 | if (isDisabled) { 71 | return; 72 | } 73 | 74 | event.currentTarget.form?.dispatchEvent( 75 | new Event("submit", { cancelable: true, bubbles: true }), 76 | ); 77 | } 78 | }; 79 | 80 | const handleChange = (event: React.FormEvent) => { 81 | if ( 82 | event.currentTarget.text.value === "" && 83 | event.currentTarget.file.files.length === 0 84 | ) { 85 | setIsDisabled(true); 86 | return; 87 | } 88 | // Also disable if both are present 89 | if ( 90 | event.currentTarget.text.value !== "" && 91 | event.currentTarget.file.files.length !== 0 92 | ) { 93 | setIsDisabled(true); 94 | return; 95 | } 96 | 97 | setIsDisabled(false); 98 | }; 99 | 100 | return ( 101 |
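{/* Added illustrative note, not part of the original component: handleSubmit above invokes the
    runExtraction mutation with either { extractor_id, model_name, text } or
    { extractor_id, model_name, file } (plus the isShared flag), depending on whether text was
    typed or a file was attached. */}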
102 |
103 | {isShared && Using a shared extractor}
105 | 106 |
107 | Extract 108 | 109 |
114 | {requestServerConfig.isFetched && ( 115 | 116 | Extraction Model 117 | 121 | 122 | {requestServerConfig.data?.models.map((model) => ( 123 | 124 | {model.description} 125 | 126 | ))} 127 | 128 | 129 | 130 | )} 131 | {requestServerConfig.isFetched && ( 132 | <> 133 | 140 | 141 | Max file size is: {requestServerConfig.data?.max_file_size_mb}MB 142 | 143 | 144 | Supported mimetypes:{" "} 145 | {requestServerConfig.data?.accepted_mimetypes.join(", ")} 146 | 147 | 148 | )} 149 | 150 | 151 | 152 | OR 153 | 154 | 155 |