├── .github ├── actions │ └── poetry_setup │ │ └── action.yml └── workflows │ ├── _lint.yml │ ├── _test.yml │ ├── ci.yml │ ├── fe_ci.yml │ └── fe_lint_format.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── backend ├── Dockerfile ├── Makefile ├── README.md ├── db │ ├── __init__.py │ └── models.py ├── extraction │ ├── __init__.py │ ├── parsing.py │ └── utils.py ├── poetry.lock ├── pyproject.toml ├── scripts │ ├── __init__.py │ ├── local_entry_point.sh │ ├── prod_entry_point.sh │ └── run_migrations.py ├── server │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── api_key.py │ │ ├── configurables.py │ │ ├── examples.py │ │ ├── extract.py │ │ ├── extractors.py │ │ ├── shared.py │ │ └── suggest.py │ ├── extraction_runnable.py │ ├── main.py │ ├── models.py │ ├── retrieval.py │ ├── settings.py │ └── validators.py └── tests │ ├── __init__.py │ ├── db.py │ ├── integration_tests │ ├── __init__.py │ └── test_extraction.py │ └── unit_tests │ ├── __init__.py │ ├── api │ ├── __init__.py │ ├── test_api_configuration.py │ ├── test_api_defining_extractors.py │ ├── test_api_examples.py │ └── test_api_extract.py │ ├── conftest.py │ ├── fake │ ├── __init__.py │ ├── chat_model.py │ └── test_fake_chat_model.py │ ├── fixtures │ ├── __init__.py │ ├── sample.docx │ ├── sample.epub │ ├── sample.html │ ├── sample.odt │ ├── sample.pdf │ ├── sample.rtf │ └── sample.txt │ ├── test_deduplication.py │ ├── test_parsing.py │ ├── test_upload.py │ ├── test_utils.py │ ├── test_validators.py │ └── utils.py ├── docker-compose.yml ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── notebooks │ ├── earnings_call_example.ipynb │ └── quick_start.ipynb │ └── toc.segment └── frontend ├── .env.example ├── .eslintrc.json ├── .gitignore ├── .prettierrc ├── .yarnrc.yml ├── Dockerfile ├── app ├── components │ ├── CreateExtractor.tsx │ ├── Extractor.tsx │ ├── Playground.tsx │ ├── ResultsTable.tsx │ ├── ShareModal.tsx │ └── Sidebar.tsx ├── e │ └── [extractorId] │ │ └── page.tsx ├── globals.css ├── layout.tsx ├── new │ └── page.tsx ├── page.tsx ├── providers.tsx ├── s │ └── [sharedExtractorId] │ │ └── page.tsx └── utils │ ├── api.tsx │ └── api_url.ts ├── next.config.js ├── package.json ├── postcss.config.js ├── public ├── favicon.ico └── images │ └── github-mark.svg ├── tailwind.config.ts ├── tsconfig.json └── yarn.lock /.github/actions/poetry_setup/action.yml: -------------------------------------------------------------------------------- 1 | # An action for setting up poetry install with caching. 2 | # Using a custom action since the default action does not 3 | # take poetry install groups into account. 4 | # Action code from: 5 | # https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 6 | name: poetry-install-with-caching 7 | description: Poetry install with support for caching of dependency groups. 
8 | 9 | inputs: 10 | python-version: 11 | description: Python version, supporting MAJOR.MINOR only 12 | required: true 13 | 14 | poetry-version: 15 | description: Poetry version 16 | required: true 17 | 18 | cache-key: 19 | description: Cache key to use for manual handling of caching 20 | required: true 21 | 22 | working-directory: 23 | description: Directory whose poetry.lock file should be cached 24 | required: true 25 | 26 | runs: 27 | using: composite 28 | steps: 29 | - uses: actions/setup-python@v4 30 | name: Setup python ${{ inputs.python-version }} 31 | with: 32 | python-version: ${{ inputs.python-version }} 33 | 34 | - uses: actions/cache@v3 35 | id: cache-bin-poetry 36 | name: Cache Poetry binary - Python ${{ inputs.python-version }} 37 | env: 38 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "1" 39 | with: 40 | path: | 41 | /opt/pipx/venvs/poetry 42 | # This step caches the poetry installation, so make sure it's keyed on the poetry version as well. 43 | key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }} 44 | 45 | - name: Refresh shell hashtable and fixup softlinks 46 | if: steps.cache-bin-poetry.outputs.cache-hit == 'true' 47 | shell: bash 48 | env: 49 | POETRY_VERSION: ${{ inputs.poetry-version }} 50 | PYTHON_VERSION: ${{ inputs.python-version }} 51 | run: | 52 | set -eux 53 | 54 | # Refresh the shell hashtable, to ensure correct `which` output. 55 | hash -r 56 | 57 | # `actions/cache@v3` doesn't always seem able to correctly unpack softlinks. 58 | # Delete and recreate the softlinks pipx expects to have. 59 | rm /opt/pipx/venvs/poetry/bin/python 60 | cd /opt/pipx/venvs/poetry/bin 61 | ln -s "$(which "python$PYTHON_VERSION")" python 62 | chmod +x python 63 | cd /opt/pipx_bin/ 64 | ln -s /opt/pipx/venvs/poetry/bin/poetry poetry 65 | chmod +x poetry 66 | 67 | # Ensure everything got set up correctly. 68 | /opt/pipx/venvs/poetry/bin/python --version 69 | /opt/pipx_bin/poetry --version 70 | 71 | - name: Install poetry 72 | if: steps.cache-bin-poetry.outputs.cache-hit != 'true' 73 | shell: bash 74 | env: 75 | POETRY_VERSION: ${{ inputs.poetry-version }} 76 | PYTHON_VERSION: ${{ inputs.python-version }} 77 | run: pipx install "poetry==$POETRY_VERSION" --python "python$PYTHON_VERSION" --verbose 78 | 79 | - name: Restore pip and poetry cached dependencies 80 | uses: actions/cache@v3 81 | env: 82 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "4" 83 | WORKDIR: ${{ inputs.working-directory == '' && '.' || inputs.working-directory }} 84 | with: 85 | path: | 86 | ~/.cache/pip 87 | ~/.cache/pypoetry/virtualenvs 88 | ~/.cache/pypoetry/cache 89 | ~/.cache/pypoetry/artifacts 90 | ${{ env.WORKDIR }}/.venv 91 | key: py-deps-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ inputs.cache-key }}-${{ hashFiles(format('{0}/**/poetry.lock', env.WORKDIR)) }} 92 | -------------------------------------------------------------------------------- /.github/workflows/_lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '!frontend/**' 7 | push: 8 | paths: 9 | - '!frontend/**' 10 | workflow_call: 11 | inputs: 12 | working-directory: 13 | required: true 14 | type: string 15 | description: "From which folder this pipeline executes" 16 | 17 | env: 18 | POETRY_VERSION: "1.7.1" 19 | WORKDIR: ${{ inputs.working-directory == '' && '.' 
|| inputs.working-directory }} 20 | 21 | jobs: 22 | build: 23 | runs-on: ubuntu-latest 24 | env: 25 | # This number is set "by eye": we want it to be big enough 26 | # so that it's bigger than the number of commits in any reasonable PR, 27 | # and also as small as possible since increasing the number makes 28 | # the initial `git fetch` slower. 29 | FETCH_DEPTH: 50 30 | strategy: 31 | matrix: 32 | # Only lint on the min and max supported Python versions. 33 | # It's extremely unlikely that there's a lint issue on any version in between 34 | # that doesn't show up on the min or max versions. 35 | # 36 | # GitHub rate-limits how many jobs can be running at any one time. 37 | # Starting new jobs is also relatively slow, 38 | # so linting on fewer versions makes CI faster. 39 | python-version: 40 | - "3.8" 41 | - "3.11" 42 | steps: 43 | - uses: actions/checkout@v3 44 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 45 | uses: "./.github/actions/poetry_setup" 46 | with: 47 | python-version: ${{ matrix.python-version }} 48 | poetry-version: ${{ env.POETRY_VERSION }} 49 | working-directory: ${{ inputs.working-directory }} 50 | cache-key: lint-with-extras 51 | 52 | - name: Check Poetry File 53 | shell: bash 54 | working-directory: ${{ inputs.working-directory }} 55 | run: | 56 | poetry check 57 | 58 | - name: Check lock file 59 | shell: bash 60 | working-directory: ${{ inputs.working-directory }} 61 | run: | 62 | poetry lock --check 63 | 64 | - name: Install dependencies 65 | # Also installs dev/lint/test/typing dependencies, to ensure we have 66 | # type hints for as many of our libraries as possible. 67 | # This helps catch errors that require dependencies to be spotted, for example: 68 | # https://github.com/langchain-ai/langchain/pull/10249/files#diff-935185cd488d015f026dcd9e19616ff62863e8cde8c0bee70318d3ccbca98341 69 | # 70 | # If you change this configuration, make sure to change the `cache-key` 71 | # in the `poetry_setup` action above to stop using the old cache. 72 | # It doesn't matter how you change it, any change will cause a cache-bust. 
73 | working-directory: ${{ inputs.working-directory }} 74 | run: | 75 | poetry install --with dev,lint,test,typing 76 | 77 | - name: Get .mypy_cache to speed up mypy 78 | uses: actions/cache@v3 79 | env: 80 | SEGMENT_DOWNLOAD_TIMEOUT_MIN: "2" 81 | with: 82 | path: | 83 | ${{ env.WORKDIR }}/.mypy_cache 84 | key: mypy-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }}-${{ inputs.working-directory }}-${{ hashFiles(format('{0}/poetry.lock', env.WORKDIR)) }} 85 | 86 | - name: Analysing the code with our lint 87 | working-directory: ${{ inputs.working-directory }} 88 | run: | 89 | make lint 90 | -------------------------------------------------------------------------------- /.github/workflows/_test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '!frontend/**' 7 | push: 8 | paths: 9 | - '!frontend/**' 10 | workflow_call: 11 | inputs: 12 | working-directory: 13 | required: true 14 | type: string 15 | description: "From which folder this pipeline executes" 16 | 17 | env: 18 | POETRY_VERSION: "1.7.1" 19 | 20 | jobs: 21 | build: 22 | defaults: 23 | run: 24 | working-directory: ${{ inputs.working-directory }} 25 | runs-on: ubuntu-latest 26 | strategy: 27 | matrix: 28 | python-version: 29 | - "3.8" 30 | - "3.9" 31 | - "3.10" 32 | - "3.11" 33 | name: Python ${{ matrix.python-version }} 34 | steps: 35 | - uses: actions/checkout@v3 36 | 37 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 38 | uses: "./.github/actions/poetry_setup" 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | poetry-version: ${{ env.POETRY_VERSION }} 42 | working-directory: ${{ inputs.working-directory }} 43 | cache-key: core 44 | 45 | - name: Install dependencies 46 | shell: bash 47 | run: poetry install 48 | 49 | - name: Run core tests 50 | shell: bash 51 | run: make test 52 | 53 | - name: Ensure the tests did not create any additional files 54 | shell: bash 55 | run: | 56 | set -eu 57 | 58 | STATUS="$(git status)" 59 | echo "$STATUS" 60 | 61 | # grep will exit non-zero if the target message isn't found, 62 | # and `set -e` above will cause the step to fail. 63 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 64 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Run CI Tests 3 | 4 | on: 5 | push: 6 | branches: [ main ] 7 | paths: 8 | - 'backend/**' 9 | pull_request: 10 | paths: 11 | - 'backend/**' 12 | workflow_dispatch: # Allows to trigger the workflow manually in GitHub UI 13 | 14 | # If another push to the same PR or branch happens while this workflow is still running, 15 | # cancel the earlier run in favor of the next run. 16 | # 17 | # There's no point in testing an outdated version of the code. GitHub only allows 18 | # a limited number of job runners to be active at the same time, so it's better to cancel 19 | # pointless jobs early so that more useful jobs can run sooner. 
20 | concurrency: 21 | group: ${{ github.workflow }}-${{ github.ref }} 22 | cancel-in-progress: true 23 | 24 | env: 25 | POETRY_VERSION: "1.7.1" 26 | WORKDIR: "./backend" 27 | 28 | jobs: 29 | lint: 30 | uses: 31 | ./.github/workflows/_lint.yml 32 | with: 33 | working-directory: ./backend 34 | secrets: inherit 35 | test: 36 | timeout-minutes: 5 37 | runs-on: ubuntu-latest 38 | defaults: 39 | run: 40 | working-directory: ${{ env.WORKDIR }} 41 | services: 42 | postgres: 43 | # ensure postgres version this stays in sync with prod database 44 | # and with postgres version used in docker compose 45 | image: postgres:16 46 | env: 47 | # optional (defaults to `postgres`) 48 | POSTGRES_DB: langchain_test 49 | # required 50 | POSTGRES_PASSWORD: langchain 51 | # optional (defaults to `5432`) 52 | POSTGRES_PORT: 5432 53 | # optional (defaults to `postgres`) 54 | POSTGRES_USER: langchain 55 | ports: 56 | # maps tcp port 5432 on service container to the host 57 | - 5432:5432 58 | # set health checks to wait until postgres has started 59 | options: >- 60 | --health-cmd pg_isready 61 | --health-interval 3s 62 | --health-timeout 5s 63 | --health-retries 10 64 | strategy: 65 | matrix: 66 | python-version: 67 | - "3.8" 68 | - "3.9" 69 | - "3.10" 70 | - "3.11" 71 | name: Python ${{ matrix.python-version }} tests 72 | steps: 73 | - uses: actions/checkout@v3 74 | 75 | - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} 76 | uses: "./.github/actions/poetry_setup" 77 | with: 78 | python-version: ${{ matrix.python-version }} 79 | poetry-version: ${{ env.POETRY_VERSION }} 80 | working-directory: ${{ env.WORKDIR }} 81 | cache-key: langchain-extract-all 82 | - name: Test database connection 83 | run: | 84 | # Set up postgresql-client 85 | sudo apt-get install -y postgresql-client 86 | # Test psql connection 87 | psql -h localhost -p 5432 -U langchain -d langchain_test -c "SELECT 1;" 88 | env: 89 | # postgress password is required; alternatively, you can run: 90 | # `PGPASSWORD=postgres_password psql ...` 91 | PGPASSWORD: langchain 92 | 93 | - name: Install dependencies 94 | shell: bash 95 | run: | 96 | echo "Running tests, installing dependencies with poetry..." 97 | poetry install --with test,lint,typing,docs 98 | 99 | - name: Run tests 100 | run: make test 101 | 102 | - name: Ensure the tests did not create any additional files 103 | shell: bash 104 | run: | 105 | set -eu 106 | 107 | STATUS="$(git status)" 108 | echo "$STATUS" 109 | 110 | # grep will exit non-zero if the target message isn't found, 111 | # and `set -e` above will cause the step to fail. 112 | echo "$STATUS" | grep 'nothing to commit, working tree clean' 113 | -------------------------------------------------------------------------------- /.github/workflows/fe_ci.yml: -------------------------------------------------------------------------------- 1 | # Run formatting on all PRs 2 | 3 | name: (FE) CI 4 | 5 | on: 6 | push: 7 | branches: ["main"] 8 | paths: 9 | - 'frontend/**' 10 | pull_request: 11 | paths: 12 | - 'frontend/**' 13 | workflow_dispatch: # Allows triggering the workflow manually in GitHub UI 14 | 15 | 16 | # If another push to the same PR or branch happens while this workflow is still running, 17 | # cancel the earlier run in favor of the next run. 18 | # 19 | # There's no point in testing an outdated version of the code. GitHub only allows 20 | # a limited number of job runners to be active at the same time, so it's better to cancel 21 | # pointless jobs early so that more useful jobs can run sooner. 
22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | build: 28 | name: Build frontend 29 | runs-on: ubuntu-latest 30 | env: 31 | NEXT_PUBLIC_BASE_API_URL: http://localhost:8000 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Use Node.js 18.x 35 | uses: actions/setup-node@v3 36 | with: 37 | node-version: 18.x 38 | cache: "yarn" 39 | cache-dependency-path: ./frontend/yarn.lock 40 | - name: Install dependencies 41 | run: yarn install --immutable --mode=skip-build 42 | working-directory: ./frontend 43 | - name: Build frontend 44 | run: yarn build 45 | working-directory: ./frontend 46 | -------------------------------------------------------------------------------- /.github/workflows/fe_lint_format.yml: -------------------------------------------------------------------------------- 1 | # Run formatting on all PRs 2 | 3 | name: (FE) Lint & Format 4 | 5 | on: 6 | push: 7 | branches: ["main"] 8 | paths: 9 | - 'frontend/**' 10 | pull_request: 11 | paths: 12 | - 'frontend/**' 13 | workflow_dispatch: # Allows triggering the workflow manually in GitHub UI 14 | 15 | 16 | # If another push to the same PR or branch happens while this workflow is still running, 17 | # cancel the earlier run in favor of the next run. 18 | # 19 | # There's no point in testing an outdated version of the code. GitHub only allows 20 | # a limited number of job runners to be active at the same time, so it's better to cancel 21 | # pointless jobs early so that more useful jobs can run sooner. 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | format: 28 | name: Check formatting 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: Use Node.js 18.x 33 | uses: actions/setup-node@v3 34 | with: 35 | node-version: 18.x 36 | cache: "yarn" 37 | cache-dependency-path: ./frontend/yarn.lock 38 | - name: Install dependencies 39 | run: yarn install --immutable --mode=skip-build 40 | working-directory: ./frontend 41 | - name: Check formatting 42 | run: yarn format:check 43 | working-directory: ./frontend 44 | 45 | lint: 46 | name: Check linting 47 | runs-on: ubuntu-latest 48 | steps: 49 | - uses: actions/checkout@v4 50 | - name: Use Node.js 18.x 51 | uses: actions/setup-node@v3 52 | with: 53 | node-version: 18.x 54 | cache: "yarn" 55 | cache-dependency-path: ./frontend/yarn.lock 56 | - name: Install dependencies 57 | run: yarn install --immutable --mode=skip-build 58 | working-directory: ./frontend 59 | - name: Check linting 60 | run: yarn lint 61 | working-directory: ./frontend 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ 162 | .DS_Store 163 | 164 | # Local env file 165 | .local.env 166 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # All directory paths for COPY commands are relative to the build context 2 | 3 | # Ensure this python version stays in sync with CI 4 | FROM python:3.11-slim as base 5 | WORKDIR /backend 6 | 7 | # set environment variables 8 | ENV PYTHONDONTWRITEBYTECODE 1 9 | ENV PYTHONUNBUFFERED 1 10 | ENV POETRY_HOME="/opt/poetry" 11 | ENV MYPYPATH="/app/src/stubs" 12 | 13 | # Use bash as the shell for the build 14 | # https://github.com/docker/for-linux/issues/408#issuecomment-414748815 15 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 16 | 17 | RUN set -eux && \ 18 | apt-get update && \ 19 | apt-get install -y \ 20 | build-essential \ 21 | curl \ 22 | libpq-dev \ 23 | python3-dev \ 24 | libmagic1 25 | 26 | # https://python-poetry.org/docs 27 | RUN pip install poetry 28 | 29 | # install deps before copying project files so the cache is only invalidated 30 | # when the deps change 31 | COPY ./backend/pyproject.toml ./backend/poetry.lock ./ 32 | RUN poetry config virtualenvs.create false 33 | RUN poetry install --no-root --only main 34 | 35 | COPY ./backend . 36 | 37 | EXPOSE 8080 38 | 39 | ### 40 | # development image 41 | ### 42 | FROM base as development 43 | 44 | ENTRYPOINT ["bash", "./scripts/prod_entry_point.sh"] 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024-Present Langchain AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 🚧 Under Active Development 🚧 2 | 3 | This repo is under active development. Do not use code from `main`. Instead, please check out code from [releases](https://github.com/langchain-ai/langchain-extract/releases). 4 | 5 | This repository is not a library, but a jumping-off point for your own application -- so do not be surprised to find breaking changes between releases! 6 | 7 | Check out the demo service deployed at [extract.langchain.com/](https://extract.langchain.com/). 
8 | 9 | # 🦜⛏️ LangChain Extract 10 | 11 | https://github.com/langchain-ai/langchain-extract/assets/26529506/6657280e-d05f-4c0f-9c47-07a0ef7c559d 12 | 13 | [![CI](https://github.com/langchain-ai/langchain-extract/actions/workflows/ci.yml/badge.svg)](https://github.com/langchain-ai/langchain-extract/actions/workflows/ci.yml) 14 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 15 | [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/langchainai.svg?style=social&label=Follow%20%40LangChainAI)](https://twitter.com/langchainai) 16 | [![](https://dcbadge.vercel.app/api/server/6adMQxSpJS?compact=true&style=flat)](https://discord.gg/6adMQxSpJS) 17 | [![Open Issues](https://img.shields.io/github/issues-raw/langchain-ai/langchain-extract)](https://github.com/langchain-ai/langchain-extract/issues) 18 | 19 | `langchain-extract` is a simple web server that allows you to extract information from text and files using LLMs. It is built using [FastAPI](https://fastapi.tiangolo.com/), [LangChain](https://python.langchain.com/) and [PostgreSQL](https://www.postgresql.org/). 20 | 21 | The backend closely follows the [extraction use-case documentation](https://python.langchain.com/docs/use_cases/extraction) and provides 22 | a reference implementation of an app that helps you do extraction over data using LLMs. 23 | 24 | This repository is meant to be a starting point for building your own extraction application, which 25 | may have slightly different requirements or use cases. 26 | 27 | ## Functionality 28 | 29 | - 🚀 FastAPI webserver with a REST API 30 | - 📚 OpenAPI Documentation 31 | - 📝 Use [JSON Schema](https://json-schema.org/) to define what to extract 32 | - 📊 Use examples to improve the quality of extracted results 33 | - 📦 Create and save extractors and examples in a database 34 | - 📂 Extract information from text and/or binary files 35 | - 🦜️🏓 [LangServe](https://github.com/langchain-ai/langserve) endpoint to integrate with LangChain `RemoteRunnable` 36 | 37 | ## Releases 38 | 39 | - 0.0.1: https://github.com/langchain-ai/langchain-extract/releases/tag/0.0.1 40 | - 0.0.2: https://github.com/langchain-ai/langchain-extract/releases/tag/0.0.2 41 | 42 | ## 📚 Documentation 43 | 44 | See the example notebooks in the [documentation](https://github.com/langchain-ai/langchain-extract/tree/main/docs/source/notebooks) 45 | to learn how to create examples to improve extraction results, upload files (e.g., HTML, PDF) and more. 46 | 47 | Documentation and server code are both under development! 48 | 49 | ## 🍯 Example API 50 | 51 | Below are some sample `curl` requests to demonstrate how to use the API. 52 | 53 | These only provide minimal examples of how to use the API; 54 | see the [documentation](https://github.com/langchain-ai/langchain-extract/tree/main/docs/source/notebooks) for more information 55 | about the API and the [extraction use-case documentation](https://python.langchain.com/docs/use_cases/extraction) for details on how to extract 56 | information using LangChain. 57 | 58 | First we generate a user ID for ourselves. **The application does not properly manage users or include legitimate authentication**. Access to extractors, few-shot examples, and other artifacts is controlled via this ID. Consider it secret. 
59 | 60 | ```sh 61 | USER_ID=$(uuidgen) 62 | export USER_ID 63 | ``` 64 | 65 | ### Create an extractor 66 | 67 | ```sh 68 | curl -X 'POST' \ 69 | 'http://localhost:8000/extractors' \ 70 | -H 'accept: application/json' \ 71 | -H 'Content-Type: application/json' \ 72 | -H "x-key: ${USER_ID}" \ 73 | -d '{ 74 | "name": "Personal Information", 75 | "description": "Use to extract personal information", 76 | "schema": { 77 | "type": "object", 78 | "title": "Person", 79 | "required": [ 80 | "name", 81 | "age" 82 | ], 83 | "properties": { 84 | "age": { 85 | "type": "integer", 86 | "title": "Age" 87 | }, 88 | "name": { 89 | "type": "string", 90 | "title": "Name" 91 | } 92 | } 93 | }, 94 | "instruction": "Use information about the person from the given user input." 95 | }' 96 | ``` 97 | 98 | Response: 99 | 100 | ```json 101 | { 102 | "uuid": "e07f389f-3577-4e94-bd88-6b201d1b10b9" 103 | } 104 | ``` 105 | 106 | Use the extract endpoint to extract information from the text (or a file) 107 | using an existing pre-defined extractor. 108 | 109 | ```sh 110 | curl -s -X 'POST' \ 111 | 'http://localhost:8000/extract' \ 112 | -H 'accept: application/json' \ 113 | -H 'Content-Type: multipart/form-data' \ 114 | -H "x-key: ${USER_ID}" \ 115 | -F 'extractor_id=e07f389f-3577-4e94-bd88-6b201d1b10b9' \ 116 | -F 'text=my name is chester and i am 20 years old. My name is eugene and I am 1 year older than chester.' \ 117 | -F 'mode=entire_document' \ 118 | -F 'file=' | jq . 119 | ``` 120 | 121 | Response: 122 | 123 | ```json 124 | { 125 | "data": [ 126 | { 127 | "name": "chester", 128 | "age": 20 129 | }, 130 | { 131 | "name": "eugene", 132 | "age": 21 133 | } 134 | ] 135 | } 136 | ``` 137 | 138 | Add a few shot example: 139 | 140 | ```sh 141 | curl -X POST "http://localhost:8000/examples" \ 142 | -H "Content-Type: application/json" \ 143 | -H "x-key: ${USER_ID}" \ 144 | -d '{ 145 | "extractor_id": "e07f389f-3577-4e94-bd88-6b201d1b10b9", 146 | "content": "marcos is 10.", 147 | "output": [ 148 | { 149 | "name": "MARCOS", 150 | "age": 10 151 | } 152 | ] 153 | }' | jq . 154 | ``` 155 | 156 | The response will contain a UUID for the example. Examples can be deleted with a DELETE request. This example is now persisted and associated with our extractor, and subsequent extraction runs will incorporate it. 157 | 158 | ## ✅ Running locally 159 | 160 | The easiest way to get started is to use `docker-compose` to run the server. 161 | 162 | **Configure the environment** 163 | 164 | Add `.local.env` file to the root directory with the following content: 165 | 166 | ```sh 167 | OPENAI_API_KEY=... # Your OpenAI API key 168 | ``` 169 | 170 | Adding `FIREWORKS_API_KEY` or `TOGETHER_API_KEY` to this file would enable additional models. You can access available models for the server and other information via a `GET` request to the `configuration` endpoint. 171 | 172 | Build the images: 173 | 174 | ```sh 175 | docker compose build 176 | ``` 177 | 178 | Run the services: 179 | 180 | ```sh 181 | docker compose up 182 | ``` 183 | 184 | This will launch both the extraction server and the postgres instance. 185 | 186 | Verify that the server is running: 187 | 188 | ```sh 189 | curl -X 'GET' 'http://localhost:8000/ready' 190 | ``` 191 | 192 | This should return `ok`. 193 | 194 | The UI will be available at [http://localhost:3000](http://localhost:3000). 195 | 196 | ## Contributions 197 | 198 | Feel free to develop in this project for your own needs! 
199 | For now, we are not accepting pull requests, but would love to hear [questions, ideas or issues](https://github.com/langchain-ai/langchain-extract/discussions). 200 | 201 | ## Development 202 | 203 | To set up for development, you will need to install [Poetry](https://python-poetry.org/). 204 | 205 | The backend code is located in the `backend` directory. 206 | 207 | ```sh 208 | cd backend 209 | ``` 210 | 211 | Set up the environment using poetry: 212 | 213 | ```sh 214 | poetry install --with lint,dev,test 215 | ``` 216 | 217 | Run the following script to create a database and schema: 218 | 219 | ```sh 220 | python -m scripts.run_migrations create 221 | ``` 222 | 223 | From `/backend`: 224 | 225 | ```sh 226 | OPENAI_API_KEY=[YOUR API KEY] python -m server.main 227 | ``` 228 | 229 | ### Testing 230 | 231 | Create a test database. The test database is used for running tests and is 232 | separate from the main database. It will have the same schema as the main 233 | database. 234 | 235 | ```sh 236 | python -m scripts.run_migrations create-test-db 237 | ``` 238 | 239 | Run the tests 240 | 241 | ```sh 242 | make test 243 | ``` 244 | 245 | ### Linting and format 246 | 247 | Testing and formatting is done using a Makefile inside `[root]/backend` 248 | 249 | ```sh 250 | make format 251 | ``` 252 | -------------------------------------------------------------------------------- /backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # All directory paths for COPY commands are relative to the build context 2 | 3 | # Ensure this python version stays in sync with CI 4 | FROM python:3.11-slim as base 5 | WORKDIR /backend 6 | 7 | # set environment variables 8 | ENV PYTHONDONTWRITEBYTECODE 1 9 | ENV PYTHONUNBUFFERED 1 10 | ENV POETRY_HOME="/opt/poetry" 11 | ENV MYPYPATH="/app/src/stubs" 12 | 13 | # Use bash as the shell for the build 14 | # https://github.com/docker/for-linux/issues/408#issuecomment-414748815 15 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 16 | 17 | RUN set -eux && \ 18 | apt-get update && \ 19 | apt-get install -y \ 20 | build-essential \ 21 | curl \ 22 | libpq-dev \ 23 | python3-dev \ 24 | libmagic1 25 | 26 | # https://python-poetry.org/docs 27 | RUN pip install poetry 28 | 29 | # install deps before copying project files so the cache is only invalidated 30 | # when the deps change 31 | COPY ./backend/pyproject.toml ./backend/poetry.lock . 32 | RUN poetry config virtualenvs.create false 33 | RUN poetry install --no-root --only main 34 | 35 | COPY ./backend . 36 | 37 | EXPOSE 8000 38 | 39 | ### 40 | # development image 41 | ### 42 | FROM base as development 43 | 44 | ENTRYPOINT ["bash", "./scripts/local_entry_point.sh"] 45 | -------------------------------------------------------------------------------- /backend/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all lint format test help 2 | 3 | # Default target executed when no arguments are given to make. 4 | all: help 5 | 6 | ###################### 7 | # TESTING AND COVERAGE 8 | ###################### 9 | 10 | # Define a variable for the test file path. 11 | TEST_FILE ?= tests/unit_tests/ 12 | 13 | test: 14 | poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE) 15 | 16 | test_watch: 17 | poetry run ptw . 
-- $(TEST_FILE) 18 | 19 | openapi: 20 | OPENAI_API_KEY=placeholder python -c "from server import main; import json; print(json.dumps(main.app.openapi()))" > openapi.json 21 | 22 | 23 | ###################### 24 | # LINTING AND FORMATTING 25 | ###################### 26 | 27 | # Define a variable for Python and notebook files. 28 | lint format: PYTHON_FILES=. 29 | lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=. --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$') 30 | 31 | lint lint_diff: 32 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff 33 | # [ "$(PYTHON_FILES)" = "" ] || poetry run mypy $(PYTHON_FILES) 34 | 35 | format format_diff: 36 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) 37 | [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --fix $(PYTHON_FILES) 38 | 39 | spell_check: 40 | poetry run codespell --toml pyproject.toml 41 | 42 | spell_fix: 43 | poetry run codespell --toml pyproject.toml -w 44 | 45 | ###################### 46 | # HELP 47 | ###################### 48 | 49 | help: 50 | @echo '====================' 51 | @echo '-- LINTING --' 52 | @echo 'format - run code formatters' 53 | @echo 'lint - run linters' 54 | @echo 'spell_check - run codespell on the project' 55 | @echo 'spell_fix - run codespell on the project and fix the errors' 56 | @echo '-- TESTS --' 57 | @echo 'coverage - run unit tests and generate coverage report' 58 | @echo 'test - run unit tests' 59 | @echo 'test TEST_FILE= - run all tests in file' 60 | @echo '-- DOCUMENTATION tasks are from the top-level Makefile --' 61 | -------------------------------------------------------------------------------- /backend/README.md: -------------------------------------------------------------------------------- 1 | See readme at repo root. 
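The `openapi` target in the Makefile above shells out to a Python one-liner to export the server's OpenAPI spec. A minimal standalone sketch of the same idea, assuming (as the one-liner does) that the app is importable as `server.main.app` from the `backend` directory and that a placeholder `OPENAI_API_KEY` is enough at import time:

```python
# Sketch: dump the FastAPI OpenAPI spec to openapi.json (run from backend/).
import json
import os

# The app reads OPENAI_API_KEY at import time; a placeholder is assumed to suffice here.
os.environ.setdefault("OPENAI_API_KEY", "placeholder")

from server import main  # imported after the environment variable is set

with open("openapi.json", "w") as f:
    json.dump(main.app.openapi(), f, indent=2)
```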
2 | -------------------------------------------------------------------------------- /backend/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/db/__init__.py -------------------------------------------------------------------------------- /backend/db/models.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime 3 | from typing import Generator 4 | 5 | from sqlalchemy import ( 6 | Column, 7 | DateTime, 8 | ForeignKey, 9 | String, 10 | Text, 11 | UniqueConstraint, 12 | create_engine, 13 | ) 14 | from sqlalchemy.dialects.postgresql import JSONB, UUID 15 | from sqlalchemy.ext.declarative import declarative_base 16 | from sqlalchemy.orm import Session, relationship, sessionmaker 17 | 18 | from server.settings import get_postgres_url 19 | 20 | ENGINE = create_engine(get_postgres_url()) 21 | SessionClass = sessionmaker(bind=ENGINE) 22 | 23 | Base = declarative_base() 24 | 25 | 26 | # TODO(Eugene): Convert to async code 27 | def get_session() -> Generator[Session, None, None]: 28 | """Create a new session.""" 29 | session = SessionClass() 30 | 31 | try: 32 | yield session 33 | except: 34 | session.rollback() 35 | raise 36 | finally: 37 | session.close() 38 | 39 | 40 | class TimestampedModel(Base): 41 | """An abstract base model that includes the timestamp fields.""" 42 | 43 | __abstract__ = True 44 | 45 | created_at = Column( 46 | DateTime, 47 | default=datetime.utcnow, 48 | comment="The time the record was created (UTC).", 49 | ) 50 | updated_at = Column( 51 | DateTime, 52 | default=datetime.utcnow, 53 | onupdate=datetime.utcnow, 54 | doc="The time the record was last updated (UTC).", 55 | ) 56 | 57 | # This is our own uuid assigned to the artifact. 58 | # By construction guaranteed to be unique no matter what. 59 | uuid = Column( 60 | UUID(as_uuid=True), 61 | primary_key=True, 62 | default=lambda: str(uuid.uuid4()), 63 | doc="Unique identifier for this model.", 64 | ) 65 | 66 | 67 | class Example(TimestampedModel): 68 | """A representation of an example. 69 | 70 | Examples consist of content together with the expected output. 71 | 72 | The output is a JSON object that is expected to be extracted from the content. 73 | 74 | The JSON object should be valid according to the schema of the associated extractor. 75 | 76 | The JSON object is defined by the schema of the associated extractor, so 77 | it's perfectly fine for a given example to represent the extraction 78 | of multiple instances of some object from the content since 79 | the JSON schema can represent a list of objects. 
80 | """ 81 | 82 | __tablename__ = "examples" 83 | 84 | content = Column( 85 | Text, 86 | nullable=False, 87 | comment="The input portion of the example.", 88 | ) 89 | output = Column( 90 | JSONB, 91 | comment="The output associated with the example.", 92 | ) 93 | extractor_id = Column( 94 | UUID(as_uuid=True), 95 | ForeignKey("extractors.uuid", ondelete="CASCADE"), 96 | nullable=False, 97 | comment="Foreign key referencing the associated extractor.", 98 | ) 99 | 100 | def __repr__(self) -> str: 101 | return f"" 102 | 103 | 104 | class SharedExtractors(TimestampedModel): 105 | """A table for managing sharing of extractors.""" 106 | 107 | __tablename__ = "shared_extractors" 108 | 109 | extractor_id = Column( 110 | UUID(as_uuid=True), 111 | ForeignKey("extractors.uuid", ondelete="CASCADE"), 112 | index=True, 113 | nullable=False, 114 | comment="The extractor that is being shared.", 115 | ) 116 | 117 | share_token = Column( 118 | UUID(as_uuid=True), 119 | index=True, 120 | nullable=False, 121 | unique=True, 122 | comment="The token that is used to access the shared extractor.", 123 | ) 124 | 125 | # Add unique constraint for (extractor_id, share_token) 126 | __table_args__ = ( 127 | UniqueConstraint("extractor_id", "share_token", name="unique_share_token"), 128 | ) 129 | 130 | def __repr__(self) -> str: 131 | """Return a string representation of the object.""" 132 | return f"" 133 | 134 | 135 | class Extractor(TimestampedModel): 136 | __tablename__ = "extractors" 137 | 138 | name = Column( 139 | String(100), 140 | nullable=False, 141 | server_default="", 142 | comment="The name of the extractor.", 143 | ) 144 | owner_id = Column( 145 | UUID(as_uuid=True), 146 | nullable=False, 147 | comment="Owner uuid.", 148 | ) 149 | schema = Column( 150 | JSONB, 151 | nullable=False, 152 | comment="JSON Schema that describes what content will be " 153 | "extracted from the document", 154 | ) 155 | description = Column( 156 | String(100), 157 | nullable=False, 158 | server_default="", 159 | comment="Surfaced via UI to the users.", 160 | ) 161 | instruction = Column( 162 | Text, nullable=False, comment="The prompt to the language model." 163 | ) # TODO: This will need to evolve 164 | 165 | examples = relationship("Example", backref="extractor") 166 | 167 | # Used for sharing the extractor with others. 
168 | share_uuid = Column( 169 | UUID(as_uuid=True), 170 | nullable=True, 171 | comment="The uuid of the shareable link.", 172 | ) 173 | 174 | def __repr__(self) -> str: 175 | return f"" 176 | 177 | 178 | def validate_extractor_owner( 179 | session: Session, extractor_id: UUID, user_id: UUID 180 | ) -> bool: 181 | """Return True if the extractor exists and is owned by the given user.""" 182 | extractor = ( 183 | session.query(Extractor).filter_by(uuid=extractor_id, owner_id=user_id).first() 184 | ) 185 | return extractor is not None 186 | -------------------------------------------------------------------------------- /backend/extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/extraction/__init__.py -------------------------------------------------------------------------------- /backend/extraction/parsing.py: -------------------------------------------------------------------------------- 1 | """Convert binary input to blobs and parse them using the appropriate parser.""" 2 | from __future__ import annotations 3 | 4 | from typing import BinaryIO, List 5 | 6 | from fastapi import HTTPException 7 | from langchain.document_loaders.parsers import BS4HTMLParser, PDFMinerParser 8 | from langchain.document_loaders.parsers.generic import MimeTypeBasedParser 9 | from langchain.document_loaders.parsers.txt import TextParser 10 | from langchain_community.document_loaders import Blob 11 | from langchain_core.documents import Document 12 | 13 | HANDLERS = { 14 | "application/pdf": PDFMinerParser(), 15 | "text/plain": TextParser(), 16 | "text/html": BS4HTMLParser(), 17 | # Disabled for now as they rely on unstructured and there's some install 18 | # issue with unstructured. 
19 | # from langchain.document_loaders.parsers.msword import MsWordParser 20 | # "application/msword": MsWordParser(), 21 | # "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ( 22 | # MsWordParser() 23 | # ), 24 | } 25 | 26 | SUPPORTED_MIMETYPES = sorted(HANDLERS.keys()) 27 | 28 | MAX_FILE_SIZE_MB = 10 # in MB 29 | 30 | 31 | def _guess_mimetype(file_bytes: bytes) -> str: 32 | """Guess the mime-type of a file.""" 33 | try: 34 | import magic 35 | except ImportError as e: 36 | raise ImportError( 37 | "magic package not found, please install it with `pip install python-magic`" 38 | ) from e 39 | 40 | mime = magic.Magic(mime=True) 41 | mime_type = mime.from_buffer(file_bytes) 42 | return mime_type 43 | 44 | 45 | def _get_file_size_in_mb(data: BinaryIO) -> float: 46 | """Get file size in MB.""" 47 | data.seek(0, 2) # Move the cursor to the end of the file 48 | file_size = data.tell() 49 | file_size_in_mb = file_size / (1024 * 1024) 50 | data.seek(0) 51 | return file_size_in_mb 52 | 53 | 54 | # PUBLIC API 55 | 56 | MIMETYPE_BASED_PARSER = MimeTypeBasedParser( 57 | handlers=HANDLERS, 58 | fallback_parser=None, 59 | ) 60 | 61 | 62 | def convert_binary_input_to_blob(data: BinaryIO) -> Blob: 63 | """Convert ingestion input to blob.""" 64 | file_size_in_mb = _get_file_size_in_mb(data) 65 | 66 | if file_size_in_mb > MAX_FILE_SIZE_MB: 67 | raise HTTPException( 68 | status_code=413, 69 | detail=f"File size exceeds the maximum limit of {MAX_FILE_SIZE_MB} MB.", 70 | ) 71 | 72 | file_data = data.read() 73 | mimetype = _guess_mimetype(file_data) 74 | file_name = data.name 75 | 76 | return Blob.from_data( 77 | data=file_data, 78 | path=file_name, 79 | mime_type=mimetype, 80 | ) 81 | 82 | 83 | def parse_binary_input(data: BinaryIO) -> List[Document]: 84 | """Parse binary input.""" 85 | blob = convert_binary_input_to_blob(data) 86 | return MIMETYPE_BASED_PARSER.parse(blob) 87 | -------------------------------------------------------------------------------- /backend/extraction/utils.py: -------------------------------------------------------------------------------- 1 | """Adapters to convert between different formats.""" 2 | from __future__ import annotations 3 | 4 | from langchain_core.utils.json_schema import dereference_refs 5 | 6 | 7 | def _rm_titles(kv: dict) -> dict: 8 | """Remove titles from a dictionary.""" 9 | new_kv = {} 10 | for k, v in kv.items(): 11 | if k == "title": 12 | continue 13 | elif isinstance(v, dict): 14 | new_kv[k] = _rm_titles(v) 15 | else: 16 | new_kv[k] = v 17 | return new_kv 18 | 19 | 20 | # PUBLIC API 21 | 22 | 23 | def update_json_schema( 24 | schema: dict, 25 | *, 26 | multi: bool = True, 27 | ) -> dict: 28 | """Add missing fields to JSON schema and add support for multiple records.""" 29 | if multi: 30 | # Wrap the schema in an object called "Root" with a property called: "data" 31 | # which will be a json array of the original schema. 32 | schema_ = { 33 | "type": "object", 34 | "properties": { 35 | "data": { 36 | "type": "array", 37 | "items": dereference_refs(schema), 38 | }, 39 | }, 40 | "required": ["data"], 41 | } 42 | else: 43 | raise NotImplementedError("Only multi is supported for now.") 44 | 45 | schema_["title"] = "extractor" 46 | schema_["description"] = "Extract information matching the given schema." 
47 | return schema_ 48 | -------------------------------------------------------------------------------- /backend/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "langchain-extract" 3 | version = "0.0.1" 4 | description = "Sample extraction backend." 5 | authors = ["LangChain AI"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8.1" 11 | langchain = "~0.1" 12 | langsmith = ">=0.0.66" 13 | fastapi = "^0.109.2" 14 | langserve = "^0.0.45" 15 | uvicorn = "^0.27.1" 16 | pydantic = "^1.10" 17 | langchain-openai = "^0.1.3" 18 | jsonschema = "^4.21.1" 19 | sse-starlette = "^2.0.0" 20 | alembic = "^1.13.1" 21 | psycopg2 = "^2.9.9" 22 | python-magic = "^0.4.27" 23 | pdfminer-six = "^20231228" 24 | beautifulsoup4 = "^4.12.3" 25 | lxml = "^5.1.0" 26 | faiss-cpu = "^1.7.4" 27 | python-multipart = "^0.0.9" 28 | langchain-fireworks = "^0.1.1" 29 | langchain-anthropic = "^0.1.11" 30 | langchain-groq = "^0.1.3" 31 | 32 | [tool.poetry.group.dev.dependencies] 33 | jupyterlab = "^3.6.1" 34 | 35 | [tool.poetry.group.typing.dependencies] 36 | mypy = "^1.7.0" 37 | 38 | [tool.poetry.group.lint.dependencies] 39 | ruff = "^0.1.5" 40 | 41 | [tool.poetry.group.docs.dependencies] 42 | nbsphinx = ">=0.8.9" 43 | sphinx = ">=5.2.0" 44 | sphinx-autobuild = "^2021.3.14" 45 | sphinx_book_theme = "^1.0.0" 46 | myst-nb = { version = "^1.0.0", python = "^3.9" } 47 | toml = "^0.10.2" 48 | sphinx-copybutton = ">=0.5.1" 49 | 50 | 51 | [tool.poetry.group.test.dependencies] 52 | pytest = "^7.2.1" 53 | pytest-cov = "^4.0.0" 54 | pytest-asyncio = "^0.21.1" 55 | pytest-mock = "^3.11.1" 56 | pytest-socket = "^0.6.0" 57 | pytest-watch = "^4.2.0" 58 | pytest-timeout = "^2.2.0" 59 | 60 | 61 | [tool.ruff] 62 | select = [ 63 | "E", # pycodestyle 64 | "F", # pyflakes 65 | "I", # isort 66 | ] 67 | extend-include = ["*.ipynb"] 68 | 69 | # Same as Black. 70 | line-length = 88 71 | 72 | [tool.mypy] 73 | disallow_untyped_defs = "True" 74 | ignore_missing_imports = "True" 75 | 76 | [tool.coverage.run] 77 | omit = [ 78 | "tests/*", 79 | ] 80 | 81 | 82 | [build-system] 83 | requires = ["poetry-core"] 84 | build-backend = "poetry.core.masonry.api" 85 | 86 | [tool.pytest.ini_options] 87 | # --strict-markers will raise errors on unknown marks. 88 | # https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks 89 | # 90 | # https://docs.pytest.org/en/7.1.x/reference/reference.html 91 | # --strict-config: any warnings encountered while parsing the `pytest` 92 | # section of the configuration file raise errors. 93 | addopts = "--strict-markers --strict-config --durations=5 -vv" 94 | # Global timeout for all tests. There should be a good reason for a test to 95 | # take more than 5 seconds. 96 | timeout = 5 97 | # Registering custom markers. 
98 | # https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers 99 | markers = [ 100 | "asyncio: mark tests as requiring asyncio", 101 | ] 102 | asyncio_mode = "auto" 103 | -------------------------------------------------------------------------------- /backend/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/scripts/__init__.py -------------------------------------------------------------------------------- /backend/scripts/local_entry_point.sh: -------------------------------------------------------------------------------- 1 | # -e: fail on any nonzero exit status 2 | # -u: fail if any referenced variables are not set 3 | # -x: print commands before running them 4 | # -o pipefail: fail if a command in a pipe has a nonzero exit code 5 | set -euxo pipefail 6 | 7 | # For now just create the db if it doesn't exist 8 | python -m scripts.run_migrations create 9 | 10 | uvicorn server.main:app --host 0.0.0.0 --port 8000 --reload 11 | -------------------------------------------------------------------------------- /backend/scripts/prod_entry_point.sh: -------------------------------------------------------------------------------- 1 | # -e: fail on any nonzero exit status 2 | # -u: fail if any referenced variables are not set 3 | # -x: print commands before running them 4 | # -o pipefail: fail if a command in a pipe has a nonzero exit code 5 | set -euxo pipefail 6 | 7 | # For now just create the db if it doesn't exist 8 | python -m scripts.run_migrations create 9 | 10 | uvicorn server.main:app --host 0.0.0.0 --port 8080 --reload 11 | -------------------------------------------------------------------------------- /backend/scripts/run_migrations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Run migrations.""" 3 | import click 4 | 5 | from db.models import ENGINE, Base 6 | 7 | 8 | @click.group() 9 | def cli(): 10 | """Database migration commands.""" 11 | pass 12 | 13 | 14 | @cli.command() 15 | def create(): 16 | """Create all tables.""" 17 | Base.metadata.create_all(ENGINE) 18 | click.echo("All tables created successfully.") 19 | 20 | 21 | @cli.command() 22 | @click.confirmation_option(prompt="Are you sure you want to drop all tables?") 23 | def drop(): 24 | """Drop all tables.""" 25 | Base.metadata.drop_all(ENGINE) 26 | click.echo("All tables dropped successfully.") 27 | 28 | 29 | @cli.command() 30 | def create_test_db(): 31 | """Create a test database called langchain_test used for testing purposes.""" 32 | import psycopg2 33 | from psycopg2.errors import DuplicateDatabase 34 | 35 | # establishing the connection 36 | conn = psycopg2.connect( 37 | database="langchain", 38 | user="langchain", 39 | password="langchain", 40 | host="localhost", 41 | port="5432", 42 | ) 43 | conn.autocommit = True 44 | 45 | # Creating a cursor object using the cursor() method 46 | with conn.cursor() as cursor: 47 | # Preparing query to create a database 48 | sql = "CREATE DATABASE langchain_test;" 49 | 50 | # Creating a database 51 | try: 52 | cursor.execute(sql) 53 | print("Database created successfully.") 54 | except DuplicateDatabase: 55 | print("Database already exists") 56 | 57 | 58 | if __name__ == "__main__": 59 | cli() 60 | -------------------------------------------------------------------------------- /backend/server/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/server/__init__.py -------------------------------------------------------------------------------- /backend/server/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/server/api/__init__.py -------------------------------------------------------------------------------- /backend/server/api/api_key.py: -------------------------------------------------------------------------------- 1 | from fastapi.security import APIKeyHeader 2 | 3 | # For actual auth, you'd need to check the key against a database or some other 4 | # data store. Here, we don't need actual auth, just a key that matches 5 | # a UUID 6 | UserToken = APIKeyHeader(name="x-key") 7 | -------------------------------------------------------------------------------- /backend/server/api/configurables.py: -------------------------------------------------------------------------------- 1 | """Endpoint for listing available chat models for extraction.""" 2 | from typing import List 3 | 4 | from fastapi import APIRouter 5 | from typing_extensions import TypedDict 6 | 7 | from extraction.parsing import MAX_FILE_SIZE_MB, SUPPORTED_MIMETYPES 8 | from server.models import SUPPORTED_MODELS 9 | from server.settings import MAX_CHUNKS, MAX_CONCURRENCY 10 | 11 | router = APIRouter( 12 | prefix="/configuration", 13 | tags=["Configuration"], 14 | responses={404: {"description": "Not found"}}, 15 | ) 16 | 17 | 18 | class ConfigurationResponse(TypedDict): 19 | """Response for configuration.""" 20 | 21 | available_models: List[str] 22 | accepted_mimetypes: List[str] 23 | max_file_size_mb: int 24 | max_concurrency: int 25 | max_chunks: int 26 | models: List[dict] 27 | 28 | 29 | @router.get("") 30 | def get() -> ConfigurationResponse: 31 | """Endpoint to show server configuration.""" 32 | return { 33 | "available_models": sorted(SUPPORTED_MODELS), # Deprecate 34 | "models": [ 35 | { 36 | "name": model, 37 | "description": data["description"], 38 | } 39 | for model, data in SUPPORTED_MODELS.items() 40 | ], 41 | "accepted_mimetypes": SUPPORTED_MIMETYPES, 42 | "max_file_size_mb": MAX_FILE_SIZE_MB, 43 | "max_concurrency": MAX_CONCURRENCY, 44 | "max_chunks": MAX_CHUNKS, 45 | } 46 | -------------------------------------------------------------------------------- /backend/server/api/examples.py: -------------------------------------------------------------------------------- 1 | """Endpoints for managing definition of examples..""" 2 | from typing import Any, List 3 | from uuid import UUID 4 | 5 | from fastapi import APIRouter, Depends, HTTPException 6 | from sqlalchemy.orm import Session 7 | from typing_extensions import Annotated, TypedDict 8 | 9 | from db.models import Example, get_session, validate_extractor_owner 10 | from server.api.api_key import UserToken 11 | 12 | router = APIRouter( 13 | prefix="/examples", 14 | tags=["example definitions"], 15 | responses={404: {"description": "Not found"}}, 16 | ) 17 | 18 | 19 | class CreateExample(TypedDict): 20 | """A request to create an example.""" 21 | 22 | extractor_id: Annotated[UUID, "The extractor ID that this is an example for."] 23 | content: Annotated[str, "The input portion of the example."] 24 | output: Annotated[ 25 | List[Any], "JSON object that is 
expected to be extracted from the content." 26 | ] 27 | 28 | 29 | class CreateExampleResponse(TypedDict): 30 | """Response for creating an example.""" 31 | 32 | uuid: UUID 33 | 34 | 35 | @router.post("") 36 | def create( 37 | create_request: CreateExample, 38 | *, 39 | session: Session = Depends(get_session), 40 | user_id: UUID = Depends(UserToken), 41 | ) -> CreateExampleResponse: 42 | """Endpoint to create an example.""" 43 | if not validate_extractor_owner(session, create_request["extractor_id"], user_id): 44 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 45 | 46 | instance = Example( 47 | extractor_id=create_request["extractor_id"], 48 | content=create_request["content"], 49 | output=create_request["output"], 50 | ) 51 | session.add(instance) 52 | session.commit() 53 | return {"uuid": instance.uuid} 54 | 55 | 56 | @router.get("") 57 | def list( 58 | extractor_id: UUID, 59 | *, 60 | limit: int = 10, 61 | offset: int = 0, 62 | session=Depends(get_session), 63 | user_id: UUID = Depends(UserToken), 64 | ) -> List[Any]: 65 | """Endpoint to get all examples.""" 66 | if not validate_extractor_owner(session, extractor_id, user_id): 67 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 68 | return ( 69 | session.query(Example) 70 | .filter(Example.extractor_id == extractor_id) 71 | .order_by(Example.uuid) 72 | .limit(limit) 73 | .offset(offset) 74 | .all() 75 | ) 76 | 77 | 78 | @router.delete("/{uuid}") 79 | def delete( 80 | uuid: UUID, 81 | *, 82 | session: Session = Depends(get_session), 83 | user_id: UUID = Depends(UserToken), 84 | ) -> None: 85 | """Endpoint to delete an example.""" 86 | example = session.query(Example).filter_by(uuid=str(uuid)).first() 87 | if example is None: 88 | raise HTTPException(status_code=404, detail="Example not found.") 89 | extractor_id = example.extractor_id 90 | if not validate_extractor_owner(session, extractor_id, user_id): 91 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 92 | session.query(Example).filter_by(uuid=str(uuid)).delete() 93 | session.commit() 94 | -------------------------------------------------------------------------------- /backend/server/api/extract.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional 2 | from uuid import UUID 3 | 4 | from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile 5 | from sqlalchemy.orm import Session 6 | from typing_extensions import Annotated 7 | 8 | from db.models import Extractor, SharedExtractors, get_session 9 | from extraction.parsing import parse_binary_input 10 | from server.api.api_key import UserToken 11 | from server.extraction_runnable import ExtractResponse, extract_entire_document 12 | from server.models import DEFAULT_MODEL 13 | from server.retrieval import extract_from_content 14 | 15 | router = APIRouter( 16 | prefix="/extract", 17 | tags=["extract"], 18 | responses={404: {"description": "Not found"}}, 19 | ) 20 | 21 | 22 | @router.post("", response_model=ExtractResponse) 23 | async def extract_using_existing_extractor( 24 | *, 25 | extractor_id: Annotated[UUID, Form()], 26 | text: Optional[str] = Form(None), 27 | mode: Literal["entire_document", "retrieval"] = Form("entire_document"), 28 | file: Optional[UploadFile] = File(None), 29 | model_name: Optional[str] = Form(DEFAULT_MODEL), 30 | session: Session = Depends(get_session), 31 | user_id: UUID = Depends(UserToken), 32 | ) -> ExtractResponse: 33 | 
"""Endpoint that is used with an existing extractor. 34 | 35 | This endpoint will be expanded to support upload of binary files as well as 36 | text files. 37 | """ 38 | if text is None and file is None: 39 | raise HTTPException(status_code=422, detail="No text or file provided.") 40 | 41 | extractor = ( 42 | session.query(Extractor).filter_by(uuid=extractor_id, owner_id=user_id).scalar() 43 | ) 44 | if extractor is None: 45 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 46 | 47 | if text: 48 | text_ = text 49 | else: 50 | documents = parse_binary_input(file.file) 51 | # TODO: Add metadata like location from original file where 52 | # the text was extracted from 53 | text_ = "\n".join([document.page_content for document in documents]) 54 | 55 | if mode == "entire_document": 56 | return await extract_entire_document(text_, extractor, model_name) 57 | elif mode == "retrieval": 58 | return await extract_from_content(text_, extractor, model_name) 59 | else: 60 | raise ValueError( 61 | f"Invalid mode {mode}. Expected one of 'entire_document', 'retrieval'." 62 | ) 63 | 64 | 65 | @router.post("/shared", response_model=ExtractResponse) 66 | async def extract_using_shared_extractor( 67 | *, 68 | extractor_id: Annotated[UUID, Form()], 69 | text: Optional[str] = Form(None), 70 | mode: Literal["entire_document", "retrieval"] = Form("entire_document"), 71 | file: Optional[UploadFile] = File(None), 72 | model_name: Optional[str] = Form("default"), 73 | session: Session = Depends(get_session), 74 | ) -> ExtractResponse: 75 | """Endpoint that is used with an existing extractor. 76 | 77 | Args: 78 | extractor_id: The UUID of the shared extractor. 79 | This is the UUID that is used to share the extractor, not 80 | the UUID of the extractor itself. 81 | text: The text to extract from. 82 | mode: The mode to use for extraction. 83 | file: The file to extract from. 84 | model_name: The model to use for extraction. 85 | session: The database session. 86 | 87 | """ 88 | if text is None and file is None: 89 | raise HTTPException(status_code=422, detail="No text or file provided.") 90 | 91 | extractor = ( 92 | session.query(Extractor) 93 | .join(SharedExtractors, Extractor.uuid == SharedExtractors.extractor_id) 94 | .filter(SharedExtractors.share_token == extractor_id) 95 | .scalar() 96 | ) 97 | 98 | if not extractor: 99 | raise HTTPException(status_code=404, detail="Extractor not found.") 100 | 101 | if text: 102 | text_ = text 103 | else: 104 | documents = parse_binary_input(file.file) 105 | # TODO: Add metadata like location from original file where 106 | # the text was extracted from 107 | text_ = "\n".join([document.page_content for document in documents]) 108 | 109 | if mode == "entire_document": 110 | return await extract_entire_document(text_, extractor, model_name) 111 | elif mode == "retrieval": 112 | return await extract_from_content(text_, extractor, model_name) 113 | else: 114 | raise ValueError( 115 | f"Invalid mode {mode}. Expected one of 'entire_document', 'retrieval'." 
116 | ) 117 | -------------------------------------------------------------------------------- /backend/server/api/extractors.py: -------------------------------------------------------------------------------- 1 | """Endpoints for managing definition of extractors.""" 2 | from typing import Any, Dict, List 3 | from uuid import UUID, uuid4 4 | 5 | from fastapi import APIRouter, Depends, HTTPException 6 | from pydantic import BaseModel, Field, validator 7 | from sqlalchemy.exc import IntegrityError 8 | from sqlalchemy.orm import Session 9 | 10 | from db.models import Extractor, SharedExtractors, get_session, validate_extractor_owner 11 | from server.api.api_key import UserToken 12 | from server.validators import validate_json_schema 13 | 14 | router = APIRouter( 15 | prefix="/extractors", 16 | tags=["extractor definitions"], 17 | responses={404: {"description": "Not found"}}, 18 | ) 19 | 20 | 21 | class CreateExtractor(BaseModel): 22 | """A request to create an extractor.""" 23 | 24 | name: str = Field(default="", description="The name of the extractor.") 25 | 26 | description: str = Field( 27 | default="", description="Short description of the extractor." 28 | ) 29 | json_schema: Dict[str, Any] = Field( 30 | ..., description="The schema to use for extraction.", alias="schema" 31 | ) 32 | instruction: str = Field(..., description="The instruction to use for extraction.") 33 | 34 | @validator("json_schema") 35 | def validate_schema(cls, v: Any) -> Dict[str, Any]: 36 | """Validate the schema.""" 37 | validate_json_schema(v) 38 | return v 39 | 40 | 41 | class CreateExtractorResponse(BaseModel): 42 | """Response for creating an extractor.""" 43 | 44 | uuid: UUID = Field(..., description="The UUID of the created extractor.") 45 | 46 | 47 | class ShareExtractorRequest(BaseModel): 48 | """A request to share an extractor.""" 49 | 50 | uuid: UUID = Field(..., description="The UUID of the extractor to share.") 51 | 52 | 53 | class ShareExtractorResponse(BaseModel): 54 | """Response for sharing an extractor.""" 55 | 56 | share_uuid: UUID = Field(..., description="The UUID for the shared extractor.") 57 | 58 | 59 | @router.post("/{uuid}/share", response_model=ShareExtractorResponse) 60 | def share( 61 | uuid: UUID, 62 | *, 63 | session: Session = Depends(get_session), 64 | user_id: UUID = Depends(UserToken), 65 | ) -> ShareExtractorResponse: 66 | """Endpoint to share an extractor. 67 | 68 | Look up a shared extractor by UUID and return the share UUID if it exists. 69 | If not shared, create a new shared extractor entry and return the new share UUID. 70 | 71 | Args: 72 | uuid: The UUID of the extractor to share. 73 | session: The database session. 74 | 75 | Returns: 76 | The UUID for the shared extractor. 
77 | """ 78 | if not validate_extractor_owner(session, uuid, user_id): 79 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 80 | # Check if the extractor is already shared 81 | shared_extractor = ( 82 | session.query(SharedExtractors) 83 | .filter(SharedExtractors.extractor_id == uuid) 84 | .scalar() 85 | ) 86 | 87 | if shared_extractor: 88 | # The extractor is already shared, return the existing share_uuid 89 | return ShareExtractorResponse(share_uuid=shared_extractor.share_token) 90 | 91 | # If not shared, create a new shared extractor entry 92 | new_shared_extractor = SharedExtractors( 93 | extractor_id=uuid, 94 | # This will automatically generate a new UUID for share_token 95 | share_token=uuid4(), 96 | ) 97 | 98 | session.add(new_shared_extractor) 99 | try: 100 | session.commit() 101 | except IntegrityError: 102 | session.rollback() 103 | raise HTTPException(status_code=400, detail="Failed to share the extractor.") 104 | 105 | # Return the new share_uuid 106 | return ShareExtractorResponse(share_uuid=new_shared_extractor.share_token) 107 | 108 | 109 | @router.post("") 110 | def create( 111 | create_request: CreateExtractor, 112 | *, 113 | session: Session = Depends(get_session), 114 | user_id: UUID = Depends(UserToken), 115 | ) -> CreateExtractorResponse: 116 | """Endpoint to create an extractor.""" 117 | 118 | instance = Extractor( 119 | name=create_request.name, 120 | owner_id=user_id, 121 | schema=create_request.json_schema, 122 | description=create_request.description, 123 | instruction=create_request.instruction, 124 | ) 125 | session.add(instance) 126 | session.commit() 127 | return CreateExtractorResponse(uuid=instance.uuid) 128 | 129 | 130 | @router.get("/{uuid}") 131 | def get( 132 | uuid: UUID, 133 | *, 134 | session: Session = Depends(get_session), 135 | user_id: UUID = Depends(UserToken), 136 | ) -> Dict[str, Any]: 137 | """Endpoint to get an extractor.""" 138 | extractor = ( 139 | session.query(Extractor).filter_by(uuid=str(uuid), owner_id=user_id).scalar() 140 | ) 141 | if extractor is None: 142 | raise HTTPException(status_code=404, detail="Extractor not found for owner.") 143 | return { 144 | "uuid": extractor.uuid, 145 | "name": extractor.name, 146 | "description": extractor.description, 147 | "schema": extractor.schema, 148 | "instruction": extractor.instruction, 149 | } 150 | 151 | 152 | @router.get("") 153 | def list( 154 | *, 155 | limit: int = 10, 156 | offset: int = 0, 157 | session=Depends(get_session), 158 | user_id: UUID = Depends(UserToken), 159 | ) -> List[Any]: 160 | """Endpoint to get all extractors.""" 161 | return ( 162 | session.query(Extractor) 163 | .filter_by(owner_id=user_id) 164 | .limit(limit) 165 | .offset(offset) 166 | .all() 167 | ) 168 | 169 | 170 | @router.delete("/{uuid}") 171 | def delete( 172 | uuid: UUID, 173 | *, 174 | session: Session = Depends(get_session), 175 | user_id: UUID = Depends(UserToken), 176 | ) -> None: 177 | """Endpoint to delete an extractor.""" 178 | session.query(Extractor).filter_by(uuid=str(uuid), owner_id=user_id).delete() 179 | session.commit() 180 | -------------------------------------------------------------------------------- /backend/server/api/shared.py: -------------------------------------------------------------------------------- 1 | """Endpoints for working with shared resources.""" 2 | from typing import Any, Dict 3 | from uuid import UUID 4 | 5 | from fastapi import APIRouter, Depends, HTTPException 6 | from pydantic import BaseModel, Field 7 | from sqlalchemy.orm import 
Session 8 | 9 | from db.models import Extractor, SharedExtractors, get_session 10 | 11 | router = APIRouter( 12 | prefix="/shared/extractors", 13 | tags=["extractor definitions"], 14 | responses={404: {"description": "Not found"}}, 15 | ) 16 | 17 | 18 | class SharedExtractorResponse(BaseModel): 19 | """Response for a shared extractor.""" 20 | 21 | # UUID should not be included in the response since it is not a public identifier! 22 | name: str 23 | description: str 24 | # schema is a reserved keyword by pydantic 25 | schema_: Dict[str, Any] = Field(..., alias="schema") 26 | instruction: str 27 | 28 | 29 | @router.get("/{uuid}") 30 | def get( 31 | uuid: UUID, 32 | *, 33 | session: Session = Depends(get_session), 34 | ) -> SharedExtractorResponse: 35 | """Get a shared extractor.""" 36 | extractor = ( 37 | session.query(Extractor) 38 | .join(SharedExtractors, Extractor.uuid == SharedExtractors.extractor_id) 39 | .filter(SharedExtractors.share_token == uuid) 40 | .first() 41 | ) 42 | 43 | if not extractor: 44 | raise HTTPException(status_code=404, detail="Extractor not found.") 45 | 46 | return SharedExtractorResponse( 47 | name=extractor.name, 48 | description=extractor.description, 49 | schema=extractor.schema, 50 | instruction=extractor.instruction, 51 | ) 52 | -------------------------------------------------------------------------------- /backend/server/api/suggest.py: -------------------------------------------------------------------------------- 1 | """Module to handle the suggest API endpoint. 2 | 3 | This is logic that leverages LLMs to suggest an extractor for a given task. 4 | """ 5 | from typing import Optional 6 | 7 | from fastapi import APIRouter 8 | from langchain_core.prompts import ChatPromptTemplate 9 | from pydantic import BaseModel, Field 10 | 11 | from server.models import get_model 12 | 13 | router = APIRouter( 14 | prefix="/suggest", 15 | tags=["Suggest an extractor"], 16 | responses={404: {"description": "Not found"}}, 17 | ) 18 | 19 | 20 | model = get_model() 21 | 22 | 23 | class SuggestExtractor(BaseModel): 24 | """A request to suggest an extractor.""" 25 | 26 | description: str = Field( 27 | default="", 28 | description=( 29 | "Short description of what information the extractor is extracting." 30 | ), 31 | ) 32 | jsonSchema: Optional[str] = Field( 33 | default=None, 34 | description=( 35 | "Existing JSON Schema that describes the entity / " 36 | "information that should be extracted." 37 | ), 38 | ) 39 | 40 | 41 | class ExtractorDefinition(BaseModel): 42 | """Define an information extractor to be used in an information extraction system.""" # noqa: E501 43 | 44 | json_schema: str = Field( 45 | ..., 46 | description=( 47 | "JSON Schema that describes the entity / " 48 | "information that should be extracted. " 49 | "This schema is specified in JSON Schema format. " 50 | ), 51 | ) 52 | 53 | 54 | SUGGEST_PROMPT = ChatPromptTemplate.from_messages( 55 | [ 56 | ( 57 | "system", 58 | "You are an expert ontologist and have been asked to help a user " 59 | "define an information extractor. The user will describe an entity, " 60 | "a topic or a piece of information that they would like to extract from " 61 | "text. Based on the user input, you are to provide a schema and " 62 | "description for the extractor. The schema should be a JSON Schema that " 63 | "describes the entity or information to be extracted. " 64 | "Make sure to include title and description for all the " 65 | "attributes in the schema. The JSON Schema should describe a top level " 66 | "object. The object MUST have a title and description. Unless otherwise " 67 | "stated all entity properties in the schema should be considered optional.", 68 | ), 69 | ("human", "{input}"), 70 | ] 71 | ) 72 | 73 | suggestion_chain = SUGGEST_PROMPT | model.with_structured_output( 74 | schema=ExtractorDefinition 75 | ).with_config({"run_name": "suggest"}) 76 | 77 | UPDATE_PROMPT = ChatPromptTemplate.from_messages( 78 | [ 79 | ( 80 | "system", 81 | "You are an expert ontologist and have been asked to help a user " 82 | "define an information extractor. The existing extractor schema is " 83 | "provided.\n```\n{json_schema}\n```\nThe user will describe a desired " 84 | "modification to the schema (e.g., adding a new field, changing a field " 85 | "type, etc.). Your goal is to provide a new schema that incorporates the " 86 | "user's desired modification. The user may also request a completely new " 87 | "schema, in which case you should provide a new schema based on the " 88 | "user's input, and ignore the existing schema. The JSON Schema should " 89 | "describe a top level object. The object MUST have a title and " 90 | "description. Unless otherwise stated all entity properties in the schema " 91 | "should be considered optional.", 92 | ), 93 | ("human", "{input}"), 94 | ] 95 | ) 96 | 97 | UPDATE_CHAIN = ( 98 | UPDATE_PROMPT | model.with_structured_output(schema=ExtractorDefinition) 99 | ).with_config({"run_name": "suggest_update"}) 100 | 101 | 102 | # PUBLIC API 103 | 104 | 105 | @router.post("") 106 | async def suggest(request: SuggestExtractor) -> ExtractorDefinition: 107 | """Endpoint to suggest an extractor.""" 108 | if request.jsonSchema and len(request.jsonSchema) > 10: 109 | return await UPDATE_CHAIN.ainvoke( 110 | {"input": request.description, "json_schema": request.jsonSchema} 111 | ) 112 | return await suggestion_chain.ainvoke({"input": request.description}) 113 | -------------------------------------------------------------------------------- /backend/server/extraction_runnable.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import uuid 5 | from typing import Any, Dict, List, Optional, Sequence 6 | 7 | from fastapi import HTTPException 8 | from jsonschema import Draft202012Validator, exceptions 9 | from langchain.text_splitter import TokenTextSplitter 10 | from langchain_core.messages import AIMessage, HumanMessage, ToolMessage 11 | from langchain_core.prompts import ChatPromptTemplate 12 | from langchain_core.runnables import chain 13 | from langserve import CustomUserType 14 | from pydantic import BaseModel, Field, validator 15 | from typing_extensions import TypedDict 16 | 17 | from db.models import Example, Extractor 18 | from extraction.utils import update_json_schema 19 | from server import settings 20 | from server.models import DEFAULT_MODEL, get_chunk_size, get_model 21 | from server.validators import validate_json_schema 22 | 23 | 24 | class ExtractionExample(BaseModel): 25 | """An example extraction. 26 | 27 | This example consists of a text and the expected output of the extraction. 28 | """ 29 | 30 | text: str = Field(..., description="The input text") 31 | output: List[Dict[str, Any]] = Field( 32 | ..., description="The expected output of the example. A list of objects." 
33 | ) 34 | 35 | 36 | class ExtractRequest(CustomUserType): 37 | """Request body for the extract endpoint.""" 38 | 39 | text: str = Field(..., description="The text to extract from.") 40 | json_schema: Dict[str, Any] = Field( 41 | ..., 42 | description="JSON schema that describes what content should be extracted " 43 | "from the text.", 44 | alias="schema", 45 | ) 46 | instructions: Optional[str] = Field( 47 | None, description="Supplemental system instructions." 48 | ) 49 | examples: Optional[List[ExtractionExample]] = Field( 50 | None, description="Examples of extractions." 51 | ) 52 | model_name: Optional[str] = Field("gpt-3.5-turbo", description="Chat model to use.") 53 | 54 | @validator("json_schema") 55 | def validate_schema(cls, v: Any) -> Dict[str, Any]: 56 | """Validate the schema.""" 57 | validate_json_schema(v) 58 | return v 59 | 60 | 61 | class ExtractResponse(TypedDict, total=False): 62 | """Response body for the extract endpoint.""" 63 | 64 | data: List[Any] 65 | # content_too_long will be set to True if the content was too long 66 | # and had to be truncated 67 | content_too_long: Optional[bool] 68 | 69 | 70 | def _cast_example_to_dict(example: Example) -> Dict[str, Any]: 71 | """Cast example record to dictionary.""" 72 | return { 73 | "text": example.content, 74 | "output": example.output, 75 | } 76 | 77 | 78 | def _make_prompt_template( 79 | instructions: Optional[str], 80 | examples: Optional[Sequence[ExtractionExample]], 81 | function_name: str, 82 | ) -> ChatPromptTemplate: 83 | """Make a prompt template from instructions and examples.""" 84 | prefix = ( 85 | "You are a top-tier algorithm for extracting information from text. " 86 | "Only extract information that is relevant to the provided text. " 87 | "If no information is relevant, use the schema and output " 88 | "an empty list where appropriate." 89 | ) 90 | if instructions: 91 | system_message = ("system", f"{prefix}\n\n{instructions}") 92 | else: 93 | system_message = ("system", prefix) 94 | prompt_components = [system_message] 95 | if examples is not None: 96 | few_shot_prompt = [] 97 | for example in examples: 98 | # TODO: We'll need to refactor this at some point to 99 | # support other encoding strategies. The function calling logic here 100 | # has some hard-coded assumptions (e.g., name of parameters like `data`). 101 | _id = uuid.uuid4().hex[:] 102 | tool_call = { 103 | "args": {"data": example.output}, 104 | "name": function_name, 105 | "id": _id, 106 | } 107 | few_shot_prompt.extend( 108 | [ 109 | HumanMessage(content=example.text), 110 | AIMessage(content="", tool_calls=[tool_call]), 111 | ToolMessage( 112 | content="You have correctly called this tool.", tool_call_id=_id 113 | ), 114 | ] 115 | ) 116 | prompt_components.extend(few_shot_prompt) 117 | 118 | prompt_components.append( 119 | ( 120 | "human", 121 | "I need to extract information from " 122 | "the following text: ```\n{text}\n```\n", 123 | ), 124 | ) 125 | return ChatPromptTemplate.from_messages(prompt_components) 126 | 127 | 128 | # PUBLIC API 129 | 130 | 131 | def deduplicate( 132 | extract_responses: Sequence[ExtractResponse], 133 | ) -> ExtractResponse: 134 | """Deduplicate the results. 135 | 136 | The deduplication is done by comparing the serialized JSON of each of the results 137 | and only keeping the unique ones. 
138 | """ 139 | unique_extracted = [] 140 | seen = set() 141 | for response in extract_responses: 142 | for data_item in response["data"]: 143 | # Serialize the data item for comparison purposes 144 | serialized = json.dumps(data_item, sort_keys=True) 145 | if serialized not in seen: 146 | seen.add(serialized) 147 | unique_extracted.append(data_item) 148 | 149 | return { 150 | "data": unique_extracted, 151 | } 152 | 153 | 154 | def get_examples_from_extractor(extractor: Extractor) -> List[Dict[str, Any]]: 155 | """Get examples from an extractor.""" 156 | return [_cast_example_to_dict(example) for example in extractor.examples] 157 | 158 | 159 | @chain 160 | async def extraction_runnable(extraction_request: ExtractRequest) -> ExtractResponse: 161 | """An end point to extract content from a given text object.""" 162 | # TODO: Add validation for model context window size 163 | schema = update_json_schema(extraction_request.json_schema) 164 | try: 165 | Draft202012Validator.check_schema(schema) 166 | except exceptions.ValidationError as e: 167 | raise HTTPException(status_code=422, detail=f"Invalid schema: {e.message}") 168 | 169 | prompt = _make_prompt_template( 170 | extraction_request.instructions, 171 | extraction_request.examples, 172 | schema["title"], 173 | ) 174 | model = get_model(extraction_request.model_name) 175 | runnable = (prompt | model.with_structured_output(schema=schema)).with_config( 176 | {"run_name": "extraction"} 177 | ) 178 | 179 | return await runnable.ainvoke({"text": extraction_request.text}) 180 | 181 | 182 | async def extract_entire_document( 183 | content: str, 184 | extractor: Extractor, 185 | model_name: str, 186 | ) -> ExtractResponse: 187 | """Extract from entire document.""" 188 | 189 | json_schema = extractor.schema 190 | examples = get_examples_from_extractor(extractor) 191 | text_splitter = TokenTextSplitter( 192 | chunk_size=get_chunk_size(model_name), 193 | chunk_overlap=20, 194 | model_name=DEFAULT_MODEL, 195 | ) 196 | texts = text_splitter.split_text(content) 197 | extraction_requests = [ 198 | ExtractRequest( 199 | text=text, 200 | schema=json_schema, 201 | instructions=extractor.instruction, # TODO: consistent naming 202 | examples=examples, 203 | model_name=model_name, 204 | ) 205 | for text in texts 206 | ] 207 | 208 | # Limit the number of chunks to process 209 | if len(extraction_requests) > settings.MAX_CHUNKS and settings.MAX_CHUNKS > 0: 210 | content_too_long = True 211 | extraction_requests = extraction_requests[: settings.MAX_CHUNKS] 212 | else: 213 | content_too_long = False 214 | 215 | # Run extractions which may potentially yield duplicate results 216 | extract_responses: List[ExtractResponse] = await extraction_runnable.abatch( 217 | extraction_requests, {"max_concurrency": settings.MAX_CONCURRENCY} 218 | ) 219 | # Deduplicate the results 220 | return { 221 | "data": deduplicate(extract_responses)["data"], 222 | "content_too_long": content_too_long, 223 | } 224 | -------------------------------------------------------------------------------- /backend/server/main.py: -------------------------------------------------------------------------------- 1 | """Entry point into the server.""" 2 | import logging 3 | import os 4 | from pathlib import Path 5 | 6 | from fastapi import FastAPI 7 | from fastapi.middleware.cors import CORSMiddleware 8 | from fastapi.staticfiles import StaticFiles 9 | from langserve import add_routes 10 | 11 | from server.api import configurables, examples, extract, extractors, shared, suggest 12 | from 
server.extraction_runnable import ( 13 | ExtractRequest, 14 | ExtractResponse, 15 | extraction_runnable, 16 | ) 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | app = FastAPI( 21 | title="Extraction Powered by LangChain", 22 | description="An extraction service powered by LangChain.", 23 | version="0.0.1", 24 | openapi_tags=[ 25 | { 26 | "name": "extraction", 27 | "description": "Operations related to extracting content from text.", 28 | } 29 | ], 30 | ) 31 | 32 | 33 | ROOT = Path(__file__).parent.parent 34 | 35 | ORIGINS = os.environ.get("CORS_ORIGINS", "").split(",") 36 | 37 | if ORIGINS: 38 | app.add_middleware( 39 | CORSMiddleware, 40 | allow_origins=ORIGINS, 41 | allow_credentials=True, 42 | allow_methods=["*"], 43 | allow_headers=["*"], 44 | ) 45 | 46 | 47 | @app.get("/ready") 48 | def ready() -> str: 49 | return "ok" 50 | 51 | 52 | # Include API endpoints for extractor definitions 53 | app.include_router(extractors.router) 54 | app.include_router(examples.router) 55 | app.include_router(extract.router) 56 | app.include_router(suggest.router) 57 | app.include_router(shared.router) 58 | app.include_router(configurables.router) 59 | 60 | add_routes( 61 | app, 62 | extraction_runnable.with_types( 63 | input_type=ExtractRequest, output_type=ExtractResponse 64 | ), 65 | path="/extract_text", 66 | enabled_endpoints=["invoke", "batch"], 67 | ) 68 | 69 | 70 | # Serve the frontend 71 | UI_DIR = str(ROOT / "ui") 72 | 73 | if os.path.exists(UI_DIR): 74 | app.mount("/", StaticFiles(directory=UI_DIR, html=True), name="ui") 75 | else: 76 | logger.warning("No UI directory found, serving API only.") 77 | 78 | 79 | if __name__ == "__main__": 80 | import uvicorn 81 | 82 | uvicorn.run(app, host="localhost", port=8000) 83 | -------------------------------------------------------------------------------- /backend/server/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | from langchain_anthropic import ChatAnthropic 5 | from langchain_core.language_models.chat_models import BaseChatModel 6 | from langchain_fireworks import ChatFireworks 7 | from langchain_groq import ChatGroq 8 | from langchain_openai import ChatOpenAI 9 | 10 | 11 | def get_supported_models(): 12 | """Get models according to environment secrets.""" 13 | models = {} 14 | if "OPENAI_API_KEY" in os.environ: 15 | models["gpt-3.5-turbo"] = { 16 | "chat_model": ChatOpenAI(model="gpt-3.5-turbo", temperature=0), 17 | "description": "GPT-3.5 Turbo", 18 | } 19 | if os.environ.get("DISABLE_GPT4", "").lower() != "true": 20 | models["gpt-4-0125-preview"] = { 21 | "chat_model": ChatOpenAI(model="gpt-4-0125-preview", temperature=0), 22 | "description": "GPT-4 0125 Preview", 23 | } 24 | if "FIREWORKS_API_KEY" in os.environ: 25 | models["fireworks"] = { 26 | "chat_model": ChatFireworks( 27 | model="accounts/fireworks/models/firefunction-v1", 28 | temperature=0, 29 | ), 30 | "description": "Fireworks Firefunction-v1", 31 | } 32 | if "TOGETHER_API_KEY" in os.environ: 33 | models["together-ai-mistral-8x7b-instruct-v0.1"] = { 34 | "chat_model": ChatOpenAI( 35 | base_url="https://api.together.xyz/v1", 36 | api_key=os.environ["TOGETHER_API_KEY"], 37 | model="mistralai/Mixtral-8x7B-Instruct-v0.1", 38 | temperature=0, 39 | ), 40 | "description": "Mixtral 8x7B Instruct v0.1 (Together AI)", 41 | } 42 | if "ANTHROPIC_API_KEY" in os.environ: 43 | models["claude-3-sonnet-20240229"] = { 44 | "chat_model": ChatAnthropic( 45 | model="claude-3-sonnet-20240229", 
temperature=0 46 | ), 47 | "description": "Claude 3 Sonnet", 48 | } 49 | if "GROQ_API_KEY" in os.environ: 50 | models["groq-llama3-8b-8192"] = { 51 | "chat_model": ChatGroq( 52 | model="llama3-8b-8192", 53 | temperature=0, 54 | ), 55 | "description": "GROQ Llama 3 8B", 56 | } 57 | 58 | return models 59 | 60 | 61 | SUPPORTED_MODELS = get_supported_models() 62 | DEFAULT_MODEL = "gpt-3.5-turbo" 63 | 64 | 65 | CHUNK_SIZES = { # in tokens, defaults to int(4_096 * 0.8). Override here. 66 | "gpt-4-0125-preview": int(128_000 * 0.8), 67 | } 68 | 69 | 70 | def get_chunk_size(model_name: str) -> int: 71 | """Get the chunk size.""" 72 | return CHUNK_SIZES.get(model_name, int(4_096 * 0.8)) 73 | 74 | 75 | def get_model(model_name: Optional[str] = None) -> BaseChatModel: 76 | """Get the model.""" 77 | if model_name is None: 78 | return SUPPORTED_MODELS[DEFAULT_MODEL]["chat_model"] 79 | else: 80 | supported_model_names = list(SUPPORTED_MODELS.keys()) 81 | if model_name not in supported_model_names: 82 | raise ValueError( 83 | f"Model {model_name} not found. " 84 | f"Supported models: {supported_model_names}" 85 | ) 86 | else: 87 | return SUPPORTED_MODELS[model_name]["chat_model"] 88 | -------------------------------------------------------------------------------- /backend/server/retrieval.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | from typing import Any, Dict, List, Optional 3 | 4 | from langchain.text_splitter import CharacterTextSplitter 5 | from langchain_community.vectorstores import FAISS 6 | from langchain_core.runnables import RunnableLambda 7 | from langchain_openai import OpenAIEmbeddings 8 | 9 | from db.models import Extractor 10 | from server.extraction_runnable import ( 11 | ExtractRequest, 12 | ExtractResponse, 13 | deduplicate, 14 | extraction_runnable, 15 | get_examples_from_extractor, 16 | ) 17 | 18 | 19 | def _make_extract_requests(input_dict: Dict[str, Any]) -> List[ExtractRequest]: 20 | docs = input_dict.pop("text") 21 | return [ExtractRequest(text=doc.page_content, **input_dict) for doc in docs] 22 | 23 | 24 | async def extract_from_content( 25 | content: str, 26 | extractor: Extractor, 27 | model_name: str, 28 | *, 29 | text_splitter_kwargs: Optional[Dict[str, Any]] = None, 30 | ) -> ExtractResponse: 31 | """Extract from potentially long-form content.""" 32 | if text_splitter_kwargs is None: 33 | text_splitter_kwargs = { 34 | "separator": "\n\n", 35 | "chunk_size": 1000, 36 | "chunk_overlap": 50, 37 | } 38 | text_splitter = CharacterTextSplitter(**text_splitter_kwargs) 39 | docs = text_splitter.create_documents([content]) 40 | doc_contents = [doc.page_content for doc in docs] 41 | 42 | vectorstore = FAISS.from_texts(doc_contents, embedding=OpenAIEmbeddings()) 43 | retriever = vectorstore.as_retriever() 44 | 45 | runnable = ( 46 | { 47 | "text": itemgetter("query") | retriever, 48 | "schema": itemgetter("schema"), 49 | "instructions": lambda x: x.get("instructions"), 50 | "examples": lambda x: x.get("examples"), 51 | "model_name": lambda x: x.get("model_name"), 52 | } 53 | | RunnableLambda(_make_extract_requests) 54 | | extraction_runnable.abatch 55 | ) 56 | schema = extractor.schema 57 | examples = get_examples_from_extractor(extractor) 58 | description = extractor.description # TODO: improve this 59 | result = await runnable.ainvoke( 60 | { 61 | "query": description, 62 | "schema": schema, 63 | "examples": examples, 64 | "instructions": extractor.instruction, 65 | "model_name": model_name, 66 | } 67 | 
) 68 | return deduplicate(result) 69 | -------------------------------------------------------------------------------- /backend/server/settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from sqlalchemy.engine import URL 6 | 7 | 8 | def get_postgres_url() -> URL: 9 | if "INSTANCE_UNIX_SOCKET" in os.environ: 10 | return URL.create( 11 | drivername="postgresql+psycopg2", 12 | username=os.environ.get("PG_USER", "langchain"), 13 | password=os.environ.get("PG_PASSWORD", "langchain"), 14 | database=os.environ.get("PG_DATABASE", "langchain"), 15 | query={ 16 | "host": os.environ["INSTANCE_UNIX_SOCKET"], 17 | }, 18 | ) 19 | 20 | url = URL.create( 21 | drivername="postgresql+psycopg2", 22 | username=os.environ.get("PG_USER", "langchain"), 23 | password=os.environ.get("PG_PASSWORD", "langchain"), 24 | host=os.environ.get("PG_HOST", "localhost"), 25 | database=os.environ.get("PG_DATABASE", "langchain"), 26 | port=5432, 27 | ) 28 | return url 29 | 30 | 31 | # Max concurrency used for extracting content from documents. 32 | # A long document is broken into smaller chunks; this controls 33 | # how many chunks are processed concurrently. 34 | MAX_CONCURRENCY = int(os.environ.get("MAX_CONCURRENCY", 1)) 35 | 36 | # Max number of chunks to process per document. 37 | # When a long document is split into chunks, this controls 38 | # how many of those chunks will be processed. 39 | # Set to 0 or negative to disable the max chunks limit. 40 | MAX_CHUNKS = int(os.environ.get("MAX_CHUNKS", -1)) 41 | -------------------------------------------------------------------------------- /backend/server/validators.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | from fastapi import HTTPException 4 | from jsonschema import exceptions 5 | from jsonschema.validators import Draft202012Validator 6 | 7 | 8 | def validate_json_schema(schema: Dict[str, Any]) -> None: 9 | """Validate a JSON schema.""" 10 | try: 11 | Draft202012Validator.check_schema(schema) 12 | except exceptions.ValidationError as e: 13 | raise HTTPException( 14 | status_code=422, detail=f"Not a valid JSON schema: {e.message}" 15 | ) 16 | -------------------------------------------------------------------------------- /backend/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/__init__.py -------------------------------------------------------------------------------- /backend/tests/db.py: -------------------------------------------------------------------------------- 1 | """Utility code that sets up a test database and client for tests.""" 2 | from contextlib import asynccontextmanager 3 | from typing import Generator 4 | 5 | from httpx import AsyncClient 6 | from sqlalchemy import URL, create_engine 7 | from sqlalchemy.orm import sessionmaker 8 | 9 | from db.models import Base, get_session 10 | from server.main import app 11 | 12 | url = URL.create( 13 | drivername="postgresql", 14 | username="langchain", 15 | password="langchain", 16 | host="localhost", 17 | database="langchain_test", 18 | port=5432, 19 | ) 20 | engine = create_engine(url) 21 | TestingSession = sessionmaker(bind=engine) 22 | 23 | 24 | def override_get_session() -> Generator[TestingSession, None, None]: 25 | """Override the get_session dependency 
with a test session. 26 | 27 | This fixture also re-creates the database before each test and drops it after to 28 | ensure a clean slate for each test. 29 | """ 30 | try: 31 | session = TestingSession() 32 | yield session 33 | finally: 34 | session.close() 35 | 36 | 37 | app.dependency_overrides[get_session] = override_get_session 38 | 39 | 40 | @asynccontextmanager 41 | async def get_async_client() -> AsyncClient: 42 | """Get an async client.""" 43 | # Clear the database before each test 44 | Base.metadata.drop_all(engine) 45 | Base.metadata.create_all(engine) 46 | 47 | async_client = AsyncClient(app=app, base_url="http://test") 48 | try: 49 | yield async_client 50 | finally: 51 | await async_client.aclose() 52 | -------------------------------------------------------------------------------- /backend/tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/integration_tests/__init__.py -------------------------------------------------------------------------------- /backend/tests/integration_tests/test_extraction.py: -------------------------------------------------------------------------------- 1 | """Makes it easy to run an integration test using a real chat model.""" 2 | from contextlib import asynccontextmanager 3 | from typing import Optional 4 | 5 | import httpx 6 | from fastapi import FastAPI 7 | from httpx import AsyncClient 8 | from langchain_core.pydantic_v1 import BaseModel 9 | 10 | from server.main import app 11 | 12 | 13 | @asynccontextmanager 14 | async def get_async_test_client( 15 | server: FastAPI, *, path: Optional[str] = None, raise_app_exceptions: bool = True 16 | ) -> AsyncClient: 17 | """Get an async client.""" 18 | url = "http://localhost:9999/" 19 | if path: 20 | url += path 21 | transport = httpx.ASGITransport( 22 | app=server, 23 | raise_app_exceptions=raise_app_exceptions, 24 | ) 25 | async_client = AsyncClient(app=server, base_url=url, transport=transport) 26 | try: 27 | yield async_client 28 | finally: 29 | await async_client.aclose() 30 | 31 | 32 | async def test_extraction_api() -> None: 33 | """Test the extraction API endpoint.""" 34 | 35 | class Person(BaseModel): 36 | age: Optional[int] 37 | name: Optional[str] 38 | alias: Optional[str] 39 | 40 | async with get_async_test_client(app) as client: 41 | text = """ 42 | My name is Chester. I am young. I love cats. I have two cats. My age 43 | is the number of cats I have to the power of 5. (Approximately.) 44 | I also have a friend. His name is Neo. He is older than me. He is 45 | also a cat lover. He has 3 cats. He is 25 years old. 46 | """ 47 | result = await client.post( 48 | "/extract_text/invoke", 49 | json={"input": {"text": text, "schema": Person.schema()}}, 50 | ) 51 | assert result.status_code == 200, result.text 52 | response_data = result.json() 53 | assert isinstance(response_data["output"]["data"], list) 54 | 55 | # Test with instructions 56 | result = await client.post( 57 | "/extract_text/invoke", 58 | json={ 59 | "input": { 60 | "text": text, 61 | "schema": Person.schema(), 62 | "instructions": "Very important: Chester's alias is Neo.", 63 | } 64 | }, 65 | ) 66 | response_data = result.json() 67 | assert result.status_code == 200, result.text 68 | 69 | # Test with few shot examples 70 | examples = [ 71 | { 72 | "text": "My name is Grung. 
I am 100.", 73 | "output": [Person(age=100, name="######").dict()], 74 | }, 75 | ] 76 | result = await client.post( 77 | "/extract_text/invoke", 78 | json={ 79 | "input": { 80 | "text": text, 81 | "schema": Person.schema(), 82 | "instructions": "Redact all names using the characters `######`", 83 | "examples": examples, 84 | } 85 | }, 86 | ) 87 | assert result.status_code == 200, result.text 88 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/api/__init__.py -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_configuration.py: -------------------------------------------------------------------------------- 1 | from tests.db import get_async_client 2 | 3 | 4 | async def test_configuration_api() -> None: 5 | """Test the configuration API.""" 6 | async with get_async_client() as client: 7 | response = await client.get("/configuration") 8 | assert response.status_code == 200 9 | result = response.json() 10 | assert isinstance(result, dict) 11 | assert sorted(result) == [ 12 | "accepted_mimetypes", 13 | "available_models", 14 | "max_chunks", 15 | "max_concurrency", 16 | "max_file_size_mb", 17 | "models", 18 | ] 19 | models = result["available_models"] 20 | assert all(isinstance(model_name, str) for model_name in models) 21 | assert "gpt-3.5-turbo" in models 22 | assert len(models) >= 2 23 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_defining_extractors.py: -------------------------------------------------------------------------------- 1 | """Code to test API endpoints.""" 2 | import uuid 3 | 4 | from tests.db import get_async_client 5 | 6 | 7 | async def test_extractors_api() -> None: 8 | """This will test a few of the extractors API endpoints.""" 9 | # First verify that the database is empty 10 | async with get_async_client() as client: 11 | user_id = str(uuid.uuid4()) 12 | headers = {"x-key": user_id} 13 | response = await client.get("/extractors", headers=headers) 14 | assert response.status_code == 200 15 | assert response.json() == [] 16 | 17 | # Verify that we can create an extractor 18 | create_request = { 19 | "description": "Test Description", 20 | "schema": {"type": "object"}, 21 | "instruction": "Test Instruction", 22 | } 23 | response = await client.post( 24 | "/extractors", json=create_request, headers=headers 25 | ) 26 | assert response.status_code == 200 27 | 28 | # Verify that the extractor was created 29 | response = await client.get("/extractors", headers=headers) 30 | assert response.status_code == 200 31 | get_response = response.json() 32 | assert len(get_response) == 1 33 | 34 | # Check headers 35 | bad_headers = {"x-key": str(uuid.uuid4())} 36 | bad_response = await client.get("/extractors", headers=bad_headers) 37 | assert bad_response.status_code == 200 38 | assert len(bad_response.json()) == 0 39 | 40 | # 
Check we need cookie to delete 41 | uuid_str = get_response[0]["uuid"] 42 | _ = uuid.UUID(uuid_str) # assert valid uuid 43 | await client.delete(f"/extractors/{uuid_str}", headers=bad_headers) 44 | # Check extractor was not deleted 45 | response = await client.get("/extractors", headers=headers) 46 | assert len(response.json()) == 1 47 | 48 | # Verify that we can delete an extractor 49 | _ = uuid.UUID(uuid_str) # assert valid uuid 50 | response = await client.delete(f"/extractors/{uuid_str}", headers=headers) 51 | assert response.status_code == 200 52 | 53 | get_response = await client.get("/extractors", headers=headers) 54 | assert get_response.status_code == 200 55 | assert get_response.json() == [] 56 | 57 | # Verify that we can create an extractor 58 | create_request = { 59 | "description": "Test Description", 60 | "schema": {"type": "object"}, 61 | "instruction": "Test Instruction", 62 | } 63 | response = await client.post( 64 | "/extractors", json=create_request, headers=headers 65 | ) 66 | assert response.status_code == 200 67 | 68 | # Verify that the extractor was created 69 | response = await client.get("/extractors", headers=headers) 70 | assert response.status_code == 200 71 | assert len(response.json()) == 1 72 | 73 | # Verify that we can delete an extractor 74 | get_response = response.json() 75 | uuid_str = get_response[0]["uuid"] 76 | _ = uuid.UUID(uuid_str) # assert valid uuid 77 | response = await client.delete(f"/extractors/{uuid_str}", headers=headers) 78 | assert response.status_code == 200 79 | 80 | get_response = await client.get("/extractors", headers=headers) 81 | assert get_response.status_code == 200 82 | assert get_response.json() == [] 83 | 84 | # Verify that we can create an extractor, including other properties 85 | user_id = str(uuid.uuid4()) 86 | create_request = { 87 | "name": "my extractor", 88 | "description": "Test Description", 89 | "schema": {"type": "object"}, 90 | "instruction": "Test Instruction", 91 | } 92 | response = await client.post( 93 | "/extractors", json=create_request, headers=headers 94 | ) 95 | extractor_uuid = response.json()["uuid"] 96 | assert response.status_code == 200 97 | response = await client.get(f"/extractors/{extractor_uuid}", headers=headers) 98 | response_data = response.json() 99 | assert extractor_uuid == response_data["uuid"] 100 | assert "my extractor" == response_data["name"] 101 | assert "user_id" not in response_data 102 | 103 | 104 | async def test_sharing_extractor() -> None: 105 | """Test sharing an extractor.""" 106 | async with get_async_client() as client: 107 | user_id = str(uuid.uuid4()) 108 | headers = {"x-key": user_id} 109 | response = await client.get("/extractors", headers=headers) 110 | assert response.status_code == 200 111 | assert response.json() == [] 112 | # Verify that we can create an extractor 113 | create_request = { 114 | "name": "Test Name", 115 | "description": "Test Description", 116 | "schema": {"type": "object"}, 117 | "instruction": "Test Instruction", 118 | } 119 | response = await client.post( 120 | "/extractors", json=create_request, headers=headers 121 | ) 122 | assert response.status_code == 200 123 | 124 | uuid_str = response.json()["uuid"] 125 | 126 | # Generate a share uuid 127 | response = await client.post(f"/extractors/{uuid_str}/share", headers=headers) 128 | assert response.status_code == 200 129 | assert "share_uuid" in response.json() 130 | share_uuid = response.json()["share_uuid"] 131 | 132 | # Test idempotency 133 | response = await 
client.post(f"/extractors/{uuid_str}/share", headers=headers) 134 | assert response.status_code == 200 135 | assert "share_uuid" in response.json() 136 | assert response.json()["share_uuid"] == share_uuid 137 | 138 | # Check headers 139 | bad_headers = {"x-key": str(uuid.uuid4())} 140 | response = await client.post( 141 | f"/extractors/{uuid_str}/share", headers=bad_headers 142 | ) 143 | assert response.status_code == 404 144 | 145 | # Check that we can retrieve the shared extractor 146 | response = await client.get(f"/shared/extractors/{share_uuid}") 147 | assert response.status_code == 200 148 | keys = sorted(response.json()) 149 | assert keys == ["description", "instruction", "name", "schema"] 150 | 151 | assert response.json() == { 152 | "description": "Test Description", 153 | "instruction": "Test Instruction", 154 | "name": "Test Name", 155 | "schema": {"type": "object"}, 156 | } 157 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_examples.py: -------------------------------------------------------------------------------- 1 | """Code to test API endpoints.""" 2 | import uuid 3 | 4 | from tests.db import get_async_client 5 | 6 | 7 | async def _list_extractors() -> list: 8 | async with get_async_client() as client: 9 | response = await client.get("/extractors") 10 | assert response.status_code == 200 11 | return response.json() 12 | 13 | 14 | async def test_examples_api() -> None: 15 | """Runs through a set of API calls to test the examples API.""" 16 | async with get_async_client() as client: 17 | # First create an extractor 18 | user_id = str(uuid.uuid4()) 19 | headers = {"x-key": user_id} 20 | create_request = { 21 | "description": "Test Description", 22 | "name": "Test Name", 23 | "schema": {"type": "object"}, 24 | "instruction": "Test Instruction", 25 | } 26 | response = await client.post( 27 | "/extractors", json=create_request, headers=headers 28 | ) 29 | assert response.status_code == 200 30 | # Get the extractor id 31 | extractor_id = response.json()["uuid"] 32 | 33 | # Let's verify that there are no examples 34 | response = await client.get( 35 | "/examples?extractor_id=" + extractor_id, headers=headers 36 | ) 37 | assert response.status_code == 200 38 | assert response.json() == [] 39 | 40 | # Now let's create an example 41 | create_request = { 42 | "extractor_id": extractor_id, 43 | "content": "Test Content", 44 | "output": [ 45 | { 46 | "age": 100, 47 | "name": "Grung", 48 | } 49 | ], 50 | } 51 | response = await client.post("/examples", json=create_request, headers=headers) 52 | assert response.status_code == 200 53 | example_id = response.json()["uuid"] 54 | 55 | # Check headers 56 | bad_headers = {"x-key": str(uuid.uuid4())} 57 | response = await client.post( 58 | "/examples", json=create_request, headers=bad_headers 59 | ) 60 | assert response.status_code == 404 61 | 62 | # Verify that the example was created 63 | response = await client.get( 64 | "/examples?extractor_id=" + extractor_id, headers=headers 65 | ) 66 | assert response.status_code == 200 67 | assert len(response.json()) == 1 68 | 69 | keys = ["content", "extractor_id", "output", "uuid"] 70 | projected_response = { 71 | key: record[key] for key in keys for record in response.json() 72 | } 73 | assert projected_response == { 74 | "content": "Test Content", 75 | "extractor_id": extractor_id, 76 | "output": [ 77 | { 78 | "age": 100, 79 | "name": "Grung", 80 | } 81 | ], 82 | "uuid": example_id, 83 | } 84 | 85 | # Check headers 86 | 
response = await client.get( 87 | "/examples?extractor_id=" + extractor_id, headers=bad_headers 88 | ) 89 | assert response.status_code == 404 90 | 91 | # Check we need cookie to delete 92 | response = await client.delete(f"/examples/{example_id}", headers=bad_headers) 93 | assert response.status_code == 404 94 | 95 | # Verify that we can delete an example 96 | response = await client.delete(f"/examples/{example_id}", headers=headers) 97 | assert response.status_code == 200 98 | 99 | # Verify that the example was deleted 100 | response = await client.get( 101 | "/examples?extractor_id=" + extractor_id, headers=headers 102 | ) 103 | assert response.status_code == 200 104 | assert response.json() == [] 105 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/api/test_api_extract.py: -------------------------------------------------------------------------------- 1 | """Code to test API endpoints.""" 2 | import tempfile 3 | from unittest.mock import patch 4 | from uuid import UUID, uuid4 5 | 6 | from langchain.text_splitter import CharacterTextSplitter 7 | from langchain_community.embeddings import FakeEmbeddings 8 | from langchain_core.runnables import RunnableLambda 9 | 10 | from tests.db import get_async_client 11 | 12 | 13 | def mock_extraction_runnable(*args, **kwargs): 14 | """Mock the extraction_runnable function.""" 15 | extract_request = args[0] 16 | return { 17 | "data": [ 18 | extract_request.text[:10], 19 | ] 20 | } 21 | 22 | 23 | def mock_text_splitter(*args, **kwargs): 24 | return CharacterTextSplitter() 25 | 26 | 27 | def mock_embeddings(*args, **kwargs): 28 | return FakeEmbeddings(size=10) 29 | 30 | 31 | @patch( 32 | "server.extraction_runnable.extraction_runnable", 33 | new=RunnableLambda(mock_extraction_runnable), 34 | ) 35 | @patch( 36 | "server.retrieval.extraction_runnable", 37 | new=RunnableLambda(mock_extraction_runnable), 38 | ) 39 | @patch("server.extraction_runnable.TokenTextSplitter", mock_text_splitter) 40 | @patch("server.retrieval.OpenAIEmbeddings", mock_embeddings) 41 | async def test_extract_from_file() -> None: 42 | """Test extract from file API.""" 43 | async with get_async_client() as client: 44 | user_id = str(uuid4()) 45 | headers = {"x-key": user_id} 46 | # Test with invalid extractor 47 | extractor_id = UUID(int=1027) # 1027 is a good number. 48 | response = await client.post( 49 | "/extract", 50 | data={ 51 | "extractor_id": str(extractor_id), 52 | "text": "Test Content", 53 | }, 54 | headers=headers, 55 | ) 56 | assert response.status_code == 404, response.text 57 | 58 | # First create an extractor 59 | create_request = { 60 | "name": "Test Name", 61 | "description": "Test Description", 62 | "schema": {"type": "object"}, 63 | "instruction": "Test Instruction", 64 | } 65 | response = await client.post( 66 | "/extractors", 67 | json=create_request, 68 | headers=headers, 69 | ) 70 | assert response.status_code == 200, response.text 71 | # Get the extractor id 72 | extractor_id = response.json()["uuid"] 73 | 74 | # Run an extraction. 75 | # We'll use multi-form data here. 
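# (Form fields rather than a JSON body, because the same request may also carry an optional file upload as multipart form data; see backend/server/api/extract.py.)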
76 | response = await client.post( 77 | "/extract", 78 | data={ 79 | "extractor_id": extractor_id, 80 | "text": "Test Content", 81 | "mode": "entire_document", 82 | }, 83 | headers=headers, 84 | ) 85 | assert response.status_code == 200 86 | assert response.json() == { 87 | "data": ["Test Conte"], 88 | "content_too_long": False, 89 | } 90 | 91 | # Vary chat model 92 | response = await client.post( 93 | "/extract", 94 | data={ 95 | "extractor_id": extractor_id, 96 | "text": "Test Content", 97 | "mode": "entire_document", 98 | "model_name": "gpt-3.5-turbo", 99 | }, 100 | headers=headers, 101 | ) 102 | assert response.status_code == 200 103 | assert response.json() == { 104 | "data": ["Test Conte"], 105 | "content_too_long": False, 106 | } 107 | 108 | # Test retrieval 109 | response = await client.post( 110 | "/extract", 111 | data={ 112 | "extractor_id": extractor_id, 113 | "text": "Test Content", 114 | "mode": "retrieval", 115 | }, 116 | headers=headers, 117 | ) 118 | assert response.status_code == 200 119 | assert response.json() == { 120 | "data": ["Test Conte"], 121 | } 122 | 123 | # We'll use multi-form data here. 124 | # Create a named temporary file 125 | with tempfile.NamedTemporaryFile(mode="w+t", delete=True) as f: 126 | f.write("This is a named temporary file.") 127 | f.seek(0) 128 | f.flush() 129 | response = await client.post( 130 | "/extract", 131 | data={ 132 | "extractor_id": extractor_id, 133 | "mode": "entire_document", 134 | }, 135 | files={"file": f}, 136 | headers=headers, 137 | ) 138 | 139 | assert response.status_code == 200, response.text 140 | assert response.json() == {"data": ["This is a "], "content_too_long": False} 141 | 142 | 143 | @patch( 144 | "server.extraction_runnable.extraction_runnable", 145 | new=RunnableLambda(mock_extraction_runnable), 146 | ) 147 | @patch("server.extraction_runnable.TokenTextSplitter", mock_text_splitter) 148 | async def test_extract_from_large_file() -> None: 149 | user_id = str(uuid4()) 150 | headers = {"x-key": user_id} 151 | async with get_async_client() as client: 152 | # First create an extractor 153 | create_request = { 154 | "name": "Test Name", 155 | "description": "Test Description", 156 | "schema": {"type": "object"}, 157 | "instruction": "Test Instruction", 158 | } 159 | response = await client.post( 160 | "/extractors", json=create_request, headers=headers 161 | ) 162 | assert response.status_code == 200, response.text 163 | # Get the extractor id 164 | extractor_id = response.json()["uuid"] 165 | 166 | # Test file size constraint 167 | with tempfile.NamedTemporaryFile(mode="w+t", delete=True) as f: 168 | f.write("This is a named temporary file.") 169 | f.seek(0) 170 | f.flush() 171 | with patch("extraction.parsing._get_file_size_in_mb", return_value=20): 172 | response = await client.post( 173 | "/extract", 174 | data={ 175 | "extractor_id": extractor_id, 176 | "mode": "entire_document", 177 | }, 178 | files={"file": f}, 179 | headers=headers, 180 | ) 181 | assert response.status_code == 413 182 | 183 | # Test chunk count constraint 184 | with tempfile.NamedTemporaryFile(mode="w+t", delete=True) as f: 185 | f.write("This is a named temporary file.") 186 | f.seek(0) 187 | f.flush() 188 | with patch("server.extraction_runnable.settings.MAX_CHUNKS", 1): 189 | with patch.object( 190 | CharacterTextSplitter, "split_text", return_value=["a", "b"] 191 | ): 192 | response = await client.post( 193 | "/extract", 194 | data={ 195 | "extractor_id": extractor_id, 196 | "mode": "entire_document", 197 | }, 198 | files={"file": f}, 199 
| headers=headers, 200 | ) 201 | assert response.status_code == 200 202 | assert response.json() == { 203 | "data": ["a"], 204 | "content_too_long": True, 205 | } 206 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OPENAI_API_KEY"] = "placeholder" 4 | os.environ["FIREWORKS_API_KEY"] = "placeholder" 5 | os.environ["TOGETHER_API_KEY"] = "placeholder" 6 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fake/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fake/__init__.py -------------------------------------------------------------------------------- /backend/tests/unit_tests/fake/chat_model.py: -------------------------------------------------------------------------------- 1 | """Fake Chat Model wrapper for testing purposes.""" 2 | from typing import Any, Iterator, List, Optional 3 | 4 | from langchain_core.callbacks.manager import ( 5 | CallbackManagerForLLMRun, 6 | ) 7 | from langchain_core.language_models.chat_models import BaseChatModel 8 | from langchain_core.messages import ( 9 | AIMessage, 10 | BaseMessage, 11 | ) 12 | from langchain_core.outputs import ChatGeneration, ChatResult 13 | 14 | 15 | class GenericFakeChatModel(BaseChatModel): 16 | """A generic fake chat model that can be used to test the chat model interface.""" 17 | 18 | messages: Iterator[AIMessage] 19 | """Get an iterator over messages. 20 | 21 | This can be expanded to accept other types like Callables / dicts / strings 22 | to make the interface more generic if needed. 23 | 24 | Note: if you want to pass a list, you can use `iter` to convert it to an iterator. 25 | 26 | Please note that streaming is not implemented yet. We should try to implement it 27 | in the future by delegating to invoke and then breaking the resulting output 28 | into message chunks. 
29 | """ 30 | 31 | def _generate( 32 | self, 33 | messages: List[BaseMessage], 34 | stop: Optional[List[str]] = None, 35 | run_manager: Optional[CallbackManagerForLLMRun] = None, 36 | **kwargs: Any, 37 | ) -> ChatResult: 38 | """Top Level call""" 39 | message = next(self.messages) 40 | generation = ChatGeneration(message=message) 41 | return ChatResult(generations=[generation]) 42 | 43 | @property 44 | def _llm_type(self) -> str: 45 | return "generic-fake-chat-model" 46 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fake/test_fake_chat_model.py: -------------------------------------------------------------------------------- 1 | """Tests for verifying that testing utility code works as expected.""" 2 | from itertools import cycle 3 | 4 | from langchain_core.messages import AIMessage 5 | 6 | from tests.unit_tests.fake.chat_model import GenericFakeChatModel 7 | 8 | 9 | class AnyStr(str): 10 | def __init__(self) -> None: 11 | super().__init__() 12 | 13 | def __eq__(self, other: object) -> bool: 14 | return isinstance(other, str) 15 | 16 | 17 | def test_generic_fake_chat_model_invoke() -> None: 18 | # Will alternate between responding with hello and goodbye 19 | infinite_cycle = cycle([AIMessage(content="hello"), AIMessage(content="goodbye")]) 20 | model = GenericFakeChatModel(messages=infinite_cycle) 21 | response = model.invoke("meow") 22 | assert response == AIMessage(content="hello", id=AnyStr()) 23 | response = model.invoke("kitty") 24 | assert response == AIMessage(content="goodbye", id=AnyStr()) 25 | response = model.invoke("meow") 26 | assert response == AIMessage(content="hello", id=AnyStr()) 27 | 28 | 29 | async def test_generic_fake_chat_model_ainvoke() -> None: 30 | # Will alternate between responding with hello and goodbye 31 | infinite_cycle = cycle([AIMessage(content="hello"), AIMessage(content="goodbye")]) 32 | model = GenericFakeChatModel(messages=infinite_cycle) 33 | response = await model.ainvoke("meow") 34 | assert response == AIMessage(content="hello", id=AnyStr()) 35 | response = await model.ainvoke("kitty") 36 | assert response == AIMessage(content="goodbye", id=AnyStr()) 37 | response = await model.ainvoke("meow") 38 | assert response == AIMessage(content="hello", id=AnyStr()) 39 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List 3 | 4 | HERE = Path(__file__).parent 5 | 6 | # PUBLIC API 7 | 8 | 9 | def get_sample_paths() -> List[Path]: 10 | """List all fixtures.""" 11 | return list(HERE.glob("sample.*")) 12 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.docx -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.epub 
-------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.html: -------------------------------------------------------------------------------- 1 |

🦜️ LangChain

Underline

Bold

Italics

Col 1

Col 2

Row 1

1

2

Row 2

3

4

Link: https://www.langchain.com/

  • Item 1
  • Item 2
  • Item 3
  • We also love cats 🐱

Image

-------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.odt -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/langchain-extract/3dcbd3a3ffb097d93e5808ee07d1774b5eb11b17/backend/tests/unit_tests/fixtures/sample.pdf -------------------------------------------------------------------------------- /backend/tests/unit_tests/fixtures/sample.txt: -------------------------------------------------------------------------------- 1 | 🦜️ LangChain 2 | 3 | 4 | 5 | 6 | Underline 7 | 8 | 9 | Bold 10 | 11 | 12 | Italics 13 | 14 | 15 | 16 | 17 | 18 | 19 | Col 1 20 | Col 2 21 | Row 1 22 | 1 23 | 2 24 | Row 2 25 | 3 26 | 4 27 | 28 | 29 | 30 | 31 | Link: https://www.langchain.com/ 32 | 33 | 34 | 35 | 36 | * Item 1 37 | * Item 2 38 | * Item 3 39 | * We also love cats 🐱 40 | 41 | 42 | Image -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_deduplication.py: -------------------------------------------------------------------------------- 1 | from server.extraction_runnable import ExtractResponse, deduplicate 2 | 3 | 4 | async def test_deduplication_different_results() -> None: 5 | """Test deduplication of extraction results.""" 6 | result = deduplicate( 7 | [ 8 | {"data": [{"name": "Chester", "age": 42}]}, 9 | {"data": [{"name": "Jane", "age": 42}]}, 10 | ] 11 | ) 12 | expected = ExtractResponse( 13 | data=[ 14 | {"name": "Chester", "age": 42}, 15 | {"name": "Jane", "age": 42}, 16 | ] 17 | ) 18 | assert expected == result 19 | 20 | result = deduplicate( 21 | [ 22 | { 23 | "data": [ 24 | {"field_1": 1, "field_2": "a"}, 25 | {"field_1": 2, "field_2": "b"}, 26 | ] 27 | }, 28 | { 29 | "data": [ 30 | {"field_1": 1, "field_2": "a"}, 31 | {"field_1": 2, "field_2": "c"}, 32 | ] 33 | }, 34 | ] 35 | ) 36 | 37 | expected = ExtractResponse( 38 | data=[ 39 | {"field_1": 1, "field_2": "a"}, 40 | {"field_1": 2, "field_2": "b"}, 41 | {"field_1": 2, "field_2": "c"}, 42 | ] 43 | ) 44 | assert expected == result 45 | 46 | # Test with data being a list of strings 47 | result = deduplicate([{"data": ["1", "2"]}, {"data": ["1", "3"]}]) 48 | expected = ExtractResponse(data=["1", "2", "3"]) 49 | assert expected == result 50 | 51 | # Test with data being a mix of integer and string 52 | result = deduplicate([{"data": [1, "2"]}, {"data": ["1", "3"]}]) 53 | expected = ExtractResponse(data=[1, "2", "1", "3"]) 54 | assert expected == result 55 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_parsing.py: -------------------------------------------------------------------------------- 1 | """Test parsing logic.""" 2 | import mimetypes 3 | 4 | from langchain.document_loaders import Blob 5 | 6 | from extraction.parsing import ( 7 | MIMETYPE_BASED_PARSER, 8 | SUPPORTED_MIMETYPES, 9 | ) 10 | from tests.unit_tests.fixtures import get_sample_paths 11 | 12 | 13 | def test_list_of_accepted_mimetypes() -> None: 14 | """This list should generally grow! 
Protecting against typos in mimetypes.""" 15 | assert SUPPORTED_MIMETYPES == [ 16 | # Two MS Word mimetypes are disabled for now 17 | # Need to install unstructured to enable them 18 | # "application/msword", 19 | "application/pdf", 20 | # "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 21 | "text/html", 22 | "text/plain", 23 | ] 24 | 25 | 26 | def test_attempt_to_parse_each_fixture() -> None: 27 | """Attempt to parse supported fixtures.""" 28 | seen_mimetypes = set() 29 | for path in get_sample_paths(): 30 | type_, _ = mimetypes.guess_type(path) 31 | if type_ not in SUPPORTED_MIMETYPES: 32 | continue 33 | seen_mimetypes.add(type_) 34 | blob = Blob.from_path(path) 35 | documents = MIMETYPE_BASED_PARSER.parse(blob) 36 | try: 37 | assert len(documents) == 1 38 | doc = documents[0] 39 | assert "source" in doc.metadata 40 | assert doc.metadata["source"] == str(path) 41 | assert "🦜" in doc.page_content 42 | except Exception as e: 43 | raise AssertionError(f"Failed to parse {path}") from e 44 | 45 | known_missing = {"application/msword"} 46 | assert set(SUPPORTED_MIMETYPES) - known_missing == seen_mimetypes 47 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_upload.py: -------------------------------------------------------------------------------- 1 | from extraction.parsing import _guess_mimetype 2 | from tests.unit_tests.fixtures import get_sample_paths 3 | 4 | 5 | async def test_mimetype_guessing() -> None: 6 | """Verify mimetype guessing for all fixtures.""" 7 | name_to_mime = {} 8 | for file in sorted(get_sample_paths()): 9 | data = file.read_bytes() 10 | name_to_mime[file.name] = _guess_mimetype(data) 11 | 12 | assert { 13 | "sample.docx": ( 14 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 15 | ), 16 | "sample.epub": "application/epub+zip", 17 | "sample.html": "text/html", 18 | "sample.odt": "application/vnd.oasis.opendocument.text", 19 | "sample.pdf": "application/pdf", 20 | "sample.rtf": "text/rtf", 21 | "sample.txt": "text/plain", 22 | } == name_to_mime 23 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from langchain.pydantic_v1 import BaseModel, Field 2 | from langchain_core.messages import AIMessage 3 | 4 | from extraction.utils import update_json_schema 5 | from server.extraction_runnable import ExtractionExample, _make_prompt_template 6 | 7 | 8 | def test_update_json_schema() -> None: 9 | """Test updating JSON schema.""" 10 | 11 | class Person(BaseModel): 12 | name: str = Field(..., description="The name of the person.") 13 | age: int = Field(..., description="The age of the person.") 14 | 15 | schema = Person.schema() 16 | 17 | assert schema == { 18 | "properties": { 19 | "age": { 20 | "description": "The age of the person.", 21 | "title": "Age", 22 | "type": "integer", 23 | }, 24 | "name": { 25 | "description": "The name of the person.", 26 | "title": "Name", 27 | "type": "string", 28 | }, 29 | }, 30 | "required": ["name", "age"], 31 | "title": "Person", 32 | "type": "object", 33 | } 34 | 35 | updated_schema = update_json_schema(schema) 36 | assert updated_schema == { 37 | "type": "object", 38 | "properties": { 39 | "data": { 40 | "type": "array", 41 | "items": { 42 | "title": "Person", 43 | "type": "object", 44 | "properties": { 45 | "name": { 46 | "title": "Name", 47 | "description": "The name of 
the person.", 48 | "type": "string", 49 | }, 50 | "age": { 51 | "title": "Age", 52 | "description": "The age of the person.", 53 | "type": "integer", 54 | }, 55 | }, 56 | "required": ["name", "age"], 57 | }, 58 | } 59 | }, 60 | "required": ["data"], 61 | "title": "extractor", 62 | "description": "Extract information matching the given schema.", 63 | } 64 | 65 | 66 | def test_make_prompt_template() -> None: 67 | """Test making a system message from instructions and examples.""" 68 | instructions = "Test instructions." 69 | examples = [ 70 | ExtractionExample( 71 | text="Test text.", 72 | output=[ 73 | {"name": "Test Name", "age": 0}, 74 | {"name": "Test Name 2", "age": 1}, 75 | ], 76 | ) 77 | ] 78 | prefix = ( 79 | "You are a top-tier algorithm for extracting information from text. " 80 | "Only extract information that is relevant to the provided text. " 81 | "If no information is relevant, use the schema and output " 82 | "an empty list where appropriate." 83 | ) 84 | prompt = _make_prompt_template(instructions, examples, "name") 85 | messages = prompt.messages 86 | assert 5 == len(messages) 87 | system = messages[0].prompt.template 88 | assert system.startswith(prefix) 89 | assert system.endswith(instructions) 90 | 91 | example_input = messages[1] 92 | assert example_input.content == "Test text." 93 | example_output = messages[2] 94 | assert isinstance(example_output, AIMessage) 95 | assert example_output.tool_calls 96 | assert len(example_output.tool_calls) == 1 97 | assert example_output.tool_calls[0]["name"] == "name" 98 | 99 | prompt = _make_prompt_template(instructions, None, "name") 100 | assert 2 == len(prompt.messages) 101 | 102 | prompt = _make_prompt_template(None, examples, "name") 103 | assert 5 == len(prompt.messages) 104 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/test_validators.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from server.validators import validate_json_schema 4 | 5 | 6 | def test_validate_json_schema() -> None: 7 | """Test validate_json_schema.""" 8 | # TODO: Validate more extensively to make sure that it actually validates 9 | # the schema as expected. 
10 | with pytest.raises(Exception): 11 | validate_json_schema({"type": "meow"}) 12 | 13 | with pytest.raises(Exception): 14 | validate_json_schema({"type": "str"}) 15 | 16 | validate_json_schema({"type": "string"}) 17 | -------------------------------------------------------------------------------- /backend/tests/unit_tests/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | from typing import Optional 3 | 4 | import httpx 5 | from fastapi import FastAPI 6 | from httpx import AsyncClient 7 | 8 | 9 | @asynccontextmanager 10 | async def get_async_test_client( 11 | server: FastAPI, *, path: Optional[str] = None, raise_app_exceptions: bool = True 12 | ) -> AsyncClient: 13 | """Get an async client.""" 14 | url = "http://localhost:9999/" 15 | if path: 16 | url += path 17 | transport = httpx.ASGITransport( 18 | app=server, 19 | raise_app_exceptions=raise_app_exceptions, 20 | ) 21 | async_client = AsyncClient(app=server, base_url=url, transport=transport) 22 | try: 23 | yield async_client 24 | finally: 25 | await async_client.aclose() 26 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | name: langchain-extract 2 | 3 | services: 4 | postgres: 5 | # Careful if bumping postgres version. 6 | # Make sure to keep in sync with CI 7 | # version if being tested on CI. 8 | image: postgres:16 9 | expose: 10 | - "5432" 11 | ports: 12 | - "5432:5432" 13 | environment: 14 | POSTGRES_DB: langchain 15 | POSTGRES_USER: langchain 16 | POSTGRES_PASSWORD: langchain 17 | healthcheck: 18 | test: ["CMD-SHELL", "pg_isready -U langchain -d langchain -W langchain"] 19 | interval: 10s 20 | timeout: 5s 21 | retries: 5 22 | volumes: 23 | - postgres_data:/var/lib/postgresql/data 24 | 25 | backend: 26 | build: 27 | context: . 28 | dockerfile: ./backend/Dockerfile 29 | target: development 30 | env_file: 31 | - .local.env 32 | environment: 33 | - PG_HOST=postgres 34 | # Define CORS origins for dev work on UI 35 | - CORS_ORIGINS=http://localhost:3000 36 | ports: 37 | - "8000:8000" # Backend is accessible on localhost:8000 38 | depends_on: 39 | - postgres 40 | volumes: 41 | - ./backend:/backend 42 | 43 | frontend: 44 | build: 45 | context: ./frontend 46 | dockerfile: ./Dockerfile 47 | target: development 48 | ports: 49 | - "3000:3000" 50 | environment: 51 | - NODE_ENV=development 52 | volumes: 53 | - ./frontend:/app 54 | - /app/node_modules 55 | depends_on: 56 | - backend 57 | 58 | volumes: 59 | postgres_data: 60 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | import pathlib 20 | import sys 21 | from typing import List 22 | 23 | import toml 24 | 25 | ROOT_FOLDER = str(pathlib.Path(__file__).parent.parent.parent) 26 | 27 | # Add the project root to the path 28 | sys.path.insert(0, ROOT_FOLDER) 29 | 30 | with open("../../pyproject.toml") as f: 31 | data = toml.load(f) 32 | 33 | project = "LangChain Extract" 34 | copyright = "2024, Langchain AI" 35 | author = "Langchain AI" 36 | 37 | version = data["tool"]["poetry"]["version"] 38 | release = version 39 | 40 | html_title = project + " " + version 41 | 42 | 43 | # -- General configuration --------------------------------------------------- 44 | 45 | # Add any Sphinx extension module names here, as strings. They can be 46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 47 | # ones. 48 | extensions = [ 49 | "sphinx.ext.autodoc", 50 | "sphinx.ext.autodoc.typehints", 51 | "sphinx.ext.autosummary", 52 | "sphinx.ext.napoleon", 53 | "sphinx.ext.viewcode", 54 | "myst_nb", 55 | "sphinx_copybutton", 56 | "IPython.sphinxext.ipython_console_highlighting", 57 | ] 58 | source_suffix = [".ipynb", ".html", ".md", ".rst"] 59 | 60 | # Add any paths that contain templates here, relative to this directory. 
61 | templates_path = ["_templates"] 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path. 66 | exclude_patterns: List[str] = [] 67 | 68 | 69 | # -- Options for HTML output ------------------------------------------------- 70 | 71 | # The theme to use for HTML and HTML Help pages. See the documentation for 72 | # a list of builtin themes. 73 | # 74 | html_theme = "sphinx_book_theme" 75 | 76 | html_theme_options = { 77 | "path_to_docs": "docs/source", 78 | "repository_url": "https://github.com/langchain-ai/langchain-extract", 79 | "home_page_in_toc": True, 80 | "show_navbar_depth": 2, 81 | "use_sidenotes": True, 82 | "use_repository_button": True, 83 | "use_issues_button": True, 84 | "use_source_button": True, 85 | "use_fullscreen_button": True, 86 | "repository_branch": "main", 87 | "launch_buttons": { 88 | "notebook_interface": "jupyterlab", 89 | "colab_url": "https://colab.research.google.com", 90 | }, 91 | } 92 | 93 | html_context = { 94 | "display_github": True, # Integrate GitHub 95 | "github_user": "langchain-ai", # Username 96 | "github_repo": "langchain-extract", # Repo name 97 | "github_version": "main", # Version 98 | "conf_py_path": "/docs/", # Path in the checkout to the docs root 99 | } 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # These paths are either relative to html_static_path 107 | # or fully qualified paths (eg. https://...) 108 | html_css_files = [ 109 | "css/custom.css", 110 | ] 111 | 112 | nb_execution_mode = "off" 113 | autosummary_generate = True -------------------------------------------------------------------------------- /docs/source/notebooks/earnings_call_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1549a9c6-cca7-4028-9f0c-80ee3aa1d4b4", 6 | "metadata": {}, 7 | "source": [ 8 | "# Example: extracting structured data from earnings call transcripts\n", 9 | "\n", 10 | "Most public companies host earnings calls, providing their management opportunities to discuss past financial results and future plans. Natural language transcripts of these calls may contain useful information, but often this information must first be extracted from the document and arranged into a structured form so that it can be analyzed or compared across time periods and other companies.\n", 11 | "\n", 12 | "Here we demonstrate the use of an LLM-powered extraction service to extract information from Uber's Q4 2023 earnings call.
We show the importance of incorporating few-shot learning for accurate extraction in a real-world context.\n", 13 | "\n", 14 | "Uber investor relations makes the prepared remarks for the call available [online](https://s23.q4cdn.com/407969754/files/doc_earnings/2023/q4/transcript/Uber-Q4-23-Prepared-Remarks.pdf).\n", 15 | "\n", 16 | "First we start our local extraction service, as described in the [README](../../../README.md), and download the PDF document:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "id": "589ea131-e6ae-4605-8c8f-3ccb0f643477", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import requests\n", 27 | "\n", 28 | "url = \"http://localhost:8000\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "d7d935fe-4642-4c55-bba4-6dfb8191e4bd", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Uber transcripts from earnings calls and other events at https://investor.uber.com/news-events/default.aspx\n", 39 | "\n", 40 | "pdf_url = \"https://s23.q4cdn.com/407969754/files/doc_earnings/2023/q4/transcript/Uber-Q4-23-Prepared-Remarks.pdf\"" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "c704a00e-f663-4bce-b482-984278dad8f1", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Get PDF bytes\n", 51 | "\n", 52 | "pdf_response = requests.get(pdf_url)\n", 53 | "assert(pdf_response.status_code == 200)\n", 54 | "pdf_bytes = pdf_response.content" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "0a10cf8b-f05e-424b-9a72-fb4abfb9e091", 60 | "metadata": {}, 61 | "source": [ 62 | "We next specify the schema of what we intend to extract. Here we specify a record of financial data. We allow the LLM to infer various attributes, such as the time period for the record.\n", 63 | "\n", 64 | "Note that we include an `evidence` attribute, which provides context for the predictions and supports downstream verification of the results.\n", 65 | "\n", 66 | "Once we've defined our schema, we create an extractor by posting it to our database."
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "id": "3d5a5bb0-4284-4706-98e6-e622bcc3778d", 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "" 79 | ] 80 | }, 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "from uuid import uuid4\n", 88 | "\n", 89 | "from pydantic import BaseModel, Field\n", 90 | "\n", 91 | "class FinancialData(BaseModel):\n", 92 | " name: str = Field(..., description=\"Name of the financial figure, such as revenue.\")\n", 93 | " value: int = Field(..., description=\"Nominal earnings in local currency.\")\n", 94 | " scale: str = Field(..., description=\"Scale of figure, such as MM, B, or percent.\")\n", 95 | " period_start: str = Field(..., description=\"The start of the time period in ISO format.\")\n", 96 | " period_duration: int = Field(..., description=\"Duration of period, in months\")\n", 97 | " evidence: str = Field(..., description=\"Verbatim sentence of text where figure was found.\")\n", 98 | "\n", 99 | "user_id = str(uuid4())\n", 100 | "headers = {\"x-key\": user_id}\n", 101 | "\n", 102 | "data = {\n", 103 | " \"user_id\": user_id,\n", 104 | " \"description\": \"Financial revenues and other figures.\",\n", 105 | " \"schema\": FinancialData.schema(),\n", 106 | " \"instruction\": (\n", 107 | " \"Extract standard financial figures, specifically earnings and \"\n", 108 | " \"revenue figures.\"\n", 109 | " )\n", 110 | "}\n", 111 | "\n", 112 | "response = requests.post(f\"{url}/extractors\", json=data, headers=headers)\n", 113 | "response" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "id": "74b7f3a4-07c1-4cf0-8c75-34d22eb5a661", 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "{'uuid': '151db8c9-ec49-4c6c-a13d-b5335ede8cbb'}\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "extractor = response.json()\n", 132 | "print(extractor)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "fc18f21c-9f73-4f9e-b63d-7d0a198208c9", 138 | "metadata": {}, 139 | "source": [ 140 | "We can now try the extractor on our PDF:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "id": "15a1c7e9-3fcd-42ca-88fb-4802fe841a8d", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "result = requests.post(\n", 162 | " f\"{url}/extract\",\n", 163 | " data={\"extractor_id\": extractor[\"uuid\"]},\n", 164 | " files={\"file\": pdf_bytes},\n", 165 | " headers=headers,\n", 166 | ")\n", 167 | "\n", 168 | "result" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 7, 174 | "id": "894ad738-5f25-4791-a2a6-365d39b583b4", 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "{'data': [{'name': 'Adjusted EBITDA',\n", 181 | " 'scale': 'million',\n", 182 | " 'value': 1300,\n", 183 | " 'evidence': 'Q4 was a standout quarter to cap off a standout year... 
translated to $1.3 billion in Adjusted EBITDA',\n", 184 | " 'period_start': '2023-10-01',\n", 185 | " 'period_duration': 3},\n", 186 | " {'name': 'GAAP operating income',\n", 187 | " 'scale': 'million',\n", 188 | " 'value': 652,\n", 189 | " 'evidence': 'translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income',\n", 190 | " 'period_start': '2023-10-01',\n", 191 | " 'period_duration': 3},\n", 192 | " {'name': 'Gross Bookings',\n", 193 | " 'scale': 'billion',\n", 194 | " 'value': 37.6,\n", 195 | " 'evidence': 'Gross Bookings of $37.6 billion',\n", 196 | " 'period_start': '2023-10-01',\n", 197 | " 'period_duration': 3},\n", 198 | " {'name': 'Revenue',\n", 199 | " 'scale': 'billion',\n", 200 | " 'value': 9.9,\n", 201 | " 'evidence': 'we grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion',\n", 202 | " 'period_start': '2023-10-01',\n", 203 | " 'period_duration': 3},\n", 204 | " {'name': 'Adjusted EBITDA',\n", 205 | " 'scale': '$',\n", 206 | " 'value': 1260000000,\n", 207 | " 'evidence': 'We expect Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n", 208 | " 'period_start': '2023-01-01',\n", 209 | " 'period_duration': 12}]}" 210 | ] 211 | }, 212 | "execution_count": 7, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "result.json()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "cdcf8c31-bbb4-40df-b94e-dcda32b975aa", 224 | "metadata": {}, 225 | "source": [ 226 | "We've extracted several records capturing various earnings and revenue figures, and have conformed the records to the desired schema.\n", 227 | "\n", 228 | "We can convey additional instructions to the LLM efficiently via few-shot examples. For example, we can specify how the names of financial metrics should be normalized, or how scales (millions, billions, percentages, etc.) should be represented in different cases.\n", 229 | "\n", 230 | "The `examples` endpoint lets us associate few-shot examples with an extractor. 
We can specify examples by pairing text inputs with lists of `FinancialData` outputs:" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 8, 236 | "id": "b07d78e9-23d5-49e8-ae6c-e3d4aaca2d4d", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "examples = [\n", 241 | " {\n", 242 | " \"text\": \"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", 243 | " \"output\": [\n", 244 | " FinancialData(\n", 245 | " name=\"revenue\",\n", 246 | " value=1,\n", 247 | " scale=\"MM\",\n", 248 | " period_start=\"2022-01-01\",\n", 249 | " period_duration=12,\n", 250 | " evidence=\"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", 251 | " ).dict(),\n", 252 | " FinancialData(\n", 253 | " name=\"ebit\",\n", 254 | " value=2,\n", 255 | " scale=\"MM\",\n", 256 | " period_start=\"2022-01-01\",\n", 257 | " period_duration=12,\n", 258 | " evidence=\"In 2022, Revenue was $1 million and EBIT was $2M.\",\n", 259 | " ).dict()\n", 260 | " ],\n", 261 | " },\n", 262 | "]\n", 263 | "\n", 264 | "responses = []\n", 265 | "for example in examples:\n", 266 | " create_request = {\n", 267 | " \"extractor_id\": extractor[\"uuid\"],\n", 268 | " \"content\": example[\"text\"],\n", 269 | " \"output\": example['output'],\n", 270 | " }\n", 271 | " response = requests.post(f\"{url}/examples\", json=create_request, headers=headers)\n", 272 | " responses.append(response)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "id": "271a90c0-d320-4e35-9317-1ea08c5dde15", 278 | "metadata": {}, 279 | "source": [ 280 | "Having posted the examples, we can re-run the extraction:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 9, 286 | "id": "efc8041e-e3ca-4705-8d34-7f9b93b1400c", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "" 293 | ] 294 | }, 295 | "execution_count": 9, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "result = requests.post(\n", 302 | " f\"{url}/extract\",\n", 303 | " data={\"extractor_id\": extractor[\"uuid\"]},\n", 304 | " files={\"file\": pdf_bytes},\n", 305 | " headers=headers,\n", 306 | ")\n", 307 | "\n", 308 | "result" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 10, 314 | "id": "2101f960-6abd-4fef-9945-bafb449d5435", 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "{'data': [{'name': 'adjusted ebitda',\n", 321 | " 'scale': 'MM',\n", 322 | " 'value': 1300,\n", 323 | " 'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n", 324 | " 'period_start': '2023-10-01',\n", 325 | " 'period_duration': 3},\n", 326 | " {'name': 'revenue',\n", 327 | " 'scale': 'MM',\n", 328 | " 'value': 9900,\n", 329 | " 'evidence': 'We grew our revenue by 13% YoY on a constant-currency basis to $9.9 billion.',\n", 330 | " 'period_start': '2023-10-01',\n", 331 | " 'period_duration': 3},\n", 332 | " {'name': 'gaap operating income',\n", 333 | " 'scale': 'MM',\n", 334 | " 'value': 652,\n", 335 | " 'evidence': 'These strong top-line trends, combined with continued rigor on costs, translated to $1.3 billion in Adjusted EBITDA and $652 million in GAAP operating income.',\n", 336 | " 'period_start': '2023-10-01',\n", 337 | " 'period_duration': 3},\n", 338 | " {'name': 'adjusted ebitda',\n", 339 | " 'scale': 'B',\n", 340 | " 'value': 1260,\n", 341 | " 'evidence': 'We expect 
Adjusted EBITDA of $1.26 billion to $1.34 billion.',\n", 342 | " 'period_start': '2023-01-01',\n", 343 | " 'period_duration': 12}]}" 344 | ] 345 | }, 346 | "execution_count": 10, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "result.json()" 353 | ] 354 | } 355 | ], 356 | "metadata": { 357 | "kernelspec": { 358 | "display_name": "Python 3 (ipykernel)", 359 | "language": "python", 360 | "name": "python3" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.10.4" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 5 377 | } 378 | -------------------------------------------------------------------------------- /docs/source/notebooks/quick_start.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7e43ed67-9fbb-4d6c-9a5d-8c4addeb2ed5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Client Example" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "b123c960-a0b4-4d5e-b15f-729de23974f5", 15 | "metadata": { 16 | "tags": [] 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from langserve import RemoteRunnable" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "19dafdeb-63c5-4218-b0f9-fc20754369be", 27 | "metadata": { 28 | "tags": [] 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from typing import Optional, List\n", 33 | "from pydantic import BaseModel, Field\n", 34 | "\n", 35 | "class Person(BaseModel):\n", 36 | " age: Optional[int] = Field(None, description=\"The age of the person in years.\")\n", 37 | " name: Optional[str] = Field(None, description=\"The name of the person.\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "id": "bf79ef88-b816-46aa-addf-9366b7ebdcaf", 44 | "metadata": { 45 | "tags": [] 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "runnable = RemoteRunnable(\"http://localhost:8000/extract_text/\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "5f102a5c-a80c-4480-863b-30f3aaad5afe", 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "text = \"\"\"\n", 62 | "My name is Chester. I am 42 years old. 
My friend Jane is a year older than me.\n", 63 | "\"\"\"" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "id": "553d7dbc-9117-4834-83b1-11e28a513170", 70 | "metadata": { 71 | "tags": [] 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "{'data': [{'name': 'Chester', 'age': 42}]}" 78 | ] 79 | }, 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "response = runnable.invoke({\"text\": text, \"schema\": Person.schema()})\n", 87 | "response" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "id": "c70d8d7c-5f0b-4757-92b7-cdd40f351275", 93 | "metadata": {}, 94 | "source": [ 95 | "Add instructions:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "id": "97294409-6daf-418d-9cbe-f44946245e35", 102 | "metadata": { 103 | "tags": [] 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "{'data': [{'name': 'Chester', 'age': 42}]}" 110 | ] 111 | }, 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "instructions = \"Redact all names using the characters `######`\"\n", 119 | "\n", 120 | "response = runnable.invoke(\n", 121 | " {\n", 122 | " \"text\": text,\n", 123 | " \"schema\": Person.schema(),\n", 124 | " \"instructions\": instructions,\n", 125 | " }\n", 126 | ")\n", 127 | "response" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "24b4a123-7841-465b-b43b-1db439c45fa7", 133 | "metadata": {}, 134 | "source": [ 135 | "Add few-shot examples:" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "id": "bae9416d-abd4-4b41-90c2-3144c8566483", 142 | "metadata": { 143 | "tags": [] 144 | }, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "{'data': [{'name': '######', 'age': 42}, {'name': 'Jane', 'age': 43}]}" 150 | ] 151 | }, 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "instructions = \"Redact all names using the characters `######`\"\n", 159 | "examples = [\n", 160 | " {\n", 161 | " \"text\": \"My name is Grung. 
I am 100.\",\n", 162 | " \"output\": [ {\"age\": 100, \"name\": \"######\", \"hello\": \"meow\"}] ,\n", 163 | " }\n", 164 | "]\n", 165 | "\n", 166 | "response = runnable.invoke(\n", 167 | " {\n", 168 | " \"text\": text,\n", 169 | " \"schema\": Person.schema(),\n", 170 | " \"instructions\": instructions,\n", 171 | " \"examples\": examples,\n", 172 | " }\n", 173 | ")\n", 174 | "response" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "83244e1a-7d4a-489d-b88c-b4e35ac76001", 180 | "metadata": {}, 181 | "source": [ 182 | "## Persist extractors" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "id": "7fc60d58-edfd-4d2b-a71f-fc5e9c6ab58b", 189 | "metadata": { 190 | "tags": [] 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "import requests\n", 195 | "from uuid import uuid4" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "id": "b1124672-ee4b-484a-be07-16687bb229e3", 202 | "metadata": { 203 | "tags": [] 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "url = \"http://localhost:8000\"\n", 208 | "user_id = str(uuid4()) # indicates owner for extractor\n", 209 | "headers = {\"x-key\": user_id}" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 10, 215 | "id": "8a491b3e-999f-4f88-87f0-a282d582ef18", 216 | "metadata": { 217 | "tags": [] 218 | }, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/plain": [ 223 | "" 224 | ] 225 | }, 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "data = {\n", 233 | " \"name\": \"people_extractor\",\n", 234 | " \"description\": \"Extract references to people, having properties name and age.\",\n", 235 | " \"schema\": Person.schema(),\n", 236 | " \"instruction\": \"Redact all names using the characters `######`\",\n", 237 | "}\n", 238 | "\n", 239 | "response = requests.post(f\"{url}/extractors\", json=data, headers=headers)\n", 240 | "response" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "id": "1cf96c62-5653-4955-87ad-48ca009252d0", 247 | "metadata": { 248 | "tags": [] 249 | }, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "fa60ccce-5637-41b4-ba1a-085a56d0fa5b\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "uuid = response.json()['uuid']\n", 261 | "print(uuid)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "c7ddd7ea-31b6-49ad-a89a-8d5d7efa5f22", 267 | "metadata": {}, 268 | "source": [ 269 | "### Add examples" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "id": "d17f0283-a517-497f-9838-ade72f2e6359", 276 | "metadata": { 277 | "tags": [] 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "import json\n", 282 | "\n", 283 | "examples = [\n", 284 | " {\n", 285 | " \"text\": \"My name is Grung. 
I am 100.\",\n", 286 | " \"output\": [Person(age=100, name=\"######\").dict()],\n", 287 | " }\n", 288 | "]\n", 289 | "\n", 290 | "responses = []\n", 291 | "for example in examples:\n", 292 | " create_request = {\n", 293 | " \"extractor_id\": uuid,\n", 294 | " \"content\": example[\"text\"],\n", 295 | " \"output\": example['output'],\n", 296 | " }\n", 297 | " response = requests.post(f\"{url}/examples\", json=create_request, headers=headers)\n", 298 | " responses.append(response)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 13, 304 | "id": "eb0b90d9-090a-4b84-9e31-e6c9f92de6ea", 305 | "metadata": { 306 | "tags": [] 307 | }, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "" 313 | ] 314 | }, 315 | "execution_count": 13, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "response = requests.get(f\"{url}/examples?extractor_id={uuid}\", headers=headers)\n", 322 | "response" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 14, 328 | "id": "7b82eb80-bc44-455d-a8df-28616f94885d", 329 | "metadata": { 330 | "tags": [] 331 | }, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "[{'extractor_id': 'fa60ccce-5637-41b4-ba1a-085a56d0fa5b',\n", 337 | " 'content': 'My name is Grung. I am 100.',\n", 338 | " 'created_at': '2024-03-22T12:10:32.862261',\n", 339 | " 'updated_at': '2024-03-22T12:10:32.862265',\n", 340 | " 'output': [{'age': 100, 'name': '######'}],\n", 341 | " 'uuid': '94c8a41a-7d33-4795-a6be-23e72fe6c4e4'}]" 342 | ] 343 | }, 344 | "execution_count": 14, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "response.json()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "id": "1dcf03bf-0b71-4c1f-a88a-bce090762d2c", 356 | "metadata": {}, 357 | "source": [ 358 | "### Extract using persisted extractor" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 15, 364 | "id": "d5f84d0c-3d06-4d76-b9e5-c68e659ef930", 365 | "metadata": { 366 | "tags": [] 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "{'extractor_id': 'fa60ccce-5637-41b4-ba1a-085a56d0fa5b', 'text': '\\nMy name is Chester. I am 42 years old. 
My friend Jane is a year older than me.\\n'}\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "request_data = {\"extractor_id\": uuid, \"text\": text}\n", 379 | "print(request_data)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "id": "d2bc2481-0dca-42aa-b3a7-d193721e149e", 386 | "metadata": { 387 | "tags": [] 388 | }, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "" 394 | ] 395 | }, 396 | "execution_count": 16, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "response = requests.post(f\"{url}/extract\", data=request_data, headers=headers)\n", 403 | "response" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 17, 409 | "id": "f389da36-5b43-41f5-b0a6-a389c6303937", 410 | "metadata": { 411 | "tags": [] 412 | }, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "'{\"data\":[{\"name\":\"######\",\"age\":42},{\"name\":\"######\",\"age\":43}]}'" 418 | ] 419 | }, 420 | "execution_count": 17, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "response.text" 427 | ] 428 | } 429 | ], 430 | "metadata": { 431 | "kernelspec": { 432 | "display_name": "Python 3 (ipykernel)", 433 | "language": "python", 434 | "name": "python3" 435 | }, 436 | "language_info": { 437 | "codemirror_mode": { 438 | "name": "ipython", 439 | "version": 3 440 | }, 441 | "file_extension": ".py", 442 | "mimetype": "text/x-python", 443 | "name": "python", 444 | "nbconvert_exporter": "python", 445 | "pygments_lexer": "ipython3", 446 | "version": "3.10.4" 447 | } 448 | }, 449 | "nbformat": 4, 450 | "nbformat_minor": 5 451 | } 452 | -------------------------------------------------------------------------------- /docs/source/toc.segment: -------------------------------------------------------------------------------- 1 | ```{toctree} 2 | :maxdepth: 2 3 | :caption: Introduction 4 | 5 | ./notebooks/getting_started 6 | ``` 7 | -------------------------------------------------------------------------------- /frontend/.env.example: -------------------------------------------------------------------------------- 1 | # Only set for non development builds. 
2 | # Development builds default to `http://localhost:8000` 3 | NEXT_PUBLIC_BASE_API_URL=https://example.com -------------------------------------------------------------------------------- /frontend/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": ["react", "@typescript-eslint", "eslint-plugin-import"], 3 | "env": { 4 | "es2021": true, 5 | "commonjs": true, 6 | "es6": true, 7 | "node": true, 8 | "browser": true 9 | }, 10 | "globals": { 11 | "window": true, 12 | "process": true 13 | }, 14 | "extends": [ 15 | "plugin:react/recommended", 16 | "plugin:react/jsx-runtime", 17 | "plugin:@typescript-eslint/eslint-recommended", 18 | "plugin:@typescript-eslint/recommended", 19 | "plugin:@typescript-eslint/recommended" 20 | ], 21 | "parserOptions": { 22 | "ecmaFeatures": { 23 | "jsx": true 24 | }, 25 | "ecmaVersion": 12, 26 | "sourceType": "module" 27 | }, 28 | "rules": { 29 | "import/no-extraneous-dependencies": "error", 30 | "no-underscore-dangle": [ 31 | "error", 32 | { 33 | "allow": ["__typename"] 34 | } 35 | ], 36 | "react/display-name": "off", 37 | "react/prop-types": "off", 38 | "@typescript-eslint/class-methods-use-this": [ 39 | "error", 40 | { 41 | "ignoreOverrideMethods": true 42 | } 43 | ], 44 | "@typescript-eslint/consistent-type-assertions": [ 45 | "error", 46 | { 47 | "assertionStyle": "never" 48 | } 49 | ], 50 | "@typescript-eslint/default-param-last": "error", 51 | "@typescript-eslint/no-empty-function": "error", 52 | "@typescript-eslint/no-explicit-any": "error", 53 | "@typescript-eslint/no-shadow": "error", 54 | "@typescript-eslint/no-unused-vars": [ 55 | "error", 56 | { 57 | "argsIgnorePattern": "^_", 58 | "ignoreRestSiblings": true 59 | } 60 | ], 61 | "@typescript-eslint/no-useless-constructor": "error", 62 | "camelcase": "off", 63 | "class-methods-use-this": "off", 64 | "default-case": "off", 65 | "default-param-last": "off", 66 | "import/extensions": "off", 67 | "import/prefer-default-export": "off", 68 | "import/order": "error", 69 | "linebreak-style": ["error", "unix"], 70 | "max-len": [ 71 | "warn", 72 | { 73 | "code": 80, 74 | "tabWidth": 2, 75 | "comments": 80, 76 | "ignoreComments": false, 77 | "ignoreTrailingComments": true, 78 | "ignoreUrls": true, 79 | "ignoreStrings": true, 80 | "ignoreTemplateLiterals": true, 81 | "ignoreRegExpLiterals": true 82 | } 83 | ], 84 | "no-console": ["warn", { "allow": ["warn", "error", "debug"] }], 85 | "no-empty-function": "off", 86 | "no-plusplus": [ 87 | "error", 88 | { 89 | "allowForLoopAfterthoughts": true 90 | } 91 | ] 92 | }, 93 | "settings": { 94 | "import/resolver": { 95 | "typescript": {} 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | /.next/ 13 | /out/ 14 | 15 | # production 16 | /build 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | 27 | # local env files 28 | .env*.local 29 | 30 | # vercel 31 | .vercel 32 | 33 | # typescript 34 | *.tsbuildinfo 35 | next-env.d.ts 36 | 37 | .yarn/ 38 | 39 | # lint 40 | .eslintcache -------------------------------------------------------------------------------- /frontend/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "endOfLine": "lf" 3 | } 4 | -------------------------------------------------------------------------------- /frontend/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | -------------------------------------------------------------------------------- /frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:18-alpine AS base 2 | 3 | FROM base AS base-deps 4 | WORKDIR /app 5 | 6 | COPY --link ./yarn.lock ./package.json ./.yarnrc.yml ./ 7 | 8 | FROM base AS installer 9 | WORKDIR /app 10 | 11 | COPY --link --from=base-deps /app/package.json ./package.json 12 | COPY --link --from=base-deps /app/yarn.lock ./yarn.lock 13 | COPY --link .yarnrc.yml . 14 | RUN yarn install 15 | 16 | FROM base AS builder 17 | WORKDIR /app 18 | 19 | COPY --link --from=installer /app . 20 | COPY --link tsconfig.json tsconfig.json 21 | RUN yarn build 22 | 23 | FROM base AS development 24 | WORKDIR /app 25 | 26 | COPY --link --from=installer /app . 27 | 28 | ENV NODE_ENV=development 29 | 30 | CMD ["yarn", "dev"] 31 | -------------------------------------------------------------------------------- /frontend/app/components/CreateExtractor.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { 4 | AbsoluteCenter, 5 | Accordion, 6 | AccordionButton, 7 | AccordionIcon, 8 | AccordionItem, 9 | AccordionPanel, 10 | Badge, 11 | Box, 12 | Button, 13 | Card, 14 | CardBody, 15 | CircularProgress, 16 | Divider, 17 | FormControl, 18 | Heading, 19 | Icon, 20 | IconButton, 21 | Input, 22 | Text, 23 | } from "@chakra-ui/react"; 24 | import { json } from "@codemirror/lang-json"; 25 | import Form from "@rjsf/chakra-ui"; 26 | import validator from "@rjsf/validator-ajv8"; 27 | import CodeMirror from "@uiw/react-codemirror"; 28 | import Ajv from "ajv"; 29 | import addFormats from "ajv-formats"; 30 | import { useRouter } from "next/navigation"; 31 | import React from "react"; 32 | 33 | import { ChatBubbleBottomCenterTextIcon } from "@heroicons/react/24/outline"; 34 | import { useMutation } from "@tanstack/react-query"; 35 | import { suggestExtractor, useCreateExtractor } from "../utils/api"; 36 | 37 | const ajv = new Ajv(); 38 | // Adds support for parsing format types like "date-time" 39 | // and "email" in JSON Schema. 40 | // A lot of the JSON Schema generated by LLMS will 41 | // be generated with these formats out of the box. 
42 | addFormats(ajv); 43 | 44 | /** 45 | * Component to create a new extractor with fields 46 | * for name, description, schema, and examples 47 | */ 48 | const CreateExtractor = ({}) => { 49 | const startSchema = "{}"; 50 | // You might use a mutation hook here if you're 51 | // using something like React Query for state management 52 | const [schema, setSchema] = React.useState(startSchema); 53 | const [creatable, setCreatable] = React.useState(false); 54 | const [lastValidSchema, setLastValidSchema] = React.useState( 55 | JSON.parse(startSchema), 56 | ); 57 | const [currentSchemaValid, setCurrentSchemaValid] = React.useState(true); 58 | const [userInput, setUserInput] = React.useState(""); 59 | 60 | const suggestMutation = useMutation({ 61 | mutationFn: suggestExtractor, 62 | onSuccess: (data) => { 63 | let prettySchema = data.json_schema; 64 | 65 | try { 66 | prettySchema = JSON.stringify(JSON.parse(data.json_schema), null, 2); 67 | } catch (e) {} 68 | 69 | setSchema(prettySchema); 70 | }, 71 | }); 72 | 73 | const { push } = useRouter(); 74 | const { mutate } = useCreateExtractor({ 75 | onSuccess: (data) => { 76 | push(`/e/${data.uuid}`); 77 | }, 78 | }); 79 | 80 | React.useMemo(() => { 81 | try { 82 | const parsedSchema = JSON.parse(schema); 83 | ajv.compile(parsedSchema); 84 | setCurrentSchemaValid(true); 85 | setLastValidSchema(parsedSchema); 86 | // OK to create if schema is parseable and not empty 87 | // and contains an object at the top level 88 | setCreatable(parsedSchema.type === "object"); 89 | } catch (e) { 90 | setCurrentSchemaValid(false); 91 | setCreatable(false); 92 | } 93 | }, [schema]); 94 | 95 | const handleSubmit = (event: React.FormEvent) => { 96 | event.preventDefault(); 97 | const instruction = ""; 98 | const objectSchema = JSON.parse(schema); 99 | // Extract information from schema like name, and description 100 | const name = objectSchema.title || "Unnamed"; 101 | const description = objectSchema.description || ""; 102 | // backend uses varchar(100) for description 103 | const shortDescription = 104 | description.length > 100 105 | ? description.substring(0, 95) + "..." 106 | : description; 107 | 108 | mutate({ 109 | name, 110 | description: shortDescription, 111 | schema: objectSchema, 112 | instruction, 113 | }); 114 | }; 115 | 116 | const handleSuggest = (event: React.FormEvent) => { 117 | event.preventDefault(); 118 | const description = event.currentTarget.userInput.value; 119 | if (description === "") { 120 | return; 121 | } 122 | suggestMutation.mutate({ description, jsonSchema: schema }); 123 | setUserInput(""); 124 | }; 125 | 126 | return ( 127 |
128 | 129 | What would you like to extract today? 130 | 131 |
132 | 133 | setUserInput(event.target.value)} 141 | /> 142 | 143 | {suggestMutation.isPending ? ( 144 | 145 | ) : ( 146 | } 149 | aria-label="OK" 150 | colorScheme={userInput === "" ? "gray" : "blue"} 151 | disabled={userInput === ""} 152 | /> 153 | )} 154 | 155 |
159 | 160 | 161 | 162 | OR 163 | 164 | 165 | 166 | 167 | 168 | Edit JSON Schema 169 |
170 | {currentSchemaValid ? ( 171 | OK 172 | ) : ( 173 | Errors! 174 | )} 175 | 176 |
177 |
178 | 179 | 180 | setSchema(value)} 185 | basicSetup={{ autocompletion: true }} 186 | extensions={[json()]} 187 | minHeight="300px" 188 | className="border-4 border-slate-300 border-double" 189 | /> 190 | 191 | 192 |
193 |
194 | {Object.keys(lastValidSchema).length !== 0 && ( 195 | <> 196 | Preview 197 | {!currentSchemaValid && ( 198 | 199 | JSON Schema has errors. Showing previous valid JSON Schema. 200 | 201 | )} 202 | 203 | 204 | 209 | {true} {/* Disables the submit button */} 210 | 211 | 212 |
213 | 214 | )} 215 | 218 | 219 |
220 | ); 221 | }; 222 | 223 | export default CreateExtractor; 224 | -------------------------------------------------------------------------------- /frontend/app/components/Extractor.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { 4 | Tab, 5 | TabList, 6 | TabPanel, 7 | TabPanels, 8 | Tabs, 9 | Text, 10 | } from "@chakra-ui/react"; 11 | import Form from "@rjsf/chakra-ui"; 12 | import validator from "@rjsf/validator-ajv8"; 13 | import { docco } from "react-syntax-highlighter/dist/esm/styles/hljs"; 14 | 15 | import SyntaxHighlighter from "react-syntax-highlighter"; 16 | import { useGetExtractor } from "../utils/api"; 17 | 18 | type ExtractorProps = { 19 | extractorId: string; 20 | isShared: boolean; 21 | }; 22 | 23 | export const Extractor = ({ extractorId, isShared }: ExtractorProps) => { 24 | const { data, isLoading, isError } = useGetExtractor(extractorId, isShared); 25 | if (isLoading) { 26 | return
<div>Loading...</div>; 27 | } 28 | if (isError) { 29 | return <div>Unable to load extractor with ID: {extractorId}</div>; 30 | } 31 | 32 | if (data === undefined) { 33 | throw new Error("Data is undefined"); 34 | } 35 | 36 | return ( 37 |
38 | 39 | 40 | Form 41 | Code 42 | 43 | 44 | 45 |
46 | {true} {/* Disables the submit button */} 47 |
48 |
49 | 50 | 51 | This shows the raw JSON Schema that describes what information the 52 | extractor will be extracting from the content. 53 | 54 | 55 | {JSON.stringify(data.schema, null, 2)} 56 | 57 | 58 |
59 |
60 |
61 | ); 62 | }; 63 | -------------------------------------------------------------------------------- /frontend/app/components/Playground.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | import { 3 | Button, 4 | Heading, 5 | Tab, 6 | Box, 7 | Divider, 8 | AbsoluteCenter, 9 | TabList, 10 | TabPanel, 11 | TabPanels, 12 | Tabs, 13 | Text, 14 | Textarea, 15 | FormControl, 16 | FormLabel, 17 | HStack, 18 | Radio, 19 | RadioGroup, 20 | } from "@chakra-ui/react"; 21 | import { useMutation } from "@tanstack/react-query"; 22 | import React from "react"; 23 | import SyntaxHighlighter from "react-syntax-highlighter"; 24 | import { docco } from "react-syntax-highlighter/dist/esm/styles/hljs"; 25 | import { runExtraction, useConfiguration } from "../utils/api"; 26 | import { Extractor } from "./Extractor"; 27 | import { ResultsTable } from "./ResultsTable"; 28 | 29 | interface PlaygroundProps { 30 | /** 31 | * The playground currently supports viewing 32 | * both shared and non-shared extractors 33 | */ 34 | extractorId: string; 35 | isShared: boolean; 36 | } 37 | 38 | /** 39 | * Playground to work with an existing extractor. 40 | */ 41 | export const Playground = (props: PlaygroundProps) => { 42 | const { extractorId, isShared } = props; 43 | const { data, isPending, mutate } = useMutation({ 44 | mutationFn: runExtraction, 45 | }); 46 | 47 | const requestServerConfig = useConfiguration(); 48 | const [isDisabled, setIsDisabled] = React.useState(true); 49 | 50 | const handleSubmit = (event: React.FormEvent) => { 51 | event.preventDefault(); 52 | 53 | const request = { 54 | extractor_id: extractorId, 55 | model_name: event.currentTarget.modelId.value, 56 | }; 57 | 58 | if (event.currentTarget.text.value) { 59 | Object.assign(request, { text: event.currentTarget.text.value }); 60 | } else { 61 | Object.assign(request, { file: event.currentTarget.file.files[0] }); 62 | } 63 | 64 | mutate([request, isShared]); 65 | }; 66 | 67 | const handleKeyDown = (event: React.KeyboardEvent) => { 68 | if (event.key === "Enter" && !event.shiftKey) { 69 | event.preventDefault(); // Prevent the default Enter action 70 | if (isDisabled) { 71 | return; 72 | } 73 | 74 | event.currentTarget.form?.dispatchEvent( 75 | new Event("submit", { cancelable: true, bubbles: true }), 76 | ); 77 | } 78 | }; 79 | 80 | const handleChange = (event: React.FormEvent) => { 81 | if ( 82 | event.currentTarget.text.value === "" && 83 | event.currentTarget.file.files.length === 0 84 | ) { 85 | setIsDisabled(true); 86 | return; 87 | } 88 | // Also disable if both are present 89 | if ( 90 | event.currentTarget.text.value !== "" && 91 | event.currentTarget.file.files.length !== 0 92 | ) { 93 | setIsDisabled(true); 94 | return; 95 | } 96 | 97 | setIsDisabled(false); 98 | }; 99 | 100 | return ( 101 |
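{/* Added illustrative note, not part of the original component: handleSubmit above invokes the
    runExtraction mutation with either { extractor_id, model_name, text } or
    { extractor_id, model_name, file } (plus the isShared flag), depending on whether text was
    typed or a file was attached. */}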
102 |
103 | {isShared && Using a shared extractor}
105 | 106 |
107 | Extract 108 | 109 |
114 | {requestServerConfig.isFetched && ( 115 | 116 | Extraction Model 117 | 121 | 122 | {requestServerConfig.data?.models.map((model) => ( 123 | 124 | {model.description} 125 | 126 | ))} 127 | 128 | 129 | 130 | )} 131 | {requestServerConfig.isFetched && ( 132 | <> 133 | 140 | 141 | Max file size is: {requestServerConfig.data?.max_file_size_mb}MB 142 | 143 | 144 | Supported mimetypes:{" "} 145 | {requestServerConfig.data?.accepted_mimetypes.join(", ")} 146 | 147 | 148 | )} 149 | 150 | 151 | 152 | OR 153 | 154 | 155 |