├── .env.sample ├── .github └── workflows │ ├── ci.yml │ ├── docker-ci.yml │ ├── docs.yml │ ├── release.yml │ └── stage.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── docetl ├── __init__.py ├── api.py ├── apis │ ├── __init__.py │ └── pd_accessors.py ├── base_schemas.py ├── cli.py ├── config_wrapper.py ├── console.py ├── containers.py ├── dataset.py ├── operations │ ├── __init__.py │ ├── add_uuid.py │ ├── base.py │ ├── cluster.py │ ├── clustering_utils.py │ ├── code_operations.py │ ├── equijoin.py │ ├── extract.py │ ├── filter.py │ ├── gather.py │ ├── link_resolve.py │ ├── map.py │ ├── rank.py │ ├── reduce.py │ ├── resolve.py │ ├── sample.py │ ├── scan.py │ ├── split.py │ ├── unnest.py │ └── utils │ │ ├── __init__.py │ │ ├── api.py │ │ ├── cache.py │ │ ├── llm.py │ │ ├── progress.py │ │ └── validation.py ├── optimizer.py ├── optimizers │ ├── __init__.py │ ├── join_optimizer.py │ ├── map_optimizer │ │ ├── __init__.py │ │ ├── config_generators.py │ │ ├── evaluator.py │ │ ├── operation_creators.py │ │ ├── optimizer.py │ │ ├── plan_generators.py │ │ ├── prompt_generators.py │ │ └── utils.py │ ├── reduce_optimizer.py │ └── utils.py ├── parsing_tools.py ├── ratelimiter.py ├── runner.py ├── schemas.py └── utils.py ├── docker-compose.yml ├── docs ├── advanced │ ├── custom-operators.md │ ├── extending-agents.md │ └── performance-tuning.md ├── api-reference │ ├── cli.md │ ├── docetl.md │ ├── operations.md │ ├── optimizers.md │ └── python.md ├── assets │ ├── docetl-favicon-color.png │ ├── fatal.json │ ├── headerdiagram.png │ ├── medical_transcripts.json │ ├── readmefig.png │ └── tutorial │ │ ├── add-notes.png │ │ ├── dataset-view.png │ │ ├── initial-outputs.png │ │ ├── one-operation.png │ │ ├── operation-details.png │ │ ├── prompt-improvement.png │ │ └── prompt-v2.png ├── best-practices.md ├── community │ ├── index.md │ └── roadmap.md ├── concepts │ ├── operators.md │ ├── optimization.md │ ├── pipelines.md │ └── schemas.md ├── examples │ ├── annotating-legal-documents.md │ ├── characterizing-troll-behavior.md │ ├── custom-parsing.md │ ├── mining-product-reviews.md │ ├── ollama.md │ ├── pdf-analysis-gemini.md │ ├── presidential-debate-themes.md │ ├── rate-limiting.md │ └── split-gather.md ├── execution │ └── running-pipelines.md ├── index.md ├── installation.md ├── operators │ ├── cluster.md │ ├── code.md │ ├── equijoin.md │ ├── extract.md │ ├── filter.md │ ├── gather.md │ ├── link-resolve.md │ ├── map.md │ ├── parallel-map.md │ ├── rank.md │ ├── reduce.md │ ├── resolve.md │ ├── sample.md │ ├── split.md │ └── unnest.md ├── optimization │ ├── configuration.md │ ├── example.md │ ├── overview.md │ └── python-api.md ├── pandas │ ├── examples.md │ ├── index.md │ └── operations.md ├── playground │ ├── features.md │ ├── index.md │ └── tutorial.md ├── python │ ├── examples.md │ └── index.md ├── stylesheets │ └── extra.css ├── tutorial-pythonapi.md └── tutorial.md ├── example_data ├── debates │ ├── data.json │ ├── theme_evolution_analysis_baseline.json │ └── theme_evolution_analysis_reduce_gleaning.json ├── post_di_trump_motion.json └── steamgames │ └── frequent_polarizing_themes.json ├── experiments ├── extraction_outputs.txt ├── logical_fallacy_extraction.py ├── structured_outputs.py └── structured_outputs.txt ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── server ├── __init__.py └── app │ ├── __init__.py │ ├── main.py │ ├── models.py │ └── routes │ ├── __init__.py │ ├── convert.py │ ├── filesystem.py │ └── pipeline.py ├── tests ├── __init__.py 
├── basic │ ├── sample_texts │ │ ├── one.txt │ │ └── two.md │ ├── test_basic_filter_split_gather.py │ ├── test_basic_map.py │ ├── test_basic_parallel_map.py │ ├── test_basic_reduce_resolve.py │ ├── test_cluster_and_sample.py │ ├── test_code_operations.py │ ├── test_optimizer.py │ └── test_pipeline_with_parsing.py ├── conftest.py ├── data │ └── PublicWaterMassMailing.pdf ├── ranking │ ├── plots │ │ ├── harmfulness_budget_performance.png │ │ ├── medical_pain_budget_performance.png │ │ └── synthetic_abstracts_budget_performance.png │ ├── test_rank.py │ └── test_rank_budget.py ├── test_api.py ├── test_azure_rl.py ├── test_config.py ├── test_eugene.py ├── test_ollama.py ├── test_pandas_accessors.py ├── test_parsing_tools.py ├── test_reduce_scale.py ├── test_reduce_value_sampling.py ├── test_resolve_auto_batch.py ├── test_runner_caching.py ├── test_split.py ├── test_synth_gather.py ├── test_synth_resolve.py ├── test_synthetic_output.py └── test_validation.py └── website ├── .env.local.sample ├── README.md ├── components.json ├── eslint.config.mjs ├── next.config.mjs ├── package-lock.json ├── package.json ├── postcss.config.mjs ├── posts └── hello-world.md ├── public ├── berkeley.png ├── debate_gemini_result.txt ├── debate_intermediates │ ├── extract_themes_and_viewpoints.json │ ├── summarize_theme_evolution.json │ ├── synthesized_resolve_0.json │ └── unnest_themes.json ├── debate_transcripts.json ├── demos │ ├── prompts_pipeline.yaml │ └── rfi_pipeline.yaml ├── docetl-50m-fall-2024.pdf ├── docetl-favicon-color.png ├── epiclogo.png ├── llms-full.txt ├── llms.txt └── theme_evolution_analysis.json ├── src ├── app │ ├── MarkdownRenderer.tsx │ ├── api │ │ ├── chat │ │ │ └── route.ts │ │ ├── checkNamespace │ │ │ └── route.ts │ │ ├── constants.ts │ │ ├── convertDocuments │ │ │ └── route.ts │ │ ├── downloadTutorialDataset │ │ │ └── route.ts │ │ ├── edit │ │ │ └── route.ts │ │ ├── generate │ │ │ └── route.ts │ │ ├── getInputOutput │ │ │ └── route.ts │ │ ├── getPipelineConfig │ │ │ └── route.ts │ │ ├── readFile │ │ │ └── route.ts │ │ ├── readFilePage │ │ │ └── route.ts │ │ ├── rfi-responses │ │ │ └── route.ts │ │ ├── saveDocuments │ │ │ └── route.ts │ │ ├── serveDocument │ │ │ └── [...path] │ │ │ │ └── route.ts │ │ ├── shouldOptimize │ │ │ └── route.ts │ │ ├── uploadFile │ │ │ └── route.ts │ │ ├── utils.ts │ │ └── writePipelineConfig │ │ │ └── route.ts │ ├── blog │ │ ├── [id] │ │ │ └── page.tsx │ │ └── page.tsx │ ├── fonts │ │ ├── GeistMonoVF.woff │ │ └── GeistVF.woff │ ├── globals.css │ ├── layout.tsx │ ├── localStorageKeys.ts │ ├── page.tsx │ ├── playground │ │ └── page.tsx │ ├── providers.tsx │ ├── showcase │ │ ├── ai-rfi-response-analysis │ │ │ └── page.tsx │ │ ├── ai-system-prompts-analysis │ │ │ └── page.tsx │ │ └── page.tsx │ └── types.ts ├── components │ ├── AIChatPanel.tsx │ ├── AIEditPopover.tsx │ ├── APIKeysDialog.tsx │ ├── AnsiRenderer.tsx │ ├── BookmarksPanel.tsx │ ├── CollapsibleCode.tsx │ ├── ColumnDialog.tsx │ ├── DarkMode.tsx │ ├── DatasetView.tsx │ ├── DebateContent.tsx │ ├── DocumentViewer.tsx │ ├── FileExplorer.tsx │ ├── InlineEditingButton.tsx │ ├── LLMContextPopover.tsx │ ├── MarkdownCell.tsx │ ├── NamespaceDialog.tsx │ ├── NaturalLanguagePipelineDialog.tsx │ ├── OperationCard.tsx │ ├── OperationHelpButton.tsx │ ├── OptimizationDialog.tsx │ ├── Output.tsx │ ├── PipelineGui.tsx │ ├── PipelinePrompts.tsx │ ├── PipelineSettings.tsx │ ├── PipelineVisualization.tsx │ ├── PresidentialDebateDemo.tsx │ ├── PrettyJSON.tsx │ ├── PromptImprovementDialog.tsx │ ├── ResizableDataTable.tsx │ ├── 
RowNavigator.tsx │ ├── SearchableCell.tsx │ ├── SpotlightOverlay.tsx │ ├── TutorialsDialog.tsx │ ├── operations │ │ ├── args.tsx │ │ └── components.tsx │ ├── showcase │ │ ├── rfi-response-explorer.tsx │ │ └── system-prompts-explorer.tsx │ ├── ui │ │ ├── accordion.tsx │ │ ├── alert-dialog.tsx │ │ ├── alert.tsx │ │ ├── badge.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── checkbox.tsx │ │ ├── collapsible.tsx │ │ ├── command.tsx │ │ ├── context-menu.tsx │ │ ├── dialog.tsx │ │ ├── dropdown-menu.tsx │ │ ├── form.tsx │ │ ├── hover-card.tsx │ │ ├── input.tsx │ │ ├── label.tsx │ │ ├── menubar.tsx │ │ ├── pagination.tsx │ │ ├── popover.tsx │ │ ├── progress.tsx │ │ ├── radio-group.tsx │ │ ├── resizable.tsx │ │ ├── scroll-area.tsx │ │ ├── select.tsx │ │ ├── skeleton.tsx │ │ ├── switch.tsx │ │ ├── table.tsx │ │ ├── tabs.tsx │ │ ├── textarea.tsx │ │ ├── toast.tsx │ │ ├── toaster.tsx │ │ └── tooltip.tsx │ └── utils.ts ├── contexts │ ├── BookmarkContext.tsx │ ├── PipelineContext.tsx │ ├── ThemeContext.tsx │ └── WebSocketContext.tsx ├── hooks │ ├── use-toast.ts │ ├── useDatasetUpload.ts │ ├── useOptimizeCheck.ts │ └── useRestorePipeline.ts ├── lib │ ├── analytics.ts │ ├── api-config.ts │ ├── api.ts │ └── utils.ts ├── mocks │ └── mockData.ts └── utils │ └── fileOperations.ts ├── tailwind.config.ts ├── todos.md ├── tsconfig.json └── vercel.json /.env.sample: -------------------------------------------------------------------------------- 1 | # BACKEND configuration 2 | BACKEND_ALLOW_ORIGINS=http://localhost:3000,http://127.0.0.1:3000 3 | BACKEND_HOST=localhost 4 | BACKEND_PORT=8000 5 | BACKEND_RELOAD=True 6 | 7 | # FRONTEND configuration 8 | FRONTEND_HOST=0.0.0.0 9 | FRONTEND_PORT=3000 10 | 11 | # Host port mapping for docker-compose (if not set, defaults are used in docker-compose.yml) 12 | FRONTEND_DOCKER_COMPOSE_PORT=3031 13 | BACKEND_DOCKER_COMPOSE_PORT=8081 14 | 15 | # Supported text file encodings 16 | TEXT_FILE_ENCODINGS=utf-8,latin1,cp1252,iso-8859-1 17 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.10", "3.11", "3.12"] 16 | env: 17 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v2 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Install Poetry 29 | uses: snok/install-poetry@v1 30 | 31 | - name: Copy environment file 32 | run: cp .env.sample .env 33 | 34 | - name: Install dependencies 35 | run: make install 36 | 37 | - name: Run pytest 38 | run: make tests-basic 39 | -------------------------------------------------------------------------------- /.github/workflows/docker-ci.yml: -------------------------------------------------------------------------------- 1 | name: Docker CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | docker-build-test: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Remove unnecessary files 15 | run: | 16 | sudo rm -rf /usr/share/dotnet 17 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 18 | - uses: actions/checkout@v4 19 | - name: Remove .env copy from Dockerfile 20 | run: sed -i '/COPY .env/d' Dockerfile 
21 | 22 | - name: Build Docker image 23 | run: | 24 | if ! docker build -t docetl .; then 25 | echo "Docker build failed" 26 | exit 1 27 | fi 28 | 29 | - name: Create Docker volume 30 | run: docker volume create docetl-data 31 | 32 | - name: Test Docker container 33 | run: | 34 | # Run the container in detached mode 35 | docker run -d \ 36 | -p 3000:3000 \ 37 | -p 8000:8000 \ 38 | -v docetl-data:/docetl-data \ 39 | -e FRONTEND_HOST=0.0.0.0 \ 40 | -e FRONTEND_PORT=3000 \ 41 | -e BACKEND_HOST=0.0.0.0 \ 42 | -e BACKEND_PORT=8000 \ 43 | --name docetl-test \ 44 | docetl 45 | 46 | # Wait for initial startup 47 | echo "Waiting for container to start..." 48 | sleep 30 49 | 50 | frontend_healthy=false 51 | 52 | # Check container health for up to 3 minutes 53 | for i in {1..6}; do 54 | if ! docker ps -q -f name=docetl-test > /dev/null 2>&1; then 55 | echo "Container stopped unexpectedly" 56 | docker logs docetl-test 57 | exit 1 58 | fi 59 | 60 | # Try to curl the frontend 61 | if curl -s -f http://localhost:3000/playground > /dev/null; then 62 | echo "Frontend is responding" 63 | frontend_healthy=true 64 | break 65 | fi 66 | 67 | if [ $i -eq 6 ]; then 68 | echo "Container health check failed after 3 minutes" 69 | docker logs docetl-test 70 | exit 1 71 | fi 72 | 73 | echo "Waiting for services to be ready... (attempt $i/6)" 74 | sleep 30 75 | done 76 | 77 | # Explicitly fail if frontend check never succeeded 78 | if [ "$frontend_healthy" = false ]; then 79 | echo "Frontend health check failed" 80 | docker logs docetl-test 81 | exit 1 82 | fi 83 | 84 | # If we get here, container is running and healthy 85 | echo "Container is running successfully" 86 | 87 | # Cleanup 88 | docker stop docetl-test 89 | docker rm docetl-test 90 | 91 | - name: Clean up Docker volume 92 | run: docker volume rm docetl-data -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - main 7 | permissions: 8 | contents: write 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Configure Git Credentials 15 | run: | 16 | git config user.name github-actions[bot] 17 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 18 | - uses: actions/setup-python@v5 19 | with: 20 | python-version: 3.x 21 | - name: Install dependencies 22 | run: | 23 | pip install mkdocs==1.6.1 \ 24 | mkdocs-material==9.5.34 \ 25 | mkdocstrings==0.26.1 \ 26 | mkdocstrings-python==1.11.1 \ 27 | mkdocs-glightbox==0.4.0 \ 28 | pytkdocs==0.16.2 29 | - name: Copy environment file 30 | run: cp .env.sample .env 31 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 32 | - uses: actions/cache@v4 33 | with: 34 | key: mkdocs-material-${{ env.cache_id }} 35 | path: .cache 36 | restore-keys: | 37 | mkdocs-material- 38 | - run: mkdocs build 39 | - run: mkdocs gh-deploy --force 40 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*.*.*" 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | pypi-publish: 13 | name: Upload release to PyPI 14 | runs-on: ubuntu-latest 15 | environment: 16 | name: pypi 17 | url: https://pypi.org/project/docetl/ 18 | permissions: 19 | id-token: write 20 | 
steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python 3.10 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.10" 28 | 29 | - name: Install Poetry 30 | run: | 31 | curl -sSL https://install.python-poetry.org | python - -y 32 | 33 | - name: Update PATH 34 | run: echo "$HOME/.local/bin" >> $GITHUB_PATH 35 | 36 | - name: Update Poetry configuration 37 | run: poetry config virtualenvs.create false 38 | 39 | - name: Install dependencies 40 | run: poetry install --sync --no-interaction 41 | 42 | - name: Package project 43 | run: poetry build 44 | 45 | - name: Publish package distributions to PyPI 46 | uses: pypa/gh-action-pypi-publish@release/v1 47 | -------------------------------------------------------------------------------- /.github/workflows/stage.yml: -------------------------------------------------------------------------------- 1 | name: Create or Update PR from staging to main 2 | 3 | on: 4 | push: 5 | branches: 6 | - staging 7 | pull_request: 8 | types: 9 | - closed 10 | branches: 11 | - staging 12 | 13 | jobs: 14 | create-or-update-pr: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Check for existing PR 22 | id: check_pr 23 | uses: actions/github-script@v6 24 | with: 25 | github-token: ${{secrets.GITHUB_TOKEN}} 26 | script: | 27 | const { data: pullRequests } = await github.rest.pulls.list({ 28 | owner: context.repo.owner, 29 | repo: context.repo.repo, 30 | state: 'open', 31 | head: 'staging', 32 | base: 'main' 33 | }); 34 | return pullRequests.length > 0 ? 'true' : 'false'; 35 | 36 | - name: Create Pull Request 37 | if: steps.check_pr.outputs.result == 'false' 38 | uses: repo-sync/pull-request@v2 39 | with: 40 | source_branch: "staging" 41 | destination_branch: "main" 42 | pr_title: "Merge staging into main" 43 | pr_body: "This PR was automatically created to merge changes from staging into main." 44 | github_token: ${{ secrets.GITHUB_TOKEN }} 45 | 46 | - name: Update Pull Request 47 | if: steps.check_pr.outputs.result == 'true' 48 | uses: actions/github-script@v6 49 | with: 50 | github-token: ${{secrets.GITHUB_TOKEN}} 51 | script: | 52 | const { data: pullRequests } = await github.rest.pulls.list({ 53 | owner: context.repo.owner, 54 | repo: context.repo.repo, 55 | state: 'open', 56 | head: 'staging', 57 | base: 'main' 58 | }); 59 | 60 | if (pullRequests.length > 0) { 61 | const prNumber = pullRequests[0].number; 62 | await github.rest.pulls.update({ 63 | owner: context.repo.owner, 64 | repo: context.repo.repo, 65 | pull_number: prNumber, 66 | body: 'This PR has been automatically updated with the latest changes from staging.' 
67 | }); 68 | console.log(`Updated PR #${prNumber}`); 69 | } 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | *__pycache__* 3 | *scratch* 4 | *relevance_assessment* 5 | palimpzest/* 6 | paper_workloads/contracts/full_contract_txt* 7 | paper_workloads/contracts/sample_contract_txt* 8 | *.xlsx 9 | *.csv 10 | paper_workloads/* 11 | preprint_workloads/* 12 | workloads/* 13 | *mypy_cache* 14 | *.DS_Store 15 | *pytest_cache* 16 | *ruff_cache* 17 | motion-old* 18 | venv/ 19 | 20 | # dependencies 21 | website/node_modules 22 | website/.pnp 23 | website/.pnp.js 24 | website/.yarn/install-state.gz 25 | 26 | # testing 27 | website/coverage 28 | 29 | # next.js 30 | website/.next/ 31 | website/out/ 32 | 33 | # production 34 | website/build 35 | 36 | # misc 37 | website/.DS_Store 38 | website/*.pem 39 | 40 | # debug 41 | website/npm-debug.log* 42 | website/yarn-debug.log* 43 | website/yarn-error.log* 44 | 45 | # local env files 46 | website/.env*.local 47 | 48 | # vercel 49 | website/.vercel 50 | 51 | # typescript 52 | website/*.tsbuildinfo 53 | website/next-env.d.ts 54 | 55 | # Docker 56 | .docker/ 57 | 58 | # experiments 59 | experiments/*.json 60 | 61 | metrics_vs_cost.png 62 | tests/data/anthropic-red-team-attempts.jsonl 63 | tests/data/get_freshstack.py -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: false 3 | 4 | files: "^(docetl)/" 5 | exclude: '\__init__.py$' 6 | 7 | repos: 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: v4.5.0 10 | hooks: 11 | - id: trailing-whitespace 12 | - id: end-of-file-fixer 13 | exclude: ^.*\.egg-info/ 14 | - id: check-merge-conflict 15 | - id: check-case-conflict 16 | - id: pretty-format-json 17 | args: [--autofix, --no-ensure-ascii, --no-sort-keys] 18 | - id: check-ast 19 | - id: debug-statements 20 | - id: check-docstring-first 21 | 22 | - repo: https://github.com/hadialqattan/pycln 23 | rev: v2.5.0 24 | hooks: 25 | - id: pycln 26 | args: [--all, --exclude, "__init__.py$", --include, "^docetl/"] 27 | 28 | - repo: https://github.com/psf/black 29 | rev: 24.1.1 30 | hooks: 31 | - id: black 32 | 33 | - repo: https://github.com/pycqa/isort 34 | rev: 5.13.2 35 | hooks: 36 | - id: isort 37 | name: "isort (python)" 38 | types: [python] 39 | args: [--profile, black] 40 | 41 | - repo: https://github.com/charliermarsh/ruff-pre-commit 42 | # Ruff version. 
43 | rev: "v0.2.1" 44 | hooks: 45 | - id: ruff 46 | 47 | - repo: https://github.com/pre-commit/pre-commit 48 | rev: v3.6.0 49 | hooks: 50 | - id: validate_manifest 51 | 52 | - repo: https://github.com/pre-commit/mirrors-prettier 53 | rev: "v4.0.0-alpha.8" # Prettier version 54 | hooks: 55 | - id: prettier 56 | files: "^ui/" 57 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage for Python dependencies 2 | FROM python:3.11-slim AS python-builder 3 | 4 | RUN pip install poetry==1.4.2 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache \ 10 | DOCETL_HOME_DIR="/docetl-data" 11 | 12 | WORKDIR /app 13 | 14 | COPY pyproject.toml poetry.lock ./ 15 | COPY docetl/ ./docetl/ 16 | COPY server/ ./server/ 17 | COPY tests/ ./tests/ 18 | RUN touch README.md 19 | 20 | # Install with --no-root first for dependencies, then install with root for entrypoints 21 | RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --all-extras --no-root && \ 22 | poetry install --all-extras 23 | 24 | # Build stage for Node.js dependencies 25 | FROM node:20-alpine AS node-builder 26 | 27 | WORKDIR /app/website 28 | 29 | # Update DOCETL_HOME_DIR to match final location 30 | ENV DOCETL_HOME_DIR="/docetl-data" 31 | 32 | COPY website/package*.json ./ 33 | RUN npm install 34 | COPY website/ ./ 35 | RUN npm run build 36 | 37 | # Final runtime stage 38 | FROM python:3.11-slim AS runtime 39 | 40 | # Install Node.js 41 | RUN apt-get update && apt-get install -y \ 42 | curl \ 43 | && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ 44 | && apt-get install -y nodejs \ 45 | && rm -rf /var/lib/apt/lists/* 46 | 47 | WORKDIR /app 48 | 49 | # Copy Python virtual environment from builder 50 | ENV VIRTUAL_ENV=/app/.venv \ 51 | PATH="/app/.venv/bin:$PATH" \ 52 | PYTHONPATH="/app" \ 53 | DOCETL_HOME_DIR="/docetl-data" 54 | 55 | COPY --from=python-builder /app/.venv ${VIRTUAL_ENV} 56 | 57 | # Copy Python application files 58 | COPY docetl/ ./docetl/ 59 | COPY server/ ./server/ 60 | COPY tests/ ./tests/ 61 | COPY pyproject.toml poetry.lock ./ 62 | COPY .env ./ 63 | 64 | # Copy Node.js dependencies and application files 65 | COPY --from=node-builder /app/website ./website 66 | 67 | ENV PORT=3000 68 | 69 | # Create data directory with appropriate permissions 70 | RUN mkdir -p /docetl-data && chown -R nobody:nogroup /docetl-data && chmod 777 /docetl-data 71 | 72 | # Define volume AFTER creating and setting permissions 73 | VOLUME ["/docetl-data"] 74 | 75 | # Expose ports for frontend and backend 76 | EXPOSE 3000 8000 77 | 78 | # Start both servers 79 | CMD ["sh", "-c", "python3 server/app/main.py & cd website && npm run start"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Shreya Shankar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/docetl/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.2.4"
2 | 
3 | import warnings
4 | 
5 | # TODO: Remove after https://github.com/BerriAI/litellm/issues/7560 is fixed
6 | warnings.filterwarnings("ignore", category=UserWarning, module="pydantic._internal._config")
7 | 
8 | from docetl.runner import DSLRunner
9 | from docetl.optimizer import Optimizer
10 | from docetl.apis.pd_accessors import SemanticAccessor
11 | 
12 | __all__ = ["DSLRunner", "Optimizer", "SemanticAccessor"]
13 | 
--------------------------------------------------------------------------------
/docetl/apis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docetl/apis/__init__.py
--------------------------------------------------------------------------------
/docetl/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Optional
4 | 
5 | import typer
6 | from dotenv import load_dotenv
7 | 
8 | from docetl.operations.utils import clear_cache as cc
9 | from docetl.runner import DSLRunner
10 | 
11 | app = typer.Typer(pretty_exceptions_enable=False)
12 | 
13 | 
14 | @app.command()
15 | def build(
16 |     yaml_file: Path = typer.Argument(
17 |         ..., help="Path to the YAML file containing the pipeline configuration"
18 |     ),
19 |     max_threads: Optional[int] = typer.Option(
20 |         None, help="Maximum number of threads to use for running operations"
21 |     ),
22 |     resume: bool = typer.Option(
23 |         False, help="Resume optimization from a previous build that may have failed"
24 |     ),
25 |     save_path: Path = typer.Option(
26 |         None, help="Path to save the optimized pipeline configuration"
27 |     ),
28 | ):
29 |     """
30 |     Build and optimize the configuration specified in the YAML file.
31 |     Any arguments passed here will override the values in the YAML file.
32 | 
33 |     Args:
34 |         yaml_file (Path): Path to the YAML file containing the pipeline configuration.
35 |         max_threads (Optional[int]): Maximum number of threads to use for running operations.
36 |         resume (bool): Whether to resume optimization from a previous run. Defaults to False.
37 |         save_path (Path): Path to save the optimized pipeline configuration.
38 | 
39 | """ 40 | # Get the current working directory (where the user called the command) 41 | cwd = os.getcwd() 42 | 43 | # Load .env file from the current working directory 44 | env_file = os.path.join(cwd, ".env") 45 | if os.path.exists(env_file): 46 | load_dotenv(env_file) 47 | 48 | runner = DSLRunner.from_yaml(str(yaml_file), max_threads=max_threads) 49 | runner.optimize( 50 | save=True, 51 | return_pipeline=False, 52 | resume=resume, 53 | save_path=save_path, 54 | ) 55 | 56 | 57 | @app.command() 58 | def run( 59 | yaml_file: Path = typer.Argument( 60 | ..., help="Path to the YAML file containing the pipeline configuration" 61 | ), 62 | max_threads: Optional[int] = typer.Option( 63 | None, help="Maximum number of threads to use for running operations" 64 | ), 65 | ): 66 | """ 67 | Run the configuration specified in the YAML file. 68 | 69 | Args: 70 | yaml_file (Path): Path to the YAML file containing the pipeline configuration. 71 | max_threads (Optional[int]): Maximum number of threads to use for running operations. 72 | """ 73 | # Get the current working directory (where the user called the command) 74 | cwd = os.getcwd() 75 | 76 | # Load .env file from the current working directory 77 | env_file = os.path.join(cwd, ".env") 78 | if os.path.exists(env_file): 79 | load_dotenv(env_file) 80 | 81 | runner = DSLRunner.from_yaml(str(yaml_file), max_threads=max_threads) 82 | runner.load_run_save() 83 | 84 | 85 | @app.command() 86 | def clear_cache(): 87 | """ 88 | Clear the LLM cache stored on disk. 89 | """ 90 | cc() 91 | 92 | 93 | @app.command() 94 | def version(): 95 | """ 96 | Display the current version of DocETL. 97 | """ 98 | import docetl 99 | 100 | typer.echo(f"DocETL version: {docetl.__version__}") 101 | 102 | 103 | if __name__ == "__main__": 104 | app() 105 | -------------------------------------------------------------------------------- /docetl/operations/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | from docetl.operations.cluster import ClusterOperation 3 | from docetl.operations.code_operations import CodeFilterOperation, CodeMapOperation, CodeReduceOperation 4 | from docetl.operations.equijoin import EquijoinOperation 5 | from docetl.operations.filter import FilterOperation 6 | from docetl.operations.gather import GatherOperation 7 | from docetl.operations.map import MapOperation 8 | from docetl.operations.reduce import ReduceOperation 9 | from docetl.operations.resolve import ResolveOperation 10 | from docetl.operations.rank import RankOperation 11 | from docetl.operations.split import SplitOperation 12 | from docetl.operations.sample import SampleOperation 13 | from docetl.operations.unnest import UnnestOperation 14 | from docetl.operations.scan import ScanOperation 15 | from docetl.operations.add_uuid import AddUuidOperation 16 | from docetl.operations.extract import ExtractOperation 17 | 18 | mapping = { 19 | "cluster": ClusterOperation, 20 | "code_filter": CodeFilterOperation, 21 | "code_map": CodeMapOperation, 22 | "code_reduce": CodeReduceOperation, 23 | "equijoin": EquijoinOperation, 24 | "filter": FilterOperation, 25 | "gather": GatherOperation, 26 | "map": MapOperation, 27 | "reduce": ReduceOperation, 28 | "resolve": ResolveOperation, 29 | "rank": RankOperation, 30 | "split": SplitOperation, 31 | "sample": SampleOperation, 32 | "unnest": UnnestOperation, 33 | "scan": ScanOperation, 34 | "add_uuid": AddUuidOperation, 35 | "extract": ExtractOperation 36 | } 37 | 38 | def 
get_operation(operation_type: str): 39 | """Loads a single operation by name""" 40 | try: 41 | entrypoint = importlib.metadata.entry_points(group="docetl.operation")[ 42 | operation_type 43 | ] 44 | return entrypoint.load() 45 | except KeyError: 46 | if operation_type in mapping: 47 | return mapping[operation_type] 48 | raise KeyError(f"Unrecognized operation {operation_type}") 49 | 50 | def get_operations(): 51 | """Load all available operations and return them as a dictionary""" 52 | operations = mapping.copy() 53 | operations.update({ 54 | op.name: op.load() 55 | for op in importlib.metadata.entry_points(group="docetl.operation") 56 | }) 57 | return operations 58 | -------------------------------------------------------------------------------- /docetl/operations/add_uuid.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import Any, Dict, List, Tuple 3 | 4 | from docetl.operations.base import BaseOperation 5 | 6 | 7 | class AddUuidOperation(BaseOperation): 8 | """ 9 | A class that implements an operation to add a UUID to each document. 10 | 11 | This class extends BaseOperation to: 12 | 1. Generate a unique UUID for each document 13 | 2. Add the UUID under a key formatted as {operation_name}_id 14 | """ 15 | 16 | class schema(BaseOperation.schema): 17 | type: str = "add_uuid" 18 | 19 | def __init__(self, *args, **kwargs): 20 | super().__init__(*args, **kwargs) 21 | self.name = self.config["name"] 22 | 23 | def syntax_check(self) -> None: 24 | # No additional configuration needed beyond base requirements 25 | pass 26 | 27 | def execute( 28 | self, input_data: List[Dict[str, Any]] 29 | ) -> Tuple[List[Dict[str, Any]], float]: 30 | results = [] 31 | cost = 0.0 32 | 33 | # If there's an id key in the config, use that as the id key 34 | if "id_key" in self.config: 35 | id_key = self.config["id_key"] 36 | else: 37 | id_key = f"{self.name}_id" 38 | 39 | for item in input_data: 40 | result = item.copy() 41 | result[id_key] = str(uuid.uuid4()) 42 | results.append(result) 43 | 44 | return results, cost 45 | -------------------------------------------------------------------------------- /docetl/operations/clustering_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains utilities for clustering based on different methods. 3 | 4 | We use these in map and reduce operations. 
5 | """ 6 | 7 | from typing import Dict, List, Tuple 8 | 9 | from docetl.operations.utils import APIWrapper 10 | from docetl.utils import completion_cost 11 | 12 | 13 | def get_embeddings_for_clustering( 14 | items: List[Dict], sampling_config: Dict, api_wrapper: APIWrapper 15 | ) -> Tuple[List[List[float]], float]: 16 | embedding_model = sampling_config.get("embedding_model", "text-embedding-3-small") 17 | embedding_keys = sampling_config.get("embedding_keys") 18 | if not embedding_keys: 19 | embedding_keys = list(items[0].keys()) 20 | 21 | if embedding_model == "sentence-transformer": 22 | return get_embeddings_for_clustering_with_st(items, embedding_keys) 23 | 24 | embeddings = [] 25 | cost = 0 26 | batch_size = 1000 27 | 28 | for i in range(0, len(items), batch_size): 29 | batch = items[i : i + batch_size] 30 | texts = [ 31 | " ".join(str(item[key]) for key in embedding_keys if key in item)[:10000] 32 | for item in batch 33 | ] 34 | response = api_wrapper.gen_embedding(embedding_model, texts) 35 | embeddings.extend([data["embedding"] for data in response["data"]]) 36 | cost += completion_cost(response) 37 | 38 | return embeddings, cost 39 | 40 | 41 | def get_embeddings_for_clustering_with_st( 42 | items: List[Dict], embedding_keys: List[str] 43 | ) -> Tuple[List[List[float]], float]: 44 | import torch 45 | from sentence_transformers import SentenceTransformer 46 | 47 | device = "cpu" 48 | if torch.backends.mps.is_available(): 49 | device = "mps" 50 | elif torch.cuda.is_available(): 51 | device = "cuda" 52 | 53 | model = SentenceTransformer("all-MiniLM-L6-v2", device=device) 54 | embeddings = model.encode( 55 | [ 56 | " ".join(str(item[key]) for key in embedding_keys if key in item)[:10000] 57 | for item in items 58 | ] 59 | ) 60 | return embeddings, 0 61 | 62 | 63 | def cluster_documents( 64 | documents: List[Dict], 65 | sampling_config: Dict, 66 | sample_size: int, 67 | api_wrapper: APIWrapper, 68 | ) -> Tuple[Dict[int, List[Dict]], float]: 69 | """ 70 | Cluster documents using KMeans clustering algorithm. 71 | 72 | Args: 73 | documents (List[Dict]): The list of documents to cluster. 74 | sampling_config (Dict): The sampling configuration. Must contain embedding_model. If embedding_keys is not specified, it will use all keys in the document. If embedding_model is not specified, it will use text-embedding-3-small. If embedding_model is sentence-transformer, it will use all-MiniLM-L6-v2. 75 | sample_size (int): The number of clusters to create. 76 | api_wrapper (APIWrapper): The API wrapper to use for embedding. 77 | Returns: 78 | Dict[int, List[Dict]]: A dictionary of clusters, where each cluster is a list of documents. 
79 | """ 80 | embeddings, cost = get_embeddings_for_clustering( 81 | documents, sampling_config, api_wrapper 82 | ) 83 | 84 | from sklearn.cluster import KMeans 85 | 86 | num_clusters = min(sample_size, len(documents)) 87 | kmeans = KMeans(n_clusters=num_clusters, random_state=42) 88 | cluster_labels = kmeans.fit_predict(embeddings) 89 | 90 | clusters = {i: [] for i in range(num_clusters)} 91 | for idx, label in enumerate(cluster_labels): 92 | clusters[label].append(documents[idx]) 93 | 94 | return clusters, cost 95 | -------------------------------------------------------------------------------- /docetl/operations/scan.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | from docetl.operations.base import BaseOperation 4 | 5 | 6 | class ScanOperation(BaseOperation): 7 | class schema(BaseOperation.schema): 8 | dataset_name: str 9 | 10 | def syntax_check(self) -> None: 11 | """Validate the scan operation configuration.""" 12 | super().syntax_check() 13 | 14 | def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]: 15 | """ 16 | Execute the scan operation to load data from the configured source. 17 | 18 | Args: 19 | input_data: Not used in scan operation 20 | 21 | Returns: 22 | Tuple[List[Dict], float]: Loaded data and cost (0 for scan) 23 | """ 24 | 25 | # Look in the runner.datasets objects 26 | if self.config["dataset_name"] not in self.runner.datasets: 27 | raise ValueError(f"Dataset {self.config['dataset_name']} not found") 28 | 29 | return ( 30 | self.runner.datasets[self.config["dataset_name"]].load(), 31 | 0.0, 32 | ) # Scan has no LLM cost 33 | -------------------------------------------------------------------------------- /docetl/operations/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .api import APIWrapper 2 | from .cache import ( 3 | cache, 4 | cache_key, 5 | clear_cache, 6 | flush_cache, 7 | freezeargs, 8 | CACHE_DIR, 9 | LLM_CACHE_DIR, 10 | DOCETL_HOME_DIR, 11 | ) 12 | from .llm import LLMResult, InvalidOutputError, truncate_messages 13 | from .progress import RichLoopBar, rich_as_completed 14 | from .validation import safe_eval, convert_val, convert_dict_schema_to_list_schema, get_user_input_for_schema, strict_render 15 | 16 | __all__ = [ 17 | 'APIWrapper', 18 | 'cache', 19 | 'cache_key', 20 | 'clear_cache', 21 | 'flush_cache', 22 | 'freezeargs', 23 | 'CACHE_DIR', 24 | 'LLM_CACHE_DIR', 25 | 'DOCETL_HOME_DIR', 26 | 'LLMResult', 27 | 'InvalidOutputError', 28 | 'RichLoopBar', 29 | 'rich_as_completed', 30 | 'safe_eval', 31 | 'convert_val', 32 | 'convert_dict_schema_to_list_schema', 33 | 'get_user_input_for_schema', 34 | 'truncate_messages', 35 | "strict_render" 36 | ] -------------------------------------------------------------------------------- /docetl/operations/utils/cache.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import hashlib 3 | import json 4 | import os 5 | import shutil 6 | from typing import Any, Dict, List 7 | 8 | from diskcache import Cache 9 | from dotenv import load_dotenv 10 | from frozendict import frozendict 11 | from rich.console import Console 12 | 13 | from docetl.console import DOCETL_CONSOLE 14 | 15 | load_dotenv() 16 | 17 | DOCETL_HOME_DIR = ( 18 | os.environ.get("DOCETL_HOME_DIR", os.path.expanduser("~")) + "/.cache/docetl" 19 | ) 20 | CACHE_DIR = os.path.join(DOCETL_HOME_DIR, "general") 21 | LLM_CACHE_DIR = 
os.path.join(DOCETL_HOME_DIR, "llm") 22 | cache = Cache(LLM_CACHE_DIR) 23 | cache.close() 24 | 25 | 26 | def freezeargs(func): 27 | """ 28 | Decorator to convert mutable dictionary arguments into immutable. 29 | """ 30 | 31 | @functools.wraps(func) 32 | def wrapped(*args, **kwargs): 33 | args = tuple( 34 | ( 35 | frozendict(arg) 36 | if isinstance(arg, dict) 37 | else json.dumps(arg) if isinstance(arg, list) else arg 38 | ) 39 | for arg in args 40 | ) 41 | kwargs = { 42 | k: ( 43 | frozendict(v) 44 | if isinstance(v, dict) 45 | else json.dumps(v) if isinstance(v, list) else v 46 | ) 47 | for k, v in kwargs.items() 48 | } 49 | return func(*args, **kwargs) 50 | 51 | return wrapped 52 | 53 | 54 | def flush_cache(console: Console = DOCETL_CONSOLE): 55 | """Flush the cache to disk.""" 56 | console.log("[bold green]Flushing cache to disk...[/bold green]") 57 | cache.close() 58 | console.log("[bold green]Cache flushed to disk.[/bold green]") 59 | 60 | 61 | def clear_cache(console: Console = DOCETL_CONSOLE): 62 | """Clear the LLM cache stored on disk.""" 63 | console.log("[bold yellow]Clearing LLM cache...[/bold yellow]") 64 | try: 65 | with cache as c: 66 | c.clear() 67 | # Remove all files in the cache directory 68 | if not os.path.exists(CACHE_DIR): 69 | os.makedirs(CACHE_DIR) 70 | for filename in os.listdir(CACHE_DIR): 71 | file_path = os.path.join(CACHE_DIR, filename) 72 | try: 73 | if os.path.isfile(file_path): 74 | os.unlink(file_path) 75 | elif os.path.isdir(file_path): 76 | shutil.rmtree(file_path) 77 | except Exception as e: 78 | console.log( 79 | f"[bold red]Error deleting {file_path}: {str(e)}[/bold red]" 80 | ) 81 | console.log("[bold green]Cache cleared successfully.[/bold green]") 82 | except Exception as e: 83 | console.log(f"[bold red]Error clearing cache: {str(e)}[/bold red]") 84 | 85 | 86 | def cache_key( 87 | model: str, 88 | op_type: str, 89 | messages: List[Dict[str, str]], 90 | output_schema: Dict[str, str], 91 | scratchpad: str = None, 92 | system_prompt: Dict[str, str] = None, 93 | op_config: Dict[str, Any] = {}, 94 | ) -> str: 95 | """Generate a unique cache key based on function arguments.""" 96 | key_dict = { 97 | "model": model, 98 | "op_type": op_type, 99 | "messages": json.dumps(messages, sort_keys=True), 100 | "output_schema": json.dumps(output_schema, sort_keys=True), 101 | "scratchpad": scratchpad, 102 | "system_prompt": json.dumps(system_prompt, sort_keys=True), 103 | "op_config": json.dumps(op_config, sort_keys=True), 104 | } 105 | return hashlib.md5(json.dumps(key_dict, sort_keys=True).encode()).hexdigest() 106 | -------------------------------------------------------------------------------- /docetl/operations/utils/progress.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import as_completed 2 | from typing import Iterable, Optional, Union 3 | 4 | from tqdm import tqdm 5 | 6 | 7 | class RichLoopBar: 8 | """A progress bar class that integrates with Rich console.""" 9 | 10 | def __init__( 11 | self, 12 | iterable: Optional[Union[Iterable, range]] = None, 13 | total: Optional[int] = None, 14 | desc: Optional[str] = None, 15 | leave: bool = True, 16 | console=None, 17 | ): 18 | if console is None: 19 | raise ValueError("Console must be provided") 20 | self.console = console 21 | self.iterable = iterable 22 | self.total = self._get_total(iterable, total) 23 | self.description = desc 24 | self.leave = leave 25 | self.tqdm = None 26 | 27 | def _get_total(self, iterable, total): 28 | if total is not 
None: 29 | return total 30 | if isinstance(iterable, range): 31 | return len(iterable) 32 | try: 33 | return len(iterable) 34 | except TypeError: 35 | return None 36 | 37 | def __iter__(self): 38 | self.tqdm = tqdm( 39 | self.iterable, 40 | total=self.total, 41 | desc=self.description, 42 | file=self.console.file, 43 | ) 44 | for item in self.tqdm: 45 | yield item 46 | 47 | def __enter__(self): 48 | self.tqdm = tqdm( 49 | total=self.total, 50 | desc=self.description, 51 | leave=self.leave, 52 | file=self.console.file, 53 | ) 54 | return self 55 | 56 | def __exit__(self, exc_type, exc_val, exc_tb): 57 | self.tqdm.close() 58 | 59 | def update(self, n=1): 60 | if self.tqdm: 61 | self.tqdm.update(n) 62 | 63 | 64 | def rich_as_completed(futures, total=None, desc=None, leave=True, console=None): 65 | """Yield completed futures with a Rich progress bar.""" 66 | if console is None: 67 | raise ValueError("Console must be provided") 68 | 69 | with RichLoopBar(total=total, desc=desc, leave=leave, console=console) as pbar: 70 | for future in as_completed(futures): 71 | yield future 72 | pbar.update() 73 | -------------------------------------------------------------------------------- /docetl/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from docetl.optimizers.join_optimizer import JoinOptimizer 2 | from docetl.optimizers.map_optimizer import MapOptimizer 3 | from docetl.optimizers.reduce_optimizer import ReduceOptimizer 4 | 5 | __all__ = ["JoinOptimizer", "MapOptimizer", "ReduceOptimizer"] -------------------------------------------------------------------------------- /docetl/optimizers/map_optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from docetl.optimizers.map_optimizer.optimizer import MapOptimizer 2 | 3 | __all__ = ["MapOptimizer"] 4 | -------------------------------------------------------------------------------- /docetl/ratelimiter.py: -------------------------------------------------------------------------------- 1 | import math 2 | from inspect import isawaitable 3 | from typing import Any, Dict 4 | 5 | import pyrate_limiter 6 | 7 | 8 | class BucketCollection(pyrate_limiter.BucketFactory): 9 | def __init__(self, **buckets): 10 | self.clock = pyrate_limiter.TimeClock() 11 | self.buckets = buckets 12 | 13 | def wrap_item(self, name: str, weight: int = 1) -> pyrate_limiter.RateItem: 14 | now = self.clock.now() 15 | 16 | async def wrap_async(): 17 | return pyrate_limiter.RateItem(name, await now, weight=weight) 18 | 19 | def wrap_sync(): 20 | return pyrate_limiter.RateItem(name, now, weight=weight) 21 | 22 | return wrap_async() if isawaitable(now) else wrap_sync() 23 | 24 | def get(self, item: pyrate_limiter.RateItem) -> pyrate_limiter.AbstractBucket: 25 | if item.name not in self.buckets: 26 | return self.buckets["unknown"] 27 | return self.buckets[item.name] 28 | 29 | 30 | def create_bucket_factory(rate_limits: Dict[str, Any]) -> BucketCollection: 31 | """ 32 | Create a BucketCollection from rate limits configuration. 
33 | 34 | Args: 35 | rate_limits: Dictionary containing rate limit configuration 36 | 37 | Returns: 38 | BucketCollection configured with the specified rate limits 39 | """ 40 | buckets = { 41 | param: pyrate_limiter.InMemoryBucket( 42 | [ 43 | pyrate_limiter.Rate( 44 | param_limit["count"], 45 | param_limit["per"] 46 | * getattr( 47 | pyrate_limiter.Duration, 48 | param_limit.get("unit", "SECOND").upper(), 49 | ), 50 | ) 51 | for param_limit in param_limits 52 | ] 53 | ) 54 | for param, param_limits in rate_limits.items() 55 | } 56 | 57 | # Add default bucket for unknown parameters 58 | buckets["unknown"] = pyrate_limiter.InMemoryBucket( 59 | [pyrate_limiter.Rate(math.inf, 1)] 60 | ) 61 | 62 | return BucketCollection(**buckets) 63 | -------------------------------------------------------------------------------- /docetl/schemas.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from . import dataset 4 | 5 | # ruff: noqa: F403 6 | from .base_schemas import * 7 | from .operations import ( 8 | cluster, 9 | equijoin, 10 | filter, 11 | gather, 12 | map, 13 | reduce, 14 | resolve, 15 | sample, 16 | split, 17 | unnest, 18 | ) 19 | 20 | MapOp = map.MapOperation.schema 21 | ResolveOp = resolve.ResolveOperation.schema 22 | ReduceOp = reduce.ReduceOperation.schema 23 | ParallelMapOp = map.ParallelMapOperation.schema 24 | FilterOp = filter.FilterOperation.schema 25 | EquijoinOp = equijoin.EquijoinOperation.schema 26 | SplitOp = split.SplitOperation.schema 27 | GatherOp = gather.GatherOperation.schema 28 | UnnestOp = unnest.UnnestOperation.schema 29 | ClusterOp = cluster.ClusterOperation.schema 30 | SampleOp = sample.SampleOperation.schema 31 | 32 | OpType = Union[ 33 | MapOp, 34 | ResolveOp, 35 | ReduceOp, 36 | ParallelMapOp, 37 | FilterOp, 38 | EquijoinOp, 39 | SplitOp, 40 | GatherOp, 41 | UnnestOp, 42 | ] 43 | 44 | Dataset = dataset.Dataset.schema 45 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | docetl: 3 | container_name: docetl-docwrangler-stack 4 | build: . 5 | image: docetl 6 | restart: unless-stopped 7 | healthcheck: 8 | test: ["CMD", "curl", "-f", "http://localhost:8000/health"] 9 | interval: 30s 10 | timeout: 10s 11 | retries: 3 12 | start_period: 40s 13 | ports: 14 | # Map host ports to container ports using environment variables. 15 | # If FRONTEND_DOCKER_COMPOSE_PORT is not set, default to 3031. 16 | - "${FRONTEND_DOCKER_COMPOSE_PORT:-3031}:3000" 17 | # If BACKEND_DOCKER_COMPOSE_PORT is not set, default to 8081. 18 | - "${BACKEND_DOCKER_COMPOSE_PORT:-8081}:8000" 19 | environment: 20 | # Pass environment variables from the .env file (or host environment) 21 | # with default values if they are not defined. 
22 | # Authentication 23 | - OPENAI_API_KEY=${OPENAI_API_KEY:-your_api_key_here} 24 | 25 | # Backend Configuration 26 | - BACKEND_ALLOW_ORIGINS=${BACKEND_ALLOW_ORIGINS:-http://localhost:3000,http://127.0.0.1:3000} 27 | - BACKEND_HOST=${BACKEND_HOST:-0.0.0.0} 28 | - BACKEND_PORT=${BACKEND_PORT:-8000} 29 | - BACKEND_RELOAD=${BACKEND_RELOAD:-True} 30 | 31 | # Frontend Configuration 32 | - FRONTEND_HOST=${FRONTEND_HOST:-0.0.0.0} 33 | - FRONTEND_PORT=${FRONTEND_PORT:-3000} 34 | 35 | # File Processing 36 | - TEXT_FILE_ENCODINGS=${TEXT_FILE_ENCODINGS:-utf-8,latin1,cp1252,iso-8859-1} 37 | volumes: 38 | # Mount the named volume "docetl-data" to /docetl-data in the container. 39 | - docetl-data:/docetl-data 40 | 41 | docetl-aws: 42 | extends: 43 | service: docetl 44 | environment: 45 | - AWS_PROFILE=${AWS_PROFILE:-default} 46 | - AWS_REGION=${AWS_REGION:-us-west-2} 47 | volumes: 48 | - ~/.aws:/root/.aws:ro 49 | profiles: 50 | - aws 51 | 52 | volumes: 53 | docetl-data: 54 | -------------------------------------------------------------------------------- /docs/advanced/custom-operators.md: -------------------------------------------------------------------------------- 1 | TODO: Support UDFs. 2 | -------------------------------------------------------------------------------- /docs/advanced/extending-agents.md: -------------------------------------------------------------------------------- 1 | TODO: Add guide for extending agents (after preprint release). 2 | -------------------------------------------------------------------------------- /docs/advanced/performance-tuning.md: -------------------------------------------------------------------------------- 1 | TODO: Add performance tuning guide. 2 | -------------------------------------------------------------------------------- /docs/api-reference/cli.md: -------------------------------------------------------------------------------- 1 | ::: docetl.cli.run 2 | options: 3 | show_root_heading: true 4 | heading_level: 3 5 | show_if_no_docstring: false 6 | docstring_options: 7 | ignore_init_summary: false 8 | trim_doctest_flags: true 9 | 10 | ::: docetl.cli.build 11 | options: 12 | show_root_heading: true 13 | heading_level: 3 14 | show_if_no_docstring: false 15 | docstring_options: 16 | ignore_init_summary: false 17 | trim_doctest_flags: true 18 | 19 | ::: docetl.cli.clear_cache 20 | options: 21 | show_root_heading: true 22 | heading_level: 3 23 | show_if_no_docstring: false 24 | docstring_options: 25 | ignore_init_summary: false 26 | trim_doctest_flags: true -------------------------------------------------------------------------------- /docs/api-reference/docetl.md: -------------------------------------------------------------------------------- 1 | ::: docetl.DSLRunner 2 | options: 3 | show_root_heading: true 4 | heading_level: 3 5 | show_if_no_docstring: false 6 | docstring_options: 7 | ignore_init_summary: false 8 | trim_doctest_flags: true 9 | 10 | ::: docetl.Optimizer 11 | options: 12 | show_root_heading: true 13 | heading_level: 3 14 | show_if_no_docstring: false 15 | docstring_options: 16 | ignore_init_summary: false 17 | trim_doctest_flags: true 18 | -------------------------------------------------------------------------------- /docs/api-reference/operations.md: -------------------------------------------------------------------------------- 1 | # LLM-Powered Operators 2 | 3 | ::: docetl.operations.map.MapOperation 4 | options: 5 | show_root_heading: true 6 | heading_level: 3 7 | show_if_no_docstring: false 8 | docstring_options: 9 | 
ignore_init_summary: false 10 | trim_doctest_flags: true 11 | 12 | ::: docetl.operations.resolve.ResolveOperation 13 | options: 14 | show_root_heading: true 15 | heading_level: 3 16 | show_if_no_docstring: false 17 | docstring_options: 18 | ignore_init_summary: false 19 | trim_doctest_flags: true 20 | 21 | ::: docetl.operations.reduce.ReduceOperation 22 | options: 23 | show_root_heading: true 24 | heading_level: 3 25 | show_if_no_docstring: false 26 | docstring_options: 27 | ignore_init_summary: false 28 | trim_doctest_flags: true 29 | 30 | ::: docetl.operations.map.ParallelMapOperation 31 | options: 32 | show_root_heading: true 33 | heading_level: 3 34 | show_if_no_docstring: false 35 | docstring_options: 36 | ignore_init_summary: false 37 | trim_doctest_flags: true 38 | 39 | ::: docetl.operations.filter.FilterOperation 40 | options: 41 | show_root_heading: true 42 | heading_level: 3 43 | show_if_no_docstring: false 44 | docstring_options: 45 | ignore_init_summary: false 46 | trim_doctest_flags: true 47 | 48 | ::: docetl.operations.equijoin.EquijoinOperation 49 | options: 50 | show_root_heading: true 51 | heading_level: 3 52 | show_if_no_docstring: false 53 | docstring_options: 54 | ignore_init_summary: false 55 | trim_doctest_flags: true 56 | 57 | ::: docetl.operations.cluster.ClusterOperation 58 | options: 59 | show_root_heading: true 60 | heading_level: 3 61 | show_if_no_docstring: false 62 | docstring_options: 63 | ignore_init_summary: false 64 | trim_doctest_flags: true 65 | 66 | # Auxiliary Operators 67 | 68 | ::: docetl.operations.split.SplitOperation 69 | options: 70 | show_root_heading: true 71 | heading_level: 3 72 | show_if_no_docstring: false 73 | docstring_options: 74 | ignore_init_summary: false 75 | trim_doctest_flags: true 76 | 77 | ::: docetl.operations.gather.GatherOperation 78 | options: 79 | show_root_heading: true 80 | heading_level: 3 81 | show_if_no_docstring: false 82 | docstring_options: 83 | ignore_init_summary: false 84 | trim_doctest_flags: true 85 | 86 | ::: docetl.operations.unnest.UnnestOperation 87 | options: 88 | show_root_heading: true 89 | heading_level: 3 90 | show_if_no_docstring: false 91 | docstring_options: 92 | ignore_init_summary: false 93 | trim_doctest_flags: true -------------------------------------------------------------------------------- /docs/api-reference/optimizers.md: -------------------------------------------------------------------------------- 1 | ::: docetl.optimizers.map_optimizer.optimizer.MapOptimizer 2 | options: 3 | show_root_heading: true 4 | heading_level: 3 5 | show_if_no_docstring: false 6 | docstring_options: 7 | ignore_init_summary: false 8 | trim_doctest_flags: true 9 | 10 | ::: docetl.optimizers.reduce_optimizer.ReduceOptimizer 11 | options: 12 | show_root_heading: true 13 | heading_level: 3 14 | show_if_no_docstring: false 15 | docstring_options: 16 | ignore_init_summary: false 17 | trim_doctest_flags: true 18 | 19 | ::: docetl.optimizers.join_optimizer.JoinOptimizer 20 | options: 21 | show_root_heading: true 22 | heading_level: 3 23 | show_if_no_docstring: false -------------------------------------------------------------------------------- /docs/assets/docetl-favicon-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/docetl-favicon-color.png -------------------------------------------------------------------------------- /docs/assets/headerdiagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/headerdiagram.png -------------------------------------------------------------------------------- /docs/assets/readmefig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/readmefig.png -------------------------------------------------------------------------------- /docs/assets/tutorial/add-notes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/add-notes.png -------------------------------------------------------------------------------- /docs/assets/tutorial/dataset-view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/dataset-view.png -------------------------------------------------------------------------------- /docs/assets/tutorial/initial-outputs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/initial-outputs.png -------------------------------------------------------------------------------- /docs/assets/tutorial/one-operation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/one-operation.png -------------------------------------------------------------------------------- /docs/assets/tutorial/operation-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/operation-details.png -------------------------------------------------------------------------------- /docs/assets/tutorial/prompt-improvement.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/prompt-improvement.png -------------------------------------------------------------------------------- /docs/assets/tutorial/prompt-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/prompt-v2.png -------------------------------------------------------------------------------- /docs/community/roadmap.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | !!! info "Join Our Working Groups" 4 | 5 | Are you interested in contributing to any of these projects or have ideas for new areas of exploration? Join our [Discord server](https://discord.gg/fHp7B2X3xx) to participate in our working groups and collaborate with the community! 6 | 7 | We're constantly working to improve DocETL and explore new possibilities in document processing. 
Our current ideas span both research and engineering problems, and are organized into the following categories:

```mermaid
mindmap
  root((DocETL Roadmap))
    User Interface and Interaction
    Debugging and Optimization
    Model and Tool Integrations
    Agents and Planning
```

## User Interface and Interaction

- **Natural Language to DocETL Pipeline**: Building tools to generate DocETL pipelines from natural language descriptions.
- **Interactive Pipeline Creation**: Developing intuitive interfaces for creating and optimizing DocETL pipelines interactively.

## Debugging and Optimization

- **DocETL Debugger**: Creating a debugger with provenance tracking, allowing users to visualize all intermediates that contributed to a specific output.
- **Plan Efficiency Optimization**: Implementing strategies (and devising new ones) to reduce latency and cost for the most accurate plans. This includes batching LLM calls, using model cascades, and fusing operators.

## Model and Tool Integrations

- **Model Diversity**: Extending support beyond OpenAI to include a wider range of models, with a focus on local models.
- **OCR and PDF Extraction**: Improving integration with OCR technologies and PDF extraction tools for more robust document processing.
- **Multimodal Data Processing**: Enhancing DocETL to handle multimodal data, including text, images, audio, and video (many LLMs already support multimodal inputs).

## Agents and Planning

- **Smarter Agent and Planning Architectures**: Optimizing plan exploration based on data characteristics. For instance, refining the optimizer to avoid unnecessary exploration of plans with the [gather operator](../operators/gather.md) for tasks that don't require peripheral context when decomposing map operations for large documents.

- **Context-Aware Sampling for Validation**: Creating algorithms that can identify and extract the most representative samples from different parts of a document, including the beginning, middle, and end, to use in validation prompts. This approach will help validation agents verify that all sections of documents are adequately represented in the outputs, avoiding blind spots caused by truncation, since we currently truncate the middle of documents in validation prompts.

- **Benchmarks**: Developing a suite of benchmarks to evaluate the performance of different optimization strategies and agent architectures. These benchmarks will help us understand the trade-offs between accuracy, efficiency, and cost in different scenarios, guiding the development of more effective optimization techniques.
-------------------------------------------------------------------------------- /docs/examples/annotating-legal-documents.md: --------------------------------------------------------------------------------
TODO
-------------------------------------------------------------------------------- /docs/examples/characterizing-troll-behavior.md: --------------------------------------------------------------------------------
TODO
-------------------------------------------------------------------------------- /docs/examples/rate-limiting.md: --------------------------------------------------------------------------------
# Rate Limiting

When using DocETL, you might have rate limits based on your usage tier with various API providers.
To help manage these limits and prevent exceeding them, DocETL allows you to configure rate limits in your YAML configuration file.

## Configuring Rate Limits

You can add rate limits to your YAML config by including a `rate_limits` key with specific configurations for different types of API calls. Here's an example of how to set up rate limits:

```yaml
rate_limits:
  embedding_call:
    - count: 1000
      per: 1
      unit: second
  llm_call:
    - count: 1
      per: 1
      unit: second
    - count: 10
      per: 5
      unit: hour
  llm_tokens:
    - count: 1000000
      per: 1
      unit: minute
```

This example sets limits for embedding calls and language model (LLM) calls, with multiple rules for LLM calls to accommodate different time scales.

You can also use rate limits in the Python API by passing a `rate_limits` dictionary when you initialize the `Pipeline` object.
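For example, here is a minimal sketch of the Python-API equivalent of the YAML above. The pipeline pieces are placeholder examples, and the `rate_limits` dictionary is assumed to mirror the YAML structure exactly:

```python
from docetl.api import Pipeline, Dataset, MapOp, PipelineStep, PipelineOutput

# Mirrors the YAML rate_limits block above: each entry caps how many
# calls (or tokens) are allowed per time window.
rate_limits = {
    "embedding_call": [{"count": 1000, "per": 1, "unit": "second"}],
    "llm_call": [
        {"count": 1, "per": 1, "unit": "second"},
        {"count": 10, "per": 5, "unit": "hour"},
    ],
    "llm_tokens": [{"count": 1000000, "per": 1, "unit": "minute"}],
}

pipeline = Pipeline(
    name="rate_limited_pipeline",
    datasets={"docs": Dataset(type="file", path="input.json")},
    operations=[
        MapOp(
            name="classify",
            type="map",
            prompt="Classify this document: {{ input.text }}",
            output={"schema": {"category": "string"}},
        )
    ],
    steps=[PipelineStep(name="classify_step", input="docs", operations=["classify"])],
    output=PipelineOutput(type="file", path="output.json"),
    default_model="gpt-4o-mini",
    rate_limits=rate_limits,
)
```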
-------------------------------------------------------------------------------- /docs/execution/running-pipelines.md: --------------------------------------------------------------------------------
# Additional Notes

Here are some additional notes to help you get the most out of your pipeline:

- **Sampling Operations**: If you want to run an operation on a random sample of your data, you can set the `sample` parameter for that operation. For example:

```yaml
operations:
  extract_medications:
    sample: 100 # This will run the operation on a random sample of 100 items
    # ... rest of the operation configuration
```

- **Caching**: DocETL caches the results of operations by default. This means that if you run the same operation on the same data multiple times, the results will be retrieved from the cache rather than being recomputed. You can clear the cache by running `docetl clear-cache`.

- **The `run` Function**: The main entry point for running a pipeline is the `run` function in `docetl/cli.py`. Here's a description of its parameters and functionality:

::: docetl.cli.run
    handler: python
    options:
      members:
        - run
      show_root_full_path: true
      show_root_toc_entry: true
      show_root_heading: true
      show_source: false
      show_name: true

- **Intermediate Output**: If you provide an intermediate directory in your configuration, the outputs of each operation will be saved to this directory. This allows you to inspect the results of individual steps in the pipeline and can be useful for debugging or analyzing the pipeline's progress. Set the `intermediate_dir` parameter in your pipeline's output configuration to specify the directory where intermediate results should be saved (a sketch for inspecting these files follows the example); e.g.,

```yaml
pipeline:
  output:
    type: file
    path: ...
    intermediate_dir: intermediate_results
```
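To spot-check those intermediate files programmatically, a minimal sketch along these lines works. It assumes the default layout of one JSON array of records per operation under the intermediate directory; adjust the glob if your layout differs:

```python
import json
from pathlib import Path

# Walk the intermediate directory and summarize each operation's output.
for path in sorted(Path("intermediate_results").rglob("*.json")):
    records = json.loads(path.read_text())
    print(f"{path}: {len(records)} records")
    if records:
        print("  sample keys:", sorted(records[0].keys()))
```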
-------------------------------------------------------------------------------- /docs/index.md: --------------------------------------------------------------------------------
# 📜 DocETL: A System for Complex Document Processing

[GitHub](https://github.com/ucbepic/docetl)
[Website](https://docetl.org)
[Documentation](https://ucbepic.github.io/docetl)
[Discord](https://discord.gg/fHp7B2X3xx)
[Paper](https://arxiv.org/abs/2410.12189)

DocETL is a tool for creating and executing LLM-powered data processing pipelines. It offers a low-code, declarative YAML interface for defining complex data processing operations.

!!! tip "When to Use DocETL"

    DocETL is the ideal choice when you're looking to **maximize correctness and output quality** for complex tasks over a collection of documents or unstructured datasets. You should consider using DocETL if:

    - You have complex tasks that you want to represent via map-reduce (e.g., map over your documents, then group by the result of your map call & reduce)
    - You're unsure how to best write your pipeline or sequence of operations to maximize LLM accuracy
    - You're working with long documents that don't fit into a single prompt or are too lengthy for effective LLM reasoning
    - You have validation criteria and want tasks to automatically retry when the validation fails

## 🚀 Features

- **Rich Suite of Operators**: Tailored for complex data processing, including specialized operators like "resolve" for entity resolution and "gather" for maintaining context when splitting documents.
- **Low-Code Interface**: Define your pipeline and prompts easily using YAML. You have 100% control over the prompts.
- **Flexible Processing**: Handle various document types and processing tasks across domains like law, medicine, and social sciences.
- **Accuracy Optimization**: Our optimizer leverages LLM agents to experiment with different logically-equivalent rewrites of your pipeline and automatically selects the most accurate version. This includes finding the limits of how many documents to process in a single reduce operation before accuracy plateaus.

## ⚡ Getting Started

To get started with DocETL:

1. Install the package (see [installation](installation.md) for detailed instructions)
2. Define your pipeline in a YAML file. Want to use an LLM like ChatGPT or Claude to help you write your pipeline? See [docetl.org/llms.txt](https://docetl.org/llms.txt) for a big prompt you can copy and paste into ChatGPT or Claude before describing your task.
3. Run your pipeline using the DocETL command-line interface

## 🏛️ Project Origin

DocETL was created by members of the EPIC Data Lab and the Data Systems and Foundations group at UC Berkeley. The EPIC (Effective Programming, Interaction, and Computation with Data) Lab focuses on developing low-code and no-code interfaces for data work, powered by next-generation predictive programming techniques. DocETL is one of the projects that emerged from our research efforts to streamline complex document processing tasks.

For more information about the labs and other projects, visit the [EPIC Lab webpage](https://epic.berkeley.edu/) and the [Data Systems and Foundations webpage](https://dsf.berkeley.edu/).
-------------------------------------------------------------------------------- /docs/installation.md: --------------------------------------------------------------------------------
# Installation

DocETL can be installed using pip, Python's package installer, or from source. Follow these steps to get DocETL up and running on your system:

## 🛠️ Prerequisites

Before installing DocETL, ensure you have Python 3.10 or later installed on your system. You can check your Python version by running `python --version`.

## 📦 Installation via pip

1. Install DocETL using pip:

```bash
pip install docetl
```

If you want to use the parsing tools, you need to install the `parsing` extra:

```bash
pip install docetl[parsing]
```

This command installs DocETL along with its dependencies as specified in the `pyproject.toml` file. To verify that DocETL has been installed correctly, run the following command in your terminal:

```bash
docetl version
```

## 🔧 Installation from Source

To install DocETL from source, follow these steps:

1. Clone the repository:

```bash
git clone https://github.com/ucbepic/docetl.git
cd docetl
```

2. Install Poetry (if not already installed):

```bash
pip install poetry
```

3. Install the project dependencies and DocETL:

```bash
poetry install
```

If you want to use the parsing tools, you need to install the `parsing` extra:

```bash
poetry install --extras "parsing"
```

This will create a virtual environment and install all the required dependencies.

4. Set up your OpenAI API key:

Create a `.env` file in the project root and add your OpenAI API key:

```bash
OPENAI_API_KEY=your_api_key_here
```

Alternatively, you can set the `OPENAI_API_KEY` environment variable in your shell.

5. Run the basic test suite to ensure everything is working (this costs less than $0.01 with OpenAI):

```bash
make tests-basic
```

## 🚨 Troubleshooting

If you encounter any issues during installation, please ensure that:

- Your Python version is 3.10 or later
- You have the latest version of pip installed
- Your system meets all the requirements specified in the `pyproject.toml` file

For further assistance, please refer to the project's GitHub repository or reach out on the [Discord server](https://discord.gg/fHp7B2X3xx).
-------------------------------------------------------------------------------- /docs/optimization/configuration.md: --------------------------------------------------------------------------------
# Advanced: Customizing Optimization

You can customize the optimization process for specific operations using the `optimizer_config` key in your pipeline.

## Global Configuration

The following options can be applied globally to all operations in your pipeline during optimization:

- `num_retries`: The number of times to retry optimizing if the LLM agent fails. Default is 1.

- `sample_sizes`: Override the default sample sizes for each operator type. Specify as a dictionary with operator types as keys and integer sample sizes as values.

  Default sample sizes:

```python
SAMPLE_SIZE_MAP = {
    "reduce": 40,
    "map": 5,
    "resolve": 100,
    "equijoin": 100,
    "filter": 5,
}
```

- `judge_agent_model`: Specify the model to use for the judge agent. Default is `gpt-4o-mini`.

- `rewrite_agent_model`: Specify the model to use for the rewrite agent. Default is `gpt-4o`.

- `litellm_kwargs`: Specify the `litellm` kwargs to use for the optimization. Default is `{}`.

## Equijoin Configuration

- `target_recall`: Change the default target recall (default is 0.95).
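To make the recall target concrete, here is an illustrative sketch (not DocETL's internal code) of how a similarity cutoff can be chosen from a labeled sample of candidate pairs so that estimated recall stays at or above the target. Higher cutoffs prune more LLM comparisons but risk missing true matches:

```python
def pick_threshold(scored_pairs, target_recall=0.95):
    """Pick the highest similarity cutoff whose estimated recall on a
    labeled sample stays at or above target_recall.

    scored_pairs: list of (similarity, is_true_match) tuples.
    """
    total_matches = sum(1 for _, is_match in scored_pairs if is_match)
    if total_matches == 0:
        return 0.0  # no known matches to protect; keep every pair
    for cutoff in sorted({score for score, _ in scored_pairs}, reverse=True):
        found = sum(
            1 for score, is_match in scored_pairs if is_match and score >= cutoff
        )
        if found / total_matches >= target_recall:
            return cutoff
    return 0.0


# With these labeled pairs, a 0.95 target forces the cutoff down to 0.6
# so that both true matches survive blocking.
pairs = [(0.9, True), (0.7, False), (0.6, True), (0.3, False)]
print(pick_threshold(pairs))  # 0.6
```

Pushing `target_recall` toward 1.0 drives the chosen cutoff down, keeping more candidate pairs at higher cost.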
## Resolve Configuration

- `target_recall`: Specify the target recall for the resolve operation.

## Reduce Configuration

- `synthesize_resolve`: Set to `False` if you definitely don't want a resolve operation synthesized, or if you want to turn off this rewrite rule.

## Map Configuration

- `force_chunking_plan`: Set to `True` if you want the optimizer to force a plan that breaks up the input documents into chunks.
- `plan_types`: Specify the plan types to consider for the map operation. The available plan types are:
  - `chunk`: Breaks up the input documents into chunks (i.e., data decomposition).
  - `proj_synthesis`: Synthesizes 1+ projections (i.e., task decomposition).
  - `glean`: Synthesizes a glean plan (i.e., uses an LLM as a judge to refine the output; a sketch of this loop follows the example below).

## Example Configuration

Here's an example of how to use the `optimizer_config` in your pipeline:

```yaml
optimizer_config:
  rewrite_agent_model: gpt-4o-mini
  judge_agent_model: gpt-4o-mini
  litellm_kwargs:
    temperature: 0.5
  num_retries: 2
  sample_sizes:
    map: 10
    reduce: 50
  reduce:
    synthesize_resolve: false
  map:
    plan_types: # Considers all these plan types
      - chunk
      - proj_synthesis
      - glean

operations:
  - name: extract_medications
    type: map
    optimize: true
    recursively_optimize: true # Recursively optimize the map operation (i.e., optimize any new operations that are synthesized)
    # ... other configuration ...

  - name: summarize_prescriptions
    type: reduce
    optimize: true
    # ... other configuration ...
# ... rest of the pipeline configuration ...
```

This configuration will:

1. Retry optimization up to 2 times for each operation if the LLM agent fails.
2. Use custom sample sizes for map (10) and reduce (50) operations.
3. Prevent the synthesis of resolve operations for reduce operations.
4. Consider all plan types for map operations.
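For intuition, a glean plan wraps the base map call in a judge-and-refine loop. Here is a minimal illustrative sketch (not DocETL's internal implementation; `call_llm` is a hypothetical helper standing in for any LLM call):

```python
def glean(document, task_prompt, call_llm, max_rounds=2):
    """Draft an answer, ask an LLM judge for feedback, and revise until
    the judge approves or the round budget runs out."""
    answer = call_llm(f"{task_prompt}\n\nDocument:\n{document}")
    for _ in range(max_rounds):
        verdict = call_llm(
            "You are a strict judge. Reply PASS if the answer fully "
            "satisfies the task; otherwise list the problems.\n\n"
            f"Task: {task_prompt}\nAnswer: {answer}"
        )
        if verdict.strip().startswith("PASS"):
            break
        answer = call_llm(
            f"Revise the answer to fix these problems:\n{verdict}\n\n"
            f"Task: {task_prompt}\nDocument:\n{document}\n"
            f"Previous answer: {answer}"
        )
    return answer
```

Each extra round spends additional LLM calls in exchange for output quality, which is why gleaning is one candidate plan for the optimizer to weigh rather than a default.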
-------------------------------------------------------------------------------- /docs/playground/features.md: --------------------------------------------------------------------------------
# Features

The DocETL playground provides an interactive environment for building and testing document processing pipelines. Here are the key features:

## Current Features

### Hybrid Interface
The playground offers a unique hybrid between a notebook and spreadsheet interface, allowing you to:
- Iteratively develop and test pipeline operations
- Inspect operation outputs in a tabular format
- Seamlessly switch between code and data views

### Performance Optimizations
To ensure responsive interaction:
- Smart sampling of large datasets for quick iteration
- Automatic caching of operation results
- Efficient handling of LLM API calls

### Output Management
- Add notes and highlights to important outputs
- Save and organize findings during pipeline development
- Track key insights and results

### Export Capabilities
- Export results from any operation to CSV
- Preserve intermediate results for further analysis
- Share outputs with team members

## Upcoming Features

We're actively working on several exciting ideas:

### Natural Language Pipeline Assistant
- Generate and indirectly modify pipelines using natural language
- Interactive help for pipeline development

### Enhanced Validation UI
- Per-document retry capabilities for failed operations
- UI support for gleaning validation outside of extra kwargs
- Visual feedback for validation results

### Pipeline Optimization Interface
- Interactive tools for optimizing operation performance
- Visual pipeline analysis and bottleneck identification
- Suggestions for pipeline efficiency improvements

!!! tip "Join the Development"

    Interested in these upcoming features? Join our [Discord community](https://discord.gg/fHp7B2X3xx) to provide feedback and help shape their development!
-------------------------------------------------------------------------------- /docs/python/index.md: --------------------------------------------------------------------------------
# Python API

The DocETL Python API provides a programmatic way to define, optimize, and run document processing pipelines. This approach offers an alternative to the YAML configuration method, allowing for more dynamic and flexible pipeline construction.

## Overview

The Python API consists of several classes:

- `Dataset`: Represents a dataset with a type and path.
- Various operation classes (e.g., `MapOp`, `ReduceOp`, `FilterOp`) for different types of data processing steps.
- `PipelineStep`: Represents a step in the pipeline with input and operations.
- `Pipeline`: The main class for defining and running a complete document processing pipeline.
- `PipelineOutput`: Defines the output configuration for the pipeline.

## Example Usage

Here's an example of how to use the Python API to create and run a simple document processing pipeline:

```python
from docetl.api import Pipeline, Dataset, MapOp, ReduceOp, PipelineStep, PipelineOutput

# Define datasets
datasets = {
    "my_dataset": Dataset(type="file", path="input.json", parsing=[{"input_key": "file_path", "function": "txt_to_string", "output_key": "content"}]),
}

# Note that the parsing is applied to the `file_path` key in each item of the dataset,
# and the result is stored in the `content` key.

# Define operations
operations = [
    MapOp(
        name="process",
        type="map",
        prompt="Determine what type of document this is: {{ input.content }}",
        output={"schema": {"document_type": "string"}}
    ),
    ReduceOp(
        name="summarize",
        type="reduce",
        reduce_key="document_type",
        prompt="Summarize the processed contents: {% for item in inputs %}{{ item.content }} {% endfor %}",
        output={"schema": {"summary": "string"}}
    )
]

# Define pipeline steps
steps = [
    PipelineStep(name="process_step", input="my_dataset", operations=["process"]),
    PipelineStep(name="summarize_step", input="process_step", operations=["summarize"])
]

# Define pipeline output
output = PipelineOutput(type="file", path="output.json")

# Create the pipeline
pipeline = Pipeline(
    name="example_pipeline",
    datasets=datasets,
    operations=operations,
    steps=steps,
    output=output,
    default_model="gpt-4o-mini"
)

# Optimize the pipeline
optimized_pipeline = pipeline.optimize()

# Run the optimized pipeline
result = optimized_pipeline.run()  # Saves the result to the output path

print(f"Pipeline execution completed. Total cost: ${result:.2f}")
```

This example demonstrates how to create a simple pipeline that processes input documents and then summarizes the processed content. The pipeline is optimized before execution to improve performance.

## API Reference

For a complete reference of all available classes and their methods, please refer to the [Python API Reference](../api-reference/python.md).

The API Reference provides detailed information about each class, including:

- Available parameters
- Method signatures
- Return types
- Usage examples
-------------------------------------------------------------------------------- /docs/stylesheets/extra.css: --------------------------------------------------------------------------------
/* Base Layer */
:root {
  --md-primary-fg-color: hsl(211, 100%, 50%);
  --md-primary-fg-color--light: hsl(211, 100%, 70%);
  --md-primary-fg-color--dark: hsl(211, 100%, 30%);
  --background: hsl(211, 100%, 95%);
  --foreground: hsl(211, 5%, 0%);
  --card: hsl(211, 50%, 90%);
  --card-foreground: hsl(211, 5%, 10%);
  --popover: hsl(211, 100%, 95%);
  --popover-foreground: hsl(211, 100%, 0%);
  --primary-foreground: hsl(0, 0%, 100%);
  --secondary: hsl(211, 30%, 70%);
  --secondary-foreground: hsl(0, 0%, 0%);
  --muted: hsl(173, 30%, 85%);
  --muted-foreground: hsl(211, 5%, 35%);
  --accent: hsl(173, 30%, 80%);
  --accent-foreground: hsl(211, 5%, 10%);
  --destructive: hsl(0, 100%, 30%);
  --destructive-foreground: hsl(211, 5%, 90%);
  --border: hsl(211, 30%, 50%);
  --input: hsl(211, 30%, 18%);
  --ring: var(--md-primary-fg-color);
  --radius: 0.5rem;

  /* Custom chart colors */
  --chart-1: hsl(12, 76%, 61%);
  --chart-2: hsl(173, 58%, 39%);
  --chart-3: hsl(197, 37%, 24%);
  --chart-4: hsl(43, 74%, 66%);
  --chart-5: hsl(27, 87%, 67%);
}

/* Dark mode */
.dark {
  --md-primary-fg-color: hsl(211, 100%, 50%);
  --md-primary-fg-color--light: hsl(211, 100%, 70%);
  --md-primary-fg-color--dark: hsl(211, 100%, 30%);
  --background: hsl(211, 50%, 5%);
  --foreground: hsl(211, 5%, 90%);
  --card: hsl(211, 50%, 0%);
--card-foreground: hsl(211, 5%, 90%); 43 | --popover: hsl(211, 50%, 5%); 44 | --popover-foreground: hsl(211, 5%, 90%); 45 | /* --primary: var(--md-primary-fg-color--dark); */ 46 | --primary-foreground: hsl(0, 0%, 100%); 47 | --secondary: hsl(211, 30%, 10%); 48 | --secondary-foreground: hsl(0, 0%, 100%); 49 | --muted: hsl(173, 30%, 15%); 50 | --muted-foreground: hsl(211, 5%, 60%); 51 | --accent: hsl(173, 30%, 15%); 52 | --accent-foreground: hsl(211, 5%, 90%); 53 | --destructive: hsl(0, 100%, 30%); 54 | --destructive-foreground: hsl(211, 5%, 90%); 55 | --border: hsl(211, 30%, 18%); 56 | --input: hsl(211, 30%, 18%); 57 | --ring: var(--md-primary-fg-color--dark); 58 | --radius: 0.5rem; 59 | 60 | /* Custom chart colors for dark mode */ 61 | --chart-1: hsl(220, 70%, 50%); 62 | --chart-2: hsl(160, 60%, 45%); 63 | --chart-3: hsl(30, 80%, 55%); 64 | --chart-4: hsl(280, 65%, 60%); 65 | --chart-5: hsl(340, 75%, 55%); 66 | } 67 | 68 | /* Header styling */ 69 | h1, h2, h3, h4, h5, h6 { 70 | color: var(--primary); 71 | } 72 | 73 | .dark h1, .dark h2, .dark h3, .dark h4, .dark h5, .dark h6 { 74 | color: var(--primary); 75 | } 76 | 77 | /* Link styling */ 78 | a { 79 | color: var(--primary); 80 | } 81 | 82 | .dark a { 83 | color: var(--primary); 84 | } 85 | 86 | /* Card styling */ 87 | .card { 88 | background-color: var(--card); 89 | color: var(--card-foreground); 90 | border-radius: var(--radius); 91 | } 92 | 93 | .dark .card { 94 | background-color: var(--card); 95 | color: var(--card-foreground); 96 | } 97 | -------------------------------------------------------------------------------- /experiments/extraction_outputs.txt: -------------------------------------------------------------------------------- 1 | Results Table: 2 | Logical Fallacy Extraction Experiment Results 3 | ╭──────────────┬─────────────┬─────────────────┬─────────────┬────────────┬────────────┬─────────────┬────────────────╮ 4 | │ Model │ Method │ Total Fallacies │ Avg per Doc │ Max in Doc │ Avg Length │ Runtime (s) │ Total Cost ($) │ 5 | ├──────────────┼─────────────┼─────────────────┼─────────────┼────────────┼────────────┼─────────────┼────────────────┤ 6 | │ gpt-4.1-mini │ line_number │ 192 │ 3.92 │ 11 │ 895.6 │ 10.39 │ $0.468208 │ 7 | │ │ regex │ 125 │ 2.55 │ 43 │ 3759.6 │ 30.77 │ $0.413990 │ 8 | ├──────────────┼─────────────┼─────────────────┼─────────────┼────────────┼────────────┼─────────────┼────────────────┤ 9 | │ gpt-4.1-nano │ line_number │ 279 │ 5.69 │ 19 │ 719.3 │ 3.51 │ $0.117790 │ 10 | │ │ regex │ 8 │ 0.16 │ 4 │ 1.5 │ 9.95 │ $0.102572 │ 11 | ├──────────────┼─────────────┼─────────────────┼─────────────┼────────────┼────────────┼─────────────┼────────────────┤ 12 | │ gpt-4o-mini │ line_number │ 570 │ 11.63 │ 77 │ 529.6 │ 101.38 │ $0.177959 │ 13 | │ │ regex │ 415 │ 8.47 │ 167 │ 226.3 │ 192.85 │ $0.120100 │ 14 | ╰──────────────┴─────────────┴─────────────────┴─────────────┴────────────┴────────────┴─────────────┴────────────────╯ -------------------------------------------------------------------------------- /experiments/structured_outputs.txt: -------------------------------------------------------------------------------- 1 | Results Table: 2 | Experiment Results 3 | ╭────────────────────────────────────────────────┬───────┬────────────┬───────────┬────────┬───────┬─────────────┬──────────────╮ 4 | │ Model │ Doc % │ Approach │ Precision │ Recall │ F1 │ Avg Runtime │ Avg Cost ($) │ 5 | ├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤ 6 | │ 
azure/gpt-4o-mini │ 10% │ structured │ 0.869 │ 0.872 │ 0.853 │ 1.100s │ $0.0004 │ 7 | │ azure/gpt-4o-mini │ 10% │ tool │ 0.914 │ 0.906 │ 0.891 │ 0.722s │ $0.0004 │ 8 | ├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤ 9 | │ deepseek/deepseek-chat │ 10% │ structured │ 0.878 │ 0.889 │ 0.877 │ 2.094s │ $0.0003 │ 10 | │ deepseek/deepseek-chat │ 10% │ tool │ 0.867 │ 0.856 │ 0.860 │ 2.212s │ $0.0003 │ 11 | ├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤ 12 | │ lm_studio/hugging-quants/llama-3.2-3b-instruct │ 10% │ structured │ 0.033 │ 0.022 │ 0.027 │ 33.635s │ $0.0000 │ 13 | │ lm_studio/hugging-quants/llama-3.2-3b-instruct │ 10% │ tool │ 0.000 │ 0.000 │ 0.000 │ 70.858s │ $0.0000 │ 14 | ╰────────────────────────────────────────────────┴───────┴────────────┴───────────┴────────┴───────┴─────────────┴──────────────╯ -------------------------------------------------------------------------------- /server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/server/__init__.py -------------------------------------------------------------------------------- /server/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/server/app/__init__.py -------------------------------------------------------------------------------- /server/app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from fastapi import FastAPI 3 | from fastapi.middleware.cors import CORSMiddleware 4 | from server.app.routes import pipeline, convert, filesystem 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | # Read backend configuration from .env 10 | host = os.getenv("BACKEND_HOST", "127.0.0.1") 11 | port = int(os.getenv("BACKEND_PORT", 8000)) 12 | reload = os.getenv("BACKEND_RELOAD", "False").lower() == "true" 13 | 14 | # Set default allow_origins if BACKEND_ALLOW_ORIGINS is not provided 15 | allow_origins = os.getenv("BACKEND_ALLOW_ORIGINS", "http://localhost:3000").split(",") 16 | 17 | app = FastAPI() 18 | os.environ["USE_FRONTEND"] = "true" 19 | 20 | # Add CORS middleware 21 | app.add_middleware( 22 | CORSMiddleware, 23 | allow_origins=allow_origins, 24 | allow_credentials=True, 25 | allow_methods=["*"], 26 | allow_headers=["*"], 27 | ) 28 | 29 | # Include all routers, 30 | app.include_router(pipeline.router) 31 | app.include_router(convert.router) 32 | app.include_router(filesystem.router, prefix="/fs") 33 | 34 | @app.get("/") 35 | async def root(): 36 | return {"message": "DocETL API is running"} 37 | 38 | @app.get("/health") 39 | async def health_check(): 40 | return {"status": "healthy"} 41 | 42 | if __name__ == "__main__": 43 | import uvicorn 44 | uvicorn.run("server.app.main:app", host=host, port=port, reload=reload) 45 | -------------------------------------------------------------------------------- /server/app/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Dict, Any, Optional 3 | from datetime import datetime 4 | from enum import Enum 5 | 6 | 7 | class PipelineRequest(BaseModel): 8 | yaml_config: str 9 | 10 | class 
PipelineConfigRequest(BaseModel): 11 | namespace: str 12 | name: str 13 | config: str 14 | input_path: str 15 | output_path: str 16 | 17 | class TaskStatus(str, Enum): 18 | PENDING = "pending" 19 | PROCESSING = "processing" 20 | COMPLETED = "completed" 21 | FAILED = "failed" 22 | CANCELLED = "cancelled" 23 | 24 | class OptimizeResult(BaseModel): 25 | task_id: str 26 | status: TaskStatus 27 | should_optimize: Optional[str] = None 28 | input_data: Optional[List[Dict[str, Any]]] = None 29 | output_data: Optional[List[Dict[str, Any]]] = None 30 | cost: Optional[float] = None 31 | error: Optional[str] = None 32 | created_at: datetime 33 | completed_at: Optional[datetime] = None 34 | 35 | class OptimizeRequest(BaseModel): 36 | yaml_config: str 37 | step_name: str 38 | op_name: str -------------------------------------------------------------------------------- /server/app/routes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/server/app/routes/__init__.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/__init__.py -------------------------------------------------------------------------------- /tests/basic/sample_texts/one.txt: -------------------------------------------------------------------------------- 1 | Once upon a time, in a quaint village nestled among rolling hills, there lived a curious young girl named Lily. She had always dreamed of adventure beyond the confines of her small town. One day, while exploring the attic of her grandmother's old house, Lily discovered a dusty, leather-bound book with strange symbols etched on its cover. 2 | 3 | As she opened the book, a swirl of glittering mist escaped from its pages, enveloping her in a magical aura. Suddenly, Lily found herself transported to a fantastical world filled with talking animals, floating islands, and shimmering forests. 4 | 5 | Guided by a wise old owl named Hoot, Lily embarked on a quest to find the lost key of harmony, which would restore balance to this enchanted realm. Along her journey, she befriended a mischievous fox, outsmarted a grumpy troll, and solved riddles posed by ancient tree spirits. 6 | 7 | With each challenge she overcame, Lily grew braver and more confident. She learned that true magic lies not in spells or potions, but in the power of kindness, perseverance, and friendship. 8 | 9 | As Lily finally reached the crystal cave where the key of harmony was hidden, she realized that the real treasure was the incredible adventure she had experienced and the lifelong friends she had made along the way. 10 | 11 | With a bittersweet heart, Lily used the key to return home, knowing that her ordinary life would never be the same again. From that day forward, she approached each day with the wonder and courage of a true adventurer, always ready for the next exciting chapter in her story. 12 | -------------------------------------------------------------------------------- /tests/basic/sample_texts/two.md: -------------------------------------------------------------------------------- 1 | # The Enchanted Forest 2 | 3 | Once upon a time, in a land far beyond the reaches of our modern world, there lay a mysterious and enchanted forest. 
This forest, known as the Whispering Woods, was said to be alive with magic and wonder. 4 | 5 | ## The Guardian of the Woods 6 | 7 | At the heart of the Whispering Woods lived an ancient tree spirit named Eldora. With bark as silver as moonlight and leaves that shimmered like emeralds, Eldora had watched over the forest for countless centuries. 8 | 9 | ## The Lost Traveler 10 | 11 | One misty morning, a young traveler named Finn stumbled into the Whispering Woods. Lost and weary, he marveled at the ethereal beauty of the forest. 12 | 13 | ### A Magical Encounter 14 | 15 | As Finn wandered deeper into the woods, he heard a soft, melodious voice carried on the breeze. It was Eldora, calling out to him: 16 | 17 | > "Welcome, young one. What brings you to our magical realm?" 18 | 19 | Finn, awestruck, replied, "I've lost my way, kind spirit. Can you help me find my path?" 20 | 21 | ### The Quest Begins 22 | 23 | Eldora smiled, her leaves rustling gently. "To find your true path, you must first complete three tasks: 24 | 25 | 1. Befriend the Moonlight Rabbits 26 | 2. Solve the Riddle of the Babbling Brook 27 | 3. Plant a seed of hope in the Glade of Dreams" 28 | 29 | And so, Finn's adventure in the Whispering Woods began, filled with magical creatures, enigmatic puzzles, and the promise of self-discovery. 30 | 31 | --- 32 | 33 | _To be continued..._ 34 | -------------------------------------------------------------------------------- /tests/basic/test_basic_parallel_map.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F811 2 | 3 | import pytest 4 | from docetl.operations.map import ParallelMapOperation 5 | from dotenv import load_dotenv 6 | from typing import Dict, Any, List, Tuple 7 | from tests.conftest import ( 8 | parallel_map_config as parallel_map_config, 9 | parallel_map_sample_data as parallel_map_sample_data, 10 | default_model as default_model, 11 | max_threads as max_threads, 12 | api_wrapper as api_wrapper, 13 | ) 14 | 15 | load_dotenv() 16 | 17 | 18 | def test_parallel_map_operation( 19 | parallel_map_config, 20 | default_model, 21 | max_threads, 22 | parallel_map_sample_data, 23 | api_wrapper, 24 | ): 25 | parallel_map_config["bypass_cache"] = True 26 | operation = ParallelMapOperation( 27 | api_wrapper, parallel_map_config, default_model, max_threads 28 | ) 29 | results, cost = operation.execute(parallel_map_sample_data) 30 | 31 | assert len(results) == len(parallel_map_sample_data) 32 | assert all("sentiment" in result for result in results) 33 | assert all("word_count" in result for result in results) 34 | assert all( 35 | result["sentiment"] in ["positive", "negative", "neutral"] for result in results 36 | ) 37 | assert all(isinstance(result["word_count"], int) for result in results) 38 | assert cost > 0 39 | 40 | 41 | def test_parallel_map_operation_empty_input( 42 | parallel_map_config, default_model, max_threads, api_wrapper 43 | ): 44 | operation = ParallelMapOperation( 45 | api_wrapper, parallel_map_config, default_model, max_threads 46 | ) 47 | results, cost = operation.execute([]) 48 | 49 | assert len(results) == 0 50 | assert cost == 0 51 | 52 | 53 | def test_parallel_map_operation_with_empty_input( 54 | parallel_map_config, default_model, max_threads, api_wrapper 55 | ): 56 | operation = ParallelMapOperation( 57 | api_wrapper, parallel_map_config, default_model, max_threads 58 | ) 59 | results, cost = operation.execute([]) 60 | 61 | assert len(results) == 0 62 | assert cost == 0 63 | 
-------------------------------------------------------------------------------- /tests/basic/test_optimizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import json 4 | import shutil 5 | from docetl.runner import DSLRunner 6 | 7 | @pytest.fixture 8 | def test_dir(tmp_path): 9 | # Create test directories 10 | data_dir = tmp_path / "tests" / "data" 11 | data_dir.mkdir(parents=True) 12 | 13 | # Create test data file 14 | data_file = data_dir / "test_data.json" 15 | test_data = [ 16 | {"text": "My name is John Smith"}, 17 | {"text": "Hello, I'm Alice Johnson"}, 18 | {"text": "Bob Wilson here"} 19 | ] 20 | 21 | with open(data_file, "w") as f: 22 | json.dump(test_data, f) 23 | 24 | yield tmp_path 25 | 26 | # Cleanup 27 | if tmp_path.exists(): 28 | shutil.rmtree(tmp_path) 29 | 30 | @pytest.fixture 31 | def test_config(test_dir): 32 | return { 33 | "default_model": "gpt-4o-mini", 34 | "datasets": { 35 | "test_data": { 36 | "type": "file", 37 | "path": str(test_dir / "tests" / "data" / "test_data.json"), 38 | } 39 | }, 40 | "operations": [ 41 | { 42 | "name": "extract_name", 43 | "type": "map", 44 | "prompt": "Extract the person's name from the text.", 45 | "output": { 46 | "schema": { 47 | "name": "string" 48 | } 49 | }, 50 | "optimize": True 51 | } 52 | ], 53 | "pipeline": { 54 | "steps": [ 55 | { 56 | "name": "name_extraction", 57 | "input": "test_data", 58 | "operations": ["extract_name"] 59 | } 60 | ] 61 | } 62 | } 63 | 64 | @pytest.fixture 65 | def runner(test_config): 66 | return DSLRunner( 67 | config=test_config 68 | ) 69 | 70 | def test_optimize_map_operation(runner, test_dir): 71 | """Test that the optimizer can optimize a simple map operation""" 72 | 73 | 74 | # Run optimization 75 | optimized_config, total_cost = runner.optimize(return_pipeline=False) 76 | 77 | # Check that optimization completed successfully 78 | assert total_cost >= 0 # Cost should be non-negative 79 | 80 | # Check that the optimized config contains operations 81 | assert "operations" in optimized_config 82 | assert len(optimized_config["operations"]) > 0 83 | 84 | # Check that the pipeline steps are preserved 85 | assert "pipeline" in optimized_config 86 | assert "steps" in optimized_config["pipeline"] 87 | assert len(optimized_config["pipeline"]["steps"]) > 0 88 | 89 | # Check that the first step is preserved 90 | first_step = optimized_config["pipeline"]["steps"][0] 91 | assert first_step["name"] == "name_extraction" 92 | assert "operations" in first_step 93 | assert len(first_step["operations"]) > 0 94 | 95 | -------------------------------------------------------------------------------- /tests/data/PublicWaterMassMailing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/data/PublicWaterMassMailing.pdf -------------------------------------------------------------------------------- /tests/ranking/plots/harmfulness_budget_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/ranking/plots/harmfulness_budget_performance.png -------------------------------------------------------------------------------- /tests/ranking/plots/medical_pain_budget_performance.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/ranking/plots/medical_pain_budget_performance.png -------------------------------------------------------------------------------- /tests/ranking/plots/synthetic_abstracts_budget_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/ranking/plots/synthetic_abstracts_budget_performance.png -------------------------------------------------------------------------------- /tests/test_azure_rl.py: -------------------------------------------------------------------------------- 1 | from docetl.runner import DSLRunner 2 | import pytest 3 | from docetl.operations.map import MapOperation 4 | import random 5 | import os 6 | from dotenv import load_dotenv 7 | from tests.conftest import api_wrapper 8 | 9 | load_dotenv() 10 | 11 | 12 | @pytest.fixture 13 | def simple_map_config(): 14 | return { 15 | "name": "simple_sentiment_analysis", 16 | "type": "map", 17 | "prompt": "Analyze the sentiment of the following text: '{{ input.text }}'. Classify it as either positive, negative, or neutral.", 18 | "output": {"schema": {"sentiment": "string"}}, 19 | "model": "azure/gpt-4o", 20 | } 21 | 22 | 23 | @pytest.fixture 24 | def sample_documents(): 25 | sentiments = ["positive", "negative", "neutral"] 26 | documents = [] 27 | for _ in range(8): 28 | sentiment = random.choice(sentiments) 29 | if sentiment == "positive": 30 | text = f"I absolutely love this product! It's amazing and works perfectly." 31 | elif sentiment == "negative": 32 | text = f"This is the worst experience I've ever had. Terrible service." 33 | else: 34 | text = f"The product works as expected. Nothing special to report." 35 | documents.append({"text": text}) 36 | return documents 37 | 38 | 39 | def test_map_operation_over_15_documents(simple_map_config, sample_documents): 40 | # Set environment variables specific to this test 41 | os.environ["AZURE_API_BASE"] = os.getenv("LOW_RES_AZURE_API_BASE") 42 | os.environ["AZURE_API_VERSION"] = os.getenv("LOW_RES_AZURE_API_VERSION") 43 | os.environ["AZURE_API_KEY"] = os.getenv("LOW_RES_AZURE_API_KEY") 44 | 45 | runner = DSLRunner( 46 | { 47 | "default_model": "gpt-4o-mini", 48 | "operations": [], 49 | "pipeline": {"steps": [], "output": {"path": "/tmp/testingdocetl.json"}}, 50 | }, 51 | max_threads=64, 52 | ) 53 | 54 | operation = MapOperation(runner, simple_map_config, "azure/gpt-4o", 4) 55 | results, cost = operation.execute(sample_documents + sample_documents) 56 | 57 | assert len(results) == 16 58 | assert all("sentiment" in result for result in results) 59 | assert all( 60 | result["sentiment"] in ["positive", "negative", "neutral"] for result in results 61 | ) 62 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import os 4 | from docetl.runner import DSLRunner 5 | from docetl.utils import load_config 6 | import yaml 7 | 8 | # Sample configuration for the test 9 | SAMPLE_CONFIG = """ 10 | default_model: "gpt-4o-mini" 11 | 12 | operations: 13 | - name: map_operation 14 | type: map 15 | prompt: | 16 | Analyze the sentiment of the following text: "{{ input.text }}" 17 | Classify it as either positive, negative, or neutral. 
    output:
      schema:
        sentiment: string
    model: "gpt-4o-mini"

  - name: filter_operation
    type: filter
    prompt: |
      Determine if the following text is longer than 5 words:
      "{{ input.text }}"
    output:
      schema:
        keep: boolean
    model: "gpt-4o-mini"

datasets:
  sample_dataset:
    type: file
    path: "tests/sample_data.json"

pipeline:
  steps:
    - name: sentiment_analysis
      input: sample_dataset
      operations:
        - map_operation
    - name: filter_long_texts
      input: sentiment_analysis
      operations:
        - filter_operation

  output:
    type: file
    path: "tests/output.json"
"""

SAMPLE_DATA = [
    {"text": "This is a very positive sentence.", "id": 1},
    {"text": "A short negative phrase.", "id": 2},
    {"text": "Neutral statement without much emotion.", "id": 3},
    {"text": "Brief.", "id": 4},
]


@pytest.fixture
def config_file(tmp_path):
    config_path = tmp_path / "test_config.yaml"
    with open(config_path, "w") as f:
        f.write(SAMPLE_CONFIG)
    return config_path


@pytest.fixture
def sample_data_file(tmp_path):
    data_path = tmp_path / "sample_data.json"
    with open(data_path, "w") as f:
        json.dump(SAMPLE_DATA, f)
    return data_path


def test_end_to_end_pipeline(config_file, sample_data_file, tmp_path):
    # Update the config with the correct sample data path
    config = load_config(config_file)
    config["datasets"]["sample_dataset"]["path"] = str(sample_data_file)
    config["pipeline"]["output"]["path"] = str(tmp_path / "output.json")

    # Write the updated config back to the file
    with open(config_file, "w") as f:
        yaml.dump(config, f)

    # Create and run the DSLRunner
    runner = DSLRunner.from_yaml(str(config_file))
    total_cost = runner.load_run_save()

    # Check if the output file was created
    output_path = tmp_path / "output.json"
    assert output_path.exists(), "Output file was not created"

    # Load and check the output
    with open(output_path, "r") as f:
        output_data = json.load(f)

    # Verify the output
    assert len(output_data) > 0, "Output data is empty"
    assert all(
        "sentiment" in item for item in output_data
    ), "Sentiment analysis was not applied to all items"
    assert all(
        len(item["text"].split()) >= 5 for item in output_data
    ), "Filter operation did not remove short texts"

    # Check if the cost was calculated and is greater than 0
    assert total_cost > 0, "Total cost was not calculated or is 0"

    print(f"Pipeline executed successfully.
Total cost: ${total_cost:.2f}") 113 | print(f"Output: {output_data}") 114 | -------------------------------------------------------------------------------- /tests/test_ollama.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import pytest 3 | import json 4 | import tempfile 5 | import os 6 | from docetl.api import ( 7 | Pipeline, 8 | Dataset, 9 | MapOp, 10 | ReduceOp, 11 | PipelineStep, 12 | PipelineOutput, 13 | ) 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | # Set the OLLAMA_API_BASE environment variable 19 | os.environ["OLLAMA_API_BASE"] = "http://localhost:11434/" 20 | 21 | 22 | @pytest.fixture 23 | def temp_input_file(): 24 | with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as tmp: 25 | json.dump( 26 | [ 27 | {"text": "This is a test", "group": "A"}, 28 | {"text": "Another test", "group": "B"}, 29 | ], 30 | tmp, 31 | ) 32 | yield tmp.name 33 | os.unlink(tmp.name) 34 | 35 | 36 | @pytest.fixture 37 | def temp_output_file(): 38 | with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as tmp: 39 | pass 40 | yield tmp.name 41 | os.unlink(tmp.name) 42 | 43 | 44 | @pytest.fixture 45 | def temp_intermediate_dir(): 46 | with tempfile.TemporaryDirectory() as tmpdirname: 47 | yield tmpdirname 48 | 49 | 50 | @pytest.fixture 51 | def map_config(): 52 | return MapOp( 53 | name="sentiment_analysis", 54 | type="map", 55 | prompt="Analyze the sentiment of the following text: '{{ input.text }}'. Classify it as either positive, negative, or neutral.", 56 | output={"schema": {"sentiment": "string"}}, 57 | model="ollama/llama3.1", 58 | ) 59 | 60 | 61 | @pytest.fixture 62 | def reduce_config(): 63 | return ReduceOp( 64 | name="group_summary", 65 | type="reduce", 66 | reduce_key="group", 67 | prompt="Summarize the following group of values: {{ inputs }} Provide a total and any other relevant statistics.", 68 | output={"schema": {"total": "number", "avg": "number"}}, 69 | model="ollama/llama3.1", 70 | ) 71 | 72 | 73 | @pytest.fixture(autouse=True) 74 | def remove_openai_api_key(): 75 | openai_api_key = os.environ.pop("OPENAI_API_KEY", None) 76 | yield 77 | if openai_api_key: 78 | os.environ["OPENAI_API_KEY"] = openai_api_key 79 | 80 | 81 | def test_ollama_map_reduce_pipeline( 82 | map_config, reduce_config, temp_input_file, temp_output_file, temp_intermediate_dir 83 | ): 84 | pipeline = Pipeline( 85 | name="test_ollama_pipeline", 86 | datasets={"test_input": Dataset(type="file", path=temp_input_file)}, 87 | operations=[map_config, reduce_config], 88 | steps=[ 89 | PipelineStep( 90 | name="pipeline", 91 | input="test_input", 92 | operations=["sentiment_analysis", "group_summary"], 93 | ), 94 | ], 95 | output=PipelineOutput( 96 | type="file", path=temp_output_file, intermediate_dir=temp_intermediate_dir 97 | ), 98 | default_model="ollama/llama3.1", 99 | ) 100 | 101 | cost = pipeline.run() 102 | 103 | assert isinstance(cost, float) 104 | assert cost == 0 105 | 106 | # Verify output file exists and contains data 107 | assert os.path.exists(temp_output_file) 108 | with open(temp_output_file, "r") as f: 109 | output_data = json.load(f) 110 | assert len(output_data) > 0 111 | 112 | # Clean up 113 | shutil.rmtree(temp_intermediate_dir) 114 | -------------------------------------------------------------------------------- /tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from docetl.operations.map import MapOperation 3 | from 
tests.conftest import api_wrapper, default_model, max_threads 4 | 5 | 6 | @pytest.fixture 7 | def map_config_with_validation(): 8 | return { 9 | "name": "sentiment_analysis_with_validation", 10 | "type": "map", 11 | "prompt": "Analyze the sentiment of the following text: '{{ input.text }}'. Classify it as either positive, negative, or neutral.", 12 | "output": {"schema": {"sentiment": "string", "confidence": "float"}}, 13 | "model": "gpt-4o-mini", 14 | "validate": [ 15 | "output['sentiment'] in ['positive', 'negative', 'neutral']", 16 | "0 <= output['confidence'] <= 1", 17 | ], 18 | "num_retries_on_validate_failure": 2, 19 | } 20 | 21 | 22 | @pytest.fixture 23 | def sample_data(): 24 | return [ 25 | {"text": "I love this product! It's amazing."}, 26 | {"text": "This is the worst experience ever."}, 27 | {"text": "The weather is okay today."}, 28 | ] 29 | 30 | 31 | def test_map_operation_with_validation( 32 | map_config_with_validation, sample_data, api_wrapper, default_model, max_threads 33 | ): 34 | map_config_with_validation["bypass_cache"] = True 35 | operation = MapOperation( 36 | api_wrapper, map_config_with_validation, default_model, max_threads 37 | ) 38 | results, cost = operation.execute(sample_data) 39 | 40 | assert len(results) == len(sample_data) 41 | assert cost > 0 42 | 43 | for result in results: 44 | assert "sentiment" in result 45 | assert "confidence" in result 46 | assert result["sentiment"] in ["positive", "negative", "neutral"] 47 | assert 0 <= result["confidence"] <= 1 48 | -------------------------------------------------------------------------------- /website/.env.local.sample: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=sk-xxx 2 | OPENAI_API_BASE=https://api.openai.com/v1 3 | MODEL_NAME=gpt-4o-mini 4 | 5 | NEXT_PUBLIC_BACKEND_HOST=localhost 6 | NEXT_PUBLIC_BACKEND_PORT=8000 7 | NEXT_PUBLIC_HOSTED_DOCWRANGLER=false 8 | -------------------------------------------------------------------------------- /website/README.md: -------------------------------------------------------------------------------- 1 | This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app). This is DocWrangler, the frontend for DocETL. 2 | 3 | ## Getting Started 4 | 5 | ### Setting up environment variables 6 | 7 | Copy the .env.sample file from the root directory to .env.local and modify the environment variables inside: 8 | 9 | ```bash 10 | OPENAI_API_KEY=sk-xxx 11 | OPENAI_API_BASE=https://api.openai.com/v1 12 | MODEL_NAME=gpt-4o-mini 13 | 14 | NEXT_PUBLIC_BACKEND_HOST=localhost 15 | NEXT_PUBLIC_BACKEND_PORT=8008 16 | 17 | ``` 18 | 19 | First, run the development server: 20 | 21 | ```bash 22 | npm run dev 23 | # or 24 | yarn dev 25 | # or 26 | pnpm dev 27 | # or 28 | bun dev 29 | ``` 30 | 31 | Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. 32 | 33 | You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file. 34 | 35 | This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel. 36 | 37 | ## Learn More 38 | 39 | To learn more about Next.js, take a look at the following resources: 40 | 41 | - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. 
42 | - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. 43 | 44 | You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome! 45 | 46 | ## Deploy on Vercel 47 | 48 | The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js. 49 | 50 | Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details. 51 | -------------------------------------------------------------------------------- /website/components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "new-york", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.ts", 8 | "css": "src/app/globals.css", 9 | "baseColor": "zinc", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/lib/utils", 16 | "ui": "@/components/ui", 17 | "lib": "@/lib", 18 | "hooks": "@/hooks" 19 | } 20 | } -------------------------------------------------------------------------------- /website/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import globals from "globals"; 2 | import pluginJs from "@eslint/js"; 3 | import tseslint from "typescript-eslint"; 4 | import pluginReact from "eslint-plugin-react"; 5 | import pluginUnusedImports from "eslint-plugin-unused-imports"; 6 | 7 | export default [ 8 | {files: ["**/*.{js,mjs,cjs,ts,jsx,tsx}"]}, 9 | {languageOptions: { globals: globals.browser }}, 10 | { 11 | plugins: { 12 | 'unused-imports': pluginUnusedImports 13 | }, 14 | rules: { 15 | "no-unused-vars": "off", 16 | "unused-imports/no-unused-imports": "error", 17 | "unused-imports/no-unused-vars": [ 18 | "warn", 19 | { 20 | "vars": "all", 21 | "varsIgnorePattern": "^_", 22 | "args": "after-used", 23 | "argsIgnorePattern": "^_" 24 | } 25 | ] 26 | } 27 | }, 28 | pluginJs.configs.recommended, 29 | ...tseslint.configs.recommended, 30 | pluginReact.configs.flat.recommended, 31 | ]; -------------------------------------------------------------------------------- /website/next.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = {}; 3 | 4 | export default nextConfig; 5 | -------------------------------------------------------------------------------- /website/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /website/posts/hello-world.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hello, World!" 3 | date: "2024-09-13" 4 | --- 5 | 6 | This is just a test post. 
7 | -------------------------------------------------------------------------------- /website/public/berkeley.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/berkeley.png -------------------------------------------------------------------------------- /website/public/docetl-50m-fall-2024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/docetl-50m-fall-2024.pdf -------------------------------------------------------------------------------- /website/public/docetl-favicon-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/docetl-favicon-color.png -------------------------------------------------------------------------------- /website/public/epiclogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/epiclogo.png -------------------------------------------------------------------------------- /website/src/app/MarkdownRenderer.tsx: -------------------------------------------------------------------------------- 1 | import ReactMarkdown from "react-markdown"; 2 | 3 | const MarkdownRenderer = ({ content }: { content: any }) => { 4 | return ( 5 |
<ReactMarkdown
      components={{
        // Custom code renderer: inline code vs. fenced blocks. The element
        // structure here is reconstructed from surviving fragments; the
        // original markup and class names were lost when this dump was
        // generated.
        code({ inline, className, children, ...props }) {
          return inline ? (
            <code className={className} {...props}>
              {children}
            </code>
          ) : (
            <pre>
              <code className={className} {...props}>
                {children}
              </code>
            </pre>
          );
        },
      }}
    >
      {content}
    </ReactMarkdown>
  );
};

export default MarkdownRenderer;
-------------------------------------------------------------------------------- /website/src/components/ (a pipeline-YAML rendering component; its file path and JSX markup were lost when this dump was generated, and only the splitting logic below survives): --------------------------------------------------------------------------------
const [preOperations, rest] = code.split(/(?=^operations:)/m);
const [operations, postOperations] = rest.split(/(?=pipeline:)/);
const operationsList = operations.split(/(?= {2}- name:)/).slice(1); // Remove the "operations:" line