├── .env.sample ├── .github └── workflows │ ├── ci.yml │ ├── docker-ci.yml │ ├── docs.yml │ ├── release.yml │ └── stage.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── docetl ├── __init__.py ├── api.py ├── apis │ ├── __init__.py │ └── pd_accessors.py ├── base_schemas.py ├── cli.py ├── config_wrapper.py ├── console.py ├── containers.py ├── dataset.py ├── operations │ ├── __init__.py │ ├── add_uuid.py │ ├── base.py │ ├── cluster.py │ ├── clustering_utils.py │ ├── code_operations.py │ ├── equijoin.py │ ├── extract.py │ ├── filter.py │ ├── gather.py │ ├── link_resolve.py │ ├── map.py │ ├── rank.py │ ├── reduce.py │ ├── resolve.py │ ├── sample.py │ ├── scan.py │ ├── split.py │ ├── unnest.py │ └── utils │ │ ├── __init__.py │ │ ├── api.py │ │ ├── cache.py │ │ ├── llm.py │ │ ├── progress.py │ │ └── validation.py ├── optimizer.py ├── optimizers │ ├── __init__.py │ ├── join_optimizer.py │ ├── map_optimizer │ │ ├── __init__.py │ │ ├── config_generators.py │ │ ├── evaluator.py │ │ ├── operation_creators.py │ │ ├── optimizer.py │ │ ├── plan_generators.py │ │ ├── prompt_generators.py │ │ └── utils.py │ ├── reduce_optimizer.py │ └── utils.py ├── parsing_tools.py ├── ratelimiter.py ├── runner.py ├── schemas.py └── utils.py ├── docker-compose.yml ├── docs ├── advanced │ ├── custom-operators.md │ ├── extending-agents.md │ └── performance-tuning.md ├── api-reference │ ├── cli.md │ ├── docetl.md │ ├── operations.md │ ├── optimizers.md │ └── python.md ├── assets │ ├── docetl-favicon-color.png │ ├── fatal.json │ ├── headerdiagram.png │ ├── medical_transcripts.json │ ├── readmefig.png │ └── tutorial │ │ ├── add-notes.png │ │ ├── dataset-view.png │ │ ├── initial-outputs.png │ │ ├── one-operation.png │ │ ├── operation-details.png │ │ ├── prompt-improvement.png │ │ └── prompt-v2.png ├── best-practices.md ├── community │ ├── index.md │ └── roadmap.md ├── concepts │ ├── operators.md │ ├── optimization.md │ ├── pipelines.md │ └── schemas.md ├── examples │ ├── annotating-legal-documents.md │ ├── characterizing-troll-behavior.md │ ├── custom-parsing.md │ ├── mining-product-reviews.md │ ├── ollama.md │ ├── pdf-analysis-gemini.md │ ├── presidential-debate-themes.md │ ├── rate-limiting.md │ └── split-gather.md ├── execution │ └── running-pipelines.md ├── index.md ├── installation.md ├── operators │ ├── cluster.md │ ├── code.md │ ├── equijoin.md │ ├── extract.md │ ├── filter.md │ ├── gather.md │ ├── link-resolve.md │ ├── map.md │ ├── parallel-map.md │ ├── rank.md │ ├── reduce.md │ ├── resolve.md │ ├── sample.md │ ├── split.md │ └── unnest.md ├── optimization │ ├── configuration.md │ ├── example.md │ ├── overview.md │ └── python-api.md ├── pandas │ ├── examples.md │ ├── index.md │ └── operations.md ├── playground │ ├── features.md │ ├── index.md │ └── tutorial.md ├── python │ ├── examples.md │ └── index.md ├── stylesheets │ └── extra.css ├── tutorial-pythonapi.md └── tutorial.md ├── example_data ├── debates │ ├── data.json │ ├── theme_evolution_analysis_baseline.json │ └── theme_evolution_analysis_reduce_gleaning.json ├── post_di_trump_motion.json └── steamgames │ └── frequent_polarizing_themes.json ├── experiments ├── extraction_outputs.txt ├── logical_fallacy_extraction.py ├── structured_outputs.py └── structured_outputs.txt ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── server ├── __init__.py └── app │ ├── __init__.py │ ├── main.py │ ├── models.py │ └── routes │ ├── __init__.py │ ├── convert.py │ ├── filesystem.py │ └── pipeline.py ├── tests ├── __init__.py 
├── basic │ ├── sample_texts │ │ ├── one.txt │ │ └── two.md │ ├── test_basic_filter_split_gather.py │ ├── test_basic_map.py │ ├── test_basic_parallel_map.py │ ├── test_basic_reduce_resolve.py │ ├── test_cluster_and_sample.py │ ├── test_code_operations.py │ ├── test_optimizer.py │ └── test_pipeline_with_parsing.py ├── conftest.py ├── data │ └── PublicWaterMassMailing.pdf ├── ranking │ ├── plots │ │ ├── harmfulness_budget_performance.png │ │ ├── medical_pain_budget_performance.png │ │ └── synthetic_abstracts_budget_performance.png │ ├── test_rank.py │ └── test_rank_budget.py ├── test_api.py ├── test_azure_rl.py ├── test_config.py ├── test_eugene.py ├── test_ollama.py ├── test_pandas_accessors.py ├── test_parsing_tools.py ├── test_reduce_scale.py ├── test_reduce_value_sampling.py ├── test_resolve_auto_batch.py ├── test_runner_caching.py ├── test_split.py ├── test_synth_gather.py ├── test_synth_resolve.py ├── test_synthetic_output.py └── test_validation.py └── website ├── .env.local.sample ├── README.md ├── components.json ├── eslint.config.mjs ├── next.config.mjs ├── package-lock.json ├── package.json ├── postcss.config.mjs ├── posts └── hello-world.md ├── public ├── berkeley.png ├── debate_gemini_result.txt ├── debate_intermediates │ ├── extract_themes_and_viewpoints.json │ ├── summarize_theme_evolution.json │ ├── synthesized_resolve_0.json │ └── unnest_themes.json ├── debate_transcripts.json ├── demos │ ├── prompts_pipeline.yaml │ └── rfi_pipeline.yaml ├── docetl-50m-fall-2024.pdf ├── docetl-favicon-color.png ├── epiclogo.png ├── llms-full.txt ├── llms.txt └── theme_evolution_analysis.json ├── src ├── app │ ├── MarkdownRenderer.tsx │ ├── api │ │ ├── chat │ │ │ └── route.ts │ │ ├── checkNamespace │ │ │ └── route.ts │ │ ├── constants.ts │ │ ├── convertDocuments │ │ │ └── route.ts │ │ ├── downloadTutorialDataset │ │ │ └── route.ts │ │ ├── edit │ │ │ └── route.ts │ │ ├── generate │ │ │ └── route.ts │ │ ├── getInputOutput │ │ │ └── route.ts │ │ ├── getPipelineConfig │ │ │ └── route.ts │ │ ├── readFile │ │ │ └── route.ts │ │ ├── readFilePage │ │ │ └── route.ts │ │ ├── rfi-responses │ │ │ └── route.ts │ │ ├── saveDocuments │ │ │ └── route.ts │ │ ├── serveDocument │ │ │ └── [...path] │ │ │ │ └── route.ts │ │ ├── shouldOptimize │ │ │ └── route.ts │ │ ├── uploadFile │ │ │ └── route.ts │ │ ├── utils.ts │ │ └── writePipelineConfig │ │ │ └── route.ts │ ├── blog │ │ ├── [id] │ │ │ └── page.tsx │ │ └── page.tsx │ ├── fonts │ │ ├── GeistMonoVF.woff │ │ └── GeistVF.woff │ ├── globals.css │ ├── layout.tsx │ ├── localStorageKeys.ts │ ├── page.tsx │ ├── playground │ │ └── page.tsx │ ├── providers.tsx │ ├── showcase │ │ ├── ai-rfi-response-analysis │ │ │ └── page.tsx │ │ ├── ai-system-prompts-analysis │ │ │ └── page.tsx │ │ └── page.tsx │ └── types.ts ├── components │ ├── AIChatPanel.tsx │ ├── AIEditPopover.tsx │ ├── APIKeysDialog.tsx │ ├── AnsiRenderer.tsx │ ├── BookmarksPanel.tsx │ ├── CollapsibleCode.tsx │ ├── ColumnDialog.tsx │ ├── DarkMode.tsx │ ├── DatasetView.tsx │ ├── DebateContent.tsx │ ├── DocumentViewer.tsx │ ├── FileExplorer.tsx │ ├── InlineEditingButton.tsx │ ├── LLMContextPopover.tsx │ ├── MarkdownCell.tsx │ ├── NamespaceDialog.tsx │ ├── NaturalLanguagePipelineDialog.tsx │ ├── OperationCard.tsx │ ├── OperationHelpButton.tsx │ ├── OptimizationDialog.tsx │ ├── Output.tsx │ ├── PipelineGui.tsx │ ├── PipelinePrompts.tsx │ ├── PipelineSettings.tsx │ ├── PipelineVisualization.tsx │ ├── PresidentialDebateDemo.tsx │ ├── PrettyJSON.tsx │ ├── PromptImprovementDialog.tsx │ ├── ResizableDataTable.tsx │ ├── 
RowNavigator.tsx │ ├── SearchableCell.tsx │ ├── SpotlightOverlay.tsx │ ├── TutorialsDialog.tsx │ ├── operations │ │ ├── args.tsx │ │ └── components.tsx │ ├── showcase │ │ ├── rfi-response-explorer.tsx │ │ └── system-prompts-explorer.tsx │ ├── ui │ │ ├── accordion.tsx │ │ ├── alert-dialog.tsx │ │ ├── alert.tsx │ │ ├── badge.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── checkbox.tsx │ │ ├── collapsible.tsx │ │ ├── command.tsx │ │ ├── context-menu.tsx │ │ ├── dialog.tsx │ │ ├── dropdown-menu.tsx │ │ ├── form.tsx │ │ ├── hover-card.tsx │ │ ├── input.tsx │ │ ├── label.tsx │ │ ├── menubar.tsx │ │ ├── pagination.tsx │ │ ├── popover.tsx │ │ ├── progress.tsx │ │ ├── radio-group.tsx │ │ ├── resizable.tsx │ │ ├── scroll-area.tsx │ │ ├── select.tsx │ │ ├── skeleton.tsx │ │ ├── switch.tsx │ │ ├── table.tsx │ │ ├── tabs.tsx │ │ ├── textarea.tsx │ │ ├── toast.tsx │ │ ├── toaster.tsx │ │ └── tooltip.tsx │ └── utils.ts ├── contexts │ ├── BookmarkContext.tsx │ ├── PipelineContext.tsx │ ├── ThemeContext.tsx │ └── WebSocketContext.tsx ├── hooks │ ├── use-toast.ts │ ├── useDatasetUpload.ts │ ├── useOptimizeCheck.ts │ └── useRestorePipeline.ts ├── lib │ ├── analytics.ts │ ├── api-config.ts │ ├── api.ts │ └── utils.ts ├── mocks │ └── mockData.ts └── utils │ └── fileOperations.ts ├── tailwind.config.ts ├── todos.md ├── tsconfig.json └── vercel.json /.env.sample: -------------------------------------------------------------------------------- 1 | # BACKEND configuration 2 | BACKEND_ALLOW_ORIGINS=http://localhost:3000,http://127.0.0.1:3000 3 | BACKEND_HOST=localhost 4 | BACKEND_PORT=8000 5 | BACKEND_RELOAD=True 6 | 7 | # FRONTEND configuration 8 | FRONTEND_HOST=0.0.0.0 9 | FRONTEND_PORT=3000 10 | 11 | # Host port mapping for docker-compose (if not set, defaults are used in docker-compose.yml) 12 | FRONTEND_DOCKER_COMPOSE_PORT=3031 13 | BACKEND_DOCKER_COMPOSE_PORT=8081 14 | 15 | # Supported text file encodings 16 | TEXT_FILE_ENCODINGS=utf-8,latin1,cp1252,iso-8859-1 17 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.10", "3.11", "3.12"] 16 | env: 17 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v2 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Install Poetry 29 | uses: snok/install-poetry@v1 30 | 31 | - name: Copy environment file 32 | run: cp .env.sample .env 33 | 34 | - name: Install dependencies 35 | run: make install 36 | 37 | - name: Run pytest 38 | run: make tests-basic 39 | -------------------------------------------------------------------------------- /.github/workflows/docker-ci.yml: -------------------------------------------------------------------------------- 1 | name: Docker CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | docker-build-test: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Remove unnecessary files 15 | run: | 16 | sudo rm -rf /usr/share/dotnet 17 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 18 | - uses: actions/checkout@v4 19 | - name: Remove .env copy from Dockerfile 20 | run: sed -i '/COPY .env/d' Dockerfile 
21 | 22 | - name: Build Docker image 23 | run: | 24 | if ! docker build -t docetl .; then 25 | echo "Docker build failed" 26 | exit 1 27 | fi 28 | 29 | - name: Create Docker volume 30 | run: docker volume create docetl-data 31 | 32 | - name: Test Docker container 33 | run: | 34 | # Run the container in detached mode 35 | docker run -d \ 36 | -p 3000:3000 \ 37 | -p 8000:8000 \ 38 | -v docetl-data:/docetl-data \ 39 | -e FRONTEND_HOST=0.0.0.0 \ 40 | -e FRONTEND_PORT=3000 \ 41 | -e BACKEND_HOST=0.0.0.0 \ 42 | -e BACKEND_PORT=8000 \ 43 | --name docetl-test \ 44 | docetl 45 | 46 | # Wait for initial startup 47 | echo "Waiting for container to start..." 48 | sleep 30 49 | 50 | frontend_healthy=false 51 | 52 | # Check container health for up to 3 minutes 53 | for i in {1..6}; do 54 | if ! docker ps -q -f name=docetl-test > /dev/null 2>&1; then 55 | echo "Container stopped unexpectedly" 56 | docker logs docetl-test 57 | exit 1 58 | fi 59 | 60 | # Try to curl the frontend 61 | if curl -s -f http://localhost:3000/playground > /dev/null; then 62 | echo "Frontend is responding" 63 | frontend_healthy=true 64 | break 65 | fi 66 | 67 | if [ $i -eq 6 ]; then 68 | echo "Container health check failed after 3 minutes" 69 | docker logs docetl-test 70 | exit 1 71 | fi 72 | 73 | echo "Waiting for services to be ready... (attempt $i/6)" 74 | sleep 30 75 | done 76 | 77 | # Explicitly fail if frontend check never succeeded 78 | if [ "$frontend_healthy" = false ]; then 79 | echo "Frontend health check failed" 80 | docker logs docetl-test 81 | exit 1 82 | fi 83 | 84 | # If we get here, container is running and healthy 85 | echo "Container is running successfully" 86 | 87 | # Cleanup 88 | docker stop docetl-test 89 | docker rm docetl-test 90 | 91 | - name: Clean up Docker volume 92 | run: docker volume rm docetl-data -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - main 7 | permissions: 8 | contents: write 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Configure Git Credentials 15 | run: | 16 | git config user.name github-actions[bot] 17 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 18 | - uses: actions/setup-python@v5 19 | with: 20 | python-version: 3.x 21 | - name: Install dependencies 22 | run: | 23 | pip install mkdocs==1.6.1 \ 24 | mkdocs-material==9.5.34 \ 25 | mkdocstrings==0.26.1 \ 26 | mkdocstrings-python==1.11.1 \ 27 | mkdocs-glightbox==0.4.0 \ 28 | pytkdocs==0.16.2 29 | - name: Copy environment file 30 | run: cp .env.sample .env 31 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 32 | - uses: actions/cache@v4 33 | with: 34 | key: mkdocs-material-${{ env.cache_id }} 35 | path: .cache 36 | restore-keys: | 37 | mkdocs-material- 38 | - run: mkdocs build 39 | - run: mkdocs gh-deploy --force 40 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*.*.*" 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | pypi-publish: 13 | name: Upload release to PyPI 14 | runs-on: ubuntu-latest 15 | environment: 16 | name: pypi 17 | url: https://pypi.org/project/docetl/ 18 | permissions: 19 | id-token: write 20 | 
steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python 3.10 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.10" 28 | 29 | - name: Install Poetry 30 | run: | 31 | curl -sSL https://install.python-poetry.org | python - -y 32 | 33 | - name: Update PATH 34 | run: echo "$HOME/.local/bin" >> $GITHUB_PATH 35 | 36 | - name: Update Poetry configuration 37 | run: poetry config virtualenvs.create false 38 | 39 | - name: Install dependencies 40 | run: poetry install --sync --no-interaction 41 | 42 | - name: Package project 43 | run: poetry build 44 | 45 | - name: Publish package distributions to PyPI 46 | uses: pypa/gh-action-pypi-publish@release/v1 47 | -------------------------------------------------------------------------------- /.github/workflows/stage.yml: -------------------------------------------------------------------------------- 1 | name: Create or Update PR from staging to main 2 | 3 | on: 4 | push: 5 | branches: 6 | - staging 7 | pull_request: 8 | types: 9 | - closed 10 | branches: 11 | - staging 12 | 13 | jobs: 14 | create-or-update-pr: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Check for existing PR 22 | id: check_pr 23 | uses: actions/github-script@v6 24 | with: 25 | github-token: ${{secrets.GITHUB_TOKEN}} 26 | script: | 27 | const { data: pullRequests } = await github.rest.pulls.list({ 28 | owner: context.repo.owner, 29 | repo: context.repo.repo, 30 | state: 'open', 31 | head: 'staging', 32 | base: 'main' 33 | }); 34 | return pullRequests.length > 0 ? 'true' : 'false'; 35 | 36 | - name: Create Pull Request 37 | if: steps.check_pr.outputs.result == 'false' 38 | uses: repo-sync/pull-request@v2 39 | with: 40 | source_branch: "staging" 41 | destination_branch: "main" 42 | pr_title: "Merge staging into main" 43 | pr_body: "This PR was automatically created to merge changes from staging into main." 44 | github_token: ${{ secrets.GITHUB_TOKEN }} 45 | 46 | - name: Update Pull Request 47 | if: steps.check_pr.outputs.result == 'true' 48 | uses: actions/github-script@v6 49 | with: 50 | github-token: ${{secrets.GITHUB_TOKEN}} 51 | script: | 52 | const { data: pullRequests } = await github.rest.pulls.list({ 53 | owner: context.repo.owner, 54 | repo: context.repo.repo, 55 | state: 'open', 56 | head: 'staging', 57 | base: 'main' 58 | }); 59 | 60 | if (pullRequests.length > 0) { 61 | const prNumber = pullRequests[0].number; 62 | await github.rest.pulls.update({ 63 | owner: context.repo.owner, 64 | repo: context.repo.repo, 65 | pull_number: prNumber, 66 | body: 'This PR has been automatically updated with the latest changes from staging.' 
67 | }); 68 | console.log(`Updated PR #${prNumber}`); 69 | } 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | *__pycache__* 3 | *scratch* 4 | *relevance_assessment* 5 | palimpzest/* 6 | paper_workloads/contracts/full_contract_txt* 7 | paper_workloads/contracts/sample_contract_txt* 8 | *.xlsx 9 | *.csv 10 | paper_workloads/* 11 | preprint_workloads/* 12 | workloads/* 13 | *mypy_cache* 14 | *.DS_Store 15 | *pytest_cache* 16 | *ruff_cache* 17 | motion-old* 18 | venv/ 19 | 20 | # dependencies 21 | website/node_modules 22 | website/.pnp 23 | website/.pnp.js 24 | website/.yarn/install-state.gz 25 | 26 | # testing 27 | website/coverage 28 | 29 | # next.js 30 | website/.next/ 31 | website/out/ 32 | 33 | # production 34 | website/build 35 | 36 | # misc 37 | website/.DS_Store 38 | website/*.pem 39 | 40 | # debug 41 | website/npm-debug.log* 42 | website/yarn-debug.log* 43 | website/yarn-error.log* 44 | 45 | # local env files 46 | website/.env*.local 47 | 48 | # vercel 49 | website/.vercel 50 | 51 | # typescript 52 | website/*.tsbuildinfo 53 | website/next-env.d.ts 54 | 55 | # Docker 56 | .docker/ 57 | 58 | # experiments 59 | experiments/*.json 60 | 61 | metrics_vs_cost.png 62 | tests/data/anthropic-red-team-attempts.jsonl 63 | tests/data/get_freshstack.py -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: false 3 | 4 | files: "^(docetl)/" 5 | exclude: '\__init__.py$' 6 | 7 | repos: 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: v4.5.0 10 | hooks: 11 | - id: trailing-whitespace 12 | - id: end-of-file-fixer 13 | exclude: ^.*\.egg-info/ 14 | - id: check-merge-conflict 15 | - id: check-case-conflict 16 | - id: pretty-format-json 17 | args: [--autofix, --no-ensure-ascii, --no-sort-keys] 18 | - id: check-ast 19 | - id: debug-statements 20 | - id: check-docstring-first 21 | 22 | - repo: https://github.com/hadialqattan/pycln 23 | rev: v2.5.0 24 | hooks: 25 | - id: pycln 26 | args: [--all, --exclude, "__init__.py$", --include, "^docetl/"] 27 | 28 | - repo: https://github.com/psf/black 29 | rev: 24.1.1 30 | hooks: 31 | - id: black 32 | 33 | - repo: https://github.com/pycqa/isort 34 | rev: 5.13.2 35 | hooks: 36 | - id: isort 37 | name: "isort (python)" 38 | types: [python] 39 | args: [--profile, black] 40 | 41 | - repo: https://github.com/charliermarsh/ruff-pre-commit 42 | # Ruff version. 
43 | rev: "v0.2.1" 44 | hooks: 45 | - id: ruff 46 | 47 | - repo: https://github.com/pre-commit/pre-commit 48 | rev: v3.6.0 49 | hooks: 50 | - id: validate_manifest 51 | 52 | - repo: https://github.com/pre-commit/mirrors-prettier 53 | rev: "v4.0.0-alpha.8" # Prettier version 54 | hooks: 55 | - id: prettier 56 | files: "^ui/" 57 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage for Python dependencies 2 | FROM python:3.11-slim AS python-builder 3 | 4 | RUN pip install poetry==1.4.2 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache \ 10 | DOCETL_HOME_DIR="/docetl-data" 11 | 12 | WORKDIR /app 13 | 14 | COPY pyproject.toml poetry.lock ./ 15 | COPY docetl/ ./docetl/ 16 | COPY server/ ./server/ 17 | COPY tests/ ./tests/ 18 | RUN touch README.md 19 | 20 | # Install with --no-root first for dependencies, then install with root for entrypoints 21 | RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --all-extras --no-root && \ 22 | poetry install --all-extras 23 | 24 | # Build stage for Node.js dependencies 25 | FROM node:20-alpine AS node-builder 26 | 27 | WORKDIR /app/website 28 | 29 | # Update DOCETL_HOME_DIR to match final location 30 | ENV DOCETL_HOME_DIR="/docetl-data" 31 | 32 | COPY website/package*.json ./ 33 | RUN npm install 34 | COPY website/ ./ 35 | RUN npm run build 36 | 37 | # Final runtime stage 38 | FROM python:3.11-slim AS runtime 39 | 40 | # Install Node.js 41 | RUN apt-get update && apt-get install -y \ 42 | curl \ 43 | && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ 44 | && apt-get install -y nodejs \ 45 | && rm -rf /var/lib/apt/lists/* 46 | 47 | WORKDIR /app 48 | 49 | # Copy Python virtual environment from builder 50 | ENV VIRTUAL_ENV=/app/.venv \ 51 | PATH="/app/.venv/bin:$PATH" \ 52 | PYTHONPATH="/app" \ 53 | DOCETL_HOME_DIR="/docetl-data" 54 | 55 | COPY --from=python-builder /app/.venv ${VIRTUAL_ENV} 56 | 57 | # Copy Python application files 58 | COPY docetl/ ./docetl/ 59 | COPY server/ ./server/ 60 | COPY tests/ ./tests/ 61 | COPY pyproject.toml poetry.lock ./ 62 | COPY .env ./ 63 | 64 | # Copy Node.js dependencies and application files 65 | COPY --from=node-builder /app/website ./website 66 | 67 | ENV PORT=3000 68 | 69 | # Create data directory with appropriate permissions 70 | RUN mkdir -p /docetl-data && chown -R nobody:nogroup /docetl-data && chmod 777 /docetl-data 71 | 72 | # Define volume AFTER creating and setting permissions 73 | VOLUME ["/docetl-data"] 74 | 75 | # Expose ports for frontend and backend 76 | EXPOSE 3000 8000 77 | 78 | # Start both servers 79 | CMD ["sh", "-c", "python3 server/app/main.py & cd website && npm run start"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Shreya Shankar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/docetl/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.2.4"
2 | 
3 | import warnings
4 | 
5 | # TODO: Remove after https://github.com/BerriAI/litellm/issues/7560 is fixed
6 | warnings.filterwarnings("ignore", category=UserWarning, module="pydantic._internal._config")
7 | 
8 | from docetl.runner import DSLRunner
9 | from docetl.optimizer import Optimizer
10 | from docetl.apis.pd_accessors import SemanticAccessor
11 | 
12 | __all__ = ["DSLRunner", "Optimizer", "SemanticAccessor"]
13 | 
--------------------------------------------------------------------------------
/docetl/apis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docetl/apis/__init__.py
--------------------------------------------------------------------------------
/docetl/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Optional
4 | 
5 | import typer
6 | from dotenv import load_dotenv
7 | 
8 | from docetl.operations.utils import clear_cache as cc
9 | from docetl.runner import DSLRunner
10 | 
11 | app = typer.Typer(pretty_exceptions_enable=False)
12 | 
13 | 
14 | @app.command()
15 | def build(
16 |     yaml_file: Path = typer.Argument(
17 |         ..., help="Path to the YAML file containing the pipeline configuration"
18 |     ),
19 |     max_threads: Optional[int] = typer.Option(
20 |         None, help="Maximum number of threads to use for running operations"
21 |     ),
22 |     resume: bool = typer.Option(
23 |         False, help="Resume optimization from a previous build that may have failed"
24 |     ),
25 |     save_path: Path = typer.Option(
26 |         None, help="Path to save the optimized pipeline configuration"
27 |     ),
28 | ):
29 |     """
30 |     Build and optimize the configuration specified in the YAML file.
31 |     Any arguments passed here will override the values in the YAML file.
32 | 
33 |     Args:
34 |         yaml_file (Path): Path to the YAML file containing the pipeline configuration.
35 |         max_threads (Optional[int]): Maximum number of threads to use for running operations.
36 |         resume (bool): Whether to resume optimization from a previous run. Defaults to False.
37 |         save_path (Path): Path to save the optimized pipeline configuration.
38 | 
39 | """ 40 | # Get the current working directory (where the user called the command) 41 | cwd = os.getcwd() 42 | 43 | # Load .env file from the current working directory 44 | env_file = os.path.join(cwd, ".env") 45 | if os.path.exists(env_file): 46 | load_dotenv(env_file) 47 | 48 | runner = DSLRunner.from_yaml(str(yaml_file), max_threads=max_threads) 49 | runner.optimize( 50 | save=True, 51 | return_pipeline=False, 52 | resume=resume, 53 | save_path=save_path, 54 | ) 55 | 56 | 57 | @app.command() 58 | def run( 59 | yaml_file: Path = typer.Argument( 60 | ..., help="Path to the YAML file containing the pipeline configuration" 61 | ), 62 | max_threads: Optional[int] = typer.Option( 63 | None, help="Maximum number of threads to use for running operations" 64 | ), 65 | ): 66 | """ 67 | Run the configuration specified in the YAML file. 68 | 69 | Args: 70 | yaml_file (Path): Path to the YAML file containing the pipeline configuration. 71 | max_threads (Optional[int]): Maximum number of threads to use for running operations. 72 | """ 73 | # Get the current working directory (where the user called the command) 74 | cwd = os.getcwd() 75 | 76 | # Load .env file from the current working directory 77 | env_file = os.path.join(cwd, ".env") 78 | if os.path.exists(env_file): 79 | load_dotenv(env_file) 80 | 81 | runner = DSLRunner.from_yaml(str(yaml_file), max_threads=max_threads) 82 | runner.load_run_save() 83 | 84 | 85 | @app.command() 86 | def clear_cache(): 87 | """ 88 | Clear the LLM cache stored on disk. 89 | """ 90 | cc() 91 | 92 | 93 | @app.command() 94 | def version(): 95 | """ 96 | Display the current version of DocETL. 97 | """ 98 | import docetl 99 | 100 | typer.echo(f"DocETL version: {docetl.__version__}") 101 | 102 | 103 | if __name__ == "__main__": 104 | app() 105 | -------------------------------------------------------------------------------- /docetl/operations/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | from docetl.operations.cluster import ClusterOperation 3 | from docetl.operations.code_operations import CodeFilterOperation, CodeMapOperation, CodeReduceOperation 4 | from docetl.operations.equijoin import EquijoinOperation 5 | from docetl.operations.filter import FilterOperation 6 | from docetl.operations.gather import GatherOperation 7 | from docetl.operations.map import MapOperation 8 | from docetl.operations.reduce import ReduceOperation 9 | from docetl.operations.resolve import ResolveOperation 10 | from docetl.operations.rank import RankOperation 11 | from docetl.operations.split import SplitOperation 12 | from docetl.operations.sample import SampleOperation 13 | from docetl.operations.unnest import UnnestOperation 14 | from docetl.operations.scan import ScanOperation 15 | from docetl.operations.add_uuid import AddUuidOperation 16 | from docetl.operations.extract import ExtractOperation 17 | 18 | mapping = { 19 | "cluster": ClusterOperation, 20 | "code_filter": CodeFilterOperation, 21 | "code_map": CodeMapOperation, 22 | "code_reduce": CodeReduceOperation, 23 | "equijoin": EquijoinOperation, 24 | "filter": FilterOperation, 25 | "gather": GatherOperation, 26 | "map": MapOperation, 27 | "reduce": ReduceOperation, 28 | "resolve": ResolveOperation, 29 | "rank": RankOperation, 30 | "split": SplitOperation, 31 | "sample": SampleOperation, 32 | "unnest": UnnestOperation, 33 | "scan": ScanOperation, 34 | "add_uuid": AddUuidOperation, 35 | "extract": ExtractOperation 36 | } 37 | 38 | def 
get_operation(operation_type: str): 39 | """Loads a single operation by name""" 40 | try: 41 | entrypoint = importlib.metadata.entry_points(group="docetl.operation")[ 42 | operation_type 43 | ] 44 | return entrypoint.load() 45 | except KeyError: 46 | if operation_type in mapping: 47 | return mapping[operation_type] 48 | raise KeyError(f"Unrecognized operation {operation_type}") 49 | 50 | def get_operations(): 51 | """Load all available operations and return them as a dictionary""" 52 | operations = mapping.copy() 53 | operations.update({ 54 | op.name: op.load() 55 | for op in importlib.metadata.entry_points(group="docetl.operation") 56 | }) 57 | return operations 58 | -------------------------------------------------------------------------------- /docetl/operations/add_uuid.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import Any, Dict, List, Tuple 3 | 4 | from docetl.operations.base import BaseOperation 5 | 6 | 7 | class AddUuidOperation(BaseOperation): 8 | """ 9 | A class that implements an operation to add a UUID to each document. 10 | 11 | This class extends BaseOperation to: 12 | 1. Generate a unique UUID for each document 13 | 2. Add the UUID under a key formatted as {operation_name}_id 14 | """ 15 | 16 | class schema(BaseOperation.schema): 17 | type: str = "add_uuid" 18 | 19 | def __init__(self, *args, **kwargs): 20 | super().__init__(*args, **kwargs) 21 | self.name = self.config["name"] 22 | 23 | def syntax_check(self) -> None: 24 | # No additional configuration needed beyond base requirements 25 | pass 26 | 27 | def execute( 28 | self, input_data: List[Dict[str, Any]] 29 | ) -> Tuple[List[Dict[str, Any]], float]: 30 | results = [] 31 | cost = 0.0 32 | 33 | # If there's an id key in the config, use that as the id key 34 | if "id_key" in self.config: 35 | id_key = self.config["id_key"] 36 | else: 37 | id_key = f"{self.name}_id" 38 | 39 | for item in input_data: 40 | result = item.copy() 41 | result[id_key] = str(uuid.uuid4()) 42 | results.append(result) 43 | 44 | return results, cost 45 | -------------------------------------------------------------------------------- /docetl/operations/clustering_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains utilities for clustering based on different methods. 3 | 4 | We use these in map and reduce operations. 
5 | """ 6 | 7 | from typing import Dict, List, Tuple 8 | 9 | from docetl.operations.utils import APIWrapper 10 | from docetl.utils import completion_cost 11 | 12 | 13 | def get_embeddings_for_clustering( 14 | items: List[Dict], sampling_config: Dict, api_wrapper: APIWrapper 15 | ) -> Tuple[List[List[float]], float]: 16 | embedding_model = sampling_config.get("embedding_model", "text-embedding-3-small") 17 | embedding_keys = sampling_config.get("embedding_keys") 18 | if not embedding_keys: 19 | embedding_keys = list(items[0].keys()) 20 | 21 | if embedding_model == "sentence-transformer": 22 | return get_embeddings_for_clustering_with_st(items, embedding_keys) 23 | 24 | embeddings = [] 25 | cost = 0 26 | batch_size = 1000 27 | 28 | for i in range(0, len(items), batch_size): 29 | batch = items[i : i + batch_size] 30 | texts = [ 31 | " ".join(str(item[key]) for key in embedding_keys if key in item)[:10000] 32 | for item in batch 33 | ] 34 | response = api_wrapper.gen_embedding(embedding_model, texts) 35 | embeddings.extend([data["embedding"] for data in response["data"]]) 36 | cost += completion_cost(response) 37 | 38 | return embeddings, cost 39 | 40 | 41 | def get_embeddings_for_clustering_with_st( 42 | items: List[Dict], embedding_keys: List[str] 43 | ) -> Tuple[List[List[float]], float]: 44 | import torch 45 | from sentence_transformers import SentenceTransformer 46 | 47 | device = "cpu" 48 | if torch.backends.mps.is_available(): 49 | device = "mps" 50 | elif torch.cuda.is_available(): 51 | device = "cuda" 52 | 53 | model = SentenceTransformer("all-MiniLM-L6-v2", device=device) 54 | embeddings = model.encode( 55 | [ 56 | " ".join(str(item[key]) for key in embedding_keys if key in item)[:10000] 57 | for item in items 58 | ] 59 | ) 60 | return embeddings, 0 61 | 62 | 63 | def cluster_documents( 64 | documents: List[Dict], 65 | sampling_config: Dict, 66 | sample_size: int, 67 | api_wrapper: APIWrapper, 68 | ) -> Tuple[Dict[int, List[Dict]], float]: 69 | """ 70 | Cluster documents using KMeans clustering algorithm. 71 | 72 | Args: 73 | documents (List[Dict]): The list of documents to cluster. 74 | sampling_config (Dict): The sampling configuration. Must contain embedding_model. If embedding_keys is not specified, it will use all keys in the document. If embedding_model is not specified, it will use text-embedding-3-small. If embedding_model is sentence-transformer, it will use all-MiniLM-L6-v2. 75 | sample_size (int): The number of clusters to create. 76 | api_wrapper (APIWrapper): The API wrapper to use for embedding. 77 | Returns: 78 | Dict[int, List[Dict]]: A dictionary of clusters, where each cluster is a list of documents. 
79 | """ 80 | embeddings, cost = get_embeddings_for_clustering( 81 | documents, sampling_config, api_wrapper 82 | ) 83 | 84 | from sklearn.cluster import KMeans 85 | 86 | num_clusters = min(sample_size, len(documents)) 87 | kmeans = KMeans(n_clusters=num_clusters, random_state=42) 88 | cluster_labels = kmeans.fit_predict(embeddings) 89 | 90 | clusters = {i: [] for i in range(num_clusters)} 91 | for idx, label in enumerate(cluster_labels): 92 | clusters[label].append(documents[idx]) 93 | 94 | return clusters, cost 95 | -------------------------------------------------------------------------------- /docetl/operations/scan.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | from docetl.operations.base import BaseOperation 4 | 5 | 6 | class ScanOperation(BaseOperation): 7 | class schema(BaseOperation.schema): 8 | dataset_name: str 9 | 10 | def syntax_check(self) -> None: 11 | """Validate the scan operation configuration.""" 12 | super().syntax_check() 13 | 14 | def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]: 15 | """ 16 | Execute the scan operation to load data from the configured source. 17 | 18 | Args: 19 | input_data: Not used in scan operation 20 | 21 | Returns: 22 | Tuple[List[Dict], float]: Loaded data and cost (0 for scan) 23 | """ 24 | 25 | # Look in the runner.datasets objects 26 | if self.config["dataset_name"] not in self.runner.datasets: 27 | raise ValueError(f"Dataset {self.config['dataset_name']} not found") 28 | 29 | return ( 30 | self.runner.datasets[self.config["dataset_name"]].load(), 31 | 0.0, 32 | ) # Scan has no LLM cost 33 | -------------------------------------------------------------------------------- /docetl/operations/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .api import APIWrapper 2 | from .cache import ( 3 | cache, 4 | cache_key, 5 | clear_cache, 6 | flush_cache, 7 | freezeargs, 8 | CACHE_DIR, 9 | LLM_CACHE_DIR, 10 | DOCETL_HOME_DIR, 11 | ) 12 | from .llm import LLMResult, InvalidOutputError, truncate_messages 13 | from .progress import RichLoopBar, rich_as_completed 14 | from .validation import safe_eval, convert_val, convert_dict_schema_to_list_schema, get_user_input_for_schema, strict_render 15 | 16 | __all__ = [ 17 | 'APIWrapper', 18 | 'cache', 19 | 'cache_key', 20 | 'clear_cache', 21 | 'flush_cache', 22 | 'freezeargs', 23 | 'CACHE_DIR', 24 | 'LLM_CACHE_DIR', 25 | 'DOCETL_HOME_DIR', 26 | 'LLMResult', 27 | 'InvalidOutputError', 28 | 'RichLoopBar', 29 | 'rich_as_completed', 30 | 'safe_eval', 31 | 'convert_val', 32 | 'convert_dict_schema_to_list_schema', 33 | 'get_user_input_for_schema', 34 | 'truncate_messages', 35 | "strict_render" 36 | ] -------------------------------------------------------------------------------- /docetl/operations/utils/cache.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import hashlib 3 | import json 4 | import os 5 | import shutil 6 | from typing import Any, Dict, List 7 | 8 | from diskcache import Cache 9 | from dotenv import load_dotenv 10 | from frozendict import frozendict 11 | from rich.console import Console 12 | 13 | from docetl.console import DOCETL_CONSOLE 14 | 15 | load_dotenv() 16 | 17 | DOCETL_HOME_DIR = ( 18 | os.environ.get("DOCETL_HOME_DIR", os.path.expanduser("~")) + "/.cache/docetl" 19 | ) 20 | CACHE_DIR = os.path.join(DOCETL_HOME_DIR, "general") 21 | LLM_CACHE_DIR = 
os.path.join(DOCETL_HOME_DIR, "llm") 22 | cache = Cache(LLM_CACHE_DIR) 23 | cache.close() 24 | 25 | 26 | def freezeargs(func): 27 | """ 28 | Decorator to convert mutable dictionary arguments into immutable. 29 | """ 30 | 31 | @functools.wraps(func) 32 | def wrapped(*args, **kwargs): 33 | args = tuple( 34 | ( 35 | frozendict(arg) 36 | if isinstance(arg, dict) 37 | else json.dumps(arg) if isinstance(arg, list) else arg 38 | ) 39 | for arg in args 40 | ) 41 | kwargs = { 42 | k: ( 43 | frozendict(v) 44 | if isinstance(v, dict) 45 | else json.dumps(v) if isinstance(v, list) else v 46 | ) 47 | for k, v in kwargs.items() 48 | } 49 | return func(*args, **kwargs) 50 | 51 | return wrapped 52 | 53 | 54 | def flush_cache(console: Console = DOCETL_CONSOLE): 55 | """Flush the cache to disk.""" 56 | console.log("[bold green]Flushing cache to disk...[/bold green]") 57 | cache.close() 58 | console.log("[bold green]Cache flushed to disk.[/bold green]") 59 | 60 | 61 | def clear_cache(console: Console = DOCETL_CONSOLE): 62 | """Clear the LLM cache stored on disk.""" 63 | console.log("[bold yellow]Clearing LLM cache...[/bold yellow]") 64 | try: 65 | with cache as c: 66 | c.clear() 67 | # Remove all files in the cache directory 68 | if not os.path.exists(CACHE_DIR): 69 | os.makedirs(CACHE_DIR) 70 | for filename in os.listdir(CACHE_DIR): 71 | file_path = os.path.join(CACHE_DIR, filename) 72 | try: 73 | if os.path.isfile(file_path): 74 | os.unlink(file_path) 75 | elif os.path.isdir(file_path): 76 | shutil.rmtree(file_path) 77 | except Exception as e: 78 | console.log( 79 | f"[bold red]Error deleting {file_path}: {str(e)}[/bold red]" 80 | ) 81 | console.log("[bold green]Cache cleared successfully.[/bold green]") 82 | except Exception as e: 83 | console.log(f"[bold red]Error clearing cache: {str(e)}[/bold red]") 84 | 85 | 86 | def cache_key( 87 | model: str, 88 | op_type: str, 89 | messages: List[Dict[str, str]], 90 | output_schema: Dict[str, str], 91 | scratchpad: str = None, 92 | system_prompt: Dict[str, str] = None, 93 | op_config: Dict[str, Any] = {}, 94 | ) -> str: 95 | """Generate a unique cache key based on function arguments.""" 96 | key_dict = { 97 | "model": model, 98 | "op_type": op_type, 99 | "messages": json.dumps(messages, sort_keys=True), 100 | "output_schema": json.dumps(output_schema, sort_keys=True), 101 | "scratchpad": scratchpad, 102 | "system_prompt": json.dumps(system_prompt, sort_keys=True), 103 | "op_config": json.dumps(op_config, sort_keys=True), 104 | } 105 | return hashlib.md5(json.dumps(key_dict, sort_keys=True).encode()).hexdigest() 106 | -------------------------------------------------------------------------------- /docetl/operations/utils/progress.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import as_completed 2 | from typing import Iterable, Optional, Union 3 | 4 | from tqdm import tqdm 5 | 6 | 7 | class RichLoopBar: 8 | """A progress bar class that integrates with Rich console.""" 9 | 10 | def __init__( 11 | self, 12 | iterable: Optional[Union[Iterable, range]] = None, 13 | total: Optional[int] = None, 14 | desc: Optional[str] = None, 15 | leave: bool = True, 16 | console=None, 17 | ): 18 | if console is None: 19 | raise ValueError("Console must be provided") 20 | self.console = console 21 | self.iterable = iterable 22 | self.total = self._get_total(iterable, total) 23 | self.description = desc 24 | self.leave = leave 25 | self.tqdm = None 26 | 27 | def _get_total(self, iterable, total): 28 | if total is not 
None: 29 | return total 30 | if isinstance(iterable, range): 31 | return len(iterable) 32 | try: 33 | return len(iterable) 34 | except TypeError: 35 | return None 36 | 37 | def __iter__(self): 38 | self.tqdm = tqdm( 39 | self.iterable, 40 | total=self.total, 41 | desc=self.description, 42 | file=self.console.file, 43 | ) 44 | for item in self.tqdm: 45 | yield item 46 | 47 | def __enter__(self): 48 | self.tqdm = tqdm( 49 | total=self.total, 50 | desc=self.description, 51 | leave=self.leave, 52 | file=self.console.file, 53 | ) 54 | return self 55 | 56 | def __exit__(self, exc_type, exc_val, exc_tb): 57 | self.tqdm.close() 58 | 59 | def update(self, n=1): 60 | if self.tqdm: 61 | self.tqdm.update(n) 62 | 63 | 64 | def rich_as_completed(futures, total=None, desc=None, leave=True, console=None): 65 | """Yield completed futures with a Rich progress bar.""" 66 | if console is None: 67 | raise ValueError("Console must be provided") 68 | 69 | with RichLoopBar(total=total, desc=desc, leave=leave, console=console) as pbar: 70 | for future in as_completed(futures): 71 | yield future 72 | pbar.update() 73 | -------------------------------------------------------------------------------- /docetl/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from docetl.optimizers.join_optimizer import JoinOptimizer 2 | from docetl.optimizers.map_optimizer import MapOptimizer 3 | from docetl.optimizers.reduce_optimizer import ReduceOptimizer 4 | 5 | __all__ = ["JoinOptimizer", "MapOptimizer", "ReduceOptimizer"] -------------------------------------------------------------------------------- /docetl/optimizers/map_optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from docetl.optimizers.map_optimizer.optimizer import MapOptimizer 2 | 3 | __all__ = ["MapOptimizer"] 4 | -------------------------------------------------------------------------------- /docetl/ratelimiter.py: -------------------------------------------------------------------------------- 1 | import math 2 | from inspect import isawaitable 3 | from typing import Any, Dict 4 | 5 | import pyrate_limiter 6 | 7 | 8 | class BucketCollection(pyrate_limiter.BucketFactory): 9 | def __init__(self, **buckets): 10 | self.clock = pyrate_limiter.TimeClock() 11 | self.buckets = buckets 12 | 13 | def wrap_item(self, name: str, weight: int = 1) -> pyrate_limiter.RateItem: 14 | now = self.clock.now() 15 | 16 | async def wrap_async(): 17 | return pyrate_limiter.RateItem(name, await now, weight=weight) 18 | 19 | def wrap_sync(): 20 | return pyrate_limiter.RateItem(name, now, weight=weight) 21 | 22 | return wrap_async() if isawaitable(now) else wrap_sync() 23 | 24 | def get(self, item: pyrate_limiter.RateItem) -> pyrate_limiter.AbstractBucket: 25 | if item.name not in self.buckets: 26 | return self.buckets["unknown"] 27 | return self.buckets[item.name] 28 | 29 | 30 | def create_bucket_factory(rate_limits: Dict[str, Any]) -> BucketCollection: 31 | """ 32 | Create a BucketCollection from rate limits configuration. 
33 | 34 | Args: 35 | rate_limits: Dictionary containing rate limit configuration 36 | 37 | Returns: 38 | BucketCollection configured with the specified rate limits 39 | """ 40 | buckets = { 41 | param: pyrate_limiter.InMemoryBucket( 42 | [ 43 | pyrate_limiter.Rate( 44 | param_limit["count"], 45 | param_limit["per"] 46 | * getattr( 47 | pyrate_limiter.Duration, 48 | param_limit.get("unit", "SECOND").upper(), 49 | ), 50 | ) 51 | for param_limit in param_limits 52 | ] 53 | ) 54 | for param, param_limits in rate_limits.items() 55 | } 56 | 57 | # Add default bucket for unknown parameters 58 | buckets["unknown"] = pyrate_limiter.InMemoryBucket( 59 | [pyrate_limiter.Rate(math.inf, 1)] 60 | ) 61 | 62 | return BucketCollection(**buckets) 63 | -------------------------------------------------------------------------------- /docetl/schemas.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from . import dataset 4 | 5 | # ruff: noqa: F403 6 | from .base_schemas import * 7 | from .operations import ( 8 | cluster, 9 | equijoin, 10 | filter, 11 | gather, 12 | map, 13 | reduce, 14 | resolve, 15 | sample, 16 | split, 17 | unnest, 18 | ) 19 | 20 | MapOp = map.MapOperation.schema 21 | ResolveOp = resolve.ResolveOperation.schema 22 | ReduceOp = reduce.ReduceOperation.schema 23 | ParallelMapOp = map.ParallelMapOperation.schema 24 | FilterOp = filter.FilterOperation.schema 25 | EquijoinOp = equijoin.EquijoinOperation.schema 26 | SplitOp = split.SplitOperation.schema 27 | GatherOp = gather.GatherOperation.schema 28 | UnnestOp = unnest.UnnestOperation.schema 29 | ClusterOp = cluster.ClusterOperation.schema 30 | SampleOp = sample.SampleOperation.schema 31 | 32 | OpType = Union[ 33 | MapOp, 34 | ResolveOp, 35 | ReduceOp, 36 | ParallelMapOp, 37 | FilterOp, 38 | EquijoinOp, 39 | SplitOp, 40 | GatherOp, 41 | UnnestOp, 42 | ] 43 | 44 | Dataset = dataset.Dataset.schema 45 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | docetl: 3 | container_name: docetl-docwrangler-stack 4 | build: . 5 | image: docetl 6 | restart: unless-stopped 7 | healthcheck: 8 | test: ["CMD", "curl", "-f", "http://localhost:8000/health"] 9 | interval: 30s 10 | timeout: 10s 11 | retries: 3 12 | start_period: 40s 13 | ports: 14 | # Map host ports to container ports using environment variables. 15 | # If FRONTEND_DOCKER_COMPOSE_PORT is not set, default to 3031. 16 | - "${FRONTEND_DOCKER_COMPOSE_PORT:-3031}:3000" 17 | # If BACKEND_DOCKER_COMPOSE_PORT is not set, default to 8081. 18 | - "${BACKEND_DOCKER_COMPOSE_PORT:-8081}:8000" 19 | environment: 20 | # Pass environment variables from the .env file (or host environment) 21 | # with default values if they are not defined. 
22 | # Authentication 23 | - OPENAI_API_KEY=${OPENAI_API_KEY:-your_api_key_here} 24 | 25 | # Backend Configuration 26 | - BACKEND_ALLOW_ORIGINS=${BACKEND_ALLOW_ORIGINS:-http://localhost:3000,http://127.0.0.1:3000} 27 | - BACKEND_HOST=${BACKEND_HOST:-0.0.0.0} 28 | - BACKEND_PORT=${BACKEND_PORT:-8000} 29 | - BACKEND_RELOAD=${BACKEND_RELOAD:-True} 30 | 31 | # Frontend Configuration 32 | - FRONTEND_HOST=${FRONTEND_HOST:-0.0.0.0} 33 | - FRONTEND_PORT=${FRONTEND_PORT:-3000} 34 | 35 | # File Processing 36 | - TEXT_FILE_ENCODINGS=${TEXT_FILE_ENCODINGS:-utf-8,latin1,cp1252,iso-8859-1} 37 | volumes: 38 | # Mount the named volume "docetl-data" to /docetl-data in the container. 39 | - docetl-data:/docetl-data 40 | 41 | docetl-aws: 42 | extends: 43 | service: docetl 44 | environment: 45 | - AWS_PROFILE=${AWS_PROFILE:-default} 46 | - AWS_REGION=${AWS_REGION:-us-west-2} 47 | volumes: 48 | - ~/.aws:/root/.aws:ro 49 | profiles: 50 | - aws 51 | 52 | volumes: 53 | docetl-data: 54 | -------------------------------------------------------------------------------- /docs/advanced/custom-operators.md: -------------------------------------------------------------------------------- 1 | TODO: Support UDFs. 2 | -------------------------------------------------------------------------------- /docs/advanced/extending-agents.md: -------------------------------------------------------------------------------- 1 | TODO: Add guide for extending agents (after preprint release). 2 | -------------------------------------------------------------------------------- /docs/advanced/performance-tuning.md: -------------------------------------------------------------------------------- 1 | TODO: Add performance tuning guide. 2 | -------------------------------------------------------------------------------- /docs/api-reference/cli.md: -------------------------------------------------------------------------------- 1 | ::: docetl.cli.run 2 | options: 3 | show_root_heading: true 4 | heading_level: 3 5 | show_if_no_docstring: false 6 | docstring_options: 7 | ignore_init_summary: false 8 | trim_doctest_flags: true 9 | 10 | ::: docetl.cli.build 11 | options: 12 | show_root_heading: true 13 | heading_level: 3 14 | show_if_no_docstring: false 15 | docstring_options: 16 | ignore_init_summary: false 17 | trim_doctest_flags: true 18 | 19 | ::: docetl.cli.clear_cache 20 | options: 21 | show_root_heading: true 22 | heading_level: 3 23 | show_if_no_docstring: false 24 | docstring_options: 25 | ignore_init_summary: false 26 | trim_doctest_flags: true -------------------------------------------------------------------------------- /docs/api-reference/docetl.md: -------------------------------------------------------------------------------- 1 | ::: docetl.DSLRunner 2 | options: 3 | show_root_heading: true 4 | heading_level: 3 5 | show_if_no_docstring: false 6 | docstring_options: 7 | ignore_init_summary: false 8 | trim_doctest_flags: true 9 | 10 | ::: docetl.Optimizer 11 | options: 12 | show_root_heading: true 13 | heading_level: 3 14 | show_if_no_docstring: false 15 | docstring_options: 16 | ignore_init_summary: false 17 | trim_doctest_flags: true 18 | -------------------------------------------------------------------------------- /docs/api-reference/operations.md: -------------------------------------------------------------------------------- 1 | # LLM-Powered Operators 2 | 3 | ::: docetl.operations.map.MapOperation 4 | options: 5 | show_root_heading: true 6 | heading_level: 3 7 | show_if_no_docstring: false 8 | docstring_options: 9 | 
ignore_init_summary: false 10 | trim_doctest_flags: true 11 | 12 | ::: docetl.operations.resolve.ResolveOperation 13 | options: 14 | show_root_heading: true 15 | heading_level: 3 16 | show_if_no_docstring: false 17 | docstring_options: 18 | ignore_init_summary: false 19 | trim_doctest_flags: true 20 | 21 | ::: docetl.operations.reduce.ReduceOperation 22 | options: 23 | show_root_heading: true 24 | heading_level: 3 25 | show_if_no_docstring: false 26 | docstring_options: 27 | ignore_init_summary: false 28 | trim_doctest_flags: true 29 | 30 | ::: docetl.operations.map.ParallelMapOperation 31 | options: 32 | show_root_heading: true 33 | heading_level: 3 34 | show_if_no_docstring: false 35 | docstring_options: 36 | ignore_init_summary: false 37 | trim_doctest_flags: true 38 | 39 | ::: docetl.operations.filter.FilterOperation 40 | options: 41 | show_root_heading: true 42 | heading_level: 3 43 | show_if_no_docstring: false 44 | docstring_options: 45 | ignore_init_summary: false 46 | trim_doctest_flags: true 47 | 48 | ::: docetl.operations.equijoin.EquijoinOperation 49 | options: 50 | show_root_heading: true 51 | heading_level: 3 52 | show_if_no_docstring: false 53 | docstring_options: 54 | ignore_init_summary: false 55 | trim_doctest_flags: true 56 | 57 | ::: docetl.operations.cluster.ClusterOperation 58 | options: 59 | show_root_heading: true 60 | heading_level: 3 61 | show_if_no_docstring: false 62 | docstring_options: 63 | ignore_init_summary: false 64 | trim_doctest_flags: true 65 | 66 | # Auxiliary Operators 67 | 68 | ::: docetl.operations.split.SplitOperation 69 | options: 70 | show_root_heading: true 71 | heading_level: 3 72 | show_if_no_docstring: false 73 | docstring_options: 74 | ignore_init_summary: false 75 | trim_doctest_flags: true 76 | 77 | ::: docetl.operations.gather.GatherOperation 78 | options: 79 | show_root_heading: true 80 | heading_level: 3 81 | show_if_no_docstring: false 82 | docstring_options: 83 | ignore_init_summary: false 84 | trim_doctest_flags: true 85 | 86 | ::: docetl.operations.unnest.UnnestOperation 87 | options: 88 | show_root_heading: true 89 | heading_level: 3 90 | show_if_no_docstring: false 91 | docstring_options: 92 | ignore_init_summary: false 93 | trim_doctest_flags: true -------------------------------------------------------------------------------- /docs/api-reference/optimizers.md: -------------------------------------------------------------------------------- 1 | ::: docetl.optimizers.map_optimizer.optimizer.MapOptimizer 2 | options: 3 | show_root_heading: true 4 | heading_level: 3 5 | show_if_no_docstring: false 6 | docstring_options: 7 | ignore_init_summary: false 8 | trim_doctest_flags: true 9 | 10 | ::: docetl.optimizers.reduce_optimizer.ReduceOptimizer 11 | options: 12 | show_root_heading: true 13 | heading_level: 3 14 | show_if_no_docstring: false 15 | docstring_options: 16 | ignore_init_summary: false 17 | trim_doctest_flags: true 18 | 19 | ::: docetl.optimizers.join_optimizer.JoinOptimizer 20 | options: 21 | show_root_heading: true 22 | heading_level: 3 23 | show_if_no_docstring: false -------------------------------------------------------------------------------- /docs/assets/docetl-favicon-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/docetl-favicon-color.png -------------------------------------------------------------------------------- /docs/assets/headerdiagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/headerdiagram.png -------------------------------------------------------------------------------- /docs/assets/readmefig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/readmefig.png -------------------------------------------------------------------------------- /docs/assets/tutorial/add-notes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/add-notes.png -------------------------------------------------------------------------------- /docs/assets/tutorial/dataset-view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/dataset-view.png -------------------------------------------------------------------------------- /docs/assets/tutorial/initial-outputs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/initial-outputs.png -------------------------------------------------------------------------------- /docs/assets/tutorial/one-operation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/one-operation.png -------------------------------------------------------------------------------- /docs/assets/tutorial/operation-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/operation-details.png -------------------------------------------------------------------------------- /docs/assets/tutorial/prompt-improvement.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/prompt-improvement.png -------------------------------------------------------------------------------- /docs/assets/tutorial/prompt-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/docs/assets/tutorial/prompt-v2.png -------------------------------------------------------------------------------- /docs/community/roadmap.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | !!! info "Join Our Working Groups" 4 | 5 | Are you interested in contributing to any of these projects or have ideas for new areas of exploration? Join our [Discord server](https://discord.gg/fHp7B2X3xx) to participate in our working groups and collaborate with the community! 6 | 7 | We're constantly working to improve DocETL and explore new possibilities in document processing. 
Our current ideas span both research and engineering problems, and are organized into the following categories:

```mermaid
mindmap
  root((DocETL Roadmap))
    User Interface and Interaction
    Debugging and Optimization
    Model and Tool Integrations
    Agents and Planning
```

## User Interface and Interaction

- **Natural Language to DocETL Pipeline**: Building tools to generate DocETL pipelines from natural language descriptions.
- **Interactive Pipeline Creation**: Developing intuitive interfaces for creating and optimizing DocETL pipelines interactively.

## Debugging and Optimization

- **DocETL Debugger**: Creating a debugger with provenance tracking, allowing users to visualize all intermediates that contributed to a specific output.
- **Plan Efficiency Optimization**: Implementing strategies (and devising new ones) to reduce latency and cost for the most accurate plans. This includes batching LLM calls, using model cascades, and fusing operators.

## Model and Tool Integrations

- **Model Diversity**: Extending support beyond OpenAI to include a wider range of models, with a focus on local models.
- **OCR and PDF Extraction**: Improving integration with OCR technologies and PDF extraction tools for more robust document processing.
- **Multimodal Data Processing**: Enhancing DocETL to handle multimodal data, including text, images, audio, and video (many LLMs already support multimodal inputs).

## Agents and Planning

- **Smarter Agent and Planning Architectures**: Optimizing plan exploration based on data characteristics. For instance, refining the optimizer to avoid unnecessary exploration of plans with the [gather operator](../operators/gather.md) for tasks that don't require peripheral context when decomposing map operations for large documents.

- **Context-Aware Sampling for Validation**: Creating algorithms that can identify and extract the most representative samples from different parts of a document, including the beginning, middle, and end, to use in validation prompts. This approach will help validation agents verify that all sections of documents are adequately represented in the outputs, avoiding blind spots caused by truncation, since we currently truncate the middle of documents in validation prompts.

- **Benchmarks**: Developing a suite of benchmarks to evaluate the performance of different optimization strategies and agent architectures. These benchmarks will help us understand the trade-offs between accuracy, efficiency, and cost in different scenarios, guiding the development of more effective optimization techniques.
-------------------------------------------------------------------------------- /docs/examples/annotating-legal-documents.md: --------------------------------------------------------------------------------
TODO
-------------------------------------------------------------------------------- /docs/examples/characterizing-troll-behavior.md: --------------------------------------------------------------------------------
TODO
-------------------------------------------------------------------------------- /docs/examples/rate-limiting.md: --------------------------------------------------------------------------------
# Rate Limiting

When using DocETL, you might have rate limits based on your usage tier with various API providers.
To help manage these limits and prevent exceeding them, DocETL allows you to configure rate limits in your YAML configuration file.

## Configuring Rate Limits

You can add rate limits to your YAML config by including a `rate_limits` key with specific configurations for different types of API calls. Here's an example of how to set up rate limits:

```yaml
rate_limits:
  embedding_call:
    - count: 1000
      per: 1
      unit: second
  llm_call:
    - count: 1
      per: 1
      unit: second
    - count: 10
      per: 5
      unit: hour
  llm_tokens:
    - count: 1000000
      per: 1
      unit: minute
```

This example sets limits for embedding calls and language model (LLM) calls, with multiple rules for LLM calls to accommodate different time scales.

You can also use rate limits in the Python API by passing a `rate_limits` dictionary when you initialize the `Pipeline` object.
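For example, here is a minimal sketch of the Python-API equivalent of the YAML above. The pipeline pieces are placeholder examples, and the `rate_limits` dictionary is assumed to mirror the YAML structure exactly:

```python
from docetl.api import Pipeline, Dataset, MapOp, PipelineStep, PipelineOutput

# Mirrors the YAML rate_limits block above: each entry caps how many
# calls (or tokens) are allowed per time window.
rate_limits = {
    "embedding_call": [{"count": 1000, "per": 1, "unit": "second"}],
    "llm_call": [
        {"count": 1, "per": 1, "unit": "second"},
        {"count": 10, "per": 5, "unit": "hour"},
    ],
    "llm_tokens": [{"count": 1000000, "per": 1, "unit": "minute"}],
}

pipeline = Pipeline(
    name="rate_limited_pipeline",
    datasets={"docs": Dataset(type="file", path="input.json")},
    operations=[
        MapOp(
            name="classify",
            type="map",
            prompt="Classify this document: {{ input.text }}",
            output={"schema": {"category": "string"}},
        )
    ],
    steps=[PipelineStep(name="classify_step", input="docs", operations=["classify"])],
    output=PipelineOutput(type="file", path="output.json"),
    default_model="gpt-4o-mini",
    rate_limits=rate_limits,
)
```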
-------------------------------------------------------------------------------- /docs/execution/running-pipelines.md: --------------------------------------------------------------------------------
# Additional Notes

Here are some additional notes to help you get the most out of your pipeline:

- **Sampling Operations**: If you want to run an operation on a random sample of your data, you can set the `sample` parameter for that operation. For example:

```yaml
operations:
  extract_medications:
    sample: 100 # This will run the operation on a random sample of 100 items
    # ... rest of the operation configuration
```

- **Caching**: DocETL caches the results of operations by default. This means that if you run the same operation on the same data multiple times, the results will be retrieved from the cache rather than being recomputed. You can clear the cache by running `docetl clear-cache`.

- **The `run` Function**: The main entry point for running a pipeline is the `run` function in `docetl/cli.py`. Here's a description of its parameters and functionality:

::: docetl.cli.run
    handler: python
    options:
      members:
        - run
      show_root_full_path: true
      show_root_toc_entry: true
      show_root_heading: true
      show_source: false
      show_name: true

- **Intermediate Output**: If you provide an intermediate directory in your configuration, the outputs of each operation will be saved to this directory. This allows you to inspect the results of individual steps in the pipeline and can be useful for debugging or analyzing the pipeline's progress. Set the `intermediate_dir` parameter in your pipeline's output configuration to specify the directory where intermediate results should be saved (a sketch for inspecting these files follows the example); e.g.,

```yaml
pipeline:
  output:
    type: file
    path: ...
    intermediate_dir: intermediate_results
```
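To spot-check those intermediate files programmatically, a minimal sketch along these lines works. It assumes the default layout of one JSON array of records per operation under the intermediate directory; adjust the glob if your layout differs:

```python
import json
from pathlib import Path

# Walk the intermediate directory and summarize each operation's output.
for path in sorted(Path("intermediate_results").rglob("*.json")):
    records = json.loads(path.read_text())
    print(f"{path}: {len(records)} records")
    if records:
        print("  sample keys:", sorted(records[0].keys()))
```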
-------------------------------------------------------------------------------- /docs/index.md: --------------------------------------------------------------------------------
# 📜 DocETL: A System for Complex Document Processing

[GitHub](https://github.com/ucbepic/docetl)
[Website](https://docetl.org)
[Documentation](https://ucbepic.github.io/docetl)
[Discord](https://discord.gg/fHp7B2X3xx)
[Paper](https://arxiv.org/abs/2410.12189)

DocETL is a tool for creating and executing LLM-powered data processing pipelines. It offers a low-code, declarative YAML interface for defining complex data processing operations.

!!! tip "When to Use DocETL"

    DocETL is the ideal choice when you're looking to **maximize correctness and output quality** for complex tasks over a collection of documents or unstructured datasets. You should consider using DocETL if:

    - You have complex tasks that you want to represent via map-reduce (e.g., map over your documents, then group by the result of your map call & reduce)
    - You're unsure how to best write your pipeline or sequence of operations to maximize LLM accuracy
    - You're working with long documents that don't fit into a single prompt or are too lengthy for effective LLM reasoning
    - You have validation criteria and want tasks to automatically retry when the validation fails

## 🚀 Features

- **Rich Suite of Operators**: Tailored for complex data processing, including specialized operators like "resolve" for entity resolution and "gather" for maintaining context when splitting documents.
- **Low-Code Interface**: Define your pipeline and prompts easily using YAML. You have 100% control over the prompts.
- **Flexible Processing**: Handle various document types and processing tasks across domains like law, medicine, and social sciences.
- **Accuracy Optimization**: Our optimizer leverages LLM agents to experiment with different logically-equivalent rewrites of your pipeline and automatically selects the most accurate version. This includes finding the limits of how many documents to process in a single reduce operation before accuracy plateaus.

## ⚡ Getting Started

To get started with DocETL:

1. Install the package (see [installation](installation.md) for detailed instructions)
2. Define your pipeline in a YAML file. Want to use an LLM like ChatGPT or Claude to help you write your pipeline? See [docetl.org/llms.txt](https://docetl.org/llms.txt) for a big prompt you can copy and paste into ChatGPT or Claude before describing your task.
3. Run your pipeline using the DocETL command-line interface

## 🏛️ Project Origin

DocETL was created by members of the EPIC Data Lab and the Data Systems and Foundations group at UC Berkeley. The EPIC (Effective Programming, Interaction, and Computation with Data) Lab focuses on developing low-code and no-code interfaces for data work, powered by next-generation predictive programming techniques. DocETL is one of the projects that emerged from our research efforts to streamline complex document processing tasks.

For more information about the labs and other projects, visit the [EPIC Lab webpage](https://epic.berkeley.edu/) and the [Data Systems and Foundations webpage](https://dsf.berkeley.edu/).
-------------------------------------------------------------------------------- /docs/installation.md: --------------------------------------------------------------------------------
# Installation

DocETL can be installed using pip, Python's package installer, or from source. Follow these steps to get DocETL up and running on your system:

## 🛠️ Prerequisites

Before installing DocETL, ensure you have Python 3.10 or later installed on your system. You can check your Python version by running `python --version`.

## 📦 Installation via pip

1. Install DocETL using pip:

```bash
pip install docetl
```

If you want to use the parsing tools, you need to install the `parsing` extra:

```bash
pip install docetl[parsing]
```

This command installs DocETL along with its dependencies as specified in the `pyproject.toml` file. To verify that DocETL has been installed correctly, run the following command in your terminal:

```bash
docetl version
```

## 🔧 Installation from Source

To install DocETL from source, follow these steps:

1. Clone the repository:

```bash
git clone https://github.com/ucbepic/docetl.git
cd docetl
```

2. Install Poetry (if not already installed):

```bash
pip install poetry
```

3. Install the project dependencies and DocETL:

```bash
poetry install
```

If you want to use the parsing tools, you need to install the `parsing` extra:

```bash
poetry install --extras "parsing"
```

This will create a virtual environment and install all the required dependencies.

4. Set up your OpenAI API key:

Create a `.env` file in the project root and add your OpenAI API key:

```bash
OPENAI_API_KEY=your_api_key_here
```

Alternatively, you can set the `OPENAI_API_KEY` environment variable in your shell.

5. Run the basic test suite to ensure everything is working (this costs less than $0.01 with OpenAI):

```bash
make tests-basic
```

## 🚨 Troubleshooting

If you encounter any issues during installation, please ensure that:

- Your Python version is 3.10 or later
- You have the latest version of pip installed
- Your system meets all the requirements specified in the `pyproject.toml` file

For further assistance, please refer to the project's GitHub repository or reach out on the [Discord server](https://discord.gg/fHp7B2X3xx).
-------------------------------------------------------------------------------- /docs/optimization/configuration.md: --------------------------------------------------------------------------------
# Advanced: Customizing Optimization

You can customize the optimization process for specific operations using the `optimizer_config` key in your pipeline.

## Global Configuration

The following options can be applied globally to all operations in your pipeline during optimization:

- `num_retries`: The number of times to retry optimizing if the LLM agent fails. Default is 1.

- `sample_sizes`: Override the default sample sizes for each operator type. Specify as a dictionary with operator types as keys and integer sample sizes as values.

  Default sample sizes:

```python
SAMPLE_SIZE_MAP = {
    "reduce": 40,
    "map": 5,
    "resolve": 100,
    "equijoin": 100,
    "filter": 5,
}
```

- `judge_agent_model`: Specify the model to use for the judge agent. Default is `gpt-4o-mini`.

- `rewrite_agent_model`: Specify the model to use for the rewrite agent. Default is `gpt-4o`.

- `litellm_kwargs`: Specify the `litellm` kwargs to use for the optimization. Default is `{}`.

## Equijoin Configuration

- `target_recall`: Change the default target recall (default is 0.95).
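To make the recall target concrete, here is an illustrative sketch (not DocETL's internal code) of how a similarity cutoff can be chosen from a labeled sample of candidate pairs so that estimated recall stays at or above the target. Higher cutoffs prune more LLM comparisons but risk missing true matches:

```python
def pick_threshold(scored_pairs, target_recall=0.95):
    """Pick the highest similarity cutoff whose estimated recall on a
    labeled sample stays at or above target_recall.

    scored_pairs: list of (similarity, is_true_match) tuples.
    """
    total_matches = sum(1 for _, is_match in scored_pairs if is_match)
    if total_matches == 0:
        return 0.0  # no known matches to protect; keep every pair
    for cutoff in sorted({score for score, _ in scored_pairs}, reverse=True):
        found = sum(
            1 for score, is_match in scored_pairs if is_match and score >= cutoff
        )
        if found / total_matches >= target_recall:
            return cutoff
    return 0.0


# With these labeled pairs, a 0.95 target forces the cutoff down to 0.6
# so that both true matches survive blocking.
pairs = [(0.9, True), (0.7, False), (0.6, True), (0.3, False)]
print(pick_threshold(pairs))  # 0.6
```

Pushing `target_recall` toward 1.0 drives the chosen cutoff down, keeping more candidate pairs at higher cost.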
## Resolve Configuration

- `target_recall`: Specify the target recall for the resolve operation.

## Reduce Configuration

- `synthesize_resolve`: Set to `False` if you definitely don't want a resolve operation synthesized, or if you want to turn off this rewrite rule.

## Map Configuration

- `force_chunking_plan`: Set to `True` if you want the optimizer to force a plan that breaks up the input documents into chunks.
- `plan_types`: Specify the plan types to consider for the map operation. The available plan types are:
  - `chunk`: Breaks up the input documents into chunks (i.e., data decomposition).
  - `proj_synthesis`: Synthesizes 1+ projections (i.e., task decomposition).
  - `glean`: Synthesizes a glean plan (i.e., uses an LLM as a judge to refine the output; a sketch of this loop follows the example below).

## Example Configuration

Here's an example of how to use the `optimizer_config` in your pipeline:

```yaml
optimizer_config:
  rewrite_agent_model: gpt-4o-mini
  judge_agent_model: gpt-4o-mini
  litellm_kwargs:
    temperature: 0.5
  num_retries: 2
  sample_sizes:
    map: 10
    reduce: 50
  reduce:
    synthesize_resolve: false
  map:
    plan_types: # Considers all these plan types
      - chunk
      - proj_synthesis
      - glean

operations:
  - name: extract_medications
    type: map
    optimize: true
    recursively_optimize: true # Recursively optimize the map operation (i.e., optimize any new operations that are synthesized)
    # ... other configuration ...

  - name: summarize_prescriptions
    type: reduce
    optimize: true
    # ... other configuration ...
# ... rest of the pipeline configuration ...
```

This configuration will:

1. Retry optimization up to 2 times for each operation if the LLM agent fails.
2. Use custom sample sizes for map (10) and reduce (50) operations.
3. Prevent the synthesis of resolve operations for reduce operations.
4. Consider all plan types for map operations.
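For intuition, a glean plan wraps the base map call in a judge-and-refine loop. Here is a minimal illustrative sketch (not DocETL's internal implementation; `call_llm` is a hypothetical helper standing in for any LLM call):

```python
def glean(document, task_prompt, call_llm, max_rounds=2):
    """Draft an answer, ask an LLM judge for feedback, and revise until
    the judge approves or the round budget runs out."""
    answer = call_llm(f"{task_prompt}\n\nDocument:\n{document}")
    for _ in range(max_rounds):
        verdict = call_llm(
            "You are a strict judge. Reply PASS if the answer fully "
            "satisfies the task; otherwise list the problems.\n\n"
            f"Task: {task_prompt}\nAnswer: {answer}"
        )
        if verdict.strip().startswith("PASS"):
            break
        answer = call_llm(
            f"Revise the answer to fix these problems:\n{verdict}\n\n"
            f"Task: {task_prompt}\nDocument:\n{document}\n"
            f"Previous answer: {answer}"
        )
    return answer
```

Each extra round spends additional LLM calls in exchange for output quality, which is why gleaning is one candidate plan for the optimizer to weigh rather than a default.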
-------------------------------------------------------------------------------- /docs/playground/features.md: --------------------------------------------------------------------------------
# Features

The DocETL playground provides an interactive environment for building and testing document processing pipelines. Here are the key features:

## Current Features

### Hybrid Interface
The playground offers a unique hybrid between a notebook and spreadsheet interface, allowing you to:
- Iteratively develop and test pipeline operations
- Inspect operation outputs in a tabular format
- Seamlessly switch between code and data views

### Performance Optimizations
To ensure responsive interaction:
- Smart sampling of large datasets for quick iteration
- Automatic caching of operation results
- Efficient handling of LLM API calls

### Output Management
- Add notes and highlights to important outputs
- Save and organize findings during pipeline development
- Track key insights and results

### Export Capabilities
- Export results from any operation to CSV
- Preserve intermediate results for further analysis
- Share outputs with team members

## Upcoming Features

We're actively working on several exciting ideas:

### Natural Language Pipeline Assistant
- Generate and indirectly modify pipelines using natural language
- Interactive help for pipeline development

### Enhanced Validation UI
- Per-document retry capabilities for failed operations
- UI support for gleaning validation outside of extra kwargs
- Visual feedback for validation results

### Pipeline Optimization Interface
- Interactive tools for optimizing operation performance
- Visual pipeline analysis and bottleneck identification
- Suggestions for pipeline efficiency improvements

!!! tip "Join the Development"

    Interested in these upcoming features? Join our [Discord community](https://discord.gg/fHp7B2X3xx) to provide feedback and help shape their development!
-------------------------------------------------------------------------------- /docs/python/index.md: --------------------------------------------------------------------------------
# Python API

The DocETL Python API provides a programmatic way to define, optimize, and run document processing pipelines. This approach offers an alternative to the YAML configuration method, allowing for more dynamic and flexible pipeline construction.

## Overview

The Python API consists of several classes:

- `Dataset`: Represents a dataset with a type and path.
- Various operation classes (e.g., `MapOp`, `ReduceOp`, `FilterOp`) for different types of data processing steps.
- `PipelineStep`: Represents a step in the pipeline with input and operations.
- `Pipeline`: The main class for defining and running a complete document processing pipeline.
- `PipelineOutput`: Defines the output configuration for the pipeline.

## Example Usage

Here's an example of how to use the Python API to create and run a simple document processing pipeline:

```python
from docetl.api import Pipeline, Dataset, MapOp, ReduceOp, PipelineStep, PipelineOutput

# Define datasets
datasets = {
    "my_dataset": Dataset(type="file", path="input.json", parsing=[{"input_key": "file_path", "function": "txt_to_string", "output_key": "content"}]),
}

# Note that the parsing is applied to the `file_path` key in each item of the dataset,
# and the result is stored in the `content` key.

# Define operations
operations = [
    MapOp(
        name="process",
        type="map",
        prompt="Determine what type of document this is: {{ input.content }}",
        output={"schema": {"document_type": "string"}}
    ),
    ReduceOp(
        name="summarize",
        type="reduce",
        reduce_key="document_type",
        prompt="Summarize the processed contents: {% for item in inputs %}{{ item.content }} {% endfor %}",
        output={"schema": {"summary": "string"}}
    )
]

# Define pipeline steps
steps = [
    PipelineStep(name="process_step", input="my_dataset", operations=["process"]),
    PipelineStep(name="summarize_step", input="process_step", operations=["summarize"])
]

# Define pipeline output
output = PipelineOutput(type="file", path="output.json")

# Create the pipeline
pipeline = Pipeline(
    name="example_pipeline",
    datasets=datasets,
    operations=operations,
    steps=steps,
    output=output,
    default_model="gpt-4o-mini"
)

# Optimize the pipeline
optimized_pipeline = pipeline.optimize()

# Run the optimized pipeline
result = optimized_pipeline.run()  # Saves the result to the output path

print(f"Pipeline execution completed. Total cost: ${result:.2f}")
```

This example demonstrates how to create a simple pipeline that processes input documents and then summarizes the processed content. The pipeline is optimized before execution to improve performance.

## API Reference

For a complete reference of all available classes and their methods, please refer to the [Python API Reference](../api-reference/python.md).

The API Reference provides detailed information about each class, including:

- Available parameters
- Method signatures
- Return types
- Usage examples
-------------------------------------------------------------------------------- /docs/stylesheets/extra.css: --------------------------------------------------------------------------------
/* Base Layer */
:root {
  --md-primary-fg-color: hsl(211, 100%, 50%);
  --md-primary-fg-color--light: hsl(211, 100%, 70%);
  --md-primary-fg-color--dark: hsl(211, 100%, 30%);
  --background: hsl(211, 100%, 95%);
  --foreground: hsl(211, 5%, 0%);
  --card: hsl(211, 50%, 90%);
  --card-foreground: hsl(211, 5%, 10%);
  --popover: hsl(211, 100%, 95%);
  --popover-foreground: hsl(211, 100%, 0%);
  --primary-foreground: hsl(0, 0%, 100%);
  --secondary: hsl(211, 30%, 70%);
  --secondary-foreground: hsl(0, 0%, 0%);
  --muted: hsl(173, 30%, 85%);
  --muted-foreground: hsl(211, 5%, 35%);
  --accent: hsl(173, 30%, 80%);
  --accent-foreground: hsl(211, 5%, 10%);
  --destructive: hsl(0, 100%, 30%);
  --destructive-foreground: hsl(211, 5%, 90%);
  --border: hsl(211, 30%, 50%);
  --input: hsl(211, 30%, 18%);
  --ring: var(--md-primary-fg-color);
  --radius: 0.5rem;

  /* Custom chart colors */
  --chart-1: hsl(12, 76%, 61%);
  --chart-2: hsl(173, 58%, 39%);
  --chart-3: hsl(197, 37%, 24%);
  --chart-4: hsl(43, 74%, 66%);
  --chart-5: hsl(27, 87%, 67%);
}

/* Dark mode */
.dark {
  --md-primary-fg-color: hsl(211, 100%, 50%);
  --md-primary-fg-color--light: hsl(211, 100%, 70%);
  --md-primary-fg-color--dark: hsl(211, 100%, 30%);
  --background: hsl(211, 50%, 5%);
  --foreground: hsl(211, 5%, 90%);
  --card: hsl(211, 50%, 0%);
--card-foreground: hsl(211, 5%, 90%); 43 | --popover: hsl(211, 50%, 5%); 44 | --popover-foreground: hsl(211, 5%, 90%); 45 | /* --primary: var(--md-primary-fg-color--dark); */ 46 | --primary-foreground: hsl(0, 0%, 100%); 47 | --secondary: hsl(211, 30%, 10%); 48 | --secondary-foreground: hsl(0, 0%, 100%); 49 | --muted: hsl(173, 30%, 15%); 50 | --muted-foreground: hsl(211, 5%, 60%); 51 | --accent: hsl(173, 30%, 15%); 52 | --accent-foreground: hsl(211, 5%, 90%); 53 | --destructive: hsl(0, 100%, 30%); 54 | --destructive-foreground: hsl(211, 5%, 90%); 55 | --border: hsl(211, 30%, 18%); 56 | --input: hsl(211, 30%, 18%); 57 | --ring: var(--md-primary-fg-color--dark); 58 | --radius: 0.5rem; 59 | 60 | /* Custom chart colors for dark mode */ 61 | --chart-1: hsl(220, 70%, 50%); 62 | --chart-2: hsl(160, 60%, 45%); 63 | --chart-3: hsl(30, 80%, 55%); 64 | --chart-4: hsl(280, 65%, 60%); 65 | --chart-5: hsl(340, 75%, 55%); 66 | } 67 | 68 | /* Header styling */ 69 | h1, h2, h3, h4, h5, h6 { 70 | color: var(--primary); 71 | } 72 | 73 | .dark h1, .dark h2, .dark h3, .dark h4, .dark h5, .dark h6 { 74 | color: var(--primary); 75 | } 76 | 77 | /* Link styling */ 78 | a { 79 | color: var(--primary); 80 | } 81 | 82 | .dark a { 83 | color: var(--primary); 84 | } 85 | 86 | /* Card styling */ 87 | .card { 88 | background-color: var(--card); 89 | color: var(--card-foreground); 90 | border-radius: var(--radius); 91 | } 92 | 93 | .dark .card { 94 | background-color: var(--card); 95 | color: var(--card-foreground); 96 | } 97 | -------------------------------------------------------------------------------- /experiments/extraction_outputs.txt: -------------------------------------------------------------------------------- 1 | Results Table: 2 | Logical Fallacy Extraction Experiment Results 3 | ╭──────────────┬─────────────┬─────────────────┬─────────────┬────────────┬────────────┬─────────────┬────────────────╮ 4 | │ Model │ Method │ Total Fallacies │ Avg per Doc │ Max in Doc │ Avg Length │ Runtime (s) │ Total Cost ($) │ 5 | ├──────────────┼─────────────┼─────────────────┼─────────────┼────────────┼────────────┼─────────────┼────────────────┤ 6 | │ gpt-4.1-mini │ line_number │ 192 │ 3.92 │ 11 │ 895.6 │ 10.39 │ $0.468208 │ 7 | │ │ regex │ 125 │ 2.55 │ 43 │ 3759.6 │ 30.77 │ $0.413990 │ 8 | ├──────────────┼─────────────┼─────────────────┼─────────────┼────────────┼────────────┼─────────────┼────────────────┤ 9 | │ gpt-4.1-nano │ line_number │ 279 │ 5.69 │ 19 │ 719.3 │ 3.51 │ $0.117790 │ 10 | │ │ regex │ 8 │ 0.16 │ 4 │ 1.5 │ 9.95 │ $0.102572 │ 11 | ├──────────────┼─────────────┼─────────────────┼─────────────┼────────────┼────────────┼─────────────┼────────────────┤ 12 | │ gpt-4o-mini │ line_number │ 570 │ 11.63 │ 77 │ 529.6 │ 101.38 │ $0.177959 │ 13 | │ │ regex │ 415 │ 8.47 │ 167 │ 226.3 │ 192.85 │ $0.120100 │ 14 | ╰──────────────┴─────────────┴─────────────────┴─────────────┴────────────┴────────────┴─────────────┴────────────────╯ -------------------------------------------------------------------------------- /experiments/structured_outputs.txt: -------------------------------------------------------------------------------- 1 | Results Table: 2 | Experiment Results 3 | ╭────────────────────────────────────────────────┬───────┬────────────┬───────────┬────────┬───────┬─────────────┬──────────────╮ 4 | │ Model │ Doc % │ Approach │ Precision │ Recall │ F1 │ Avg Runtime │ Avg Cost ($) │ 5 | ├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤ 6 | │ 
azure/gpt-4o-mini │ 10% │ structured │ 0.869 │ 0.872 │ 0.853 │ 1.100s │ $0.0004 │ 7 | │ azure/gpt-4o-mini │ 10% │ tool │ 0.914 │ 0.906 │ 0.891 │ 0.722s │ $0.0004 │ 8 | ├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤ 9 | │ deepseek/deepseek-chat │ 10% │ structured │ 0.878 │ 0.889 │ 0.877 │ 2.094s │ $0.0003 │ 10 | │ deepseek/deepseek-chat │ 10% │ tool │ 0.867 │ 0.856 │ 0.860 │ 2.212s │ $0.0003 │ 11 | ├────────────────────────────────────────────────┼───────┼────────────┼───────────┼────────┼───────┼─────────────┼──────────────┤ 12 | │ lm_studio/hugging-quants/llama-3.2-3b-instruct │ 10% │ structured │ 0.033 │ 0.022 │ 0.027 │ 33.635s │ $0.0000 │ 13 | │ lm_studio/hugging-quants/llama-3.2-3b-instruct │ 10% │ tool │ 0.000 │ 0.000 │ 0.000 │ 70.858s │ $0.0000 │ 14 | ╰────────────────────────────────────────────────┴───────┴────────────┴───────────┴────────┴───────┴─────────────┴──────────────╯ -------------------------------------------------------------------------------- /server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/server/__init__.py -------------------------------------------------------------------------------- /server/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/server/app/__init__.py -------------------------------------------------------------------------------- /server/app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from fastapi import FastAPI 3 | from fastapi.middleware.cors import CORSMiddleware 4 | from server.app.routes import pipeline, convert, filesystem 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | # Read backend configuration from .env 10 | host = os.getenv("BACKEND_HOST", "127.0.0.1") 11 | port = int(os.getenv("BACKEND_PORT", 8000)) 12 | reload = os.getenv("BACKEND_RELOAD", "False").lower() == "true" 13 | 14 | # Set default allow_origins if BACKEND_ALLOW_ORIGINS is not provided 15 | allow_origins = os.getenv("BACKEND_ALLOW_ORIGINS", "http://localhost:3000").split(",") 16 | 17 | app = FastAPI() 18 | os.environ["USE_FRONTEND"] = "true" 19 | 20 | # Add CORS middleware 21 | app.add_middleware( 22 | CORSMiddleware, 23 | allow_origins=allow_origins, 24 | allow_credentials=True, 25 | allow_methods=["*"], 26 | allow_headers=["*"], 27 | ) 28 | 29 | # Include all routers, 30 | app.include_router(pipeline.router) 31 | app.include_router(convert.router) 32 | app.include_router(filesystem.router, prefix="/fs") 33 | 34 | @app.get("/") 35 | async def root(): 36 | return {"message": "DocETL API is running"} 37 | 38 | @app.get("/health") 39 | async def health_check(): 40 | return {"status": "healthy"} 41 | 42 | if __name__ == "__main__": 43 | import uvicorn 44 | uvicorn.run("server.app.main:app", host=host, port=port, reload=reload) 45 | -------------------------------------------------------------------------------- /server/app/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Dict, Any, Optional 3 | from datetime import datetime 4 | from enum import Enum 5 | 6 | 7 | class PipelineRequest(BaseModel): 8 | yaml_config: str 9 | 10 | class 
PipelineConfigRequest(BaseModel): 11 | namespace: str 12 | name: str 13 | config: str 14 | input_path: str 15 | output_path: str 16 | 17 | class TaskStatus(str, Enum): 18 | PENDING = "pending" 19 | PROCESSING = "processing" 20 | COMPLETED = "completed" 21 | FAILED = "failed" 22 | CANCELLED = "cancelled" 23 | 24 | class OptimizeResult(BaseModel): 25 | task_id: str 26 | status: TaskStatus 27 | should_optimize: Optional[str] = None 28 | input_data: Optional[List[Dict[str, Any]]] = None 29 | output_data: Optional[List[Dict[str, Any]]] = None 30 | cost: Optional[float] = None 31 | error: Optional[str] = None 32 | created_at: datetime 33 | completed_at: Optional[datetime] = None 34 | 35 | class OptimizeRequest(BaseModel): 36 | yaml_config: str 37 | step_name: str 38 | op_name: str -------------------------------------------------------------------------------- /server/app/routes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/server/app/routes/__init__.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/__init__.py -------------------------------------------------------------------------------- /tests/basic/sample_texts/one.txt: -------------------------------------------------------------------------------- 1 | Once upon a time, in a quaint village nestled among rolling hills, there lived a curious young girl named Lily. She had always dreamed of adventure beyond the confines of her small town. One day, while exploring the attic of her grandmother's old house, Lily discovered a dusty, leather-bound book with strange symbols etched on its cover. 2 | 3 | As she opened the book, a swirl of glittering mist escaped from its pages, enveloping her in a magical aura. Suddenly, Lily found herself transported to a fantastical world filled with talking animals, floating islands, and shimmering forests. 4 | 5 | Guided by a wise old owl named Hoot, Lily embarked on a quest to find the lost key of harmony, which would restore balance to this enchanted realm. Along her journey, she befriended a mischievous fox, outsmarted a grumpy troll, and solved riddles posed by ancient tree spirits. 6 | 7 | With each challenge she overcame, Lily grew braver and more confident. She learned that true magic lies not in spells or potions, but in the power of kindness, perseverance, and friendship. 8 | 9 | As Lily finally reached the crystal cave where the key of harmony was hidden, she realized that the real treasure was the incredible adventure she had experienced and the lifelong friends she had made along the way. 10 | 11 | With a bittersweet heart, Lily used the key to return home, knowing that her ordinary life would never be the same again. From that day forward, she approached each day with the wonder and courage of a true adventurer, always ready for the next exciting chapter in her story. 12 | -------------------------------------------------------------------------------- /tests/basic/sample_texts/two.md: -------------------------------------------------------------------------------- 1 | # The Enchanted Forest 2 | 3 | Once upon a time, in a land far beyond the reaches of our modern world, there lay a mysterious and enchanted forest. 
This forest, known as the Whispering Woods, was said to be alive with magic and wonder. 4 | 5 | ## The Guardian of the Woods 6 | 7 | At the heart of the Whispering Woods lived an ancient tree spirit named Eldora. With bark as silver as moonlight and leaves that shimmered like emeralds, Eldora had watched over the forest for countless centuries. 8 | 9 | ## The Lost Traveler 10 | 11 | One misty morning, a young traveler named Finn stumbled into the Whispering Woods. Lost and weary, he marveled at the ethereal beauty of the forest. 12 | 13 | ### A Magical Encounter 14 | 15 | As Finn wandered deeper into the woods, he heard a soft, melodious voice carried on the breeze. It was Eldora, calling out to him: 16 | 17 | > "Welcome, young one. What brings you to our magical realm?" 18 | 19 | Finn, awestruck, replied, "I've lost my way, kind spirit. Can you help me find my path?" 20 | 21 | ### The Quest Begins 22 | 23 | Eldora smiled, her leaves rustling gently. "To find your true path, you must first complete three tasks: 24 | 25 | 1. Befriend the Moonlight Rabbits 26 | 2. Solve the Riddle of the Babbling Brook 27 | 3. Plant a seed of hope in the Glade of Dreams" 28 | 29 | And so, Finn's adventure in the Whispering Woods began, filled with magical creatures, enigmatic puzzles, and the promise of self-discovery. 30 | 31 | --- 32 | 33 | _To be continued..._ 34 | -------------------------------------------------------------------------------- /tests/basic/test_basic_parallel_map.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F811 2 | 3 | import pytest 4 | from docetl.operations.map import ParallelMapOperation 5 | from dotenv import load_dotenv 6 | from typing import Dict, Any, List, Tuple 7 | from tests.conftest import ( 8 | parallel_map_config as parallel_map_config, 9 | parallel_map_sample_data as parallel_map_sample_data, 10 | default_model as default_model, 11 | max_threads as max_threads, 12 | api_wrapper as api_wrapper, 13 | ) 14 | 15 | load_dotenv() 16 | 17 | 18 | def test_parallel_map_operation( 19 | parallel_map_config, 20 | default_model, 21 | max_threads, 22 | parallel_map_sample_data, 23 | api_wrapper, 24 | ): 25 | parallel_map_config["bypass_cache"] = True 26 | operation = ParallelMapOperation( 27 | api_wrapper, parallel_map_config, default_model, max_threads 28 | ) 29 | results, cost = operation.execute(parallel_map_sample_data) 30 | 31 | assert len(results) == len(parallel_map_sample_data) 32 | assert all("sentiment" in result for result in results) 33 | assert all("word_count" in result for result in results) 34 | assert all( 35 | result["sentiment"] in ["positive", "negative", "neutral"] for result in results 36 | ) 37 | assert all(isinstance(result["word_count"], int) for result in results) 38 | assert cost > 0 39 | 40 | 41 | def test_parallel_map_operation_empty_input( 42 | parallel_map_config, default_model, max_threads, api_wrapper 43 | ): 44 | operation = ParallelMapOperation( 45 | api_wrapper, parallel_map_config, default_model, max_threads 46 | ) 47 | results, cost = operation.execute([]) 48 | 49 | assert len(results) == 0 50 | assert cost == 0 51 | 52 | 53 | def test_parallel_map_operation_with_empty_input( 54 | parallel_map_config, default_model, max_threads, api_wrapper 55 | ): 56 | operation = ParallelMapOperation( 57 | api_wrapper, parallel_map_config, default_model, max_threads 58 | ) 59 | results, cost = operation.execute([]) 60 | 61 | assert len(results) == 0 62 | assert cost == 0 63 | 
-------------------------------------------------------------------------------- /tests/basic/test_optimizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import json 4 | import shutil 5 | from docetl.runner import DSLRunner 6 | 7 | @pytest.fixture 8 | def test_dir(tmp_path): 9 | # Create test directories 10 | data_dir = tmp_path / "tests" / "data" 11 | data_dir.mkdir(parents=True) 12 | 13 | # Create test data file 14 | data_file = data_dir / "test_data.json" 15 | test_data = [ 16 | {"text": "My name is John Smith"}, 17 | {"text": "Hello, I'm Alice Johnson"}, 18 | {"text": "Bob Wilson here"} 19 | ] 20 | 21 | with open(data_file, "w") as f: 22 | json.dump(test_data, f) 23 | 24 | yield tmp_path 25 | 26 | # Cleanup 27 | if tmp_path.exists(): 28 | shutil.rmtree(tmp_path) 29 | 30 | @pytest.fixture 31 | def test_config(test_dir): 32 | return { 33 | "default_model": "gpt-4o-mini", 34 | "datasets": { 35 | "test_data": { 36 | "type": "file", 37 | "path": str(test_dir / "tests" / "data" / "test_data.json"), 38 | } 39 | }, 40 | "operations": [ 41 | { 42 | "name": "extract_name", 43 | "type": "map", 44 | "prompt": "Extract the person's name from the text.", 45 | "output": { 46 | "schema": { 47 | "name": "string" 48 | } 49 | }, 50 | "optimize": True 51 | } 52 | ], 53 | "pipeline": { 54 | "steps": [ 55 | { 56 | "name": "name_extraction", 57 | "input": "test_data", 58 | "operations": ["extract_name"] 59 | } 60 | ] 61 | } 62 | } 63 | 64 | @pytest.fixture 65 | def runner(test_config): 66 | return DSLRunner( 67 | config=test_config 68 | ) 69 | 70 | def test_optimize_map_operation(runner, test_dir): 71 | """Test that the optimizer can optimize a simple map operation""" 72 | 73 | 74 | # Run optimization 75 | optimized_config, total_cost = runner.optimize(return_pipeline=False) 76 | 77 | # Check that optimization completed successfully 78 | assert total_cost >= 0 # Cost should be non-negative 79 | 80 | # Check that the optimized config contains operations 81 | assert "operations" in optimized_config 82 | assert len(optimized_config["operations"]) > 0 83 | 84 | # Check that the pipeline steps are preserved 85 | assert "pipeline" in optimized_config 86 | assert "steps" in optimized_config["pipeline"] 87 | assert len(optimized_config["pipeline"]["steps"]) > 0 88 | 89 | # Check that the first step is preserved 90 | first_step = optimized_config["pipeline"]["steps"][0] 91 | assert first_step["name"] == "name_extraction" 92 | assert "operations" in first_step 93 | assert len(first_step["operations"]) > 0 94 | 95 | -------------------------------------------------------------------------------- /tests/data/PublicWaterMassMailing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/data/PublicWaterMassMailing.pdf -------------------------------------------------------------------------------- /tests/ranking/plots/harmfulness_budget_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/ranking/plots/harmfulness_budget_performance.png -------------------------------------------------------------------------------- /tests/ranking/plots/medical_pain_budget_performance.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/ranking/plots/medical_pain_budget_performance.png -------------------------------------------------------------------------------- /tests/ranking/plots/synthetic_abstracts_budget_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/tests/ranking/plots/synthetic_abstracts_budget_performance.png -------------------------------------------------------------------------------- /tests/test_azure_rl.py: -------------------------------------------------------------------------------- 1 | from docetl.runner import DSLRunner 2 | import pytest 3 | from docetl.operations.map import MapOperation 4 | import random 5 | import os 6 | from dotenv import load_dotenv 7 | from tests.conftest import api_wrapper 8 | 9 | load_dotenv() 10 | 11 | 12 | @pytest.fixture 13 | def simple_map_config(): 14 | return { 15 | "name": "simple_sentiment_analysis", 16 | "type": "map", 17 | "prompt": "Analyze the sentiment of the following text: '{{ input.text }}'. Classify it as either positive, negative, or neutral.", 18 | "output": {"schema": {"sentiment": "string"}}, 19 | "model": "azure/gpt-4o", 20 | } 21 | 22 | 23 | @pytest.fixture 24 | def sample_documents(): 25 | sentiments = ["positive", "negative", "neutral"] 26 | documents = [] 27 | for _ in range(8): 28 | sentiment = random.choice(sentiments) 29 | if sentiment == "positive": 30 | text = f"I absolutely love this product! It's amazing and works perfectly." 31 | elif sentiment == "negative": 32 | text = f"This is the worst experience I've ever had. Terrible service." 33 | else: 34 | text = f"The product works as expected. Nothing special to report." 35 | documents.append({"text": text}) 36 | return documents 37 | 38 | 39 | def test_map_operation_over_15_documents(simple_map_config, sample_documents): 40 | # Set environment variables specific to this test 41 | os.environ["AZURE_API_BASE"] = os.getenv("LOW_RES_AZURE_API_BASE") 42 | os.environ["AZURE_API_VERSION"] = os.getenv("LOW_RES_AZURE_API_VERSION") 43 | os.environ["AZURE_API_KEY"] = os.getenv("LOW_RES_AZURE_API_KEY") 44 | 45 | runner = DSLRunner( 46 | { 47 | "default_model": "gpt-4o-mini", 48 | "operations": [], 49 | "pipeline": {"steps": [], "output": {"path": "/tmp/testingdocetl.json"}}, 50 | }, 51 | max_threads=64, 52 | ) 53 | 54 | operation = MapOperation(runner, simple_map_config, "azure/gpt-4o", 4) 55 | results, cost = operation.execute(sample_documents + sample_documents) 56 | 57 | assert len(results) == 16 58 | assert all("sentiment" in result for result in results) 59 | assert all( 60 | result["sentiment"] in ["positive", "negative", "neutral"] for result in results 61 | ) 62 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | import os 4 | from docetl.runner import DSLRunner 5 | from docetl.utils import load_config 6 | import yaml 7 | 8 | # Sample configuration for the test 9 | SAMPLE_CONFIG = """ 10 | default_model: "gpt-4o-mini" 11 | 12 | operations: 13 | - name: map_operation 14 | type: map 15 | prompt: | 16 | Analyze the sentiment of the following text: "{{ input.text }}" 17 | Classify it as either positive, negative, or neutral. 
    output:
      schema:
        sentiment: string
    model: "gpt-4o-mini"

  - name: filter_operation
    type: filter
    prompt: |
      Determine if the following text is longer than 5 words:
      "{{ input.text }}"
    output:
      schema:
        keep: boolean
    model: "gpt-4o-mini"

datasets:
  sample_dataset:
    type: file
    path: "tests/sample_data.json"

pipeline:
  steps:
    - name: sentiment_analysis
      input: sample_dataset
      operations:
        - map_operation
    - name: filter_long_texts
      input: sentiment_analysis
      operations:
        - filter_operation

  output:
    type: file
    path: "tests/output.json"
"""

SAMPLE_DATA = [
    {"text": "This is a very positive sentence.", "id": 1},
    {"text": "A short negative phrase.", "id": 2},
    {"text": "Neutral statement without much emotion.", "id": 3},
    {"text": "Brief.", "id": 4},
]


@pytest.fixture
def config_file(tmp_path):
    config_path = tmp_path / "test_config.yaml"
    with open(config_path, "w") as f:
        f.write(SAMPLE_CONFIG)
    return config_path


@pytest.fixture
def sample_data_file(tmp_path):
    data_path = tmp_path / "sample_data.json"
    with open(data_path, "w") as f:
        json.dump(SAMPLE_DATA, f)
    return data_path


def test_end_to_end_pipeline(config_file, sample_data_file, tmp_path):
    # Update the config with the correct sample data path
    config = load_config(config_file)
    config["datasets"]["sample_dataset"]["path"] = str(sample_data_file)
    config["pipeline"]["output"]["path"] = str(tmp_path / "output.json")

    # Write the updated config back to the file
    with open(config_file, "w") as f:
        yaml.dump(config, f)

    # Create and run the DSLRunner
    runner = DSLRunner.from_yaml(str(config_file))
    total_cost = runner.load_run_save()

    # Check if the output file was created
    output_path = tmp_path / "output.json"
    assert output_path.exists(), "Output file was not created"

    # Load and check the output
    with open(output_path, "r") as f:
        output_data = json.load(f)

    # Verify the output
    assert len(output_data) > 0, "Output data is empty"
    assert all(
        "sentiment" in item for item in output_data
    ), "Sentiment analysis was not applied to all items"
    assert all(
        len(item["text"].split()) >= 5 for item in output_data
    ), "Filter operation did not remove short texts"

    # Check if the cost was calculated and is greater than 0
    assert total_cost > 0, "Total cost was not calculated or is 0"

    print(f"Pipeline executed successfully.
Total cost: ${total_cost:.2f}") 113 | print(f"Output: {output_data}") 114 | -------------------------------------------------------------------------------- /tests/test_ollama.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import pytest 3 | import json 4 | import tempfile 5 | import os 6 | from docetl.api import ( 7 | Pipeline, 8 | Dataset, 9 | MapOp, 10 | ReduceOp, 11 | PipelineStep, 12 | PipelineOutput, 13 | ) 14 | from dotenv import load_dotenv 15 | 16 | load_dotenv() 17 | 18 | # Set the OLLAMA_API_BASE environment variable 19 | os.environ["OLLAMA_API_BASE"] = "http://localhost:11434/" 20 | 21 | 22 | @pytest.fixture 23 | def temp_input_file(): 24 | with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as tmp: 25 | json.dump( 26 | [ 27 | {"text": "This is a test", "group": "A"}, 28 | {"text": "Another test", "group": "B"}, 29 | ], 30 | tmp, 31 | ) 32 | yield tmp.name 33 | os.unlink(tmp.name) 34 | 35 | 36 | @pytest.fixture 37 | def temp_output_file(): 38 | with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as tmp: 39 | pass 40 | yield tmp.name 41 | os.unlink(tmp.name) 42 | 43 | 44 | @pytest.fixture 45 | def temp_intermediate_dir(): 46 | with tempfile.TemporaryDirectory() as tmpdirname: 47 | yield tmpdirname 48 | 49 | 50 | @pytest.fixture 51 | def map_config(): 52 | return MapOp( 53 | name="sentiment_analysis", 54 | type="map", 55 | prompt="Analyze the sentiment of the following text: '{{ input.text }}'. Classify it as either positive, negative, or neutral.", 56 | output={"schema": {"sentiment": "string"}}, 57 | model="ollama/llama3.1", 58 | ) 59 | 60 | 61 | @pytest.fixture 62 | def reduce_config(): 63 | return ReduceOp( 64 | name="group_summary", 65 | type="reduce", 66 | reduce_key="group", 67 | prompt="Summarize the following group of values: {{ inputs }} Provide a total and any other relevant statistics.", 68 | output={"schema": {"total": "number", "avg": "number"}}, 69 | model="ollama/llama3.1", 70 | ) 71 | 72 | 73 | @pytest.fixture(autouse=True) 74 | def remove_openai_api_key(): 75 | openai_api_key = os.environ.pop("OPENAI_API_KEY", None) 76 | yield 77 | if openai_api_key: 78 | os.environ["OPENAI_API_KEY"] = openai_api_key 79 | 80 | 81 | def test_ollama_map_reduce_pipeline( 82 | map_config, reduce_config, temp_input_file, temp_output_file, temp_intermediate_dir 83 | ): 84 | pipeline = Pipeline( 85 | name="test_ollama_pipeline", 86 | datasets={"test_input": Dataset(type="file", path=temp_input_file)}, 87 | operations=[map_config, reduce_config], 88 | steps=[ 89 | PipelineStep( 90 | name="pipeline", 91 | input="test_input", 92 | operations=["sentiment_analysis", "group_summary"], 93 | ), 94 | ], 95 | output=PipelineOutput( 96 | type="file", path=temp_output_file, intermediate_dir=temp_intermediate_dir 97 | ), 98 | default_model="ollama/llama3.1", 99 | ) 100 | 101 | cost = pipeline.run() 102 | 103 | assert isinstance(cost, float) 104 | assert cost == 0 105 | 106 | # Verify output file exists and contains data 107 | assert os.path.exists(temp_output_file) 108 | with open(temp_output_file, "r") as f: 109 | output_data = json.load(f) 110 | assert len(output_data) > 0 111 | 112 | # Clean up 113 | shutil.rmtree(temp_intermediate_dir) 114 | -------------------------------------------------------------------------------- /tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from docetl.operations.map import MapOperation 3 | from 
tests.conftest import api_wrapper, default_model, max_threads 4 | 5 | 6 | @pytest.fixture 7 | def map_config_with_validation(): 8 | return { 9 | "name": "sentiment_analysis_with_validation", 10 | "type": "map", 11 | "prompt": "Analyze the sentiment of the following text: '{{ input.text }}'. Classify it as either positive, negative, or neutral.", 12 | "output": {"schema": {"sentiment": "string", "confidence": "float"}}, 13 | "model": "gpt-4o-mini", 14 | "validate": [ 15 | "output['sentiment'] in ['positive', 'negative', 'neutral']", 16 | "0 <= output['confidence'] <= 1", 17 | ], 18 | "num_retries_on_validate_failure": 2, 19 | } 20 | 21 | 22 | @pytest.fixture 23 | def sample_data(): 24 | return [ 25 | {"text": "I love this product! It's amazing."}, 26 | {"text": "This is the worst experience ever."}, 27 | {"text": "The weather is okay today."}, 28 | ] 29 | 30 | 31 | def test_map_operation_with_validation( 32 | map_config_with_validation, sample_data, api_wrapper, default_model, max_threads 33 | ): 34 | map_config_with_validation["bypass_cache"] = True 35 | operation = MapOperation( 36 | api_wrapper, map_config_with_validation, default_model, max_threads 37 | ) 38 | results, cost = operation.execute(sample_data) 39 | 40 | assert len(results) == len(sample_data) 41 | assert cost > 0 42 | 43 | for result in results: 44 | assert "sentiment" in result 45 | assert "confidence" in result 46 | assert result["sentiment"] in ["positive", "negative", "neutral"] 47 | assert 0 <= result["confidence"] <= 1 48 | -------------------------------------------------------------------------------- /website/.env.local.sample: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=sk-xxx 2 | OPENAI_API_BASE=https://api.openai.com/v1 3 | MODEL_NAME=gpt-4o-mini 4 | 5 | NEXT_PUBLIC_BACKEND_HOST=localhost 6 | NEXT_PUBLIC_BACKEND_PORT=8000 7 | NEXT_PUBLIC_HOSTED_DOCWRANGLER=false 8 | -------------------------------------------------------------------------------- /website/README.md: -------------------------------------------------------------------------------- 1 | This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app). This is DocWrangler, the frontend for DocETL. 2 | 3 | ## Getting Started 4 | 5 | ### Setting up environment variables 6 | 7 | Copy the .env.sample file from the root directory to .env.local and modify the environment variables inside: 8 | 9 | ```bash 10 | OPENAI_API_KEY=sk-xxx 11 | OPENAI_API_BASE=https://api.openai.com/v1 12 | MODEL_NAME=gpt-4o-mini 13 | 14 | NEXT_PUBLIC_BACKEND_HOST=localhost 15 | NEXT_PUBLIC_BACKEND_PORT=8008 16 | 17 | ``` 18 | 19 | First, run the development server: 20 | 21 | ```bash 22 | npm run dev 23 | # or 24 | yarn dev 25 | # or 26 | pnpm dev 27 | # or 28 | bun dev 29 | ``` 30 | 31 | Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. 32 | 33 | You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file. 34 | 35 | This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel. 36 | 37 | ## Learn More 38 | 39 | To learn more about Next.js, take a look at the following resources: 40 | 41 | - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. 
42 | - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. 43 | 44 | You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome! 45 | 46 | ## Deploy on Vercel 47 | 48 | The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js. 49 | 50 | Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details. 51 | -------------------------------------------------------------------------------- /website/components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "new-york", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.ts", 8 | "css": "src/app/globals.css", 9 | "baseColor": "zinc", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/lib/utils", 16 | "ui": "@/components/ui", 17 | "lib": "@/lib", 18 | "hooks": "@/hooks" 19 | } 20 | } -------------------------------------------------------------------------------- /website/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import globals from "globals"; 2 | import pluginJs from "@eslint/js"; 3 | import tseslint from "typescript-eslint"; 4 | import pluginReact from "eslint-plugin-react"; 5 | import pluginUnusedImports from "eslint-plugin-unused-imports"; 6 | 7 | export default [ 8 | {files: ["**/*.{js,mjs,cjs,ts,jsx,tsx}"]}, 9 | {languageOptions: { globals: globals.browser }}, 10 | { 11 | plugins: { 12 | 'unused-imports': pluginUnusedImports 13 | }, 14 | rules: { 15 | "no-unused-vars": "off", 16 | "unused-imports/no-unused-imports": "error", 17 | "unused-imports/no-unused-vars": [ 18 | "warn", 19 | { 20 | "vars": "all", 21 | "varsIgnorePattern": "^_", 22 | "args": "after-used", 23 | "argsIgnorePattern": "^_" 24 | } 25 | ] 26 | } 27 | }, 28 | pluginJs.configs.recommended, 29 | ...tseslint.configs.recommended, 30 | pluginReact.configs.flat.recommended, 31 | ]; -------------------------------------------------------------------------------- /website/next.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = {}; 3 | 4 | export default nextConfig; 5 | -------------------------------------------------------------------------------- /website/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | /** @type {import('postcss-load-config').Config} */ 2 | const config = { 3 | plugins: { 4 | tailwindcss: {}, 5 | }, 6 | }; 7 | 8 | export default config; 9 | -------------------------------------------------------------------------------- /website/posts/hello-world.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hello, World!" 3 | date: "2024-09-13" 4 | --- 5 | 6 | This is just a test post. 
7 | -------------------------------------------------------------------------------- /website/public/berkeley.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/berkeley.png -------------------------------------------------------------------------------- /website/public/docetl-50m-fall-2024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/docetl-50m-fall-2024.pdf -------------------------------------------------------------------------------- /website/public/docetl-favicon-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/docetl-favicon-color.png -------------------------------------------------------------------------------- /website/public/epiclogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbepic/docetl/ea82a0124d0994c738a8139d2b2ae38226973562/website/public/epiclogo.png -------------------------------------------------------------------------------- /website/src/app/MarkdownRenderer.tsx: -------------------------------------------------------------------------------- 1 | import ReactMarkdown from "react-markdown"; 2 | 3 | const MarkdownRenderer = ({ content }: { content: any }) => { 4 | return ( 5 |
<ReactMarkdown
      components={{
        // Custom code renderer: inline code vs. fenced blocks. The element
        // structure here is reconstructed from surviving fragments; the
        // original markup and class names were lost when this dump was
        // generated.
        code({ inline, className, children, ...props }) {
          return inline ? (
            <code className={className} {...props}>
              {children}
            </code>
          ) : (
            <pre>
              <code className={className} {...props}>
                {children}
              </code>
            </pre>
          );
        },
      }}
    >
      {content}
    </ReactMarkdown>
  );
};

export default MarkdownRenderer;
-------------------------------------------------------------------------------- /website/src/components/ (a pipeline-YAML rendering component; its file path and JSX markup were lost when this dump was generated, and only the splitting logic below survives): --------------------------------------------------------------------------------
const [preOperations, rest] = code.split(/(?=^operations:)/m);
const [operations, postOperations] = rest.split(/(?=pipeline:)/);
const operationsList = operations.split(/(?= {2}- name:)/).slice(1); // Remove the "operations:" line