├── .flake8
├── .github
    ├── renovate.json
    └── workflows
    │   ├── beta-cicd.yml
    │   ├── dev-cicd.yml
    │   ├── feature-cicd.yml
    │   ├── main-cicd.yml
    │   └── publish-pypi.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .readthedocs.yaml
├── LICENSE
├── README.md
├── app
    ├── __init__.py
    └── main.py
├── codecov.yml
├── docs
    └── conf.py
├── env.example
├── poetry.lock
├── public
    ├── common-commands.mov
    └── help-menu.png
├── pyproject.toml
├── tests
    ├── __init__.py
    └── test_main.py
└── tox.ini


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203, E266, E501, W503, B006, B007, B008, F401, C416, B950, B904
3 | max-line-length = 88
4 | max-complexity = 18
5 | select = B,C,E,F,W,T4,B9
6 | exclude = venv, .venv, tests/.datafog_env, examples/venv, .gitignore, .DS_Store, .git, .github, .tox, .nox, .coverage, .coverage.*


--------------------------------------------------------------------------------
/.github/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": ["config:base"]
3 | }
4 | 


--------------------------------------------------------------------------------
/.github/workflows/beta-cicd.yml:
--------------------------------------------------------------------------------
 1 | name: beta-cicd-setup-and-test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - "v*.0.0-beta.*"
 7 |   pull_request:
 8 |     branches:
 9 |       - "v*.0.0-beta.*"
10 | 
11 | jobs:
12 |   lint:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Check out repo
16 |         uses: actions/checkout@v4
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: "3.10"
21 |       - name: Install pre-commit
22 |         run: pip install pre-commit
23 |       - name: Run pre-commit
24 |         run: pre-commit run --all-files
25 | 
26 |   build:
27 |     runs-on: ubuntu-latest
28 |     strategy:
29 |       matrix:
30 |         python-version: ["3.10", "3.11", "3.12"]
31 |     steps:
32 |       - name: Check out repo
33 |         uses: actions/checkout@v4
34 |       - name: Free Disk Space (Ubuntu)
35 |         uses: jlumbroso/free-disk-space@main
36 |         with:
37 |           # this might remove tools that are actually needed,
38 |           # if set to "true" but frees about 6 GB
39 |           tool-cache: false
40 |           # all of these default to true, but feel free to set to
41 |           # "false" if necessary for your workflow
42 |           android: true
43 |           dotnet: true
44 |           haskell: true
45 |           large-packages: true
46 |           docker-images: true
47 |           swap-storage: true
48 |       - name: Set up Python
49 |         uses: actions/setup-python@v4
50 |         with:
51 |           python-version: ${{ matrix.python-version }}
52 |       - name: Install Dependencies
53 |         run: |
54 |           pip install -U pip
55 |           pip install --no-cache-dir -e .
56 |           pip install --no-cache-dir tox just pre-commit
57 |       - name: Free up disk space
58 |         run: |
59 |           sudo apt-get clean
60 |       - name: Run Tests with tox
61 |         run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
62 |       - name: Submit to Codecov
63 |         uses: codecov/codecov-action@v3
64 |         with:
65 |           token: ${{ secrets.CODECOV_TOKEN }}
66 |           files: ./coverage.xml
67 |           flags: unittests
68 |           name: codecov-umbrella
69 | 


--------------------------------------------------------------------------------
/.github/workflows/dev-cicd.yml:
--------------------------------------------------------------------------------
 1 | name: dev-cicd-setup-and-test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - dev
 7 |   pull_request:
 8 |     branches:
 9 |       - dev
10 | 
11 | jobs:
12 |   lint:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Check out repo
16 |         uses: actions/checkout@v4
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: "3.10"
21 |       - name: Install pre-commit
22 |         run: pip install pre-commit
23 |       - name: Run pre-commit
24 |         run: pre-commit run --all-files
25 | 
26 |   build:
27 |     runs-on: ubuntu-latest
28 |     strategy:
29 |       matrix:
30 |         python-version: ["3.10", "3.11", "3.12"]
31 |     steps:
32 |       - name: Check out repo
33 |         uses: actions/checkout@v4
34 |       - name: Free Disk Space (Ubuntu)
35 |         uses: jlumbroso/free-disk-space@main
36 |         with:
37 |           # this might remove tools that are actually needed,
38 |           # if set to "true" but frees about 6 GB
39 |           tool-cache: false
40 |           # all of these default to true, but feel free to set to
41 |           # "false" if necessary for your workflow
42 |           android: true
43 |           dotnet: true
44 |           haskell: true
45 |           large-packages: true
46 |           docker-images: true
47 |           swap-storage: true
48 |       - name: Set up Python
49 |         uses: actions/setup-python@v4
50 |         with:
51 |           python-version: ${{ matrix.python-version }}
52 |       - name: Install Dependencies
53 |         run: |
54 |           pip install -U pip
55 |           pip install -e .
56 |           pip install tox just pre-commit
57 |       - name: Run Tests with tox
58 |         run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
59 |       - name: Submit to Codecov
60 |         uses: codecov/codecov-action@v3
61 |         with:
62 |           token: ${{ secrets.CODECOV_TOKEN }}
63 |           files: ./coverage.xml
64 |           flags: unittests
65 |           name: codecov-umbrella
66 |       - name: Clean up pip cache
67 |         run: |
68 |           pip cache purge
69 |           rm -rf ~/.cache/pip
70 | 


--------------------------------------------------------------------------------
/.github/workflows/feature-cicd.yml:
--------------------------------------------------------------------------------
 1 | name: feature-cicd-setup-and-test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - feature/*
 7 |   pull_request:
 8 |     branches:
 9 |       - feature/*
10 | 
11 | jobs:
12 |   lint:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Check out repo
16 |         uses: actions/checkout@v4
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: "3.10"
21 |       - name: Install pre-commit
22 |         run: pip install pre-commit
23 |       - name: Run pre-commit
24 |         run: pre-commit run --all-files
25 | 
26 |   build:
27 |     runs-on: ubuntu-latest
28 |     strategy:
29 |       matrix:
30 |         python-version: ["3.10", "3.11", "3.12"]
31 |     steps:
32 |       - name: Check out repo
33 |         uses: actions/checkout@v4
34 |       - name: Free Disk Space (Ubuntu)
35 |         uses: jlumbroso/free-disk-space@main
36 |         with:
37 |           # this might remove tools that are actually needed,
38 |           # if set to "true" but frees about 6 GB
39 |           tool-cache: false
40 |           # all of these default to true, but feel free to set to
41 |           # "false" if necessary for your workflow
42 |           android: true
43 |           dotnet: true
44 |           haskell: true
45 |           large-packages: true
46 |           docker-images: true
47 |           swap-storage: true
48 |       - name: Set up Python
49 |         uses: actions/setup-python@v4
50 |         with:
51 |           python-version: ${{ matrix.python-version }}
52 |       - name: Install Dependencies
53 |         run: |
54 |           pip install -U pip
55 |           pip install --no-cache-dir -e .
56 |           pip install --no-cache-dir tox just pre-commit
57 |       - name: Free up disk space
58 |         run: |
59 |           sudo apt-get clean
60 |       - name: Run Tests with tox
61 |         run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
62 |       - name: Submit to Codecov
63 |         uses: codecov/codecov-action@v3
64 |         with:
65 |           token: ${{ secrets.CODECOV_TOKEN }}
66 |           files: ./coverage.xml
67 |           flags: unittests
68 |           name: codecov-umbrella
69 | 


--------------------------------------------------------------------------------
/.github/workflows/main-cicd.yml:
--------------------------------------------------------------------------------
 1 | name: main-cicd
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - "main"
 7 |   pull_request:
 8 |     branches:
 9 |       - "main"
10 | 
11 | jobs:
12 |   lint:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Check out repo
16 |         uses: actions/checkout@v4
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: "3.10"
21 |       - name: Install pre-commit
22 |         run: pip install pre-commit
23 |       - name: Run pre-commit
24 |         run: pre-commit run --all-files
25 | 
26 |   build:
27 |     runs-on: ubuntu-latest
28 |     strategy:
29 |       matrix:
30 |         python-version: ["3.10"]
31 |     steps:
32 |       - name: Check out repo
33 |         uses: actions/checkout@v4
34 |       - name: Free Disk Space (Ubuntu)
35 |         uses: jlumbroso/free-disk-space@main
36 |         with:
37 |           # this might remove tools that are actually needed,
38 |           # if set to "true" but frees about 6 GB
39 |           tool-cache: false
40 |           # all of these default to true, but feel free to set to
41 |           # "false" if necessary for your workflow
42 |           android: true
43 |           dotnet: true
44 |           haskell: true
45 |           large-packages: true
46 |           docker-images: true
47 |           swap-storage: true
48 |       - name: Set up Python
49 |         uses: actions/setup-python@v4
50 |         with:
51 |           python-version: ${{ matrix.python-version }}
52 |       - name: Install Dependencies
53 |         run: |
54 |           pip install -U pip
55 |           pip install --no-cache-dir -e .
56 |           pip install --no-cache-dir tox just pre-commit
57 |       - name: Free up disk space
58 |         run: |
59 |           sudo apt-get clean
60 |       - name: Run Tests with tox
61 |         run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
62 |       - name: Submit to Codecov
63 |         uses: codecov/codecov-action@v3
64 |         with:
65 |           token: ${{ secrets.CODECOV_TOKEN }}
66 |           files: ./coverage.xml
67 |           flags: unittests
68 |           name: codecov-umbrella
69 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-pypi.yml:
--------------------------------------------------------------------------------
 1 | name: PyPI Release
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       version:
 7 |         description: "Version to release (e.g., 1.2.3)"
 8 |         required: true
 9 |       confirm_tests:
10 |         description: "Confirm all tests have passed"
11 |         type: boolean
12 |         required: true
13 | 
14 | jobs:
15 |   release:
16 |     runs-on: ubuntu-latest
17 |     if: github.event.inputs.confirm_tests == 'true'
18 |     permissions:
19 |       contents: write
20 |     steps:
21 |       - uses: actions/checkout@v3
22 |       - name: Set up Python
23 |         uses: actions/setup-python@v4
24 |         with:
25 |           python-version: "3.10"
26 |       - name: Install dependencies
27 |         run: |
28 |           python -m pip install --upgrade pip
29 |           pip install build twine
30 |       - name: Build package
31 |         run: python -m build
32 |       - name: Create GitHub Release
33 |         env:
34 |           GITHUB_TOKEN: ${{ secrets.pypi }}
35 |         run: |
36 |           git config user.name github-actions
37 |           git config user.email github-actions@github.com
38 |           git tag v${{ github.event.inputs.version }}
39 |           git push origin v${{ github.event.inputs.version }}
40 |           gh release create v${{ github.event.inputs.version }} --generate-notes
41 |       - name: Publish to PyPI
42 |         env:
43 |           TWINE_USERNAME: __token__
44 |           TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
45 |         run: twine upload dist/*
46 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .env
 2 | output/
 3 | datafog_instructor.egg-info/
 4 | examples/poc.py
 5 | dist/
 6 | venv/
 7 | app/__pycache__/
 8 | *.pyc
 9 | *.swp
10 | *.prof
11 | MANIFEST
12 | dist/
13 | build/
14 | .coverage
15 | .cache/
16 | *.egg-info/
17 | .pytest_cache/
18 | .tox/
19 | src/datafog/__pycache__/
20 | src/datafog/pii_tools/__pycache__/
21 | tests/__pycache__/
22 | tests/.datafog_env/
23 | datafog_debug.log
24 | sotu_2023.txt
25 | .DS_Store
26 | .env
27 | coverage.xml
28 | htmlcov/
29 | .venv/
30 | node_modules/
31 | .DS_Store
32 | .venv
33 | examples/venv/
34 | error_log.txt
35 | docs/*
36 | !docs/*.rst
37 | !docs/conf.py
38 | scratch.py


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/PyCQA/isort
 3 |     rev: 5.12.0
 4 |     hooks:
 5 |       - id: isort
 6 |         args: [--profile=black]
 7 |         exclude: .venv
 8 | 
 9 |   - repo: https://github.com/psf/black
10 |     rev: 24.2.0
11 |     hooks:
12 |       - id: black
13 |         language_version: python3
14 |         exclude: .venv
15 | 
16 |   - repo: https://github.com/pycqa/flake8
17 |     rev: 7.0.0
18 |     hooks:
19 |       - id: flake8
20 |         args: [--max-line-length=88] # Match Black's line length
21 |         additional_dependencies: [flake8-bugbear, flake8-comprehensions]
22 |         exclude: .venv
23 | 
24 |   - repo: https://github.com/pre-commit/mirrors-prettier
25 |     rev: v4.0.0-alpha.8
26 |     hooks:
27 |       - id: prettier
28 |         exclude: .venv
29 | 


--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | tests/*
2 | examples/*


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Set the OS, Python version and other tools you might need
 9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.12"
13 |     # You can also specify other tool versions:
14 |     # nodejs: "19"
15 |     # rust: "1.64"
16 |     # golang: "1.19"
17 | 
18 | # Build documentation in the "docs/" directory with Sphinx
19 | sphinx:
20 |   configuration: docs/conf.py
21 | # Optionally build your docs in additional formats such as PDF and ePub
22 | # formats:
23 | #    - pdf
24 | #    - epub
25 | 
26 | # Optional but recommended, declare the Python requirements required
27 | # to build your documentation
28 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
29 | # python:
30 | #    install:
31 | #    - requirements: docs/requirements.txt
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2024 Sid Mohan, DataFog Inc. 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DataFog Instructor
 2 | 
 3 | v0.1.0 Release Notes
 4 | 
 5 | Hi folks, based on some feedback a few important changes:
 6 | 
 7 | - We have shifted away from the CLI approach to a more flexible API-based solution. For v0.1.0, you'll need to clone the repository and install dependencies using Poetry.
 8 | - The env.example file now includes a LOGFIRE_TOKEN. You can obtain one by signing up at https://logfire.pydantic.dev. Logfire is an observability platform developed by the Pydantic team, designed to assist with debugging and monitoring, including LLM calls.
 9 | - This version focuses on producing consistent LLM outputs for PII detection and incorporates extensive error handling to create a more production-ready service.
10 | - We've implemented robust validation and error handling throughout the codebase to ensure reliability and ease of debugging.
11 | 
12 | Start by cloning the repo and installing the dependencies using poetry:
13 | 
14 | ```
15 | git clone https://github.com/datafog/datafog-instructor.git
16 | cd datafog-instructor
17 | poetry install
18 | ```
19 | 
20 | You'll also need to create a `.env` file with the OPENAI_API_KEY and GROQ_API_KEY.  You can get these by signing up for accounts at https://openai.com/ and https://www.groq.com/.
21 | 
22 | Once you have the .env file, you can run the following to start the service:
23 | 
24 | ```
25 | uvicorn app.main:app --reload
26 | ```
27 | 
28 | 
29 | ## Sample CURL Commands
30 | 
31 | 
32 | ```
33 | curl -X POST "http://localhost:8000/extract-pii" \     
34 |      -H "Content-Type: application/json" \
35 |      -d '{"content": "My name is John Doe and my email is john.doe@example.com. My phone number is 123-456-7890."}'
36 | ```
37 | 
38 | ## Contributing
39 | 
40 | Contributions to the DataFog Instructor SDK are welcome! Please feel free to submit a Pull Request.
41 | 
42 | ## License
43 | 
44 | This project is licensed under the MIT License.
45 | 
46 | ## Support
47 | 
48 | If you encounter any problems or have any questions, please open an issue on the GitHub repository or join our Discord community at https://discord.gg/bzDth394R4.
49 | 
50 | ## Acknowledgements
51 | 
52 | - Logfire: https://logfire.pydantic.dev
53 | - Pydantic: https://pydantic.dev
54 | - Instructor: https://github.com/jxnl/instructor
55 | 
56 | ## Links
57 | 
58 | - Homepage: https://datafog.ai
59 | - Documentation: https://docs.datafog.ai
60 | - Twitter: https://twitter.com/datafoginc
61 | - GitHub: https://github.com/datafog/datafog-instructor
62 | ```
63 | 


--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/app/__init__.py


--------------------------------------------------------------------------------
/app/main.py:
--------------------------------------------------------------------------------
  1 | from pydantic import BaseModel
  2 | from fastapi import FastAPI, HTTPException, Request
  3 | from openai import AsyncOpenAI
  4 | import instructor
  5 | import logfire
  6 | import asyncio
  7 | from collections.abc import Iterable
  8 | from fastapi.responses import StreamingResponse
  9 | from dotenv import load_dotenv
 10 | import os
 11 | from typing import List, Dict
 12 | from sqlalchemy import create_engine
 13 | from sqlmodel import SQLModel, Field
 14 | from typing import Optional
 15 | import pandas as pd
 16 | from sqlmodel import SQLModel, Field
 17 | from typing import Tuple
 18 | import traceback
 19 | 
 20 | class DetectedPII(BaseModel):
 21 |     """
 22 |     Detected PII data from a document
 23 |     """
 24 |     index: int
 25 |     data_type: str
 26 |     pii_value: str
 27 | 
 28 | 
 29 | class PIIDetectionFlow(BaseModel):
 30 |     """
 31 |     Extracted PII data from a document, all data_types should try to have consistent property names
 32 |     """
 33 | 
 34 |     detected_pii: list[DetectedPII] = Field(..., min_items=1)
 35 | 
 36 |     def __init__(self, **data):
 37 |         super().__init__(**data)
 38 |         self._validate_detected_pii()
 39 | 
 40 |     def _validate_detected_pii(self):
 41 |         """
 42 |         Validates the detected_pii list to ensure it meets the required conditions.
 43 | 
 44 |         Preconditions:
 45 |             - detected_pii is a non-empty list
 46 | 
 47 |         Postconditions:
 48 |             - All items in detected_pii are of type DetectedPII
 49 |             - All DetectedPII objects have unique indices
 50 | 
 51 |         Raises:
 52 |             ValueError: If any of the conditions are not met
 53 |         """
 54 |         if not self.detected_pii:
 55 |             raise ValueError("detected_pii list must not be empty")
 56 | 
 57 |         indices = set()
 58 |         for item in self.detected_pii:
 59 |             if not isinstance(item, DetectedPII):
 60 |                 raise ValueError(f"All items in detected_pii must be of type DetectedPII. Found: {type(item)}")
 61 |             if item.index in indices:
 62 |                 raise ValueError(f"Duplicate index found in detected_pii: {item.index}")
 63 |             indices.add(item.index)
 64 | 
 65 |         logfire.info("PIIDetectionFlow initialized", 
 66 |                      pii_count=len(self.detected_pii), 
 67 |                      unique_indices=len(indices))
 68 | 
 69 |     def __len__(self):
 70 |         return len(self.detected_pii)
 71 | 
 72 |     def __iter__(self):
 73 |         return iter(self.detected_pii)
 74 | 
 75 |     def __getitem__(self, index):
 76 |         return self.detected_pii[index]
 77 | 
 78 |     def redact_pii(self, content):
 79 |         """
 80 |         Iterates over the private data and replaces the value with a placeholder in the form of
 81 |         <{data_type}_{i}>
 82 | 
 83 |         Preconditions:
 84 |             - self.detected_pii is a non-empty list of DetectedPII objects
 85 |             - content is a non-empty string
 86 | 
 87 |         Postconditions:
 88 |             - Returns a string with all PII values replaced by placeholders
 89 |             - The number of replacements made is equal to the length of self.detected_pii
 90 |             - The returned string is not empty
 91 | 
 92 |         Invariants:
 93 |             - The structure of the content string is preserved, only PII values are replaced
 94 |         """
 95 | 
 96 |         # Preconditions
 97 |         assert self.detected_pii, "detected_pii list must not be empty"
 98 |         assert content, "content must not be empty"
 99 | 
100 |         original_content = content
101 |         replacement_count = 0
102 | 
103 |         for i, data in enumerate(self.detected_pii):
104 |             new_content = content.replace(data.pii_value, f"<{data.data_type}_{i}>")
105 |             if new_content != content:
106 |                 replacement_count += 1
107 |             content = new_content
108 |             logfire.info(f"Replaced PII: {data.data_type}", index=i, data_type=data.data_type)
109 | 
110 |         # Postconditions
111 |         assert replacement_count == len(self.detected_pii), "Number of replacements should match detected PII count"
112 |         assert content, "Redacted content must not be empty"
113 |         assert len(content) >= len(original_content), "Redacted content should not be shorter than original"
114 | 
115 |         logfire.info("PII redaction completed", 
116 |                      original_length=len(original_content), 
117 |                      redacted_length=len(content), 
118 |                      replacements_made=replacement_count)
119 | 
120 |         return content
121 | 
122 | app = FastAPI()
123 | 
124 | # ----------------------- Middleware Configuration -----------------------
125 | # Configure logfire before adding any middleware to ensure they are set up correctly
126 | logfire.configure(pydantic_plugin=logfire.PydanticPlugin(record="all"))
127 | 
128 | # Instrument FastAPI with logfire middleware
129 | logfire.instrument_fastapi(app)
130 | # ----------------------- End Middleware Configuration --------------------
131 | 
132 | # Global variables to be initialized on startup
133 | openai_client: Optional[AsyncOpenAI] = None
134 | client: Optional[instructor.from_openai] = None
135 | 
136 | @app.on_event("startup")
137 | async def initialize_app():
138 |     """
139 |     Startup event to initialize and validate environment variables and clients.
140 |     
141 |     Preconditions:
142 |         - Environment variables like OPENAI_API_KEY must be set.
143 |     
144 |     Postconditions:
145 |         - openai_client is initialized and ready to use.
146 |     """
147 |     load_dotenv()
148 |     global OPENAI_API_KEY, openai_client, client
149 |     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
150 |     
151 |     if not OPENAI_API_KEY:
152 |         logfire.error("OPENAI_API_KEY is not set in the environment")
153 |         raise EnvironmentError("OPENAI_API_KEY must be set in the environment variables")
154 |     
155 |     openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
156 |     
157 |     assert isinstance(app, FastAPI), "app must be an instance of FastAPI"
158 |     assert isinstance(openai_client, AsyncOpenAI), "openai_client must be an instance of AsyncOpenAI"
159 |     
160 |     logfire.info("Application initialized",
161 |                  api_key_set=bool(OPENAI_API_KEY),
162 |                  app_type=type(app).__name__,
163 |                  client_type=type(openai_client).__name__)
164 |     
165 |     logfire.instrument_openai(openai_client)
166 |     client = instructor.from_openai(openai_client)
167 | 
168 | @app.on_event("shutdown")
169 | async def shutdown_event():
170 |     """
171 |     Shutdown event to perform cleanup tasks.
172 |     """
173 |     if openai_client:
174 |         await openai_client.aclose()
175 |     logfire.info("Application shutdown complete.")
176 | 
177 | @app.post("/extract-pii")
178 | async def extract_pii(request: Request) -> PIIDetectionFlow:
179 |     """
180 |     Extracts PII data from a document
181 | 
182 |     Preconditions:
183 |         - content is a non-empty string in the request body
184 |     
185 |     Postconditions:
186 |         - Returns a PIIDetectionFlow object
187 |         - The returned object contains a non-empty list of DetectedPII objects in its detected_pii attribute
188 | 
189 |     Invariants:
190 |         - The original content is not modified
191 |         - The API call to OpenAI is made with the correct model and response_model
192 |     """
193 |     # Preconditions
194 |     body = await request.json()
195 |     assert isinstance(body, dict), "Request body must be a valid JSON object"
196 |     
197 |     content = body.get("content")
198 |     
199 |     # Invariant: content should not be modified after extraction
200 |     original_content = content
201 |     
202 |     if not content or not isinstance(content, str) or content.strip() == "":
203 |         logfire.error("Invalid or missing content in request", body=body)
204 |         raise HTTPException(status_code=422, detail="Content is required and must be a non-empty string")
205 |     
206 |     # Postconditions
207 |     assert content == original_content, "Content should not be modified during extraction"
208 |     assert isinstance(content, str) and content.strip() != "", "Content must be a non-empty string"
209 |     
210 |     logfire.info("Content extracted from request", content_length=len(content))
211 | 
212 |     # Preconditions
213 |     assert isinstance(content, str) and content.strip(), "Content must be a non-empty string"
214 |     assert OPENAI_API_KEY, "OPENAI_API_KEY must be set"
215 | 
216 |     try:
217 |         logfire.info("Initiating PII extraction", content_length=len(content))
218 |         
219 |         response = await client.chat.completions.create(
220 |             model="gpt-4o",
221 |             messages=[{"role": "user", "content": content}],
222 |             response_model=PIIDetectionFlow,
223 |         )
224 |         
225 |         # Postconditions
226 |         assert isinstance(response, PIIDetectionFlow), "Response must be a PIIDetectionFlow object"
227 |         assert response.detected_pii, "Extracted PII data must not be empty"
228 |         
229 |         logfire.info("Successfully extracted PII data", 
230 |                      pii_count=len(response.detected_pii),
231 |                      pii_types=[pii.data_type for pii in response.detected_pii])
232 |         
233 |         # Invariant
234 |         assert content == original_content, "Original content must not be modified during extraction"
235 |         
236 |         return response
237 |     except ValueError as ve:
238 |         logfire.warning("No PII data extracted", error=str(ve))
239 |         raise HTTPException(status_code=204, detail="No PII data found in the content")
240 |     except AssertionError as ae:
241 |         logfire.error("Assertion failed during PII extraction", error=str(ae))
242 |         raise HTTPException(status_code=500, detail=str(ae))
243 |     except Exception as e:
244 |         logfire.error("Unexpected error during PII extraction", 
245 |                       error=str(e), 
246 |                       error_type=type(e).__name__,
247 |                       traceback=traceback.format_exc())
248 |         raise HTTPException(status_code=500, detail="Failed to extract PII data")
249 | 
250 | if __name__ == "__main__":
251 |     import uvicorn
252 | 
253 |     uvicorn.run(app, host="0.0.0.0", port=8000)


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: no
2 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | 
 6 | # -- Project information -----------------------------------------------------
 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 8 | 
 9 | project = "DataFog Instructor"
10 | copyright = "2024, DataFog Inc."
11 | author = "Sid Mohan"
12 | release = "v0.1.0"
13 | 
14 | # -- General configuration ---------------------------------------------------
15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
16 | 
17 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.napoleon"]
18 | 
19 | templates_path = ["_templates"]
20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
21 | 
22 | 
23 | # -- Options for HTML output -------------------------------------------------
24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
25 | 
26 | html_theme = "alabaster"
27 | html_static_path = ["_static"]
28 | 
29 | autosummary_generate = True
30 | napoleon_use_rtype = False
31 | napoleon_use_ivar = False
32 | napoleon_use_param = False
33 | 


--------------------------------------------------------------------------------
/env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=
2 | GROQ_API_KEY=
3 | LOGFIRE_TOKEN=


--------------------------------------------------------------------------------
/public/common-commands.mov:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/public/common-commands.mov


--------------------------------------------------------------------------------
/public/help-menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/public/help-menu.png


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "datafog-instructor"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["Sid Mohan <sid@datafog.ai>"]
 6 | readme = "README.md"
 7 | packages = [
 8 |     { include = "app"},
 9 | ]
10 | 
11 | [tool.poetry.dependencies]
12 | python = ">=3.12,<3.13"
13 | transformers = "^4.44.2"
14 | pandas = "^2.2.2"
15 | sqlmodel = "^0.0.22"
16 | sentence-transformers = "^3.1.0"
17 | openai = "^1.45.0"
18 | pydantic = "2.9.1"
19 | instructor = "^1.4.2"
20 | python-dotenv = "^1.0.1"
21 | nest-asyncio = "^1.6.0"
22 | typer = "^0.12.5"
23 | anthropic = "^0.34.2"
24 | fastapi = "^0.115.0"
25 | logfire = {extras = ["fastapi"], version = "^0.53.0"}
26 | uvicorn = {extras = ["standard"], version = "^0.30.6"}
27 | pytest = "^8.3.3"
28 | 
29 | 
30 | [tool.poetry.scripts]
31 | start = "uvicorn app.main:app --reload"
32 | 
33 | [build-system]
34 | requires = ["poetry-core"]
35 | build-backend = "poetry.core.masonry.api"
36 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from fastapi.testclient import TestClient
 3 | from app.main import app, PIIDataExtraction, Entity
 4 | 
 5 | client = TestClient(app)
 6 | 
 7 | @pytest.mark.asyncio
 8 | async def test_extract_pii():
 9 |     content = "John Doe's SSN is 123-45-6789 and his email is john@example.com"
10 |     response = client.post("/extract-pii", json={"content": content})
11 |     assert response.status_code == 200
12 |     pii_data = PIIDataExtraction(**response.json())
13 |     assert len(pii_data.private_data) > 0
14 |     assert any(data.data_type == "SSN" for data in pii_data.private_data)
15 |     assert any(data.data_type == "EMAIL" for data in pii_data.private_data)
16 | 
17 | @pytest.mark.asyncio
18 | async def test_extract_pii_stream():
19 |     content = "Jane Doe's phone number is (555) 123-4567"
20 |     response = client.post("/extract-pii-stream", json={"content": content})
21 |     assert response.status_code == 200
22 |     # Note: Streaming response testing might require additional setup
23 | 
24 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py310
 3 | isolated_build = True
 4 | 
 5 | [testenv]
 6 | deps =
 7 |     pytest==7.4.0
 8 |     pytest-asyncio==0.21.0
 9 |     pytest-cov
10 |     -r requirements-dev.txt
11 | extras = all
12 | allowlist_externals =
13 |     pip
14 | commands =
15 |     pip install --no-cache-dir -r requirements-dev.txt
16 |     pytest {posargs} -v -s --cov=datafog --cov-report=term-missing
17 | 
18 | [testenv:lint]
19 | skip_install = true
20 | deps =
21 |     black
22 |     isort
23 |     flake8
24 | commands =
25 |     black --check .
26 |     isort --check-only .
27 |     flake8 .
28 | 
29 | [testenv:typecheck]
30 | deps =
31 |     mypy
32 | commands =
33 |     mypy datafog tests
34 | 
35 | [pytest]
36 | asyncio_mode = auto


--------------------------------------------------------------------------------