├── .flake8 ├── .github ├── renovate.json └── workflows │ ├── beta-cicd.yml │ ├── dev-cicd.yml │ ├── feature-cicd.yml │ ├── main-cicd.yml │ └── publish-pypi.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .prettierignore ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── app ├── __init__.py └── main.py ├── codecov.yml ├── docs └── conf.py ├── env.example ├── poetry.lock ├── public ├── common-commands.mov └── help-menu.png ├── pyproject.toml ├── tests ├── __init__.py └── test_main.py └── tox.ini /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, B006, B007, B008, F401, C416, B950, B904 3 | max-line-length = 88 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | exclude = venv, .venv, tests/.datafog_env, examples/venv, .gitignore, .DS_Store, .git, .github, .tox, .nox, .coverage, .coverage.* -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["config:base"] 3 | } 4 | -------------------------------------------------------------------------------- /.github/workflows/beta-cicd.yml: -------------------------------------------------------------------------------- 1 | name: beta-cicd-setup-and-test 2 | 3 | on: 4 | push: 5 | branches: 6 | - "v*.0.0-beta.*" 7 | pull_request: 8 | branches: 9 | - "v*.0.0-beta.*" 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Check out repo 16 | uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: "3.10" 21 | - name: Install pre-commit 22 | run: pip install pre-commit 23 | - name: Run pre-commit 24 | run: pre-commit run --all-files 25 | 26 | build: 27 | runs-on: ubuntu-latest 28 | strategy: 29 | matrix: 30 | python-version: ["3.10", "3.11", "3.12"] 31 | steps: 32 | - name: Check out repo 33 | uses: actions/checkout@v4 34 | - name: Free Disk Space (Ubuntu) 35 | uses: jlumbroso/free-disk-space@main 36 | with: 37 | # this might remove tools that are actually needed, 38 | # if set to "true" but frees about 6 GB 39 | tool-cache: false 40 | # all of these default to true, but feel free to set to 41 | # "false" if necessary for your workflow 42 | android: true 43 | dotnet: true 44 | haskell: true 45 | large-packages: true 46 | docker-images: true 47 | swap-storage: true 48 | - name: Set up Python 49 | uses: actions/setup-python@v4 50 | with: 51 | python-version: ${{ matrix.python-version }} 52 | - name: Install Dependencies 53 | run: | 54 | pip install -U pip 55 | pip install --no-cache-dir -e . 56 | pip install --no-cache-dir tox just pre-commit 57 | - name: Free up disk space 58 | run: | 59 | sudo apt-get clean 60 | - name: Run Tests with tox 61 | run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing 62 | - name: Submit to Codecov 63 | uses: codecov/codecov-action@v3 64 | with: 65 | token: ${{ secrets.CODECOV_TOKEN }} 66 | files: ./coverage.xml 67 | flags: unittests 68 | name: codecov-umbrella 69 | -------------------------------------------------------------------------------- /.github/workflows/dev-cicd.yml: -------------------------------------------------------------------------------- 1 | name: dev-cicd-setup-and-test 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | pull_request: 8 | branches: 9 | - dev 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Check out repo 16 | uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: "3.10" 21 | - name: Install pre-commit 22 | run: pip install pre-commit 23 | - name: Run pre-commit 24 | run: pre-commit run --all-files 25 | 26 | build: 27 | runs-on: ubuntu-latest 28 | strategy: 29 | matrix: 30 | python-version: ["3.10", "3.11", "3.12"] 31 | steps: 32 | - name: Check out repo 33 | uses: actions/checkout@v4 34 | - name: Free Disk Space (Ubuntu) 35 | uses: jlumbroso/free-disk-space@main 36 | with: 37 | # this might remove tools that are actually needed, 38 | # if set to "true" but frees about 6 GB 39 | tool-cache: false 40 | # all of these default to true, but feel free to set to 41 | # "false" if necessary for your workflow 42 | android: true 43 | dotnet: true 44 | haskell: true 45 | large-packages: true 46 | docker-images: true 47 | swap-storage: true 48 | - name: Set up Python 49 | uses: actions/setup-python@v4 50 | with: 51 | python-version: ${{ matrix.python-version }} 52 | - name: Install Dependencies 53 | run: | 54 | pip install -U pip 55 | pip install -e . 56 | pip install tox just pre-commit 57 | - name: Run Tests with tox 58 | run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing 59 | - name: Submit to Codecov 60 | uses: codecov/codecov-action@v3 61 | with: 62 | token: ${{ secrets.CODECOV_TOKEN }} 63 | files: ./coverage.xml 64 | flags: unittests 65 | name: codecov-umbrella 66 | - name: Clean up pip cache 67 | run: | 68 | pip cache purge 69 | rm -rf ~/.cache/pip 70 | -------------------------------------------------------------------------------- /.github/workflows/feature-cicd.yml: -------------------------------------------------------------------------------- 1 | name: feature-cicd-setup-and-test 2 | 3 | on: 4 | push: 5 | branches: 6 | - feature/* 7 | pull_request: 8 | branches: 9 | - feature/* 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Check out repo 16 | uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: "3.10" 21 | - name: Install pre-commit 22 | run: pip install pre-commit 23 | - name: Run pre-commit 24 | run: pre-commit run --all-files 25 | 26 | build: 27 | runs-on: ubuntu-latest 28 | strategy: 29 | matrix: 30 | python-version: ["3.10", "3.11", "3.12"] 31 | steps: 32 | - name: Check out repo 33 | uses: actions/checkout@v4 34 | - name: Free Disk Space (Ubuntu) 35 | uses: jlumbroso/free-disk-space@main 36 | with: 37 | # this might remove tools that are actually needed, 38 | # if set to "true" but frees about 6 GB 39 | tool-cache: false 40 | # all of these default to true, but feel free to set to 41 | # "false" if necessary for your workflow 42 | android: true 43 | dotnet: true 44 | haskell: true 45 | large-packages: true 46 | docker-images: true 47 | swap-storage: true 48 | - name: Set up Python 49 | uses: actions/setup-python@v4 50 | with: 51 | python-version: ${{ matrix.python-version }} 52 | - name: Install Dependencies 53 | run: | 54 | pip install -U pip 55 | pip install --no-cache-dir -e . 56 | pip install --no-cache-dir tox just pre-commit 57 | - name: Free up disk space 58 | run: | 59 | sudo apt-get clean 60 | - name: Run Tests with tox 61 | run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing 62 | - name: Submit to Codecov 63 | uses: codecov/codecov-action@v3 64 | with: 65 | token: ${{ secrets.CODECOV_TOKEN }} 66 | files: ./coverage.xml 67 | flags: unittests 68 | name: codecov-umbrella 69 | -------------------------------------------------------------------------------- /.github/workflows/main-cicd.yml: -------------------------------------------------------------------------------- 1 | name: main-cicd 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | pull_request: 8 | branches: 9 | - "main" 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Check out repo 16 | uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: "3.10" 21 | - name: Install pre-commit 22 | run: pip install pre-commit 23 | - name: Run pre-commit 24 | run: pre-commit run --all-files 25 | 26 | build: 27 | runs-on: ubuntu-latest 28 | strategy: 29 | matrix: 30 | python-version: ["3.10"] 31 | steps: 32 | - name: Check out repo 33 | uses: actions/checkout@v4 34 | - name: Free Disk Space (Ubuntu) 35 | uses: jlumbroso/free-disk-space@main 36 | with: 37 | # this might remove tools that are actually needed, 38 | # if set to "true" but frees about 6 GB 39 | tool-cache: false 40 | # all of these default to true, but feel free to set to 41 | # "false" if necessary for your workflow 42 | android: true 43 | dotnet: true 44 | haskell: true 45 | large-packages: true 46 | docker-images: true 47 | swap-storage: true 48 | - name: Set up Python 49 | uses: actions/setup-python@v4 50 | with: 51 | python-version: ${{ matrix.python-version }} 52 | - name: Install Dependencies 53 | run: | 54 | pip install -U pip 55 | pip install --no-cache-dir -e . 56 | pip install --no-cache-dir tox just pre-commit 57 | - name: Free up disk space 58 | run: | 59 | sudo apt-get clean 60 | - name: Run Tests with tox 61 | run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing 62 | - name: Submit to Codecov 63 | uses: codecov/codecov-action@v3 64 | with: 65 | token: ${{ secrets.CODECOV_TOKEN }} 66 | files: ./coverage.xml 67 | flags: unittests 68 | name: codecov-umbrella 69 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | name: PyPI Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version: 7 | description: "Version to release (e.g., 1.2.3)" 8 | required: true 9 | confirm_tests: 10 | description: "Confirm all tests have passed" 11 | type: boolean 12 | required: true 13 | 14 | jobs: 15 | release: 16 | runs-on: ubuntu-latest 17 | if: github.event.inputs.confirm_tests == 'true' 18 | permissions: 19 | contents: write 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build twine 30 | - name: Build package 31 | run: python -m build 32 | - name: Create GitHub Release 33 | env: 34 | GITHUB_TOKEN: ${{ secrets.pypi }} 35 | run: | 36 | git config user.name github-actions 37 | git config user.email github-actions@github.com 38 | git tag v${{ github.event.inputs.version }} 39 | git push origin v${{ github.event.inputs.version }} 40 | gh release create v${{ github.event.inputs.version }} --generate-notes 41 | - name: Publish to PyPI 42 | env: 43 | TWINE_USERNAME: __token__ 44 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 45 | run: twine upload dist/* 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | output/ 3 | datafog_instructor.egg-info/ 4 | examples/poc.py 5 | dist/ 6 | venv/ 7 | app/__pycache__/ 8 | *.pyc 9 | *.swp 10 | *.prof 11 | MANIFEST 12 | dist/ 13 | build/ 14 | .coverage 15 | .cache/ 16 | *.egg-info/ 17 | .pytest_cache/ 18 | .tox/ 19 | src/datafog/__pycache__/ 20 | src/datafog/pii_tools/__pycache__/ 21 | tests/__pycache__/ 22 | tests/.datafog_env/ 23 | datafog_debug.log 24 | sotu_2023.txt 25 | .DS_Store 26 | .env 27 | coverage.xml 28 | htmlcov/ 29 | .venv/ 30 | node_modules/ 31 | .DS_Store 32 | .venv 33 | examples/venv/ 34 | error_log.txt 35 | docs/* 36 | !docs/*.rst 37 | !docs/conf.py 38 | scratch.py -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | args: [--profile=black] 7 | exclude: .venv 8 | 9 | - repo: https://github.com/psf/black 10 | rev: 24.2.0 11 | hooks: 12 | - id: black 13 | language_version: python3 14 | exclude: .venv 15 | 16 | - repo: https://github.com/pycqa/flake8 17 | rev: 7.0.0 18 | hooks: 19 | - id: flake8 20 | args: [--max-line-length=88] # Match Black's line length 21 | additional_dependencies: [flake8-bugbear, flake8-comprehensions] 22 | exclude: .venv 23 | 24 | - repo: https://github.com/pre-commit/mirrors-prettier 25 | rev: v4.0.0-alpha.8 26 | hooks: 27 | - id: prettier 28 | exclude: .venv 29 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | tests/* 2 | examples/* -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.12" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | # Optionally build your docs in additional formats such as PDF and ePub 22 | # formats: 23 | # - pdf 24 | # - epub 25 | 26 | # Optional but recommended, declare the Python requirements required 27 | # to build your documentation 28 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 29 | # python: 30 | # install: 31 | # - requirements: docs/requirements.txt 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2024 Sid Mohan, DataFog Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataFog Instructor 2 | 3 | v0.1.0 Release Notes 4 | 5 | Hi folks, based on some feedback a few important changes: 6 | 7 | - We have shifted away from the CLI approach to a more flexible API-based solution. For v0.1.0, you'll need to clone the repository and install dependencies using Poetry. 8 | - The env.example file now includes a LOGFIRE_TOKEN. You can obtain one by signing up at https://logfire.pydantic.dev. Logfire is an observability platform developed by the Pydantic team, designed to assist with debugging and monitoring, including LLM calls. 9 | - This version focuses on producing consistent LLM outputs for PII detection and incorporates extensive error handling to create a more production-ready service. 10 | - We've implemented robust validation and error handling throughout the codebase to ensure reliability and ease of debugging. 11 | 12 | Start by cloning the repo and installing the dependencies using poetry: 13 | 14 | ``` 15 | git clone https://github.com/datafog/datafog-instructor.git 16 | cd datafog-instructor 17 | poetry install 18 | ``` 19 | 20 | You'll also need to create a `.env` file with the OPENAI_API_KEY and GROQ_API_KEY. You can get these by signing up for accounts at https://openai.com/ and https://www.groq.com/. 21 | 22 | Once you have the .env file, you can run the following to start the service: 23 | 24 | ``` 25 | uvicorn app.main:app --reload 26 | ``` 27 | 28 | 29 | ## Sample CURL Commands 30 | 31 | 32 | ``` 33 | curl -X POST "http://localhost:8000/extract-pii" \ 34 | -H "Content-Type: application/json" \ 35 | -d '{"content": "My name is John Doe and my email is john.doe@example.com. My phone number is 123-456-7890."}' 36 | ``` 37 | 38 | ## Contributing 39 | 40 | Contributions to the DataFog Instructor SDK are welcome! Please feel free to submit a Pull Request. 41 | 42 | ## License 43 | 44 | This project is licensed under the MIT License. 45 | 46 | ## Support 47 | 48 | If you encounter any problems or have any questions, please open an issue on the GitHub repository or join our Discord community at https://discord.gg/bzDth394R4. 49 | 50 | ## Acknowledgements 51 | 52 | - Logfire: https://logfire.pydantic.dev 53 | - Pydantic: https://pydantic.dev 54 | - Instructor: https://github.com/jxnl/instructor 55 | 56 | ## Links 57 | 58 | - Homepage: https://datafog.ai 59 | - Documentation: https://docs.datafog.ai 60 | - Twitter: https://twitter.com/datafoginc 61 | - GitHub: https://github.com/datafog/datafog-instructor 62 | ``` 63 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/app/__init__.py -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from fastapi import FastAPI, HTTPException, Request 3 | from openai import AsyncOpenAI 4 | import instructor 5 | import logfire 6 | import asyncio 7 | from collections.abc import Iterable 8 | from fastapi.responses import StreamingResponse 9 | from dotenv import load_dotenv 10 | import os 11 | from typing import List, Dict 12 | from sqlalchemy import create_engine 13 | from sqlmodel import SQLModel, Field 14 | from typing import Optional 15 | import pandas as pd 16 | from sqlmodel import SQLModel, Field 17 | from typing import Tuple 18 | import traceback 19 | 20 | class DetectedPII(BaseModel): 21 | """ 22 | Detected PII data from a document 23 | """ 24 | index: int 25 | data_type: str 26 | pii_value: str 27 | 28 | 29 | class PIIDetectionFlow(BaseModel): 30 | """ 31 | Extracted PII data from a document, all data_types should try to have consistent property names 32 | """ 33 | 34 | detected_pii: list[DetectedPII] = Field(..., min_items=1) 35 | 36 | def __init__(self, **data): 37 | super().__init__(**data) 38 | self._validate_detected_pii() 39 | 40 | def _validate_detected_pii(self): 41 | """ 42 | Validates the detected_pii list to ensure it meets the required conditions. 43 | 44 | Preconditions: 45 | - detected_pii is a non-empty list 46 | 47 | Postconditions: 48 | - All items in detected_pii are of type DetectedPII 49 | - All DetectedPII objects have unique indices 50 | 51 | Raises: 52 | ValueError: If any of the conditions are not met 53 | """ 54 | if not self.detected_pii: 55 | raise ValueError("detected_pii list must not be empty") 56 | 57 | indices = set() 58 | for item in self.detected_pii: 59 | if not isinstance(item, DetectedPII): 60 | raise ValueError(f"All items in detected_pii must be of type DetectedPII. Found: {type(item)}") 61 | if item.index in indices: 62 | raise ValueError(f"Duplicate index found in detected_pii: {item.index}") 63 | indices.add(item.index) 64 | 65 | logfire.info("PIIDetectionFlow initialized", 66 | pii_count=len(self.detected_pii), 67 | unique_indices=len(indices)) 68 | 69 | def __len__(self): 70 | return len(self.detected_pii) 71 | 72 | def __iter__(self): 73 | return iter(self.detected_pii) 74 | 75 | def __getitem__(self, index): 76 | return self.detected_pii[index] 77 | 78 | def redact_pii(self, content): 79 | """ 80 | Iterates over the private data and replaces the value with a placeholder in the form of 81 | <{data_type}_{i}> 82 | 83 | Preconditions: 84 | - self.detected_pii is a non-empty list of DetectedPII objects 85 | - content is a non-empty string 86 | 87 | Postconditions: 88 | - Returns a string with all PII values replaced by placeholders 89 | - The number of replacements made is equal to the length of self.detected_pii 90 | - The returned string is not empty 91 | 92 | Invariants: 93 | - The structure of the content string is preserved, only PII values are replaced 94 | """ 95 | 96 | # Preconditions 97 | assert self.detected_pii, "detected_pii list must not be empty" 98 | assert content, "content must not be empty" 99 | 100 | original_content = content 101 | replacement_count = 0 102 | 103 | for i, data in enumerate(self.detected_pii): 104 | new_content = content.replace(data.pii_value, f"<{data.data_type}_{i}>") 105 | if new_content != content: 106 | replacement_count += 1 107 | content = new_content 108 | logfire.info(f"Replaced PII: {data.data_type}", index=i, data_type=data.data_type) 109 | 110 | # Postconditions 111 | assert replacement_count == len(self.detected_pii), "Number of replacements should match detected PII count" 112 | assert content, "Redacted content must not be empty" 113 | assert len(content) >= len(original_content), "Redacted content should not be shorter than original" 114 | 115 | logfire.info("PII redaction completed", 116 | original_length=len(original_content), 117 | redacted_length=len(content), 118 | replacements_made=replacement_count) 119 | 120 | return content 121 | 122 | app = FastAPI() 123 | 124 | # ----------------------- Middleware Configuration ----------------------- 125 | # Configure logfire before adding any middleware to ensure they are set up correctly 126 | logfire.configure(pydantic_plugin=logfire.PydanticPlugin(record="all")) 127 | 128 | # Instrument FastAPI with logfire middleware 129 | logfire.instrument_fastapi(app) 130 | # ----------------------- End Middleware Configuration -------------------- 131 | 132 | # Global variables to be initialized on startup 133 | openai_client: Optional[AsyncOpenAI] = None 134 | client: Optional[instructor.from_openai] = None 135 | 136 | @app.on_event("startup") 137 | async def initialize_app(): 138 | """ 139 | Startup event to initialize and validate environment variables and clients. 140 | 141 | Preconditions: 142 | - Environment variables like OPENAI_API_KEY must be set. 143 | 144 | Postconditions: 145 | - openai_client is initialized and ready to use. 146 | """ 147 | load_dotenv() 148 | global OPENAI_API_KEY, openai_client, client 149 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 150 | 151 | if not OPENAI_API_KEY: 152 | logfire.error("OPENAI_API_KEY is not set in the environment") 153 | raise EnvironmentError("OPENAI_API_KEY must be set in the environment variables") 154 | 155 | openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY) 156 | 157 | assert isinstance(app, FastAPI), "app must be an instance of FastAPI" 158 | assert isinstance(openai_client, AsyncOpenAI), "openai_client must be an instance of AsyncOpenAI" 159 | 160 | logfire.info("Application initialized", 161 | api_key_set=bool(OPENAI_API_KEY), 162 | app_type=type(app).__name__, 163 | client_type=type(openai_client).__name__) 164 | 165 | logfire.instrument_openai(openai_client) 166 | client = instructor.from_openai(openai_client) 167 | 168 | @app.on_event("shutdown") 169 | async def shutdown_event(): 170 | """ 171 | Shutdown event to perform cleanup tasks. 172 | """ 173 | if openai_client: 174 | await openai_client.aclose() 175 | logfire.info("Application shutdown complete.") 176 | 177 | @app.post("/extract-pii") 178 | async def extract_pii(request: Request) -> PIIDetectionFlow: 179 | """ 180 | Extracts PII data from a document 181 | 182 | Preconditions: 183 | - content is a non-empty string in the request body 184 | 185 | Postconditions: 186 | - Returns a PIIDetectionFlow object 187 | - The returned object contains a non-empty list of DetectedPII objects in its detected_pii attribute 188 | 189 | Invariants: 190 | - The original content is not modified 191 | - The API call to OpenAI is made with the correct model and response_model 192 | """ 193 | # Preconditions 194 | body = await request.json() 195 | assert isinstance(body, dict), "Request body must be a valid JSON object" 196 | 197 | content = body.get("content") 198 | 199 | # Invariant: content should not be modified after extraction 200 | original_content = content 201 | 202 | if not content or not isinstance(content, str) or content.strip() == "": 203 | logfire.error("Invalid or missing content in request", body=body) 204 | raise HTTPException(status_code=422, detail="Content is required and must be a non-empty string") 205 | 206 | # Postconditions 207 | assert content == original_content, "Content should not be modified during extraction" 208 | assert isinstance(content, str) and content.strip() != "", "Content must be a non-empty string" 209 | 210 | logfire.info("Content extracted from request", content_length=len(content)) 211 | 212 | # Preconditions 213 | assert isinstance(content, str) and content.strip(), "Content must be a non-empty string" 214 | assert OPENAI_API_KEY, "OPENAI_API_KEY must be set" 215 | 216 | try: 217 | logfire.info("Initiating PII extraction", content_length=len(content)) 218 | 219 | response = await client.chat.completions.create( 220 | model="gpt-4o", 221 | messages=[{"role": "user", "content": content}], 222 | response_model=PIIDetectionFlow, 223 | ) 224 | 225 | # Postconditions 226 | assert isinstance(response, PIIDetectionFlow), "Response must be a PIIDetectionFlow object" 227 | assert response.detected_pii, "Extracted PII data must not be empty" 228 | 229 | logfire.info("Successfully extracted PII data", 230 | pii_count=len(response.detected_pii), 231 | pii_types=[pii.data_type for pii in response.detected_pii]) 232 | 233 | # Invariant 234 | assert content == original_content, "Original content must not be modified during extraction" 235 | 236 | return response 237 | except ValueError as ve: 238 | logfire.warning("No PII data extracted", error=str(ve)) 239 | raise HTTPException(status_code=204, detail="No PII data found in the content") 240 | except AssertionError as ae: 241 | logfire.error("Assertion failed during PII extraction", error=str(ae)) 242 | raise HTTPException(status_code=500, detail=str(ae)) 243 | except Exception as e: 244 | logfire.error("Unexpected error during PII extraction", 245 | error=str(e), 246 | error_type=type(e).__name__, 247 | traceback=traceback.format_exc()) 248 | raise HTTPException(status_code=500, detail="Failed to extract PII data") 249 | 250 | if __name__ == "__main__": 251 | import uvicorn 252 | 253 | uvicorn.run(app, host="0.0.0.0", port=8000) -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: no 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "DataFog Instructor" 10 | copyright = "2024, DataFog Inc." 11 | author = "Sid Mohan" 12 | release = "v0.1.0" 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.napoleon"] 18 | 19 | templates_path = ["_templates"] 20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 21 | 22 | 23 | # -- Options for HTML output ------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 25 | 26 | html_theme = "alabaster" 27 | html_static_path = ["_static"] 28 | 29 | autosummary_generate = True 30 | napoleon_use_rtype = False 31 | napoleon_use_ivar = False 32 | napoleon_use_param = False 33 | -------------------------------------------------------------------------------- /env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | GROQ_API_KEY= 3 | LOGFIRE_TOKEN= -------------------------------------------------------------------------------- /public/common-commands.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/public/common-commands.mov -------------------------------------------------------------------------------- /public/help-menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/public/help-menu.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "datafog-instructor" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Sid Mohan "] 6 | readme = "README.md" 7 | packages = [ 8 | { include = "app"}, 9 | ] 10 | 11 | [tool.poetry.dependencies] 12 | python = ">=3.12,<3.13" 13 | transformers = "^4.44.2" 14 | pandas = "^2.2.2" 15 | sqlmodel = "^0.0.22" 16 | sentence-transformers = "^3.1.0" 17 | openai = "^1.45.0" 18 | pydantic = "2.9.1" 19 | instructor = "^1.4.2" 20 | python-dotenv = "^1.0.1" 21 | nest-asyncio = "^1.6.0" 22 | typer = "^0.12.5" 23 | anthropic = "^0.34.2" 24 | fastapi = "^0.115.0" 25 | logfire = {extras = ["fastapi"], version = "^0.53.0"} 26 | uvicorn = {extras = ["standard"], version = "^0.30.6"} 27 | pytest = "^8.3.3" 28 | 29 | 30 | [tool.poetry.scripts] 31 | start = "uvicorn app.main:app --reload" 32 | 33 | [build-system] 34 | requires = ["poetry-core"] 35 | build-backend = "poetry.core.masonry.api" 36 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataFog/datafog-instructor/5ee1788f1dc6daf692a8db8e31dd17940fa7f7d0/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | from app.main import app, PIIDataExtraction, Entity 4 | 5 | client = TestClient(app) 6 | 7 | @pytest.mark.asyncio 8 | async def test_extract_pii(): 9 | content = "John Doe's SSN is 123-45-6789 and his email is john@example.com" 10 | response = client.post("/extract-pii", json={"content": content}) 11 | assert response.status_code == 200 12 | pii_data = PIIDataExtraction(**response.json()) 13 | assert len(pii_data.private_data) > 0 14 | assert any(data.data_type == "SSN" for data in pii_data.private_data) 15 | assert any(data.data_type == "EMAIL" for data in pii_data.private_data) 16 | 17 | @pytest.mark.asyncio 18 | async def test_extract_pii_stream(): 19 | content = "Jane Doe's phone number is (555) 123-4567" 20 | response = client.post("/extract-pii-stream", json={"content": content}) 21 | assert response.status_code == 200 22 | # Note: Streaming response testing might require additional setup 23 | 24 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py310 3 | isolated_build = True 4 | 5 | [testenv] 6 | deps = 7 | pytest==7.4.0 8 | pytest-asyncio==0.21.0 9 | pytest-cov 10 | -r requirements-dev.txt 11 | extras = all 12 | allowlist_externals = 13 | pip 14 | commands = 15 | pip install --no-cache-dir -r requirements-dev.txt 16 | pytest {posargs} -v -s --cov=datafog --cov-report=term-missing 17 | 18 | [testenv:lint] 19 | skip_install = true 20 | deps = 21 | black 22 | isort 23 | flake8 24 | commands = 25 | black --check . 26 | isort --check-only . 27 | flake8 . 28 | 29 | [testenv:typecheck] 30 | deps = 31 | mypy 32 | commands = 33 | mypy datafog tests 34 | 35 | [pytest] 36 | asyncio_mode = auto --------------------------------------------------------------------------------