├── anonLLM ├── __init__.py ├── deanonymizer.py ├── anonymizer.py └── llm.py ├── pyproject.toml ├── .gitignore ├── .github └── workflows │ └── python-publish.yml ├── setup.py ├── tests ├── test_anonymizer.py ├── test_openai_language_model.py └── test_deanonymizer.py └── README.md /anonLLM/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /anonLLM/deanonymizer.py: -------------------------------------------------------------------------------- 1 | class Deanonymizer: 2 | def deanonymize(self, text, mappings): 3 | # Loop through each pattern mapping and replace the anonymized values 4 | # back to the original ones 5 | for _, pattern_map in mappings.items(): 6 | for original_value, fake_value in pattern_map.items(): 7 | text = text.replace(fake_value, original_value) 8 | return text 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *.pyc 5 | 6 | # Distribution / packaging 7 | .Python 8 | dist/ 9 | build/ 10 | *.egg-info/ 11 | 12 | # Virtual environment 13 | venv/ 14 | *.venv 15 | .venv/ 16 | 17 | # Environment variables 18 | .env 19 | 20 | # PyCharm and Visual Studio Code directories 21 | .idea/ 22 | .vscode/ 23 | 24 | # OS generated files 25 | .DS_Store 26 | Thumbs.db 27 | 28 | # Experimental folders and files 29 | exp_* 30 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Check out repository 16 | uses: actions/checkout@v3 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v3 20 | with: 21 | python-version: '3.11' # Specify your Python version here 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install build twine # Explicitly install Twine for clarity 27 | 28 | - name: Build package 29 | run: python -m build 30 | 31 | - name: Verify package with Twine 32 | run: twine check dist/* # This step will check your package for common issues 33 | 34 | - name: Publish package to PyPI 35 | uses: pypa/gh-action-pypi-publish@v1.4.2 # Use a tagged version for better stability 36 | with: 37 | user: __token__ 38 | password: ${{ secrets.PYPI_API_TOKEN }} 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="anonLLM", 8 | version="0.1.10", 9 | author="FS Ndzomga", 10 | author_email="ndzomgafs@gmail.com", 11 | description="Anonymizes personally identifiable information for " 12 | "Large Language Model APIs", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/fsndzomga/anonLLM", 16 | project_urls={ 17 | "Bug Tracker": "https://github.com/fsndzomga/anonLLM/issues" 18 | }, 19 | classifiers=[ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: MIT License", 22 | "Operating System :: OS Independent", 23 | ], 24 | packages=find_packages(), 25 | python_requires=">=3.6", 26 | install_requires=[ 27 | "Faker", 28 | "openai", 29 | "pydantic>=2", 30 | "spacy>=3", 31 | ], 32 | extras_require={ 33 | "dev": ["pytest", "flake8"], 34 | }, 35 | ) 36 | -------------------------------------------------------------------------------- /tests/test_anonymizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from anonLLM.anonymizer import Anonymizer 3 | import re 4 | 5 | 6 | class TestAnonymizer(unittest.TestCase): 7 | 8 | def setUp(self): 9 | self.anonymizer = Anonymizer() 10 | 11 | def extract_info(self, text): 12 | name_pattern = r"My name is ([\w\s]+)," 13 | email_pattern = r"email: ([\w\.-]+@[\w\.-]+)," 14 | phone_pattern = r"phone: ([\+\d\s\-]+)\." 15 | 16 | name = re.search(name_pattern, text).group(1) 17 | email = re.search(email_pattern, text).group(1) 18 | phone = re.search(phone_pattern, text).group(1) 19 | 20 | return name, email, phone 21 | 22 | def test_anonymize_data(self): 23 | # Test with 10 different examples that include name, email, and phone numbers # noqa 24 | test_examples = [ 25 | "My name is Alice Ela Johnson, email: alice.johnson@example.com, " 26 | "phone: +1 234-567-8910.", 27 | 28 | "My name is Bob Smith, email: bob_smith@example.com, " 29 | "phone: +33 1 23 45 67 89.", 30 | 31 | "My name is Charlie Brown, email: charlie.brown@example.com, " 32 | "phone: +237 6 1234 5678.", 33 | 34 | "My name is David Wang, email: david.wang@example.com, " 35 | "phone: +1 987-654-3210.", 36 | 37 | "My name is Eve Adams, email: eve.adams@example.com, " 38 | "phone: +33 9 87 65 43 21.", 39 | 40 | "My name is Frank Lee, email: frank.lee@example.com, " 41 | "phone: +237 6 8765 4321.", 42 | 43 | "My name is Grace Kim, email: grace.kim@example.com, " 44 | "phone: +1 555-444-3333.", 45 | 46 | "My name is Harry Johnson, email: harry.johnson@example.com, " 47 | "phone: +33 4 56 78 90 12.", 48 | 49 | "My name is Irene Williams, email: irene.williams@example.com, " 50 | "phone: +237 6 7890 1234.", 51 | 52 | "My name is John Doe, email: john.doe@example.com, " 53 | "phone: +1 111-222-3333." 54 | ] 55 | 56 | for example in test_examples: 57 | anonymized_text, _ = self.anonymizer.anonymize_data(example) 58 | 59 | name, email, phone = self.extract_info(example) 60 | 61 | self.assertNotIn(name, anonymized_text) 62 | self.assertNotIn(email, anonymized_text) 63 | self.assertNotIn(phone, anonymized_text) 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /tests/test_openai_language_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pydantic import BaseModel 4 | 5 | from anonLLM.llm import OpenaiLanguageModel 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | 11 | class TestOpenaiLanguageModel(unittest.TestCase): 12 | def setUp(self): 13 | self.llm = OpenaiLanguageModel() 14 | 15 | def test_generate(self): 16 | prompt = ( 17 | "My name is Alice Johnson, " 18 | "email: alice.johnson@example.com, " 19 | "phone: +1 234-567-8910. " 20 | "Write an imaginary cover letter for me " 21 | "as a machine learning engineer." 22 | ) 23 | response = self.llm.generate(prompt) 24 | 25 | self.assertIsNotNone(response) 26 | self.assertNotEqual("", response.strip()) 27 | self.assertIn("Alice Johnson", response) 28 | 29 | def test_generate_with_model_output(self): 30 | class Output(BaseModel): 31 | person: str 32 | city: str 33 | 34 | prompt = 'Extract the requested information from the following sentence: "Alice Johnson is visiting Rome."' 35 | response = self.llm.generate(prompt, output_format=Output) 36 | self.assertTrue(isinstance(response, Output)) 37 | self.assertIn("Alice Johnson", response.person) 38 | self.assertIn("Rome", response.city) 39 | 40 | def test_generate_n_completions(self): 41 | prompt = ( 42 | "What is the user's favorite color in the following expression?" 43 | "\nAlice Johnson's favorite color is blue" 44 | ) 45 | responses = self.llm.generate(prompt, n_completions=2) 46 | self.assertEqual(len(responses), 2) 47 | for response in responses: 48 | self.assertIsNotNone(response) 49 | self.assertIn("blue", response.lower()) 50 | 51 | def test_generate_n_completions_with_model_output(self): 52 | class Output(BaseModel): 53 | person: str 54 | city: str 55 | 56 | prompt = 'Extract the requested information from the following sentence: "Alice Johnson is visiting Rome."' 57 | responses = self.llm.generate(prompt, output_format=Output, n_completions=2) 58 | self.assertEqual(len(responses), 2) 59 | for response in responses: 60 | self.assertTrue(isinstance(response, Output)) 61 | self.assertIn("Alice Johnson", response.person) 62 | self.assertIn("Rome", response.city) 63 | 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # anonLLM: Anonymize Personally Identifiable Information (PII) for Large Language Model APIs 2 | 3 | ![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg) 4 | 5 | anonLLM is a Python package designed to anonymize personally identifiable information (PII) in text data before it's sent to Language Model APIs like GPT-3. The goal is to protect user privacy by ensuring that sensitive data such as names, email addresses, and phone numbers are anonymized. 6 | 7 | # Features 8 | 9 | Anonymize names 10 | Anonymize email addresses 11 | Anonymize phone numbers 12 | Support for multiple country-specific phone number formats 13 | Reversible anonymization (de-anonymization) 14 | Installation 15 | 16 | To install anonLLM, run: 17 | 18 | ```bash 19 | pip install anonLLM 20 | ``` 21 | 22 | # Quick Start 23 | 24 | Here's how to get started with anonLLM: 25 | 26 | ```python 27 | from anonLLM.llm import OpenaiLanguageModel 28 | from dotenv import load_dotenv 29 | 30 | load_dotenv() 31 | 32 | # Anonymize a text 33 | text = "Write a CV for me: My name is Alice Johnson, "\ 34 | "email: alice.johnson@example.com, phone: +1 234-567-8910."\ 35 | "I am a machine learning engineer." 36 | 37 | # Anonymization is handled under the hood 38 | llm = OpenaiLanguageModel() 39 | 40 | response = llm.generate(text) 41 | 42 | print(response) 43 | ``` 44 | In this example, the response will contain the correct name provided. 45 | At the same time, no PII will be sent to OpenAI. 46 | 47 | You can also use anonLLM to generate structured outputs in a JSON format. 48 | You just have to define a pydantic model for your output, and use the output_format argument like this: 49 | 50 | ```python 51 | from pydantic import BaseModel 52 | from anonLLM.llm import OpenaiLanguageModel 53 | from dotenv import load_dotenv 54 | 55 | load_dotenv() 56 | 57 | llm = OpenaiLanguageModel(anonymize=False, temperature=1) 58 | 59 | 60 | class Person(BaseModel): 61 | name: str 62 | sex: str 63 | age: int 64 | email: str 65 | 66 | 67 | response = llm.generate( 68 | prompt="Generate a person", 69 | output_format=Person 70 | ) 71 | 72 | print(response) 73 | 74 | # Returns: {'name': 'Alex', 'sex': 'Male', 'age': 32, 'email': 'alex@example.com'} 75 | 76 | ``` 77 | 78 | 79 | 80 | # Contributing 81 | 82 | We welcome contributions! 83 | 84 | # License 85 | 86 | This project is licensed under the MIT License. 87 | 88 | ## Star History 89 | 90 | [![Star History Chart](https://api.star-history.com/svg?repos=fsndzomga/anonLLM&type=Date)](https://star-history.com/#fsndzomga/anonLLM&Date) 91 | 92 | -------------------------------------------------------------------------------- /tests/test_deanonymizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from anonLLM.deanonymizer import Deanonymizer 3 | from anonLLM.anonymizer import Anonymizer 4 | import re 5 | 6 | 7 | class TestDeanonymizer(unittest.TestCase): 8 | def setUp(self): 9 | self.deanonymizer = Deanonymizer() 10 | self.anonymizer = Anonymizer() 11 | 12 | def extract_info(self, text): 13 | name_pattern = r"My name is ([\w\s]+)," 14 | email_pattern = r"email: ([\w\.-]+@[\w\.-]+)," 15 | phone_pattern = r"phone: ([\+\d\s\-]+)\." 16 | 17 | name = re.search(name_pattern, text).group(1) 18 | email = re.search(email_pattern, text).group(1) 19 | phone = re.search(phone_pattern, text).group(1) 20 | 21 | return name, email, phone 22 | 23 | def test_deanonymize(self): 24 | # Test with 10 different examples that include name, email, and phone numbers # noqa 25 | test_examples = [ 26 | "My name is Alice Johnson, email: alice.johnson@example.com, " 27 | "phone: +1 234-567-8910.", 28 | 29 | "My name is Bob Smith, email: bob_smith@example.com, " 30 | "phone: +33 1 23 45 67 89.", 31 | 32 | "My name is Charlie Brown, email: charlie.brown@example.com, " 33 | "phone: +237 6 1234 5678.", 34 | 35 | "My name is David Wang, email: david.wang@example.com, " 36 | "phone: +1 987-654-3210.", 37 | 38 | "My name is Eve Adams, email: eve.adams@example.com, " 39 | "phone: +33 9 87 65 43 21.", 40 | 41 | "My name is Frank Lee, email: frank.lee@example.com, " 42 | "phone: +237 6 8765 4321.", 43 | 44 | "My name is Grace Kim, email: grace.kim@example.com, " 45 | "phone: +1 555-444-3333.", 46 | 47 | "My name is Harry Johnson, email: harry.johnson@example.com, " 48 | "phone: +33 4 56 78 90 12.", 49 | 50 | "My name is Irene Williams, email: irene.williams@example.com, " 51 | "phone: +237 6 7890 1234.", 52 | 53 | "My name is John Doe, email: john.doe@example.com, " 54 | "phone: +1 111-222-3333." 55 | ] 56 | 57 | for example in test_examples: 58 | anonymized_text, mapping = self.anonymizer.anonymize_data(example) 59 | deanonymized_text = self.deanonymizer.deanonymize(anonymized_text, 60 | mapping) 61 | 62 | name, email, phone = self.extract_info(example) 63 | 64 | self.assertIn(name, deanonymized_text) 65 | self.assertIn(email, deanonymized_text) 66 | self.assertIn(phone, deanonymized_text) 67 | 68 | 69 | if __name__ == "__main__": 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /anonLLM/anonymizer.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | import re 3 | from faker import Faker 4 | import spacy 5 | 6 | 7 | class Anonymizer: 8 | def __init__(self, custom_patterns=None, keep_default=True): 9 | self.fake = Faker() 10 | self.load_spacy_model() 11 | 12 | # Extract valid labels from the NLP model 13 | self.valid_labels = set(label for label 14 | in self.nlp.get_pipe("ner").labels) 15 | 16 | # Define default regex patterns 17 | person_pattern = "PERSON" 18 | phone_pattern = r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}" # noqa 19 | email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' 20 | credit_card_pattern = r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}' 21 | address_pattern = r'\d{1,5}\s\w+(\s\w+)*,\s\w+,\s\w+(\s\w+)*' 22 | 23 | # Initialize pattern functions 24 | self.pattern_functions = [] 25 | 26 | # Keep default patterns if specified 27 | if keep_default or custom_patterns is None: 28 | self.pattern_functions.extend([ 29 | self.create_anonymize_function(person_pattern, 30 | self.fake.name), 31 | self.create_anonymize_function(phone_pattern, 32 | self.fake.phone_number), 33 | self.create_anonymize_function(email_pattern, 34 | self.fake.email), 35 | self.create_anonymize_function(credit_card_pattern, 36 | self.fake.credit_card_number), 37 | self.create_anonymize_function(address_pattern, 38 | self.fake.address), 39 | ]) 40 | 41 | # Add any custom patterns 42 | if custom_patterns: 43 | for pattern, func in custom_patterns.items(): 44 | self.pattern_functions.append( 45 | self.create_anonymize_function(pattern, func)) 46 | 47 | def load_spacy_model(self): 48 | try: 49 | self.nlp = spacy.load("en_core_web_sm") 50 | except: # noqa 51 | print("Model not found. Downloading en_core_web_sm...") 52 | spacy.cli.download("en_core_web_sm") 53 | self.nlp = spacy.load("en_core_web_sm") 54 | 55 | def generate_unique_fake(self, original, generator_func): 56 | fake_value = generator_func() 57 | while fake_value == original: 58 | fake_value = generator_func() 59 | return fake_value 60 | 61 | def create_anonymize_function(self, pattern, fake_func): 62 | if pattern not in self.valid_labels: 63 | try: 64 | re.compile(pattern) 65 | except re.error: 66 | raise ValueError(f"Invalid pattern: {pattern}. Must be a spaCy label or a regular expression.") # noqa 67 | 68 | def anonymize_func(sentence, anon_sentence, mappings): 69 | data_map = {} 70 | if pattern in self.valid_labels: 71 | doc = self.nlp(sentence) 72 | for ent in doc.ents: 73 | if ent.label_ == pattern: 74 | fake_data = self.generate_unique_fake(ent.text, 75 | fake_func) 76 | data_map[ent.text] = fake_data 77 | anon_sentence = anon_sentence.replace(ent.text, 78 | fake_data) 79 | else: 80 | for data in re.findall(pattern, sentence): 81 | fake_data = self.generate_unique_fake(data, fake_func) 82 | data_map[data] = fake_data 83 | anon_sentence = anon_sentence.replace(data, fake_data) 84 | mappings[pattern] = data_map 85 | return anon_sentence 86 | return anonymize_func 87 | 88 | def anonymize_data(self, sentence): 89 | anon_sentence = sentence 90 | mappings = {} 91 | for pattern_function in self.pattern_functions: 92 | anon_sentence = pattern_function(sentence, anon_sentence, mappings) 93 | return anon_sentence, mappings 94 | 95 | def anonymize(self, *args_to_anonymize): 96 | def inner_decorator(func): 97 | @wraps(func) 98 | def wrapper(*args, **kwargs): 99 | for arg_name in args_to_anonymize: 100 | if arg_name in kwargs: 101 | anonymized_data, _ = self.anonymize_data( 102 | kwargs[arg_name]) 103 | kwargs[arg_name] = anonymized_data 104 | return func(*args, **kwargs) 105 | return wrapper 106 | return inner_decorator 107 | -------------------------------------------------------------------------------- /anonLLM/llm.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from openai import OpenAI 3 | import time 4 | import json 5 | from pydantic import BaseModel, ValidationError, ConfigDict 6 | from typing import Type, Optional 7 | import os 8 | from anonLLM.anonymizer import Anonymizer 9 | from anonLLM.deanonymizer import Deanonymizer 10 | 11 | 12 | class OpenaiLanguageModel: 13 | def __init__(self, api_key=None, model="gpt-3.5-turbo", temperature=0.5, anonymize=True): 14 | self.anonymize = anonymize 15 | 16 | if self.anonymize: 17 | self.anonymizer = Anonymizer() 18 | self.deanonymizer = Deanonymizer() 19 | 20 | if api_key is None: 21 | api_key = os.environ.get('OPENAI_API_KEY') 22 | 23 | if api_key is None: 24 | raise ValueError("The OPENAI API KEY must be provided either as " 25 | "an argument or as an environment variable named 'OPENAI_API_KEY'") # noqa 26 | 27 | self.api_key = api_key 28 | self.model = model 29 | self.temperature = temperature 30 | self.client = OpenAI(api_key=self.api_key) 31 | 32 | 33 | def generate(self, prompt: str, output_format: Optional[Type[BaseModel]] = None, 34 | n_completions: int = 1, max_tokens: int = None): 35 | anonymized_prompt, mappings = (self.anonymizer.anonymize_data(prompt) 36 | if self.anonymize else (prompt, None)) 37 | 38 | retry_delay = 0.1 39 | valid_responses = [] 40 | 41 | while len(valid_responses) < n_completions: 42 | try: 43 | system_message = "You are a helpful assistant." 44 | if output_format: 45 | system_message += f" Respond in a JSON format that contains the following keys: {self._model_structure_repr(output_format)}" # noqa 46 | 47 | params = { 48 | "model": self.model, 49 | "messages": [ 50 | { 51 | "role": "system", 52 | "content": system_message 53 | }, 54 | { 55 | "role": "user", 56 | "content": anonymized_prompt 57 | } 58 | ], 59 | "temperature": self.temperature, 60 | "n": n_completions 61 | } 62 | 63 | if max_tokens is not None: 64 | params["max_tokens"] = max_tokens 65 | 66 | response = self.client.chat.completions.create(**params) 67 | choices = response.choices 68 | responses = [choice.message.content 69 | for choice in choices] 70 | 71 | if output_format: 72 | valid_responses.extend( 73 | [json.loads(res) for res in responses 74 | if self._is_valid_json_for_model(res, output_format)] 75 | ) 76 | else: 77 | valid_responses.extend(responses) 78 | 79 | except openai.RateLimitError: 80 | print(f"Hit rate limit. Retrying in {retry_delay} seconds.") 81 | time.sleep(retry_delay) 82 | retry_delay *= 2 83 | except Exception as err: 84 | print(f"Error: {err}") 85 | break 86 | 87 | def _deanonymize(response): 88 | if output_format: 89 | for key, value in response.items(): 90 | response[key] = self.deanonymizer.deanonymize(value, mappings) 91 | return output_format.model_validate(response) 92 | else: 93 | return self.deanonymizer.deanonymize(response, mappings) 94 | 95 | deanonymized_responses = [_deanonymize(res) if self.anonymize else res 96 | for res in valid_responses] 97 | 98 | if n_completions == 1: 99 | # if generating a single completion, return it directly 100 | return deanonymized_responses[0] 101 | return deanonymized_responses 102 | 103 | def _model_structure_repr(self, model: Type[BaseModel]) -> str: 104 | fields = model.__annotations__ 105 | return ', '.join(f'{key}: {value}' for key, value in fields.items()) 106 | 107 | 108 | def _is_valid_json_for_model(self, text: str, model: Type[BaseModel]) -> bool: # noqa 109 | """ 110 | Check if a text is valid JSON and if it respects the provided BaseModel. # noqa 111 | """ 112 | model.model_config = ConfigDict(strict=True) 113 | 114 | try: 115 | parsed_data = json.loads(text) 116 | model(**parsed_data) 117 | return True 118 | except (json.JSONDecodeError, ValidationError): 119 | return False 120 | --------------------------------------------------------------------------------