├── src └── fuzztypes │ ├── py.typed │ ├── utils │ ├── __init__.py │ └── download.py │ ├── integer.py │ ├── ascii.py │ ├── flags.py │ ├── regex.py │ ├── emojis.py │ ├── date.py │ ├── __init__.py │ ├── const.py │ ├── match.py │ ├── language.py │ ├── person.py │ ├── validation.py │ ├── storage.py │ ├── in_memory.py │ ├── entity.py │ ├── lazy.py │ └── on_disk.py ├── activate.sh ├── tests ├── data │ ├── emojis.csv │ ├── myths.tsv │ ├── emotions.txt │ ├── mixed.jsonl │ └── simonw_tags.csv ├── test_emoji.py ├── test_integer.py ├── test_ascii.py ├── on_disk │ ├── test_on_disk_semantic.py │ ├── test_on_disk_fuzz.py │ ├── test_on_disk_name.py │ └── test_on_disk_alias.py ├── in_memory │ ├── test_in_memory_similarity.py │ ├── test_in_memory_name.py │ ├── test_in_memory_alias.py │ ├── test_in_memory_tags_example.py │ └── test_in_memory_fuzz.py ├── conftest.py ├── test_language.py ├── test_date.py ├── test_entity.py ├── test_regex.py ├── test_person.py ├── utils │ └── test_download.py └── test_readme.py ├── requirements.txt ├── .gitignore ├── LICENSE ├── CHANGELOG.md ├── Makefile ├── pyproject.toml ├── requirements-dev.txt └── README.md /src/fuzztypes/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /activate.sh: -------------------------------------------------------------------------------- 1 | # activate python 2 | source .venv/bin/activate 3 | -------------------------------------------------------------------------------- /tests/data/emojis.csv: -------------------------------------------------------------------------------- 1 | value,aliases,priority 2 | happy,😀,1 3 | sad,😢,1 4 | celebrate,🎉|🎊|🎈,1 5 | party,🎉|🎊|🎈,100 -------------------------------------------------------------------------------- /tests/data/myths.tsv: -------------------------------------------------------------------------------- 1 | value aliases 2 | Odysseus Ulysses 3 | Athena 
Minerva|Pallas 4 | Zeus Jupiter|Jove 5 | Hercules Heracles 6 | Mercury Hermes 7 | -------------------------------------------------------------------------------- /src/fuzztypes/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .download import download_file, get_file 2 | 3 | __all__ = ( 4 | "download_file", 5 | "get_file", 6 | ) 7 | -------------------------------------------------------------------------------- /tests/data/emotions.txt: -------------------------------------------------------------------------------- 1 | Happiness 2 | Sadness 3 | Anger 4 | Fear 5 | Surprise 6 | Disgust 7 | Trust 8 | Anticipation 9 | Love 10 | Joy 11 | Courage 12 | Serenity -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile pyproject.toml -o requirements.txt 3 | annotated-types==0.6.0 4 | # via pydantic 5 | pydantic==2.6.2 6 | pydantic-core==2.16.3 7 | # via pydantic 8 | typing-extensions==4.9.0 9 | # via 10 | # pydantic 11 | # pydantic-core 12 | -------------------------------------------------------------------------------- /tests/data/mixed.jsonl: -------------------------------------------------------------------------------- 1 | {"value": "Dog", "aliases": ["Canine", "Hound"], "label": "animal"} 2 | {"value": "Cat", "aliases": ["Feline", "Kitty"], "label": "animal"} 3 | {"value": "Apple", "aliases": ["Pome"], "label": "fruit"} 4 | {"value": "Banana", "aliases": ["Musa"], "label": "fruit"} 5 | {"value": "Eagle", "aliases": ["Bird of prey"], "label": "animal"} 6 | {"value": "Strawberry", "aliases": ["Fragaria"], "label": "fruit"} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 
.cache 2 | *.egg-info 3 | *.py[oc] 4 | *~ 5 | .*.sw? 6 | .coverage 7 | .idea 8 | .ipynb_checkpoints 9 | .mypy_cache 10 | .netlify 11 | .pytest_cache 12 | .venv 13 | .vscode 14 | Pipfile.lock 15 | __pycache__/ 16 | archive.zip 17 | build/ 18 | coverage.xml 19 | dist/ 20 | docs.zip 21 | docs_build 22 | env 23 | env3.* 24 | htmlcov 25 | log.txt 26 | site 27 | test.db 28 | venv 29 | wheels/ 30 | model_cache/ 31 | .DS_Store 32 | /training/ 33 | profile.dat 34 | notebooks -------------------------------------------------------------------------------- /src/fuzztypes/integer.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Callable, Union 2 | 3 | from fuzztypes import FuzzValidator, lazy 4 | 5 | _tx = None 6 | 7 | 8 | def get_tx() -> Callable: 9 | global _tx 10 | 11 | if _tx is None: 12 | _tx = lazy.lazy_import("number_parser", "parse_ordinal") 13 | 14 | return _tx 15 | 16 | 17 | def to_int(key: Union[int, str]) -> int: 18 | if isinstance(key, int): 19 | val = key 20 | else: 21 | f = _tx or get_tx() 22 | val = f(key) 23 | return val 24 | 25 | 26 | Integer = Annotated[int, FuzzValidator(to_int)] 27 | -------------------------------------------------------------------------------- /tests/test_emoji.py: -------------------------------------------------------------------------------- 1 | from fuzztypes import Emoji, emojis, validate_python 2 | 3 | 4 | def test_key_access(): 5 | assert validate_python(Emoji, "balloon") == "🎈" 6 | assert validate_python(Emoji, ":atm_sign:") == "🏧" 7 | assert validate_python(Emoji, "atm sign") == "🏧" 8 | assert validate_python(Emoji, "atm") == "🏧" 9 | assert validate_python(Emoji, "United States") == "🇺🇸" 10 | 11 | 12 | def test_load_emojis(): 13 | entities = emojis.load_emoji_entities() 14 | assert len(entities) > 2000 15 | assert entities[0].value == "🥇" 16 | assert set(entities[0].aliases) == {"1st place medal", ":1st_place_medal:"} 17 | 
-------------------------------------------------------------------------------- /src/fuzztypes/ascii.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Any, Callable 2 | 3 | from fuzztypes import FuzzValidator, lazy 4 | 5 | _tx = None 6 | 7 | 8 | def get_tx() -> Callable: # pragma: no cover 9 | global _tx 10 | 11 | if _tx is None: 12 | _tx = lazy.lazy_import( 13 | "unidecode", 14 | "unidecode", 15 | return_none_on_error=True, 16 | ) 17 | _tx = _tx or lazy.lazy_import( 18 | "anyascii", 19 | "anyascii", 20 | return_none_on_error=True, 21 | ) 22 | 23 | if _tx is None: 24 | msg = "Failed: `pip install ascii` or `pip install unidecode`" 25 | raise RuntimeError(msg) 26 | 27 | return _tx 28 | 29 | 30 | def to_ascii(key: Any) -> str: 31 | f = _tx or get_tx() 32 | return f(str(key)) 33 | 34 | 35 | ASCII = Annotated[str, FuzzValidator(to_ascii)] 36 | -------------------------------------------------------------------------------- /tests/test_integer.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ValidationError 2 | 3 | from fuzztypes import Integer, validate_python 4 | 5 | 6 | def test_convert_number_to_int(): 7 | assert validate_python(Integer, 3) == 3 8 | assert validate_python(Integer, "three") == 3 9 | assert validate_python(Integer, "third") == 3 10 | assert ( 11 | validate_python(Integer, "nineteen billion and nineteen") 12 | == 19_000_000_019 13 | ) 14 | assert ( 15 | validate_python(Integer, "two million three thousand and nineteen") 16 | == 2_003_019 17 | ) 18 | 19 | 20 | def test_validation_error(): 21 | class MyModel(BaseModel): 22 | num: Integer 23 | 24 | assert MyModel(num="three").num == 3 # type: ignore[arg-type] 25 | 26 | try: 27 | assert MyModel(num="xyz") # type: ignore[arg-type] 28 | assert False, "Didn't fail to parse non-integer." 
29 | except ValidationError: 30 | pass 31 | -------------------------------------------------------------------------------- /tests/test_ascii.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pydantic import BaseModel, TypeAdapter 4 | 5 | from fuzztypes import ASCII 6 | 7 | 8 | def test_ascii_usable_type(): 9 | ta = TypeAdapter(ASCII) 10 | assert ta.validate_python("άνθρωποι") == "anthropoi" 11 | 12 | 13 | def test_transliterate_utf8_to_ascii(): 14 | class MyModel(BaseModel): 15 | ascii: ASCII 16 | 17 | obj = MyModel(ascii="άνθρωποι") 18 | assert obj.ascii == "anthropoi" 19 | 20 | assert MyModel(ascii="kožušček").ascii == "kozuscek" 21 | assert ( 22 | MyModel(ascii="30 \U0001d5c4\U0001d5c6/\U0001d5c1").ascii == "30 km/h" 23 | ) 24 | 25 | # Note: unidecode and anyascii have differences in some situations 26 | allowed = ("kakoi-to tekst", "kakoy-to tekst") # unidecode, anyascii 27 | assert MyModel(ascii="какой-то текст").ascii in allowed 28 | 29 | allowed = ("Bei Jing ", "BeiJing") # unidecode, anyascii 30 | assert MyModel(ascii="\u5317\u4EB0").ascii in allowed 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2024 to Ian Maurer and GenomOncology, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | “Software”), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject 11 | to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or 
substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR 20 | ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 21 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## v0.1.1 (2023-03-25) 2 | 3 | #### Changed 4 | - Fixes to the README regarding validation utility functions. 5 | - Renamed ill-named function to `resolve_entity` and added explicit test. 6 | 7 | 8 | ## v0.1.0 (2023-03-25) 9 | 10 | The project's typing system was validated using mypy and refactored to follow 11 | Annotated types as specified by [PEP 593](https://peps.python.org/pep-0593/). 
12 | 13 | #### Added 14 | - FuzzValidator annotation type created to simplify design 15 | - validate_python and validate_json functions added 16 | - Added Language, LanguageName, and LanguageCode usable types 17 | - fuzztypes.logger and fuzztypes.utils module for downloading iso codes 18 | 19 | #### Changed 20 | - Renamed OnDisk to OnDiskValidator 21 | - Renamed InMemory to InMemoryValidator 22 | - Refactored InMemoryValidator and OnDiskValidator to use FuzzValidator 23 | - Refactored Person to use FuzzValidator 24 | - Renamed Regex to RegexValidator 25 | - Changed error message to more common "did you mean" message format 26 | 27 | #### Removed 28 | - abstract.py module and AbstractType class, simplified by FuzzValidator 29 | - function.py module and Function annotation type, replaced by FuzzValidator -------------------------------------------------------------------------------- /src/fuzztypes/flags.py: -------------------------------------------------------------------------------- 1 | from enum import Flag, auto 2 | 3 | 4 | # What NamedEntity fields does the search key need to match on? 5 | # Does search support fuzzy matching and semantic similarity? 
6 | class SearchFlag(Flag): 7 | NAME_OK = auto() 8 | ALIAS_OK = auto() 9 | FUZZ_OK = auto() 10 | SEMANTIC_OK = auto() 11 | 12 | @property 13 | def is_name_ok(self) -> bool: 14 | return bool(self & SearchFlag.NAME_OK) 15 | 16 | @property 17 | def is_alias_ok(self) -> bool: 18 | return bool(self & SearchFlag.ALIAS_OK) 19 | 20 | @property 21 | def is_fuzz_ok(self) -> bool: 22 | return bool(self & SearchFlag.FUZZ_OK) 23 | 24 | @property 25 | def is_semantic_ok(self) -> bool: 26 | return bool(self & SearchFlag.SEMANTIC_OK) 27 | 28 | @property 29 | def is_fuzz_or_semantic_ok(self): 30 | return self.is_fuzz_ok or self.is_semantic_ok 31 | 32 | @property 33 | def is_hybrid(self): 34 | return self.is_fuzz_ok and self.is_semantic_ok 35 | 36 | 37 | NameSearch = SearchFlag.NAME_OK 38 | AliasSearch = NameSearch | SearchFlag.ALIAS_OK 39 | FuzzSearch = AliasSearch | SearchFlag.FUZZ_OK 40 | SemanticSearch = AliasSearch | SearchFlag.SEMANTIC_OK 41 | HybridSearch = FuzzSearch | SemanticSearch 42 | DefaultSearch = AliasSearch 43 | -------------------------------------------------------------------------------- /src/fuzztypes/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Annotated, Optional 3 | 4 | from . 
import FuzzValidator 5 | 6 | 7 | def RegexValidator( 8 | pattern: str, 9 | examples: Optional[list] = None, 10 | ): 11 | regex = re.compile(pattern) 12 | 13 | def do_regex(key: str) -> str: 14 | matches = regex.findall(key) 15 | if len(matches) == 1: 16 | return matches[0] 17 | elif len(matches) > 1: 18 | raise ValueError( 19 | f"Multiple matches found for pattern '{pattern}' in '{key}'" 20 | ) 21 | else: 22 | raise ValueError( 23 | f"No matches found for pattern '{pattern}' in '{key}'" 24 | ) 25 | 26 | return FuzzValidator(do_regex, examples=examples) 27 | 28 | 29 | Email = Annotated[ 30 | str, 31 | RegexValidator( 32 | r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", 33 | examples=["user@example.com"], 34 | ), 35 | ] 36 | 37 | SSN = Annotated[ 38 | str, 39 | RegexValidator( 40 | r"\b\d{3}-\d{2}-\d{4}\b", 41 | examples=["000-00-0000"], 42 | ), 43 | ] 44 | 45 | ZipCode = Annotated[ 46 | str, 47 | RegexValidator( 48 | r"\b\d{5}(?:-\d{4})?\b", 49 | examples=["12345", "12345-6789"], 50 | ), 51 | ] 52 | -------------------------------------------------------------------------------- /tests/on_disk/test_on_disk_semantic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pydantic import BaseModel 3 | 4 | from fuzztypes import flags, on_disk, Vibemoji, validate_python 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def EmotionStoredValidatorStorage(EmotionSource): 9 | storage = on_disk.StoredValidatorStorage( 10 | "Emotions", EmotionSource, search_flag=flags.SemanticSearch 11 | ) 12 | storage.prepare(force_drop_table=True) 13 | return storage 14 | 15 | 16 | def test_check_storage_directly(EmotionStoredValidatorStorage): 17 | matches = EmotionStoredValidatorStorage.get("happiness") 18 | assert len(matches) == 1 19 | assert matches[0].entity.value == "Happiness" 20 | assert matches[0].score == 100.0 21 | 22 | matches = EmotionStoredValidatorStorage.get("scared") 23 | assert len(matches) == 10 24 | assert 
matches[0].entity.value == "Fear" 25 | assert matches[0].score == pytest.approx(91.23) 26 | 27 | 28 | class MyModel(BaseModel): 29 | emoji: Vibemoji 30 | 31 | 32 | def test_vibemoji_get_value(): 33 | assert validate_python(Vibemoji, "bacon tastes good") == "🥓" 34 | assert validate_python(Vibemoji, "take the bus to school") == "🚌" 35 | assert validate_python(Vibemoji, "jolly santa") == "🎅" 36 | assert validate_python(Vibemoji, "United States") == "🇺🇸" 37 | -------------------------------------------------------------------------------- /tests/on_disk/test_on_disk_fuzz.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tantivy # type: ignore 4 | 5 | from fuzztypes import Fuzzmoji, const, validate_python 6 | 7 | 8 | def test_tantivy(): 9 | # make sure the index is built 10 | assert validate_python(Fuzzmoji, "balloon") == "🎈" 11 | 12 | # standard schema 13 | schema_builder = tantivy.SchemaBuilder() 14 | schema_builder.add_integer_field("doc_id", stored=True) 15 | schema_builder.add_text_field("term", stored=True) 16 | schema = schema_builder.build() 17 | 18 | # create the index 19 | path = os.path.join( 20 | const.StoredValidatorPath, "Fuzzmoji.lance/_indices/tantivy" 21 | ) 22 | index = tantivy.Index(schema, path=path) 23 | searcher = index.searcher() 24 | 25 | # todo: fuzzy field not in current version 26 | # https://github.com/quickwit-oss/tantivy-py/issues/20 27 | # https://docs.rs/tantivy/latest/tantivy/query/struct.FuzzyTermQuery.html 28 | # index.parse_query("thought", fuzzy_fields={"term": (True, 1, False)}) 29 | 30 | # query the index 31 | query = index.parse_query("thought bubble") 32 | result = searcher.search(query, 5) 33 | 34 | # check the results 35 | terms = [] 36 | for score, address in result.hits: 37 | doc = searcher.doc(address) 38 | terms.extend(doc["term"]) 39 | 40 | assert "thought balloon" in terms 41 | assert ":bubble_tea:" in terms 42 | 43 | 44 | def test_fuzzmoji(): 45 | assert 
validate_python(Fuzzmoji, "thought bubble") == "💭" 46 | assert validate_python(Fuzzmoji, "bubble team") == "🧋" 47 | -------------------------------------------------------------------------------- /src/fuzztypes/emojis.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Annotated, List 3 | from pydantic import TypeAdapter 4 | 5 | from fuzztypes import NamedEntity, EntitySource, OnDiskValidator, flags, lazy 6 | 7 | 8 | def load_emoji_entities() -> List[NamedEntity]: 9 | get_aliases_unicode_dict = lazy.lazy_import( 10 | "emoji.unicode_codes", "get_aliases_unicode_dict" 11 | ) 12 | 13 | mapping = defaultdict(list) 14 | emoji_mapping = get_aliases_unicode_dict() 15 | for value, emoji in emoji_mapping.items(): 16 | mapping[emoji].extend([value, value.strip(":").replace("_", " ")]) 17 | 18 | data = ({"value": k, "aliases": v} for k, v in mapping.items()) 19 | return TypeAdapter(List[NamedEntity]).validate_python(data) 20 | 21 | 22 | EmojiSource = EntitySource(load_emoji_entities) 23 | 24 | Emoji = Annotated[ 25 | str, 26 | OnDiskValidator( 27 | "Emoji", 28 | EmojiSource, 29 | search_flag=flags.AliasSearch, 30 | tiebreaker_mode="lesser", 31 | ), 32 | ] 33 | 34 | Fuzzmoji = Annotated[ 35 | str, 36 | OnDiskValidator( 37 | "Fuzzmoji", 38 | EmojiSource, 39 | search_flag=flags.FuzzSearch, 40 | tiebreaker_mode="lesser", 41 | min_similarity=10.0, 42 | device="cpu", 43 | ), 44 | ] 45 | 46 | Vibemoji = Annotated[ 47 | str, 48 | OnDiskValidator( 49 | "Vibemoji", 50 | EmojiSource, 51 | search_flag=flags.SemanticSearch, 52 | tiebreaker_mode="lesser", 53 | min_similarity=10.0, 54 | device="cpu", 55 | ), 56 | ] 57 | -------------------------------------------------------------------------------- /tests/in_memory/test_in_memory_similarity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from fuzztypes import flags 4 | from 
fuzztypes.in_memory import InMemoryValidatorStorage 5 | from fuzztypes.lazy import create_reranker 6 | 7 | 8 | @pytest.fixture(scope="session") 9 | def EmotionMemoryStorage(EmotionSource): 10 | storage = InMemoryValidatorStorage( 11 | EmotionSource, search_flag=flags.SemanticSearch 12 | ) 13 | storage.prepare() 14 | return storage 15 | 16 | 17 | def test_check_storage_directly(EmotionMemoryStorage): 18 | matches = EmotionMemoryStorage.get("happiness") 19 | assert len(matches) == 1 20 | assert matches[0].entity.value == "Happiness" 21 | assert matches[0].score == 100.0 22 | 23 | matches = EmotionMemoryStorage.get("scared") 24 | assert len(matches) == 10 25 | assert matches[0].entity.value == "Fear" 26 | assert matches[0].score == pytest.approx(91.23) 27 | 28 | 29 | def test_reranker_directly_1(EmotionMemoryStorage): 30 | ranker = create_reranker("mixedbread-ai/mxbai-rerank-xsmall-v1") 31 | documents = EmotionMemoryStorage._terms 32 | 33 | results = ranker("afraid", documents, 3) 34 | assert len(results) == 3 35 | assert results[0]["text"] == "fear" 36 | assert results[0]["score"] >= 0.3 37 | 38 | 39 | def test_reranker_directly_2(EmotionMemoryStorage): 40 | ranker = create_reranker("mixedbread-ai/mxbai-rerank-xsmall-v1") 41 | documents = EmotionMemoryStorage._terms 42 | 43 | results = ranker("joyous", sorted(documents), 3) 44 | assert len(results) == 3 45 | assert results[0]["text"] in ("happiness", "joy") 46 | assert results[0]["score"] >= 0.3 47 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from pytest import fixture 4 | 5 | from fuzztypes import EntitySource, NamedEntity 6 | 7 | 8 | @fixture(scope="session") 9 | def data_path() -> Path: 10 | return Path(__file__).parent / "data" 11 | 12 | 13 | @fixture(scope="session") 14 | def EmojiSource(data_path): 15 | source = EntitySource(data_path / 
"emojis.csv") 16 | assert len(source) == 4 17 | return source 18 | 19 | 20 | @fixture(scope="session") 21 | def FruitSource(data_path): 22 | # loading separately from AnimalSource to test lazy loading 23 | MixedSource = EntitySource(data_path / "mixed.jsonl") 24 | assert MixedSource.loaded is False 25 | 26 | FruitSource = MixedSource["fruit"] 27 | assert isinstance(FruitSource, EntitySource) 28 | assert FruitSource.loaded is False 29 | 30 | # first access loads FruitSource -> MixedSource 31 | assert isinstance(FruitSource[0], NamedEntity) 32 | assert FruitSource[0].value == "Apple" 33 | assert FruitSource.loaded is True 34 | assert MixedSource.loaded is True 35 | assert len(FruitSource) == 3 36 | 37 | return FruitSource 38 | 39 | 40 | @fixture(scope="session") 41 | def AnimalSource(data_path): 42 | MixedSource = EntitySource(data_path / "mixed.jsonl") 43 | return MixedSource["animal"] 44 | 45 | 46 | @fixture(scope="session") 47 | def MythSource(data_path): 48 | source = EntitySource(data_path / "myths.tsv") 49 | assert len(source) == 5 50 | return source 51 | 52 | 53 | @fixture(scope="session") 54 | def EmotionSource(data_path): 55 | source = EntitySource(data_path / "emotions.txt") 56 | assert len(source) == 12 57 | return source 58 | -------------------------------------------------------------------------------- /src/fuzztypes/date.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Annotated, Optional, Union 3 | 4 | from . 
import FuzzValidator, const, lazy 5 | 6 | DateOrDatetime = Union[datetime.date, datetime.datetime] 7 | 8 | 9 | def DateValidator( 10 | date_order: Optional[const.DateOrder] = None, 11 | is_date: bool = True, 12 | languages: Optional[list[str]] = None, 13 | timezone: Optional[str] = None, 14 | strict: bool = False, 15 | prefer_future_dates: bool = False, 16 | relative_base: Optional[DateOrDatetime] = None, 17 | ): 18 | DateDataParser = lazy.lazy_import("dateparser.date", "DateDataParser") 19 | languages = languages or ["en"] 20 | 21 | settings = { 22 | "STRICT_PARSING": strict, 23 | "PREFER_DATES_FROM": "future" if prefer_future_dates else "past", 24 | "RETURN_AS_TIMEZONE_AWARE": bool(timezone), 25 | } 26 | if date_order: 27 | settings["DATE_ORDER"] = date_order 28 | if timezone: 29 | settings["TIMEZONE"] = timezone 30 | if relative_base: 31 | settings["RELATIVE_BASE"] = relative_base 32 | 33 | parser = DateDataParser(languages=languages, settings=settings) 34 | 35 | def parse(key: str) -> DateOrDatetime: 36 | value = parser.get_date_data(key).date_obj 37 | value = value.date() if (value and is_date) else value 38 | return value 39 | 40 | return FuzzValidator(parse) 41 | 42 | 43 | def DatetimeValidator( 44 | date_order: Optional[const.DateOrder] = None, 45 | languages: Optional[list[str]] = None, 46 | timezone: Optional[str] = None, 47 | strict: bool = False, 48 | prefer_future_dates: bool = False, 49 | relative_base: Optional[DateOrDatetime] = None, 50 | ): 51 | return DateValidator( 52 | date_order=date_order, 53 | is_date=False, 54 | languages=languages, 55 | timezone=timezone, 56 | strict=strict, 57 | prefer_future_dates=prefer_future_dates, 58 | relative_base=relative_base, 59 | ) 60 | 61 | 62 | Date = Annotated[datetime.date, DateValidator()] 63 | Datetime = Annotated[datetime.datetime, DatetimeValidator()] 64 | -------------------------------------------------------------------------------- /src/fuzztypes/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = "0.1.1" 2 | 3 | # logging 4 | import logging 5 | 6 | logger = logging.getLogger("fuzztypes") 7 | logger.setLevel(logging.WARNING) 8 | 9 | # flags and constants 10 | from . import flags 11 | from . import const 12 | 13 | # utilities 14 | from . import utils 15 | from . import lazy 16 | 17 | # Schema 18 | from .entity import Entity, NamedEntity, EntitySource 19 | from .match import Match, MatchResult, Record 20 | 21 | # Validation 22 | from .validation import ( 23 | FuzzValidator, 24 | resolve_entity, 25 | validate_python, 26 | validate_json, 27 | get_type_adapter, 28 | ) 29 | 30 | # Named Entity Storage 31 | from . import storage 32 | from .in_memory import InMemoryValidator 33 | from .on_disk import OnDiskValidator 34 | 35 | # Base Non-Entity Types 36 | from .regex import RegexValidator 37 | 38 | # Usable Types 39 | from .ascii import ASCII 40 | from .date import Date, DateValidator, Datetime, DatetimeValidator 41 | from .emojis import Emoji, Fuzzmoji, Vibemoji 42 | from .integer import Integer 43 | from .language import ( 44 | Language, 45 | LanguageCode, 46 | LanguageName, 47 | LanguageNamedEntity, 48 | LanguageScope, 49 | LanguageType, 50 | ) 51 | from .person import Person 52 | from .regex import Email, SSN, ZipCode 53 | 54 | 55 | __all__ = ( 56 | "ASCII", 57 | "Date", 58 | "Email", 59 | "Emoji", 60 | "Entity", 61 | "EntitySource", 62 | "Fuzzmoji", 63 | "FuzzValidator", 64 | "InMemoryValidator", 65 | "Integer", 66 | "Language", 67 | "LanguageCode", 68 | "LanguageName", 69 | "LanguageNamedEntity", 70 | "LanguageScope", 71 | "LanguageType", 72 | "Match", 73 | "MatchResult", 74 | "NamedEntity", 75 | "OnDiskValidator", 76 | "Person", 77 | "Record", 78 | "RegexValidator", 79 | "SSN", 80 | "Date", 81 | "DateValidator", 82 | "Datetime", 83 | "DatetimeValidator", 84 | "Vibemoji", 85 | "ZipCode", 86 | "const", 87 | "flags", 88 | "get_type_adapter", 89 | "lazy", 90 | 
"logger", 91 | "utils", 92 | "resolve_entity", 93 | "validate_json", 94 | "validate_python", 95 | ) 96 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ACTIVATE = . ./activate.sh 2 | 3 | format: 4 | $(ACTIVATE) && ruff format src tests 5 | 6 | test: 7 | $(ACTIVATE) && pytest -s tests/ 8 | 9 | cov: 10 | $(ACTIVATE) && coverage run -m pytest -s tests && coverage combine && coverage report --show-missing && coverage html 11 | 12 | sync: 13 | uv pip compile pyproject.toml -o requirements.txt 14 | uv pip compile pyproject.toml --extra test --extra local --extra ext -o requirements-dev.txt 15 | uv pip sync requirements-dev.txt 16 | uv pip install -e ".[dev]" 17 | uv pip freeze 18 | 19 | publish: 20 | # https://packaging.python.org/en/latest/tutorials/packaging-projects/ 21 | $(ACTIVATE) && python -m build 22 | $(ACTIVATE) && python -m twine upload -r pypi dist/* 23 | 24 | perf_test: 25 | $(ACTIVATE) && python -m cProfile -o profile.dat -m pytest -s tests/ 26 | 27 | echo "** Slowest FuzzTypes functions by total time:" 28 | $(ACTIVATE) && python -c "import pstats; pstats.Stats('profile.dat').sort_stats('tottime').print_stats(1000)" | grep -E "ncalls|/src/" | head -n 21 29 | 30 | echo "\n\n** Slowest FuzzTypes functions by cumulative time:" 31 | $(ACTIVATE) && python -c "import pstats; pstats.Stats('profile.dat').sort_stats('cumtime').print_stats(1000)" | grep -E "ncalls|/src/" | head -n 21 32 | 33 | echo "\n\n** Slowest all-project functions by total time:" 34 | $(ACTIVATE) && python -c "import pstats; pstats.Stats('profile.dat').sort_stats('tottime').print_stats(20)" | tail -n +8 35 | 36 | rm profile.dat 37 | 38 | pbcopy: 39 | # copy all code to clipboard for pasting into an LLM 40 | find . ! 
-path '*/.*/*' -type f \( -name "*.py" -o -name "*.md" \) -exec tail -n +1 {} + | pbcopy 41 | 42 | #---------- 43 | # clean 44 | #---------- 45 | 46 | clean: clean-build clean-pyc clean-test 47 | 48 | clean-build: 49 | rm -fr build/ 50 | rm -fr dist/ 51 | rm -fr .eggs/ 52 | find . -name '*.egg-info' -exec rm -fr {} + 53 | find . -name '*.egg' -exec rm -f {} + 54 | 55 | clean-pyc: 56 | find . -name '*.pyc' -exec rm -f {} + 57 | find . -name '*.pyo' -exec rm -f {} + 58 | find . -name '*~' -exec rm -f {} + 59 | find . -name '__pycache__' -exec rm -fr {} + 60 | 61 | clean-test: 62 | rm -fr .cache 63 | rm -fr .mypy_cache 64 | rm -fr .pytest_cache 65 | rm -f .coverage 66 | rm -fr htmlcov/ -------------------------------------------------------------------------------- /src/fuzztypes/utils/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | from datetime import datetime 4 | from typing import Optional 5 | from urllib.error import URLError, HTTPError 6 | 7 | from fuzztypes import logger, const 8 | 9 | 10 | def get_file_age_in_days(file_path: str) -> int: 11 | age = datetime.now() - datetime.fromtimestamp(os.path.getmtime(file_path)) 12 | return age.days 13 | 14 | 15 | def get_file(url: str, expires_in_days: int = 30) -> Optional[str]: 16 | """ 17 | Tries to retrieve a file from the cache or download it if not available 18 | or expired. 19 | 20 | :param url: The URL of the original file to be downloaded. 21 | :param expires_in_days: Expiration period for the cached file. 22 | :return: Path to the downloaded file, or None if fails. 
23 | """ 24 | file_name = os.path.basename(url) 25 | cache_file_path = os.path.join(const.DownloadsPath, file_name) 26 | temp_download_path = f"{cache_file_path}.tmp" 27 | 28 | cache_ok = os.path.exists(cache_file_path) 29 | if cache_ok: 30 | file_age = get_file_age_in_days(cache_file_path) 31 | cache_ok = file_age <= expires_in_days 32 | 33 | if not cache_ok: 34 | download_success = download_file(url, temp_download_path) 35 | if download_success: 36 | os.replace(temp_download_path, cache_file_path) 37 | cache_ok = os.path.exists(cache_file_path) 38 | 39 | if not cache_ok: 40 | logger.error(f"Unable to download the file and no cached file: {url}") 41 | 42 | return cache_file_path if cache_ok else None 43 | 44 | 45 | def download_file(url, download_path): 46 | """ 47 | Attempt to download a file directly to a specified path. 48 | If the download fails, logs a warning and returns None. 49 | 50 | :param url: The URL of the file to be downloaded. 51 | :param download_path: The full file path where the file should be saved. 52 | :return: Boolean indicating success or failure of the download. 
53 | """ 54 | try: 55 | urllib.request.urlretrieve(url, download_path) 56 | return True 57 | except (HTTPError, URLError, ValueError, OSError, Exception) as e: 58 | logger.warning(f"Download (url={url}) failed: {e}", exc_info=True) 59 | return False 60 | -------------------------------------------------------------------------------- /tests/in_memory/test_in_memory_name.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Optional 2 | 3 | from pydantic import BaseModel, ValidationError, Field 4 | 5 | from fuzztypes import NamedEntity, InMemoryValidator, flags 6 | 7 | names = ["George Washington", "John Adams", "Thomas Jefferson"] 8 | President = InMemoryValidator(names, search_flag=flags.NameSearch) 9 | CasedPrez = InMemoryValidator( 10 | names, case_sensitive=True, search_flag=flags.NameSearch 11 | ) 12 | NullPrez = InMemoryValidator( 13 | names, notfound_mode="none", search_flag=flags.NameSearch 14 | ) 15 | AllowPrez = InMemoryValidator( 16 | names, notfound_mode="allow", search_flag=flags.NameSearch 17 | ) 18 | 19 | 20 | def test_namestr_getitem(): 21 | entity = NamedEntity(value="Thomas Jefferson") 22 | assert President["Thomas Jefferson"] == entity 23 | assert President["THOMAS JEFFERSON"] == entity 24 | 25 | assert CasedPrez["Thomas Jefferson"] == entity 26 | try: 27 | assert CasedPrez["THOMAS JEFFERSON"] == entity 28 | assert False, "Didn't raise KeyError!" 
def test_cased_name_str():
    """Case-sensitive validator: exact casing passes, wrong casing fails."""

    class Example(BaseModel):
        value: Annotated[str, CasedPrez]

    # Exactly-cased input resolves to itself.
    assert Example(value="George Washington").value == "George Washington"

    # Wrongly-cased input must be rejected with a ValidationError.
    raised = False
    try:
        Example(value="john ADAMS")
    except ValidationError:
        raised = True
    assert raised, "Didn't raise PydanticCustomError!"
def test_matching_edge_cases():
    """Ambiguous inputs resolve by exact name before alias/code fallback."""
    # 'En' is a proper name of a language, so it wins over the 'en'
    # alpha-2 code for LanguageName; LanguageCode maps it to its own code.
    assert validate_python(LanguageName, "En") == "En"
    assert validate_python(LanguageCode, "En") == "enc"

    # 'en' is the alpha2 code for English
    assert validate_python(LanguageName, "en") == "English"
    assert validate_python(LanguageCode, "en") == "en"

    # Bangla is common name for Bengali
    assert validate_python(LanguageName, "Bangla") == "Bengali"
    assert validate_python(LanguageCode, "Bangla") == "bn"
    # Full entity resolution keeps the common name alongside the
    # canonical value and both ISO codes.
    assert validate_python(Language, "Bangla").model_dump(
        exclude_defaults=True, mode="json"
    ) == {
        "aliases": ["bn", "ben", "Bangla"],
        "alpha_2": "bn",
        "alpha_3": "ben",
        "common_name": "Bangla",
        "scope": "I",
        "type": "L",
        "value": "Bengali",
    }
def test_namestr_getitem():
    """Exercise __getitem__ across all four validator configurations."""
    expected = NamedEntity(value="Thomas Jefferson")

    # The default validator is case-insensitive.
    assert President["Thomas Jefferson"] == expected
    assert President["THOMAS JEFFERSON"] == expected
    assert resolve_entity(President, "Thomas Jefferson") == expected

    # The case-sensitive validator only matches the exact casing.
    assert CasedPrez["Thomas Jefferson"] == expected
    raised = False
    try:
        CasedPrez["THOMAS JEFFERSON"]
    except KeyError:
        raised = True
    assert raised, "Didn't raise KeyError!"

    # notfound_mode="none" yields None; notfound_mode="allow" passes
    # the unmatched key straight through as a new entity value.
    assert NullPrez["The Rock"] is None
    assert AllowPrez["The Rock"].value == "The Rock"
def test_mdy_vs_ymd():
    """Ambiguous numeric dates resolve per language/date_order settings."""
    # MDY vs. YMD ordering is context specific
    # https://dateparser.readthedocs.io/en/latest/settings.html#date-order
    #
    # Default parsing treats "02-03-04" as MDY -> Feb 3, 2004.
    assert validate_python(Date, "02-03-04") == date(year=2004, month=2, day=3)

    # English locale implies MDY ordering.
    DateEN = Annotated[date, DateValidator(languages=["en"])]
    assert validate_python(DateEN, "02-03-04") == date(
        year=2004, month=2, day=3
    )

    # Explicit MDY gives the same result as the English default.
    DateMDY = Annotated[date, DateValidator(date_order="MDY")]
    assert validate_python(DateMDY, "02-03-04") == date(
        year=2004, month=2, day=3
    )

    # Spanish locale implies DMY ordering -> Mar 2, 2004.
    DateES = Annotated[date, DateValidator(languages=["es"])]
    assert validate_python(DateES, "02-03-04") == date(
        year=2004, month=3, day=2
    )

    DateDMY = Annotated[date, DateValidator(date_order="DMY")]
    assert validate_python(DateDMY, "02-03-04") == date(
        year=2004, month=3, day=2
    )

    # YMD reads the leading field as the (two-digit) year -> Mar 4, 2002.
    DateYMD = Annotated[date, DateValidator(date_order="YMD")]
    assert validate_python(DateYMD, "02-03-04") == date(
        year=2002, month=3, day=4
    )
def test_meta_edge_cases():
    """Unknown attributes raise until assigned; declared fields behave normally."""
    entity = NamedEntity(value="a")

    # Reading an attribute that was never set must fail loudly.
    raised = False
    try:
        entity.unknown
    except AttributeError:
        raised = True
    assert raised, "Did not throw AttributeError exception."

    # Once assigned, the same attribute reads back.
    entity.unknown = 123
    assert entity.unknown == 123

    # Declared fields start at their defaults and remain writable.
    assert entity.label is None
    entity.label = "LABEL"
    assert entity.label == "LABEL"
88 | -------------------------------------------------------------------------------- /tests/in_memory/test_in_memory_alias.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import Annotated 3 | from pydantic import BaseModel, ValidationError 4 | 5 | from fuzztypes import InMemoryValidator, flags 6 | 7 | 8 | @pytest.fixture(scope="session") 9 | def MythicalFigure(MythSource): 10 | return InMemoryValidator(MythSource, search_flag=flags.AliasSearch) 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def CasedMythicalFigure(MythSource): 15 | return InMemoryValidator( 16 | MythSource, 17 | search_flag=flags.AliasSearch, 18 | case_sensitive=True, 19 | ) 20 | 21 | 22 | def test_alias_uncased_getitem(MythicalFigure): 23 | # Testing Alias with aliases 24 | assert MythicalFigure["Odysseus"].value == "Odysseus" 25 | assert MythicalFigure["Ulysses"].value == "Odysseus" # alias 26 | assert MythicalFigure["athena"].value == "Athena" # case insensitivity 27 | 28 | 29 | def test_alias_cased_getitem(CasedMythicalFigure): 30 | # Testing AliasCasedStr, expecting case-sensitive behavior 31 | assert CasedMythicalFigure["Athena"].value == "Athena" 32 | 33 | with pytest.raises(KeyError): 34 | # This should fail because CasedMythicalFigure is case-sensitive 35 | assert CasedMythicalFigure["athena"].value == "Athena" 36 | 37 | 38 | def test_uncased_alias_str(MythicalFigure): 39 | class Example(BaseModel): 40 | value: Annotated[str, MythicalFigure] 41 | 42 | # Exact match 43 | assert Example(value="Zeus").value == "Zeus" 44 | # Alias match 45 | assert Example(value="Jupiter").value == "Zeus" 46 | # Case-insensitive alias match 47 | assert Example(value="jove").value == "Zeus" 48 | 49 | 50 | def test_cased_alias_str(CasedMythicalFigure): 51 | class Example(BaseModel): 52 | value: Annotated[str, CasedMythicalFigure] 53 | 54 | # Exact match 55 | assert Example(value="Zeus").value == "Zeus" 56 | # Alias match 57 | assert 
def test_duplicate_records():
    """Aliases shared by several entities are ambiguous unless a tiebreaker is set."""
    # "b" is an alias of all three entities "c", "a" and "d".
    source = [["c", "b"], ["a", "b"], ["d", "b"]]

    A = InMemoryValidator(source)
    assert A["a"].value == "a"

    # Default tiebreaker_mode ("raise"): an ambiguous alias is a KeyError
    # whose message lists every candidate in source order.
    try:
        assert A["b"].value == "a"
        assert False, "Didn't raise exception!"
    except KeyError as e:
        msg = str(e.args[0])
        assert (
            msg == "Key Error: b "
            '["b" could not be resolved, did you mean "c", "a", or "d"?]'
        )

    # "lesser" picks the alphabetically smallest candidate value.
    A = InMemoryValidator(source, tiebreaker_mode="lesser")
    assert A["b"].value == "a"

    # "greater" picks the alphabetically largest candidate value.
    A = InMemoryValidator(source, tiebreaker_mode="greater")
    assert A["b"].value == "d"
def test_valid_ssn():
    """A well-formed SSN embedded in surrounding text is extracted."""
    # The three assertions originally here were byte-identical; their
    # comments ("value call", "entity value comparison", "entity
    # equivalence") described distinctions that no longer existed in the
    # code, so a single extraction check covers the same behavior.
    assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789"
80 | except ValidationError: 81 | pass 82 | 83 | 84 | def test_zip_code_with_invalid_four_format(): 85 | # Python's re module does not support lookbehinds (?= 1.13.0"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "FuzzTypes" 7 | description = "FuzzTypes is a Pydantic extension for annotating autocorrecting fields" 8 | readme = "README.md" 9 | requires-python = ">=3.9" 10 | authors = [ 11 | { name = "Ian Maurer", email = "ian@genomoncology.com" }, 12 | ] 13 | classifiers = [ 14 | "Intended Audience :: Developers", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Programming Language :: Python :: 3", 18 | "Topic :: Software Development :: Libraries", 19 | ] 20 | dependencies = [ 21 | "pydantic >= 2.6.1", 22 | ] 23 | dynamic = ["version"] 24 | 25 | [project.optional-dependencies] 26 | test = [ 27 | "pytest", 28 | "pytest-mock", 29 | "coverage[toml]", 30 | ] 31 | local = [ 32 | "build", 33 | "jupyter", 34 | "ipython", 35 | "mypy", 36 | "pip", 37 | "setuptools", 38 | "twine", 39 | ] 40 | ext = [ 41 | "anyascii", 42 | "dateparser", 43 | "emoji", 44 | "lancedb", 45 | "nameparser", # Note: LGPL. 46 | "number-parser", 47 | "rapidfuzz", 48 | "sentence-transformers", 49 | "tantivy", 50 | "unidecode", # Note: GPL. 
51 | ] 52 | 53 | [tool.setuptools.package-data] 54 | 55 | [project.urls] 56 | 57 | [project.scripts] 58 | #fuzztypes = "fuzztypes:cli" 59 | 60 | [tool.hatch.version] 61 | path = "src/fuzztypes/__init__.py" 62 | 63 | [tool.mypy] 64 | check_untyped_defs = true 65 | 66 | [tool.pytest.ini_options] 67 | addopts = [ 68 | "--strict-config", 69 | "--strict-markers", 70 | ] 71 | xfail_strict = true 72 | junit_family = "xunit2" 73 | norecursedirs = ".venv" 74 | filterwarnings = [ 75 | ] 76 | 77 | [tool.coverage.run] 78 | parallel = true 79 | source = [ 80 | "src", 81 | ] 82 | context = '${CONTEXT}' 83 | omit = [ 84 | '__main__.py', 85 | '__init__.py', 86 | ] 87 | 88 | [tool.ruff] 89 | line-length = 79 90 | select = [ 91 | "E", # pycodestyle errors 92 | "W", # pycodestyle warnings 93 | "F", # pyflakes 94 | "I", # isort 95 | "C", # flake8-comprehensions 96 | "B", # flake8-bugbear 97 | ] 98 | [tool.coverage.report] 99 | exclude_also = [ 100 | "def __repr__", 101 | "if self.debug:", 102 | "if settings.DEBUG", 103 | "raise AssertionError", 104 | "raise NotImplementedError", 105 | "except ImportError", 106 | "if 0:", 107 | "if __name__ == .__main__.:", 108 | "if TYPE_CHECKING:", 109 | "class .*\\bProtocol\\):", 110 | "@(abc\\.)?abstractmethod", 111 | ] 112 | 113 | [tool.ruff.isort] 114 | known-third-party = ["click", "pydantic"] 115 | 116 | [tool.ruff.format] 117 | quote-style = "double" 118 | indent-style = "space" 119 | skip-magic-trailing-comma = false 120 | line-ending = "auto" 121 | 122 | [tool.isort] 123 | extend_skip = ["__init__.py"] 124 | -------------------------------------------------------------------------------- /tests/test_person.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel, ValidationError 4 | 5 | from fuzztypes import Person, validate_python 6 | 7 | 8 | class MyModel(BaseModel): 9 | person: Person 10 | optional: Optional[Person] = None 11 | 12 | 13 | def 
def test_mixed_capitalization_with_validate_python():
    """Lower-cased input is capitalized, including internal caps (MacLaine)."""
    parsed = validate_python(Person, "shirley maclaine")
    assert (parsed.first, parsed.last) == ("Shirley", "MacLaine")
def test_value_error():
    """Missing or wrongly-typed person input is rejected."""
    # An empty payload is missing the required "person" field.
    raised = False
    try:
        validate_python(MyModel, {})
    except ValidationError:
        raised = True
    assert raised, "Didn't fail as expected."

    # A non-string, non-dict person value cannot be parsed.
    raised = False
    try:
        validate_python(MyModel, {"person": 5})
    except ValueError:
        raised = True
    assert raised, "Didn't fail as expected."
29 | # https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html 30 | # Scorers: 31 | # ratio: Calculates Levenshtein Distance similarity ratio 32 | # partial_ratio: Compares substrings, good for different length strings 33 | # token_set_ratio: Compares unique words, allows different word order 34 | # partial_token_set_ratio: Like token_set_ratio but compares substrings 35 | # token_sort_ratio: Sorts words before compare, good when order is irrelevant 36 | # partial_token_sort_ratio: Like token_sort_ratio but compares substrings 37 | # token_ratio: Averages token_sort_ratio and token_set_ratio 38 | # partial_token_ratio: Averages partial token sort and set ratios 39 | # WRatio: Weighted combination of different ratios based on string lengths 40 | # QRatio: Faster version of ratio, less accurate 41 | FuzzScorer = Literal[ 42 | "ratio", 43 | "partial_ratio", 44 | "token_set_ratio", 45 | "partial_token_set_ratio", 46 | "token_sort_ratio", 47 | "partial_token_sort_ratio", 48 | "token_ratio", 49 | "partial_token_ratio", 50 | "WRatio", 51 | "QRatio", 52 | ] 53 | 54 | # What happens if a matching entity is not found for key? 55 | # raise: raises an exception if no matching entity found 56 | # none: sets value to None if no matching entity found 57 | # allow: passes through key 58 | NotFoundMode = Literal["raise", "none", "allow"] 59 | 60 | 61 | # What happens if there is a tie? 62 | # raise: raise an exception if two elements are tied without Entity.priority 63 | # lesser: use lower Entity.value, if Entity.priority not set or different 64 | # greater: use greater Entity.value, if Entity.priority not set or different 65 | TiebreakerMode = Literal["raise", "lesser", "greater"] 66 | 67 | # Which Pydantic validator mode? 68 | # https://docs.pydantic.dev/latest/concepts/validators/ 69 | # Only 'before' has been tested, 'plain' may work with no changes. 70 | # Refactoring probably required for 'after' and 'wrap'. 71 | ValidatorMode = Literal["before"] # ... 
, "after", "plain", "wrap"] 72 | -------------------------------------------------------------------------------- /tests/utils/test_download.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import pytest 4 | 5 | from fuzztypes.const import DownloadsPath 6 | from fuzztypes.utils.download import get_file 7 | 8 | 9 | @pytest.fixture 10 | def mock_path_exists(mocker): 11 | return mocker.patch("os.path.exists") 12 | 13 | 14 | @pytest.fixture 15 | def mock_getmtime(mocker): 16 | return mocker.patch("os.path.getmtime") 17 | 18 | 19 | @pytest.fixture 20 | def mock_replace(mocker): 21 | return mocker.patch("os.replace") 22 | 23 | 24 | @pytest.fixture 25 | def mock_file_age(mocker): 26 | return mocker.patch("fuzztypes.utils.download.get_file_age_in_days") 27 | 28 | 29 | @pytest.fixture 30 | def mock_urlretrieve(mocker): 31 | return mocker.patch("urllib.request.urlretrieve") 32 | 33 | 34 | @pytest.fixture 35 | def mock_logger_warning(mocker): 36 | return mocker.patch("fuzztypes.logger.warning") 37 | 38 | 39 | @pytest.fixture 40 | def mock_logger_error(mocker): 41 | return mocker.patch("fuzztypes.logger.error") 42 | 43 | 44 | def test_get_file_cache_hit(mock_path_exists, mock_file_age, mock_replace): 45 | mock_path_exists.return_value = True 46 | mock_file_age.return_value = 10 47 | 48 | result = get_file("http://example.com/file.txt") 49 | assert result == os.path.join(DownloadsPath, "file.txt") 50 | mock_replace.assert_not_called() 51 | 52 | 53 | def test_cache_miss_due_to_expiry( 54 | mock_path_exists, mock_file_age, mock_replace, mock_urlretrieve 55 | ): 56 | mock_path_exists.return_value = True 57 | mock_file_age.return_value = 31 58 | mock_urlretrieve.return_value = True 59 | 60 | result = get_file("http://example.com/file.txt") 61 | assert result == os.path.join(DownloadsPath, "file.txt") 62 | mock_replace.assert_called_once() 63 | mock_urlretrieve.assert_called_once_with( 64 | "http://example.com/file.txt", 
def test_download_failure(
    mock_path_exists, mock_logger_error, mock_urlretrieve
):
    """get_file returns None and logs an error when download fails with no cache."""
    # No cached copy exists...
    mock_path_exists.return_value = False
    # ...and the download itself blows up.
    mock_urlretrieve.side_effect = Exception("Download failed")
    assert get_file("http://example.com/file.txt") is None
    # get_file reports the unrecoverable miss exactly once via logger.error.
    mock_logger_error.assert_called_once()
class MatchResult(BaseModel):
    """Accumulates candidate matches and records the final selection."""

    matches: List[Match] = Field(default_factory=list)
    choice: Optional[Match] = None

    def __bool__(self):
        return bool(self.matches)

    def __len__(self):
        return len(self.matches)

    def __getitem__(self, item):
        return self.matches[item]

    @property
    def entity(self):
        # Deliberately falsy (False) when no choice was made, otherwise
        # the chosen match's entity.
        return self.choice is not None and self.choice.entity

    def append(self, match: Match):
        """Add a match to the list of potential matches."""
        self.matches.append(match)

    def choose(self, min_score: float, tiebreaker_mode: const.TiebreakerMode):
        """Filter matches by score, sort by rank/alpha, and make choice.

        With "raise" tiebreaker_mode and distinct entities tied on rank,
        no choice is set (the caller treats that as unresolved).
        """
        eligible = [m for m in self.matches if m.score >= min_score]
        eligible.sort()

        if not eligible:
            return

        best = eligible[0]
        # Distinct entities sharing the best rank compete for the choice.
        rivals = [
            m
            for m in eligible[1:]
            if m.rank == best.rank and m.entity != best.entity
        ]

        if not rivals or tiebreaker_mode == "lesser":
            self.choice = best
        elif tiebreaker_mode == "greater":
            self.choice = rivals[-1]
Type[NamedEntity] = NamedEntity, 85 | ) -> List[Match]: 86 | return [record.to_match(key, score, entity_type) for record in recs] 87 | 88 | def to_match( 89 | self, 90 | key, 91 | score: float = 100.0, 92 | entity_type: Type[NamedEntity] = NamedEntity, 93 | ) -> Match: 94 | if isinstance(self.entity, str): 95 | match_entity = entity_type.model_validate_json(self.entity) 96 | else: 97 | match_entity = self.entity 98 | 99 | return Match( 100 | key=key, 101 | entity=match_entity, 102 | is_alias=self.is_alias, 103 | score=score, 104 | term=self.term, 105 | ) 106 | -------------------------------------------------------------------------------- /src/fuzztypes/language.py: -------------------------------------------------------------------------------- 1 | import json 2 | from enum import Enum 3 | from typing import Annotated, Optional, List, Iterable, Type 4 | 5 | from pydantic import TypeAdapter 6 | 7 | from fuzztypes import EntitySource, NamedEntity, OnDiskValidator, flags, utils 8 | 9 | 10 | class LanguageScope(Enum): 11 | INDIVIDUAL = "I" 12 | MACROLANGUAGE = "M" 13 | SPECIAL = "S" 14 | 15 | 16 | class LanguageType(Enum): 17 | ANCIENT = "A" 18 | CONSTRUCTED = "C" 19 | EXTINCT = "E" 20 | HISTORICAL = "H" 21 | LIVING = "L" 22 | SPECIAL = "S" 23 | 24 | 25 | class LanguageNamedEntity(NamedEntity): 26 | """Resolves to language full name.""" 27 | 28 | alpha_2: Optional[str] = None 29 | alpha_3: str 30 | scope: Optional[LanguageScope] = None 31 | type: Optional[LanguageType] = None 32 | common_name: Optional[str] = None 33 | inverted_name: Optional[str] = None 34 | bibliographic: Optional[str] = None 35 | 36 | @property 37 | def code(self): 38 | return self.alpha_2 or self.alpha_3 39 | 40 | 41 | class LanguageModelNamedEntity(LanguageNamedEntity): 42 | """Resolves to self as a full child object.""" 43 | 44 | def resolve(self): 45 | return self 46 | 47 | 48 | class LanguageCodeNameEntity(LanguageNamedEntity): 49 | """Resolves to code name.""" 50 | 51 | def resolve(self): 52 
def load_languages(
    entity_cls: Type[LanguageNamedEntity] = LanguageNamedEntity,
):
    """Build a zero-argument loader of ISO 639-3 language entities.

    :param entity_cls: NamedEntity subclass to instantiate; this controls
        how each entity resolves (full name, alpha code, or full model).
    :return: Callable suitable for wrapping in an EntitySource.
    """

    def do_load() -> Iterable[NamedEntity]:
        repo = "https://salsa.debian.org/iso-codes-team/iso-codes/"
        remote = f"{repo}-/raw/main/data/iso_639-3.json"
        local = utils.get_file(remote)
        assert local, f"Could not download: {remote}"

        # Close the file handle promptly instead of leaking it.
        with open(local) as f:
            data = json.load(f)["639-3"]

        alias_fields = {
            "alpha_2",
            "alpha_3",
            "common_name",
            "inverted_name",
            "bibliographic",
        }
        for item in data:
            item["value"] = item.pop("name")
            item["aliases"] = [v for k, v in item.items() if k in alias_fields]

        # Bug fix: validate against entity_cls. The previous hard-coded
        # List[LanguageNamedEntity] ignored the requested subclass, so
        # LanguageCode/Language lost their resolve() overrides.
        return TypeAdapter(List[entity_cls]).validate_python(data)

    return do_load
class PersonModel(BaseModel):
    """A parsed person name with configurable formatting.

    Formatting is delegated to nameparser's HumanName (lazily imported
    via ``parse``); ``name_format``/``init_format`` control how the
    components are rendered.
    """

    # rendering templates (see FULL_NAME / FULL_INIT constants above)
    name_format: str = FULL_NAME
    init_format: str = FULL_INIT
    # individual name components; empty string means "absent"
    title: str = ""
    first: str = ""
    middle: str = ""
    last: str = ""
    suffix: str = ""
    nickname: str = ""

    def __str__(self) -> str:
        return self.name

    # names

    @property
    def name(self) -> str:
        """Name rendered with this instance's own name_format."""
        return str(self.human_name())

    @property
    def full_name(self) -> str:
        return str(self.human_name(name_format=FULL_NAME))

    @property
    def short_name(self) -> str:
        return str(self.human_name(name_format=SHORT_NAME))

    @property
    def legal_name(self) -> str:
        return str(self.human_name(name_format=LEGAL_NAME))

    @property
    def last_name_first(self) -> str:
        return str(self.human_name(name_format=LAST_NAME_FIRST))

    # initials

    @property
    def initials(self) -> str:
        """Initials rendered with this instance's own init_format."""
        return self.human_name().initials()

    @property
    def full_initials(self) -> str:
        return self.human_name(init_format=FULL_INIT).initials()

    @property
    def short_initials(self) -> str:
        return self.human_name(init_format=SHORT_INIT).initials()

    # human name object from nameparser library

    def human_name(self, name_format=None, init_format=None):
        """Construct a HumanName; formats default to this instance's."""
        name_format = name_format or self.name_format
        init_format = init_format or self.init_format
        return parse(
            string_format=name_format,
            initials_format=init_format,
            title=self.title,
            first=self.first,
            middle=self.middle,
            last=self.last,
            suffix=self.suffix,
            nickname=self.nickname,
        )
def test_fuzzy_tags_priority(Tag):
    """Fuzzy tag resolution: priority tiebreak, threshold pass-through."""
    # since min_similarity is 50.0, it chooses higher priority
    assert validate_python(Tag, "4d") == "3d"

    # matches because 67% ratio > 50.0 minimum
    assert validate_python(Tag, "27d") == "2d"

    # less than 50% similarity is passed through (notfound_mode="allow")
    assert validate_python(Tag, "17d") == "17d"

    # different: "18d" scores above threshold against "i18n"
    assert validate_python(Tag, "18d") == "i18n"

    # todo: collect allowed tags and use for future fuzzy matching
    # assert validate_python(Tag, "15d") == "17d"
    assert validate_python(Tag, "15d") == "15d"
def test_as_a_list_of_tags(TagSource):
    """Fuzzy tag type works element-wise inside List[...] model fields
    and round-trips through JSON serialization."""
    Tag = Annotated[
        str,
        InMemoryValidator(
            TagSource,
            notfound_mode="allow",
            search_flag=flags.FuzzSearch,
            min_similarity=50.0,
            fuzz_scorer="QRatio",
        ),
    ]

    class Post(BaseModel):
        text: str
        tags: List[Tag]

    post = Post(
        text="Prompt injection is unsolved still.",
        tags=["prompt_injection", "AI"],
    )

    # each element is resolved independently against the tag source
    assert post.tags == ["promptinjection", "ai"]

    # resolved values survive a serialize/deserialize round trip
    json = post.model_dump_json()
    second = Post.model_validate_json(json)
    assert second.tags == ["promptinjection", "ai"]
def resolve_entity(cls: Any, value: Any) -> Optional[Entity]:
    """
    Returns entity from metadata if cls is a FuzzValidator.

    Walks cls itself followed by its Annotated metadata; every
    FuzzValidator encountered is queried, and the last lookup wins.

    :param cls: Any object
    :param value: input value
    :return: Entity if validator is an entity source
    """
    found: Optional[Entity] = None
    for candidate in chain([cls], get_args(cls)):
        if isinstance(candidate, FuzzValidator):
            found = candidate[value]
    return found
    def __get_pydantic_json_schema__(
        self,
        schema: CoreSchema,
        handler: GetJsonSchemaHandler,
    ) -> json_schema.JsonSchemaValue:
        """
        Generate the JSON schema for the AbstractType.

        This method is used internally by Pydantic to generate the JSON
        schema representation of the AbstractType, including any examples.

        :param schema: core schema produced for the annotated type.
        :param handler: pydantic callback that converts it to JSON schema.
        :return: JSON schema dict, with "examples" attached when provided.
        """
        schema = handler(schema)
        # Surface validator-level examples in the generated schema.
        if self.examples is not None:
            schema["examples"] = self.examples
        return schema
    def __getitem__(self, key: str) -> Optional[NamedEntity]:
        """Resolve key to a NamedEntity, honoring notfound_mode.

        :param key: lookup term.
        :return: matched entity; a synthesized entity ("allow" mode);
            or None ("none" mode).
        :raises PydanticCustomError: in "raise" mode when unresolved,
            with near-miss suggestions when any candidates scored.
        """
        # Lazily build the index on first lookup.
        if not self.prepped:
            self.prepped = True
            self.prepare()

        match_list = self.get(key)
        match_list.choose(self.min_similarity, self.tiebreaker_mode)

        if match_list.choice is not None:
            return match_list.entity

        # Unresolved: behavior depends on notfound_mode.
        if self.notfound_mode == "allow":
            # Pass the key through as a brand-new entity.
            return self.entity_type(value=key)

        if self.notfound_mode == "none":
            return None

        # "raise" mode: build a helpful did-you-mean message from any
        # candidates that matched but were filtered or tied.
        msg = '"{key}" could not be resolved'
        ctx: Dict[str, Any] = dict(key=key)
        if match_list:
            near = [f'"{match.entity.value}"' for match in match_list.matches]
            if len(near) > 1:
                near[-1] = "or " + near[-1]
            msg += f", did you mean {', '.join(near)}?"
        raise PydanticCustomError("key_not_found", msg, ctx)
def test_min_score():
    """Loose matcher resolves weak input; strict matcher must raise."""
    assert Model(loose="B K L").loose == "A B C"

    try:
        Model(strict="B K L")
        # Bug fix: the previous `assert "Expected validation error!"`
        # asserted a truthy string literal and could never fail. Raise
        # explicitly (same pattern as test_get_item in this file) so a
        # missing ValidationError is actually reported.
        raise AssertionError("Expected validation error!")

    except ValidationError as e:
        assert e.errors(include_url=False) == [
            {
                "ctx": {"key": "B K L"},
                "input": "B K L",
                "loc": ("strict",),
                "msg": '"B K L" could not be resolved, did you mean "A B C"?',
                "type": "key_not_found",
            }
        ]
1 | from collections import defaultdict 2 | from typing import Callable, Iterable, Union, Type, Optional 3 | 4 | from pydantic import PositiveInt 5 | 6 | from fuzztypes import ( 7 | FuzzValidator, 8 | Match, 9 | MatchResult, 10 | NamedEntity, 11 | Record, 12 | const, 13 | flags, 14 | lazy, 15 | storage, 16 | ) 17 | 18 | 19 | class InMemoryValidatorStorage(storage.AbstractStorage): 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | 23 | self._mapping = defaultdict(list) 24 | self._terms = [] 25 | self._is_alias = [] 26 | self._entities = [] 27 | self._embeddings = None 28 | 29 | # 30 | # Prepare 31 | # 32 | 33 | def prepare(self): 34 | for item in self.source: 35 | entity = self.entity_type.convert(item) 36 | self.add(entity) 37 | 38 | def add(self, entity: NamedEntity) -> None: 39 | if self.search_flag.is_name_ok: 40 | self.add_by_name(entity) 41 | 42 | if self.search_flag.is_alias_ok: 43 | self.add_by_alias(entity) 44 | 45 | if self.search_flag.is_fuzz_or_semantic_ok: 46 | self.add_fuzz_or_semantic(entity) 47 | 48 | def add_by_name(self, entity: NamedEntity) -> None: 49 | term = entity.value 50 | norm_term = self.normalize(term) 51 | record = Record( 52 | entity=entity, term=term, norm_term=norm_term, is_alias=False 53 | ) 54 | self._mapping[norm_term].append(record) 55 | 56 | def add_by_alias(self, entity: NamedEntity) -> None: 57 | for term in entity.aliases: 58 | norm_term = self.normalize(term) 59 | record = Record( 60 | entity=entity, term=term, norm_term=norm_term, is_alias=True 61 | ) 62 | self._mapping[norm_term].append(record) 63 | 64 | def add_fuzz_or_semantic(self, entity: NamedEntity) -> None: 65 | clean_name: str = self.fuzz_clean(entity.value) 66 | self._terms.append(clean_name) 67 | self._entities.append(entity) 68 | self._is_alias.append(False) 69 | 70 | for alias in entity.aliases: 71 | clean_alias: str = self.fuzz_clean(alias) 72 | self._terms.append(clean_alias) 73 | self._entities.append(entity) 74 | 
self._is_alias.append(True) 75 | 76 | # 77 | # Getters 78 | # 79 | 80 | def get(self, key: str) -> MatchResult: 81 | records = self._mapping.get(self.normalize(key), []) 82 | match_list = Record.from_list( 83 | records, key=key, entity_type=self.entity_type 84 | ) 85 | 86 | results = MatchResult(matches=match_list) 87 | 88 | if not results: 89 | if self.search_flag.is_fuzz_ok: 90 | results = self.get_by_fuzz(key) 91 | 92 | if self.search_flag.is_semantic_ok: 93 | results = self.get_by_semantic(key) 94 | 95 | return results 96 | 97 | # 98 | # Fuzzy Matching 99 | # 100 | 101 | def get_by_fuzz(self, term) -> MatchResult: 102 | query = self.fuzz_clean(term) 103 | matches = self.fuzz_match(query) 104 | return matches 105 | 106 | def fuzz_match( 107 | self, 108 | query: str, 109 | ) -> MatchResult: 110 | # https://rapidfuzz.github.io/RapidFuzz/Usage/process.html#extract 111 | extract = self.rapidfuzz.process.extract( 112 | query=query, 113 | choices=self._terms, 114 | scorer=self.fuzz_scorer, 115 | limit=self.limit, 116 | ) 117 | 118 | results = MatchResult() 119 | for key, score, index in extract: 120 | entity = self._entities[index] 121 | is_alias = self._is_alias[index] 122 | m = Match(key=key, entity=entity, is_alias=is_alias, score=score) 123 | results.append(m) 124 | return results 125 | 126 | # 127 | # Vector Similarity Search 128 | # 129 | 130 | def get_by_semantic(self, key) -> MatchResult: 131 | # find closest match using knn 132 | indices, scores = self.find_knn(key) 133 | 134 | # create a MatchResult from the results 135 | results = MatchResult() 136 | for index, score in zip(indices, scores): 137 | entity = self._entities[index] 138 | term = self._terms[index] 139 | is_alias = self._is_alias[index] 140 | match = Match( 141 | key=key, 142 | entity=entity, 143 | score=score, 144 | is_alias=is_alias, 145 | term=term, 146 | ) 147 | results.append(match) 148 | 149 | return results 150 | 151 | @property 152 | def embeddings(self): 153 | if self._embeddings is 
def InMemoryValidator(
    source: Iterable,
    *,
    case_sensitive: bool = False,
    encoder: Union[Callable, str, object] = None,
    entity_type: Type[NamedEntity] = NamedEntity,
    examples: Optional[list] = None,
    fuzz_scorer: const.FuzzScorer = "token_sort_ratio",
    limit: PositiveInt = 10,
    min_similarity: float = 80.0,
    notfound_mode: const.NotFoundMode = "raise",
    search_flag: flags.SearchFlag = flags.DefaultSearch,
    tiebreaker_mode: const.TiebreakerMode = "raise",
):
    """
    Create a FuzzValidator backed by in-process (RAM) storage.

    :param source: iterable of entities (or convertible items) to index.
    :param case_sensitive: if False, terms are lower-cased for lookup.
    :param encoder: encoder callable/name used for semantic search.
    :param entity_type: NamedEntity subclass used for results.
    :param examples: examples to embed in the generated JSON schema.
    :param fuzz_scorer: RapidFuzz scorer name for fuzzy search.
    :param limit: maximum number of candidate matches returned.
    :param min_similarity: minimum score (0-100) to accept a match.
    :param notfound_mode: "raise", "none", or "allow" on no match.
    :param search_flag: which strategies (name/alias/fuzz/semantic) apply.
    :param tiebreaker_mode: how equal-rank ties resolve.
    :return: FuzzValidator wrapping the in-memory storage.
    """
    in_memory = InMemoryValidatorStorage(
        source,
        case_sensitive=case_sensitive,
        encoder=encoder,
        entity_type=entity_type,
        fuzz_scorer=fuzz_scorer,
        limit=limit,
        min_similarity=min_similarity,
        notfound_mode=notfound_mode,
        search_flag=search_flag,
        tiebreaker_mode=tiebreaker_mode,
    )

    return FuzzValidator(in_memory, examples=examples)
class Entity(BaseModel, Generic[T]):
    """A value with optional label, priority, and free-form metadata.

    Unknown attributes are transparently stored in / read from ``meta``
    via the ``__getattr__``/``__setattr__`` overrides below.
    """

    value: T = Field(
        ...,
        description="Value stored by Entity.",
    )
    label: Optional[str] = Field(
        default=None,
        description="Entity concept type such as PERSON, ORG, or GPE.",
    )
    meta: Optional[dict] = Field(
        default=None,
        description="Additional attributes accessible through dot-notation.",
    )
    priority: Optional[int] = Field(
        default=None,
        description="Tiebreaker rank (higher wins, None=0, negative allowed)",
    )

    def __eq__(self, other: Any):
        # Compare by value; accepts either another Entity or a raw value.
        other = getattr(other, "value", other)
        return self.value == other

    def resolve(self) -> T:
        """Value this entity resolves to; subclasses may override."""
        return self.value

    @property
    def rank(self) -> int:
        """Normalized by converting None to 0 and making lower better."""
        return -1 * (self.priority or 0)

    def __lt__(self, other: "Entity") -> bool:
        # Sort by rank first (higher priority first), then by value.
        # noinspection PyTypeChecker
        return (self.rank, self.value) < (other.rank, other.value)

    def __getattr__(self, key: str) -> Any:
        # Check if the key exists in the meta dictionary
        if self.meta is not None and key in self.meta:
            return self.meta[key]
        # Attribute not found; raise AttributeError
        raise AttributeError(
            f"{self.__class__.__name__!r} object has no attribute {key!r}"
        )

    def __setattr__(self, key: str, value: Any):
        # Check if the key is a predefined field in the BaseModel
        if key in self.model_fields:
            super().__setattr__(key, value)
        else:
            # Anything else lands in meta (created lazily on first write).
            self.meta = self.meta or {}
            self.meta[key] = value
124 | return EntitySource(source=(self, key)) 125 | 126 | self._load_if_necessary() 127 | return self.entities[key] 128 | 129 | def __iter__(self): 130 | self._load_if_necessary() 131 | return iter(self.entities) 132 | 133 | def _load_if_necessary(self): 134 | if not self.loaded: 135 | self.loaded = True 136 | if isinstance(self.source, tuple): 137 | parent, label = self.source 138 | self.entities = [e for e in parent if e.label == label] 139 | 140 | elif callable(self.source): 141 | self.entities = self.source() 142 | 143 | elif isinstance(self.source, Path): 144 | dialects = { 145 | "csv": self.from_csv, 146 | "tsv": self.from_tsv, 147 | "jsonl": self.from_jsonl, 148 | "txt": self.from_txt, 149 | } 150 | _, ext = self.source.name.lower().rsplit(".", maxsplit=1) 151 | f = dialects.get(ext) 152 | assert f is not None, f"No reader found for: {ext}" 153 | 154 | # noinspection PyArgumentList 155 | self.entities = f(self.source) 156 | 157 | @classmethod 158 | def from_jsonl(cls, path: Path) -> List[NamedEntity]: 159 | """ 160 | Constructs an EntityList from a .jsonl file of NamedEntity definitions. 161 | 162 | :param path: Path object pointing to the .jsonl file. 163 | :return: List of Entities. 164 | """ 165 | entities = [] 166 | with path.open("r") as fp: 167 | for line in fp: 168 | entity = NamedEntity.convert(json.loads(line)) 169 | entities.append(entity) 170 | return entities 171 | 172 | def from_csv(self, path: Path) -> List[NamedEntity]: 173 | return self.from_sv(path, csv.excel) 174 | 175 | def from_tsv(self, path: Path) -> List[NamedEntity]: 176 | return self.from_sv(path, csv.excel_tab) 177 | 178 | def from_txt(self, path: Path) -> List[NamedEntity]: 179 | return self.from_sv(path, csv.excel, fieldnames=["value"]) 180 | 181 | def from_sv( 182 | self, 183 | path: Path, 184 | dialect: Type[csv.Dialect], 185 | fieldnames=None, 186 | ) -> List[NamedEntity]: 187 | """ 188 | Constructs an EntityList from a .csv or .tsv file. 
import functools
import importlib
import os
from typing import Any, List, TypedDict, Callable, Optional

from fuzztypes import const


@functools.lru_cache(maxsize=None)
def lazy_import(
    library_name: str,
    attr_name: Optional[str] = None,
    return_none_on_error: bool = False,
) -> Any:
    """
    Lazily import a library or a specific attribute from a library.

    Args:
        library_name (str): The name of the library to import.
        attr_name (str, optional): Library attribute to import from library.
        return_none_on_error (bool, optional): Whether to return None if an
            import error occurs. Default is False, which raises an ImportError.

    Returns:
        The imported library or attribute, or None if an import error occurs
        and return_none_on_error is True.

    Raises:
        ImportError: If the library or attribute is not found and
            return_none_on_error is False.
    """
    # Registry metadata drives a helpful install hint in the error message.
    info = _lib_info.get(library_name, {})

    module_name = info.get("module_name", library_name)
    install_name = info.get("install_name", library_name)
    purpose = info.get("purpose", "")
    license_type = info.get("license", "")
    url = info.get("url", "")
    version = info.get("version", "")

    try:
        module = importlib.import_module(module_name)
        if attr_name:
            return getattr(module, attr_name)
        return module
    except ImportError as e:
        # Fix: leading space so the hint reads `pip install name (version X)`
        # instead of running the name and version together.
        version_info = f" (version {version})" if version else ""
        install = f"`pip install {install_name}{version_info}`"
        details = ", ".join(list(filter(None, [purpose, url, license_type])))
        details = f" ({details})" if details else ""
        msg = f"Import Failed: {install}{details}"

        if not info:
            additional_msg = (
                f"\nPlease add the library '{library_name}' to "
                f"the '_lib_info' dictionary in the 'lazy' "
                f"module."
            )
            msg += additional_msg

        if return_none_on_error:
            return None
        else:
            raise ImportError(msg) from e


@functools.lru_cache(maxsize=None)
def create_encoder(
    model_or_model_name: Optional[str], device: const.DeviceList
):
    """
    Return an encode(texts) function backed by a SentenceTransformer.

    The model is loaded (and cached to const.ModelsPath) on first use, not
    at creation time; lru_cache ensures one encoder per (model, device).

    :param model_or_model_name: Model name, or None for const.DefaultEncoder.
        (Fix: annotated Optional — the body explicitly handles None.)
    :param device: Device passed to sentence-transformers (e.g. cuda/mps).
    """

    def get_encoder():
        # Rebinds the closed-over name to the loaded model so subsequent
        # calls skip the load entirely.
        nonlocal model_or_model_name

        if model_or_model_name is None:
            model_or_model_name = const.DefaultEncoder

        if isinstance(model_or_model_name, str):
            sbert = lazy_import("sentence_transformers")
            local_path = os.path.join(const.ModelsPath, model_or_model_name)

            if not os.path.exists(local_path):  # pragma: no cover
                # First use: download the model, then save a local copy.
                encoder = sbert.SentenceTransformer(
                    model_or_model_name, device=device
                )
                encoder.save(local_path)
            else:
                encoder = sbert.SentenceTransformer(local_path)

            model_or_model_name = encoder

        return model_or_model_name

    def encode(texts: List[str]) -> List:
        """Encode a batch of texts into embedding vectors."""
        return get_encoder().encode(texts, device=device)

    return encode
class RankResult(TypedDict):
    """A single reranker hit: document text, relevance score, corpus index."""

    text: str
    score: float
    corpus_id: int


def create_reranker(
    model_name: str,
) -> Callable[[str, List[str], int], List[RankResult]]:
    """
    Creates a reranker function using the specified sentence transformer model.

    :param model_name: Name of the CrossEncoder model
        (e.g. "mixedbread-ai/mxbai-rerank-xsmall-v1")

    :return: rerank function Callable
    """

    def get_reranker():
        # Load from the local models cache when present; otherwise download
        # the CrossEncoder once and save it for subsequent runs.
        sbert = lazy_import("sentence_transformers")
        local_path = os.path.join(const.ModelsPath, model_name)

        if not os.path.exists(local_path):  # pragma: no cover
            reranker = sbert.CrossEncoder(model_name)
            reranker.save(local_path)
        else:
            reranker = sbert.CrossEncoder(local_path)

        return reranker

    def rerank(
        query: str,
        documents: List[str],
        top_k: int = 3,
    ) -> List[RankResult]:
        # Model loading is deferred to the first rerank call.
        reranker = get_reranker()
        results: List[RankResult] = reranker.rank(
            query, documents, return_documents=True, top_k=top_k
        )
        return results

    return rerank


# Registry of optional third-party libraries: module/install names plus
# metadata used by lazy_import to build actionable ImportError messages.
_lib_info = {
    "sentence-transformers": {
        "module_name": "sentence_transformers",
        "install_name": "sentence-transformers",
        "purpose": "Encoding sentences into high-dimensional vectors",
        "license": "Apache 2.0",
        "url": "https://github.com/UKPLab/sentence-transformers",
    },
    "unidecode": {
        "module_name": "unidecode",
        "install_name": "Unidecode",
        "purpose": "Converting Unicode text into ASCII equivalents",
        "license": "GPL",
        "url": "https://github.com/avian2/unidecode",
    },
    "anyascii": {
        "module_name": "anyascii",
        "install_name": "anyascii",
        "purpose": "Converting Unicode text into ASCII equivalents",
        "license": "ISC",
        "url": "https://github.com/anyascii/anyascii",
    },
    "rapidfuzz": {
        "module_name": "rapidfuzz",
        "install_name": "rapidfuzz",
        "purpose": "Performing fuzzy string matching",
        "license": "MIT",
        "url": "https://github.com/maxbachmann/RapidFuzz",
    },
    "dateparser": {
        "module_name": "dateparser",
        "install_name": "dateparser",
        "purpose": "Parsing dates from strings",
        "license": "BSD-3-Clause",
        "url": "https://github.com/scrapinghub/dateparser",
    },
    "emoji": {
        "module_name": "emoji",
        "install_name": "emoji",
        "purpose": "Handling and manipulating emoji characters",
        "license": "BSD",
        "url": "https://github.com/carpedm20/emoji",
    },
    "nameparser": {
        "module_name": "nameparser",
        "install_name": "nameparser",
        "purpose": "Parsing person names",
        "license": "LGPL",
        "url": "https://github.com/derek73/python-nameparser",
    },
    "number-parser": {
        "module_name": "number_parser",
        "install_name": "number-parser",
        "purpose": "Parsing numbers from strings",
        "license": "BSD-3-Clause",
        "url": "https://github.com/scrapinghub/number-parser",
    },
    "pycountry": {
        "module_name": "pycountry",
        "install_name": "pycountry",
        "purpose": "Provides ISO country, subdivision, language, and currency",
        "license": "LGPL 2.1",
        "url": "https://github.com/flyingcircusio/pycountry",
    },
    "lancedb": {
        "module_name": "lancedb",
        "install_name": "lancedb",
        "purpose": "High-performance, on-disk vector database",
        "license": "Apache 2.0",
        "url": "https://github.com/lancedb/lancedb",
    },
    "numpy": {
        "module_name": "numpy",
        "install_name": "numpy",
        "purpose": "Numerical computing in Python",
        "license": "BSD",
        "url": "https://numpy.org/",
    },
    "sklearn": {
        "module_name": "sklearn",
        "install_name": "scikit-learn",
        "purpose": "Machine learning in Python",
        "license": "BSD",
        "url": "https://scikit-learn.org/",
    },
}
in Python", 223 | "license": "BSD", 224 | "url": "https://scikit-learn.org/", 225 | }, 226 | } 227 | -------------------------------------------------------------------------------- /src/fuzztypes/on_disk.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, Union, List, Type, Optional, Any 2 | 3 | from pydantic import PositiveInt 4 | 5 | from fuzztypes import ( 6 | FuzzValidator, 7 | Match, 8 | MatchResult, 9 | NamedEntity, 10 | Record, 11 | const, 12 | flags, 13 | lazy, 14 | storage, 15 | ) 16 | 17 | accelerators = {"cuda", "mps"} 18 | 19 | 20 | class StoredValidatorStorage(storage.AbstractStorage): 21 | def __init__( 22 | self, 23 | name: str, 24 | source: Iterable, 25 | **kwargs, 26 | ): 27 | super().__init__(source, **kwargs) 28 | 29 | self.name = name 30 | self._conn = None 31 | self._table = None 32 | 33 | @property 34 | def conn(self) -> Any: 35 | if self._conn is None: 36 | lancedb = lazy.lazy_import("lancedb") 37 | self._conn = lancedb.connect(const.StoredValidatorPath) 38 | return self._conn 39 | 40 | @property 41 | def table(self) -> Any: 42 | if self._table is None: 43 | self._table = self.conn.open_table(self.name) 44 | return self._table 45 | 46 | def prepare(self, force_drop_table: bool = False): 47 | table_names = set(self.conn.table_names(limit=999_999_999)) 48 | 49 | if force_drop_table and self.name in table_names: 50 | self.conn.drop_table(self.name) 51 | table_names -= {self.name} 52 | 53 | if self.name not in table_names: 54 | try: 55 | self.create_table() 56 | except Exception as e: # pragma: no cover 57 | # if any issue occurs, drop the table and re-raise error 58 | # in the future, handle errors better 59 | self.conn.drop_table(self.name) 60 | raise e 61 | 62 | def create_table(self): 63 | pa = lazy.lazy_import("pyarrow") 64 | 65 | schema = pa.schema( 66 | [ 67 | pa.field("term", pa.string()), 68 | pa.field("norm_term", pa.string()), 69 | pa.field("entity", 
pa.string()), 70 | pa.field("is_alias", pa.string()), 71 | pa.field( 72 | "vector", 73 | pa.list_(pa.float32(), self.vect_dimensions), 74 | ), 75 | ] 76 | ) 77 | table = self.conn.create_table(self.name, schema=schema, exist_ok=True) 78 | 79 | # create records from source 80 | records = self.create_records() 81 | 82 | # calculate vectors in a batch 83 | if self.search_flag.is_semantic_ok: 84 | terms = [record.term for record in records] 85 | vectors = self.encode(terms) 86 | for record, vector in zip(records, vectors): 87 | record.vector = vector 88 | 89 | # add records in a batch to table 90 | table.add([record.model_dump() for record in records]) 91 | 92 | # adjust num_partitions and num_sub_vectors based on dataset size 93 | num_records = len(records) 94 | 95 | should_index = num_records > 256 and self.search_flag.is_semantic_ok 96 | 97 | if self.search_flag.is_fuzz_ok: # pragma: no cover 98 | table.create_fts_index("term") 99 | 100 | if should_index: # pragma: no cover 101 | num_partitions = min(num_records, 256) 102 | num_sub_vectors = min(num_records, 96) 103 | index_cache_size = min(num_records, 256) 104 | accelerator = self.device if self.device in accelerators else None 105 | 106 | table.create_index( 107 | metric="cosine", 108 | num_partitions=num_partitions, 109 | num_sub_vectors=num_sub_vectors, 110 | vector_column_name="vector", 111 | replace=True, 112 | index_cache_size=index_cache_size, 113 | accelerator=accelerator, 114 | ) 115 | 116 | def create_records(self): 117 | records = [] 118 | empty = [0.0] * self.vect_dimensions 119 | for item in self.source: 120 | entity = self.entity_type.convert(item) 121 | json = entity.model_dump_json(exclude_defaults=True) 122 | 123 | terms = [] 124 | is_alias = False 125 | 126 | if self.search_flag.is_name_ok: 127 | terms.append(entity.value) 128 | is_alias = True 129 | 130 | if self.search_flag.is_alias_ok: 131 | terms += entity.aliases 132 | 133 | for term in terms: 134 | # normalize for case sensitivity 135 | 
norm_term = self.normalize(term) 136 | 137 | # construct and add record 138 | if term: 139 | record = Record( 140 | entity=json, 141 | term=term, 142 | norm_term=norm_term, 143 | is_alias=is_alias, 144 | vector=empty, 145 | ) 146 | records.append(record) 147 | 148 | # 2nd term and beyond are aliases 149 | is_alias = True 150 | 151 | return records 152 | 153 | # 154 | # Getters 155 | # 156 | 157 | def get(self, key: str) -> MatchResult: 158 | where = f'term = "{key}"' 159 | match_list = self.run_query(key, where=where) 160 | 161 | if not match_list: 162 | where = f'norm_term = "{self.normalize(key)}"' 163 | match_list = self.run_query(key, where=where) 164 | 165 | if not match_list: 166 | if self.search_flag.is_fuzz_ok: 167 | match_list = self.get_by_fuzz(key) 168 | 169 | if self.search_flag.is_semantic_ok: 170 | match_list = self.get_by_semantic(key) 171 | 172 | matches = MatchResult(matches=match_list) 173 | return matches 174 | 175 | def get_by_fuzz(self, key: str) -> List[Match]: 176 | query = self.normalize(key) 177 | match_list = self.run_query(key, vector=query) 178 | 179 | # re-scoring using rapidfuzz on matches 180 | terms = [match.term for match in match_list] 181 | extract = self.rapidfuzz.process.extract( 182 | query, terms, scorer=self.fuzz_scorer 183 | ) 184 | for key, score, index in extract: 185 | match_list[index].score = score 186 | 187 | return match_list 188 | 189 | def get_by_semantic(self, key: str) -> List[Match]: 190 | vector = self.encode([key])[0] 191 | return self.run_query(key, vector=vector) 192 | 193 | def run_query(self, key, where=None, vector=None) -> List[Match]: 194 | qb = self.table.search(query=vector, vector_column_name="vector") 195 | 196 | if vector is not None and self.search_flag.is_semantic_ok: 197 | qb = qb.metric("cosine") 198 | 199 | qb = qb.select(["entity", "term", "norm_term", "is_alias"]) 200 | 201 | if where is not None: 202 | qb = qb.where(where, prefilter=True) 203 | 204 | qb = qb.limit(self.limit) 205 | data = 
qb.to_list() 206 | 207 | match_list = [] 208 | for item in data: 209 | if "_distance" in item: 210 | distance = item.pop("_distance", 0.0) 211 | similarity = 1 - distance 212 | score = (similarity + 1) * 50 213 | elif "score" in item: 214 | score = item.pop("score", 0.0) 215 | else: 216 | score = 100.0 # Exact match 217 | 218 | record = Record.model_validate(item) 219 | match = record.to_match( 220 | key=key, score=score, entity_type=self.entity_type 221 | ) 222 | match_list.append(match) 223 | 224 | return match_list 225 | 226 | 227 | def OnDiskValidator( 228 | identity: str, 229 | source: Iterable, 230 | *, 231 | case_sensitive: bool = False, 232 | device: Optional[const.DeviceList] = None, 233 | encoder: Union[Callable, str, object] = None, 234 | entity_type: Type[NamedEntity] = NamedEntity, 235 | examples: Optional[list] = None, 236 | fuzz_scorer: const.FuzzScorer = "token_sort_ratio", 237 | limit: PositiveInt = 10, 238 | min_similarity: float = 80.0, 239 | notfound_mode: const.NotFoundMode = "raise", 240 | search_flag: flags.SearchFlag = flags.DefaultSearch, 241 | tiebreaker_mode: const.TiebreakerMode = "raise", 242 | ): 243 | on_disk = StoredValidatorStorage( 244 | identity, 245 | source, 246 | case_sensitive=case_sensitive, 247 | device=device, 248 | entity_type=entity_type, 249 | fuzz_scorer=fuzz_scorer, 250 | limit=limit, 251 | min_similarity=min_similarity, 252 | notfound_mode=notfound_mode, 253 | search_flag=search_flag, 254 | encoder=encoder, 255 | tiebreaker_mode=tiebreaker_mode, 256 | ) 257 | 258 | return FuzzValidator(on_disk, examples=examples) 259 | -------------------------------------------------------------------------------- /tests/test_readme.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | from typing import Annotated 3 | 4 | from pydantic import BaseModel 5 | 6 | from fuzztypes import ( 7 | ASCII, 8 | Datetime, 9 | Email, 10 | Fuzzmoji, 11 | InMemoryValidator, 12 
| Integer, 13 | Person, 14 | RegexValidator, 15 | ZipCode, 16 | flags, 17 | ) 18 | 19 | 20 | # define a source, see EntitySource for using TSV, CSV, JSONL 21 | inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] 22 | 23 | # define a in memory validator with fuzz search enabled. 24 | Inventor = Annotated[ 25 | str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) 26 | ] 27 | 28 | # custom Regex type for finding twitter handles. 29 | Handle = Annotated[ 30 | str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) 31 | ] 32 | 33 | 34 | # define a Pydantic class with 9 fuzzy type attributes 35 | class Fuzzy(BaseModel): 36 | ascii: ASCII 37 | email: Email 38 | emoji: Fuzzmoji 39 | handle: Handle 40 | integer: Integer 41 | inventor: Inventor 42 | person: Person 43 | time: Datetime 44 | zipcode: ZipCode 45 | 46 | 47 | def test_full_model(): 48 | # create an instance of class Fuzzy 49 | obj = Fuzzy( 50 | ascii="άνθρωπος", 51 | email="John Doe ", 52 | emoji="thought bubble", 53 | handle="Ian Maurer (@imaurer)", 54 | integer="fifty-five", # type: ignore[arg-type] 55 | inventor="ada luvlace", 56 | person="mr. arthur h. 
fonzarelli (fonzie)", # type: ignore[arg-type] 57 | time="5am on Jan 1, 2025", # type: ignore[arg-type] 58 | zipcode="(Zipcode: 12345-6789)", 59 | ) 60 | 61 | # test the autocorrecting performed 62 | 63 | # greek for man: https://en.wiktionary.org/wiki/άνθρωπος 64 | assert obj.ascii == "anthropos" 65 | 66 | # extract email via regular expression 67 | assert obj.email == "jdoe@example.com" 68 | 69 | # fuzzy match "thought bubble" to "thought balloon" emoji 70 | assert obj.emoji == "💭" 71 | 72 | # simple, inline regex example (see above Handle type) 73 | assert obj.handle == "@imaurer" 74 | 75 | # convert integer word phrase to integer value 76 | assert obj.integer == 55 77 | 78 | # case-insensitive fuzzy match on lowercase, misspelled name 79 | assert obj.inventor == "Ada Lovelace" 80 | 81 | # human name parser (title, first, middle, last, suffix, nickname) 82 | assert str(obj.person) == "Mr. Arthur H. Fonzarelli (fonzie)" 83 | assert obj.person.short_name == "Arthur Fonzarelli" 84 | assert obj.person.nickname == "fonzie" 85 | assert obj.person.last == "Fonzarelli" 86 | 87 | # convert time phrase to datetime object 88 | assert obj.time.isoformat() == "2025-01-01T05:00:00" 89 | 90 | # extract zip5 or zip9 formats using regular expressions 91 | assert obj.zipcode == "12345-6789" 92 | 93 | # print JSON on success 94 | assert obj.model_dump() == { 95 | "ascii": "anthropos", 96 | "email": "jdoe@example.com", 97 | "emoji": "💭", 98 | "handle": "@imaurer", 99 | "integer": 55, 100 | "inventor": "Ada Lovelace", 101 | "person": { 102 | "first": "Arthur", 103 | "init_format": "{first} {middle} {last}", 104 | "last": "Fonzarelli", 105 | "middle": "H.", 106 | "name_format": "{title} {first} {middle} {last} {suffix} " 107 | "({nickname})", 108 | "nickname": "fonzie", 109 | "suffix": "", 110 | "title": "Mr.", 111 | }, 112 | "time": datetime(2025, 1, 1, 5), 113 | "zipcode": "12345-6789", 114 | } 115 | 116 | 117 | def test_json_schema(): 118 | data = Fuzzy.model_json_schema() 119 | 
expected_data = { 120 | "$defs": { 121 | "PersonModel": { 122 | "properties": { 123 | "first": { 124 | "default": "", 125 | "title": "First", 126 | "type": "string", 127 | }, 128 | "init_format": { 129 | "default": "{first} " "{middle} " "{last}", 130 | "title": "Init " "Format", 131 | "type": "string", 132 | }, 133 | "last": {"default": "", "title": "Last", "type": "string"}, 134 | "middle": { 135 | "default": "", 136 | "title": "Middle", 137 | "type": "string", 138 | }, 139 | "name_format": { 140 | "default": "{title} " 141 | "{first} " 142 | "{middle} " 143 | "{last} " 144 | "{suffix} " 145 | "({nickname})", 146 | "title": "Name " "Format", 147 | "type": "string", 148 | }, 149 | "nickname": { 150 | "default": "", 151 | "title": "Nickname", 152 | "type": "string", 153 | }, 154 | "suffix": { 155 | "default": "", 156 | "title": "Suffix", 157 | "type": "string", 158 | }, 159 | "title": { 160 | "default": "", 161 | "title": "Title", 162 | "type": "string", 163 | }, 164 | }, 165 | "title": "PersonModel", 166 | "type": "object", 167 | } 168 | }, 169 | "properties": { 170 | "ascii": {"title": "Ascii", "type": "string"}, 171 | "email": { 172 | "examples": ["user@example.com"], 173 | "title": "Email", 174 | "type": "string", 175 | }, 176 | "emoji": {"title": "Emoji", "type": "string"}, 177 | "handle": { 178 | "examples": ["@genomoncology"], 179 | "title": "Handle", 180 | "type": "string", 181 | }, 182 | "integer": {"title": "Integer", "type": "integer"}, 183 | "inventor": {"title": "Inventor", "type": "string"}, 184 | "person": {"$ref": "#/$defs/PersonModel"}, 185 | "time": {"format": "date-time", "title": "Time", "type": "string"}, 186 | "zipcode": { 187 | "examples": ["12345", "12345-6789"], 188 | "title": "Zipcode", 189 | "type": "string", 190 | }, 191 | }, 192 | "required": [ 193 | "ascii", 194 | "email", 195 | "emoji", 196 | "handle", 197 | "integer", 198 | "inventor", 199 | "person", 200 | "time", 201 | "zipcode", 202 | ], 203 | "title": "Fuzzy", 204 | "type": 
"object", 205 | } 206 | assert data == expected_data 207 | 208 | 209 | def test_in_memory_validator(): 210 | # Create a custom annotation type for matching fruits in memory 211 | fruits = ["Apple", "Banana", "Orange"] 212 | Fruit = Annotated[ 213 | str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) 214 | ] 215 | 216 | class MyModel(BaseModel): 217 | fruit: Fruit 218 | 219 | model = MyModel(fruit="appel") 220 | assert model.fruit == "Apple" 221 | 222 | 223 | def test_on_disk_validator(): 224 | from fuzztypes import OnDiskValidator 225 | 226 | # Create a custom annotation type for matching countries stored on disk 227 | countries = [ 228 | ("United States", "US"), 229 | ("United Kingdom", "UK"), 230 | ("Canada", "CA"), 231 | ] 232 | Country = Annotated[str, OnDiskValidator("Country", countries)] 233 | 234 | class MyModel(BaseModel): 235 | country: Country 236 | 237 | assert MyModel(country="Canada").country == "Canada" 238 | assert MyModel(country="US").country == "United States" 239 | 240 | 241 | def test_date_validators(): 242 | from fuzztypes import DateValidator, DatetimeValidator 243 | 244 | MyDate = Annotated[date, DateValidator(date_order="MDY")] 245 | MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] 246 | 247 | class MyModel(BaseModel): 248 | date: MyDate 249 | time: MyTime 250 | 251 | model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") # type: ignore 252 | assert model.date.isoformat() == "2023-01-01" 253 | assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" 254 | 255 | 256 | def test_fuzz_validator(): 257 | from fuzztypes import FuzzValidator 258 | 259 | # Create a custom annotation type that converts a value to uppercase 260 | UpperCase = Annotated[str, FuzzValidator(str.upper)] 261 | 262 | class MyModel(BaseModel): 263 | name: UpperCase 264 | 265 | model = MyModel(name="john") 266 | assert model.name == "JOHN" 267 | 268 | 269 | def test_regex_validator(): 270 | from fuzztypes import RegexValidator 271 | 272 | # 
Create a custom annotation type for matching email addresses 273 | IPAddress = Annotated[ 274 | str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") 275 | ] 276 | 277 | class MyModel(BaseModel): 278 | ip_address: IPAddress 279 | 280 | model = MyModel(ip_address="My internet IP address is 192.168.127.12") 281 | assert model.ip_address == "192.168.127.12" 282 | 283 | 284 | def test_validate_functions(): 285 | from fuzztypes import validate_python, validate_json, resolve_entity, Date 286 | 287 | # validate python 288 | assert validate_python(Integer, "two hundred") == 200 289 | 290 | # validate json 291 | class MyModel(BaseModel): 292 | date: Date 293 | 294 | json = '{"date": "July 4th 2021"}' 295 | obj = validate_json(MyModel, json) 296 | assert obj.date.isoformat() == "2021-07-04" 297 | 298 | 299 | def test_resolve_entity(): 300 | from fuzztypes import resolve_entity, InMemoryValidator 301 | 302 | elements = ["earth", "fire", "water", "air"] 303 | ElementValidator = InMemoryValidator(elements) 304 | Element = Annotated[str, ElementValidator] 305 | 306 | # resolve using validator 307 | entity = resolve_entity(ElementValidator, "EARTH") 308 | assert entity is not None 309 | assert entity.model_dump() == { 310 | "aliases": [], 311 | "label": None, 312 | "meta": None, 313 | "priority": None, 314 | "value": "earth", 315 | } 316 | 317 | # resolve using annotation type 318 | entity = resolve_entity(Element, "Air") 319 | assert entity is not None 320 | assert entity.model_dump(exclude_defaults=True) == {"value": "air"} 321 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile pyproject.toml --extra test --extra local --extra ext -o requirements-dev.txt 3 | annotated-types==0.6.0 4 | # via pydantic 5 | anyascii==0.3.2 6 | anyio==4.3.0 7 | # via 8 | # httpx 
9 | # jupyter-server 10 | appnope==0.1.4 11 | # via ipykernel 12 | argon2-cffi==23.1.0 13 | # via jupyter-server 14 | argon2-cffi-bindings==21.2.0 15 | # via argon2-cffi 16 | arrow==1.3.0 17 | # via isoduration 18 | asttokens==2.4.1 19 | # via stack-data 20 | async-lru==2.0.4 21 | # via jupyterlab 22 | attrs==23.2.0 23 | # via 24 | # jsonschema 25 | # lancedb 26 | # number-parser 27 | # referencing 28 | babel==2.14.0 29 | # via jupyterlab-server 30 | beautifulsoup4==4.12.3 31 | # via nbconvert 32 | bleach==6.1.0 33 | # via nbconvert 34 | build==1.1.1 35 | cachetools==5.3.3 36 | # via lancedb 37 | certifi==2024.2.2 38 | # via 39 | # httpcore 40 | # httpx 41 | # requests 42 | cffi==1.16.0 43 | # via argon2-cffi-bindings 44 | charset-normalizer==3.3.2 45 | # via requests 46 | click==8.1.7 47 | # via lancedb 48 | comm==0.2.2 49 | # via 50 | # ipykernel 51 | # ipywidgets 52 | coverage==7.4.3 53 | dateparser==1.2.0 54 | debugpy==1.8.1 55 | # via ipykernel 56 | decorator==5.1.1 57 | # via 58 | # ipython 59 | # retry 60 | defusedxml==0.7.1 61 | # via nbconvert 62 | deprecation==2.1.0 63 | # via lancedb 64 | docutils==0.20.1 65 | # via readme-renderer 66 | emoji==2.10.1 67 | exceptiongroup==1.2.0 68 | # via 69 | # anyio 70 | # ipython 71 | # pytest 72 | executing==2.0.1 73 | # via stack-data 74 | fastjsonschema==2.19.1 75 | # via nbformat 76 | filelock==3.13.1 77 | # via 78 | # huggingface-hub 79 | # torch 80 | # transformers 81 | fqdn==1.5.1 82 | # via jsonschema 83 | fsspec==2024.2.0 84 | # via 85 | # huggingface-hub 86 | # torch 87 | h11==0.14.0 88 | # via httpcore 89 | httpcore==1.0.4 90 | # via httpx 91 | httpx==0.27.0 92 | # via jupyterlab 93 | huggingface-hub==0.21.3 94 | # via 95 | # sentence-transformers 96 | # tokenizers 97 | # transformers 98 | idna==3.6 99 | # via 100 | # anyio 101 | # httpx 102 | # jsonschema 103 | # requests 104 | importlib-metadata==7.0.2 105 | # via 106 | # build 107 | # jupyter-client 108 | # jupyter-lsp 109 | # jupyterlab 110 | # 
jupyterlab-server 111 | # keyring 112 | # nbconvert 113 | # twine 114 | iniconfig==2.0.0 115 | # via pytest 116 | ipykernel==6.29.3 117 | # via 118 | # jupyter 119 | # jupyter-console 120 | # jupyterlab 121 | # qtconsole 122 | ipython==8.18.1 123 | # via 124 | # ipykernel 125 | # ipywidgets 126 | # jupyter-console 127 | ipywidgets==8.1.2 128 | # via jupyter 129 | isoduration==20.11.0 130 | # via jsonschema 131 | jaraco-classes==3.3.1 132 | # via keyring 133 | jedi==0.19.1 134 | # via ipython 135 | jinja2==3.1.3 136 | # via 137 | # jupyter-server 138 | # jupyterlab 139 | # jupyterlab-server 140 | # nbconvert 141 | # torch 142 | joblib==1.3.2 143 | # via scikit-learn 144 | json5==0.9.24 145 | # via jupyterlab-server 146 | jsonpointer==2.4 147 | # via jsonschema 148 | jsonschema==4.21.1 149 | # via 150 | # jupyter-events 151 | # jupyterlab-server 152 | # nbformat 153 | jsonschema-specifications==2023.12.1 154 | # via jsonschema 155 | jupyter==1.0.0 156 | jupyter-client==8.6.1 157 | # via 158 | # ipykernel 159 | # jupyter-console 160 | # jupyter-server 161 | # nbclient 162 | # qtconsole 163 | jupyter-console==6.6.3 164 | # via jupyter 165 | jupyter-core==5.7.2 166 | # via 167 | # ipykernel 168 | # jupyter-client 169 | # jupyter-console 170 | # jupyter-server 171 | # jupyterlab 172 | # nbclient 173 | # nbconvert 174 | # nbformat 175 | # qtconsole 176 | jupyter-events==0.10.0 177 | # via jupyter-server 178 | jupyter-lsp==2.2.4 179 | # via jupyterlab 180 | jupyter-server==2.13.0 181 | # via 182 | # jupyter-lsp 183 | # jupyterlab 184 | # jupyterlab-server 185 | # notebook 186 | # notebook-shim 187 | jupyter-server-terminals==0.5.3 188 | # via jupyter-server 189 | jupyterlab==4.1.5 190 | # via notebook 191 | jupyterlab-pygments==0.3.0 192 | # via nbconvert 193 | jupyterlab-server==2.25.4 194 | # via 195 | # jupyterlab 196 | # notebook 197 | jupyterlab-widgets==3.0.10 198 | # via ipywidgets 199 | keyring==24.3.1 200 | # via twine 201 | lancedb==0.6.2 202 | 
markdown-it-py==3.0.0 203 | # via rich 204 | markupsafe==2.1.5 205 | # via 206 | # jinja2 207 | # nbconvert 208 | matplotlib-inline==0.1.6 209 | # via 210 | # ipykernel 211 | # ipython 212 | mdurl==0.1.2 213 | # via markdown-it-py 214 | mistune==3.0.2 215 | # via nbconvert 216 | more-itertools==10.2.0 217 | # via jaraco-classes 218 | mpmath==1.3.0 219 | # via sympy 220 | mypy==1.9.0 221 | mypy-extensions==1.0.0 222 | # via mypy 223 | nameparser==1.1.3 224 | nbclient==0.10.0 225 | # via nbconvert 226 | nbconvert==7.16.2 227 | # via 228 | # jupyter 229 | # jupyter-server 230 | nbformat==5.10.3 231 | # via 232 | # jupyter-server 233 | # nbclient 234 | # nbconvert 235 | nest-asyncio==1.6.0 236 | # via ipykernel 237 | networkx==3.2.1 238 | # via torch 239 | nh3==0.2.15 240 | # via readme-renderer 241 | notebook==7.1.2 242 | # via jupyter 243 | notebook-shim==0.2.4 244 | # via 245 | # jupyterlab 246 | # notebook 247 | number-parser==0.3.2 248 | numpy==1.26.4 249 | # via 250 | # pyarrow 251 | # pylance 252 | # scikit-learn 253 | # scipy 254 | # sentence-transformers 255 | # transformers 256 | overrides==7.7.0 257 | # via 258 | # jupyter-server 259 | # lancedb 260 | packaging==23.2 261 | # via 262 | # build 263 | # deprecation 264 | # huggingface-hub 265 | # ipykernel 266 | # jupyter-server 267 | # jupyterlab 268 | # jupyterlab-server 269 | # nbconvert 270 | # pytest 271 | # qtconsole 272 | # qtpy 273 | # transformers 274 | pandocfilters==1.5.1 275 | # via nbconvert 276 | parso==0.8.3 277 | # via jedi 278 | pexpect==4.9.0 279 | # via ipython 280 | pillow==10.2.0 281 | # via sentence-transformers 282 | pip==24.0 283 | pkginfo==1.10.0 284 | # via twine 285 | platformdirs==4.2.0 286 | # via jupyter-core 287 | pluggy==1.4.0 288 | # via pytest 289 | prometheus-client==0.20.0 290 | # via jupyter-server 291 | prompt-toolkit==3.0.43 292 | # via 293 | # ipython 294 | # jupyter-console 295 | psutil==5.9.8 296 | # via ipykernel 297 | ptyprocess==0.7.0 298 | # via 299 | # pexpect 300 
| # terminado 301 | pure-eval==0.2.2 302 | # via stack-data 303 | py==1.11.0 304 | # via retry 305 | pyarrow==15.0.0 306 | # via pylance 307 | pycparser==2.21 308 | # via cffi 309 | pydantic==2.6.2 310 | # via lancedb 311 | pydantic-core==2.16.3 312 | # via pydantic 313 | pygments==2.17.2 314 | # via 315 | # ipython 316 | # jupyter-console 317 | # nbconvert 318 | # qtconsole 319 | # readme-renderer 320 | # rich 321 | pylance==0.10.2 322 | # via lancedb 323 | pyproject-hooks==1.0.0 324 | # via build 325 | pytest==8.0.1 326 | # via pytest-mock 327 | pytest-mock==3.12.0 328 | python-dateutil==2.9.0.post0 329 | # via 330 | # arrow 331 | # dateparser 332 | # jupyter-client 333 | python-json-logger==2.0.7 334 | # via jupyter-events 335 | pytz==2024.1 336 | # via dateparser 337 | pyyaml==6.0.1 338 | # via 339 | # huggingface-hub 340 | # jupyter-events 341 | # lancedb 342 | # transformers 343 | pyzmq==25.1.2 344 | # via 345 | # ipykernel 346 | # jupyter-client 347 | # jupyter-console 348 | # jupyter-server 349 | # qtconsole 350 | qtconsole==5.5.1 351 | # via jupyter 352 | qtpy==2.4.1 353 | # via qtconsole 354 | rapidfuzz==3.6.1 355 | ratelimiter==1.2.0.post0 356 | # via lancedb 357 | readme-renderer==43.0 358 | # via twine 359 | referencing==0.34.0 360 | # via 361 | # jsonschema 362 | # jsonschema-specifications 363 | # jupyter-events 364 | regex==2023.12.25 365 | # via 366 | # dateparser 367 | # transformers 368 | requests==2.31.0 369 | # via 370 | # huggingface-hub 371 | # jupyterlab-server 372 | # lancedb 373 | # requests-toolbelt 374 | # transformers 375 | # twine 376 | requests-toolbelt==1.0.0 377 | # via twine 378 | retry==0.9.2 379 | # via lancedb 380 | rfc3339-validator==0.1.4 381 | # via 382 | # jsonschema 383 | # jupyter-events 384 | rfc3986==2.0.0 385 | # via twine 386 | rfc3986-validator==0.1.1 387 | # via 388 | # jsonschema 389 | # jupyter-events 390 | rich==13.7.1 391 | # via twine 392 | rpds-py==0.18.0 393 | # via 394 | # jsonschema 395 | # referencing 396 | 
safetensors==0.4.2 397 | # via transformers 398 | scikit-learn==1.4.1.post1 399 | # via sentence-transformers 400 | scipy==1.12.0 401 | # via 402 | # scikit-learn 403 | # sentence-transformers 404 | semver==3.0.2 405 | # via lancedb 406 | send2trash==1.8.2 407 | # via jupyter-server 408 | sentence-transformers==2.5.1 409 | setuptools==69.1.1 410 | six==1.16.0 411 | # via 412 | # asttokens 413 | # bleach 414 | # python-dateutil 415 | # rfc3339-validator 416 | sniffio==1.3.1 417 | # via 418 | # anyio 419 | # httpx 420 | soupsieve==2.5 421 | # via beautifulsoup4 422 | stack-data==0.6.3 423 | # via ipython 424 | sympy==1.12 425 | # via torch 426 | tantivy==0.21.0 427 | terminado==0.18.1 428 | # via 429 | # jupyter-server 430 | # jupyter-server-terminals 431 | threadpoolctl==3.3.0 432 | # via scikit-learn 433 | tinycss2==1.2.1 434 | # via nbconvert 435 | tokenizers==0.15.2 436 | # via transformers 437 | tomli==2.0.1 438 | # via 439 | # build 440 | # coverage 441 | # jupyterlab 442 | # mypy 443 | # pyproject-hooks 444 | # pytest 445 | torch==2.2.1 446 | # via sentence-transformers 447 | tornado==6.4 448 | # via 449 | # ipykernel 450 | # jupyter-client 451 | # jupyter-server 452 | # jupyterlab 453 | # notebook 454 | # terminado 455 | tqdm==4.66.2 456 | # via 457 | # huggingface-hub 458 | # lancedb 459 | # sentence-transformers 460 | # transformers 461 | traitlets==5.14.1 462 | # via 463 | # comm 464 | # ipykernel 465 | # ipython 466 | # ipywidgets 467 | # jupyter-client 468 | # jupyter-console 469 | # jupyter-core 470 | # jupyter-events 471 | # jupyter-server 472 | # jupyterlab 473 | # matplotlib-inline 474 | # nbclient 475 | # nbconvert 476 | # nbformat 477 | # qtconsole 478 | transformers==4.38.2 479 | # via sentence-transformers 480 | twine==5.0.0 481 | types-python-dateutil==2.9.0.20240316 482 | # via arrow 483 | typing-extensions==4.9.0 484 | # via 485 | # anyio 486 | # async-lru 487 | # huggingface-hub 488 | # ipython 489 | # mypy 490 | # pydantic 491 | # 
pydantic-core 492 | # torch 493 | tzlocal==5.2 494 | # via dateparser 495 | unidecode==1.3.8 496 | uri-template==1.3.0 497 | # via jsonschema 498 | urllib3==2.2.1 499 | # via 500 | # requests 501 | # twine 502 | wcwidth==0.2.13 503 | # via prompt-toolkit 504 | webcolors==1.13 505 | # via jsonschema 506 | webencodings==0.5.1 507 | # via 508 | # bleach 509 | # tinycss2 510 | websocket-client==1.7.0 511 | # via jupyter-server 512 | widgetsnbextension==4.0.10 513 | # via ipywidgets 514 | zipp==3.17.0 515 | # via importlib-metadata 516 | -------------------------------------------------------------------------------- /tests/data/simonw_tags.csv: -------------------------------------------------------------------------------- 1 | value,priority 2 | 24ways,13 3 | 2d,3 4 | 37signals,12 5 | 3d,14 6 | 4chan,4 7 | 500startups,4 8 | aaronstraupcope,4 9 | aaronswartz,3 10 | abtesting,6 11 | accessibility,30 12 | accounts,4 13 | acid3,5 14 | acme,3 15 | actionscript,3 16 | activemq,3 17 | activitypub,7 18 | adamgomaa,3 19 | adamjohnson,4 20 | adobe,29 21 | adrianholovaty,15 22 | ads,5 23 | advertising,10 24 | agile,3 25 | ai,468 26 | aiassistedprogramming,11 27 | airships,10 28 | airtable,7 29 | ajax,59 30 | ajaxian,3 31 | alexgarcia,22 32 | alexgaynor,5 33 | alexpayne,5 34 | alexrussell,27 35 | alfeaton,4 36 | algorithms,12 37 | alistapart,8 38 | alpha,4 39 | alt,3 40 | amazon,63 41 | amazonaws,3 42 | amazonwebservices,7 43 | ami,4 44 | amqp,4 45 | analytics,8 46 | andrejkarpathy,10 47 | andrewgodwin,18 48 | andrewturner,5 49 | android,8 50 | andybaio,4 51 | andybudd,4 52 | anildash,9 53 | animation,7 54 | annevankesteren,5 55 | annotatedreleasenotes,22 56 | annotatedtalks,17 57 | anthropic,17 58 | antonzhiyanov,4 59 | aol,13 60 | apache,35 61 | api,28 62 | apidesign,9 63 | apis,81 64 | apollo,5 65 | appengine,33 66 | apple,90 67 | applephotos,3 68 | applescript,4 69 | appstore,10 70 | aprilfools,4 71 | aralbalkan,4 72 | architecture,9 73 | archive,5 74 | archives,3 75 | 
archiving,6 76 | arminronacher,8 77 | arstechnica,5 78 | art,8 79 | asciiart,3 80 | asf,3 81 | asgi,19 82 | askmetafilter,57 83 | aspdotnet,3 84 | aspnet,4 85 | assafarkin,5 86 | astronomy,3 87 | async,39 88 | athena,3 89 | atmedia,6 90 | atmedia07,3 91 | atmedia2007,4 92 | atom,21 93 | audio,10 94 | augmentedreality,3 95 | australia,3 96 | authentication,13 97 | autocomplete,6 98 | autoescaping,4 99 | avibryant,4 100 | aws,42 101 | azure,4 102 | backbone,3 103 | backups,6 104 | badges,3 105 | bakeddata,9 106 | bandwidth,3 107 | barackobama,3 108 | barcamp,6 109 | barcamplondon,3 110 | bard,13 111 | basecamp,6 112 | bash,8 113 | bayeux,7 114 | bazaar,6 115 | bbauth,3 116 | bbc,29 117 | bbcnews,4 118 | beautifulsoup,6 119 | bellingcat,3 120 | benchmarking,4 121 | benchmarks,5 122 | benfirshman,4 123 | bengoldacre,3 124 | benjohnson,9 125 | benlaurie,5 126 | benward,3 127 | benwelsh,8 128 | berkeleydb,3 129 | beta,3 130 | bigdata,14 131 | bigtable,3 132 | billdehora,5 133 | billgates,6 134 | binary,5 135 | bing,24 136 | bitcoin,10 137 | black,5 138 | blainecook,7 139 | blockchain,7 140 | blocks,4 141 | blogger,4 142 | blogging,39 143 | blogs,3 144 | bloom,5 145 | bloomfilters,4 146 | boardgames,3 147 | bobippolito,5 148 | boingboing,4 149 | bookmarklet,6 150 | bookmarklets,10 151 | books,17 152 | bradfitzpatrick,11 153 | bradneuberg,6 154 | branching,3 155 | branding,3 156 | brandonaaron,3 157 | brandurleach,12 158 | brendaneich,4 159 | brettaylor,5 160 | brighton,12 161 | brothercake,3 162 | browsers,73 163 | bruceschneier,27 164 | bsd,3 165 | buckettesting,3 166 | bugs,8 167 | bunniehuang,5 168 | business,22 169 | buzz,3 170 | c,28 171 | cabelsasser,3 172 | cache,3 173 | caching,45 174 | cairo,5 175 | calendars,5 176 | calhenderson,7 177 | california,3 178 | callbacks,5 179 | camino,6 180 | canon,3 181 | canvas,37 182 | cappuccino,3 183 | captcha,6 184 | cardspace,7 185 | careers,32 186 | cartography,3 187 | cassandra,7 188 | cdn,7 189 | cern,6 190 | certificates,9 
191 | cfp,4 192 | chaining,3 193 | charlesbabbage,3 194 | charlesleifer,4 195 | charlesmiller,7 196 | charliestross,4 197 | chatgpt,90 198 | cheese,8 199 | cherrypy,4 200 | chicagocrime,3 201 | china,7 202 | chrisamico,3 203 | chrismessina,7 204 | chrisshiflett,6 205 | christianheilmann,5 206 | christmas,3 207 | christopherlenz,8 208 | chrome,19 209 | chromeframe,5 210 | chromium,3 211 | classes,4 212 | claude,17 213 | clayshirky,5 214 | cli,10 215 | clickhouse,4 216 | clickjacking,11 217 | closure,3 218 | closures,8 219 | cloud,8 220 | cloudcomputing,17 221 | cloudflare,10 222 | cloudfront,4 223 | cloudrun,8 224 | cms,10 225 | co2,3 226 | code,3 227 | codecs,3 228 | codereview,3 229 | coffeescript,3 230 | collaboration,8 231 | colour,4 232 | comcast,3 233 | comet,57 234 | cometd,3 235 | commandline,8 236 | commentspam,4 237 | communication,8 238 | community,21 239 | compilers,13 240 | complexity,3 241 | compression,3 242 | computerhistory,4 243 | computers,3 244 | computerscience,11 245 | computervision,13 246 | concurrency,12 247 | conditionalcomments,3 248 | conference,12 249 | conferences,181 250 | conspiracy,3 251 | contentapi,4 252 | contenttypes,3 253 | continuousdeployment,10 254 | continuousintegration,16 255 | cookiecutter,7 256 | cookies,25 257 | cooking,5 258 | copilot,4 259 | copy,3 260 | copyright,11 261 | copywriting,3 262 | corydoctorow,4 263 | cosmopolitan,6 264 | couchdb,27 265 | counters,3 266 | covid19,16 267 | cplusplus,6 268 | crawling,3 269 | crdt,6 270 | creativecommons,8 271 | crime,3 272 | cron,3 273 | crossdomain,10 274 | crossdomainxml,6 275 | crowdsourcing,15 276 | cryptography,20 277 | csharp,4 278 | csrf,47 279 | css,143 280 | css3,11 281 | cssaintrocketscience,9 282 | csv,31 283 | ctypes,6 284 | curl,8 285 | curse,3 286 | cursegaming,3 287 | cvs,3 288 | d3,11 289 | dabbledb,5 290 | dalle,12 291 | damienkatz,9 292 | danahboyd,3 293 | dancatt,6 294 | dannyobrien,4 295 | danwebb,6 296 | dareobasanjo,9 297 | data,21 298 | databases,93 
299 | datablog,3 300 | datagov,3 301 | datagovuk,3 302 | datajournalism,37 303 | dataportability,6 304 | datascience,14 305 | datasette,384 306 | datasettecloud,34 307 | datasettedesktop,6 308 | datasettelite,15 309 | datastore,7 310 | datastructures,4 311 | datauri,6 312 | dates,6 313 | datetime,6 314 | daveshea,3 315 | davethomas,4 316 | davewiner,24 317 | davidbeazley,4 318 | davidcramer,8 319 | davidmbeazley,3 320 | davidrecordon,6 321 | dconstruct,5 322 | deanedwards,4 323 | debian,6 324 | debugger,5 325 | debugging,34 326 | decentralisation,4 327 | decorators,6 328 | delicious,6 329 | denialofservice,3 330 | deno,15 331 | deployment,25 332 | derekwillis,5 333 | design,61 334 | development,4 335 | devfort,4 336 | dewittclinton,3 337 | dickcostolo,4 338 | diff,5 339 | digg,16 340 | digitalocean,4 341 | dionalmaer,3 342 | directedidentity,5 343 | discord,5 344 | django,538 345 | djangobook,5 346 | djangocon,18 347 | djangocon08,3 348 | djangodebugtoolbar,3 349 | djangoorm,3 350 | djangopeople,12 351 | djangopony,4 352 | djangosnippets,6 353 | djangosqldashboard,11 354 | djugl,4 355 | dns,24 356 | docker,44 357 | documentary,3 358 | documentation,46 359 | documentcloud,4 360 | dogpile,7 361 | dogsheep,31 362 | dojo,40 363 | dojox,4 364 | dom,12 365 | domains,9 366 | domcontentloaded,4 367 | domscripting,3 368 | dontbeevil,3 369 | dopplr,12 370 | dotnet,4 371 | douglascrockford,14 372 | draganddrop,3 373 | dragndrop,5 374 | drawing,4 375 | dreamhost,5 376 | drewbreunig,3 377 | drewmclellan,5 378 | drichardhipp,7 379 | drizzle,3 380 | drm,15 381 | dropbox,9 382 | drupal,8 383 | duckdb,8 384 | duncanrobertson,4 385 | dustindiaz,5 386 | dynamiclanguages,3 387 | ebs,3 388 | ec2,42 389 | ecmascript,5 390 | ecommerce,8 391 | edddumbill,4 392 | edeliot,3 393 | edfelten,6 394 | editor,3 395 | education,14 396 | effbot,4 397 | egypt,3 398 | ekranoplans,4 399 | elasticsearch,11 400 | elections,9 401 | electron,8 402 | electronicvoting,3 403 | elementtree,4 404 | 
elliotterustyharold,6 405 | email,31 406 | embedding,3 407 | embeddings,23 408 | emoji,5 409 | encoding,5 410 | encryption,6 411 | enterprise,8 412 | entrepreneurship,51 413 | ericflorenzano,3 414 | ericholscher,8 415 | ericmeyer,6 416 | erlang,26 417 | errors,3 418 | escaping,4 419 | etags,3 420 | etech,4 421 | ethanmollick,11 422 | etherpad,3 423 | ethics,70 424 | etiquette,3 425 | eurooscon,3 426 | europe,3 427 | europython,4 428 | eventio,5 429 | eventlet,5 430 | eventmachine,3 431 | events,105 432 | everyblock,8 433 | evoting,3 434 | explorables,20 435 | extensions,4 436 | fabric,8 437 | facebook,107 438 | facebookgraphsearch,3 439 | facetedsearch,4 440 | fakestevejobs,3 441 | faq,4 442 | fastai,3 443 | fastcgi,3 444 | favicon,7 445 | featureflags,5 446 | fediverse,7 447 | feedburner,4 448 | feeds,6 449 | ffs,3 450 | finetuning,13 451 | firebug,22 452 | firecracker,3 453 | fireeagle,15 454 | firefox,52 455 | firefox3,7 456 | flash,69 457 | flask,5 458 | flex,7 459 | flickr,74 460 | flickrplaces,3 461 | fluiddb,4 462 | fly,27 463 | follow,3 464 | fonts,10 465 | foocamp,3 466 | food,11 467 | forms,9 468 | fowa,12 469 | fowa2007,3 470 | fowa2008,5 471 | framebusting,6 472 | frameworks,29 473 | francoischollet,4 474 | fredriklundh,6 475 | freebase,12 476 | friendfeed,11 477 | friends,3 478 | frontend,16 479 | fulltext,7 480 | fulltextsearch,10 481 | functional,4 482 | functionalprogramming,4 483 | funding,41 484 | funny,69 485 | fuse,4 486 | futureofwebapps,12 487 | gadgets,3 488 | games,19 489 | gaming,4 490 | garethrushgrove,8 491 | gcap,7 492 | gearman,3 493 | gears,7 494 | gecko,4 495 | geeks,3 496 | gemini,5 497 | generativeai,404 498 | generators,7 499 | genetics,5 500 | geo,12 501 | geocoding,11 502 | geodata,3 503 | geodjango,8 504 | geoffreylitt,3 505 | geoip,3 506 | geojson,11 507 | geolocation,6 508 | geonames,7 509 | geoplanet,7 510 | geospatial,9 511 | getlatlon,3 512 | gif,4 513 | gifs,4 514 | gil,11 515 | gis,41 516 | git,40 517 | githistory,6 518 | 
github,120 519 | githubactions,41 520 | githubcodespaces,9 521 | gitscraping,27 522 | glitch,11 523 | glyph,3 524 | gmail,21 525 | go,25 526 | google,289 527 | googleappengine,14 528 | googlecharts,9 529 | googlechrome,9 530 | googlecode,5 531 | googledocs,7 532 | googledoctype,3 533 | googlegears,8 534 | googlemaps,52 535 | googlemapsapi,4 536 | googlereader,5 537 | googlevideo,5 538 | googlewave,3 539 | government,8 540 | gpl,4 541 | gps,10 542 | gpt3,65 543 | gpt4,30 544 | grahamdumpleton,4 545 | graphics,9 546 | graphing,6 547 | graphql,18 548 | graphs,7 549 | greasemonkey,19 550 | gregwilson,5 551 | guardian,48 552 | guidovanrossum,10 553 | gwt,7 554 | gzip,8 555 | h264,4 556 | hack,3 557 | hackathons,4 558 | hackday,10 559 | hackdaylondon,3 560 | hackernews,15 561 | hacking,12 562 | hacks,9 563 | hadoop,9 564 | hakibenita,3 565 | halfmoonbay,3 566 | haproxy,5 567 | hardware,9 568 | hashbanghell,4 569 | hashes,3 570 | hashing,12 571 | haystack,4 572 | hcard,6 573 | heatmaps,3 574 | henrisivonen,5 575 | heroku,14 576 | highavailability,5 577 | highlights,3 578 | highrise,6 579 | history,31 580 | hixie,11 581 | homebrew,7 582 | homebrewllms,44 583 | hosting,24 584 | hotmail,6 585 | hotstandby,3 586 | html,65 587 | html5,83 588 | http,89 589 | http2,5 590 | httponly,3 591 | https,11 592 | httpx,4 593 | huggingface,5 594 | hynekschlawack,5 595 | i18n,13 596 | ia,6 597 | ianbicking,14 598 | ianhickson,20 599 | ianmansfield,3 600 | ibm,4 601 | ical,6 602 | ideas,8 603 | identity,14 604 | identitytheft,4 605 | idproxy,7 606 | ie,60 607 | ie6,14 608 | ie7,8 609 | ie8,25 610 | ietf,4 611 | iframes,17 612 | imagemagick,3 613 | images,13 614 | inaturalist,4 615 | infographics,7 616 | inheritance,3 617 | innodb,3 618 | inspiring,3 619 | internationalisation,9 620 | internet,25 621 | internetarchive,8 622 | internetexplorer,25 623 | interview,6 624 | interviews,7 625 | introspection,3 626 | investing,4 627 | io,7 628 | ios,18 629 | ip,4 630 | ipad,13 631 | iphone,62 632 | 
iphones,3 633 | iplayer,5 634 | ipod,6 635 | irc,5 636 | ironpython,10 637 | iso,4 638 | it,3 639 | itunes,3 640 | ixr,8 641 | jackclark,4 642 | jacobkaplanmoss,42 643 | jakearchibald,3 644 | jakobnielsen,5 645 | jamesbennett,19 646 | jamesbridle,3 647 | jamestauber,3 648 | janrain,7 649 | japan,3 650 | jargon,4 651 | jasoncalacanis,5 652 | jasonkottke,8 653 | jasonscott,5 654 | java,80 655 | javafx,3 656 | javascript,631 657 | javascriptlibraries,3 658 | jeffatwood,10 659 | jeffcroft,4 660 | jefflindsay,3 661 | jeffreyzeldman,8 662 | jeremiahgrossman,3 663 | jeremyashkenas,4 664 | jeremyhoward,8 665 | jeremykeith,12 666 | jeremyzawodny,4 667 | jetty,4 668 | jinja,4 669 | jit,7 670 | jobs,14 671 | joegregorio,9 672 | joelspolsky,8 673 | joelveitch,3 674 | joewalker,3 675 | johngrahamcumming,4 676 | johngruber,22 677 | johnresig,32 678 | johnsiracusa,3 679 | jonhicks,9 680 | jonudell,10 681 | joshberkus,3 682 | joshcomeau,3 683 | journalism,31 684 | jpstacey,3 685 | jq,6 686 | jquery,99 687 | jqueryui,3 688 | jruby,3 689 | jsk,8 690 | json,130 691 | jsonhead,3 692 | jsonp,23 693 | jsonschema,4 694 | juliaevans,17 695 | jupyter,38 696 | jvm,4 697 | jwt,3 698 | jwz,5 699 | jython,11 700 | kafka,7 701 | kansas,4 702 | kapingyee,5 703 | kellanelliottmccrea,16 704 | kevinyank,6 705 | keynote,3 706 | keyvaluepairs,9 707 | keyvaluestores,4 708 | kml,8 709 | korea,3 710 | kriszyp,3 711 | kubernetes,5 712 | l10n,5 713 | laion,4 714 | lambda,9 715 | language,5 716 | lanyrd,16 717 | largehadroncollider,3 718 | lastfm,9 719 | latex,3 720 | laurievoss,4 721 | law,3 722 | lawrence,6 723 | leahculver,3 724 | leanstartups,6 725 | legal,3 726 | lego,4 727 | leonardlin,5 728 | leopard,14 729 | lesorchard,8 730 | libevent,4 731 | libraries,21 732 | licenses,6 733 | lifehacks,5 734 | lightningtalks,4 735 | lighttpd,5 736 | lilypond,3 737 | lindenlab,4 738 | linguistics,4 739 | linkedin,6 740 | links,3 741 | linustorvalds,4 742 | linux,39 743 | lisp,8 744 | litestream,9 745 | 
livejournal,9 746 | ljworld,7 747 | llama,44 748 | llm,37 749 | llms,379 750 | llvm,5 751 | loadbalancing,11 752 | loading,3 753 | loadtesting,3 754 | local,5 755 | localisation,3 756 | location,14 757 | lockin,3 758 | logging,18 759 | login,3 760 | logincsrf,3 761 | logs,4 762 | london,54 763 | london2,3 764 | longpolling,3 765 | lua,9 766 | lucene,12 767 | lugradio,4 768 | lugradiolive,3 769 | lukaszlanga,3 770 | lukeplant,8 771 | lxml,7 772 | mac,11 773 | macbook,3 774 | macbookpro,3 775 | macfuse,3 776 | machinelearning,62 777 | machinetags,5 778 | maciejceglowski,5 779 | macosx,14 780 | magic,3 781 | magnolia,3 782 | mailinator,3 783 | make,6 784 | malcolmtredinnick,8 785 | management,47 786 | manyeyes,3 787 | mapping,51 788 | mapreduce,10 789 | maps,41 790 | markdown,14 791 | marketing,15 792 | marknottingham,14 793 | markpilgrim,37 794 | markramm,3 795 | markshuttleworth,4 796 | markup,8 797 | marsphoenix,3 798 | martinatkins,7 799 | martinbelam,3 800 | mashup,5 801 | mashups,7 802 | masterslave,6 803 | mastodon,22 804 | mathematics,3 805 | mathml,4 806 | mattbiddulph,12 807 | mattcroydon,3 808 | matthewsomerville,4 809 | mattlevine,3 810 | mattmullenweg,5 811 | mattwebb,11 812 | mattwestcott,4 813 | maxwoolf,12 814 | md5,4 815 | me,6 816 | media,5 817 | mediawiki,4 818 | meetings,6 819 | meetup,3 820 | meetups,8 821 | megpickard,4 822 | memcache,7 823 | memcached,32 824 | memcachedb,3 825 | memes,3 826 | memory,6 827 | memoryleaks,4 828 | mercurial,4 829 | messagequeue,3 830 | messagequeues,16 831 | messaging,8 832 | metaclasses,3 833 | metadata,10 834 | metafilter,10 835 | michaeltrier,4 836 | michalmigurski,13 837 | michalzalewski,3 838 | microformats,30 839 | microservices,7 840 | microsoft,97 841 | middleware,13 842 | midjourney,5 843 | migration,3 844 | migrations,15 845 | migueldeicaza,8 846 | mikebostock,6 847 | mikebutcher,3 848 | mikelmaron,4 849 | mikemalone,4 850 | mikeshaver,3 851 | military,3 852 | minification,4 853 | mistral,7 854 | mit,3 855 
| mlc,9 856 | mobile,38 857 | mobileweb,4 858 | models,3 859 | moderation,9 860 | modpython,5 861 | modwsgi,13 862 | mollywhite,3 863 | money,3 864 | mongodb,10 865 | mongrel,3 866 | monitoring,8 867 | monkeypatching,6 868 | mono,8 869 | moonlight,3 870 | mootools,6 871 | motivation,4 872 | movies,9 873 | mozilla,45 874 | mp3,5 875 | multidb,5 876 | multiprocessing,3 877 | museums,19 878 | music,17 879 | mvc,4 880 | mymaps,3 881 | myopenid,4 882 | mypy,9 883 | mysociety,13 884 | myspace,19 885 | mysql,63 886 | namespaces,4 887 | nasa,4 888 | nataliedowne,35 889 | nathanborror,3 890 | nedbatchelder,16 891 | neilfraser,4 892 | nelsonminar,3 893 | netflix,4 894 | netscape,4 895 | networking,22 896 | newforms,12 897 | newformsadmin,3 898 | news,8 899 | newspapers,17 900 | newyork,3 901 | nginx,39 902 | niallkennedy,7 903 | nicar,7 904 | nlp,9 905 | node,29 906 | nodejs,44 907 | nofollow,3 908 | nomic,3 909 | noscript,4 910 | nose,5 911 | nosql,28 912 | npm,15 913 | nsa,3 914 | numpy,4 915 | nyc,5 916 | nytimes,15 917 | oauth,48 918 | objectivec,7 919 | observability,7 920 | observable,40 921 | ocr,12 922 | offline,10 923 | ogg,3 924 | olpc,6 925 | onload,5 926 | oop,5 927 | ooxml,4 928 | opacity,4 929 | open-source,22 930 | openai,122 931 | opencv,3 932 | opendata,13 933 | openid,213 934 | openid2,7 935 | openlibrary,3 936 | openplatform,11 937 | openrightsgroup,5 938 | opensearch,3 939 | opensocial,8 940 | opensource,185 941 | openstreetmap,44 942 | opentech,3 943 | opentech2008,3 944 | openweb,6 945 | opera,30 946 | operations,5 947 | ops,16 948 | optfunc,3 949 | optimisation,6 950 | oracle,5 951 | orange,3 952 | orbited,3 953 | ordnancesurvey,3 954 | oreilly,4 955 | org,3 956 | orm,40 957 | ormcaching,4 958 | oscon,8 959 | oscon07,4 960 | osx,80 961 | owasp,4 962 | owlsnearyou,3 963 | oxford,14 964 | oxfordgeeknight2,7 965 | oxfordgeeknights,20 966 | oxfordgeeks,7 967 | packaging,20 968 | pagerank,6 969 | pagni,3 970 | pandas,16 971 | panels,5 972 | parallels,6 973 
| paris,3 974 | parquet,5 975 | parrot,3 976 | parsing,11 977 | passwordantipattern,7 978 | passwords,27 979 | paste,3 980 | patents,5 981 | patrickmckenzie,5 982 | patterns,5 983 | paulford,12 984 | paulgraham,9 985 | paulhammond,3 986 | paypal,4 987 | pdb,6 988 | pdf,17 989 | performance,79 990 | perl,22 991 | perlbal,3 992 | permissions,5 993 | perplexity,3 994 | personal,5 995 | personalnews,8 996 | petermichaux,4 997 | petervandijck,4 998 | philgyford,8 999 | phishing,52 1000 | photography,18 1001 | photos,16 1002 | photosynth,3 1003 | php,79 1004 | php5,4 1005 | physics,4 1006 | pil,4 1007 | pingback,22 1008 | pip,9 1009 | pipes,7 1010 | piracy,3 1011 | pitching,4 1012 | pixelart,4 1013 | placemaker,3 1014 | play,3 1015 | playwright,6 1016 | plugin,6 1017 | plugins,69 1018 | plurk,3 1019 | png,7 1020 | pngs,4 1021 | podcasts,31 1022 | politicalhacking,6 1023 | politics,24 1024 | pony,3 1025 | popfly,3 1026 | portablesocialnetworks,12 1027 | portland,3 1028 | post,7 1029 | postelslaw,3 1030 | postgis,6 1031 | postgresql,114 1032 | power,3 1033 | powerpoint,3 1034 | pownce,9 1035 | ppk,6 1036 | pr,6 1037 | presentations,7 1038 | presenting,4 1039 | pricing,3 1040 | privacy,34 1041 | process,6 1042 | processes,3 1043 | productivity,20 1044 | productmanagement,8 1045 | profiler,3 1046 | profiling,12 1047 | programmers,13 1048 | programming,146 1049 | programminglanguages,32 1050 | progressiveenhancement,7 1051 | projectmanagement,6 1052 | projects,350 1053 | promptengineering,60 1054 | promptinjection,48 1055 | protocolbuffers,5 1056 | prototype,16 1057 | proxies,3 1058 | proxy,15 1059 | psf,5 1060 | psychology,5 1061 | pubs,3 1062 | pubsub,3 1063 | pubsubhubbub,6 1064 | puppeteer,5 1065 | pycon,15 1066 | pyconuk,6 1067 | pylons,6 1068 | pyobjc,3 1069 | pyodide,9 1070 | pypi,22 1071 | pypy,9 1072 | pysqlite,3 1073 | pytest,13 1074 | python,908 1075 | python3,21 1076 | pythoncard,4 1077 | pytorch,5 1078 | qemu,3 1079 | queryset,4 1080 | querysetrefactor,4 1081 | 
queue,5 1082 | queues,13 1083 | quora,1004 1084 | rabbitmq,7 1085 | radio,7 1086 | rafecolburn,6 1087 | rails,70 1088 | rands,3 1089 | ratelimiting,7 1090 | rdbms,4 1091 | rdf,4 1092 | react,26 1093 | reading,3 1094 | realtime,9 1095 | realtimeweb,5 1096 | recommendations,3 1097 | recovered,213 1098 | recruiting,8 1099 | redbean,4 1100 | reddit,21 1101 | redhat,3 1102 | redis,50 1103 | redpajama,4 1104 | refactoring,6 1105 | regex,7 1106 | registration,5 1107 | regularexpressions,12 1108 | releasenotes,11 1109 | releases,15 1110 | remote,3 1111 | remysharp,3 1112 | replicate,5 1113 | replication,30 1114 | research,4 1115 | resolved,7 1116 | rest,35 1117 | restaurants,8 1118 | restful,4 1119 | restructuredtext,4 1120 | revcanonical,9 1121 | rewrites,3 1122 | rfc,7 1123 | richardcrowley,5 1124 | richardjones,3 1125 | richskrenta,5 1126 | richtext,3 1127 | rileygoodside,5 1128 | ripgrep,5 1129 | robertocallahan,4 1130 | robinsloan,4 1131 | robots,5 1132 | robotstxt,4 1133 | royalmail,3 1134 | rss,30 1135 | ruby,68 1136 | rubyonrails,3 1137 | russellbeattie,3 1138 | rust,51 1139 | ryandahl,5 1140 | ryantomayko,12 1141 | rye,5 1142 | s3,48 1143 | s3credentials,8 1144 | saas,14 1145 | safari,40 1146 | salvatoresanfilippo,10 1147 | samedomain,3 1148 | samruby,12 1149 | sandboxing,8 1150 | sanfrancisco,27 1151 | sanfranciscobayarea,9 1152 | sanic,5 1153 | satellite,3 1154 | scala,6 1155 | scalability,3 1156 | scaling,128 1157 | science,18 1158 | sciencefiction,3 1159 | scipy,4 1160 | scottkveton,4 1161 | scottschiller,3 1162 | scraping,21 1163 | screencast,4 1164 | screencasts,4 1165 | screenreaders,4 1166 | screenscraping,7 1167 | screenwriting,6 1168 | scribd,4 1169 | search,73 1170 | searchengines,11 1171 | secondlife,10 1172 | security,435 1173 | securitytheatre,3 1174 | selectors,13 1175 | selenium,4 1176 | semantic,3 1177 | semanticweb,8 1178 | sentry,7 1179 | seo,29 1180 | serialization,3 1181 | serverless,5 1182 | servers,6 1183 | serviceworkers,3 1184 | servo,3 
1185 | sessions,6 1186 | settings,4 1187 | setuptools,5 1188 | sha1,4 1189 | shapefiles,5 1190 | sharding,11 1191 | sharecropping,8 1192 | shell,6 1193 | shotscraper,21 1194 | sidechannel,3 1195 | sidekiq,4 1196 | signedcookies,5 1197 | signing,6 1198 | siliconvalley,6 1199 | silverlight,15 1200 | simonwardley,3 1201 | simpledb,7 1202 | sinatra,3 1203 | sitepen,5 1204 | sitepoint,11 1205 | sitespecificbrowsers,7 1206 | sixapart,6 1207 | sizzle,4 1208 | skillswap,3 1209 | skype,3 1210 | slack,6 1211 | slidecast,3 1212 | slides,12 1213 | slideshare,10 1214 | smalldata,3 1215 | sms,4 1216 | snowleopard,3 1217 | soap,9 1218 | social,4 1219 | socialgraph,11 1220 | socialmedia,24 1221 | socialnetworks,15 1222 | socialsoftware,6 1223 | socialwhitelisting,4 1224 | software,10 1225 | softwarearchitecture,6 1226 | softwareengineering,48 1227 | solr,22 1228 | sourceforge,3 1229 | south,4 1230 | soviet,3 1231 | space,6 1232 | spam,21 1233 | spatialite,11 1234 | speaking,102 1235 | specification,3 1236 | sphinxdocs,8 1237 | sphinxsearch,7 1238 | spidermonkey,5 1239 | spiderverse,3 1240 | spongmonkeys,4 1241 | sports,4 1242 | spreadsheets,3 1243 | sql,81 1244 | sqlalchemy,5 1245 | sqlinjection,4 1246 | sqlite,231 1247 | sqliteutils,81 1248 | sqlserver,3 1249 | squid,6 1250 | squirrels,8 1251 | ssh,8 1252 | ssl,9 1253 | sso,3 1254 | stablediffusion,22 1255 | stackoverflow,11 1256 | standards,30 1257 | stanford,8 1258 | starling,3 1259 | startup,5 1260 | startups,184 1261 | starwars,4 1262 | staticanalysis,3 1263 | staticgenerator,3 1264 | staticmaps,3 1265 | statictyping,7 1266 | stdlib,3 1267 | steampunk,3 1268 | stephenfry,3 1269 | stevejobs,10 1270 | stevesouders,9 1271 | steveyegge,4 1272 | storage,5 1273 | streaming,3 1274 | streetview,3 1275 | strings,4 1276 | stripe,6 1277 | stuartcolville,4 1278 | stuartlangridge,21 1279 | stupid,3 1280 | subversion,32 1281 | sun,14 1282 | sunmicrosystems,4 1283 | support,3 1284 | svg,30 1285 | swf,4 1286 | swyx,5 1287 | sxsw,20 1288 | 
symbex,4 1289 | syndication,8 1290 | syntaxhighlighting,4 1291 | sysadmin,26 1292 | tagging,4 1293 | tags,3 1294 | tailscale,4 1295 | talks,37 1296 | tamarin,6 1297 | teaching,13 1298 | teamfortress2,3 1299 | techcrunch,7 1300 | technicaldebt,3 1301 | technology,14 1302 | technorati,8 1303 | techstars,4 1304 | ted,7 1305 | tedleung,3 1306 | templates,5 1307 | templating,5 1308 | tensorflow,6 1309 | testing,50 1310 | textmate,7 1311 | tf2,3 1312 | thebigpicture,3 1313 | theguardian,4 1314 | theoschlossnagle,3 1315 | theregister,3 1316 | thomasptacek,8 1317 | threading,5 1318 | threads,6 1319 | thunderbird,3 1320 | ticketing,3 1321 | tickets,3 1322 | til,6 1323 | timbernerslee,5 1324 | timbray,25 1325 | timemachine,4 1326 | timezones,10 1327 | timoreilly,5 1328 | tinyurl,6 1329 | tls,5 1330 | tokyocabinet,7 1331 | tokyotyrant,5 1332 | tomarmitage,6 1333 | tomchristie,4 1334 | tomcoates,8 1335 | tommacwright,10 1336 | tomscott,4 1337 | tomsteinberg,4 1338 | tomtaylor,3 1339 | tomwatson,3 1340 | tonyhirst,5 1341 | tools,11 1342 | torchbox,7 1343 | tornado,10 1344 | trac,3 1345 | trackback,9 1346 | traefik,3 1347 | transactions,5 1348 | transformers,6 1349 | transformersjs,5 1350 | translation,6 1351 | travel,42 1352 | travis,5 1353 | tunisia,3 1354 | turbogears,4 1355 | tutorial,18 1356 | tutorials,12 1357 | tv,15 1358 | twisted,12 1359 | twitter,150 1360 | typescript,6 1361 | typography,16 1362 | ubuntu,22 1363 | ui,25 1364 | uk,15 1365 | ukgovernment,4 1366 | undo,4 1367 | unicode,29 1368 | unittesting,4 1369 | unittests,13 1370 | unix,16 1371 | unladenswallow,3 1372 | unobtrusivejavascript,6 1373 | unobtrusivescripting,3 1374 | upcoming,5 1375 | uploads,4 1376 | upsert,3 1377 | urls,74 1378 | usa,3 1379 | usability,66 1380 | userresearch,3 1381 | uuid,3 1382 | ux,20 1383 | v8,8 1384 | vaccinateca,23 1385 | vaccinatecablog,14 1386 | vaccines,3 1387 | validation,6 1388 | validator,3 1389 | valve,3 1390 | varnish,10 1391 | vectorsearch,3 1392 | verisign,5 1393 | 
versioncontrol,9 1394 | versioning,7 1395 | vicuna,5 1396 | video,29 1397 | views,4 1398 | virtualenv,4 1399 | virtualisation,8 1400 | virtualization,13 1401 | vista,7 1402 | visualisation,15 1403 | visualisations,3 1404 | visualization,22 1405 | vml,4 1406 | vmware,12 1407 | vox,5 1408 | vps,4 1409 | vr,3 1410 | vulnerability,5 1411 | w3c,18 1412 | washingtonpost,7 1413 | wasp,3 1414 | web,12 1415 | web2,5 1416 | web20,5 1417 | web2expo,3 1418 | web3,4 1419 | webapis,7 1420 | webapps,44 1421 | webassembly,53 1422 | webcomponents,15 1423 | webdav,3 1424 | webdevelopers,4 1425 | webdevelopment,167 1426 | webfonts,3 1427 | webhooks,20 1428 | webkit,28 1429 | weblog,3 1430 | webperformance,17 1431 | webrunner,3 1432 | webserver,3 1433 | webservers,11 1434 | webservice,3 1435 | webservices,26 1436 | websockets,15 1437 | webstandards,36 1438 | webstock,4 1439 | webworkers,9 1440 | weeknotes,176 1441 | whatwg,16 1442 | whereonearth,4 1443 | whisper,11 1444 | whitelisting,7 1445 | whoosh,4 1446 | whytheluckystiff,6 1447 | widgets,4 1448 | wifi,13 1449 | wii,6 1450 | wiki,12 1451 | wikileaks,4 1452 | wikinear,6 1453 | wikipedia,37 1454 | wildlife,4 1455 | wildlifenearyou,10 1456 | willlarson,10 1457 | wilsonminer,5 1458 | windows,35 1459 | wired,8 1460 | wordpress,17 1461 | wordpresscom,4 1462 | workers,4 1463 | workflow,4 1464 | worm,5 1465 | writing,18 1466 | wsgi,18 1467 | wsstar,4 1468 | xfn,5 1469 | xhtml,20 1470 | xhtml2,6 1471 | xkcd,12 1472 | xml,55 1473 | xmlhttprequest,7 1474 | xmlrpc,15 1475 | xmpp,5 1476 | xrds,5 1477 | xss,59 1478 | xtech,11 1479 | xtech2007,5 1480 | xuacompatible,10 1481 | xulrunner,3 1482 | yadis,3 1483 | yagni,5 1484 | yahoo,106 1485 | yahoopipes,7 1486 | yaml,8 1487 | ycombinator,52 1488 | ydn,10 1489 | yelp,5 1490 | youtube,32 1491 | yql,12 1492 | yrb,3 1493 | yui,33 1494 | yui3,3 1495 | zacharyvoase,3 1496 | zeitnow,20 1497 | zeppelins,17 1498 | zerodowntime,15 1499 | zeromq,4 1500 | zig,3 1501 | zstd,3 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FuzzTypes 2 | 3 | FuzzTypes is a set of "autocorrecting" annotation types that expands 4 | upon [Pydantic](https://github.com/pydantic/pydantic)'s included [data 5 | conversions.](https://docs.pydantic.dev/latest/concepts/conversion_table/) 6 | Designed for simplicity, it provides powerful normalization capabilities 7 | (e.g. named entity linking) to ensure structured data is composed of 8 | "smart things" not "dumb strings". 9 | 10 | 11 | ## Getting Started 12 | 13 | Pydantic supports basic conversion of data between types. For instance: 14 | 15 | ```python 16 | from pydantic import BaseModel 17 | 18 | class Normal(BaseModel): 19 | boolean: bool 20 | float: float 21 | integer: int 22 | 23 | obj = Normal( 24 | boolean='yes', 25 | float='2', 26 | integer='3', 27 | ) 28 | assert obj.boolean is True 29 | assert obj.float == 2.0 30 | assert obj.integer == 3 31 | ``` 32 | 33 | FuzzTypes expands on the standard data conversions handled by Pydantic and 34 | provides a variety of autocorrecting annotation types. 35 | 36 | ```python 37 | from datetime import datetime 38 | from typing import Annotated 39 | 40 | from pydantic import BaseModel 41 | 42 | from fuzztypes import ( 43 | ASCII, 44 | Datetime, 45 | Email, 46 | Fuzzmoji, 47 | InMemoryValidator, 48 | Integer, 49 | Person, 50 | RegexValidator, 51 | ZipCode, 52 | flags, 53 | ) 54 | 55 | # define a source, see EntitySource for using TSV, CSV, JSONL 56 | inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] 57 | 58 | # define a in memory validator with fuzz search enabled. 59 | Inventor = Annotated[ 60 | str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) 61 | ] 62 | 63 | # custom Regex type for finding twitter handles. 
64 | Handle = Annotated[ 65 | str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) 66 | ] 67 | 68 | # define a Pydantic class with 9 fuzzy type attributes 69 | class Fuzzy(BaseModel): 70 | ascii: ASCII 71 | email: Email 72 | emoji: Fuzzmoji 73 | handle: Handle 74 | integer: Integer 75 | inventor: Inventor 76 | person: Person 77 | time: Datetime 78 | zipcode: ZipCode 79 | 80 | # create an instance of class Fuzzy 81 | obj = Fuzzy( 82 | ascii="άνθρωπος", 83 | email="John Doe <jdoe@example.com>", 84 | emoji='thought bubble', 85 | handle='Ian Maurer (@imaurer)', 86 | integer='fifty-five', 87 | inventor='ada luvlace', 88 | person='mr. arthur herbert fonzarelli (fonzie)', 89 | time='5am on Jan 1, 2025', 90 | zipcode="(Zipcode: 12345-6789)", 91 | ) 92 | 93 | # test the autocorrecting performed 94 | 95 | # greek for man: https://en.wiktionary.org/wiki/άνθρωπος 96 | assert obj.ascii == "anthropos" 97 | 98 | # extract email via regular expression 99 | assert obj.email == "jdoe@example.com" 100 | 101 | # fuzzy match "thought bubble" to "thought balloon" emoji 102 | assert obj.emoji == "💭" 103 | 104 | # simple, inline regex example (see above Handle type) 105 | assert obj.handle == "@imaurer" 106 | 107 | # convert integer word phrase to integer value 108 | assert obj.integer == 55 109 | 110 | # case-insensitive fuzzy match on lowercase, misspelled name 111 | assert obj.inventor == "Ada Lovelace" 112 | 113 | # human name parser (title, first, middle, last, suffix, nickname) 114 | assert str(obj.person) == "Mr. Arthur H. 
Fonzarelli (fonzie)" 115 | assert obj.person.short_name == "Arthur Fonzarelli" 116 | assert obj.person.nickname == "fonzie" 117 | assert obj.person.last == "Fonzarelli" 118 | 119 | # convert time phrase to datetime object 120 | assert obj.time.isoformat() == "2025-01-01T05:00:00" 121 | 122 | # extract zip5 or zip9 formats using regular expressions 123 | assert obj.zipcode == "12345-6789" 124 | 125 | # print JSON on success 126 | assert obj.model_dump() == { 127 | "ascii": "anthropos", 128 | "email": "jdoe@example.com", 129 | "emoji": "💭", 130 | "handle": "@imaurer", 131 | "integer": 55, 132 | "inventor": "Ada Lovelace", 133 | "person": { 134 | "first": "Arthur", 135 | "init_format": "{first} {middle} {last}", 136 | "last": "Fonzarelli", 137 | "middle": "H.", 138 | "name_format": "{title} {first} {middle} {last} {suffix} " 139 | "({nickname})", 140 | "nickname": "fonzie", 141 | "suffix": "", 142 | "title": "Mr.", 143 | }, 144 | "time": datetime(2025, 1, 1, 5), 145 | "zipcode": "12345-6789", 146 | } 147 | ``` 148 | 149 | ## Installation 150 | 151 | Available on [PyPI](https://pypi.org/project/FuzzTypes/): 152 | 153 | ```bash 154 | pip install fuzztypes 155 | ``` 156 | 157 | To install all dependencies (see below), you can copy and paste this: 158 | 159 | ```bash 160 | pip install anyascii dateparser emoji lancedb nameparser number-parser rapidfuzz sentence-transformers tantivy 161 | ``` 162 | 163 | 164 | ## Google Colab Notebook 165 | 166 | There is a read-only notebook that you can copy and edit to try out FuzzTypes: 167 | 168 | [https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing](https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing) 169 | 170 | 171 | ## Base Validators 172 | 173 | Base validators are the building blocks of FuzzTypes that can be used for creating custom "usable types". 
174 | 175 | | Type | Description | 176 | |---------------------|---------------------------------------------------------------------------------------------| 177 | | `DateType` | Base date type, pass in arguments such as `date_order`, `strict` and `relative_base`. | 178 | | `FuzzValidator` | Validator class that calls a provided function and handles core and json schema config. | 179 | | `InMemoryValidator` | Enables matching entities in memory using exact, alias, fuzzy, or semantic search. | 180 | | `OnDiskValidator` | Performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. | 181 | | `RegexValidator` | Regular expression pattern matching base validator. | 182 | | `DatetimeType` | Base datetime type, pass in arguments such as `date_order`, `timezone` and `relative_base`. | 183 | 184 | These base types offer flexibility and extensibility, enabling you to create custom annotation types that suit your 185 | specific data validation and normalization requirements. 186 | 187 | 188 | ## Usable Types 189 | 190 | Usable types are pre-built annotation types in FuzzTypes that can be directly used in Pydantic models. They provide 191 | convenient and ready-to-use functionality for common data types and scenarios. 192 | 193 | | Type | Description | 194 | |----------------|-------------------------------------------------------------------------------------------| 195 | | `ASCII` | Converts Unicode strings to ASCII equivalents using either `anyascii` or `unidecode`. | 196 | | `Date` | Converts date strings to `date` objects using `dateparser`. | 197 | | `Email` | Extracts email addresses from strings using a regular expression. | 198 | | `Emoji` | Matches emojis based on Unicode Consortium aliases using the `emoji` library. | 199 | | `Fuzzmoji` | Matches emojis using fuzzy string matching against aliases. | 200 | | `Integer` | Converts numeric strings or words to integers using `number-parser`. 
| 201 | | `LanguageCode` | Resolves language to ISO language codes (e.g., "en"). | 202 | | `LanguageName` | Resolves language to ISO language names (e.g., "English"). | 203 | | `Language` | Resolves language to ISO language object (name, alpha_2, alpha_3, scope, type, etc.). | 204 | | `Person` | Parses person names into subfields (e.g., first, last, suffix) using `python-nameparser`. | 205 | | `SSN` | Extracts U.S. Social Security Numbers from strings using a regular expression. | 206 | | `Time` | Converts datetime strings to `datetime` objects using `dateparser`. | 207 | | `Vibemoji` | Matches emojis using semantic similarity against aliases. | 208 | | `Zipcode` | Extracts U.S. ZIP codes (5 or 9 digits) from strings using a regular expression. | 209 | 210 | These usable types provide a wide range of commonly needed data validations and transformations, making it 211 | easier to work with various data formats and perform tasks like parsing, extraction, and matching. 212 | 213 | 214 | ## InMemoryValidator and OnDiskValidator Configuration 215 | 216 | The InMemory and OnDisk Validator objects work with lists of Entities. 217 | 218 | The following table describes the available configuration options: 219 | 220 | | Argument | Type | Default | Description | 221 | |-------------------|-----------------------------------------|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 222 | | `case_sensitive` | `bool` | `False` | If `True`, matches are case-sensitive. If `False`, matches are case-insensitive. | 223 | | `device` | `Literal["cpu", "cuda", "mps"]` | `"cpu"` | The device to use for generating semantic embeddings and LanceDB indexing. 
Available options are "cpu", "cuda" (for NVIDIA GPUs), and "mps" (for Apple's Metal Performance Shaders). | 224 | | `encoder` | `Union[Callable, str, Any]` | `None` | The encoder to use for generating semantic embeddings. It can be a callable function, a string specifying the name or path of a pre-trained model, or any other object that implements the encoding functionality. | 225 | | `examples` | `List[Any]` | `None` | A list of example values to be used in schema generation. These examples are included in the generated JSON schema to provide guidance on the expected format of the input values. | 226 | | `fuzz_scorer` | `Literal["token_sort_ratio", ...]` | `"token_sort_ratio"` | The scoring algorithm to use for fuzzy string matching. Available options include "token_sort_ratio", "ratio", "partial_ratio", "token_set_ratio", "partial_token_set_ratio", "token_ratio", "partial_token_ratio", "WRatio", and "QRatio". Each algorithm has its own characteristics and trade-offs between accuracy and performance. | 227 | | `limit` | `int` | `10` | The maximum number of matches to return when performing fuzzy or semantic searches. | 228 | | `min_similarity` | `float` | `80.0` | The minimum similarity score required for a match to be considered valid. Matches with a similarity score below this threshold will be discarded. | 229 | | `notfound_mode` | `Literal["raise", "none", "allow"]` | `"raise"` | The action to take when a matching entity is not found. Available options are "raise" (raises an exception), "none" (returns `None`), and "allow" (returns the input key as the value). | 230 | | `search_flag` | `flags.SearchFlag` | `flags.DefaultSearch` | The search strategy to use for finding matches. It is a combination of flags that determine which fields of the `NamedEntity` are considered for matching and whether fuzzy or semantic search is enabled. Available options are defined in the `flags` module. 
| 231 | | `tiebreaker_mode` | `Literal["raise", "lesser", "greater"]` | `"raise"` | The strategy to use for resolving ties when multiple matches have the same similarity score. Available options are "raise" (raises an exception), "lesser" (returns the match with the lower value), and "greater" (returns the match with the greater value). | 232 | 233 | 234 | ## Lazy Dependencies 235 | 236 | FuzzTypes leverages several powerful libraries to extend its functionality. 237 | 238 | These dependencies are not installed by default with FuzzTypes to keep the 239 | installation lightweight. Instead, they are optional and can be installed 240 | as needed depending on which types you use. 241 | 242 | Below is a list of these dependencies, including their licenses, purpose, and what 243 | specific Types require them. 244 | 245 | Right now, you must pip install the modules directly, in the future you will 246 | be able to install them automatically as part of the main install using pip extras. 247 | 248 | To install all dependencies, you can copy and paste this: 249 | 250 | ```bash 251 | pip install anyascii dateparser emoji lancedb nameparser number-parser rapidfuzz sentence-transformers tantivy 252 | ``` 253 | 254 | 255 | | Fuzz Type | Library | License | Purpose | 256 | |-------------------|--------------------------------------------------------------------------|------------|------------------------------------------------------------| 257 | | ASCII | [anyascii](https://github.com/anyascii/anyascii) | ISC | Converting Unicode into ASCII equivalents (not GPL) | 258 | | ASCII | [unidecode](https://github.com/avian2/unidecode) | GPL | Converting Unicode into ASCII equivalents (better quality) | 259 | | Date | [dateparser](https://github.com/scrapinghub/dateparser) | BSD-3 | Parsing dates from strings | 260 | | Emoji | [emoji](https://github.com/carpedm20/emoji/) | BSD | Handling and manipulating emoji characters | 261 | | Fuzz | 
[rapidfuzz](https://github.com/rapidfuzz/RapidFuzz) | MIT | Performing fuzzy string matching | 262 | | InMemoryValidator | [numpy](https://numpy.org/) | BSD | Numerical computing in Python | 263 | | InMemoryValidator | [scikit-learn](https://scikit-learn.org/) | BSD | Machine learning in Python | 264 | | InMemoryValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | 265 | | Integer | [number-parser](https://github.com/scrapinghub/number-parser) | BSD-3 | Parsing numbers from strings | 266 | | OnDiskValidator | [lancedb](https://github.com/lancedb/lancedb) | Apache-2.0 | High-performance, on-disk vector database | 267 | | OnDiskValidator | [pyarrow](https://github.com/apache/arrow) | Apache-2.0 | In-memory columnar data format and processing library | 268 | | OnDiskValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | 269 | | OnDiskValidator | [tantivy](https://github.com/quickwit-oss/tantivy-py) | MIT | Full-text search (FTS) for LanceDB. | 270 | | Person | [nameparser](https://github.com/derek73/python-nameparser) | LGPL | Parsing person names | 271 | 272 | 273 | ## Maintainer 274 | 275 | FuzzTypes was created by [Ian Maurer](https://x.com/imaurer), the CTO of [GenomOncology](https://genomoncology.com). 276 | 277 | This MIT-based open-source project was extracted from our product which includes the ability to normalize biomedical 278 | data for use in precision oncology clinical decision support systems. Contact me to learn more about our product 279 | offerings. 280 | 281 | 282 | | Type | Description | 283 | |----------------|-------------------------------------------------------------------------------------------| 284 | | `AirportCode` | Represents airport codes (e.g., "ORD"). | 285 | | `Airport` | Represents airport names (e.g., "O'Hare International Airport"). 
| 286 | | `CountryCode` | Represents ISO country codes (e.g., "US"). | 287 | | `Country` | Represents country names (e.g., "United States"). | 288 | | `Currency` | Represents currency codes (e.g., "USD"). | 289 | | `Quantity` | Converts strings to `Quantity` objects with value and unit using `pint`. | 290 | | `URL` | Represents normalized URLs with tracking parameters removed using `url-normalize`. | 291 | | `USStateCode` | Represents U.S. state codes (e.g., "CA"). | 292 | | `USState` | Represents U.S. state names (e.g., "California"). | 293 | 294 | 295 | ## Structured Data Generation via LLM Function Calling and Custom GPT Actions 296 | 297 | Several libraries (e.g. [Instructor](https://github.com/jxnl/instructor), 298 | [Outlines](https://github.com/outlines-dev/outlines), 299 | [Marvin](https://github.com/prefecthq/marvin)) use Pydantic to define models for structured data generation 300 | using Large Language Models (LLMs) via function calling or a grammar/regex 301 | based sampling approach based on the [JSON schema generated by Pydantic](https://docs.pydantic.dev/latest/concepts/json_schema/). 302 | 303 | This approach allows for the enumeration of allowed values using 304 | Python's `Literal`, `Enum` or JSON Schema's `examples` field directly 305 | in your Pydantic class declaration which is used by the LLM to 306 | generate valid values. This approach works exceptionally well for 307 | low-cardinality (not many unique allowed values) such as the world's 308 | continents (7 in total). 309 | 310 | This approach, however, doesn't scale well for high-cardinality (many unique 311 | allowed values) such as the number of known human genomic variants (~325M). 312 | Where exactly the cutoff is between "low" and "high" cardinality is an exercise 313 | left to the reader and their use case. 314 | 315 | That's where FuzzTypes come in. The allowed values are managed by the FuzzTypes 316 | annotations and the values are resolved during the Pydantic validation process. 
317 | This can include fuzzy and semantic searching that throws an exception if the 318 | provided value doesn't meet a minimum similarity threshold defined by the 319 | developer. 320 | 321 | Errors discovered via Pydantic can be caught and resubmitted to the LLM for 322 | correction. The error will contain examples, expected patterns, and closest 323 | matches to help steer the LLM to provide a better informed guess. 324 | 325 | 326 | ## Creating Custom Types 327 | 328 | FuzzTypes provides a set of base types that you can use to create 329 | your own custom annotation types. These base types offer different 330 | capabilities and can be extended to suit your specific data validation 331 | and normalization needs. 332 | 333 | ### EntitySource 334 | 335 | FuzzTypes provides the `EntitySource` class to manage and load 336 | entity data from various sources. It supports JSON Lines (`.jsonl`), 337 | CSV (`.csv`), TSV (`.tsv`), and Text (`.txt`) formats, as well as 338 | loading entities from a callable function. 339 | 340 | Example: 341 | ```python 342 | from pathlib import Path 343 | from fuzztypes import EntitySource, NamedEntity 344 | 345 | # Load entities from a CSV file 346 | fruit_source = EntitySource(Path("path/to/fruits.csv")) 347 | 348 | # Load entities from a callable function 349 | def load_animals(): 350 | return [ 351 | NamedEntity(value="Dog", aliases=["Canine"]), 352 | NamedEntity(value="Cat", aliases=["Feline"]), 353 | ] 354 | 355 | animal_source = EntitySource(load_animals) 356 | ``` 357 | 358 | ### InMemoryValidator Base Type 359 | 360 | The `InMemoryValidator` base type enables matching entities in memory using 361 | exact, alias, fuzzy, or semantic search. It is suitable for small 362 | to medium-sized datasets that can fit in memory and provides fast 363 | matching capabilities. 
364 | 365 | Example: 366 | ```python 367 | from typing import Annotated 368 | from pydantic import BaseModel 369 | from fuzztypes import InMemoryValidator, flags 370 | 371 | # Create a custom annotation type for matching fruits 372 | fruits = ["Apple", "Banana", "Orange"] 373 | Fruit = Annotated[ 374 | str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) 375 | ] 376 | 377 | class MyModel(BaseModel): 378 | fruit: Fruit 379 | 380 | model = MyModel(fruit="appel") 381 | assert model.fruit == "Apple" 382 | ``` 383 | 384 | ### OnDiskValidator Base Type 385 | 386 | The `OnDiskValidator` base type performs matching entities stored on disk 387 | using exact, alias, fuzzy, or semantic search. It leverages the 388 | LanceDB library for efficient storage and retrieval of entities. 389 | `OnDiskValidator` is recommended for large datasets that cannot fit in memory. 390 | 391 | Example: 392 | ```python 393 | from typing import Annotated 394 | from pydantic import BaseModel 395 | from fuzztypes import OnDiskValidator 396 | 397 | # Create a custom annotation type for matching countries stored on disk 398 | countries = [ 399 | ("United States", "US"), 400 | ("United Kingdom", "UK"), 401 | ("Canada", "CA"), 402 | ] 403 | Country = Annotated[str, OnDiskValidator("Country", countries)] 404 | 405 | class MyModel(BaseModel): 406 | country: Country 407 | 408 | assert MyModel(country="Canada").country == "Canada" 409 | assert MyModel(country="US").country == "United States" 410 | ``` 411 | 412 | ### DateValidator and DatetimeValidator 413 | 414 | The `DateValidator` and `DatetimeValidator` base types provide fuzzy parsing 415 | capabilities for date and datetime objects, respectively. They allow 416 | you to define flexible date and time formats and perform parsing 417 | based on specified settings such as date order, timezone, and 418 | relative base. 
419 | 420 | Example: 421 | 422 | ```python 423 | from datetime import date, datetime 424 | from pydantic import BaseModel 425 | from typing import Annotated 426 | from fuzztypes import DateValidator, DatetimeValidator 427 | 428 | MyDate = Annotated[date, DateValidator(date_order="MDY")] 429 | MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] 430 | 431 | class MyModel(BaseModel): 432 | date: MyDate 433 | time: MyTime 434 | 435 | model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") 436 | assert model.date.isoformat() == "2023-01-01" 437 | assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" 438 | ``` 439 | 440 | 441 | ### FuzzValidator 442 | 443 | The `FuzzValidator` is the base of the fuzztypes typing system. 444 | It can be used directly to wrap any python function. 445 | 446 | Example: 447 | ```python 448 | from typing import Annotated 449 | from pydantic import BaseModel 450 | from fuzztypes import FuzzValidator 451 | 452 | # Create a custom annotation type that converts a value to uppercase 453 | UpperCase = Annotated[str, FuzzValidator(str.upper)] 454 | 455 | class MyModel(BaseModel): 456 | name: UpperCase 457 | 458 | model = MyModel(name="john") 459 | assert model.name == "JOHN" 460 | ``` 461 | 462 | 463 | ### Regex 464 | 465 | The `Regex` base type allows matching values using a regular 466 | expression pattern. It is useful for creating annotation types that 467 | validate and extract specific patterns from input values. 
468 | 469 | Example: 470 | ```python 471 | from typing import Annotated 472 | from pydantic import BaseModel 473 | from fuzztypes import RegexValidator 474 | 475 | # Create a custom annotation type for matching IP addresses 476 | IPAddress = Annotated[ 477 | str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") 478 | ] 479 | 480 | class MyModel(BaseModel): 481 | ip_address: IPAddress 482 | 483 | model = MyModel(ip_address="My internet IP address is 192.168.127.12") 484 | assert model.ip_address == "192.168.127.12" 485 | ``` 486 | 487 | ### Languages 488 | 489 | Languages are loaded from the [Debian iso-codes](https://salsa.debian.org/iso-codes-team/iso-codes/) project. 490 | 491 | Languages are resolved using their preferred, common, inverted, bibliographic name, or 2 or 3 letter alpha code. 492 | 493 | Languages can be included as a string name (LanguageName), string code (LanguageCode) or full language object. 494 | 495 | The preferred code is the 2 letter version and will be used if available. Otherwise, the 3 letter alpha code is used. 
496 | 497 | Example: 498 | 499 | ```python 500 | from pydantic import BaseModel 501 | from fuzztypes import ( 502 | Language, 503 | LanguageName, 504 | LanguageCode, 505 | LanguageScope, 506 | LanguageType, 507 | LanguageNamedEntity, 508 | validate_python, 509 | ) 510 | class Model(BaseModel): 511 | language_code: LanguageCode 512 | language_name: LanguageName 513 | language: Language 514 | 515 | # Test that Language resolves to the complete language object 516 | data = dict(language_code="en", language="English", language_name="ENG") 517 | obj = validate_python(Model, data) 518 | assert obj.language_code == "en" 519 | assert obj.language_name == "English" 520 | assert obj.language.scope == LanguageScope.INDIVIDUAL 521 | assert obj.language.type == LanguageType.LIVING 522 | assert isinstance(obj.language, LanguageNamedEntity) 523 | assert obj.model_dump(exclude_defaults=True, mode="json") == { 524 | "language": { 525 | "aliases": ["en", "eng"], 526 | "alpha_2": "en", 527 | "alpha_3": "eng", 528 | "scope": "I", 529 | "type": "L", 530 | "value": "English", 531 | }, 532 | "language_code": "en", 533 | "language_name": "English", 534 | } 535 | ``` 536 | 537 | ### Validate Python and JSON functions 538 | 539 | Functional approach to validating python and json are available. 
540 | Below are examples for the `validate_python` and `validate_json` functions: 541 | 542 | ```python 543 | from pydantic import BaseModel 544 | from fuzztypes import validate_python, validate_json, Integer, Date 545 | 546 | # validate python 547 | assert validate_python(Integer, "two hundred") == 200 548 | 549 | # validate json 550 | class MyModel(BaseModel): 551 | date: Date 552 | 553 | json = '{"date": "July 4th 2021"}' 554 | obj = validate_json(MyModel, json) 555 | assert obj.date.isoformat() == "2021-07-04" 556 | ``` 557 | 558 | ### Resolve Entities from FuzzValidator or Annotation 559 | 560 | Entities can be resolved from the `FuzzValidator` validators such as InMemoryValidator 561 | or OnDiskValidator or the defined `Annotation` type using the `resolve_entity` function: 562 | 563 | ```python 564 | from typing import Annotated 565 | from fuzztypes import resolve_entity, InMemoryValidator 566 | 567 | elements = ["earth", "fire", "water", "air"] 568 | ElementValidator = InMemoryValidator(elements) 569 | Element = Annotated[str, ElementValidator] 570 | 571 | assert resolve_entity(ElementValidator, "EARTH").model_dump() == { 572 | "aliases": [], 573 | "label": None, 574 | "meta": None, 575 | "priority": None, 576 | "value": "earth", 577 | } 578 | 579 | assert resolve_entity(Element, "Air").model_dump( 580 | exclude_defaults=True 581 | ) == {"value": "air"} 582 | ``` --------------------------------------------------------------------------------