├── src └── fuzztypes │ ├── py.typed │ ├── utils │ ├── __init__.py │ └── download.py │ ├── integer.py │ ├── ascii.py │ ├── flags.py │ ├── regex.py │ ├── emojis.py │ ├── date.py │ ├── __init__.py │ ├── const.py │ ├── match.py │ ├── language.py │ ├── person.py │ ├── validation.py │ ├── storage.py │ ├── in_memory.py │ ├── entity.py │ ├── lazy.py │ └── on_disk.py ├── activate.sh ├── tests ├── data │ ├── emojis.csv │ ├── myths.tsv │ ├── emotions.txt │ ├── mixed.jsonl │ └── simonw_tags.csv ├── test_emoji.py ├── test_integer.py ├── test_ascii.py ├── on_disk │ ├── test_on_disk_semantic.py │ ├── test_on_disk_fuzz.py │ ├── test_on_disk_name.py │ └── test_on_disk_alias.py ├── in_memory │ ├── test_in_memory_similarity.py │ ├── test_in_memory_name.py │ ├── test_in_memory_alias.py │ ├── test_in_memory_tags_example.py │ └── test_in_memory_fuzz.py ├── conftest.py ├── test_language.py ├── test_date.py ├── test_entity.py ├── test_regex.py ├── test_person.py ├── utils │ └── test_download.py └── test_readme.py ├── requirements.txt ├── .gitignore ├── LICENSE ├── CHANGELOG.md ├── Makefile ├── pyproject.toml ├── requirements-dev.txt └── README.md /src/fuzztypes/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /activate.sh: -------------------------------------------------------------------------------- 1 | # activate python 2 | source .venv/bin/activate 3 | -------------------------------------------------------------------------------- /tests/data/emojis.csv: -------------------------------------------------------------------------------- 1 | value,aliases,priority 2 | happy,😀,1 3 | sad,😢,1 4 | celebrate,🎉|🎊|🎈,1 5 | party,🎉|🎊|🎈,100 -------------------------------------------------------------------------------- /tests/data/myths.tsv: -------------------------------------------------------------------------------- 1 | value aliases 2 | Odysseus Ulysses 3 | Athena 
Minerva|Pallas 4 | Zeus Jupiter|Jove 5 | Hercules Heracles 6 | Mercury Hermes 7 | -------------------------------------------------------------------------------- /src/fuzztypes/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .download import download_file, get_file 2 | 3 | __all__ = ( 4 | "download_file", 5 | "get_file", 6 | ) 7 | -------------------------------------------------------------------------------- /tests/data/emotions.txt: -------------------------------------------------------------------------------- 1 | Happiness 2 | Sadness 3 | Anger 4 | Fear 5 | Surprise 6 | Disgust 7 | Trust 8 | Anticipation 9 | Love 10 | Joy 11 | Courage 12 | Serenity -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile pyproject.toml -o requirements.txt 3 | annotated-types==0.6.0 4 | # via pydantic 5 | pydantic==2.6.2 6 | pydantic-core==2.16.3 7 | # via pydantic 8 | typing-extensions==4.9.0 9 | # via 10 | # pydantic 11 | # pydantic-core 12 | -------------------------------------------------------------------------------- /tests/data/mixed.jsonl: -------------------------------------------------------------------------------- 1 | {"value": "Dog", "aliases": ["Canine", "Hound"], "label": "animal"} 2 | {"value": "Cat", "aliases": ["Feline", "Kitty"], "label": "animal"} 3 | {"value": "Apple", "aliases": ["Pome"], "label": "fruit"} 4 | {"value": "Banana", "aliases": ["Musa"], "label": "fruit"} 5 | {"value": "Eagle", "aliases": ["Bird of prey"], "label": "animal"} 6 | {"value": "Strawberry", "aliases": ["Fragaria"], "label": "fruit"} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 
.cache 2 | *.egg-info 3 | *.py[oc] 4 | *~ 5 | .*.sw? 6 | .coverage 7 | .idea 8 | .ipynb_checkpoints 9 | .mypy_cache 10 | .netlify 11 | .pytest_cache 12 | .venv 13 | .vscode 14 | Pipfile.lock 15 | __pycache__/ 16 | archive.zip 17 | build/ 18 | coverage.xml 19 | dist/ 20 | docs.zip 21 | docs_build 22 | env 23 | env3.* 24 | htmlcov 25 | log.txt 26 | site 27 | test.db 28 | venv 29 | wheels/ 30 | model_cache/ 31 | .DS_Store 32 | /training/ 33 | profile.dat 34 | notebooks -------------------------------------------------------------------------------- /src/fuzztypes/integer.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Callable, Union 2 | 3 | from fuzztypes import FuzzValidator, lazy 4 | 5 | _tx = None 6 | 7 | 8 | def get_tx() -> Callable: 9 | global _tx 10 | 11 | if _tx is None: 12 | _tx = lazy.lazy_import("number_parser", "parse_ordinal") 13 | 14 | return _tx 15 | 16 | 17 | def to_int(key: Union[int, str]) -> int: 18 | if isinstance(key, int): 19 | val = key 20 | else: 21 | f = _tx or get_tx() 22 | val = f(key) 23 | return val 24 | 25 | 26 | Integer = Annotated[int, FuzzValidator(to_int)] 27 | -------------------------------------------------------------------------------- /tests/test_emoji.py: -------------------------------------------------------------------------------- 1 | from fuzztypes import Emoji, emojis, validate_python 2 | 3 | 4 | def test_key_access(): 5 | assert validate_python(Emoji, "balloon") == "🎈" 6 | assert validate_python(Emoji, ":atm_sign:") == "🏧" 7 | assert validate_python(Emoji, "atm sign") == "🏧" 8 | assert validate_python(Emoji, "atm") == "🏧" 9 | assert validate_python(Emoji, "United States") == "🇺🇸" 10 | 11 | 12 | def test_load_emojis(): 13 | entities = emojis.load_emoji_entities() 14 | assert len(entities) > 2000 15 | assert entities[0].value == "🥇" 16 | assert set(entities[0].aliases) == {"1st place medal", ":1st_place_medal:"} 17 | 
-------------------------------------------------------------------------------- /src/fuzztypes/ascii.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Any, Callable 2 | 3 | from fuzztypes import FuzzValidator, lazy 4 | 5 | _tx = None 6 | 7 | 8 | def get_tx() -> Callable: # pragma: no cover 9 | global _tx 10 | 11 | if _tx is None: 12 | _tx = lazy.lazy_import( 13 | "unidecode", 14 | "unidecode", 15 | return_none_on_error=True, 16 | ) 17 | _tx = _tx or lazy.lazy_import( 18 | "anyascii", 19 | "anyascii", 20 | return_none_on_error=True, 21 | ) 22 | 23 | if _tx is None: 24 | msg = "Failed: `pip install ascii` or `pip install unidecode`" 25 | raise RuntimeError(msg) 26 | 27 | return _tx 28 | 29 | 30 | def to_ascii(key: Any) -> str: 31 | f = _tx or get_tx() 32 | return f(str(key)) 33 | 34 | 35 | ASCII = Annotated[str, FuzzValidator(to_ascii)] 36 | -------------------------------------------------------------------------------- /tests/test_integer.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ValidationError 2 | 3 | from fuzztypes import Integer, validate_python 4 | 5 | 6 | def test_convert_number_to_int(): 7 | assert validate_python(Integer, 3) == 3 8 | assert validate_python(Integer, "three") == 3 9 | assert validate_python(Integer, "third") == 3 10 | assert ( 11 | validate_python(Integer, "nineteen billion and nineteen") 12 | == 19_000_000_019 13 | ) 14 | assert ( 15 | validate_python(Integer, "two million three thousand and nineteen") 16 | == 2_003_019 17 | ) 18 | 19 | 20 | def test_validation_error(): 21 | class MyModel(BaseModel): 22 | num: Integer 23 | 24 | assert MyModel(num="three").num == 3 # type: ignore[arg-type] 25 | 26 | try: 27 | assert MyModel(num="xyz") # type: ignore[arg-type] 28 | assert False, "Didn't fail to parse non-integer." 
29 | except ValidationError: 30 | pass 31 | -------------------------------------------------------------------------------- /tests/test_ascii.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pydantic import BaseModel, TypeAdapter 4 | 5 | from fuzztypes import ASCII 6 | 7 | 8 | def test_ascii_usable_type(): 9 | ta = TypeAdapter(ASCII) 10 | assert ta.validate_python("άνθρωποι") == "anthropoi" 11 | 12 | 13 | def test_transliterate_utf8_to_ascii(): 14 | class MyModel(BaseModel): 15 | ascii: ASCII 16 | 17 | obj = MyModel(ascii="άνθρωποι") 18 | assert obj.ascii == "anthropoi" 19 | 20 | assert MyModel(ascii="kožušček").ascii == "kozuscek" 21 | assert ( 22 | MyModel(ascii="30 \U0001d5c4\U0001d5c6/\U0001d5c1").ascii == "30 km/h" 23 | ) 24 | 25 | # Note: unidecode and anyascii have differences in some situations 26 | allowed = ("kakoi-to tekst", "kakoy-to tekst") # unidecode, anyascii 27 | assert MyModel(ascii="какой-то текст").ascii in allowed 28 | 29 | allowed = ("Bei Jing ", "BeiJing") # unidecode, anyascii 30 | assert MyModel(ascii="\u5317\u4EB0").ascii in allowed 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2024 to Ian Maurer and GenomOncology, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | “Software”), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject 11 | to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or 
substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR 20 | ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 21 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## v0.1.1 (2023-03-25) 2 | 3 | #### Changed 4 | - Fixes to the README regarding validation utility functions. 5 | - Renamed ill-named function to `resolve_entity` and added explicit test. 6 | 7 | 8 | ## v0.1.0 (2023-03-25) 9 | 10 | The project's typing system was validated using mypy and refactored to follow 11 | Annotated types as specified by [PEP 593](https://peps.python.org/pep-0593/). 
12 | 13 | #### Added 14 | - FuzzValidator annotation type created to simplify design 15 | - validate_python and validate_json functions added 16 | - Added Language, LanguageName, and LanguageCode usable types 17 | - fuzztypes.logger and fuzztypes.utils module for downloading iso codes 18 | 19 | #### Changed 20 | - Renamed OnDisk to OnDiskValidator 21 | - Renamed InMemory to InMemoryValidator 22 | - Refactored InMemoryValidator and OnDiskValidator to use FuzzValidator 23 | - Refactored Person to use FuzzValidator 24 | - Renamed Regex to RegexValidator 25 | - Changed error message to more common "did you mean" message format 26 | 27 | #### Removed 28 | - abstract.py module and AbstractType class, simplified by FuzzValidator 29 | - function.py module and Function annotation type, replaced by FuzzValidator -------------------------------------------------------------------------------- /src/fuzztypes/flags.py: -------------------------------------------------------------------------------- 1 | from enum import Flag, auto 2 | 3 | 4 | # What NamedEntity fields does the search key need to match on? 5 | # Does search support fuzzy matching and semantic similarity? 
6 | class SearchFlag(Flag): 7 | NAME_OK = auto() 8 | ALIAS_OK = auto() 9 | FUZZ_OK = auto() 10 | SEMANTIC_OK = auto() 11 | 12 | @property 13 | def is_name_ok(self) -> bool: 14 | return bool(self & SearchFlag.NAME_OK) 15 | 16 | @property 17 | def is_alias_ok(self) -> bool: 18 | return bool(self & SearchFlag.ALIAS_OK) 19 | 20 | @property 21 | def is_fuzz_ok(self) -> bool: 22 | return bool(self & SearchFlag.FUZZ_OK) 23 | 24 | @property 25 | def is_semantic_ok(self) -> bool: 26 | return bool(self & SearchFlag.SEMANTIC_OK) 27 | 28 | @property 29 | def is_fuzz_or_semantic_ok(self): 30 | return self.is_fuzz_ok or self.is_semantic_ok 31 | 32 | @property 33 | def is_hybrid(self): 34 | return self.is_fuzz_ok and self.is_semantic_ok 35 | 36 | 37 | NameSearch = SearchFlag.NAME_OK 38 | AliasSearch = NameSearch | SearchFlag.ALIAS_OK 39 | FuzzSearch = AliasSearch | SearchFlag.FUZZ_OK 40 | SemanticSearch = AliasSearch | SearchFlag.SEMANTIC_OK 41 | HybridSearch = FuzzSearch | SemanticSearch 42 | DefaultSearch = AliasSearch 43 | -------------------------------------------------------------------------------- /src/fuzztypes/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Annotated, Optional 3 | 4 | from . 
import FuzzValidator 5 | 6 | 7 | def RegexValidator( 8 | pattern: str, 9 | examples: Optional[list] = None, 10 | ): 11 | regex = re.compile(pattern) 12 | 13 | def do_regex(key: str) -> str: 14 | matches = regex.findall(key) 15 | if len(matches) == 1: 16 | return matches[0] 17 | elif len(matches) > 1: 18 | raise ValueError( 19 | f"Multiple matches found for pattern '{pattern}' in '{key}'" 20 | ) 21 | else: 22 | raise ValueError( 23 | f"No matches found for pattern '{pattern}' in '{key}'" 24 | ) 25 | 26 | return FuzzValidator(do_regex, examples=examples) 27 | 28 | 29 | Email = Annotated[ 30 | str, 31 | RegexValidator( 32 | r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", 33 | examples=["user@example.com"], 34 | ), 35 | ] 36 | 37 | SSN = Annotated[ 38 | str, 39 | RegexValidator( 40 | r"\b\d{3}-\d{2}-\d{4}\b", 41 | examples=["000-00-0000"], 42 | ), 43 | ] 44 | 45 | ZipCode = Annotated[ 46 | str, 47 | RegexValidator( 48 | r"\b\d{5}(?:-\d{4})?\b", 49 | examples=["12345", "12345-6789"], 50 | ), 51 | ] 52 | -------------------------------------------------------------------------------- /tests/on_disk/test_on_disk_semantic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pydantic import BaseModel 3 | 4 | from fuzztypes import flags, on_disk, Vibemoji, validate_python 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def EmotionStoredValidatorStorage(EmotionSource): 9 | storage = on_disk.StoredValidatorStorage( 10 | "Emotions", EmotionSource, search_flag=flags.SemanticSearch 11 | ) 12 | storage.prepare(force_drop_table=True) 13 | return storage 14 | 15 | 16 | def test_check_storage_directly(EmotionStoredValidatorStorage): 17 | matches = EmotionStoredValidatorStorage.get("happiness") 18 | assert len(matches) == 1 19 | assert matches[0].entity.value == "Happiness" 20 | assert matches[0].score == 100.0 21 | 22 | matches = EmotionStoredValidatorStorage.get("scared") 23 | assert len(matches) == 10 24 | assert 
matches[0].entity.value == "Fear" 25 | assert matches[0].score == pytest.approx(91.23) 26 | 27 | 28 | class MyModel(BaseModel): 29 | emoji: Vibemoji 30 | 31 | 32 | def test_vibemoji_get_value(): 33 | assert validate_python(Vibemoji, "bacon tastes good") == "🥓" 34 | assert validate_python(Vibemoji, "take the bus to school") == "🚌" 35 | assert validate_python(Vibemoji, "jolly santa") == "🎅" 36 | assert validate_python(Vibemoji, "United States") == "🇺🇸" 37 | -------------------------------------------------------------------------------- /tests/on_disk/test_on_disk_fuzz.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tantivy # type: ignore 4 | 5 | from fuzztypes import Fuzzmoji, const, validate_python 6 | 7 | 8 | def test_tantivy(): 9 | # make sure the index is built 10 | assert validate_python(Fuzzmoji, "balloon") == "🎈" 11 | 12 | # standard schema 13 | schema_builder = tantivy.SchemaBuilder() 14 | schema_builder.add_integer_field("doc_id", stored=True) 15 | schema_builder.add_text_field("term", stored=True) 16 | schema = schema_builder.build() 17 | 18 | # create the index 19 | path = os.path.join( 20 | const.StoredValidatorPath, "Fuzzmoji.lance/_indices/tantivy" 21 | ) 22 | index = tantivy.Index(schema, path=path) 23 | searcher = index.searcher() 24 | 25 | # todo: fuzzy field not in current version 26 | # https://github.com/quickwit-oss/tantivy-py/issues/20 27 | # https://docs.rs/tantivy/latest/tantivy/query/struct.FuzzyTermQuery.html 28 | # index.parse_query("thought", fuzzy_fields={"term": (True, 1, False)}) 29 | 30 | # query the index 31 | query = index.parse_query("thought bubble") 32 | result = searcher.search(query, 5) 33 | 34 | # check the results 35 | terms = [] 36 | for score, address in result.hits: 37 | doc = searcher.doc(address) 38 | terms.extend(doc["term"]) 39 | 40 | assert "thought balloon" in terms 41 | assert ":bubble_tea:" in terms 42 | 43 | 44 | def test_fuzzmoji(): 45 | assert 
validate_python(Fuzzmoji, "thought bubble") == "💭" 46 | assert validate_python(Fuzzmoji, "bubble team") == "🧋" 47 | -------------------------------------------------------------------------------- /src/fuzztypes/emojis.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Annotated, List 3 | from pydantic import TypeAdapter 4 | 5 | from fuzztypes import NamedEntity, EntitySource, OnDiskValidator, flags, lazy 6 | 7 | 8 | def load_emoji_entities() -> List[NamedEntity]: 9 | get_aliases_unicode_dict = lazy.lazy_import( 10 | "emoji.unicode_codes", "get_aliases_unicode_dict" 11 | ) 12 | 13 | mapping = defaultdict(list) 14 | emoji_mapping = get_aliases_unicode_dict() 15 | for value, emoji in emoji_mapping.items(): 16 | mapping[emoji].extend([value, value.strip(":").replace("_", " ")]) 17 | 18 | data = ({"value": k, "aliases": v} for k, v in mapping.items()) 19 | return TypeAdapter(List[NamedEntity]).validate_python(data) 20 | 21 | 22 | EmojiSource = EntitySource(load_emoji_entities) 23 | 24 | Emoji = Annotated[ 25 | str, 26 | OnDiskValidator( 27 | "Emoji", 28 | EmojiSource, 29 | search_flag=flags.AliasSearch, 30 | tiebreaker_mode="lesser", 31 | ), 32 | ] 33 | 34 | Fuzzmoji = Annotated[ 35 | str, 36 | OnDiskValidator( 37 | "Fuzzmoji", 38 | EmojiSource, 39 | search_flag=flags.FuzzSearch, 40 | tiebreaker_mode="lesser", 41 | min_similarity=10.0, 42 | device="cpu", 43 | ), 44 | ] 45 | 46 | Vibemoji = Annotated[ 47 | str, 48 | OnDiskValidator( 49 | "Vibemoji", 50 | EmojiSource, 51 | search_flag=flags.SemanticSearch, 52 | tiebreaker_mode="lesser", 53 | min_similarity=10.0, 54 | device="cpu", 55 | ), 56 | ] 57 | -------------------------------------------------------------------------------- /tests/in_memory/test_in_memory_similarity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from fuzztypes import flags 4 | from 
fuzztypes.in_memory import InMemoryValidatorStorage 5 | from fuzztypes.lazy import create_reranker 6 | 7 | 8 | @pytest.fixture(scope="session") 9 | def EmotionMemoryStorage(EmotionSource): 10 | storage = InMemoryValidatorStorage( 11 | EmotionSource, search_flag=flags.SemanticSearch 12 | ) 13 | storage.prepare() 14 | return storage 15 | 16 | 17 | def test_check_storage_directly(EmotionMemoryStorage): 18 | matches = EmotionMemoryStorage.get("happiness") 19 | assert len(matches) == 1 20 | assert matches[0].entity.value == "Happiness" 21 | assert matches[0].score == 100.0 22 | 23 | matches = EmotionMemoryStorage.get("scared") 24 | assert len(matches) == 10 25 | assert matches[0].entity.value == "Fear" 26 | assert matches[0].score == pytest.approx(91.23) 27 | 28 | 29 | def test_reranker_directly_1(EmotionMemoryStorage): 30 | ranker = create_reranker("mixedbread-ai/mxbai-rerank-xsmall-v1") 31 | documents = EmotionMemoryStorage._terms 32 | 33 | results = ranker("afraid", documents, 3) 34 | assert len(results) == 3 35 | assert results[0]["text"] == "fear" 36 | assert results[0]["score"] >= 0.3 37 | 38 | 39 | def test_reranker_directly_2(EmotionMemoryStorage): 40 | ranker = create_reranker("mixedbread-ai/mxbai-rerank-xsmall-v1") 41 | documents = EmotionMemoryStorage._terms 42 | 43 | results = ranker("joyous", sorted(documents), 3) 44 | assert len(results) == 3 45 | assert results[0]["text"] in ("happiness", "joy") 46 | assert results[0]["score"] >= 0.3 47 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from pytest import fixture 4 | 5 | from fuzztypes import EntitySource, NamedEntity 6 | 7 | 8 | @fixture(scope="session") 9 | def data_path() -> Path: 10 | return Path(__file__).parent / "data" 11 | 12 | 13 | @fixture(scope="session") 14 | def EmojiSource(data_path): 15 | source = EntitySource(data_path / 
"emojis.csv") 16 | assert len(source) == 4 17 | return source 18 | 19 | 20 | @fixture(scope="session") 21 | def FruitSource(data_path): 22 | # loading separately from AnimalSource to test lazy loading 23 | MixedSource = EntitySource(data_path / "mixed.jsonl") 24 | assert MixedSource.loaded is False 25 | 26 | FruitSource = MixedSource["fruit"] 27 | assert isinstance(FruitSource, EntitySource) 28 | assert FruitSource.loaded is False 29 | 30 | # first access loads FruitSource -> MixedSource 31 | assert isinstance(FruitSource[0], NamedEntity) 32 | assert FruitSource[0].value == "Apple" 33 | assert FruitSource.loaded is True 34 | assert MixedSource.loaded is True 35 | assert len(FruitSource) == 3 36 | 37 | return FruitSource 38 | 39 | 40 | @fixture(scope="session") 41 | def AnimalSource(data_path): 42 | MixedSource = EntitySource(data_path / "mixed.jsonl") 43 | return MixedSource["animal"] 44 | 45 | 46 | @fixture(scope="session") 47 | def MythSource(data_path): 48 | source = EntitySource(data_path / "myths.tsv") 49 | assert len(source) == 5 50 | return source 51 | 52 | 53 | @fixture(scope="session") 54 | def EmotionSource(data_path): 55 | source = EntitySource(data_path / "emotions.txt") 56 | assert len(source) == 12 57 | return source 58 | -------------------------------------------------------------------------------- /src/fuzztypes/date.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Annotated, Optional, Union 3 | 4 | from . 
import FuzzValidator, const, lazy 5 | 6 | DateOrDatetime = Union[datetime.date, datetime.datetime] 7 | 8 | 9 | def DateValidator( 10 | date_order: Optional[const.DateOrder] = None, 11 | is_date: bool = True, 12 | languages: Optional[list[str]] = None, 13 | timezone: Optional[str] = None, 14 | strict: bool = False, 15 | prefer_future_dates: bool = False, 16 | relative_base: Optional[DateOrDatetime] = None, 17 | ): 18 | DateDataParser = lazy.lazy_import("dateparser.date", "DateDataParser") 19 | languages = languages or ["en"] 20 | 21 | settings = { 22 | "STRICT_PARSING": strict, 23 | "PREFER_DATES_FROM": "future" if prefer_future_dates else "past", 24 | "RETURN_AS_TIMEZONE_AWARE": bool(timezone), 25 | } 26 | if date_order: 27 | settings["DATE_ORDER"] = date_order 28 | if timezone: 29 | settings["TIMEZONE"] = timezone 30 | if relative_base: 31 | settings["RELATIVE_BASE"] = relative_base 32 | 33 | parser = DateDataParser(languages=languages, settings=settings) 34 | 35 | def parse(key: str) -> DateOrDatetime: 36 | value = parser.get_date_data(key).date_obj 37 | value = value.date() if (value and is_date) else value 38 | return value 39 | 40 | return FuzzValidator(parse) 41 | 42 | 43 | def DatetimeValidator( 44 | date_order: Optional[const.DateOrder] = None, 45 | languages: Optional[list[str]] = None, 46 | timezone: Optional[str] = None, 47 | strict: bool = False, 48 | prefer_future_dates: bool = False, 49 | relative_base: Optional[DateOrDatetime] = None, 50 | ): 51 | return DateValidator( 52 | date_order=date_order, 53 | is_date=False, 54 | languages=languages, 55 | timezone=timezone, 56 | strict=strict, 57 | prefer_future_dates=prefer_future_dates, 58 | relative_base=relative_base, 59 | ) 60 | 61 | 62 | Date = Annotated[datetime.date, DateValidator()] 63 | Datetime = Annotated[datetime.datetime, DatetimeValidator()] 64 | -------------------------------------------------------------------------------- /src/fuzztypes/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = "0.1.1" 2 | 3 | # logging 4 | import logging 5 | 6 | logger = logging.getLogger("fuzztypes") 7 | logger.setLevel(logging.WARNING) 8 | 9 | # flags and constants 10 | from . import flags 11 | from . import const 12 | 13 | # utilities 14 | from . import utils 15 | from . import lazy 16 | 17 | # Schema 18 | from .entity import Entity, NamedEntity, EntitySource 19 | from .match import Match, MatchResult, Record 20 | 21 | # Validation 22 | from .validation import ( 23 | FuzzValidator, 24 | resolve_entity, 25 | validate_python, 26 | validate_json, 27 | get_type_adapter, 28 | ) 29 | 30 | # Named Entity Storage 31 | from . import storage 32 | from .in_memory import InMemoryValidator 33 | from .on_disk import OnDiskValidator 34 | 35 | # Base Non-Entity Types 36 | from .regex import RegexValidator 37 | 38 | # Usable Types 39 | from .ascii import ASCII 40 | from .date import Date, DateValidator, Datetime, DatetimeValidator 41 | from .emojis import Emoji, Fuzzmoji, Vibemoji 42 | from .integer import Integer 43 | from .language import ( 44 | Language, 45 | LanguageCode, 46 | LanguageName, 47 | LanguageNamedEntity, 48 | LanguageScope, 49 | LanguageType, 50 | ) 51 | from .person import Person 52 | from .regex import Email, SSN, ZipCode 53 | 54 | 55 | __all__ = ( 56 | "ASCII", 57 | "Date", 58 | "Email", 59 | "Emoji", 60 | "Entity", 61 | "EntitySource", 62 | "Fuzzmoji", 63 | "FuzzValidator", 64 | "InMemoryValidator", 65 | "Integer", 66 | "Language", 67 | "LanguageCode", 68 | "LanguageName", 69 | "LanguageNamedEntity", 70 | "LanguageScope", 71 | "LanguageType", 72 | "Match", 73 | "MatchResult", 74 | "NamedEntity", 75 | "OnDiskValidator", 76 | "Person", 77 | "Record", 78 | "RegexValidator", 79 | "SSN", 80 | "Date", 81 | "DateValidator", 82 | "Datetime", 83 | "DatetimeValidator", 84 | "Vibemoji", 85 | "ZipCode", 86 | "const", 87 | "flags", 88 | "get_type_adapter", 89 | "lazy", 90 | 
"logger", 91 | "utils", 92 | "resolve_entity", 93 | "validate_json", 94 | "validate_python", 95 | ) 96 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ACTIVATE = . ./activate.sh 2 | 3 | format: 4 | $(ACTIVATE) && ruff format src tests 5 | 6 | test: 7 | $(ACTIVATE) && pytest -s tests/ 8 | 9 | cov: 10 | $(ACTIVATE) && coverage run -m pytest -s tests && coverage combine && coverage report --show-missing && coverage html 11 | 12 | sync: 13 | uv pip compile pyproject.toml -o requirements.txt 14 | uv pip compile pyproject.toml --extra test --extra local --extra ext -o requirements-dev.txt 15 | uv pip sync requirements-dev.txt 16 | uv pip install -e ".[dev]" 17 | uv pip freeze 18 | 19 | publish: 20 | # https://packaging.python.org/en/latest/tutorials/packaging-projects/ 21 | $(ACTIVATE) && python -m build 22 | $(ACTIVATE) && python -m twine upload -r pypi dist/* 23 | 24 | perf_test: 25 | $(ACTIVATE) && python -m cProfile -o profile.dat -m pytest -s tests/ 26 | 27 | echo "** Slowest FuzzTypes functions by total time:" 28 | $(ACTIVATE) && python -c "import pstats; pstats.Stats('profile.dat').sort_stats('tottime').print_stats(1000)" | grep -E "ncalls|/src/" | head -n 21 29 | 30 | echo "\n\n** Slowest FuzzTypes functions by cumulative time:" 31 | $(ACTIVATE) && python -c "import pstats; pstats.Stats('profile.dat').sort_stats('cumtime').print_stats(1000)" | grep -E "ncalls|/src/" | head -n 21 32 | 33 | echo "\n\n** Slowest all-project functions by total time:" 34 | $(ACTIVATE) && python -c "import pstats; pstats.Stats('profile.dat').sort_stats('tottime').print_stats(20)" | tail -n +8 35 | 36 | rm profile.dat 37 | 38 | pbcopy: 39 | # copy all code to clipboard for pasting into an LLM 40 | find . ! 
-path '*/.*/*' -type f \( -name "*.py" -o -name "*.md" \) -exec tail -n +1 {} + | pbcopy 41 | 42 | #---------- 43 | # clean 44 | #---------- 45 | 46 | clean: clean-build clean-pyc clean-test 47 | 48 | clean-build: 49 | rm -fr build/ 50 | rm -fr dist/ 51 | rm -fr .eggs/ 52 | find . -name '*.egg-info' -exec rm -fr {} + 53 | find . -name '*.egg' -exec rm -f {} + 54 | 55 | clean-pyc: 56 | find . -name '*.pyc' -exec rm -f {} + 57 | find . -name '*.pyo' -exec rm -f {} + 58 | find . -name '*~' -exec rm -f {} + 59 | find . -name '__pycache__' -exec rm -fr {} + 60 | 61 | clean-test: 62 | rm -fr .cache 63 | rm -fr .mypy_cache 64 | rm -fr .pytest_cache 65 | rm -f .coverage 66 | rm -fr htmlcov/ -------------------------------------------------------------------------------- /src/fuzztypes/utils/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | from datetime import datetime 4 | from typing import Optional 5 | from urllib.error import URLError, HTTPError 6 | 7 | from fuzztypes import logger, const 8 | 9 | 10 | def get_file_age_in_days(file_path: str) -> int: 11 | age = datetime.now() - datetime.fromtimestamp(os.path.getmtime(file_path)) 12 | return age.days 13 | 14 | 15 | def get_file(url: str, expires_in_days: int = 30) -> Optional[str]: 16 | """ 17 | Tries to retrieve a file from the cache or download it if not available 18 | or expired. 19 | 20 | :param url: The URL of the original file to be downloaded. 21 | :param expires_in_days: Expiration period for the cached file. 22 | :return: Path to the downloaded file, or None if fails. 
23 | """ 24 | file_name = os.path.basename(url) 25 | cache_file_path = os.path.join(const.DownloadsPath, file_name) 26 | temp_download_path = f"{cache_file_path}.tmp" 27 | 28 | cache_ok = os.path.exists(cache_file_path) 29 | if cache_ok: 30 | file_age = get_file_age_in_days(cache_file_path) 31 | cache_ok = file_age <= expires_in_days 32 | 33 | if not cache_ok: 34 | download_success = download_file(url, temp_download_path) 35 | if download_success: 36 | os.replace(temp_download_path, cache_file_path) 37 | cache_ok = os.path.exists(cache_file_path) 38 | 39 | if not cache_ok: 40 | logger.error(f"Unable to download the file and no cached file: {url}") 41 | 42 | return cache_file_path if cache_ok else None 43 | 44 | 45 | def download_file(url, download_path): 46 | """ 47 | Attempt to download a file directly to a specified path. 48 | If the download fails, logs a warning and returns None. 49 | 50 | :param url: The URL of the file to be downloaded. 51 | :param download_path: The full file path where the file should be saved. 52 | :return: Boolean indicating success or failure of the download. 
53 | """ 54 | try: 55 | urllib.request.urlretrieve(url, download_path) 56 | return True 57 | except (HTTPError, URLError, ValueError, OSError, Exception) as e: 58 | logger.warning(f"Download (url={url}) failed: {e}", exc_info=True) 59 | return False 60 | -------------------------------------------------------------------------------- /tests/in_memory/test_in_memory_name.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Optional 2 | 3 | from pydantic import BaseModel, ValidationError, Field 4 | 5 | from fuzztypes import NamedEntity, InMemoryValidator, flags 6 | 7 | names = ["George Washington", "John Adams", "Thomas Jefferson"] 8 | President = InMemoryValidator(names, search_flag=flags.NameSearch) 9 | CasedPrez = InMemoryValidator( 10 | names, case_sensitive=True, search_flag=flags.NameSearch 11 | ) 12 | NullPrez = InMemoryValidator( 13 | names, notfound_mode="none", search_flag=flags.NameSearch 14 | ) 15 | AllowPrez = InMemoryValidator( 16 | names, notfound_mode="allow", search_flag=flags.NameSearch 17 | ) 18 | 19 | 20 | def test_namestr_getitem(): 21 | entity = NamedEntity(value="Thomas Jefferson") 22 | assert President["Thomas Jefferson"] == entity 23 | assert President["THOMAS JEFFERSON"] == entity 24 | 25 | assert CasedPrez["Thomas Jefferson"] == entity 26 | try: 27 | assert CasedPrez["THOMAS JEFFERSON"] == entity 28 | assert False, "Didn't raise KeyError!" 
def test_cased_name_str():
    """Case-sensitive validator: exact casing passes, wrong casing fails."""

    class Example(BaseModel):
        value: Annotated[str, CasedPrez]

    # Exactly-cased input resolves to itself.
    assert Example(value="George Washington").value == "George Washington"

    # Wrongly-cased input must be rejected with a ValidationError.
    raised = False
    try:
        Example(value="john ADAMS")
    except ValidationError:
        raised = True
    assert raised, "Didn't raise PydanticCustomError!"
def test_matching_edge_cases():
    """Ambiguous inputs resolve by exact name before alias/code fallback."""
    # 'En' is a proper name of a language, so it wins over the 'en'
    # alpha-2 code for LanguageName; LanguageCode maps it to its own code.
    assert validate_python(LanguageName, "En") == "En"
    assert validate_python(LanguageCode, "En") == "enc"

    # 'en' is the alpha2 code for English
    assert validate_python(LanguageName, "en") == "English"
    assert validate_python(LanguageCode, "en") == "en"

    # Bangla is common name for Bengali
    assert validate_python(LanguageName, "Bangla") == "Bengali"
    assert validate_python(LanguageCode, "Bangla") == "bn"
    # Full entity resolution keeps the common name alongside the
    # canonical value and both ISO codes.
    assert validate_python(Language, "Bangla").model_dump(
        exclude_defaults=True, mode="json"
    ) == {
        "aliases": ["bn", "ben", "Bangla"],
        "alpha_2": "bn",
        "alpha_3": "ben",
        "common_name": "Bangla",
        "scope": "I",
        "type": "L",
        "value": "Bengali",
    }
def test_namestr_getitem():
    """Exercise __getitem__ across all four validator configurations."""
    expected = NamedEntity(value="Thomas Jefferson")

    # The default validator is case-insensitive.
    assert President["Thomas Jefferson"] == expected
    assert President["THOMAS JEFFERSON"] == expected
    assert resolve_entity(President, "Thomas Jefferson") == expected

    # The case-sensitive validator only matches the exact casing.
    assert CasedPrez["Thomas Jefferson"] == expected
    raised = False
    try:
        CasedPrez["THOMAS JEFFERSON"]
    except KeyError:
        raised = True
    assert raised, "Didn't raise KeyError!"

    # notfound_mode="none" yields None; notfound_mode="allow" passes
    # the unmatched key straight through as a new entity value.
    assert NullPrez["The Rock"] is None
    assert AllowPrez["The Rock"].value == "The Rock"
def test_mdy_vs_ymd():
    """Ambiguous numeric dates resolve per language/date_order settings."""
    # MDY vs. YMD ordering is context specific
    # https://dateparser.readthedocs.io/en/latest/settings.html#date-order
    #
    # Default parsing treats "02-03-04" as MDY -> Feb 3, 2004.
    assert validate_python(Date, "02-03-04") == date(year=2004, month=2, day=3)

    # English locale implies MDY ordering.
    DateEN = Annotated[date, DateValidator(languages=["en"])]
    assert validate_python(DateEN, "02-03-04") == date(
        year=2004, month=2, day=3
    )

    # Explicit MDY gives the same result as the English default.
    DateMDY = Annotated[date, DateValidator(date_order="MDY")]
    assert validate_python(DateMDY, "02-03-04") == date(
        year=2004, month=2, day=3
    )

    # Spanish locale implies DMY ordering -> Mar 2, 2004.
    DateES = Annotated[date, DateValidator(languages=["es"])]
    assert validate_python(DateES, "02-03-04") == date(
        year=2004, month=3, day=2
    )

    DateDMY = Annotated[date, DateValidator(date_order="DMY")]
    assert validate_python(DateDMY, "02-03-04") == date(
        year=2004, month=3, day=2
    )

    # YMD reads the leading field as the (two-digit) year -> Mar 4, 2002.
    DateYMD = Annotated[date, DateValidator(date_order="YMD")]
    assert validate_python(DateYMD, "02-03-04") == date(
        year=2002, month=3, day=4
    )
def test_meta_edge_cases():
    """Unknown attributes raise until assigned; declared fields behave normally."""
    entity = NamedEntity(value="a")

    # Reading an attribute that was never set must fail loudly.
    raised = False
    try:
        entity.unknown
    except AttributeError:
        raised = True
    assert raised, "Did not throw AttributeError exception."

    # Once assigned, the same attribute reads back.
    entity.unknown = 123
    assert entity.unknown == 123

    # Declared fields start at their defaults and remain writable.
    assert entity.label is None
    entity.label = "LABEL"
    assert entity.label == "LABEL"
88 | -------------------------------------------------------------------------------- /tests/in_memory/test_in_memory_alias.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import Annotated 3 | from pydantic import BaseModel, ValidationError 4 | 5 | from fuzztypes import InMemoryValidator, flags 6 | 7 | 8 | @pytest.fixture(scope="session") 9 | def MythicalFigure(MythSource): 10 | return InMemoryValidator(MythSource, search_flag=flags.AliasSearch) 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def CasedMythicalFigure(MythSource): 15 | return InMemoryValidator( 16 | MythSource, 17 | search_flag=flags.AliasSearch, 18 | case_sensitive=True, 19 | ) 20 | 21 | 22 | def test_alias_uncased_getitem(MythicalFigure): 23 | # Testing Alias with aliases 24 | assert MythicalFigure["Odysseus"].value == "Odysseus" 25 | assert MythicalFigure["Ulysses"].value == "Odysseus" # alias 26 | assert MythicalFigure["athena"].value == "Athena" # case insensitivity 27 | 28 | 29 | def test_alias_cased_getitem(CasedMythicalFigure): 30 | # Testing AliasCasedStr, expecting case-sensitive behavior 31 | assert CasedMythicalFigure["Athena"].value == "Athena" 32 | 33 | with pytest.raises(KeyError): 34 | # This should fail because CasedMythicalFigure is case-sensitive 35 | assert CasedMythicalFigure["athena"].value == "Athena" 36 | 37 | 38 | def test_uncased_alias_str(MythicalFigure): 39 | class Example(BaseModel): 40 | value: Annotated[str, MythicalFigure] 41 | 42 | # Exact match 43 | assert Example(value="Zeus").value == "Zeus" 44 | # Alias match 45 | assert Example(value="Jupiter").value == "Zeus" 46 | # Case-insensitive alias match 47 | assert Example(value="jove").value == "Zeus" 48 | 49 | 50 | def test_cased_alias_str(CasedMythicalFigure): 51 | class Example(BaseModel): 52 | value: Annotated[str, CasedMythicalFigure] 53 | 54 | # Exact match 55 | assert Example(value="Zeus").value == "Zeus" 56 | # Alias match 57 | assert 
def test_duplicate_records():
    """Aliases shared by several entities are ambiguous unless a tiebreaker is set."""
    # "b" is an alias of all three entities "c", "a" and "d".
    source = [["c", "b"], ["a", "b"], ["d", "b"]]

    A = InMemoryValidator(source)
    assert A["a"].value == "a"

    # Default tiebreaker_mode ("raise"): an ambiguous alias is a KeyError
    # whose message lists every candidate in source order.
    try:
        assert A["b"].value == "a"
        assert False, "Didn't raise exception!"
    except KeyError as e:
        msg = str(e.args[0])
        assert (
            msg == "Key Error: b "
            '["b" could not be resolved, did you mean "c", "a", or "d"?]'
        )

    # "lesser" picks the alphabetically smallest candidate value.
    A = InMemoryValidator(source, tiebreaker_mode="lesser")
    assert A["b"].value == "a"

    # "greater" picks the alphabetically largest candidate value.
    A = InMemoryValidator(source, tiebreaker_mode="greater")
    assert A["b"].value == "d"
def test_valid_ssn():
    """A well-formed SSN embedded in surrounding text is extracted."""
    # The three assertions originally here were byte-identical; their
    # comments ("value call", "entity value comparison", "entity
    # equivalence") described distinctions that no longer existed in the
    # code, so a single extraction check covers the same behavior.
    assert validate_python(SSN, "Valid SSN: 123-45-6789") == "123-45-6789"
80 | except ValidationError: 81 | pass 82 | 83 | 84 | def test_zip_code_with_invalid_four_format(): 85 | # Python's re module does not support lookbehinds (?= 1.13.0"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "FuzzTypes" 7 | description = "FuzzTypes is a Pydantic extension for annotating autocorrecting fields" 8 | readme = "README.md" 9 | requires-python = ">=3.9" 10 | authors = [ 11 | { name = "Ian Maurer", email = "ian@genomoncology.com" }, 12 | ] 13 | classifiers = [ 14 | "Intended Audience :: Developers", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Programming Language :: Python :: 3", 18 | "Topic :: Software Development :: Libraries", 19 | ] 20 | dependencies = [ 21 | "pydantic >= 2.6.1", 22 | ] 23 | dynamic = ["version"] 24 | 25 | [project.optional-dependencies] 26 | test = [ 27 | "pytest", 28 | "pytest-mock", 29 | "coverage[toml]", 30 | ] 31 | local = [ 32 | "build", 33 | "jupyter", 34 | "ipython", 35 | "mypy", 36 | "pip", 37 | "setuptools", 38 | "twine", 39 | ] 40 | ext = [ 41 | "anyascii", 42 | "dateparser", 43 | "emoji", 44 | "lancedb", 45 | "nameparser", # Note: LGPL. 46 | "number-parser", 47 | "rapidfuzz", 48 | "sentence-transformers", 49 | "tantivy", 50 | "unidecode", # Note: GPL. 
51 | ] 52 | 53 | [tool.setuptools.package-data] 54 | 55 | [project.urls] 56 | 57 | [project.scripts] 58 | #fuzztypes = "fuzztypes:cli" 59 | 60 | [tool.hatch.version] 61 | path = "src/fuzztypes/__init__.py" 62 | 63 | [tool.mypy] 64 | check_untyped_defs = true 65 | 66 | [tool.pytest.ini_options] 67 | addopts = [ 68 | "--strict-config", 69 | "--strict-markers", 70 | ] 71 | xfail_strict = true 72 | junit_family = "xunit2" 73 | norecursedirs = ".venv" 74 | filterwarnings = [ 75 | ] 76 | 77 | [tool.coverage.run] 78 | parallel = true 79 | source = [ 80 | "src", 81 | ] 82 | context = '${CONTEXT}' 83 | omit = [ 84 | '__main__.py', 85 | '__init__.py', 86 | ] 87 | 88 | [tool.ruff] 89 | line-length = 79 90 | select = [ 91 | "E", # pycodestyle errors 92 | "W", # pycodestyle warnings 93 | "F", # pyflakes 94 | "I", # isort 95 | "C", # flake8-comprehensions 96 | "B", # flake8-bugbear 97 | ] 98 | [tool.coverage.report] 99 | exclude_also = [ 100 | "def __repr__", 101 | "if self.debug:", 102 | "if settings.DEBUG", 103 | "raise AssertionError", 104 | "raise NotImplementedError", 105 | "except ImportError", 106 | "if 0:", 107 | "if __name__ == .__main__.:", 108 | "if TYPE_CHECKING:", 109 | "class .*\\bProtocol\\):", 110 | "@(abc\\.)?abstractmethod", 111 | ] 112 | 113 | [tool.ruff.isort] 114 | known-third-party = ["click", "pydantic"] 115 | 116 | [tool.ruff.format] 117 | quote-style = "double" 118 | indent-style = "space" 119 | skip-magic-trailing-comma = false 120 | line-ending = "auto" 121 | 122 | [tool.isort] 123 | extend_skip = ["__init__.py"] 124 | -------------------------------------------------------------------------------- /tests/test_person.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel, ValidationError 4 | 5 | from fuzztypes import Person, validate_python 6 | 7 | 8 | class MyModel(BaseModel): 9 | person: Person 10 | optional: Optional[Person] = None 11 | 12 | 13 | def 
def test_mixed_capitalization_with_validate_python():
    """Lower-cased input is capitalized, including internal caps (MacLaine)."""
    parsed = validate_python(Person, "shirley maclaine")
    assert (parsed.first, parsed.last) == ("Shirley", "MacLaine")
def test_value_error():
    """Missing or wrongly-typed person input is rejected."""
    # An empty payload is missing the required "person" field.
    raised = False
    try:
        validate_python(MyModel, {})
    except ValidationError:
        raised = True
    assert raised, "Didn't fail as expected."

    # A non-string, non-dict person value cannot be parsed.
    raised = False
    try:
        validate_python(MyModel, {"person": 5})
    except ValueError:
        raised = True
    assert raised, "Didn't fail as expected."
29 | # https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html 30 | # Scorers: 31 | # ratio: Calculates Levenshtein Distance similarity ratio 32 | # partial_ratio: Compares substrings, good for different length strings 33 | # token_set_ratio: Compares unique words, allows different word order 34 | # partial_token_set_ratio: Like token_set_ratio but compares substrings 35 | # token_sort_ratio: Sorts words before compare, good when order is irrelevant 36 | # partial_token_sort_ratio: Like token_sort_ratio but compares substrings 37 | # token_ratio: Averages token_sort_ratio and token_set_ratio 38 | # partial_token_ratio: Averages partial token sort and set ratios 39 | # WRatio: Weighted combination of different ratios based on string lengths 40 | # QRatio: Faster version of ratio, less accurate 41 | FuzzScorer = Literal[ 42 | "ratio", 43 | "partial_ratio", 44 | "token_set_ratio", 45 | "partial_token_set_ratio", 46 | "token_sort_ratio", 47 | "partial_token_sort_ratio", 48 | "token_ratio", 49 | "partial_token_ratio", 50 | "WRatio", 51 | "QRatio", 52 | ] 53 | 54 | # What happens if a matching entity is not found for key? 55 | # raise: raises an exception if no matching entity found 56 | # none: sets value to None if no matching entity found 57 | # allow: passes through key 58 | NotFoundMode = Literal["raise", "none", "allow"] 59 | 60 | 61 | # What happens if there is a tie? 62 | # raise: raise an exception if two elements are tied without Entity.priority 63 | # lesser: use lower Entity.value, if Entity.priority not set or different 64 | # greater: use greater Entity.value, if Entity.priority not set or different 65 | TiebreakerMode = Literal["raise", "lesser", "greater"] 66 | 67 | # Which Pydantic validator mode? 68 | # https://docs.pydantic.dev/latest/concepts/validators/ 69 | # Only 'before' has been tested, 'plain' may work with no changes. 70 | # Refactoring probably required for 'after' and 'wrap'. 71 | ValidatorMode = Literal["before"] # ... 
, "after", "plain", "wrap"] 72 | -------------------------------------------------------------------------------- /tests/utils/test_download.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import pytest 4 | 5 | from fuzztypes.const import DownloadsPath 6 | from fuzztypes.utils.download import get_file 7 | 8 | 9 | @pytest.fixture 10 | def mock_path_exists(mocker): 11 | return mocker.patch("os.path.exists") 12 | 13 | 14 | @pytest.fixture 15 | def mock_getmtime(mocker): 16 | return mocker.patch("os.path.getmtime") 17 | 18 | 19 | @pytest.fixture 20 | def mock_replace(mocker): 21 | return mocker.patch("os.replace") 22 | 23 | 24 | @pytest.fixture 25 | def mock_file_age(mocker): 26 | return mocker.patch("fuzztypes.utils.download.get_file_age_in_days") 27 | 28 | 29 | @pytest.fixture 30 | def mock_urlretrieve(mocker): 31 | return mocker.patch("urllib.request.urlretrieve") 32 | 33 | 34 | @pytest.fixture 35 | def mock_logger_warning(mocker): 36 | return mocker.patch("fuzztypes.logger.warning") 37 | 38 | 39 | @pytest.fixture 40 | def mock_logger_error(mocker): 41 | return mocker.patch("fuzztypes.logger.error") 42 | 43 | 44 | def test_get_file_cache_hit(mock_path_exists, mock_file_age, mock_replace): 45 | mock_path_exists.return_value = True 46 | mock_file_age.return_value = 10 47 | 48 | result = get_file("http://example.com/file.txt") 49 | assert result == os.path.join(DownloadsPath, "file.txt") 50 | mock_replace.assert_not_called() 51 | 52 | 53 | def test_cache_miss_due_to_expiry( 54 | mock_path_exists, mock_file_age, mock_replace, mock_urlretrieve 55 | ): 56 | mock_path_exists.return_value = True 57 | mock_file_age.return_value = 31 58 | mock_urlretrieve.return_value = True 59 | 60 | result = get_file("http://example.com/file.txt") 61 | assert result == os.path.join(DownloadsPath, "file.txt") 62 | mock_replace.assert_called_once() 63 | mock_urlretrieve.assert_called_once_with( 64 | "http://example.com/file.txt", 
def test_download_failure(
    mock_path_exists, mock_logger_error, mock_urlretrieve
):
    """get_file returns None and logs an error when download fails with no cache."""
    # No cached copy exists...
    mock_path_exists.return_value = False
    # ...and the download itself blows up.
    mock_urlretrieve.side_effect = Exception("Download failed")
    assert get_file("http://example.com/file.txt") is None
    # get_file reports the unrecoverable miss exactly once via logger.error.
    mock_logger_error.assert_called_once()
class MatchResult(BaseModel):
    """Accumulates candidate matches and records the final selection."""

    matches: List[Match] = Field(default_factory=list)
    choice: Optional[Match] = None

    def __bool__(self):
        return bool(self.matches)

    def __len__(self):
        return len(self.matches)

    def __getitem__(self, item):
        return self.matches[item]

    @property
    def entity(self):
        # Deliberately falsy (False) when no choice was made, otherwise
        # the chosen match's entity.
        return self.choice is not None and self.choice.entity

    def append(self, match: Match):
        """Add a match to the list of potential matches."""
        self.matches.append(match)

    def choose(self, min_score: float, tiebreaker_mode: const.TiebreakerMode):
        """Filter matches by score, sort by rank/alpha, and make choice.

        With "raise" tiebreaker_mode and distinct entities tied on rank,
        no choice is set (the caller treats that as unresolved).
        """
        eligible = [m for m in self.matches if m.score >= min_score]
        eligible.sort()

        if not eligible:
            return

        best = eligible[0]
        # Distinct entities sharing the best rank compete for the choice.
        rivals = [
            m
            for m in eligible[1:]
            if m.rank == best.rank and m.entity != best.entity
        ]

        if not rivals or tiebreaker_mode == "lesser":
            self.choice = best
        elif tiebreaker_mode == "greater":
            self.choice = rivals[-1]
Type[NamedEntity] = NamedEntity, 85 | ) -> List[Match]: 86 | return [record.to_match(key, score, entity_type) for record in recs] 87 | 88 | def to_match( 89 | self, 90 | key, 91 | score: float = 100.0, 92 | entity_type: Type[NamedEntity] = NamedEntity, 93 | ) -> Match: 94 | if isinstance(self.entity, str): 95 | match_entity = entity_type.model_validate_json(self.entity) 96 | else: 97 | match_entity = self.entity 98 | 99 | return Match( 100 | key=key, 101 | entity=match_entity, 102 | is_alias=self.is_alias, 103 | score=score, 104 | term=self.term, 105 | ) 106 | -------------------------------------------------------------------------------- /src/fuzztypes/language.py: -------------------------------------------------------------------------------- 1 | import json 2 | from enum import Enum 3 | from typing import Annotated, Optional, List, Iterable, Type 4 | 5 | from pydantic import TypeAdapter 6 | 7 | from fuzztypes import EntitySource, NamedEntity, OnDiskValidator, flags, utils 8 | 9 | 10 | class LanguageScope(Enum): 11 | INDIVIDUAL = "I" 12 | MACROLANGUAGE = "M" 13 | SPECIAL = "S" 14 | 15 | 16 | class LanguageType(Enum): 17 | ANCIENT = "A" 18 | CONSTRUCTED = "C" 19 | EXTINCT = "E" 20 | HISTORICAL = "H" 21 | LIVING = "L" 22 | SPECIAL = "S" 23 | 24 | 25 | class LanguageNamedEntity(NamedEntity): 26 | """Resolves to language full name.""" 27 | 28 | alpha_2: Optional[str] = None 29 | alpha_3: str 30 | scope: Optional[LanguageScope] = None 31 | type: Optional[LanguageType] = None 32 | common_name: Optional[str] = None 33 | inverted_name: Optional[str] = None 34 | bibliographic: Optional[str] = None 35 | 36 | @property 37 | def code(self): 38 | return self.alpha_2 or self.alpha_3 39 | 40 | 41 | class LanguageModelNamedEntity(LanguageNamedEntity): 42 | """Resolves to self as a full child object.""" 43 | 44 | def resolve(self): 45 | return self 46 | 47 | 48 | class LanguageCodeNameEntity(LanguageNamedEntity): 49 | """Resolves to code name.""" 50 | 51 | def resolve(self): 52 
def load_languages(
    entity_cls: Type[LanguageNamedEntity] = LanguageNamedEntity,
):
    """Build a zero-argument loader of ISO 639-3 language entities.

    :param entity_cls: NamedEntity subclass to instantiate; this controls
        how each entity resolves (full name, alpha code, or full model).
    :return: Callable suitable for wrapping in an EntitySource.
    """

    def do_load() -> Iterable[NamedEntity]:
        repo = "https://salsa.debian.org/iso-codes-team/iso-codes/"
        remote = f"{repo}-/raw/main/data/iso_639-3.json"
        local = utils.get_file(remote)
        assert local, f"Could not download: {remote}"

        # Close the file handle promptly instead of leaking it.
        with open(local) as f:
            data = json.load(f)["639-3"]

        alias_fields = {
            "alpha_2",
            "alpha_3",
            "common_name",
            "inverted_name",
            "bibliographic",
        }
        for item in data:
            item["value"] = item.pop("name")
            item["aliases"] = [v for k, v in item.items() if k in alias_fields]

        # Bug fix: validate against entity_cls. The previous hard-coded
        # List[LanguageNamedEntity] ignored the requested subclass, so
        # LanguageCode/Language lost their resolve() overrides.
        return TypeAdapter(List[entity_cls]).validate_python(data)

    return do_load
class PersonModel(BaseModel):
    """A parsed person name with configurable formatting.

    Formatting is delegated to nameparser's HumanName (lazily imported
    via ``parse``); ``name_format``/``init_format`` control how the
    components are rendered.
    """

    # rendering templates (see FULL_NAME / FULL_INIT constants above)
    name_format: str = FULL_NAME
    init_format: str = FULL_INIT
    # individual name components; empty string means "absent"
    title: str = ""
    first: str = ""
    middle: str = ""
    last: str = ""
    suffix: str = ""
    nickname: str = ""

    def __str__(self) -> str:
        return self.name

    # names

    @property
    def name(self) -> str:
        """Name rendered with this instance's own name_format."""
        return str(self.human_name())

    @property
    def full_name(self) -> str:
        return str(self.human_name(name_format=FULL_NAME))

    @property
    def short_name(self) -> str:
        return str(self.human_name(name_format=SHORT_NAME))

    @property
    def legal_name(self) -> str:
        return str(self.human_name(name_format=LEGAL_NAME))

    @property
    def last_name_first(self) -> str:
        return str(self.human_name(name_format=LAST_NAME_FIRST))

    # initials

    @property
    def initials(self) -> str:
        """Initials rendered with this instance's own init_format."""
        return self.human_name().initials()

    @property
    def full_initials(self) -> str:
        return self.human_name(init_format=FULL_INIT).initials()

    @property
    def short_initials(self) -> str:
        return self.human_name(init_format=SHORT_INIT).initials()

    # human name object from nameparser library

    def human_name(self, name_format=None, init_format=None):
        """Construct a HumanName; formats default to this instance's."""
        name_format = name_format or self.name_format
        init_format = init_format or self.init_format
        return parse(
            string_format=name_format,
            initials_format=init_format,
            title=self.title,
            first=self.first,
            middle=self.middle,
            last=self.last,
            suffix=self.suffix,
            nickname=self.nickname,
        )
def test_fuzzy_tags_priority(Tag):
    """Fuzzy tag resolution: priority tiebreak, threshold pass-through."""
    # since min_similarity is 50.0, it chooses higher priority
    assert validate_python(Tag, "4d") == "3d"

    # matches because 67% ratio > 50.0 minimum
    assert validate_python(Tag, "27d") == "2d"

    # less than 50% similarity is passed through (notfound_mode="allow")
    assert validate_python(Tag, "17d") == "17d"

    # different: "18d" scores above threshold against "i18n"
    assert validate_python(Tag, "18d") == "i18n"

    # todo: collect allowed tags and use for future fuzzy matching
    # assert validate_python(Tag, "15d") == "17d"
    assert validate_python(Tag, "15d") == "15d"
def test_as_a_list_of_tags(TagSource):
    """Fuzzy tag type works element-wise inside List[...] model fields
    and round-trips through JSON serialization."""
    Tag = Annotated[
        str,
        InMemoryValidator(
            TagSource,
            notfound_mode="allow",
            search_flag=flags.FuzzSearch,
            min_similarity=50.0,
            fuzz_scorer="QRatio",
        ),
    ]

    class Post(BaseModel):
        text: str
        tags: List[Tag]

    post = Post(
        text="Prompt injection is unsolved still.",
        tags=["prompt_injection", "AI"],
    )

    # each element is resolved independently against the tag source
    assert post.tags == ["promptinjection", "ai"]

    # resolved values survive a serialize/deserialize round trip
    json = post.model_dump_json()
    second = Post.model_validate_json(json)
    assert second.tags == ["promptinjection", "ai"]
def resolve_entity(cls: Any, value: Any) -> Optional[Entity]:
    """
    Returns entity from metadata if cls is a FuzzValidator.

    Walks cls itself followed by its Annotated metadata; every
    FuzzValidator encountered is queried, and the last lookup wins.

    :param cls: Any object
    :param value: input value
    :return: Entity if validator is an entity source
    """
    found: Optional[Entity] = None
    for candidate in chain([cls], get_args(cls)):
        if isinstance(candidate, FuzzValidator):
            found = candidate[value]
    return found
    def __get_pydantic_json_schema__(
        self,
        schema: CoreSchema,
        handler: GetJsonSchemaHandler,
    ) -> json_schema.JsonSchemaValue:
        """
        Generate the JSON schema for the AbstractType.

        This method is used internally by Pydantic to generate the JSON
        schema representation of the AbstractType, including any examples.

        :param schema: core schema produced for the annotated type.
        :param handler: pydantic callback that converts it to JSON schema.
        :return: JSON schema dict, with "examples" attached when provided.
        """
        schema = handler(schema)
        # Surface validator-level examples in the generated schema.
        if self.examples is not None:
            schema["examples"] = self.examples
        return schema
    def __getitem__(self, key: str) -> Optional[NamedEntity]:
        """Resolve key to a NamedEntity, honoring notfound_mode.

        :param key: lookup term.
        :return: matched entity; a synthesized entity ("allow" mode);
            or None ("none" mode).
        :raises PydanticCustomError: in "raise" mode when unresolved,
            with near-miss suggestions when any candidates scored.
        """
        # Lazily build the index on first lookup.
        if not self.prepped:
            self.prepped = True
            self.prepare()

        match_list = self.get(key)
        match_list.choose(self.min_similarity, self.tiebreaker_mode)

        if match_list.choice is not None:
            return match_list.entity

        # Unresolved: behavior depends on notfound_mode.
        if self.notfound_mode == "allow":
            # Pass the key through as a brand-new entity.
            return self.entity_type(value=key)

        if self.notfound_mode == "none":
            return None

        # "raise" mode: build a helpful did-you-mean message from any
        # candidates that matched but were filtered or tied.
        msg = '"{key}" could not be resolved'
        ctx: Dict[str, Any] = dict(key=key)
        if match_list:
            near = [f'"{match.entity.value}"' for match in match_list.matches]
            if len(near) > 1:
                near[-1] = "or " + near[-1]
            msg += f", did you mean {', '.join(near)}?"
        raise PydanticCustomError("key_not_found", msg, ctx)
def test_min_score():
    """Loose matcher resolves weak input; strict matcher must raise."""
    assert Model(loose="B K L").loose == "A B C"

    try:
        Model(strict="B K L")
        # Bug fix: the previous `assert "Expected validation error!"`
        # asserted a truthy string literal and could never fail. Raise
        # explicitly (same pattern as test_get_item in this file) so a
        # missing ValidationError is actually reported.
        raise AssertionError("Expected validation error!")

    except ValidationError as e:
        assert e.errors(include_url=False) == [
            {
                "ctx": {"key": "B K L"},
                "input": "B K L",
                "loc": ("strict",),
                "msg": '"B K L" could not be resolved, did you mean "A B C"?',
                "type": "key_not_found",
            }
        ]
1 | from collections import defaultdict 2 | from typing import Callable, Iterable, Union, Type, Optional 3 | 4 | from pydantic import PositiveInt 5 | 6 | from fuzztypes import ( 7 | FuzzValidator, 8 | Match, 9 | MatchResult, 10 | NamedEntity, 11 | Record, 12 | const, 13 | flags, 14 | lazy, 15 | storage, 16 | ) 17 | 18 | 19 | class InMemoryValidatorStorage(storage.AbstractStorage): 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | 23 | self._mapping = defaultdict(list) 24 | self._terms = [] 25 | self._is_alias = [] 26 | self._entities = [] 27 | self._embeddings = None 28 | 29 | # 30 | # Prepare 31 | # 32 | 33 | def prepare(self): 34 | for item in self.source: 35 | entity = self.entity_type.convert(item) 36 | self.add(entity) 37 | 38 | def add(self, entity: NamedEntity) -> None: 39 | if self.search_flag.is_name_ok: 40 | self.add_by_name(entity) 41 | 42 | if self.search_flag.is_alias_ok: 43 | self.add_by_alias(entity) 44 | 45 | if self.search_flag.is_fuzz_or_semantic_ok: 46 | self.add_fuzz_or_semantic(entity) 47 | 48 | def add_by_name(self, entity: NamedEntity) -> None: 49 | term = entity.value 50 | norm_term = self.normalize(term) 51 | record = Record( 52 | entity=entity, term=term, norm_term=norm_term, is_alias=False 53 | ) 54 | self._mapping[norm_term].append(record) 55 | 56 | def add_by_alias(self, entity: NamedEntity) -> None: 57 | for term in entity.aliases: 58 | norm_term = self.normalize(term) 59 | record = Record( 60 | entity=entity, term=term, norm_term=norm_term, is_alias=True 61 | ) 62 | self._mapping[norm_term].append(record) 63 | 64 | def add_fuzz_or_semantic(self, entity: NamedEntity) -> None: 65 | clean_name: str = self.fuzz_clean(entity.value) 66 | self._terms.append(clean_name) 67 | self._entities.append(entity) 68 | self._is_alias.append(False) 69 | 70 | for alias in entity.aliases: 71 | clean_alias: str = self.fuzz_clean(alias) 72 | self._terms.append(clean_alias) 73 | self._entities.append(entity) 74 | 
self._is_alias.append(True) 75 | 76 | # 77 | # Getters 78 | # 79 | 80 | def get(self, key: str) -> MatchResult: 81 | records = self._mapping.get(self.normalize(key), []) 82 | match_list = Record.from_list( 83 | records, key=key, entity_type=self.entity_type 84 | ) 85 | 86 | results = MatchResult(matches=match_list) 87 | 88 | if not results: 89 | if self.search_flag.is_fuzz_ok: 90 | results = self.get_by_fuzz(key) 91 | 92 | if self.search_flag.is_semantic_ok: 93 | results = self.get_by_semantic(key) 94 | 95 | return results 96 | 97 | # 98 | # Fuzzy Matching 99 | # 100 | 101 | def get_by_fuzz(self, term) -> MatchResult: 102 | query = self.fuzz_clean(term) 103 | matches = self.fuzz_match(query) 104 | return matches 105 | 106 | def fuzz_match( 107 | self, 108 | query: str, 109 | ) -> MatchResult: 110 | # https://rapidfuzz.github.io/RapidFuzz/Usage/process.html#extract 111 | extract = self.rapidfuzz.process.extract( 112 | query=query, 113 | choices=self._terms, 114 | scorer=self.fuzz_scorer, 115 | limit=self.limit, 116 | ) 117 | 118 | results = MatchResult() 119 | for key, score, index in extract: 120 | entity = self._entities[index] 121 | is_alias = self._is_alias[index] 122 | m = Match(key=key, entity=entity, is_alias=is_alias, score=score) 123 | results.append(m) 124 | return results 125 | 126 | # 127 | # Vector Similarity Search 128 | # 129 | 130 | def get_by_semantic(self, key) -> MatchResult: 131 | # find closest match using knn 132 | indices, scores = self.find_knn(key) 133 | 134 | # create a MatchResult from the results 135 | results = MatchResult() 136 | for index, score in zip(indices, scores): 137 | entity = self._entities[index] 138 | term = self._terms[index] 139 | is_alias = self._is_alias[index] 140 | match = Match( 141 | key=key, 142 | entity=entity, 143 | score=score, 144 | is_alias=is_alias, 145 | term=term, 146 | ) 147 | results.append(match) 148 | 149 | return results 150 | 151 | @property 152 | def embeddings(self): 153 | if self._embeddings is 
def InMemoryValidator(
    source: Iterable,
    *,
    case_sensitive: bool = False,
    encoder: Union[Callable, str, object] = None,
    entity_type: Type[NamedEntity] = NamedEntity,
    examples: Optional[list] = None,
    fuzz_scorer: const.FuzzScorer = "token_sort_ratio",
    limit: PositiveInt = 10,
    min_similarity: float = 80.0,
    notfound_mode: const.NotFoundMode = "raise",
    search_flag: flags.SearchFlag = flags.DefaultSearch,
    tiebreaker_mode: const.TiebreakerMode = "raise",
):
    """
    Create a FuzzValidator backed by in-process (RAM) storage.

    :param source: iterable of entities (or convertible items) to index.
    :param case_sensitive: if False, terms are lower-cased for lookup.
    :param encoder: encoder callable/name used for semantic search.
    :param entity_type: NamedEntity subclass used for results.
    :param examples: examples to embed in the generated JSON schema.
    :param fuzz_scorer: RapidFuzz scorer name for fuzzy search.
    :param limit: maximum number of candidate matches returned.
    :param min_similarity: minimum score (0-100) to accept a match.
    :param notfound_mode: "raise", "none", or "allow" on no match.
    :param search_flag: which strategies (name/alias/fuzz/semantic) apply.
    :param tiebreaker_mode: how equal-rank ties resolve.
    :return: FuzzValidator wrapping the in-memory storage.
    """
    in_memory = InMemoryValidatorStorage(
        source,
        case_sensitive=case_sensitive,
        encoder=encoder,
        entity_type=entity_type,
        fuzz_scorer=fuzz_scorer,
        limit=limit,
        min_similarity=min_similarity,
        notfound_mode=notfound_mode,
        search_flag=search_flag,
        tiebreaker_mode=tiebreaker_mode,
    )

    return FuzzValidator(in_memory, examples=examples)
class Entity(BaseModel, Generic[T]):
    """A value with optional label, priority, and free-form metadata.

    Unknown attributes are transparently stored in / read from ``meta``
    via the ``__getattr__``/``__setattr__`` overrides below.
    """

    value: T = Field(
        ...,
        description="Value stored by Entity.",
    )
    label: Optional[str] = Field(
        default=None,
        description="Entity concept type such as PERSON, ORG, or GPE.",
    )
    meta: Optional[dict] = Field(
        default=None,
        description="Additional attributes accessible through dot-notation.",
    )
    priority: Optional[int] = Field(
        default=None,
        description="Tiebreaker rank (higher wins, None=0, negative allowed)",
    )

    def __eq__(self, other: Any):
        # Compare by value; accepts either another Entity or a raw value.
        other = getattr(other, "value", other)
        return self.value == other

    def resolve(self) -> T:
        """Value this entity resolves to; subclasses may override."""
        return self.value

    @property
    def rank(self) -> int:
        """Normalized by converting None to 0 and making lower better."""
        return -1 * (self.priority or 0)

    def __lt__(self, other: "Entity") -> bool:
        # Sort by rank first (higher priority first), then by value.
        # noinspection PyTypeChecker
        return (self.rank, self.value) < (other.rank, other.value)

    def __getattr__(self, key: str) -> Any:
        # Check if the key exists in the meta dictionary
        if self.meta is not None and key in self.meta:
            return self.meta[key]
        # Attribute not found; raise AttributeError
        raise AttributeError(
            f"{self.__class__.__name__!r} object has no attribute {key!r}"
        )

    def __setattr__(self, key: str, value: Any):
        # Check if the key is a predefined field in the BaseModel
        if key in self.model_fields:
            super().__setattr__(key, value)
        else:
            # Anything else lands in meta (created lazily on first write).
            self.meta = self.meta or {}
            self.meta[key] = value
124 | return EntitySource(source=(self, key)) 125 | 126 | self._load_if_necessary() 127 | return self.entities[key] 128 | 129 | def __iter__(self): 130 | self._load_if_necessary() 131 | return iter(self.entities) 132 | 133 | def _load_if_necessary(self): 134 | if not self.loaded: 135 | self.loaded = True 136 | if isinstance(self.source, tuple): 137 | parent, label = self.source 138 | self.entities = [e for e in parent if e.label == label] 139 | 140 | elif callable(self.source): 141 | self.entities = self.source() 142 | 143 | elif isinstance(self.source, Path): 144 | dialects = { 145 | "csv": self.from_csv, 146 | "tsv": self.from_tsv, 147 | "jsonl": self.from_jsonl, 148 | "txt": self.from_txt, 149 | } 150 | _, ext = self.source.name.lower().rsplit(".", maxsplit=1) 151 | f = dialects.get(ext) 152 | assert f is not None, f"No reader found for: {ext}" 153 | 154 | # noinspection PyArgumentList 155 | self.entities = f(self.source) 156 | 157 | @classmethod 158 | def from_jsonl(cls, path: Path) -> List[NamedEntity]: 159 | """ 160 | Constructs an EntityList from a .jsonl file of NamedEntity definitions. 161 | 162 | :param path: Path object pointing to the .jsonl file. 163 | :return: List of Entities. 164 | """ 165 | entities = [] 166 | with path.open("r") as fp: 167 | for line in fp: 168 | entity = NamedEntity.convert(json.loads(line)) 169 | entities.append(entity) 170 | return entities 171 | 172 | def from_csv(self, path: Path) -> List[NamedEntity]: 173 | return self.from_sv(path, csv.excel) 174 | 175 | def from_tsv(self, path: Path) -> List[NamedEntity]: 176 | return self.from_sv(path, csv.excel_tab) 177 | 178 | def from_txt(self, path: Path) -> List[NamedEntity]: 179 | return self.from_sv(path, csv.excel, fieldnames=["value"]) 180 | 181 | def from_sv( 182 | self, 183 | path: Path, 184 | dialect: Type[csv.Dialect], 185 | fieldnames=None, 186 | ) -> List[NamedEntity]: 187 | """ 188 | Constructs an EntityList from a .csv or .tsv file. 
import functools
import importlib
import os
from typing import Any, List, TypedDict, Callable, Optional

from fuzztypes import const


@functools.lru_cache(maxsize=None)
def lazy_import(
    library_name: str,
    attr_name: Optional[str] = None,
    return_none_on_error: bool = False,
) -> Any:
    """
    Lazily import a library or a specific attribute from a library.

    Args:
        library_name (str): The name of the library to import.
        attr_name (str, optional): Library attribute to import from library.
        return_none_on_error (bool, optional): Whether to return None if an
            import error occurs. Default is False, which raises an ImportError.

    Returns:
        The imported library or attribute, or None if an import error occurs
        and return_none_on_error is True.

    Raises:
        ImportError: If the library or attribute is not found and
            return_none_on_error is False.
    """
    # Registry metadata drives a helpful install hint in the error message.
    info = _lib_info.get(library_name, {})

    module_name = info.get("module_name", library_name)
    install_name = info.get("install_name", library_name)
    purpose = info.get("purpose", "")
    license_type = info.get("license", "")
    url = info.get("url", "")
    version = info.get("version", "")

    try:
        module = importlib.import_module(module_name)
        if attr_name:
            return getattr(module, attr_name)
        return module
    except ImportError as e:
        # Fix: leading space so the hint reads `pip install name (version X)`
        # instead of running the name and version together.
        version_info = f" (version {version})" if version else ""
        install = f"`pip install {install_name}{version_info}`"
        details = ", ".join(list(filter(None, [purpose, url, license_type])))
        details = f" ({details})" if details else ""
        msg = f"Import Failed: {install}{details}"

        if not info:
            additional_msg = (
                f"\nPlease add the library '{library_name}' to "
                f"the '_lib_info' dictionary in the 'lazy' "
                f"module."
            )
            msg += additional_msg

        if return_none_on_error:
            return None
        else:
            raise ImportError(msg) from e


@functools.lru_cache(maxsize=None)
def create_encoder(
    model_or_model_name: Optional[str], device: const.DeviceList
):
    """
    Return an encode(texts) function backed by a SentenceTransformer.

    The model is loaded (and cached to const.ModelsPath) on first use, not
    at creation time; lru_cache ensures one encoder per (model, device).

    :param model_or_model_name: Model name, or None for const.DefaultEncoder.
        (Fix: annotated Optional — the body explicitly handles None.)
    :param device: Device passed to sentence-transformers (e.g. cuda/mps).
    """

    def get_encoder():
        # Rebinds the closed-over name to the loaded model so subsequent
        # calls skip the load entirely.
        nonlocal model_or_model_name

        if model_or_model_name is None:
            model_or_model_name = const.DefaultEncoder

        if isinstance(model_or_model_name, str):
            sbert = lazy_import("sentence_transformers")
            local_path = os.path.join(const.ModelsPath, model_or_model_name)

            if not os.path.exists(local_path):  # pragma: no cover
                # First use: download the model, then save a local copy.
                encoder = sbert.SentenceTransformer(
                    model_or_model_name, device=device
                )
                encoder.save(local_path)
            else:
                encoder = sbert.SentenceTransformer(local_path)

            model_or_model_name = encoder

        return model_or_model_name

    def encode(texts: List[str]) -> List:
        """Encode a batch of texts into embedding vectors."""
        return get_encoder().encode(texts, device=device)

    return encode
class RankResult(TypedDict):
    """A single reranker hit: document text, relevance score, corpus index."""

    text: str
    score: float
    corpus_id: int


def create_reranker(
    model_name: str,
) -> Callable[[str, List[str], int], List[RankResult]]:
    """
    Creates a reranker function using the specified sentence transformer model.

    :param model_name: Name of the CrossEncoder model
        (e.g. "mixedbread-ai/mxbai-rerank-xsmall-v1")

    :return: rerank function Callable
    """

    def get_reranker():
        # Load from the local models cache when present; otherwise download
        # the CrossEncoder once and save it for subsequent runs.
        sbert = lazy_import("sentence_transformers")
        local_path = os.path.join(const.ModelsPath, model_name)

        if not os.path.exists(local_path):  # pragma: no cover
            reranker = sbert.CrossEncoder(model_name)
            reranker.save(local_path)
        else:
            reranker = sbert.CrossEncoder(local_path)

        return reranker

    def rerank(
        query: str,
        documents: List[str],
        top_k: int = 3,
    ) -> List[RankResult]:
        # Model loading is deferred to the first rerank call.
        reranker = get_reranker()
        results: List[RankResult] = reranker.rank(
            query, documents, return_documents=True, top_k=top_k
        )
        return results

    return rerank


# Registry of optional third-party libraries: module/install names plus
# metadata used by lazy_import to build actionable ImportError messages.
_lib_info = {
    "sentence-transformers": {
        "module_name": "sentence_transformers",
        "install_name": "sentence-transformers",
        "purpose": "Encoding sentences into high-dimensional vectors",
        "license": "Apache 2.0",
        "url": "https://github.com/UKPLab/sentence-transformers",
    },
    "unidecode": {
        "module_name": "unidecode",
        "install_name": "Unidecode",
        "purpose": "Converting Unicode text into ASCII equivalents",
        "license": "GPL",
        "url": "https://github.com/avian2/unidecode",
    },
    "anyascii": {
        "module_name": "anyascii",
        "install_name": "anyascii",
        "purpose": "Converting Unicode text into ASCII equivalents",
        "license": "ISC",
        "url": "https://github.com/anyascii/anyascii",
    },
    "rapidfuzz": {
        "module_name": "rapidfuzz",
        "install_name": "rapidfuzz",
        "purpose": "Performing fuzzy string matching",
        "license": "MIT",
        "url": "https://github.com/maxbachmann/RapidFuzz",
    },
    "dateparser": {
        "module_name": "dateparser",
        "install_name": "dateparser",
        "purpose": "Parsing dates from strings",
        "license": "BSD-3-Clause",
        "url": "https://github.com/scrapinghub/dateparser",
    },
    "emoji": {
        "module_name": "emoji",
        "install_name": "emoji",
        "purpose": "Handling and manipulating emoji characters",
        "license": "BSD",
        "url": "https://github.com/carpedm20/emoji",
    },
    "nameparser": {
        "module_name": "nameparser",
        "install_name": "nameparser",
        "purpose": "Parsing person names",
        "license": "LGPL",
        "url": "https://github.com/derek73/python-nameparser",
    },
    "number-parser": {
        "module_name": "number_parser",
        "install_name": "number-parser",
        "purpose": "Parsing numbers from strings",
        "license": "BSD-3-Clause",
        "url": "https://github.com/scrapinghub/number-parser",
    },
    "pycountry": {
        "module_name": "pycountry",
        "install_name": "pycountry",
        "purpose": "Provides ISO country, subdivision, language, and currency",
        "license": "LGPL 2.1",
        "url": "https://github.com/flyingcircusio/pycountry",
    },
    "lancedb": {
        "module_name": "lancedb",
        "install_name": "lancedb",
        "purpose": "High-performance, on-disk vector database",
        "license": "Apache 2.0",
        "url": "https://github.com/lancedb/lancedb",
    },
    "numpy": {
        "module_name": "numpy",
        "install_name": "numpy",
        "purpose": "Numerical computing in Python",
        "license": "BSD",
        "url": "https://numpy.org/",
    },
    "sklearn": {
        "module_name": "sklearn",
        "install_name": "scikit-learn",
        "purpose": "Machine learning in Python",
        "license": "BSD",
        "url": "https://scikit-learn.org/",
    },
}
in Python", 223 | "license": "BSD", 224 | "url": "https://scikit-learn.org/", 225 | }, 226 | } 227 | -------------------------------------------------------------------------------- /src/fuzztypes/on_disk.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, Union, List, Type, Optional, Any 2 | 3 | from pydantic import PositiveInt 4 | 5 | from fuzztypes import ( 6 | FuzzValidator, 7 | Match, 8 | MatchResult, 9 | NamedEntity, 10 | Record, 11 | const, 12 | flags, 13 | lazy, 14 | storage, 15 | ) 16 | 17 | accelerators = {"cuda", "mps"} 18 | 19 | 20 | class StoredValidatorStorage(storage.AbstractStorage): 21 | def __init__( 22 | self, 23 | name: str, 24 | source: Iterable, 25 | **kwargs, 26 | ): 27 | super().__init__(source, **kwargs) 28 | 29 | self.name = name 30 | self._conn = None 31 | self._table = None 32 | 33 | @property 34 | def conn(self) -> Any: 35 | if self._conn is None: 36 | lancedb = lazy.lazy_import("lancedb") 37 | self._conn = lancedb.connect(const.StoredValidatorPath) 38 | return self._conn 39 | 40 | @property 41 | def table(self) -> Any: 42 | if self._table is None: 43 | self._table = self.conn.open_table(self.name) 44 | return self._table 45 | 46 | def prepare(self, force_drop_table: bool = False): 47 | table_names = set(self.conn.table_names(limit=999_999_999)) 48 | 49 | if force_drop_table and self.name in table_names: 50 | self.conn.drop_table(self.name) 51 | table_names -= {self.name} 52 | 53 | if self.name not in table_names: 54 | try: 55 | self.create_table() 56 | except Exception as e: # pragma: no cover 57 | # if any issue occurs, drop the table and re-raise error 58 | # in the future, handle errors better 59 | self.conn.drop_table(self.name) 60 | raise e 61 | 62 | def create_table(self): 63 | pa = lazy.lazy_import("pyarrow") 64 | 65 | schema = pa.schema( 66 | [ 67 | pa.field("term", pa.string()), 68 | pa.field("norm_term", pa.string()), 69 | pa.field("entity", 
pa.string()), 70 | pa.field("is_alias", pa.string()), 71 | pa.field( 72 | "vector", 73 | pa.list_(pa.float32(), self.vect_dimensions), 74 | ), 75 | ] 76 | ) 77 | table = self.conn.create_table(self.name, schema=schema, exist_ok=True) 78 | 79 | # create records from source 80 | records = self.create_records() 81 | 82 | # calculate vectors in a batch 83 | if self.search_flag.is_semantic_ok: 84 | terms = [record.term for record in records] 85 | vectors = self.encode(terms) 86 | for record, vector in zip(records, vectors): 87 | record.vector = vector 88 | 89 | # add records in a batch to table 90 | table.add([record.model_dump() for record in records]) 91 | 92 | # adjust num_partitions and num_sub_vectors based on dataset size 93 | num_records = len(records) 94 | 95 | should_index = num_records > 256 and self.search_flag.is_semantic_ok 96 | 97 | if self.search_flag.is_fuzz_ok: # pragma: no cover 98 | table.create_fts_index("term") 99 | 100 | if should_index: # pragma: no cover 101 | num_partitions = min(num_records, 256) 102 | num_sub_vectors = min(num_records, 96) 103 | index_cache_size = min(num_records, 256) 104 | accelerator = self.device if self.device in accelerators else None 105 | 106 | table.create_index( 107 | metric="cosine", 108 | num_partitions=num_partitions, 109 | num_sub_vectors=num_sub_vectors, 110 | vector_column_name="vector", 111 | replace=True, 112 | index_cache_size=index_cache_size, 113 | accelerator=accelerator, 114 | ) 115 | 116 | def create_records(self): 117 | records = [] 118 | empty = [0.0] * self.vect_dimensions 119 | for item in self.source: 120 | entity = self.entity_type.convert(item) 121 | json = entity.model_dump_json(exclude_defaults=True) 122 | 123 | terms = [] 124 | is_alias = False 125 | 126 | if self.search_flag.is_name_ok: 127 | terms.append(entity.value) 128 | is_alias = True 129 | 130 | if self.search_flag.is_alias_ok: 131 | terms += entity.aliases 132 | 133 | for term in terms: 134 | # normalize for case sensitivity 135 | 
norm_term = self.normalize(term) 136 | 137 | # construct and add record 138 | if term: 139 | record = Record( 140 | entity=json, 141 | term=term, 142 | norm_term=norm_term, 143 | is_alias=is_alias, 144 | vector=empty, 145 | ) 146 | records.append(record) 147 | 148 | # 2nd term and beyond are aliases 149 | is_alias = True 150 | 151 | return records 152 | 153 | # 154 | # Getters 155 | # 156 | 157 | def get(self, key: str) -> MatchResult: 158 | where = f'term = "{key}"' 159 | match_list = self.run_query(key, where=where) 160 | 161 | if not match_list: 162 | where = f'norm_term = "{self.normalize(key)}"' 163 | match_list = self.run_query(key, where=where) 164 | 165 | if not match_list: 166 | if self.search_flag.is_fuzz_ok: 167 | match_list = self.get_by_fuzz(key) 168 | 169 | if self.search_flag.is_semantic_ok: 170 | match_list = self.get_by_semantic(key) 171 | 172 | matches = MatchResult(matches=match_list) 173 | return matches 174 | 175 | def get_by_fuzz(self, key: str) -> List[Match]: 176 | query = self.normalize(key) 177 | match_list = self.run_query(key, vector=query) 178 | 179 | # re-scoring using rapidfuzz on matches 180 | terms = [match.term for match in match_list] 181 | extract = self.rapidfuzz.process.extract( 182 | query, terms, scorer=self.fuzz_scorer 183 | ) 184 | for key, score, index in extract: 185 | match_list[index].score = score 186 | 187 | return match_list 188 | 189 | def get_by_semantic(self, key: str) -> List[Match]: 190 | vector = self.encode([key])[0] 191 | return self.run_query(key, vector=vector) 192 | 193 | def run_query(self, key, where=None, vector=None) -> List[Match]: 194 | qb = self.table.search(query=vector, vector_column_name="vector") 195 | 196 | if vector is not None and self.search_flag.is_semantic_ok: 197 | qb = qb.metric("cosine") 198 | 199 | qb = qb.select(["entity", "term", "norm_term", "is_alias"]) 200 | 201 | if where is not None: 202 | qb = qb.where(where, prefilter=True) 203 | 204 | qb = qb.limit(self.limit) 205 | data = 
qb.to_list() 206 | 207 | match_list = [] 208 | for item in data: 209 | if "_distance" in item: 210 | distance = item.pop("_distance", 0.0) 211 | similarity = 1 - distance 212 | score = (similarity + 1) * 50 213 | elif "score" in item: 214 | score = item.pop("score", 0.0) 215 | else: 216 | score = 100.0 # Exact match 217 | 218 | record = Record.model_validate(item) 219 | match = record.to_match( 220 | key=key, score=score, entity_type=self.entity_type 221 | ) 222 | match_list.append(match) 223 | 224 | return match_list 225 | 226 | 227 | def OnDiskValidator( 228 | identity: str, 229 | source: Iterable, 230 | *, 231 | case_sensitive: bool = False, 232 | device: Optional[const.DeviceList] = None, 233 | encoder: Union[Callable, str, object] = None, 234 | entity_type: Type[NamedEntity] = NamedEntity, 235 | examples: Optional[list] = None, 236 | fuzz_scorer: const.FuzzScorer = "token_sort_ratio", 237 | limit: PositiveInt = 10, 238 | min_similarity: float = 80.0, 239 | notfound_mode: const.NotFoundMode = "raise", 240 | search_flag: flags.SearchFlag = flags.DefaultSearch, 241 | tiebreaker_mode: const.TiebreakerMode = "raise", 242 | ): 243 | on_disk = StoredValidatorStorage( 244 | identity, 245 | source, 246 | case_sensitive=case_sensitive, 247 | device=device, 248 | entity_type=entity_type, 249 | fuzz_scorer=fuzz_scorer, 250 | limit=limit, 251 | min_similarity=min_similarity, 252 | notfound_mode=notfound_mode, 253 | search_flag=search_flag, 254 | encoder=encoder, 255 | tiebreaker_mode=tiebreaker_mode, 256 | ) 257 | 258 | return FuzzValidator(on_disk, examples=examples) 259 | -------------------------------------------------------------------------------- /tests/test_readme.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | from typing import Annotated 3 | 4 | from pydantic import BaseModel 5 | 6 | from fuzztypes import ( 7 | ASCII, 8 | Datetime, 9 | Email, 10 | Fuzzmoji, 11 | InMemoryValidator, 12 
| Integer, 13 | Person, 14 | RegexValidator, 15 | ZipCode, 16 | flags, 17 | ) 18 | 19 | 20 | # define a source, see EntitySource for using TSV, CSV, JSONL 21 | inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] 22 | 23 | # define a in memory validator with fuzz search enabled. 24 | Inventor = Annotated[ 25 | str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) 26 | ] 27 | 28 | # custom Regex type for finding twitter handles. 29 | Handle = Annotated[ 30 | str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) 31 | ] 32 | 33 | 34 | # define a Pydantic class with 9 fuzzy type attributes 35 | class Fuzzy(BaseModel): 36 | ascii: ASCII 37 | email: Email 38 | emoji: Fuzzmoji 39 | handle: Handle 40 | integer: Integer 41 | inventor: Inventor 42 | person: Person 43 | time: Datetime 44 | zipcode: ZipCode 45 | 46 | 47 | def test_full_model(): 48 | # create an instance of class Fuzzy 49 | obj = Fuzzy( 50 | ascii="άνθρωπος", 51 | email="John Doe ", 52 | emoji="thought bubble", 53 | handle="Ian Maurer (@imaurer)", 54 | integer="fifty-five", # type: ignore[arg-type] 55 | inventor="ada luvlace", 56 | person="mr. arthur h. 
fonzarelli (fonzie)", # type: ignore[arg-type] 57 | time="5am on Jan 1, 2025", # type: ignore[arg-type] 58 | zipcode="(Zipcode: 12345-6789)", 59 | ) 60 | 61 | # test the autocorrecting performed 62 | 63 | # greek for man: https://en.wiktionary.org/wiki/άνθρωπος 64 | assert obj.ascii == "anthropos" 65 | 66 | # extract email via regular expression 67 | assert obj.email == "jdoe@example.com" 68 | 69 | # fuzzy match "thought bubble" to "thought balloon" emoji 70 | assert obj.emoji == "💭" 71 | 72 | # simple, inline regex example (see above Handle type) 73 | assert obj.handle == "@imaurer" 74 | 75 | # convert integer word phrase to integer value 76 | assert obj.integer == 55 77 | 78 | # case-insensitive fuzzy match on lowercase, misspelled name 79 | assert obj.inventor == "Ada Lovelace" 80 | 81 | # human name parser (title, first, middle, last, suffix, nickname) 82 | assert str(obj.person) == "Mr. Arthur H. Fonzarelli (fonzie)" 83 | assert obj.person.short_name == "Arthur Fonzarelli" 84 | assert obj.person.nickname == "fonzie" 85 | assert obj.person.last == "Fonzarelli" 86 | 87 | # convert time phrase to datetime object 88 | assert obj.time.isoformat() == "2025-01-01T05:00:00" 89 | 90 | # extract zip5 or zip9 formats using regular expressions 91 | assert obj.zipcode == "12345-6789" 92 | 93 | # print JSON on success 94 | assert obj.model_dump() == { 95 | "ascii": "anthropos", 96 | "email": "jdoe@example.com", 97 | "emoji": "💭", 98 | "handle": "@imaurer", 99 | "integer": 55, 100 | "inventor": "Ada Lovelace", 101 | "person": { 102 | "first": "Arthur", 103 | "init_format": "{first} {middle} {last}", 104 | "last": "Fonzarelli", 105 | "middle": "H.", 106 | "name_format": "{title} {first} {middle} {last} {suffix} " 107 | "({nickname})", 108 | "nickname": "fonzie", 109 | "suffix": "", 110 | "title": "Mr.", 111 | }, 112 | "time": datetime(2025, 1, 1, 5), 113 | "zipcode": "12345-6789", 114 | } 115 | 116 | 117 | def test_json_schema(): 118 | data = Fuzzy.model_json_schema() 119 | 
expected_data = { 120 | "$defs": { 121 | "PersonModel": { 122 | "properties": { 123 | "first": { 124 | "default": "", 125 | "title": "First", 126 | "type": "string", 127 | }, 128 | "init_format": { 129 | "default": "{first} " "{middle} " "{last}", 130 | "title": "Init " "Format", 131 | "type": "string", 132 | }, 133 | "last": {"default": "", "title": "Last", "type": "string"}, 134 | "middle": { 135 | "default": "", 136 | "title": "Middle", 137 | "type": "string", 138 | }, 139 | "name_format": { 140 | "default": "{title} " 141 | "{first} " 142 | "{middle} " 143 | "{last} " 144 | "{suffix} " 145 | "({nickname})", 146 | "title": "Name " "Format", 147 | "type": "string", 148 | }, 149 | "nickname": { 150 | "default": "", 151 | "title": "Nickname", 152 | "type": "string", 153 | }, 154 | "suffix": { 155 | "default": "", 156 | "title": "Suffix", 157 | "type": "string", 158 | }, 159 | "title": { 160 | "default": "", 161 | "title": "Title", 162 | "type": "string", 163 | }, 164 | }, 165 | "title": "PersonModel", 166 | "type": "object", 167 | } 168 | }, 169 | "properties": { 170 | "ascii": {"title": "Ascii", "type": "string"}, 171 | "email": { 172 | "examples": ["user@example.com"], 173 | "title": "Email", 174 | "type": "string", 175 | }, 176 | "emoji": {"title": "Emoji", "type": "string"}, 177 | "handle": { 178 | "examples": ["@genomoncology"], 179 | "title": "Handle", 180 | "type": "string", 181 | }, 182 | "integer": {"title": "Integer", "type": "integer"}, 183 | "inventor": {"title": "Inventor", "type": "string"}, 184 | "person": {"$ref": "#/$defs/PersonModel"}, 185 | "time": {"format": "date-time", "title": "Time", "type": "string"}, 186 | "zipcode": { 187 | "examples": ["12345", "12345-6789"], 188 | "title": "Zipcode", 189 | "type": "string", 190 | }, 191 | }, 192 | "required": [ 193 | "ascii", 194 | "email", 195 | "emoji", 196 | "handle", 197 | "integer", 198 | "inventor", 199 | "person", 200 | "time", 201 | "zipcode", 202 | ], 203 | "title": "Fuzzy", 204 | "type": 
"object", 205 | } 206 | assert data == expected_data 207 | 208 | 209 | def test_in_memory_validator(): 210 | # Create a custom annotation type for matching fruits in memory 211 | fruits = ["Apple", "Banana", "Orange"] 212 | Fruit = Annotated[ 213 | str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) 214 | ] 215 | 216 | class MyModel(BaseModel): 217 | fruit: Fruit 218 | 219 | model = MyModel(fruit="appel") 220 | assert model.fruit == "Apple" 221 | 222 | 223 | def test_on_disk_validator(): 224 | from fuzztypes import OnDiskValidator 225 | 226 | # Create a custom annotation type for matching countries stored on disk 227 | countries = [ 228 | ("United States", "US"), 229 | ("United Kingdom", "UK"), 230 | ("Canada", "CA"), 231 | ] 232 | Country = Annotated[str, OnDiskValidator("Country", countries)] 233 | 234 | class MyModel(BaseModel): 235 | country: Country 236 | 237 | assert MyModel(country="Canada").country == "Canada" 238 | assert MyModel(country="US").country == "United States" 239 | 240 | 241 | def test_date_validators(): 242 | from fuzztypes import DateValidator, DatetimeValidator 243 | 244 | MyDate = Annotated[date, DateValidator(date_order="MDY")] 245 | MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] 246 | 247 | class MyModel(BaseModel): 248 | date: MyDate 249 | time: MyTime 250 | 251 | model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") # type: ignore 252 | assert model.date.isoformat() == "2023-01-01" 253 | assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" 254 | 255 | 256 | def test_fuzz_validator(): 257 | from fuzztypes import FuzzValidator 258 | 259 | # Create a custom annotation type that converts a value to uppercase 260 | UpperCase = Annotated[str, FuzzValidator(str.upper)] 261 | 262 | class MyModel(BaseModel): 263 | name: UpperCase 264 | 265 | model = MyModel(name="john") 266 | assert model.name == "JOHN" 267 | 268 | 269 | def test_regex_validator(): 270 | from fuzztypes import RegexValidator 271 | 272 | # 
Create a custom annotation type for matching email addresses 273 | IPAddress = Annotated[ 274 | str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") 275 | ] 276 | 277 | class MyModel(BaseModel): 278 | ip_address: IPAddress 279 | 280 | model = MyModel(ip_address="My internet IP address is 192.168.127.12") 281 | assert model.ip_address == "192.168.127.12" 282 | 283 | 284 | def test_validate_functions(): 285 | from fuzztypes import validate_python, validate_json, resolve_entity, Date 286 | 287 | # validate python 288 | assert validate_python(Integer, "two hundred") == 200 289 | 290 | # validate json 291 | class MyModel(BaseModel): 292 | date: Date 293 | 294 | json = '{"date": "July 4th 2021"}' 295 | obj = validate_json(MyModel, json) 296 | assert obj.date.isoformat() == "2021-07-04" 297 | 298 | 299 | def test_resolve_entity(): 300 | from fuzztypes import resolve_entity, InMemoryValidator 301 | 302 | elements = ["earth", "fire", "water", "air"] 303 | ElementValidator = InMemoryValidator(elements) 304 | Element = Annotated[str, ElementValidator] 305 | 306 | # resolve using validator 307 | entity = resolve_entity(ElementValidator, "EARTH") 308 | assert entity is not None 309 | assert entity.model_dump() == { 310 | "aliases": [], 311 | "label": None, 312 | "meta": None, 313 | "priority": None, 314 | "value": "earth", 315 | } 316 | 317 | # resolve using annotation type 318 | entity = resolve_entity(Element, "Air") 319 | assert entity is not None 320 | assert entity.model_dump(exclude_defaults=True) == {"value": "air"} 321 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile pyproject.toml --extra test --extra local --extra ext -o requirements-dev.txt 3 | annotated-types==0.6.0 4 | # via pydantic 5 | anyascii==0.3.2 6 | anyio==4.3.0 7 | # via 8 | # httpx 
9 | # jupyter-server 10 | appnope==0.1.4 11 | # via ipykernel 12 | argon2-cffi==23.1.0 13 | # via jupyter-server 14 | argon2-cffi-bindings==21.2.0 15 | # via argon2-cffi 16 | arrow==1.3.0 17 | # via isoduration 18 | asttokens==2.4.1 19 | # via stack-data 20 | async-lru==2.0.4 21 | # via jupyterlab 22 | attrs==23.2.0 23 | # via 24 | # jsonschema 25 | # lancedb 26 | # number-parser 27 | # referencing 28 | babel==2.14.0 29 | # via jupyterlab-server 30 | beautifulsoup4==4.12.3 31 | # via nbconvert 32 | bleach==6.1.0 33 | # via nbconvert 34 | build==1.1.1 35 | cachetools==5.3.3 36 | # via lancedb 37 | certifi==2024.2.2 38 | # via 39 | # httpcore 40 | # httpx 41 | # requests 42 | cffi==1.16.0 43 | # via argon2-cffi-bindings 44 | charset-normalizer==3.3.2 45 | # via requests 46 | click==8.1.7 47 | # via lancedb 48 | comm==0.2.2 49 | # via 50 | # ipykernel 51 | # ipywidgets 52 | coverage==7.4.3 53 | dateparser==1.2.0 54 | debugpy==1.8.1 55 | # via ipykernel 56 | decorator==5.1.1 57 | # via 58 | # ipython 59 | # retry 60 | defusedxml==0.7.1 61 | # via nbconvert 62 | deprecation==2.1.0 63 | # via lancedb 64 | docutils==0.20.1 65 | # via readme-renderer 66 | emoji==2.10.1 67 | exceptiongroup==1.2.0 68 | # via 69 | # anyio 70 | # ipython 71 | # pytest 72 | executing==2.0.1 73 | # via stack-data 74 | fastjsonschema==2.19.1 75 | # via nbformat 76 | filelock==3.13.1 77 | # via 78 | # huggingface-hub 79 | # torch 80 | # transformers 81 | fqdn==1.5.1 82 | # via jsonschema 83 | fsspec==2024.2.0 84 | # via 85 | # huggingface-hub 86 | # torch 87 | h11==0.14.0 88 | # via httpcore 89 | httpcore==1.0.4 90 | # via httpx 91 | httpx==0.27.0 92 | # via jupyterlab 93 | huggingface-hub==0.21.3 94 | # via 95 | # sentence-transformers 96 | # tokenizers 97 | # transformers 98 | idna==3.6 99 | # via 100 | # anyio 101 | # httpx 102 | # jsonschema 103 | # requests 104 | importlib-metadata==7.0.2 105 | # via 106 | # build 107 | # jupyter-client 108 | # jupyter-lsp 109 | # jupyterlab 110 | # 
jupyterlab-server 111 | # keyring 112 | # nbconvert 113 | # twine 114 | iniconfig==2.0.0 115 | # via pytest 116 | ipykernel==6.29.3 117 | # via 118 | # jupyter 119 | # jupyter-console 120 | # jupyterlab 121 | # qtconsole 122 | ipython==8.18.1 123 | # via 124 | # ipykernel 125 | # ipywidgets 126 | # jupyter-console 127 | ipywidgets==8.1.2 128 | # via jupyter 129 | isoduration==20.11.0 130 | # via jsonschema 131 | jaraco-classes==3.3.1 132 | # via keyring 133 | jedi==0.19.1 134 | # via ipython 135 | jinja2==3.1.3 136 | # via 137 | # jupyter-server 138 | # jupyterlab 139 | # jupyterlab-server 140 | # nbconvert 141 | # torch 142 | joblib==1.3.2 143 | # via scikit-learn 144 | json5==0.9.24 145 | # via jupyterlab-server 146 | jsonpointer==2.4 147 | # via jsonschema 148 | jsonschema==4.21.1 149 | # via 150 | # jupyter-events 151 | # jupyterlab-server 152 | # nbformat 153 | jsonschema-specifications==2023.12.1 154 | # via jsonschema 155 | jupyter==1.0.0 156 | jupyter-client==8.6.1 157 | # via 158 | # ipykernel 159 | # jupyter-console 160 | # jupyter-server 161 | # nbclient 162 | # qtconsole 163 | jupyter-console==6.6.3 164 | # via jupyter 165 | jupyter-core==5.7.2 166 | # via 167 | # ipykernel 168 | # jupyter-client 169 | # jupyter-console 170 | # jupyter-server 171 | # jupyterlab 172 | # nbclient 173 | # nbconvert 174 | # nbformat 175 | # qtconsole 176 | jupyter-events==0.10.0 177 | # via jupyter-server 178 | jupyter-lsp==2.2.4 179 | # via jupyterlab 180 | jupyter-server==2.13.0 181 | # via 182 | # jupyter-lsp 183 | # jupyterlab 184 | # jupyterlab-server 185 | # notebook 186 | # notebook-shim 187 | jupyter-server-terminals==0.5.3 188 | # via jupyter-server 189 | jupyterlab==4.1.5 190 | # via notebook 191 | jupyterlab-pygments==0.3.0 192 | # via nbconvert 193 | jupyterlab-server==2.25.4 194 | # via 195 | # jupyterlab 196 | # notebook 197 | jupyterlab-widgets==3.0.10 198 | # via ipywidgets 199 | keyring==24.3.1 200 | # via twine 201 | lancedb==0.6.2 202 | 
markdown-it-py==3.0.0 203 | # via rich 204 | markupsafe==2.1.5 205 | # via 206 | # jinja2 207 | # nbconvert 208 | matplotlib-inline==0.1.6 209 | # via 210 | # ipykernel 211 | # ipython 212 | mdurl==0.1.2 213 | # via markdown-it-py 214 | mistune==3.0.2 215 | # via nbconvert 216 | more-itertools==10.2.0 217 | # via jaraco-classes 218 | mpmath==1.3.0 219 | # via sympy 220 | mypy==1.9.0 221 | mypy-extensions==1.0.0 222 | # via mypy 223 | nameparser==1.1.3 224 | nbclient==0.10.0 225 | # via nbconvert 226 | nbconvert==7.16.2 227 | # via 228 | # jupyter 229 | # jupyter-server 230 | nbformat==5.10.3 231 | # via 232 | # jupyter-server 233 | # nbclient 234 | # nbconvert 235 | nest-asyncio==1.6.0 236 | # via ipykernel 237 | networkx==3.2.1 238 | # via torch 239 | nh3==0.2.15 240 | # via readme-renderer 241 | notebook==7.1.2 242 | # via jupyter 243 | notebook-shim==0.2.4 244 | # via 245 | # jupyterlab 246 | # notebook 247 | number-parser==0.3.2 248 | numpy==1.26.4 249 | # via 250 | # pyarrow 251 | # pylance 252 | # scikit-learn 253 | # scipy 254 | # sentence-transformers 255 | # transformers 256 | overrides==7.7.0 257 | # via 258 | # jupyter-server 259 | # lancedb 260 | packaging==23.2 261 | # via 262 | # build 263 | # deprecation 264 | # huggingface-hub 265 | # ipykernel 266 | # jupyter-server 267 | # jupyterlab 268 | # jupyterlab-server 269 | # nbconvert 270 | # pytest 271 | # qtconsole 272 | # qtpy 273 | # transformers 274 | pandocfilters==1.5.1 275 | # via nbconvert 276 | parso==0.8.3 277 | # via jedi 278 | pexpect==4.9.0 279 | # via ipython 280 | pillow==10.2.0 281 | # via sentence-transformers 282 | pip==24.0 283 | pkginfo==1.10.0 284 | # via twine 285 | platformdirs==4.2.0 286 | # via jupyter-core 287 | pluggy==1.4.0 288 | # via pytest 289 | prometheus-client==0.20.0 290 | # via jupyter-server 291 | prompt-toolkit==3.0.43 292 | # via 293 | # ipython 294 | # jupyter-console 295 | psutil==5.9.8 296 | # via ipykernel 297 | ptyprocess==0.7.0 298 | # via 299 | # pexpect 300 
| # terminado 301 | pure-eval==0.2.2 302 | # via stack-data 303 | py==1.11.0 304 | # via retry 305 | pyarrow==15.0.0 306 | # via pylance 307 | pycparser==2.21 308 | # via cffi 309 | pydantic==2.6.2 310 | # via lancedb 311 | pydantic-core==2.16.3 312 | # via pydantic 313 | pygments==2.17.2 314 | # via 315 | # ipython 316 | # jupyter-console 317 | # nbconvert 318 | # qtconsole 319 | # readme-renderer 320 | # rich 321 | pylance==0.10.2 322 | # via lancedb 323 | pyproject-hooks==1.0.0 324 | # via build 325 | pytest==8.0.1 326 | # via pytest-mock 327 | pytest-mock==3.12.0 328 | python-dateutil==2.9.0.post0 329 | # via 330 | # arrow 331 | # dateparser 332 | # jupyter-client 333 | python-json-logger==2.0.7 334 | # via jupyter-events 335 | pytz==2024.1 336 | # via dateparser 337 | pyyaml==6.0.1 338 | # via 339 | # huggingface-hub 340 | # jupyter-events 341 | # lancedb 342 | # transformers 343 | pyzmq==25.1.2 344 | # via 345 | # ipykernel 346 | # jupyter-client 347 | # jupyter-console 348 | # jupyter-server 349 | # qtconsole 350 | qtconsole==5.5.1 351 | # via jupyter 352 | qtpy==2.4.1 353 | # via qtconsole 354 | rapidfuzz==3.6.1 355 | ratelimiter==1.2.0.post0 356 | # via lancedb 357 | readme-renderer==43.0 358 | # via twine 359 | referencing==0.34.0 360 | # via 361 | # jsonschema 362 | # jsonschema-specifications 363 | # jupyter-events 364 | regex==2023.12.25 365 | # via 366 | # dateparser 367 | # transformers 368 | requests==2.31.0 369 | # via 370 | # huggingface-hub 371 | # jupyterlab-server 372 | # lancedb 373 | # requests-toolbelt 374 | # transformers 375 | # twine 376 | requests-toolbelt==1.0.0 377 | # via twine 378 | retry==0.9.2 379 | # via lancedb 380 | rfc3339-validator==0.1.4 381 | # via 382 | # jsonschema 383 | # jupyter-events 384 | rfc3986==2.0.0 385 | # via twine 386 | rfc3986-validator==0.1.1 387 | # via 388 | # jsonschema 389 | # jupyter-events 390 | rich==13.7.1 391 | # via twine 392 | rpds-py==0.18.0 393 | # via 394 | # jsonschema 395 | # referencing 396 | 
safetensors==0.4.2 397 | # via transformers 398 | scikit-learn==1.4.1.post1 399 | # via sentence-transformers 400 | scipy==1.12.0 401 | # via 402 | # scikit-learn 403 | # sentence-transformers 404 | semver==3.0.2 405 | # via lancedb 406 | send2trash==1.8.2 407 | # via jupyter-server 408 | sentence-transformers==2.5.1 409 | setuptools==69.1.1 410 | six==1.16.0 411 | # via 412 | # asttokens 413 | # bleach 414 | # python-dateutil 415 | # rfc3339-validator 416 | sniffio==1.3.1 417 | # via 418 | # anyio 419 | # httpx 420 | soupsieve==2.5 421 | # via beautifulsoup4 422 | stack-data==0.6.3 423 | # via ipython 424 | sympy==1.12 425 | # via torch 426 | tantivy==0.21.0 427 | terminado==0.18.1 428 | # via 429 | # jupyter-server 430 | # jupyter-server-terminals 431 | threadpoolctl==3.3.0 432 | # via scikit-learn 433 | tinycss2==1.2.1 434 | # via nbconvert 435 | tokenizers==0.15.2 436 | # via transformers 437 | tomli==2.0.1 438 | # via 439 | # build 440 | # coverage 441 | # jupyterlab 442 | # mypy 443 | # pyproject-hooks 444 | # pytest 445 | torch==2.2.1 446 | # via sentence-transformers 447 | tornado==6.4 448 | # via 449 | # ipykernel 450 | # jupyter-client 451 | # jupyter-server 452 | # jupyterlab 453 | # notebook 454 | # terminado 455 | tqdm==4.66.2 456 | # via 457 | # huggingface-hub 458 | # lancedb 459 | # sentence-transformers 460 | # transformers 461 | traitlets==5.14.1 462 | # via 463 | # comm 464 | # ipykernel 465 | # ipython 466 | # ipywidgets 467 | # jupyter-client 468 | # jupyter-console 469 | # jupyter-core 470 | # jupyter-events 471 | # jupyter-server 472 | # jupyterlab 473 | # matplotlib-inline 474 | # nbclient 475 | # nbconvert 476 | # nbformat 477 | # qtconsole 478 | transformers==4.38.2 479 | # via sentence-transformers 480 | twine==5.0.0 481 | types-python-dateutil==2.9.0.20240316 482 | # via arrow 483 | typing-extensions==4.9.0 484 | # via 485 | # anyio 486 | # async-lru 487 | # huggingface-hub 488 | # ipython 489 | # mypy 490 | # pydantic 491 | # 
pydantic-core 492 | # torch 493 | tzlocal==5.2 494 | # via dateparser 495 | unidecode==1.3.8 496 | uri-template==1.3.0 497 | # via jsonschema 498 | urllib3==2.2.1 499 | # via 500 | # requests 501 | # twine 502 | wcwidth==0.2.13 503 | # via prompt-toolkit 504 | webcolors==1.13 505 | # via jsonschema 506 | webencodings==0.5.1 507 | # via 508 | # bleach 509 | # tinycss2 510 | websocket-client==1.7.0 511 | # via jupyter-server 512 | widgetsnbextension==4.0.10 513 | # via ipywidgets 514 | zipp==3.17.0 515 | # via importlib-metadata 516 | -------------------------------------------------------------------------------- /tests/data/simonw_tags.csv: -------------------------------------------------------------------------------- 1 | value,priority 2 | 24ways,13 3 | 2d,3 4 | 37signals,12 5 | 3d,14 6 | 4chan,4 7 | 500startups,4 8 | aaronstraupcope,4 9 | aaronswartz,3 10 | abtesting,6 11 | accessibility,30 12 | accounts,4 13 | acid3,5 14 | acme,3 15 | actionscript,3 16 | activemq,3 17 | activitypub,7 18 | adamgomaa,3 19 | adamjohnson,4 20 | adobe,29 21 | adrianholovaty,15 22 | ads,5 23 | advertising,10 24 | agile,3 25 | ai,468 26 | aiassistedprogramming,11 27 | airships,10 28 | airtable,7 29 | ajax,59 30 | ajaxian,3 31 | alexgarcia,22 32 | alexgaynor,5 33 | alexpayne,5 34 | alexrussell,27 35 | alfeaton,4 36 | algorithms,12 37 | alistapart,8 38 | alpha,4 39 | alt,3 40 | amazon,63 41 | amazonaws,3 42 | amazonwebservices,7 43 | ami,4 44 | amqp,4 45 | analytics,8 46 | andrejkarpathy,10 47 | andrewgodwin,18 48 | andrewturner,5 49 | android,8 50 | andybaio,4 51 | andybudd,4 52 | anildash,9 53 | animation,7 54 | annevankesteren,5 55 | annotatedreleasenotes,22 56 | annotatedtalks,17 57 | anthropic,17 58 | antonzhiyanov,4 59 | aol,13 60 | apache,35 61 | api,28 62 | apidesign,9 63 | apis,81 64 | apollo,5 65 | appengine,33 66 | apple,90 67 | applephotos,3 68 | applescript,4 69 | appstore,10 70 | aprilfools,4 71 | aralbalkan,4 72 | architecture,9 73 | archive,5 74 | archives,3 75 | 
archiving,6 76 | arminronacher,8 77 | arstechnica,5 78 | art,8 79 | asciiart,3 80 | asf,3 81 | asgi,19 82 | askmetafilter,57 83 | aspdotnet,3 84 | aspnet,4 85 | assafarkin,5 86 | astronomy,3 87 | async,39 88 | athena,3 89 | atmedia,6 90 | atmedia07,3 91 | atmedia2007,4 92 | atom,21 93 | audio,10 94 | augmentedreality,3 95 | australia,3 96 | authentication,13 97 | autocomplete,6 98 | autoescaping,4 99 | avibryant,4 100 | aws,42 101 | azure,4 102 | backbone,3 103 | backups,6 104 | badges,3 105 | bakeddata,9 106 | bandwidth,3 107 | barackobama,3 108 | barcamp,6 109 | barcamplondon,3 110 | bard,13 111 | basecamp,6 112 | bash,8 113 | bayeux,7 114 | bazaar,6 115 | bbauth,3 116 | bbc,29 117 | bbcnews,4 118 | beautifulsoup,6 119 | bellingcat,3 120 | benchmarking,4 121 | benchmarks,5 122 | benfirshman,4 123 | bengoldacre,3 124 | benjohnson,9 125 | benlaurie,5 126 | benward,3 127 | benwelsh,8 128 | berkeleydb,3 129 | beta,3 130 | bigdata,14 131 | bigtable,3 132 | billdehora,5 133 | billgates,6 134 | binary,5 135 | bing,24 136 | bitcoin,10 137 | black,5 138 | blainecook,7 139 | blockchain,7 140 | blocks,4 141 | blogger,4 142 | blogging,39 143 | blogs,3 144 | bloom,5 145 | bloomfilters,4 146 | boardgames,3 147 | bobippolito,5 148 | boingboing,4 149 | bookmarklet,6 150 | bookmarklets,10 151 | books,17 152 | bradfitzpatrick,11 153 | bradneuberg,6 154 | branching,3 155 | branding,3 156 | brandonaaron,3 157 | brandurleach,12 158 | brendaneich,4 159 | brettaylor,5 160 | brighton,12 161 | brothercake,3 162 | browsers,73 163 | bruceschneier,27 164 | bsd,3 165 | buckettesting,3 166 | bugs,8 167 | bunniehuang,5 168 | business,22 169 | buzz,3 170 | c,28 171 | cabelsasser,3 172 | cache,3 173 | caching,45 174 | cairo,5 175 | calendars,5 176 | calhenderson,7 177 | california,3 178 | callbacks,5 179 | camino,6 180 | canon,3 181 | canvas,37 182 | cappuccino,3 183 | captcha,6 184 | cardspace,7 185 | careers,32 186 | cartography,3 187 | cassandra,7 188 | cdn,7 189 | cern,6 190 | certificates,9 
191 | cfp,4 192 | chaining,3 193 | charlesbabbage,3 194 | charlesleifer,4 195 | charlesmiller,7 196 | charliestross,4 197 | chatgpt,90 198 | cheese,8 199 | cherrypy,4 200 | chicagocrime,3 201 | china,7 202 | chrisamico,3 203 | chrismessina,7 204 | chrisshiflett,6 205 | christianheilmann,5 206 | christmas,3 207 | christopherlenz,8 208 | chrome,19 209 | chromeframe,5 210 | chromium,3 211 | classes,4 212 | claude,17 213 | clayshirky,5 214 | cli,10 215 | clickhouse,4 216 | clickjacking,11 217 | closure,3 218 | closures,8 219 | cloud,8 220 | cloudcomputing,17 221 | cloudflare,10 222 | cloudfront,4 223 | cloudrun,8 224 | cms,10 225 | co2,3 226 | code,3 227 | codecs,3 228 | codereview,3 229 | coffeescript,3 230 | collaboration,8 231 | colour,4 232 | comcast,3 233 | comet,57 234 | cometd,3 235 | commandline,8 236 | commentspam,4 237 | communication,8 238 | community,21 239 | compilers,13 240 | complexity,3 241 | compression,3 242 | computerhistory,4 243 | computers,3 244 | computerscience,11 245 | computervision,13 246 | concurrency,12 247 | conditionalcomments,3 248 | conference,12 249 | conferences,181 250 | conspiracy,3 251 | contentapi,4 252 | contenttypes,3 253 | continuousdeployment,10 254 | continuousintegration,16 255 | cookiecutter,7 256 | cookies,25 257 | cooking,5 258 | copilot,4 259 | copy,3 260 | copyright,11 261 | copywriting,3 262 | corydoctorow,4 263 | cosmopolitan,6 264 | couchdb,27 265 | counters,3 266 | covid19,16 267 | cplusplus,6 268 | crawling,3 269 | crdt,6 270 | creativecommons,8 271 | crime,3 272 | cron,3 273 | crossdomain,10 274 | crossdomainxml,6 275 | crowdsourcing,15 276 | cryptography,20 277 | csharp,4 278 | csrf,47 279 | css,143 280 | css3,11 281 | cssaintrocketscience,9 282 | csv,31 283 | ctypes,6 284 | curl,8 285 | curse,3 286 | cursegaming,3 287 | cvs,3 288 | d3,11 289 | dabbledb,5 290 | dalle,12 291 | damienkatz,9 292 | danahboyd,3 293 | dancatt,6 294 | dannyobrien,4 295 | danwebb,6 296 | dareobasanjo,9 297 | data,21 298 | databases,93 
299 | datablog,3 300 | datagov,3 301 | datagovuk,3 302 | datajournalism,37 303 | dataportability,6 304 | datascience,14 305 | datasette,384 306 | datasettecloud,34 307 | datasettedesktop,6 308 | datasettelite,15 309 | datastore,7 310 | datastructures,4 311 | datauri,6 312 | dates,6 313 | datetime,6 314 | daveshea,3 315 | davethomas,4 316 | davewiner,24 317 | davidbeazley,4 318 | davidcramer,8 319 | davidmbeazley,3 320 | davidrecordon,6 321 | dconstruct,5 322 | deanedwards,4 323 | debian,6 324 | debugger,5 325 | debugging,34 326 | decentralisation,4 327 | decorators,6 328 | delicious,6 329 | denialofservice,3 330 | deno,15 331 | deployment,25 332 | derekwillis,5 333 | design,61 334 | development,4 335 | devfort,4 336 | dewittclinton,3 337 | dickcostolo,4 338 | diff,5 339 | digg,16 340 | digitalocean,4 341 | dionalmaer,3 342 | directedidentity,5 343 | discord,5 344 | django,538 345 | djangobook,5 346 | djangocon,18 347 | djangocon08,3 348 | djangodebugtoolbar,3 349 | djangoorm,3 350 | djangopeople,12 351 | djangopony,4 352 | djangosnippets,6 353 | djangosqldashboard,11 354 | djugl,4 355 | dns,24 356 | docker,44 357 | documentary,3 358 | documentation,46 359 | documentcloud,4 360 | dogpile,7 361 | dogsheep,31 362 | dojo,40 363 | dojox,4 364 | dom,12 365 | domains,9 366 | domcontentloaded,4 367 | domscripting,3 368 | dontbeevil,3 369 | dopplr,12 370 | dotnet,4 371 | douglascrockford,14 372 | draganddrop,3 373 | dragndrop,5 374 | drawing,4 375 | dreamhost,5 376 | drewbreunig,3 377 | drewmclellan,5 378 | drichardhipp,7 379 | drizzle,3 380 | drm,15 381 | dropbox,9 382 | drupal,8 383 | duckdb,8 384 | duncanrobertson,4 385 | dustindiaz,5 386 | dynamiclanguages,3 387 | ebs,3 388 | ec2,42 389 | ecmascript,5 390 | ecommerce,8 391 | edddumbill,4 392 | edeliot,3 393 | edfelten,6 394 | editor,3 395 | education,14 396 | effbot,4 397 | egypt,3 398 | ekranoplans,4 399 | elasticsearch,11 400 | elections,9 401 | electron,8 402 | electronicvoting,3 403 | elementtree,4 404 | 
elliotterustyharold,6 405 | email,31 406 | embedding,3 407 | embeddings,23 408 | emoji,5 409 | encoding,5 410 | encryption,6 411 | enterprise,8 412 | entrepreneurship,51 413 | ericflorenzano,3 414 | ericholscher,8 415 | ericmeyer,6 416 | erlang,26 417 | errors,3 418 | escaping,4 419 | etags,3 420 | etech,4 421 | ethanmollick,11 422 | etherpad,3 423 | ethics,70 424 | etiquette,3 425 | eurooscon,3 426 | europe,3 427 | europython,4 428 | eventio,5 429 | eventlet,5 430 | eventmachine,3 431 | events,105 432 | everyblock,8 433 | evoting,3 434 | explorables,20 435 | extensions,4 436 | fabric,8 437 | facebook,107 438 | facebookgraphsearch,3 439 | facetedsearch,4 440 | fakestevejobs,3 441 | faq,4 442 | fastai,3 443 | fastcgi,3 444 | favicon,7 445 | featureflags,5 446 | fediverse,7 447 | feedburner,4 448 | feeds,6 449 | ffs,3 450 | finetuning,13 451 | firebug,22 452 | firecracker,3 453 | fireeagle,15 454 | firefox,52 455 | firefox3,7 456 | flash,69 457 | flask,5 458 | flex,7 459 | flickr,74 460 | flickrplaces,3 461 | fluiddb,4 462 | fly,27 463 | follow,3 464 | fonts,10 465 | foocamp,3 466 | food,11 467 | forms,9 468 | fowa,12 469 | fowa2007,3 470 | fowa2008,5 471 | framebusting,6 472 | frameworks,29 473 | francoischollet,4 474 | fredriklundh,6 475 | freebase,12 476 | friendfeed,11 477 | friends,3 478 | frontend,16 479 | fulltext,7 480 | fulltextsearch,10 481 | functional,4 482 | functionalprogramming,4 483 | funding,41 484 | funny,69 485 | fuse,4 486 | futureofwebapps,12 487 | gadgets,3 488 | games,19 489 | gaming,4 490 | garethrushgrove,8 491 | gcap,7 492 | gearman,3 493 | gears,7 494 | gecko,4 495 | geeks,3 496 | gemini,5 497 | generativeai,404 498 | generators,7 499 | genetics,5 500 | geo,12 501 | geocoding,11 502 | geodata,3 503 | geodjango,8 504 | geoffreylitt,3 505 | geoip,3 506 | geojson,11 507 | geolocation,6 508 | geonames,7 509 | geoplanet,7 510 | geospatial,9 511 | getlatlon,3 512 | gif,4 513 | gifs,4 514 | gil,11 515 | gis,41 516 | git,40 517 | githistory,6 518 | 
github,120 519 | githubactions,41 520 | githubcodespaces,9 521 | gitscraping,27 522 | glitch,11 523 | glyph,3 524 | gmail,21 525 | go,25 526 | google,289 527 | googleappengine,14 528 | googlecharts,9 529 | googlechrome,9 530 | googlecode,5 531 | googledocs,7 532 | googledoctype,3 533 | googlegears,8 534 | googlemaps,52 535 | googlemapsapi,4 536 | googlereader,5 537 | googlevideo,5 538 | googlewave,3 539 | government,8 540 | gpl,4 541 | gps,10 542 | gpt3,65 543 | gpt4,30 544 | grahamdumpleton,4 545 | graphics,9 546 | graphing,6 547 | graphql,18 548 | graphs,7 549 | greasemonkey,19 550 | gregwilson,5 551 | guardian,48 552 | guidovanrossum,10 553 | gwt,7 554 | gzip,8 555 | h264,4 556 | hack,3 557 | hackathons,4 558 | hackday,10 559 | hackdaylondon,3 560 | hackernews,15 561 | hacking,12 562 | hacks,9 563 | hadoop,9 564 | hakibenita,3 565 | halfmoonbay,3 566 | haproxy,5 567 | hardware,9 568 | hashbanghell,4 569 | hashes,3 570 | hashing,12 571 | haystack,4 572 | hcard,6 573 | heatmaps,3 574 | henrisivonen,5 575 | heroku,14 576 | highavailability,5 577 | highlights,3 578 | highrise,6 579 | history,31 580 | hixie,11 581 | homebrew,7 582 | homebrewllms,44 583 | hosting,24 584 | hotmail,6 585 | hotstandby,3 586 | html,65 587 | html5,83 588 | http,89 589 | http2,5 590 | httponly,3 591 | https,11 592 | httpx,4 593 | huggingface,5 594 | hynekschlawack,5 595 | i18n,13 596 | ia,6 597 | ianbicking,14 598 | ianhickson,20 599 | ianmansfield,3 600 | ibm,4 601 | ical,6 602 | ideas,8 603 | identity,14 604 | identitytheft,4 605 | idproxy,7 606 | ie,60 607 | ie6,14 608 | ie7,8 609 | ie8,25 610 | ietf,4 611 | iframes,17 612 | imagemagick,3 613 | images,13 614 | inaturalist,4 615 | infographics,7 616 | inheritance,3 617 | innodb,3 618 | inspiring,3 619 | internationalisation,9 620 | internet,25 621 | internetarchive,8 622 | internetexplorer,25 623 | interview,6 624 | interviews,7 625 | introspection,3 626 | investing,4 627 | io,7 628 | ios,18 629 | ip,4 630 | ipad,13 631 | iphone,62 632 | 
iphones,3 633 | iplayer,5 634 | ipod,6 635 | irc,5 636 | ironpython,10 637 | iso,4 638 | it,3 639 | itunes,3 640 | ixr,8 641 | jackclark,4 642 | jacobkaplanmoss,42 643 | jakearchibald,3 644 | jakobnielsen,5 645 | jamesbennett,19 646 | jamesbridle,3 647 | jamestauber,3 648 | janrain,7 649 | japan,3 650 | jargon,4 651 | jasoncalacanis,5 652 | jasonkottke,8 653 | jasonscott,5 654 | java,80 655 | javafx,3 656 | javascript,631 657 | javascriptlibraries,3 658 | jeffatwood,10 659 | jeffcroft,4 660 | jefflindsay,3 661 | jeffreyzeldman,8 662 | jeremiahgrossman,3 663 | jeremyashkenas,4 664 | jeremyhoward,8 665 | jeremykeith,12 666 | jeremyzawodny,4 667 | jetty,4 668 | jinja,4 669 | jit,7 670 | jobs,14 671 | joegregorio,9 672 | joelspolsky,8 673 | joelveitch,3 674 | joewalker,3 675 | johngrahamcumming,4 676 | johngruber,22 677 | johnresig,32 678 | johnsiracusa,3 679 | jonhicks,9 680 | jonudell,10 681 | joshberkus,3 682 | joshcomeau,3 683 | journalism,31 684 | jpstacey,3 685 | jq,6 686 | jquery,99 687 | jqueryui,3 688 | jruby,3 689 | jsk,8 690 | json,130 691 | jsonhead,3 692 | jsonp,23 693 | jsonschema,4 694 | juliaevans,17 695 | jupyter,38 696 | jvm,4 697 | jwt,3 698 | jwz,5 699 | jython,11 700 | kafka,7 701 | kansas,4 702 | kapingyee,5 703 | kellanelliottmccrea,16 704 | kevinyank,6 705 | keynote,3 706 | keyvaluepairs,9 707 | keyvaluestores,4 708 | kml,8 709 | korea,3 710 | kriszyp,3 711 | kubernetes,5 712 | l10n,5 713 | laion,4 714 | lambda,9 715 | language,5 716 | lanyrd,16 717 | largehadroncollider,3 718 | lastfm,9 719 | latex,3 720 | laurievoss,4 721 | law,3 722 | lawrence,6 723 | leahculver,3 724 | leanstartups,6 725 | legal,3 726 | lego,4 727 | leonardlin,5 728 | leopard,14 729 | lesorchard,8 730 | libevent,4 731 | libraries,21 732 | licenses,6 733 | lifehacks,5 734 | lightningtalks,4 735 | lighttpd,5 736 | lilypond,3 737 | lindenlab,4 738 | linguistics,4 739 | linkedin,6 740 | links,3 741 | linustorvalds,4 742 | linux,39 743 | lisp,8 744 | litestream,9 745 | 
livejournal,9 746 | ljworld,7 747 | llama,44 748 | llm,37 749 | llms,379 750 | llvm,5 751 | loadbalancing,11 752 | loading,3 753 | loadtesting,3 754 | local,5 755 | localisation,3 756 | location,14 757 | lockin,3 758 | logging,18 759 | login,3 760 | logincsrf,3 761 | logs,4 762 | london,54 763 | london2,3 764 | longpolling,3 765 | lua,9 766 | lucene,12 767 | lugradio,4 768 | lugradiolive,3 769 | lukaszlanga,3 770 | lukeplant,8 771 | lxml,7 772 | mac,11 773 | macbook,3 774 | macbookpro,3 775 | macfuse,3 776 | machinelearning,62 777 | machinetags,5 778 | maciejceglowski,5 779 | macosx,14 780 | magic,3 781 | magnolia,3 782 | mailinator,3 783 | make,6 784 | malcolmtredinnick,8 785 | management,47 786 | manyeyes,3 787 | mapping,51 788 | mapreduce,10 789 | maps,41 790 | markdown,14 791 | marketing,15 792 | marknottingham,14 793 | markpilgrim,37 794 | markramm,3 795 | markshuttleworth,4 796 | markup,8 797 | marsphoenix,3 798 | martinatkins,7 799 | martinbelam,3 800 | mashup,5 801 | mashups,7 802 | masterslave,6 803 | mastodon,22 804 | mathematics,3 805 | mathml,4 806 | mattbiddulph,12 807 | mattcroydon,3 808 | matthewsomerville,4 809 | mattlevine,3 810 | mattmullenweg,5 811 | mattwebb,11 812 | mattwestcott,4 813 | maxwoolf,12 814 | md5,4 815 | me,6 816 | media,5 817 | mediawiki,4 818 | meetings,6 819 | meetup,3 820 | meetups,8 821 | megpickard,4 822 | memcache,7 823 | memcached,32 824 | memcachedb,3 825 | memes,3 826 | memory,6 827 | memoryleaks,4 828 | mercurial,4 829 | messagequeue,3 830 | messagequeues,16 831 | messaging,8 832 | metaclasses,3 833 | metadata,10 834 | metafilter,10 835 | michaeltrier,4 836 | michalmigurski,13 837 | michalzalewski,3 838 | microformats,30 839 | microservices,7 840 | microsoft,97 841 | middleware,13 842 | midjourney,5 843 | migration,3 844 | migrations,15 845 | migueldeicaza,8 846 | mikebostock,6 847 | mikebutcher,3 848 | mikelmaron,4 849 | mikemalone,4 850 | mikeshaver,3 851 | military,3 852 | minification,4 853 | mistral,7 854 | mit,3 855 
| mlc,9 856 | mobile,38 857 | mobileweb,4 858 | models,3 859 | moderation,9 860 | modpython,5 861 | modwsgi,13 862 | mollywhite,3 863 | money,3 864 | mongodb,10 865 | mongrel,3 866 | monitoring,8 867 | monkeypatching,6 868 | mono,8 869 | moonlight,3 870 | mootools,6 871 | motivation,4 872 | movies,9 873 | mozilla,45 874 | mp3,5 875 | multidb,5 876 | multiprocessing,3 877 | museums,19 878 | music,17 879 | mvc,4 880 | mymaps,3 881 | myopenid,4 882 | mypy,9 883 | mysociety,13 884 | myspace,19 885 | mysql,63 886 | namespaces,4 887 | nasa,4 888 | nataliedowne,35 889 | nathanborror,3 890 | nedbatchelder,16 891 | neilfraser,4 892 | nelsonminar,3 893 | netflix,4 894 | netscape,4 895 | networking,22 896 | newforms,12 897 | newformsadmin,3 898 | news,8 899 | newspapers,17 900 | newyork,3 901 | nginx,39 902 | niallkennedy,7 903 | nicar,7 904 | nlp,9 905 | node,29 906 | nodejs,44 907 | nofollow,3 908 | nomic,3 909 | noscript,4 910 | nose,5 911 | nosql,28 912 | npm,15 913 | nsa,3 914 | numpy,4 915 | nyc,5 916 | nytimes,15 917 | oauth,48 918 | objectivec,7 919 | observability,7 920 | observable,40 921 | ocr,12 922 | offline,10 923 | ogg,3 924 | olpc,6 925 | onload,5 926 | oop,5 927 | ooxml,4 928 | opacity,4 929 | open-source,22 930 | openai,122 931 | opencv,3 932 | opendata,13 933 | openid,213 934 | openid2,7 935 | openlibrary,3 936 | openplatform,11 937 | openrightsgroup,5 938 | opensearch,3 939 | opensocial,8 940 | opensource,185 941 | openstreetmap,44 942 | opentech,3 943 | opentech2008,3 944 | openweb,6 945 | opera,30 946 | operations,5 947 | ops,16 948 | optfunc,3 949 | optimisation,6 950 | oracle,5 951 | orange,3 952 | orbited,3 953 | ordnancesurvey,3 954 | oreilly,4 955 | org,3 956 | orm,40 957 | ormcaching,4 958 | oscon,8 959 | oscon07,4 960 | osx,80 961 | owasp,4 962 | owlsnearyou,3 963 | oxford,14 964 | oxfordgeeknight2,7 965 | oxfordgeeknights,20 966 | oxfordgeeks,7 967 | packaging,20 968 | pagerank,6 969 | pagni,3 970 | pandas,16 971 | panels,5 972 | parallels,6 973 
| paris,3 974 | parquet,5 975 | parrot,3 976 | parsing,11 977 | passwordantipattern,7 978 | passwords,27 979 | paste,3 980 | patents,5 981 | patrickmckenzie,5 982 | patterns,5 983 | paulford,12 984 | paulgraham,9 985 | paulhammond,3 986 | paypal,4 987 | pdb,6 988 | pdf,17 989 | performance,79 990 | perl,22 991 | perlbal,3 992 | permissions,5 993 | perplexity,3 994 | personal,5 995 | personalnews,8 996 | petermichaux,4 997 | petervandijck,4 998 | philgyford,8 999 | phishing,52 1000 | photography,18 1001 | photos,16 1002 | photosynth,3 1003 | php,79 1004 | php5,4 1005 | physics,4 1006 | pil,4 1007 | pingback,22 1008 | pip,9 1009 | pipes,7 1010 | piracy,3 1011 | pitching,4 1012 | pixelart,4 1013 | placemaker,3 1014 | play,3 1015 | playwright,6 1016 | plugin,6 1017 | plugins,69 1018 | plurk,3 1019 | png,7 1020 | pngs,4 1021 | podcasts,31 1022 | politicalhacking,6 1023 | politics,24 1024 | pony,3 1025 | popfly,3 1026 | portablesocialnetworks,12 1027 | portland,3 1028 | post,7 1029 | postelslaw,3 1030 | postgis,6 1031 | postgresql,114 1032 | power,3 1033 | powerpoint,3 1034 | pownce,9 1035 | ppk,6 1036 | pr,6 1037 | presentations,7 1038 | presenting,4 1039 | pricing,3 1040 | privacy,34 1041 | process,6 1042 | processes,3 1043 | productivity,20 1044 | productmanagement,8 1045 | profiler,3 1046 | profiling,12 1047 | programmers,13 1048 | programming,146 1049 | programminglanguages,32 1050 | progressiveenhancement,7 1051 | projectmanagement,6 1052 | projects,350 1053 | promptengineering,60 1054 | promptinjection,48 1055 | protocolbuffers,5 1056 | prototype,16 1057 | proxies,3 1058 | proxy,15 1059 | psf,5 1060 | psychology,5 1061 | pubs,3 1062 | pubsub,3 1063 | pubsubhubbub,6 1064 | puppeteer,5 1065 | pycon,15 1066 | pyconuk,6 1067 | pylons,6 1068 | pyobjc,3 1069 | pyodide,9 1070 | pypi,22 1071 | pypy,9 1072 | pysqlite,3 1073 | pytest,13 1074 | python,908 1075 | python3,21 1076 | pythoncard,4 1077 | pytorch,5 1078 | qemu,3 1079 | queryset,4 1080 | querysetrefactor,4 1081 | 
queue,5 1082 | queues,13 1083 | quora,1004 1084 | rabbitmq,7 1085 | radio,7 1086 | rafecolburn,6 1087 | rails,70 1088 | rands,3 1089 | ratelimiting,7 1090 | rdbms,4 1091 | rdf,4 1092 | react,26 1093 | reading,3 1094 | realtime,9 1095 | realtimeweb,5 1096 | recommendations,3 1097 | recovered,213 1098 | recruiting,8 1099 | redbean,4 1100 | reddit,21 1101 | redhat,3 1102 | redis,50 1103 | redpajama,4 1104 | refactoring,6 1105 | regex,7 1106 | registration,5 1107 | regularexpressions,12 1108 | releasenotes,11 1109 | releases,15 1110 | remote,3 1111 | remysharp,3 1112 | replicate,5 1113 | replication,30 1114 | research,4 1115 | resolved,7 1116 | rest,35 1117 | restaurants,8 1118 | restful,4 1119 | restructuredtext,4 1120 | revcanonical,9 1121 | rewrites,3 1122 | rfc,7 1123 | richardcrowley,5 1124 | richardjones,3 1125 | richskrenta,5 1126 | richtext,3 1127 | rileygoodside,5 1128 | ripgrep,5 1129 | robertocallahan,4 1130 | robinsloan,4 1131 | robots,5 1132 | robotstxt,4 1133 | royalmail,3 1134 | rss,30 1135 | ruby,68 1136 | rubyonrails,3 1137 | russellbeattie,3 1138 | rust,51 1139 | ryandahl,5 1140 | ryantomayko,12 1141 | rye,5 1142 | s3,48 1143 | s3credentials,8 1144 | saas,14 1145 | safari,40 1146 | salvatoresanfilippo,10 1147 | samedomain,3 1148 | samruby,12 1149 | sandboxing,8 1150 | sanfrancisco,27 1151 | sanfranciscobayarea,9 1152 | sanic,5 1153 | satellite,3 1154 | scala,6 1155 | scalability,3 1156 | scaling,128 1157 | science,18 1158 | sciencefiction,3 1159 | scipy,4 1160 | scottkveton,4 1161 | scottschiller,3 1162 | scraping,21 1163 | screencast,4 1164 | screencasts,4 1165 | screenreaders,4 1166 | screenscraping,7 1167 | screenwriting,6 1168 | scribd,4 1169 | search,73 1170 | searchengines,11 1171 | secondlife,10 1172 | security,435 1173 | securitytheatre,3 1174 | selectors,13 1175 | selenium,4 1176 | semantic,3 1177 | semanticweb,8 1178 | sentry,7 1179 | seo,29 1180 | serialization,3 1181 | serverless,5 1182 | servers,6 1183 | serviceworkers,3 1184 | servo,3 
1185 | sessions,6 1186 | settings,4 1187 | setuptools,5 1188 | sha1,4 1189 | shapefiles,5 1190 | sharding,11 1191 | sharecropping,8 1192 | shell,6 1193 | shotscraper,21 1194 | sidechannel,3 1195 | sidekiq,4 1196 | signedcookies,5 1197 | signing,6 1198 | siliconvalley,6 1199 | silverlight,15 1200 | simonwardley,3 1201 | simpledb,7 1202 | sinatra,3 1203 | sitepen,5 1204 | sitepoint,11 1205 | sitespecificbrowsers,7 1206 | sixapart,6 1207 | sizzle,4 1208 | skillswap,3 1209 | skype,3 1210 | slack,6 1211 | slidecast,3 1212 | slides,12 1213 | slideshare,10 1214 | smalldata,3 1215 | sms,4 1216 | snowleopard,3 1217 | soap,9 1218 | social,4 1219 | socialgraph,11 1220 | socialmedia,24 1221 | socialnetworks,15 1222 | socialsoftware,6 1223 | socialwhitelisting,4 1224 | software,10 1225 | softwarearchitecture,6 1226 | softwareengineering,48 1227 | solr,22 1228 | sourceforge,3 1229 | south,4 1230 | soviet,3 1231 | space,6 1232 | spam,21 1233 | spatialite,11 1234 | speaking,102 1235 | specification,3 1236 | sphinxdocs,8 1237 | sphinxsearch,7 1238 | spidermonkey,5 1239 | spiderverse,3 1240 | spongmonkeys,4 1241 | sports,4 1242 | spreadsheets,3 1243 | sql,81 1244 | sqlalchemy,5 1245 | sqlinjection,4 1246 | sqlite,231 1247 | sqliteutils,81 1248 | sqlserver,3 1249 | squid,6 1250 | squirrels,8 1251 | ssh,8 1252 | ssl,9 1253 | sso,3 1254 | stablediffusion,22 1255 | stackoverflow,11 1256 | standards,30 1257 | stanford,8 1258 | starling,3 1259 | startup,5 1260 | startups,184 1261 | starwars,4 1262 | staticanalysis,3 1263 | staticgenerator,3 1264 | staticmaps,3 1265 | statictyping,7 1266 | stdlib,3 1267 | steampunk,3 1268 | stephenfry,3 1269 | stevejobs,10 1270 | stevesouders,9 1271 | steveyegge,4 1272 | storage,5 1273 | streaming,3 1274 | streetview,3 1275 | strings,4 1276 | stripe,6 1277 | stuartcolville,4 1278 | stuartlangridge,21 1279 | stupid,3 1280 | subversion,32 1281 | sun,14 1282 | sunmicrosystems,4 1283 | support,3 1284 | svg,30 1285 | swf,4 1286 | swyx,5 1287 | sxsw,20 1288 | 
symbex,4 1289 | syndication,8 1290 | syntaxhighlighting,4 1291 | sysadmin,26 1292 | tagging,4 1293 | tags,3 1294 | tailscale,4 1295 | talks,37 1296 | tamarin,6 1297 | teaching,13 1298 | teamfortress2,3 1299 | techcrunch,7 1300 | technicaldebt,3 1301 | technology,14 1302 | technorati,8 1303 | techstars,4 1304 | ted,7 1305 | tedleung,3 1306 | templates,5 1307 | templating,5 1308 | tensorflow,6 1309 | testing,50 1310 | textmate,7 1311 | tf2,3 1312 | thebigpicture,3 1313 | theguardian,4 1314 | theoschlossnagle,3 1315 | theregister,3 1316 | thomasptacek,8 1317 | threading,5 1318 | threads,6 1319 | thunderbird,3 1320 | ticketing,3 1321 | tickets,3 1322 | til,6 1323 | timbernerslee,5 1324 | timbray,25 1325 | timemachine,4 1326 | timezones,10 1327 | timoreilly,5 1328 | tinyurl,6 1329 | tls,5 1330 | tokyocabinet,7 1331 | tokyotyrant,5 1332 | tomarmitage,6 1333 | tomchristie,4 1334 | tomcoates,8 1335 | tommacwright,10 1336 | tomscott,4 1337 | tomsteinberg,4 1338 | tomtaylor,3 1339 | tomwatson,3 1340 | tonyhirst,5 1341 | tools,11 1342 | torchbox,7 1343 | tornado,10 1344 | trac,3 1345 | trackback,9 1346 | traefik,3 1347 | transactions,5 1348 | transformers,6 1349 | transformersjs,5 1350 | translation,6 1351 | travel,42 1352 | travis,5 1353 | tunisia,3 1354 | turbogears,4 1355 | tutorial,18 1356 | tutorials,12 1357 | tv,15 1358 | twisted,12 1359 | twitter,150 1360 | typescript,6 1361 | typography,16 1362 | ubuntu,22 1363 | ui,25 1364 | uk,15 1365 | ukgovernment,4 1366 | undo,4 1367 | unicode,29 1368 | unittesting,4 1369 | unittests,13 1370 | unix,16 1371 | unladenswallow,3 1372 | unobtrusivejavascript,6 1373 | unobtrusivescripting,3 1374 | upcoming,5 1375 | uploads,4 1376 | upsert,3 1377 | urls,74 1378 | usa,3 1379 | usability,66 1380 | userresearch,3 1381 | uuid,3 1382 | ux,20 1383 | v8,8 1384 | vaccinateca,23 1385 | vaccinatecablog,14 1386 | vaccines,3 1387 | validation,6 1388 | validator,3 1389 | valve,3 1390 | varnish,10 1391 | vectorsearch,3 1392 | verisign,5 1393 | 
versioncontrol,9 1394 | versioning,7 1395 | vicuna,5 1396 | video,29 1397 | views,4 1398 | virtualenv,4 1399 | virtualisation,8 1400 | virtualization,13 1401 | vista,7 1402 | visualisation,15 1403 | visualisations,3 1404 | visualization,22 1405 | vml,4 1406 | vmware,12 1407 | vox,5 1408 | vps,4 1409 | vr,3 1410 | vulnerability,5 1411 | w3c,18 1412 | washingtonpost,7 1413 | wasp,3 1414 | web,12 1415 | web2,5 1416 | web20,5 1417 | web2expo,3 1418 | web3,4 1419 | webapis,7 1420 | webapps,44 1421 | webassembly,53 1422 | webcomponents,15 1423 | webdav,3 1424 | webdevelopers,4 1425 | webdevelopment,167 1426 | webfonts,3 1427 | webhooks,20 1428 | webkit,28 1429 | weblog,3 1430 | webperformance,17 1431 | webrunner,3 1432 | webserver,3 1433 | webservers,11 1434 | webservice,3 1435 | webservices,26 1436 | websockets,15 1437 | webstandards,36 1438 | webstock,4 1439 | webworkers,9 1440 | weeknotes,176 1441 | whatwg,16 1442 | whereonearth,4 1443 | whisper,11 1444 | whitelisting,7 1445 | whoosh,4 1446 | whytheluckystiff,6 1447 | widgets,4 1448 | wifi,13 1449 | wii,6 1450 | wiki,12 1451 | wikileaks,4 1452 | wikinear,6 1453 | wikipedia,37 1454 | wildlife,4 1455 | wildlifenearyou,10 1456 | willlarson,10 1457 | wilsonminer,5 1458 | windows,35 1459 | wired,8 1460 | wordpress,17 1461 | wordpresscom,4 1462 | workers,4 1463 | workflow,4 1464 | worm,5 1465 | writing,18 1466 | wsgi,18 1467 | wsstar,4 1468 | xfn,5 1469 | xhtml,20 1470 | xhtml2,6 1471 | xkcd,12 1472 | xml,55 1473 | xmlhttprequest,7 1474 | xmlrpc,15 1475 | xmpp,5 1476 | xrds,5 1477 | xss,59 1478 | xtech,11 1479 | xtech2007,5 1480 | xuacompatible,10 1481 | xulrunner,3 1482 | yadis,3 1483 | yagni,5 1484 | yahoo,106 1485 | yahoopipes,7 1486 | yaml,8 1487 | ycombinator,52 1488 | ydn,10 1489 | yelp,5 1490 | youtube,32 1491 | yql,12 1492 | yrb,3 1493 | yui,33 1494 | yui3,3 1495 | zacharyvoase,3 1496 | zeitnow,20 1497 | zeppelins,17 1498 | zerodowntime,15 1499 | zeromq,4 1500 | zig,3 1501 | zstd,3 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FuzzTypes 2 | 3 | FuzzTypes is a set of "autocorrecting" annotation types that expands 4 | upon [Pydantic](https://github.com/pydantic/pydantic)'s included [data 5 | conversions.](https://docs.pydantic.dev/latest/concepts/conversion_table/) 6 | Designed for simplicity, it provides powerful normalization capabilities 7 | (e.g. named entity linking) to ensure structured data is composed of 8 | "smart things" not "dumb strings". 9 | 10 | 11 | ## Getting Started 12 | 13 | Pydantic supports basic conversion of data between types. For instance: 14 | 15 | ```python 16 | from pydantic import BaseModel 17 | 18 | class Normal(BaseModel): 19 | boolean: bool 20 | float: float 21 | integer: int 22 | 23 | obj = Normal( 24 | boolean='yes', 25 | float='2', 26 | integer='3', 27 | ) 28 | assert obj.boolean is True 29 | assert obj.float == 2.0 30 | assert obj.integer == 3 31 | ``` 32 | 33 | FuzzTypes expands on the standard data conversions handled by Pydantic and 34 | provides a variety of autocorrecting annotation types. 35 | 36 | ```python 37 | from datetime import datetime 38 | from typing import Annotated 39 | 40 | from pydantic import BaseModel 41 | 42 | from fuzztypes import ( 43 | ASCII, 44 | Datetime, 45 | Email, 46 | Fuzzmoji, 47 | InMemoryValidator, 48 | Integer, 49 | Person, 50 | RegexValidator, 51 | ZipCode, 52 | flags, 53 | ) 54 | 55 | # define a source, see EntitySource for using TSV, CSV, JSONL 56 | inventors = ["Ada Lovelace", "Alan Turing", "Claude Shannon"] 57 | 58 | # define a in memory validator with fuzz search enabled. 59 | Inventor = Annotated[ 60 | str, InMemoryValidator(inventors, search_flag=flags.FuzzSearch) 61 | ] 62 | 63 | # custom Regex type for finding twitter handles. 
64 | Handle = Annotated[ 65 | str, RegexValidator(r"@\w{1,15}", examples=["@genomoncology"]) 66 | ] 67 | 68 | # define a Pydantic class with 9 fuzzy type attributes 69 | class Fuzzy(BaseModel): 70 | ascii: ASCII 71 | email: Email 72 | emoji: Fuzzmoji 73 | handle: Handle 74 | integer: Integer 75 | inventor: Inventor 76 | person: Person 77 | time: Datetime 78 | zipcode: ZipCode 79 | 80 | # create an instance of class Fuzzy 81 | obj = Fuzzy( 82 | ascii="άνθρωπος", 83 | email="John Doe <jdoe@example.com>", 84 | emoji='thought bubble', 85 | handle='Ian Maurer (@imaurer)', 86 | integer='fifty-five', 87 | inventor='ada luvlace', 88 | person='mr. arthur herbert fonzarelli (fonzie)', 89 | time='5am on Jan 1, 2025', 90 | zipcode="(Zipcode: 12345-6789)", 91 | ) 92 | 93 | # test the autocorrecting performed 94 | 95 | # greek for man: https://en.wiktionary.org/wiki/άνθρωπος 96 | assert obj.ascii == "anthropos" 97 | 98 | # extract email via regular expression 99 | assert obj.email == "jdoe@example.com" 100 | 101 | # fuzzy match "thought bubble" to "thought balloon" emoji 102 | assert obj.emoji == "💭" 103 | 104 | # simple, inline regex example (see above Handle type) 105 | assert obj.handle == "@imaurer" 106 | 107 | # convert integer word phrase to integer value 108 | assert obj.integer == 55 109 | 110 | # case-insensitive fuzzy match on lowercase, misspelled name 111 | assert obj.inventor == "Ada Lovelace" 112 | 113 | # human name parser (title, first, middle, last, suffix, nickname) 114 | assert str(obj.person) == "Mr. Arthur H. 
Fonzarelli (fonzie)" 115 | assert obj.person.short_name == "Arthur Fonzarelli" 116 | assert obj.person.nickname == "fonzie" 117 | assert obj.person.last == "Fonzarelli" 118 | 119 | # convert time phrase to datetime object 120 | assert obj.time.isoformat() == "2025-01-01T05:00:00" 121 | 122 | # extract zip5 or zip9 formats using regular expressions 123 | assert obj.zipcode == "12345-6789" 124 | 125 | # print JSON on success 126 | assert obj.model_dump() == { 127 | "ascii": "anthropos", 128 | "email": "jdoe@example.com", 129 | "emoji": "💭", 130 | "handle": "@imaurer", 131 | "integer": 55, 132 | "inventor": "Ada Lovelace", 133 | "person": { 134 | "first": "Arthur", 135 | "init_format": "{first} {middle} {last}", 136 | "last": "Fonzarelli", 137 | "middle": "H.", 138 | "name_format": "{title} {first} {middle} {last} {suffix} " 139 | "({nickname})", 140 | "nickname": "fonzie", 141 | "suffix": "", 142 | "title": "Mr.", 143 | }, 144 | "time": datetime(2025, 1, 1, 5), 145 | "zipcode": "12345-6789", 146 | } 147 | ``` 148 | 149 | ## Installation 150 | 151 | Available on [PyPI](https://pypi.org/project/FuzzTypes/): 152 | 153 | ```bash 154 | pip install fuzztypes 155 | ``` 156 | 157 | To install all dependencies (see below), you can copy and paste this: 158 | 159 | ```bash 160 | pip install anyascii dateparser emoji lancedb nameparser number-parser rapidfuzz sentence-transformers tantivy 161 | ``` 162 | 163 | 164 | ## Google Colab Notebook 165 | 166 | There is a read-only notebook that you can copy and edit to try out FuzzTypes: 167 | 168 | [https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing](https://colab.research.google.com/drive/1GNngxcTUXpWDqK_qNsJoP2NhSN9vKCzZ?usp=sharing) 169 | 170 | 171 | ## Base Validators 172 | 173 | Base validators are the building blocks of FuzzTypes that can be used for creating custom "usable types". 
174 | 175 | | Type | Description | 176 | |---------------------|---------------------------------------------------------------------------------------------| 177 | | `DateType` | Base date type, pass in arguments such as `date_order`, `strict` and `relative_base`. | 178 | | `FuzzValidator` | Validator class that calls a provided function and handles core and json schema config. | 179 | | `InMemoryValidator` | Enables matching entities in memory using exact, alias, fuzzy, or semantic search. | 180 | | `OnDiskValidator` | Performs matching entities stored on disk using exact, alias, fuzzy, or semantic search. | 181 | | `RegexValidator` | Regular expression pattern matching base validator. | 182 | | `DatetimeType` | Base datetime type, pass in arguments such as `date_order`, `timezone` and `relative_base`. | 183 | 184 | These base types offer flexibility and extensibility, enabling you to create custom annotation types that suit your 185 | specific data validation and normalization requirements. 186 | 187 | 188 | ## Usable Types 189 | 190 | Usable types are pre-built annotation types in FuzzTypes that can be directly used in Pydantic models. They provide 191 | convenient and ready-to-use functionality for common data types and scenarios. 192 | 193 | | Type | Description | 194 | |----------------|-------------------------------------------------------------------------------------------| 195 | | `ASCII` | Converts Unicode strings to ASCII equivalents using either `anyascii` or `unidecode`. | 196 | | `Date` | Converts date strings to `date` objects using `dateparser`. | 197 | | `Email` | Extracts email addresses from strings using a regular expression. | 198 | | `Emoji` | Matches emojis based on Unicode Consortium aliases using the `emoji` library. | 199 | | `Fuzzmoji` | Matches emojis using fuzzy string matching against aliases. | 200 | | `Integer` | Converts numeric strings or words to integers using `number-parser`. 
| 201 | | `LanguageCode` | Resolves language to ISO language codes (e.g., "en"). | 202 | | `LanguageName` | Resolves language to ISO language names (e.g., "English"). | 203 | | `Language` | Resolves language to ISO language object (name, alpha_2, alpha_3, scope, type, etc.). | 204 | | `Person` | Parses person names into subfields (e.g., first, last, suffix) using `python-nameparser`. | 205 | | `SSN` | Extracts U.S. Social Security Numbers from strings using a regular expression. | 206 | | `Time` | Converts datetime strings to `datetime` objects using `dateparser`. | 207 | | `Vibemoji` | Matches emojis using semantic similarity against aliases. | 208 | | `Zipcode` | Extracts U.S. ZIP codes (5 or 9 digits) from strings using a regular expression. | 209 | 210 | These usable types provide a wide range of commonly needed data validations and transformations, making it 211 | easier to work with various data formats and perform tasks like parsing, extraction, and matching. 212 | 213 | 214 | ## InMemoryValidator and OnDiskValidator Configuration 215 | 216 | The InMemory and OnDisk Validator objects work with lists of Entities. 217 | 218 | The following table describes the available configuration options: 219 | 220 | | Argument | Type | Default | Description | 221 | |-------------------|-----------------------------------------|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 222 | | `case_sensitive` | `bool` | `False` | If `True`, matches are case-sensitive. If `False`, matches are case-insensitive. | 223 | | `device` | `Literal["cpu", "cuda", "mps"]` | `"cpu"` | The device to use for generating semantic embeddings and LanceDB indexing. 
Available options are "cpu", "cuda" (for NVIDIA GPUs), and "mps" (for Apple's Metal Performance Shaders). | 224 | | `encoder` | `Union[Callable, str, Any]` | `None` | The encoder to use for generating semantic embeddings. It can be a callable function, a string specifying the name or path of a pre-trained model, or any other object that implements the encoding functionality. | 225 | | `examples` | `List[Any]` | `None` | A list of example values to be used in schema generation. These examples are included in the generated JSON schema to provide guidance on the expected format of the input values. | 226 | | `fuzz_scorer` | `Literal["token_sort_ratio", ...]` | `"token_sort_ratio"` | The scoring algorithm to use for fuzzy string matching. Available options include "token_sort_ratio", "ratio", "partial_ratio", "token_set_ratio", "partial_token_set_ratio", "token_ratio", "partial_token_ratio", "WRatio", and "QRatio". Each algorithm has its own characteristics and trade-offs between accuracy and performance. | 227 | | `limit` | `int` | `10` | The maximum number of matches to return when performing fuzzy or semantic searches. | 228 | | `min_similarity` | `float` | `80.0` | The minimum similarity score required for a match to be considered valid. Matches with a similarity score below this threshold will be discarded. | 229 | | `notfound_mode` | `Literal["raise", "none", "allow"]` | `"raise"` | The action to take when a matching entity is not found. Available options are "raise" (raises an exception), "none" (returns `None`), and "allow" (returns the input key as the value). | 230 | | `search_flag` | `flags.SearchFlag` | `flags.DefaultSearch` | The search strategy to use for finding matches. It is a combination of flags that determine which fields of the `NamedEntity` are considered for matching and whether fuzzy or semantic search is enabled. Available options are defined in the `flags` module. 
| 231 | | `tiebreaker_mode` | `Literal["raise", "lesser", "greater"]` | `"raise"` | The strategy to use for resolving ties when multiple matches have the same similarity score. Available options are "raise" (raises an exception), "lesser" (returns the match with the lower value), and "greater" (returns the match with the greater value). | 232 | 233 | 234 | ## Lazy Dependencies 235 | 236 | FuzzTypes leverages several powerful libraries to extend its functionality. 237 | 238 | These dependencies are not installed by default with FuzzTypes to keep the 239 | installation lightweight. Instead, they are optional and can be installed 240 | as needed depending on which types you use. 241 | 242 | Below is a list of these dependencies, including their licenses, purpose, and what 243 | specific Types require them. 244 | 245 | Right now, you must pip install the modules directly, in the future you will 246 | be able to install them automatically as part of the main install using pip extras. 247 | 248 | To install all dependencies, you can copy and paste this: 249 | 250 | ```bash 251 | pip install anyascii dateparser emoji lancedb nameparser number-parser rapidfuzz sentence-transformers tantivy 252 | ``` 253 | 254 | 255 | | Fuzz Type | Library | License | Purpose | 256 | |-------------------|--------------------------------------------------------------------------|------------|------------------------------------------------------------| 257 | | ASCII | [anyascii](https://github.com/anyascii/anyascii) | ISC | Converting Unicode into ASCII equivalents (not GPL) | 258 | | ASCII | [unidecode](https://github.com/avian2/unidecode) | GPL | Converting Unicode into ASCII equivalents (better quality) | 259 | | Date | [dateparser](https://github.com/scrapinghub/dateparser) | BSD-3 | Parsing dates from strings | 260 | | Emoji | [emoji](https://github.com/carpedm20/emoji/) | BSD | Handling and manipulating emoji characters | 261 | | Fuzz | 
[rapidfuzz](https://github.com/rapidfuzz/RapidFuzz) | MIT | Performing fuzzy string matching | 262 | | InMemoryValidator | [numpy](https://numpy.org/) | BSD | Numerical computing in Python | 263 | | InMemoryValidator | [scikit-learn](https://scikit-learn.org/) | BSD | Machine learning in Python | 264 | | InMemoryValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | 265 | | Integer | [number-parser](https://github.com/scrapinghub/number-parser) | BSD-3 | Parsing numbers from strings | 266 | | OnDiskValidator | [lancedb](https://github.com/lancedb/lancedb) | Apache-2.0 | High-performance, on-disk vector database | 267 | | OnDiskValidator | [pyarrow](https://github.com/apache/arrow) | Apache-2.0 | In-memory columnar data format and processing library | 268 | | OnDiskValidator | [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | Apache-2.0 | Encoding sentences into high-dimensional vectors | 269 | | OnDiskValidator | [tantivy](https://github.com/quickwit-oss/tantivy-py) | MIT | Full-text search (FTS) for LanceDB. | 270 | | Person | [nameparser](https://github.com/derek73/python-nameparser) | LGPL | Parsing person names | 271 | 272 | 273 | ## Maintainer 274 | 275 | FuzzTypes was created by [Ian Maurer](https://x.com/imaurer), the CTO of [GenomOncology](https://genomoncology.com). 276 | 277 | This MIT-based open-source project was extracted from our product which includes the ability to normalize biomedical 278 | data for use in precision oncology clinical decision support systems. Contact me to learn more about our product 279 | offerings. 280 | 281 | 282 | | Type | Description | 283 | |----------------|-------------------------------------------------------------------------------------------| 284 | | `AirportCode` | Represents airport codes (e.g., "ORD"). | 285 | | `Airport` | Represents airport names (e.g., "O'Hare International Airport"). 
| 286 | | `CountryCode` | Represents ISO country codes (e.g., "US"). | 287 | | `Country` | Represents country names (e.g., "United States"). | 288 | | `Currency` | Represents currency codes (e.g., "USD"). | 289 | | `Quantity` | Converts strings to `Quantity` objects with value and unit using `pint`. | 290 | | `URL` | Represents normalized URLs with tracking parameters removed using `url-normalize`. | 291 | | `USStateCode` | Represents U.S. state codes (e.g., "CA"). | 292 | | `USState` | Represents U.S. state names (e.g., "California"). | 293 | 294 | 295 | ## Structured Data Generation via LLM Function Calling and Custom GPT Actions 296 | 297 | Several libraries (e.g. [Instructor](https://github.com/jxnl/instructor), 298 | [Outlines](https://github.com/outlines-dev/outlines), 299 | [Marvin](https://github.com/prefecthq/marvin)) use Pydantic to define models for structured data generation 300 | using Large Language Models (LLMs) via function calling or a grammar/regex 301 | based sampling approach based on the [JSON schema generated by Pydantic](https://docs.pydantic.dev/latest/concepts/json_schema/). 302 | 303 | This approach allows for the enumeration of allowed values using 304 | Python's `Literal`, `Enum` or JSON Schema's `examples` field directly 305 | in your Pydantic class declaration which is used by the LLM to 306 | generate valid values. This approach works exceptionally well for 307 | low-cardinality (not many unique allowed values) such as the world's 308 | continents (7 in total). 309 | 310 | This approach, however, doesn't scale well for high-cardinality (many unique 311 | allowed values) such as the number of known human genomic variants (~325M). 312 | Where exactly the cutoff is between "low" and "high" cardinality is an exercise 313 | left to the reader and their use case. 314 | 315 | That's where FuzzTypes come in. The allowed values are managed by the FuzzTypes 316 | annotations and the values are resolved during the Pydantic validation process. 
317 | This can include fuzzy and semantic searching that throws an exception if the 318 | provided value doesn't meet a minimum similarity threshold defined by the 319 | developer. 320 | 321 | Errors discovered via Pydantic can be caught and resubmitted to the LLM for 322 | correction. The error will contain examples, expected patterns, and closest 323 | matches to help steer the LLM to provide a better informed guess. 324 | 325 | 326 | ## Creating Custom Types 327 | 328 | FuzzTypes provides a set of base types that you can use to create 329 | your own custom annotation types. These base types offer different 330 | capabilities and can be extended to suit your specific data validation 331 | and normalization needs. 332 | 333 | ### EntitySource 334 | 335 | FuzzTypes provides the `EntitySource` class to manage and load 336 | entity data from various sources. It supports JSON Lines (`.jsonl`), 337 | CSV (`.csv`), TSV (`.tsv`), and Text (`.txt`) formats, as well as 338 | loading entities from a callable function. 339 | 340 | Example: 341 | ```python 342 | from pathlib import Path 343 | from fuzztypes import EntitySource, NamedEntity 344 | 345 | # Load entities from a CSV file 346 | fruit_source = EntitySource(Path("path/to/fruits.csv")) 347 | 348 | # Load entities from a callable function 349 | def load_animals(): 350 | return [ 351 | NamedEntity(value="Dog", aliases=["Canine"]), 352 | NamedEntity(value="Cat", aliases=["Feline"]), 353 | ] 354 | 355 | animal_source = EntitySource(load_animals) 356 | ``` 357 | 358 | ### InMemoryValidator Base Type 359 | 360 | The `InMemoryValidator` base type enables matching entities in memory using 361 | exact, alias, fuzzy, or semantic search. It is suitable for small 362 | to medium-sized datasets that can fit in memory and provides fast 363 | matching capabilities. 
364 | 365 | Example: 366 | ```python 367 | from typing import Annotated 368 | from pydantic import BaseModel 369 | from fuzztypes import InMemoryValidator, flags 370 | 371 | # Create a custom annotation type for matching fruits 372 | fruits = ["Apple", "Banana", "Orange"] 373 | Fruit = Annotated[ 374 | str, InMemoryValidator(fruits, search_flag=flags.FuzzSearch) 375 | ] 376 | 377 | class MyModel(BaseModel): 378 | fruit: Fruit 379 | 380 | model = MyModel(fruit="appel") 381 | assert model.fruit == "Apple" 382 | ``` 383 | 384 | ### OnDiskValidator Base Type 385 | 386 | The `OnDiskValidator` base type performs matching entities stored on disk 387 | using exact, alias, fuzzy, or semantic search. It leverages the 388 | LanceDB library for efficient storage and retrieval of entities. 389 | `OnDiskValidator` is recommended for large datasets that cannot fit in memory. 390 | 391 | Example: 392 | ```python 393 | from typing import Annotated 394 | from pydantic import BaseModel 395 | from fuzztypes import OnDiskValidator 396 | 397 | # Create a custom annotation type for matching countries stored on disk 398 | countries = [ 399 | ("United States", "US"), 400 | ("United Kingdom", "UK"), 401 | ("Canada", "CA"), 402 | ] 403 | Country = Annotated[str, OnDiskValidator("Country", countries)] 404 | 405 | class MyModel(BaseModel): 406 | country: Country 407 | 408 | assert MyModel(country="Canada").country == "Canada" 409 | assert MyModel(country="US").country == "United States" 410 | ``` 411 | 412 | ### DateValidator and DatetimeValidator 413 | 414 | The `DateValidator` and `DatetimeValidator` base types provide fuzzy parsing 415 | capabilities for date and datetime objects, respectively. They allow 416 | you to define flexible date and time formats and perform parsing 417 | based on specified settings such as date order, timezone, and 418 | relative base. 
419 | 420 | Example: 421 | 422 | ```python 423 | from datetime import date, datetime 424 | from pydantic import BaseModel 425 | from typing import Annotated 426 | from fuzztypes import DateValidator, DatetimeValidator 427 | 428 | MyDate = Annotated[date, DateValidator(date_order="MDY")] 429 | MyTime = Annotated[datetime, DatetimeValidator(timezone="UTC")] 430 | 431 | class MyModel(BaseModel): 432 | date: MyDate 433 | time: MyTime 434 | 435 | model = MyModel(date="1/1/2023", time="1/1/23 at 10:30 PM") 436 | assert model.date.isoformat() == "2023-01-01" 437 | assert model.time.isoformat() == "2023-01-01T22:30:00+00:00" 438 | ``` 439 | 440 | 441 | ### FuzzValidator 442 | 443 | The `FuzzValidator` is the base of the fuzztypes typing system. 444 | It can be used directly to wrap any python function. 445 | 446 | Example: 447 | ```python 448 | from typing import Annotated 449 | from pydantic import BaseModel 450 | from fuzztypes import FuzzValidator 451 | 452 | # Create a custom annotation type that converts a value to uppercase 453 | UpperCase = Annotated[str, FuzzValidator(str.upper)] 454 | 455 | class MyModel(BaseModel): 456 | name: UpperCase 457 | 458 | model = MyModel(name="john") 459 | assert model.name == "JOHN" 460 | ``` 461 | 462 | 463 | ### Regex 464 | 465 | The `Regex` base type allows matching values using a regular 466 | expression pattern. It is useful for creating annotation types that 467 | validate and extract specific patterns from input values. 
468 | 469 | Example: 470 | ```python 471 | from typing import Annotated 472 | from pydantic import BaseModel 473 | from fuzztypes import RegexValidator 474 | 475 | # Create a custom annotation type for matching IP addresses 476 | IPAddress = Annotated[ 477 | str, RegexValidator(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}$") 478 | ] 479 | 480 | class MyModel(BaseModel): 481 | ip_address: IPAddress 482 | 483 | model = MyModel(ip_address="My internet IP address is 192.168.127.12") 484 | assert model.ip_address == "192.168.127.12" 485 | ``` 486 | 487 | ### Languages 488 | 489 | Languages are loaded from the [Debian iso-codes](https://salsa.debian.org/iso-codes-team/iso-codes/) project. 490 | 491 | Languages are resolved using their preferred, common, inverted, bibliographic name, or 2 or 3 letter alpha code. 492 | 493 | Languages can be included as a string name (LanguageName), string code (LanguageCode) or full language object. 494 | 495 | The preferred code is the 2 letter version and will be used if available. Otherwise, the 3 letter alpha code is used. 
496 | 497 | Example: 498 | 499 | ```python 500 | from pydantic import BaseModel 501 | from fuzztypes import ( 502 | Language, 503 | LanguageName, 504 | LanguageCode, 505 | LanguageScope, 506 | LanguageType, 507 | LanguageNamedEntity, 508 | validate_python, 509 | ) 510 | class Model(BaseModel): 511 | language_code: LanguageCode 512 | language_name: LanguageName 513 | language: Language 514 | 515 | # Test that Language resolves to the complete language object 516 | data = dict(language_code="en", language="English", language_name="ENG") 517 | obj = validate_python(Model, data) 518 | assert obj.language_code == "en" 519 | assert obj.language_name == "English" 520 | assert obj.language.scope == LanguageScope.INDIVIDUAL 521 | assert obj.language.type == LanguageType.LIVING 522 | assert isinstance(obj.language, LanguageNamedEntity) 523 | assert obj.model_dump(exclude_defaults=True, mode="json") == { 524 | "language": { 525 | "aliases": ["en", "eng"], 526 | "alpha_2": "en", 527 | "alpha_3": "eng", 528 | "scope": "I", 529 | "type": "L", 530 | "value": "English", 531 | }, 532 | "language_code": "en", 533 | "language_name": "English", 534 | } 535 | ``` 536 | 537 | ### Validate Python and JSON functions 538 | 539 | Functional approach to validating python and json are available. 
540 | Below are examples for the `validate_python` and `validate_json` functions: 541 | 542 | ```python 543 | from pydantic import BaseModel 544 | from fuzztypes import validate_python, validate_json, Integer, Date 545 | 546 | # validate python 547 | assert validate_python(Integer, "two hundred") == 200 548 | 549 | # validate json 550 | class MyModel(BaseModel): 551 | date: Date 552 | 553 | json = '{"date": "July 4th 2021"}' 554 | obj = validate_json(MyModel, json) 555 | assert obj.date.isoformat() == "2021-07-04" 556 | ``` 557 | 558 | ### Resolve Entities from FuzzValidator or Annotation 559 | 560 | Entities can be resolved from the `FuzzValidator` validators such as InMemoryValidator 561 | or OnDiskValidator or the defined `Annotation` type using the `resolve_entity` function: 562 | 563 | ```python 564 | from typing import Annotated 565 | from fuzztypes import resolve_entity, InMemoryValidator 566 | 567 | elements = ["earth", "fire", "water", "air"] 568 | ElementValidator = InMemoryValidator(elements) 569 | Element = Annotated[str, ElementValidator] 570 | 571 | assert resolve_entity(ElementValidator, "EARTH").model_dump() == { 572 | "aliases": [], 573 | "label": None, 574 | "meta": None, 575 | "priority": None, 576 | "value": "earth", 577 | } 578 | 579 | assert resolve_entity(Element, "Air").model_dump( 580 | exclude_defaults=True 581 | ) == {"value": "air"} 582 | ``` --------------------------------------------------------------------------------