├── tests
    ├── __init__.py
    ├── test_cli.py
    ├── test_nested_arrays.py
    ├── test_structure_generator.py
    ├── test_pydantic.py
    ├── test_roundtrip.py
    └── test_decoder.py
├── assets
    ├── toonify.png
    ├── README.zh-CN.md
    └── README.ko.md
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── custom.md
    │   ├── feature_request.md
    │   └── bug_report.md
    └── workflows
    │   ├── code-quality.yml
    │   ├── release.yml
    │   └── codeql.yml
├── run_example.sh
├── benchmark
    ├── __init__.py
    ├── run_all.py
    ├── QUICKSTART.md
    ├── memory_benchmark.py
    ├── RESULTS.md
    └── compare_formats.py
├── .gitignore
├── toon
    ├── constants.py
    ├── __init__.py
    ├── pydantic_converter.py
    ├── cli.py
    ├── utils.py
    ├── structure_generator.py
    └── encoder.py
├── .releaserc.json
├── pyproject.toml
├── examples
    ├── basic_usage.py
    ├── advanced_features.py
    ├── pydantic_usage.py
    └── structure_template_usage.py
├── CHANGELOG.md
├── demo.py
└── CONTRIBUTING.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Test package initialization."""
2 | 


--------------------------------------------------------------------------------
/assets/toonify.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeGraphAI/toonify/HEAD/assets/toonify.png


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Custom issue template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 


--------------------------------------------------------------------------------
/run_example.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Helper script: run a Python file with the repository root on PYTHONPATH.
3 | # All arguments are passed straight to the virtualenv's interpreter.
4 | #
5 | # The root is derived from this script's own location instead of a
6 | # hard-coded absolute path, so the script works on any checkout and from
7 | # any working directory.
8 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9 | # Append the previous PYTHONPATH only when it is non-empty (avoids a trailing ':').
10 | export PYTHONPATH="${SCRIPT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
11 | "${SCRIPT_DIR}/.venv/bin/python" "$@"
12 | 


--------------------------------------------------------------------------------
/benchmark/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | TOON Benchmark Suite
 3 | 
 4 | This package contains benchmarks comparing TOON vs JSON for:
 5 | - File size and token count
 6 | - Memory usage
 7 | - Encoding/decoding performance
 8 | 
 9 | Run all benchmarks (from a directory where ``benchmark`` is importable):
10 |     python -m benchmark.run_all
11 | 
12 | Or run individual benchmarks:
13 |     python -m benchmark.compare_formats
14 |     python -m benchmark.memory_benchmark
15 | """
16 | 
17 | __version__ = "1.0.0"  # literal version of the benchmark suite itself
18 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | .venv
 3 | __pycache__
 4 | *.pyc
 5 | *.pyo
 6 | *.pyd
 7 | *.pyw
 8 | *.pyz
 9 | *.pywz
10 | *.pyzw
11 | *.pyzwz
12 | 
13 | # Python
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 | *.so
18 | .Python
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | 
35 | # Testing
36 | .coverage
37 | .pytest_cache
38 | .tox
39 | htmlcov/
40 | 
41 | # Virtual environments
42 | .venv
43 | venv/
44 | ENV/
45 | env/
46 | 
47 | # IDEs
48 | .vscode
49 | .idea
50 | *.swp
51 | *.swo
52 | *~
53 | 
54 | # OS
55 | .DS_Store
56 | Thumbs.db
57 | 
58 | # Environment variables
59 | .env
60 | .env.local
61 | .env.development.local
62 | .env.test.local
63 | .env.production.local
64 | 


--------------------------------------------------------------------------------
/toon/constants.py:
--------------------------------------------------------------------------------
 1 | """Constants used throughout the TOON library."""
 2 | 
 3 | # Delimiter characters; DEFAULT_DELIMITER is the comma
 4 | COMMA = ','
 5 | TAB = '\t'
 6 | PIPE = '|'
 7 | DEFAULT_DELIMITER = COMMA
 8 | 
 9 | # Single special characters, named so call sites avoid bare literals
10 | COLON = ':'
11 | QUOTE = '"'
12 | BACKSLASH = '\\'
13 | NEWLINE = '\n'
14 | SPACE = ' '
15 | LEFT_BRACKET = '['
16 | RIGHT_BRACKET = ']'
17 | LEFT_BRACE = '{'
18 | RIGHT_BRACE = '}'
19 | 
20 | # Textual spellings of the boolean and null literals
21 | TRUE_LITERAL = 'true'
22 | FALSE_LITERAL = 'false'
23 | NULL_LITERAL = 'null'
24 | 
25 | # Default options
26 | DEFAULT_INDENT = 2
27 | DEFAULT_KEY_FOLDING = 'off'  # same value as KEY_FOLDING_OFF below
28 | DEFAULT_EXPAND_PATHS = 'off'  # same value as EXPAND_PATHS_OFF below
29 | DEFAULT_STRICT = True
30 | 
31 | # Key folding modes
32 | KEY_FOLDING_OFF = 'off'
33 | KEY_FOLDING_SAFE = 'safe'
34 | 
35 | # Path expansion modes
36 | EXPAND_PATHS_OFF = 'off'
37 | EXPAND_PATHS_SAFE = 'safe'
38 | 
39 | # Delimiter names; presumably aliases for TAB/PIPE/COMMA above - TODO confirm against callers
40 | DELIMITER_TAB = 'tab'
41 | DELIMITER_PIPE = 'pipe'
42 | DELIMITER_COMMA = 'comma'
43 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/.github/workflows/code-quality.yml:
--------------------------------------------------------------------------------
 1 | name: Code Quality Checks
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - 'toon/**'
 7 |       - '.github/workflows/code-quality.yml'
 8 | 
 9 | jobs:
10 |   quality:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v3
14 | 
15 |       - name: Install uv
16 |         uses: astral-sh/setup-uv@v3
17 | 
18 |       # ruff, black, isort, pylint and poethepoet are declared in the `dev`
19 |       # optional extra of pyproject.toml; a plain `uv sync` skips extras.
20 |       - name: Install dependencies
21 |         run: uv sync --frozen --extra dev
22 | 
23 |       - name: Run Ruff
24 |         run: uv run ruff check toon
25 | 
26 |       - name: Run Black
27 |         run: uv run black --check toon
28 | 
29 |       - name: Run isort
30 |         run: uv run isort --check-only toon
31 | 
32 |       - name: Analysing the code with pylint
33 |         run: uv run poe pylint-ci
34 | 
35 |       - name: Check Pylint score
36 |         run: |
37 |           # pylint prints the score as "Your code has been rated at 9.50/10".
38 |           # `|| true`: pylint exits non-zero when it reports messages; the score is still needed.
39 |           pylint_output=$(uv run poe pylint-score-ci || true)
40 |           pylint_score=$(echo "$pylint_output" | sed -n 's|.*rated at \([-0-9.]*\)/10.*|\1|p' | tail -n 1)
41 |           if [ -z "$pylint_score" ]; then
42 |             echo "Could not determine the Pylint score. Blocking commit."
43 |             exit 1
44 |           fi
45 |           if (( $(echo "$pylint_score < 8" | bc -l) )); then
46 |             echo "Pylint score is below 8. Blocking commit."
47 |             exit 1
48 |           else
49 |             echo "Pylint score is acceptable."
50 |           fi
51 | 


--------------------------------------------------------------------------------
/.releaserc.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "branches": [
 3 |     "main",
 4 |     {
 5 |       "name": "pre/*",
 6 |       "prerelease": true
 7 |     }
 8 |   ],
 9 |   "plugins": [
10 |     [
11 |       "@semantic-release/commit-analyzer",
12 |       {
13 |         "preset": "conventionalcommits"
14 |       }
15 |     ],
16 |     [
17 |       "@semantic-release/release-notes-generator",
18 |       {
19 |         "preset": "conventionalcommits"
20 |       }
21 |     ],
22 |     [
23 |       "@semantic-release/changelog",
24 |       {
25 |         "changelogFile": "CHANGELOG.md"
26 |       }
27 |     ],
28 |     [
29 |       "semantic-release-pypi",
30 |       {
31 |         "distDir": "dist"
32 |       }
33 |     ],
34 |     [
35 |       "@semantic-release/github",
36 |       {
37 |         "assets": [
38 |           {
39 |             "path": "dist/*.tar.gz",
40 |             "label": "Source distribution"
41 |           },
42 |           {
43 |             "path": "dist/*.whl",
44 |             "label": "Python wheel"
45 |           }
46 |         ]
47 |       }
48 |     ],
49 |     [
50 |       "@semantic-release/git",
51 |       {
52 |         "assets": ["CHANGELOG.md"],
53 |         "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}"
54 |       }
55 |     ]
56 |   ]
57 | }
58 | 
59 | 


--------------------------------------------------------------------------------
/toon/__init__.py:
--------------------------------------------------------------------------------
 1 | """TOON (Token-Oriented Object Notation) - A compact serialization format for LLMs."""
 2 | 
 3 | from .encoder import encode
 4 | from .decoder import decode
 5 | from .structure_generator import generate_structure
 6 | from .constants import (
 7 |     COMMA, TAB, PIPE,
 8 |     KEY_FOLDING_OFF, KEY_FOLDING_SAFE,
 9 |     EXPAND_PATHS_OFF, EXPAND_PATHS_SAFE
10 | )
11 | 
12 | # Pydantic converters (optional - requires pydantic installation)
13 | try:
14 |     from .pydantic_converter import encode_pydantic, decode_to_pydantic
15 |     from .structure_generator import generate_structure_from_pydantic
16 |     _PYDANTIC_AVAILABLE = True
17 | except ImportError:
18 |     _PYDANTIC_AVAILABLE = False
19 |     def encode_pydantic(*args, **kwargs):
20 |         raise ImportError("encode_pydantic requires pydantic to be installed. Please install pydantic to use this feature.")
21 |     def decode_to_pydantic(*args, **kwargs):
22 |         raise ImportError("decode_to_pydantic requires pydantic to be installed. Please install pydantic to use this feature.")
23 |     def generate_structure_from_pydantic(*args, **kwargs):
24 |         raise ImportError("generate_structure_from_pydantic requires pydantic to be installed. Please install pydantic to use this feature.")
25 | 
26 | __version__ = '1.0.0'
27 | __all__ = [
28 |     'encode',
29 |     'decode',
30 |     'generate_structure',
31 |     'encode_pydantic',
32 |     'decode_to_pydantic',
33 |     'generate_structure_from_pydantic',
34 |     'COMMA',
35 |     'TAB',
36 |     'PIPE',
37 |     'KEY_FOLDING_OFF',
38 |     'KEY_FOLDING_SAFE',
39 |     'EXPAND_PATHS_OFF',
40 |     'EXPAND_PATHS_SAFE',
41 | ]
42 | 


--------------------------------------------------------------------------------
/benchmark/run_all.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Run all benchmarks and generate a comprehensive report.
 4 | """
 5 | 
 6 | import sys
 7 | import subprocess
 8 | from pathlib import Path
 9 | 
10 | 
11 | def run_benchmark(script_name: str) -> int:
12 |     """Run a benchmark script and return its exit code."""
13 |     script_path = Path(__file__).parent / script_name
14 |     print(f"\n{'='*70}")
15 |     print(f"Running: {script_name}")
16 |     print(f"{'='*70}\n")
17 | 
18 |     result = subprocess.run([sys.executable, str(script_path)], cwd=script_path.parent)
19 |     return result.returncode
20 | 
21 | 
22 | def main():
23 |     """Run all benchmarks."""
24 |     print("="*70)
25 |     print("TOON Benchmark Suite - Running All Tests")
26 |     print("="*70)
27 | 
28 |     benchmarks = [
29 |         "compare_formats.py",
30 |         "memory_benchmark.py",
31 |     ]
32 | 
33 |     failed = []
34 | 
35 |     for benchmark in benchmarks:
36 |         exit_code = run_benchmark(benchmark)
37 |         if exit_code != 0:
38 |             failed.append(benchmark)
39 | 
40 |     print("\n" + "="*70)
41 |     print("BENCHMARK SUITE COMPLETE")
42 |     print("="*70)
43 | 
44 |     if failed:
45 |         print(f"\n❌ {len(failed)} benchmark(s) failed:")
46 |         for b in failed:
47 |             print(f"  • {b}")
48 |         sys.exit(1)
49 |     else:
50 |         print("\n✅ All benchmarks completed successfully!")
51 |         print("\nKey Takeaways:")
52 |         print("  • TOON reduces size by 30-60% compared to JSON")
53 |         print("  • Token savings translate directly to lower LLM API costs")
54 |         print("  • Performance is comparable or better than JSON")
55 |         print("  • Perfect for structured data passed to LLMs")
56 |         sys.exit(0)
57 | 
58 | 
59 | if __name__ == "__main__":  # run only when executed as a script, not on import
60 |     main()
61 | 


--------------------------------------------------------------------------------
/benchmark/QUICKSTART.md:
--------------------------------------------------------------------------------
 1 | # Benchmark Quick Start
 2 | 
 3 | ## Run All Benchmarks
 4 | 
 5 | The fastest way to see the size, token, and memory savings:
 6 | 
 7 | ```bash
 8 | python benchmark/run_all.py
 9 | ```
10 | 
11 | This will run all benchmarks and provide a comprehensive summary.
12 | 
13 | ## Run Individual Benchmarks
14 | 
15 | ### 1. Compare Sizes and Tokens
16 | 
17 | ```bash
18 | python benchmark/compare_formats.py
19 | ```
20 | 
21 | This shows:
22 | - File size comparison (JSON vs TOON)
23 | - Token count comparison (for LLM APIs)
24 | - Encoding/decoding performance
25 | - Example outputs
26 | 
27 | ### 2. Measure Memory Usage
28 | 
29 | ```bash
30 | python benchmark/memory_benchmark.py
31 | ```
32 | 
33 | This shows:
34 | - Actual memory consumption
35 | - Network bandwidth savings
36 | - Practical cost impact
37 | 
38 | ## Expected Results
39 | 
40 | You should see:
41 | - **~58% average size reduction**
42 | - **~50% average token reduction**
43 | - **Up to 71% savings for tabular data**
44 | 
45 | ## What This Means
46 | 
47 | If you're sending structured data to LLM APIs:
48 | - **~50% fewer tokens** ≈ **~50% lower token-based costs** for that data
49 | - **Faster network transfers** = better performance
50 | - **Same data quality** = no loss of information
51 | 
52 | ## Examples
53 | 
54 | ### Before (JSON)
55 | ```json
56 | {
57 |   "products": [
58 |     {"id": 1, "name": "Laptop", "price": 999},
59 |     {"id": 2, "name": "Mouse", "price": 29}
60 |   ]
61 | }
62 | ```
63 | **Size: 134 bytes, 48 tokens**
64 | 
65 | ### After (TOON)
66 | ```toon
67 | products[2]{id,name,price}:
68 |   1,Laptop,999
69 |   2,Mouse,29
70 | ```
71 | **Size: 52 bytes, 23 tokens**
72 | 
73 | **Savings: 61.2% size, 52.1% tokens**
74 | 
75 | ## Requirements
76 | 
77 | ```bash
78 | pip install tiktoken  # For token counting
79 | ```
80 | 
81 | ## Troubleshooting
82 | 
83 | If you get import errors, make sure you're in the project root:
84 | 
85 | ```bash
86 | cd /path/to/toonify
87 | python benchmark/run_all.py
88 | ```
89 | 
90 | Or install the package:
91 | 
92 | ```bash
93 | pip install -e .
94 | python benchmark/run_all.py
95 | ```
96 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling==1.26.3"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "toonify"
 7 | version = "0.0.2"
 8 | description = "TOON (Token-Oriented Object Notation) - A compact, human-readable serialization format for LLMs"
 9 | readme = "README.md"
10 | authors = [
11 |     {name = "TOON Format Contributors"}
12 | ]
13 | license = {text = "MIT"}
14 | classifiers = [
15 |     "Development Status :: 4 - Beta",
16 |     "Intended Audience :: Developers",
17 |     "License :: OSI Approved :: MIT License",
18 |     "Programming Language :: Python :: 3",
19 |     "Programming Language :: Python :: 3.8",
20 |     "Programming Language :: Python :: 3.9",
21 |     "Programming Language :: Python :: 3.10",
22 |     "Programming Language :: Python :: 3.11",
23 |     "Programming Language :: Python :: 3.12",
24 |     "Topic :: Software Development :: Libraries :: Python Modules",
25 |     "Topic :: Text Processing",
26 | ]
27 | keywords = ["serialization", "toon", "json", "csv", "llm", "token-efficient"]
28 | requires-python = ">=3.8"
29 | dependencies = [
30 |     "tiktoken>=0.5.0",
31 | ]
32 | 
33 | [project.optional-dependencies]
34 | dev = [
35 |     "pytest>=7.0.0",
36 |     "pytest-cov>=4.0.0",
37 |     "ruff>=0.8.0",
38 |     "black>=24.0.0",
39 |     "isort>=5.13.0",
40 |     "pylint>=3.3.0",
41 |     "poethepoet>=0.29.0",
42 | ]
43 | pydantic = [
44 |     "pydantic>=1.10.0",
45 | ]
46 | all = [
47 |     "pydantic>=1.10.0",
48 | ]
49 | 
50 | [project.urls]
51 | Homepage = "https://github.com/ScrapeGraphAI/toonify"
52 | Repository = "https://github.com/ScrapeGraphAI/toonify"
53 | Documentation = "https://github.com/ScrapeGraphAI/toonify#readme"
54 | 
55 | [project.scripts]
56 | toon = "toon.cli:main"
57 | 
58 | [tool.hatch.build.targets.wheel]
59 | packages = ["toon"]
60 | 
61 | [tool.hatch.build.targets.sdist]
62 | include = [
63 |     "toon/",
64 |     "README.md",
65 |     "LICENSE",
66 | ]
67 | 
68 | [tool.poe.tasks]
69 | pylint-ci = "pylint toon --output-format=text"
70 | pylint-score-ci = "pylint toon --score=yes"
71 | 
72 | [tool.pytest.ini_options]
73 | testpaths = ["tests"]
74 | python_files = ["test_*.py"]
75 | python_classes = ["Test*"]
76 | python_functions = ["test_*"]
77 | addopts = "-v --cov=toon --cov-report=term-missing"
78 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
 1 | """Tests for TOON CLI."""
 2 | import sys
 3 | import json
 4 | import tempfile
 5 | from pathlib import Path
 6 | from io import StringIO
 7 | import pytest
 8 | 
 9 | from toon.cli import detect_mode, read_input, write_output, count_tokens
10 | 
11 | 
12 | def test_detect_mode_from_extension():
13 |     """Test mode detection from file extension."""
14 |     # JSON file -> encode
15 |     assert detect_mode('test.json', False, False) == 'encode'
16 |     
17 |     # TOON file -> decode
18 |     assert detect_mode('test.toon', False, False) == 'decode'
19 |     
20 |     # Unknown extension -> encode (default)
21 |     assert detect_mode('test.txt', False, False) == 'encode'
22 | 
23 | 
24 | def test_detect_mode_with_flags():
25 |     """Test mode detection with explicit flags."""
26 |     # Force encode
27 |     assert detect_mode('test.toon', True, False) == 'encode'
28 |     
29 |     # Force decode
30 |     assert detect_mode('test.json', False, True) == 'decode'
31 | 
32 | 
33 | def test_detect_mode_stdin():
34 |     """Test mode detection for stdin."""
35 |     # Stdin without flags -> encode (default)
36 |     assert detect_mode('-', False, False) == 'encode'
37 |     assert detect_mode(None, False, False) == 'encode'
38 | 
39 | 
40 | def test_read_input_from_file():
41 |     """Test reading input from file."""
42 |     with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as f:
43 |         f.write('{"test": "value"}')
44 |         f.flush()
45 |         temp_path = f.name
46 |     
47 |     try:
48 |         content = read_input(temp_path)
49 |         assert content == '{"test": "value"}'
50 |     finally:
51 |         Path(temp_path).unlink()
52 | 
53 | 
54 | def test_write_output_to_file():
55 |     """Test writing output to file."""
56 |     with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.toon') as f:
57 |         temp_path = f.name
58 |     
59 |     try:
60 |         write_output('test: value', temp_path)
61 |         
62 |         with open(temp_path, 'r') as f:
63 |             content = f.read()
64 |         
65 |         assert content == 'test: value'
66 |     finally:
67 |         Path(temp_path).unlink()
68 | 
69 | 
70 | def test_count_tokens():
71 |     """Test token counting (if tiktoken available)."""
72 |     result = count_tokens('Hello, world!')
73 |     
74 |     # If tiktoken is available, should return int
75 |     # Otherwise, should return None
76 |     assert result is None or isinstance(result, int)
77 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 |       - pre/*
 7 | 
 8 | jobs:
 9 |   build:
10 |     name: Build
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Install git
14 |         run: |
15 |           sudo apt update
16 |           sudo apt install -y git
17 | 
18 |       - name: Set up Python
19 |         uses: actions/setup-python@v5
20 |         with:
21 |           python-version: '3.10'
22 | 
23 |       - name: Install uv
24 |         uses: astral-sh/setup-uv@v3
25 | 
26 |       - name: Install Node Env
27 |         uses: actions/setup-node@v4
28 |         with:
29 |           node-version: 20
30 | 
31 |       - name: Checkout
32 |         uses: actions/checkout@v4.1.1
33 |         with:
34 |           fetch-depth: 0
35 |           persist-credentials: false
36 | 
37 |       - name: Build and validate package
38 |         run: |
39 |           uv venv
40 |           . .venv/bin/activate
41 |           uv pip install --upgrade hatchling
42 |           uv sync --frozen
43 |           uv pip install -e .
44 |           uv build
45 |           uv pip install --upgrade pkginfo==1.12.0 twine==6.0.1  # Upgrade pkginfo and install twine
46 |           python -m twine check dist/*
47 | 
48 |       - name: Debug Dist Directory
49 |         run: ls -al dist
50 | 
51 |   # Runs semantic-release once the `build` job has succeeded.
52 |   # NOTE(review): the only trigger of this workflow is `push` (see `on:`), so
53 |   # the `pull_request` half of the `if:` expression below can never be true.
54 |   # Pushes to `pre/*` branches other than `pre/beta` run `build` but skip this job.
55 |   release:
56 |     name: Release
57 |     runs-on: ubuntu-latest
58 |     needs: build
59 |     environment: development
60 |     if: >
61 |         github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/pre/beta') ||
62 |         (github.event_name == 'pull_request' && github.event.action == 'closed' && github.event.pull_request.merged &&
63 |          (github.event.pull_request.base.ref == 'main' || github.event.pull_request.base.ref == 'pre/beta'))
64 |     permissions:
65 |       contents: write
66 |       issues: write
67 |       pull-requests: write
68 |       id-token: write
69 |     steps:
70 |       - name: Checkout repo
71 |         uses: actions/checkout@v4.1.1
72 |         with:
73 |           fetch-depth: 0
74 |           persist-credentials: false
75 | 
76 |       - name: Semantic Release
77 |         uses: cycjimmy/semantic-release-action@v4.1.0
78 |         with:
79 |           semantic_version: 23
80 |           extra_plugins: |
81 |             semantic-release-pypi@3
82 |             @semantic-release/git
83 |             @semantic-release/commit-analyzer@12
84 |             @semantic-release/release-notes-generator@13
85 |             @semantic-release/github@10
86 |             @semantic-release/changelog@6
87 |             conventional-changelog-conventionalcommits@7
88 |         env:
89 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
90 |           PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
91 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ "main" ]
17 |   pull_request:
18 |     branches: [ "main" ]
19 |   schedule:
20 |     - cron: '42 19 * * 5'
21 | 
22 | jobs:
23 |   analyze:
24 |     name: Analyze
25 |     # Runner size impacts CodeQL analysis time. To learn more, please see:
26 |     #   - https://gh.io/recommended-hardware-resources-for-running-codeql
27 |     #   - https://gh.io/supported-runners-and-hardware-resources
28 |     #   - https://gh.io/using-larger-runners
29 |     # Consider using larger runners for possible analysis time improvements.
30 |     runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
31 |     timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
32 |     permissions:
33 |       # required for all workflows
34 |       security-events: write
35 | 
36 |       # only required for workflows in private repositories
37 |       actions: read
38 |       contents: read
39 | 
40 |     strategy:
41 |       fail-fast: false
42 |       matrix:
43 |         language: [ 'python' ]
44 |         # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ]
45 |         # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both
46 |         # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
47 |         # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
48 | 
49 |     steps:
50 |     - name: Checkout repository
51 |       uses: actions/checkout@v4
52 | 
53 |     # Initializes the CodeQL tools for scanning.
54 |     - name: Initialize CodeQL
55 |       uses: github/codeql-action/init@v3
56 |       with:
57 |         languages: ${{ matrix.language }}
58 |         # If you wish to specify custom queries, you can do so here or in a config file.
59 |         # By default, queries listed here will override any specified in a config file.
60 |         # Prefix the list here with "+" to use these queries and those in the config file.
61 | 
62 |         # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
63 |         # queries: security-extended,security-and-quality
64 | 


--------------------------------------------------------------------------------
/examples/basic_usage.py:
--------------------------------------------------------------------------------
  1 | """Basic usage examples for TOON format."""
  2 | from toon import encode, decode
  3 | import json
  4 | 
  5 | 
  6 | def example_simple_encoding():
  7 |     """Simple object encoding."""
  8 |     print("=== Simple Encoding ===")
  9 |     
 10 |     data = {
 11 |         'name': 'Alice',
 12 |         'age': 30,
 13 |         'active': True
 14 |     }
 15 |     
 16 |     toon = encode(data)
 17 |     print("Original:", json.dumps(data))
 18 |     print("\nTOON format:")
 19 |     print(toon)
 20 |     print()
 21 | 
 22 | 
 23 | def example_array_encoding():
 24 |     """Array encoding examples."""
 25 |     print("=== Array Encoding ===")
 26 |     
 27 |     # Primitive array
 28 |     data1 = {'numbers': [1, 2, 3, 4, 5]}
 29 |     print("Primitive array:")
 30 |     print(encode(data1))
 31 |     print()
 32 |     
 33 |     # Tabular array (uniform objects)
 34 |     data2 = {
 35 |         'users': [
 36 |             {'id': 1, 'name': 'Alice', 'role': 'admin'},
 37 |             {'id': 2, 'name': 'Bob', 'role': 'user'}
 38 |         ]
 39 |     }
 40 |     print("Tabular array:")
 41 |     print(encode(data2))
 42 |     print()
 43 | 
 44 | 
 45 | def example_nested_structure():
 46 |     """Nested structure encoding."""
 47 |     print("=== Nested Structure ===")
 48 |     
 49 |     data = {
 50 |         'project': 'TOON',
 51 |         'version': '1.0.0',
 52 |         'metadata': {
 53 |             'author': 'TOON Contributors',
 54 |             'created': '2024-01-01',
 55 |             'tags': ['serialization', 'llm', 'format']
 56 |         },
 57 |         'users': [
 58 |             {'id': 1, 'name': 'Alice', 'active': True},
 59 |             {'id': 2, 'name': 'Bob', 'active': False}
 60 |         ]
 61 |     }
 62 |     
 63 |     toon = encode(data)
 64 |     print("TOON format:")
 65 |     print(toon)
 66 |     print()
 67 | 
 68 | 
 69 | def example_decoding():
 70 |     """Decoding examples."""
 71 |     print("=== Decoding ===")
 72 |     
 73 |     toon = """users[2]{id,name,role}:
 74 |   1,Alice,admin
 75 |   2,Bob,user"""
 76 |     
 77 |     data = decode(toon)
 78 |     print("TOON input:")
 79 |     print(toon)
 80 |     print("\nDecoded to Python:")
 81 |     print(json.dumps(data, indent=2))
 82 |     print()
 83 | 
 84 | 
 85 | def example_round_trip():
 86 |     """Round-trip conversion."""
 87 |     print("=== Round-trip Conversion ===")
 88 |     
 89 |     original = {
 90 |         'items': [
 91 |             {'id': 1, 'name': 'Item 1', 'price': 19.99},
 92 |             {'id': 2, 'name': 'Item 2', 'price': 29.99}
 93 |         ]
 94 |     }
 95 |     
 96 |     print("Original:")
 97 |     print(json.dumps(original, indent=2))
 98 |     
 99 |     # Encode
100 |     toon = encode(original)
101 |     print("\nTOON format:")
102 |     print(toon)
103 |     
104 |     # Decode
105 |     result = decode(toon)
106 |     print("\nDecoded back:")
107 |     print(json.dumps(result, indent=2))
108 |     
109 |     # Verify
110 |     print("\nRound-trip successful:", original == result)
111 |     print()
112 | 
113 | 
114 | if __name__ == '__main__':
115 |     example_simple_encoding()
116 |     example_array_encoding()
117 |     example_nested_structure()
118 |     example_decoding()
119 |     example_round_trip()
120 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | ## [1.5.1](https://github.com/ScrapeGraphAI/toonify/compare/v1.5.0...v1.5.1) (2025-11-26)
 2 | 
 3 | 
 4 | ### Bug Fixes
 5 | 
 6 | * **ci:** correct workflow paths from scrapegraphai to toon ([bd77a82](https://github.com/ScrapeGraphAI/toonify/commit/bd77a82edb41ecd7df461468e65ef0d20a6c51b7))
 7 | 
 8 | ## [1.5.0](https://github.com/ScrapeGraphAI/toonify/compare/v1.4.0...v1.5.0) (2025-11-20)
 9 | 
10 | 
11 | ### Features
12 | 
13 | * trigger semantic release ([919fa4c](https://github.com/ScrapeGraphAI/toonify/commit/919fa4cda0e9a8b84eace559c3a12a18497fa766))
14 | 
15 | 
16 | ### Bug Fixes
17 | 
18 | * Preserve nested arrays and objects in tabular format (Issue [#6](https://github.com/ScrapeGraphAI/toonify/issues/6)) ([38eacc0](https://github.com/ScrapeGraphAI/toonify/commit/38eacc0e164414e24892067e52d7367b2342872d))
19 | 
20 | ## [1.4.0](https://github.com/ScrapeGraphAI/toonify/compare/v1.3.0...v1.4.0) (2025-11-13)
21 | 
22 | 
23 | ### Features
24 | 
25 | * Add Pydantic model support for TOON conversion ([0bc9109](https://github.com/ScrapeGraphAI/toonify/commit/0bc9109bda223f503bb227e15ca2fa7183a746cf))
26 | 
27 | ## [1.3.0](https://github.com/ScrapeGraphAI/toonify/compare/v1.2.0...v1.3.0) (2025-11-12)
28 | 
29 | 
30 | ### Features
31 | 
32 | * achieve full feature parity with TypeScript TOON library ([58e375d](https://github.com/ScrapeGraphAI/toonify/commit/58e375dab005f5a3e6cdbebe3ec36e03245cae19))
33 | 
34 | ## [1.2.0](https://github.com/ScrapeGraphAI/toonify/compare/v1.1.1...v1.2.0) (2025-11-12)
35 | 
36 | 
37 | ### Features
38 | 
39 | * add benchmarks ([4e86247](https://github.com/ScrapeGraphAI/toonify/commit/4e86247aad89e8b7eac77a69fef9f73e8a8e3451))
40 | 
41 | ## [1.1.1](https://github.com/ScrapeGraphAI/toonify/compare/v1.1.0...v1.1.1) (2025-11-12)
42 | 
43 | 
44 | ### Bug Fixes
45 | 
46 | * use supported hatchling version to fix release ([d4ee6b2](https://github.com/ScrapeGraphAI/toonify/commit/d4ee6b260fa79471962ad83108855a3cfcd2933b))
47 | 
48 | ## [1.1.0](https://github.com/ScrapeGraphAI/toonify/compare/v1.0.1...v1.1.0) (2025-11-12)
49 | 
50 | 
51 | ### Features
52 | 
53 | * add files ([1716141](https://github.com/ScrapeGraphAI/toonify/commit/1716141ae8fc73895ff19d4cd0178ea5bcc2538f))
54 | 
55 | ## [1.0.1](https://github.com/ScrapeGraphAI/toonify/compare/v1.0.0...v1.0.1) (2025-11-12)
56 | 
57 | 
58 | ### Bug Fixes
59 | 
60 | * release file ([33fc5ff](https://github.com/ScrapeGraphAI/toonify/commit/33fc5ff000f1dea80aaeec4d02282dc406e0f9bc))
61 | 
62 | ## 1.0.0 (2025-11-12)
63 | 
64 | 
65 | ### Features
66 | 
67 | * add chinese readme ([7f9eeca](https://github.com/ScrapeGraphAI/toonify/commit/7f9eeca7d8b64aef7ae6bf797d1a2cfd32eb9635))
68 | * add CI/CD ([9d444fd](https://github.com/ScrapeGraphAI/toonify/commit/9d444fd9738435bf9fed907a0d97b50562ab7218))
69 | * add korean readme ([5c223e0](https://github.com/ScrapeGraphAI/toonify/commit/5c223e06a36eb1ed7edac4c9b8af3ecaafeffd3d))
70 | * add toonify logo ([f3ccd1c](https://github.com/ScrapeGraphAI/toonify/commit/f3ccd1c2b0a37941af41dbb508fe1763f7471c10))
71 | * first git ([303fabd](https://github.com/ScrapeGraphAI/toonify/commit/303fabdca416d06518f28b1007fd99479d3d6046))
72 | 
73 | 
74 | ### Bug Fixes
75 | 
76 | * create release file ([71acc0b](https://github.com/ScrapeGraphAI/toonify/commit/71acc0b8fcf46f4b947ee5a4240dc27bd7cf4a93))
77 | 


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Demo script showcasing TOON format capabilities."""
  3 | import json
  4 | from toon import encode, decode
  5 | 
  6 | def print_section(title):
  7 |     """Print a section header."""
  8 |     print(f"\n{'='*60}")
  9 |     print(f"  {title}")
 10 |     print(f"{'='*60}\n")
 11 | 
 12 | def demo_basic():
 13 |     """Basic encoding/decoding demo."""
 14 |     print_section("Basic TOON Encoding")
 15 |     
 16 |     data = {
 17 |         "name": "Alice",
 18 |         "age": 30,
 19 |         "city": "New York"
 20 |     }
 21 |     
 22 |     print("Python object:")
 23 |     print(json.dumps(data, indent=2))
 24 |     
 25 |     print("\nTOON format:")
 26 |     toon = encode(data)
 27 |     print(toon)
 28 |     
 29 |     print("\nDecoded back:")
 30 |     result = decode(toon)
 31 |     print(json.dumps(result, indent=2))
 32 | 
 33 | def demo_tabular():
 34 |     """Tabular array demo."""
 35 |     print_section("Tabular Arrays - The Power of TOON")
 36 |     
 37 |     data = {
 38 |         "users": [
 39 |             {"id": 1, "name": "Alice Smith", "role": "Engineer", "active": True},
 40 |             {"id": 2, "name": "Bob Jones", "role": "Designer", "active": True},
 41 |             {"id": 3, "name": "Carol White", "role": "Manager", "active": False}
 42 |         ]
 43 |     }
 44 |     
 45 |     json_str = json.dumps(data, indent=2)
 46 |     toon_str = encode(data)
 47 |     
 48 |     print("JSON format:")
 49 |     print(json_str)
 50 |     print(f"\nSize: {len(json_str)} bytes")
 51 |     
 52 |     print("\n" + "-"*60 + "\n")
 53 |     
 54 |     print("TOON format:")
 55 |     print(toon_str)
 56 |     print(f"\nSize: {len(toon_str)} bytes")
 57 |     
 58 |     reduction = (1 - len(toon_str) / len(json_str)) * 100
 59 |     print(f"\n✨ Size reduction: {reduction:.1f}%")
 60 | 
 61 | def demo_nested():
 62 |     """Nested structure demo."""
 63 |     print_section("Nested Structures")
 64 |     
 65 |     data = {
 66 |         "project": "TOON",
 67 |         "metadata": {
 68 |             "version": "1.0.0",
 69 |             "license": "MIT",
 70 |             "contributors": ["Alice", "Bob", "Carol"]
 71 |         }
 72 |     }
 73 |     
 74 |     toon = encode(data)
 75 |     print("TOON format:")
 76 |     print(toon)
 77 |     
 78 |     print("\nDecoded:")
 79 |     result = decode(toon)
 80 |     print(json.dumps(result, indent=2))
 81 | 
 82 | def demo_delimiters():
 83 |     """Different delimiter demo."""
 84 |     print_section("Custom Delimiters")
 85 |     
 86 |     data = {
 87 |         "items": [
 88 |             {"code": "A001", "name": "Widget", "price": 19.99},
 89 |             {"code": "B002", "name": "Gadget", "price": 29.99}
 90 |         ]
 91 |     }
 92 |     
 93 |     print("Tab delimiter (for spreadsheets):")
 94 |     toon_tab = encode(data, {"delimiter": "tab"})
 95 |     print(toon_tab)
 96 |     
 97 |     print("\nPipe delimiter (when data has commas):")
 98 |     toon_pipe = encode(data, {"delimiter": "pipe"})
 99 |     print(toon_pipe)
100 | 
def demo_key_folding():
    """Contrast default encoding with ``key_folding='safe'`` on a single-key chain.

    Also decodes the folded text with ``expand_paths='safe'`` and prints the
    result as JSON.
    """
    print_section("Key Folding for Deeply Nested Data")

    # Build {"api": {"response": {"data": {"user": {"name": "Alice"}}}}}
    # from the innermost object outwards.
    data = {"name": "Alice"}
    for wrapper_key in ("user", "data", "response", "api"):
        data = {wrapper_key: data}

    print("Without key folding:")
    plain = encode(data)
    print(plain)

    print("\nWith key folding:")
    folded = encode(data, {"key_folding": "safe"})
    print(folded)

    print("\nWith path expansion on decode:")
    expanded = decode(folded, {"expand_paths": "safe"})
    print(json.dumps(expanded, indent=2))
128 | 
def main():
    """Print the banner, run every demo in order, then print the summary."""
    banner_rule = "=" * 60
    print("\n" + banner_rule)
    print("  TOON FORMAT LIBRARY - INTERACTIVE DEMO")
    print("  Token-Oriented Object Notation for LLMs")
    print(banner_rule)

    demos = (
        demo_basic,
        demo_tabular,
        demo_nested,
        demo_delimiters,
        demo_key_folding,
    )
    for run_demo in demos:
        run_demo()

    print_section("Summary")
    closing_lines = (
        "✨ TOON achieves 30-60% size reduction vs JSON",
        "✨ Perfect for LLM prompts and context windows",
        "✨ Human-readable and easy to edit",
        "✨ Fully reversible - no data loss",
        "\nTry it yourself:",
        "  pip install toon-format",
        "  echo '{\"hello\": \"world\"}' | toon -e",
    )
    for line in closing_lines:
        print(line)
    print()
151 | 
# Script entry point: run the full demo only when executed directly.
if __name__ == "__main__":
    main()
154 | 


--------------------------------------------------------------------------------
/tests/test_nested_arrays.py:
--------------------------------------------------------------------------------
  1 | """Tests for nested arrays and objects within tabular arrays (Issue #6)."""
  2 | import pytest
  3 | from toon import encode, decode
  4 | 
  5 | 
  6 | def test_array_of_objects_with_nested_arrays():
  7 |     """Test encoding/decoding arrays of objects that contain nested arrays."""
  8 |     original = {
  9 |         "categorization": [
 10 |             {
 11 |                 "id": "01.04.04.01.",
 12 |                 "label": "Aspetti generali",
 13 |                 "hierarchy": [
 14 |                     "Prodotti",
 15 |                     "Organizzazione altro e Sito Internet",
 16 |                     "Aspetti generali",
 17 |                     "Aspetti generali"
 18 |                 ],
 19 |                 "score": 900,
 20 |                 "winner": True,
 21 |                 "namespace": "$namespace",
 22 |                 "frequency": 0,
 23 |                 "offset": [
 24 |                     {"start": 511, "end": 520},
 25 |                     {"start": 524, "end": 527},
 26 |                     {"start": 528, "end": 543}
 27 |                 ]
 28 |             }
 29 |         ]
 30 |     }
 31 |     
 32 |     # Encode
 33 |     toon = encode(original)
 34 |     print("Encoded TOON:")
 35 |     print(toon)
 36 |     
 37 |     # Decode
 38 |     result = decode(toon)
 39 |     print("\nDecoded result:")
 40 |     print(result)
 41 |     
 42 |     # Verify all fields are preserved
 43 |     assert result == original, "Decoded data should match original"
 44 |     assert "hierarchy" in result["categorization"][0], "hierarchy field should be preserved"
 45 |     assert "offset" in result["categorization"][0], "offset field should be preserved"
 46 |     assert len(result["categorization"][0]["hierarchy"]) == 4, "hierarchy array should have 4 items"
 47 |     assert len(result["categorization"][0]["offset"]) == 3, "offset array should have 3 items"
 48 | 
 49 | 
 50 | def test_array_of_objects_with_nested_objects():
 51 |     """Test encoding/decoding arrays where objects contain nested objects."""
 52 |     original = {
 53 |         "users": [
 54 |             {
 55 |                 "id": 1,
 56 |                 "name": "Alice",
 57 |                 "address": {
 58 |                     "street": "123 Main St",
 59 |                     "city": "NYC"
 60 |                 }
 61 |             },
 62 |             {
 63 |                 "id": 2,
 64 |                 "name": "Bob",
 65 |                 "address": {
 66 |                     "street": "456 Oak Ave",
 67 |                     "city": "LA"
 68 |                 }
 69 |             }
 70 |         ]
 71 |     }
 72 |     
 73 |     # Encode
 74 |     toon = encode(original)
 75 |     print("Encoded TOON:")
 76 |     print(toon)
 77 |     
 78 |     # Decode
 79 |     result = decode(toon)
 80 |     
 81 |     # Verify all fields are preserved
 82 |     assert result == original
 83 |     assert "address" in result["users"][0]
 84 |     assert result["users"][0]["address"]["city"] == "NYC"
 85 | 
 86 | 
 87 | def test_array_of_objects_mixed_primitive_and_nested():
 88 |     """Test arrays with both primitive and nested fields."""
 89 |     original = {
 90 |         "items": [
 91 |             {
 92 |                 "id": 1,
 93 |                 "name": "Item A",
 94 |                 "tags": ["tag1", "tag2"],
 95 |                 "price": 10.5
 96 |             },
 97 |             {
 98 |                 "id": 2,
 99 |                 "name": "Item B",
100 |                 "tags": ["tag3"],
101 |                 "price": 20.0
102 |             }
103 |         ]
104 |     }
105 |     
106 |     # Encode
107 |     toon = encode(original)
108 |     
109 |     # Decode
110 |     result = decode(toon)
111 |     
112 |     # Verify all fields are preserved
113 |     assert result == original
114 |     assert "tags" in result["items"][0]
115 |     assert len(result["items"][0]["tags"]) == 2
116 |     assert len(result["items"][1]["tags"]) == 1
117 | 
118 | 
def test_roundtrip_complex_nested_structure():
    """Two consecutive round trips must reproduce the data and the same TOON text."""
    first_row = {
        "id": "A1",
        "value": 100,
        "metadata": {"created": "2024-01-01", "tags": ["important", "urgent"]},
        "scores": [95, 87, 92],
    }
    second_row = {
        "id": "A2",
        "value": 200,
        "metadata": {"created": "2024-01-02", "tags": ["normal"]},
        "scores": [88, 90],
    }
    original = {"data": [first_row, second_row]}

    # First pass: original -> TOON -> data
    first_text = encode(original)
    first_data = decode(first_text)
    assert first_data == original

    # Second pass: re-encode the decoded data and decode again
    second_text = encode(first_data)
    second_data = decode(second_text)
    assert second_data == original
    # Encoding is expected to be stable across passes.
    assert first_text == second_text
154 | 
155 | 
def test_array_of_objects_some_with_nested_some_without():
    """Rows with differing key sets: only the first row has a nested ``extra``."""
    with_extra = {"id": 1, "name": "Record A", "extra": {"note": "Has nested"}}
    without_extra = {"id": 2, "name": "Record B"}  # deliberately no 'extra' key
    original = {"records": [with_extra, without_extra]}

    # Encode, then decode straight back.
    decoded = decode(encode(original))

    # The optional field must stay present on row 0 and absent on row 1.
    first_record, second_record = decoded["records"][0], decoded["records"][1]
    assert "extra" in first_record
    assert "extra" not in second_record
    assert first_record["extra"]["note"] == "Has nested"
183 | 
184 | 


--------------------------------------------------------------------------------
/examples/advanced_features.py:
--------------------------------------------------------------------------------
  1 | """Advanced features examples for TOON format."""
  2 | from toon import encode, decode
  3 | import json
  4 | 
  5 | 
  6 | def example_custom_delimiter():
  7 |     """Using different delimiters."""
  8 |     print("=== Custom Delimiters ===")
  9 |     
 10 |     data = {
 11 |         'users': [
 12 |             {'id': 1, 'name': 'Alice', 'dept': 'Engineering'},
 13 |             {'id': 2, 'name': 'Bob', 'dept': 'Sales'}
 14 |         ]
 15 |     }
 16 |     
 17 |     # Comma delimiter (default)
 18 |     print("Comma delimiter:")
 19 |     print(encode(data, {'delimiter': 'comma'}))
 20 |     print()
 21 |     
 22 |     # Tab delimiter
 23 |     print("Tab delimiter:")
 24 |     print(encode(data, {'delimiter': 'tab'}))
 25 |     print()
 26 |     
 27 |     # Pipe delimiter
 28 |     print("Pipe delimiter:")
 29 |     print(encode(data, {'delimiter': 'pipe'}))
 30 |     print()
 31 | 
 32 | 
 33 | def example_key_folding():
 34 |     """Key folding for deeply nested single-key objects."""
 35 |     print("=== Key Folding ===")
 36 |     
 37 |     data = {
 38 |         'response': {
 39 |             'data': {
 40 |                 'user': {
 41 |                     'profile': {
 42 |                         'name': 'Alice'
 43 |                     }
 44 |                 }
 45 |             }
 46 |         }
 47 |     }
 48 |     
 49 |     # Without key folding
 50 |     print("Without key folding:")
 51 |     print(encode(data, {'key_folding': 'off'}))
 52 |     print()
 53 |     
 54 |     # With key folding
 55 |     print("With key folding:")
 56 |     print(encode(data, {'key_folding': 'safe'}))
 57 |     print()
 58 | 
 59 | 
 60 | def example_path_expansion():
 61 |     """Path expansion during decoding."""
 62 |     print("=== Path Expansion ===")
 63 |     
 64 |     # TOON with dotted keys
 65 |     toon = 'user.profile.name: Alice\nuser.profile.age: 30'
 66 |     
 67 |     # Without expansion
 68 |     print("Without path expansion:")
 69 |     result_no_expand = decode(toon, {'expand_paths': 'off'})
 70 |     print(json.dumps(result_no_expand, indent=2))
 71 |     print()
 72 |     
 73 |     # With expansion
 74 |     print("With path expansion:")
 75 |     result_expand = decode(toon, {'expand_paths': 'safe'})
 76 |     print(json.dumps(result_expand, indent=2))
 77 |     print()
 78 | 
 79 | 
 80 | def example_custom_indentation():
 81 |     """Custom indentation size."""
 82 |     print("=== Custom Indentation ===")
 83 |     
 84 |     data = {
 85 |         'parent': {
 86 |             'child': {
 87 |                 'value': 42
 88 |             }
 89 |         }
 90 |     }
 91 |     
 92 |     # 2 spaces (default)
 93 |     print("2-space indent:")
 94 |     print(encode(data, {'indent': 2}))
 95 |     print()
 96 |     
 97 |     # 4 spaces
 98 |     print("4-space indent:")
 99 |     print(encode(data, {'indent': 4}))
100 |     print()
101 | 
102 | 
def example_special_characters():
    """Encode and decode strings containing commas, backslashes, quotes and newlines.

    One value is the string 'true', i.e. text that looks like a boolean.
    """
    print("=== Special Characters ===")

    samples = {
        'message': 'Hello, World!',
        'path': 'C:\\Users\\Alice\\Documents',
        'quote': 'He said "hello"',
        'multiline': 'Line 1\nLine 2\nLine 3',
        'looks_like_bool': 'true',
    }

    encoded = encode(samples)
    print("Encoded:")
    print(encoded)
    print()

    # Parse the TOON text back and show it as JSON.
    restored = decode(encoded)
    print("Decoded:")
    print(json.dumps(restored, indent=2))
    print()
125 | 
126 | 
def example_mixed_arrays():
    """Print the encoding of number, string, uniform-object and mixed-object arrays."""
    print("=== Mixed Arrays ===")

    uniform = [
        {'id': 1, 'name': 'Alice'},
        {'id': 2, 'name': 'Bob'},
    ]
    # The two objects below intentionally have different field names.
    mixed = [
        {'id': 1, 'type': 'A'},
        {'id': 2, 'category': 'B'},
    ]
    data = {
        'primitive_array': [1, 2, 3, 4, 5],
        'string_array': ['apple', 'banana', 'cherry'],
        'uniform_objects': uniform,
        'mixed_objects': mixed,
    }

    encoded = encode(data)
    print("Encoded:")
    print(encoded)
    print()
148 | 
149 | 
def example_empty_values():
    """Encode and decode None, an empty string, an empty list and empty objects."""
    print("=== Empty and Null Values ===")

    data = {
        'null_value': None,
        'empty_string': '',
        'empty_array': [],
        'empty_object': {},
        'nested_empty': {'inner': {}},
    }

    encoded = encode(data)
    print("Encoded:")
    print(encoded)
    print()

    restored = decode(encoded)
    print("Decoded:")
    print(json.dumps(restored, indent=2))
    print()
173 | 
174 | 
def example_token_efficiency():
    """Print a user table as JSON and as TOON and report the size reduction.

    Sizes are character counts of the two strings (labelled "bytes" in the
    output); the reduction is relative to the indented JSON text.
    """
    print("=== Token Efficiency ===")

    columns = ('id', 'name', 'email', 'active')
    rows = (
        (1, 'Alice Johnson', 'alice@example.com', True),
        (2, 'Bob Smith', 'bob@example.com', True),
        (3, 'Charlie Brown', 'charlie@example.com', False),
    )
    data = {'users': [dict(zip(columns, row)) for row in rows]}

    # Both serializations are produced up front, before anything else is printed.
    as_json = json.dumps(data, indent=2)
    as_toon = encode(data)

    print("JSON format:")
    print(as_json)
    print(f"\nJSON size: {len(as_json)} bytes")

    print("\nTOON format:")
    print(as_toon)
    print(f"\nTOON size: {len(as_toon)} bytes")

    reduction = (1 - len(as_toon) / len(as_json)) * 100
    print(f"\nSize reduction: {reduction:.1f}%")
    print()
201 | 
202 | 
# Script entry point: run each advanced example in sequence when this file
# is executed directly (nothing runs on import).
if __name__ == '__main__':
    example_custom_delimiter()
    example_key_folding()
    example_path_expansion()
    example_custom_indentation()
    example_special_characters()
    example_mixed_arrays()
    example_empty_values()
    example_token_efficiency()
212 | 


--------------------------------------------------------------------------------
/benchmark/memory_benchmark.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Memory usage benchmark comparing TOON vs JSON.
  4 | Shows actual memory consumption when working with serialized data.
  5 | """
  6 | 
  7 | import json
  8 | import sys
  9 | from pathlib import Path
 10 | 
 11 | # Add parent directory to path for imports
 12 | sys.path.insert(0, str(Path(__file__).parent.parent))
 13 | 
 14 | from toon import encode, decode
 15 | 
 16 | 
 17 | def get_memory_size(obj):
 18 |     """
 19 |     Get approximate memory size of a Python object in bytes.
 20 |     This uses sys.getsizeof recursively for nested structures.
 21 |     """
 22 |     import sys
 23 |     from collections.abc import Mapping, Iterable
 24 | 
 25 |     seen = set()
 26 | 
 27 |     def sizeof(o):
 28 |         if id(o) in seen:
 29 |             return 0
 30 |         seen.add(id(o))
 31 | 
 32 |         size = sys.getsizeof(o)
 33 | 
 34 |         if isinstance(o, Mapping):
 35 |             size += sum(sizeof(k) + sizeof(v) for k, v in o.items())
 36 |         elif isinstance(o, Iterable) and not isinstance(o, (str, bytes, bytearray)):
 37 |             size += sum(sizeof(item) for item in o)
 38 | 
 39 |         return size
 40 | 
 41 |     return sizeof(obj)
 42 | 
 43 | 
 44 | def format_size(size_bytes: int) -> str:
 45 |     """Format bytes in human-readable format."""
 46 |     if size_bytes < 1024:
 47 |         return f"{size_bytes}B"
 48 |     elif size_bytes < 1024 * 1024:
 49 |         return f"{size_bytes / 1024:.2f}KB"
 50 |     else:
 51 |         return f"{size_bytes / (1024 * 1024):.2f}MB"
 52 | 
 53 | 
 54 | def benchmark_memory(name: str, data: dict):
 55 |     """Benchmark memory usage for a dataset."""
 56 |     print(f"\n{'='*60}")
 57 |     print(f"Memory Benchmark: {name}")
 58 |     print(f"{'='*60}")
 59 | 
 60 |     # Python object memory
 61 |     obj_memory = get_memory_size(data)
 62 |     print(f"\n📦 Python Object:")
 63 |     print(f"  Memory: {format_size(obj_memory):>10} ({obj_memory:,} bytes)")
 64 | 
 65 |     # JSON string memory
 66 |     json_str = json.dumps(data, indent=2)
 67 |     json_str_memory = sys.getsizeof(json_str)
 68 |     json_bytes = json_str.encode('utf-8')
 69 |     json_bytes_memory = len(json_bytes)
 70 | 
 71 |     print(f"\n📄 JSON Format:")
 72 |     print(f"  String object memory: {format_size(json_str_memory):>10} ({json_str_memory:,} bytes)")
 73 |     print(f"  UTF-8 bytes size:     {format_size(json_bytes_memory):>10} ({json_bytes_memory:,} bytes)")
 74 | 
 75 |     # TOON string memory
 76 |     toon_str = encode(data)
 77 |     toon_str_memory = sys.getsizeof(toon_str)
 78 |     toon_bytes = toon_str.encode('utf-8')
 79 |     toon_bytes_memory = len(toon_bytes)
 80 | 
 81 |     print(f"\n🎯 TOON Format:")
 82 |     print(f"  String object memory: {format_size(toon_str_memory):>10} ({toon_str_memory:,} bytes)")
 83 |     print(f"  UTF-8 bytes size:     {format_size(toon_bytes_memory):>10} ({toon_bytes_memory:,} bytes)")
 84 | 
 85 |     # Calculate savings
 86 |     string_savings = ((json_str_memory - toon_str_memory) / json_str_memory) * 100
 87 |     bytes_savings = ((json_bytes_memory - toon_bytes_memory) / json_bytes_memory) * 100
 88 | 
 89 |     print(f"\n💾 Memory Savings:")
 90 |     print(f"  String memory: {string_savings:.1f}% smaller")
 91 |     print(f"  Bytes size:    {bytes_savings:.1f}% smaller")
 92 | 
 93 |     # Practical impact
 94 |     print(f"\n💡 Practical Impact:")
 95 |     print(f"  If you send this data to an LLM API:")
 96 |     print(f"    • JSON uses {json_bytes_memory:,} bytes of network bandwidth")
 97 |     print(f"    • TOON uses {toon_bytes_memory:,} bytes of network bandwidth")
 98 |     print(f"    • You save {json_bytes_memory - toon_bytes_memory:,} bytes per request!")
 99 | 
100 |     return {
101 |         'name': name,
102 |         'obj_memory': obj_memory,
103 |         'json_str_memory': json_str_memory,
104 |         'json_bytes_memory': json_bytes_memory,
105 |         'toon_str_memory': toon_str_memory,
106 |         'toon_bytes_memory': toon_bytes_memory,
107 |         'string_savings': string_savings,
108 |         'bytes_savings': bytes_savings,
109 |     }
110 | 
111 | 
def main():
    """Run the memory benchmark on three sample datasets and print a summary table."""
    from sample_datasets import DATASETS

    rule = "=" * 60
    dash_rule = "-" * 60

    print(rule)
    print("TOON Memory Usage Benchmark")
    print(rule)
    print("\nThis benchmark shows actual memory consumption when")
    print("working with JSON vs TOON serialized data.")

    # Only a subset of the available datasets is benchmarked here.
    selected_names = (
        "E-commerce Products",
        "Database Results",
        "Large Inventory (100 items)",
    )
    test_datasets = {label: DATASETS[label] for label in selected_names}

    results = [
        benchmark_memory(label, dataset)
        for label, dataset in test_datasets.items()
    ]

    # Per-dataset summary table
    print(f"\n{rule}")
    print("MEMORY SAVINGS SUMMARY")
    print(rule)
    print(f"\n{'Dataset':<35} {'Bytes Saved':<15} {'% Saved':<10}")
    print(dash_rule)

    for row in results:
        saved = row['json_bytes_memory'] - row['toon_bytes_memory']
        print(f"{row['name']:<35} {format_size(saved):<15} {row['bytes_savings']:>6.1f}%")

    # Totals across all benchmarked datasets
    total_json = sum(row['json_bytes_memory'] for row in results)
    total_toon = sum(row['toon_bytes_memory'] for row in results)
    total_saved = total_json - total_toon
    avg_savings = ((total_json - total_toon) / total_json) * 100

    print(dash_rule)
    print(f"{'TOTAL':<35} {format_size(total_saved):<15} {avg_savings:>6.1f}%")

    print("\n🎉 Key Findings:")
    print(f"  • TOON reduces serialized data size by {avg_savings:.1f}% on average")
    print("  • This means less memory usage, faster network transfers")
    print("  • And most importantly: lower LLM API costs!")
    print("\n✅ Memory benchmark completed successfully!")
159 | 
160 | 
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()
163 | 


--------------------------------------------------------------------------------
/toon/pydantic_converter.py:
--------------------------------------------------------------------------------
  1 | """Pydantic model converter for TOON format."""
  2 | from typing import Any, Dict, List, Optional, Union
  3 | from .encoder import encode, EncoderOptions
  4 | 
  5 | 
  6 | def encode_pydantic(
  7 |     model: Any,
  8 |     options: Optional[Dict[str, Any]] = None,
  9 |     exclude_unset: bool = False,
 10 |     exclude_none: bool = False,
 11 |     exclude_defaults: bool = False,
 12 |     by_alias: bool = False
 13 | ) -> str:
 14 |     """
 15 |     Encode a Pydantic model to TOON format.
 16 |     
 17 |     This function converts Pydantic models (v1 or v2) to TOON format by first
 18 |     converting them to dictionaries and then encoding to TOON.
 19 |     
 20 |     Args:
 21 |         model: Pydantic model instance or list of model instances
 22 |         options: Encoding options (same as `encode` function)
 23 |             - delimiter: ',' (default), '\t', or '|'
 24 |             - indent: int (default 2)
 25 |             - key_folding: 'off' (default) or 'safe'
 26 |             - flatten_depth: int or None
 27 |         exclude_unset: If True, exclude fields that were not explicitly set
 28 |         exclude_none: If True, exclude fields with None values
 29 |         exclude_defaults: If True, exclude fields with default values
 30 |         by_alias: If True, use field aliases instead of field names
 31 |             
 32 |     Returns:
 33 |         TOON formatted string
 34 |         
 35 |     Raises:
 36 |         ImportError: If pydantic is not installed
 37 |         ValueError: If the model is not a valid Pydantic model
 38 |         
 39 |     Example:
 40 |         >>> from pydantic import BaseModel
 41 |         >>> class User(BaseModel):
 42 |         ...     id: int
 43 |         ...     name: str
 44 |         ...     email: str
 45 |         >>> 
 46 |         >>> users = [
 47 |         ...     User(id=1, name='Alice', email='alice@example.com'),
 48 |         ...     User(id=2, name='Bob', email='bob@example.com')
 49 |         ... ]
 50 |         >>> print(encode_pydantic(users))
 51 |         [2]{id,name,email}:
 52 |           1,Alice,alice@example.com
 53 |           2,Bob,bob@example.com
 54 |     """
 55 |     try:
 56 |         from pydantic import BaseModel
 57 |     except ImportError:
 58 |         raise ImportError(
 59 |             "pydantic is required for encode_pydantic(). "
 60 |             "Install it with: pip install pydantic"
 61 |         )
 62 |     
 63 |     # Convert model(s) to dict
 64 |     data = _pydantic_to_dict(
 65 |         model,
 66 |         exclude_unset=exclude_unset,
 67 |         exclude_none=exclude_none,
 68 |         exclude_defaults=exclude_defaults,
 69 |         by_alias=by_alias
 70 |     )
 71 |     
 72 |     # Encode to TOON
 73 |     return encode(data, options)
 74 | 
 75 | 
 76 | def _pydantic_to_dict(
 77 |     model: Any,
 78 |     exclude_unset: bool = False,
 79 |     exclude_none: bool = False,
 80 |     exclude_defaults: bool = False,
 81 |     by_alias: bool = False
 82 | ) -> Union[Dict, List]:
 83 |     """
 84 |     Convert Pydantic model(s) to dictionary/list.
 85 |     
 86 |     Supports both Pydantic v1 and v2.
 87 |     """
 88 |     try:
 89 |         from pydantic import BaseModel
 90 |     except ImportError:
 91 |         raise ImportError("pydantic is not installed")
 92 |     
 93 |     # Handle list of models
 94 |     if isinstance(model, list):
 95 |         return [
 96 |             _pydantic_to_dict(
 97 |                 item,
 98 |                 exclude_unset=exclude_unset,
 99 |                 exclude_none=exclude_none,
100 |                 exclude_defaults=exclude_defaults,
101 |                 by_alias=by_alias
102 |             )
103 |             for item in model
104 |         ]
105 |     
106 |     # Verify it's a Pydantic model
107 |     if not isinstance(model, BaseModel):
108 |         raise ValueError(
109 |             f"Expected Pydantic BaseModel instance, got {type(model).__name__}"
110 |         )
111 |     
112 |     # Try Pydantic v2 first, fall back to v1
113 |     try:
114 |         # Pydantic v2
115 |         return model.model_dump(
116 |             exclude_unset=exclude_unset,
117 |             exclude_none=exclude_none,
118 |             exclude_defaults=exclude_defaults,
119 |             by_alias=by_alias,
120 |             mode='python'
121 |         )
122 |     except AttributeError:
123 |         # Pydantic v1
124 |         return model.dict(
125 |             exclude_unset=exclude_unset,
126 |             exclude_none=exclude_none,
127 |             exclude_defaults=exclude_defaults,
128 |             by_alias=by_alias
129 |         )
130 | 
131 | 
def decode_to_pydantic(toon_string: str, model_class: type, options: Optional[Dict[str, Any]] = None) -> Any:
    """
    Decode TOON string to Pydantic model(s).

    The string is decoded with ``decode``. A decoded dict is passed to
    ``model_class`` as keyword arguments and yields one instance; a decoded
    list of dicts yields a list of instances, one per dict.

    Args:
        toon_string: TOON formatted string
        model_class: Pydantic model class to instantiate
        options: Decoding options (same as `decode` function)
            - strict: bool (default True) - validate structure
            - expand_paths: 'off' (default) or 'safe'
            - default_delimiter: ',' (default)

    Returns:
        Pydantic model instance or list of instances

    Raises:
        ImportError: If pydantic is not installed
        ValueError: If model_class is not a valid Pydantic model class, if a
            decoded list contains non-dict items, or if the decoded value is
            neither a dict nor a list

    Example:
        >>> from pydantic import BaseModel
        >>> class User(BaseModel):
        ...     id: int
        ...     name: str
        ...     email: str
        >>>
        >>> toon = '''[2]{id,name,email}:
        ...   1,Alice,alice@example.com
        ...   2,Bob,bob@example.com'''
        >>> users = decode_to_pydantic(toon, User)
        >>> print(users[0].name)
        Alice
    """
    try:
        from pydantic import BaseModel
    except ImportError as exc:
        # Chain the original failure so the underlying cause stays visible.
        raise ImportError(
            "pydantic is required for decode_to_pydantic(). "
            "Install it with: pip install pydantic"
        ) from exc

    from .decoder import decode

    # Verify model_class is a Pydantic model class (not an instance, not an
    # unrelated class).
    if not (isinstance(model_class, type) and issubclass(model_class, BaseModel)):
        if isinstance(model_class, type):
            # type(model_class) would only ever say "type" here; name the
            # offending class itself instead.
            received = f"class {model_class.__name__}"
        else:
            received = type(model_class).__name__
        raise ValueError(f"Expected Pydantic BaseModel class, got {received}")

    # Decode TOON to dict/list
    data = decode(toon_string, options)

    # Convert to Pydantic model(s)
    if isinstance(data, dict):
        return model_class(**data)
    if isinstance(data, list):
        if not all(isinstance(item, dict) for item in data):
            raise ValueError("All items in the decoded list must be dicts to convert to Pydantic models")
        return [model_class(**item) for item in data]
    raise ValueError(f"Cannot convert {type(data).__name__} to Pydantic model")
193 | 
194 | 


--------------------------------------------------------------------------------
/examples/pydantic_usage.py:
--------------------------------------------------------------------------------
  1 | """Examples demonstrating Pydantic model conversion with TOON."""
  2 | try:
  3 |     from pydantic import BaseModel, Field
  4 |     from toon import encode_pydantic, decode_to_pydantic, encode
  5 |     import json
  6 | except ImportError as e:
  7 |     print(f"Error: {e}")
  8 |     print("Please install pydantic: pip install pydantic")
  9 |     exit(1)
 10 | 
 11 | 
 12 | # Define Pydantic models
class Address(BaseModel):
    """Address model; used as the optional nested ``address`` field of ``User``."""
    street: str
    city: str
    state: str
    # Declared as a string; the nested-model example passes '02101'.
    zipcode: str
 19 | 
 20 | 
class User(BaseModel):
    """User model with required id/name/email and three optional fields."""
    id: int
    name: str
    email: str
    # Optional; defaults to None.
    age: int | None = None
    # Optional; defaults to True.
    active: bool = True
    # Optional nested Address model; defaults to None.
    address: Address | None = None
 29 | 
 30 | 
class Product(BaseModel):
    """Product model with an optional list of string tags."""
    sku: str
    name: str
    price: float
    stock: int
    # Optional; defaults to an empty list.
    tags: list[str] = []
 38 | 
 39 | 
class Order(BaseModel):
    """Order model with nested products and aliased field names."""
    # Declared with alias 'orderId'; example_field_aliases constructs the
    # model using that alias.
    order_id: str = Field(alias='orderId')
    # Declared with alias 'customerName'.
    customer_name: str = Field(alias='customerName')
    # Nested Product models.
    products: list[Product]
    total: float
 46 | 
 47 | 
 48 | def example_simple_model():
 49 |     """Example: Simple Pydantic model to TOON."""
 50 |     print("=== Simple Pydantic Model ===")
 51 |     
 52 |     user = User(
 53 |         id=1,
 54 |         name='Alice Smith',
 55 |         email='alice@example.com',
 56 |         age=30,
 57 |         active=True
 58 |     )
 59 |     
 60 |     print("Python object:")
 61 |     print(f"  {user}")
 62 |     print()
 63 |     
 64 |     toon = encode_pydantic(user)
 65 |     print("TOON format:")
 66 |     print(toon)
 67 |     print()
 68 | 
 69 | 
 70 | def example_list_of_models():
 71 |     """Example: List of uniform Pydantic models (tabular format)."""
 72 |     print("=== List of Pydantic Models (Tabular) ===")
 73 |     
 74 |     products = [
 75 |         Product(sku='LAP-001', name='Gaming Laptop', price=1299.99, stock=15, tags=['electronics', 'computers']),
 76 |         Product(sku='MOU-042', name='Wireless Mouse', price=29.99, stock=128, tags=['electronics', 'accessories']),
 77 |         Product(sku='KEY-789', name='Mechanical Keyboard', price=149.99, stock=67, tags=['electronics', 'accessories'])
 78 |     ]
 79 |     
 80 |     print("Python objects:")
 81 |     for p in products:
 82 |         print(f"  {p.sku}: {p.name} - ${p.price}")
 83 |     print()
 84 |     
 85 |     # Compare with regular dict encoding
 86 |     dict_data = {'products': [p.model_dump() if hasattr(p, 'model_dump') else p.dict() for p in products]}
 87 |     json_str = json.dumps(dict_data)
 88 |     toon_dict = encode(dict_data)
 89 |     toon_pydantic = encode_pydantic(products)
 90 |     
 91 |     print(f"JSON size: {len(json_str)} bytes")
 92 |     print(f"TOON size (from dict): {len(toon_dict)} bytes")
 93 |     print(f"TOON size (from pydantic): {len(toon_pydantic)} bytes")
 94 |     print()
 95 |     
 96 |     print("TOON format:")
 97 |     print(toon_pydantic)
 98 |     print()
 99 | 
100 | 
def example_nested_models():
    """Example: encode a ``User`` that carries a nested ``Address``."""
    print("=== Nested Pydantic Models ===")

    home = Address(street='123 Main Street', city='Boston', state='MA', zipcode='02101')
    bob = User(
        id=2,
        name='Bob Johnson',
        email='bob@example.com',
        age=35,
        active=True,
        address=home,
    )

    encoded = encode_pydantic(bob)
    print("TOON format:")
    print(encoded)
    print()
123 | 
124 | 
def example_exclude_options():
    """Example: show how the exclude_* options change the encoded output."""
    print("=== Exclude Options ===")

    # Only the required fields are set; age, active and address keep defaults.
    charlie = User(id=3, name='Charlie Brown', email='charlie@example.com')

    variants = (
        ("All fields (default):", {'exclude_unset': False}),
        ("Exclude unset fields:", {'exclude_unset': True}),
        ("Exclude None values:", {'exclude_none': True}),
    )
    for heading, flags in variants:
        print(heading)
        print(encode_pydantic(charlie, **flags))
        print()
150 | 
151 | 
def example_field_aliases():
    """Example: encode an ``Order`` with and without ``by_alias``."""
    print("=== Field Aliases ===")

    line_items = [
        Product(sku='LAP-001', name='Gaming Laptop', price=1299.99, stock=15),
        Product(sku='MOU-042', name='Wireless Mouse', price=29.99, stock=128),
    ]
    # The model is constructed through its alias names.
    purchase = Order(
        orderId='ORD-12345',
        customerName='Diana Prince',
        products=line_items,
        total=1329.98,
    )

    headings = (
        ("Without aliases (internal field names):", False),
        ("With aliases (API field names):", True),
    )
    for heading, use_alias in headings:
        print(heading)
        print(encode_pydantic(purchase, by_alias=use_alias))
        print()
175 | 
176 | 
def example_decoding():
    """Example: decode a TOON table into a list of ``User`` models."""
    print("=== Decoding TOON to Pydantic ===")

    # Header line followed by three rows, each indented by two spaces.
    document = "\n".join((
        "[3]{id,name,email,age,active}:",
        "  1,Alice Smith,alice@example.com,30,true",
        "  2,Bob Johnson,bob@example.com,35,true",
        "  3,Charlie Brown,charlie@example.com,28,false",
    ))

    print("TOON input:")
    print(document)
    print()

    people = decode_to_pydantic(document, User)

    print("Decoded Pydantic models:")
    for person in people:
        print(f"  User(id={person.id}, name='{person.name}', age={person.age}, active={person.active})")
    print()
198 | 
199 | 
def example_roundtrip():
    """Example: encode ``Product`` models to TOON and decode them back.

    Prints the original objects, the TOON text, the decoded objects, and
    whether the round trip preserved every item (same count and identical
    sku, name, price and stock for each pair).
    """
    print("=== Round-trip Conversion ===")

    original = [
        Product(sku='KEY-001', name='Wireless Keyboard', price=79.99, stock=45, tags=['wireless', 'keyboard']),
        Product(sku='MOU-002', name='Gaming Mouse', price=59.99, stock=78, tags=['gaming', 'mouse']),
    ]

    print("Original objects:")
    for p in original:
        print(f"  {p.sku}: {p.name} - ${p.price} (stock: {p.stock})")
    print()

    # Encode to TOON
    toon = encode_pydantic(original)
    print("TOON format:")
    print(toon)
    print()

    # Decode back to Pydantic
    decoded = decode_to_pydantic(toon, Product)
    print("Decoded objects:")
    for p in decoded:
        print(f"  {p.sku}: {p.name} - ${p.price} (stock: {p.stock})")
    print()

    # Verify equality. zip() stops at the shorter sequence, so the lengths
    # must be compared as well; otherwise lost items would go unnoticed.
    same_count = len(original) == len(decoded)
    fields_match = all(
        orig.sku == dec.sku and
        orig.name == dec.name and
        orig.price == dec.price and
        orig.stock == dec.stock
        for orig, dec in zip(original, decoded)
    )
    print("Round-trip successful:", same_count and fields_match)
    print()
236 | 
237 | 
def example_comparison():
    """Example: compare the length of JSON and TOON text for ten products."""
    print("=== Size Comparison: JSON vs TOON ===")

    # Ten generated products, numbered 1 through 10.
    catalog = []
    for i in range(1, 11):
        catalog.append(
            Product(sku=f'PROD-{i:03d}', name=f'Product {i}', price=float(10 + i), stock=100 - i)
        )

    # JSON text; use model_dump() when available, otherwise dict().
    plain_rows = [
        item.model_dump() if hasattr(item, 'model_dump') else item.dict()
        for item in catalog
    ]
    as_json = json.dumps(plain_rows)

    # TOON text
    as_toon = encode_pydantic(catalog)

    json_size = len(as_json)
    toon_size = len(as_toon)

    print(f"Number of products: {len(catalog)}")
    print(f"JSON size: {json_size} bytes")
    print(f"TOON size: {toon_size} bytes")
    print(f"Size reduction: {100 - (toon_size / json_size * 100):.1f}%")
    print()

    print("JSON format (first 200 chars):")
    print(as_json[:200] + "...")
    print()

    print("TOON format:")
    print(as_toon)
    print()
268 | 
269 | 
if __name__ == '__main__':
    # Run every example in the order it is defined in this file.
    for run_example in (
        example_simple_model,
        example_list_of_models,
        example_nested_models,
        example_exclude_options,
        example_field_aliases,
        example_decoding,
        example_roundtrip,
        example_comparison,
    ):
        run_example()
279 | 
280 | 


--------------------------------------------------------------------------------
/toon/cli.py:
--------------------------------------------------------------------------------
  1 | """Command-line interface for TOON format conversion."""
  2 | import sys
  3 | import json
  4 | import argparse
  5 | from pathlib import Path
  6 | from typing import Optional
  7 | 
  8 | try:
  9 |     import tiktoken
 10 |     TIKTOKEN_AVAILABLE = True
 11 | except ImportError:
 12 |     TIKTOKEN_AVAILABLE = False
 13 | 
 14 | from . import encode, decode
 15 | 
 16 | 
 17 | def count_tokens(text: str) -> Optional[int]:
 18 |     """
 19 |     Count tokens in text using tiktoken (o200k_base encoding).
 20 |     
 21 |     Args:
 22 |         text: Text to count tokens in
 23 |         
 24 |     Returns:
 25 |         Token count or None if tiktoken not available
 26 |     """
 27 |     if not TIKTOKEN_AVAILABLE:
 28 |         return None
 29 |     
 30 |     try:
 31 |         encoding = tiktoken.get_encoding("o200k_base")
 32 |         return len(encoding.encode(text))
 33 |     except Exception:
 34 |         return None
 35 | 
 36 | 
 37 | def detect_mode(input_path: Optional[str], force_encode: bool, force_decode: bool) -> str:
 38 |     """
 39 |     Detect conversion mode from file extension or flags.
 40 |     
 41 |     Args:
 42 |         input_path: Input file path
 43 |         force_encode: Force encode mode
 44 |         force_decode: Force decode mode
 45 |         
 46 |     Returns:
 47 |         'encode' or 'decode'
 48 |     """
 49 |     if force_encode:
 50 |         return 'encode'
 51 |     if force_decode:
 52 |         return 'decode'
 53 |     
 54 |     if input_path and input_path != '-':
 55 |         path = Path(input_path)
 56 |         ext = path.suffix.lower()
 57 |         if ext == '.json':
 58 |             return 'encode'
 59 |         elif ext == '.toon':
 60 |             return 'decode'
 61 |     
 62 |     # Default to encode
 63 |     return 'encode'
 64 | 
 65 | 
 66 | def read_input(input_path: Optional[str]) -> str:
 67 |     """
 68 |     Read input from file or stdin.
 69 |     
 70 |     Args:
 71 |         input_path: Input file path or '-' for stdin
 72 |         
 73 |     Returns:
 74 |         Input content
 75 |     """
 76 |     if not input_path or input_path == '-':
 77 |         return sys.stdin.read()
 78 |     
 79 |     with open(input_path, 'r', encoding='utf-8') as f:
 80 |         return f.read()
 81 | 
 82 | 
 83 | def write_output(content: str, output_path: Optional[str]) -> None:
 84 |     """
 85 |     Write output to file or stdout.
 86 |     
 87 |     Args:
 88 |         content: Content to write
 89 |         output_path: Output file path or None for stdout
 90 |     """
 91 |     if not output_path:
 92 |         print(content)
 93 |     else:
 94 |         with open(output_path, 'w', encoding='utf-8') as f:
 95 |             f.write(content)
 96 | 
 97 | 
 98 | def main():
 99 |     """Main CLI entry point."""
100 |     parser = argparse.ArgumentParser(
101 |         description='TOON (Token-Oriented Object Notation) - Convert between JSON and TOON formats',
102 |         formatter_class=argparse.RawDescriptionHelpFormatter,
103 |         epilog="""
104 | Examples:
105 |   # Encode JSON file to TOON
106 |   toon input.json -o output.toon
107 |   
108 |   # Decode TOON file to JSON
109 |   toon input.toon -o output.json
110 |   
111 |   # Pipe JSON and encode to TOON
112 |   echo '{"key": "value"}' | toon -e
113 |   
114 |   # Force decode mode with custom delimiter
115 |   toon input.txt -d --delimiter tab
116 |   
117 |   # Show token statistics
118 |   toon input.json --stats
119 |         """
120 |     )
121 |     
122 |     parser.add_argument(
123 |         'input',
124 |         nargs='?',
125 |         help='Input file path (or "-" for stdin, default: stdin)'
126 |     )
127 |     parser.add_argument(
128 |         '-o', '--output',
129 |         help='Output file path (default: stdout)'
130 |     )
131 |     parser.add_argument(
132 |         '-e', '--encode',
133 |         action='store_true',
134 |         help='Force encode mode (JSON to TOON)'
135 |     )
136 |     parser.add_argument(
137 |         '-d', '--decode',
138 |         action='store_true',
139 |         help='Force decode mode (TOON to JSON)'
140 |     )
141 |     parser.add_argument(
142 |         '--delimiter',
143 |         choices=['comma', 'tab', 'pipe'],
144 |         default='comma',
145 |         help='Array delimiter (default: comma)'
146 |     )
147 |     parser.add_argument(
148 |         '--indent',
149 |         type=int,
150 |         default=2,
151 |         help='Indentation size (default: 2)'
152 |     )
153 |     parser.add_argument(
154 |         '--stats',
155 |         action='store_true',
156 |         help='Show token statistics'
157 |     )
158 |     parser.add_argument(
159 |         '--no-strict',
160 |         action='store_true',
161 |         help='Disable strict validation (decode only)'
162 |     )
163 |     parser.add_argument(
164 |         '--key-folding',
165 |         choices=['off', 'safe'],
166 |         default='off',
167 |         help='Key folding mode (encode only, default: off)'
168 |     )
169 |     parser.add_argument(
170 |         '--flatten-depth',
171 |         type=int,
172 |         help='Maximum key folding depth (encode only)'
173 |     )
174 |     parser.add_argument(
175 |         '--expand-paths',
176 |         choices=['off', 'safe'],
177 |         default='off',
178 |         help='Path expansion mode (decode only, default: off)'
179 |     )
180 |     
181 |     args = parser.parse_args()
182 |     
183 |     # Validate arguments
184 |     if args.encode and args.decode:
185 |         parser.error('Cannot specify both --encode and --decode')
186 |     
187 |     try:
188 |         # Read input
189 |         input_content = read_input(args.input)
190 |         
191 |         # Detect mode
192 |         mode = detect_mode(args.input, args.encode, args.decode)
193 |         
194 |         # Convert
195 |         if mode == 'encode':
196 |             # Parse JSON
197 |             data = json.loads(input_content)
198 |             
199 |             # Encode to TOON
200 |             options = {
201 |                 'delimiter': args.delimiter,
202 |                 'indent': args.indent,
203 |                 'key_folding': args.key_folding,
204 |             }
205 |             if args.flatten_depth is not None:
206 |                 options['flatten_depth'] = args.flatten_depth
207 |             
208 |             output_content = encode(data, options)
209 |             
210 |             # Show statistics if requested
211 |             if args.stats:
212 |                 input_tokens = count_tokens(input_content)
213 |                 output_tokens = count_tokens(output_content)
214 |                 
215 |                 print(f'Input (JSON):  {len(input_content)} bytes', file=sys.stderr)
216 |                 print(f'Output (TOON): {len(output_content)} bytes', file=sys.stderr)
217 |                 print(f'Size reduction: {(1 - len(output_content) / len(input_content)) * 100:.1f}%', file=sys.stderr)
218 |                 
219 |                 if input_tokens is not None and output_tokens is not None:
220 |                     print(f'Input tokens:  {input_tokens}', file=sys.stderr)
221 |                     print(f'Output tokens: {output_tokens}', file=sys.stderr)
222 |                     print(f'Token reduction: {(1 - output_tokens / input_tokens) * 100:.1f}%', file=sys.stderr)
223 |                 else:
224 |                     print('(Install tiktoken for token statistics)', file=sys.stderr)
225 |                 
226 |                 print('---', file=sys.stderr)
227 |         
228 |         else:  # decode
229 |             # Decode TOON
230 |             options = {
231 |                 'strict': not args.no_strict,
232 |                 'expand_paths': args.expand_paths,
233 |                 'default_delimiter': args.delimiter,
234 |             }
235 |             
236 |             data = decode(input_content, options)
237 |             
238 |             # Convert to JSON
239 |             output_content = json.dumps(data, indent=2, ensure_ascii=False)
240 |             
241 |             # Show statistics if requested
242 |             if args.stats:
243 |                 input_tokens = count_tokens(input_content)
244 |                 output_tokens = count_tokens(output_content)
245 |                 
246 |                 print(f'Input (TOON): {len(input_content)} bytes', file=sys.stderr)
247 |                 print(f'Output (JSON): {len(output_content)} bytes', file=sys.stderr)
248 |                 print(f'Size increase: {(len(output_content) / len(input_content) - 1) * 100:.1f}%', file=sys.stderr)
249 |                 
250 |                 if input_tokens is not None and output_tokens is not None:
251 |                     print(f'Input tokens:  {input_tokens}', file=sys.stderr)
252 |                     print(f'Output tokens: {output_tokens}', file=sys.stderr)
253 |                     print(f'Token increase: {(output_tokens / input_tokens - 1) * 100:.1f}%', file=sys.stderr)
254 |                 else:
255 |                     print('(Install tiktoken for token statistics)', file=sys.stderr)
256 |                 
257 |                 print('---', file=sys.stderr)
258 |         
259 |         # Write output
260 |         write_output(output_content, args.output)
261 |         
262 |         return 0
263 |     
264 |     except json.JSONDecodeError as e:
265 |         print(f'Error parsing JSON: {e}', file=sys.stderr)
266 |         return 1
267 |     except FileNotFoundError as e:
268 |         print(f'Error: {e}', file=sys.stderr)
269 |         return 1
270 |     except Exception as e:
271 |         print(f'Error: {e}', file=sys.stderr)
272 |         return 1
273 | 
274 | 
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
277 | 


--------------------------------------------------------------------------------
/toon/utils.py:
--------------------------------------------------------------------------------
  1 | """Utility functions for the TOON library."""
  2 | from typing import Any, Optional
  3 | from datetime import datetime, date
  4 | from .constants import (
  5 |     QUOTE, BACKSLASH, NEWLINE, COMMA, TAB, PIPE,
  6 |     TRUE_LITERAL, FALSE_LITERAL, NULL_LITERAL,
  7 |     SPACE, COLON
  8 | )
  9 | 
 10 | 
 11 | def needs_quoting(value: str) -> bool:
 12 |     """
 13 |     Check if a string value needs to be quoted.
 14 |     
 15 |     Quoting is needed when:
 16 |     - Value contains special characters (comma, colon, newline, quotes)
 17 |     - Value has leading or trailing whitespace
 18 |     - Value looks like a boolean or null literal
 19 |     - Value is empty
 20 |     
 21 |     Args:
 22 |         value: String to check
 23 |         
 24 |     Returns:
 25 |         True if quoting is needed, False otherwise
 26 |     """
 27 |     if not value:
 28 |         return True
 29 |     
 30 |     # Check for leading/trailing whitespace
 31 |     if value != value.strip():
 32 |         return True
 33 |     
 34 |     # Check if it looks like a literal
 35 |     lower_value = value.lower()
 36 |     if lower_value in (TRUE_LITERAL, FALSE_LITERAL, NULL_LITERAL):
 37 |         return True
 38 |     
 39 |     # Check for special characters
 40 |     special_chars = {COMMA, COLON, NEWLINE, QUOTE, TAB, PIPE, BACKSLASH, '[', ']', '{', '}'}
 41 |     if any(char in value for char in special_chars):
 42 |         return True
 43 |     
 44 |     # Check if it looks like a number but has trailing content
 45 |     # This handles cases like "123abc" which should be quoted
 46 |     try:
 47 |         float(value)
 48 |         return False
 49 |     except ValueError:
 50 |         pass
 51 |     
 52 |     return False
 53 | 
 54 | 
 55 | def escape_string(value: str) -> str:
 56 |     """
 57 |     Escape special characters in a string for TOON encoding.
 58 |     
 59 |     Args:
 60 |         value: String to escape
 61 |         
 62 |     Returns:
 63 |         Escaped string
 64 |     """
 65 |     # Escape backslashes first
 66 |     value = value.replace(BACKSLASH, BACKSLASH + BACKSLASH)
 67 |     # Escape quotes
 68 |     value = value.replace(QUOTE, BACKSLASH + QUOTE)
 69 |     # Escape newlines
 70 |     value = value.replace(NEWLINE, BACKSLASH + 'n')
 71 |     # Escape tabs
 72 |     value = value.replace('\t', BACKSLASH + 't')
 73 |     # Escape carriage returns
 74 |     value = value.replace('\r', BACKSLASH + 'r')
 75 |     return value
 76 | 
 77 | 
 78 | def unescape_string(value: str) -> str:
 79 |     """
 80 |     Unescape special characters in a TOON string.
 81 |     
 82 |     Args:
 83 |         value: Escaped string
 84 |         
 85 |     Returns:
 86 |         Unescaped string
 87 |     """
 88 |     result = []
 89 |     i = 0
 90 |     while i < len(value):
 91 |         if value[i] == BACKSLASH and i + 1 < len(value):
 92 |             next_char = value[i + 1]
 93 |             if next_char == 'n':
 94 |                 result.append(NEWLINE)
 95 |                 i += 2
 96 |             elif next_char == 't':
 97 |                 result.append('\t')
 98 |                 i += 2
 99 |             elif next_char == 'r':
100 |                 result.append('\r')
101 |                 i += 2
102 |             elif next_char == QUOTE:
103 |                 result.append(QUOTE)
104 |                 i += 2
105 |             elif next_char == BACKSLASH:
106 |                 result.append(BACKSLASH)
107 |                 i += 2
108 |             else:
109 |                 result.append(value[i])
110 |                 i += 1
111 |         else:
112 |             result.append(value[i])
113 |             i += 1
114 |     return ''.join(result)
115 | 
116 | 
def quote_string(value: str) -> str:
    """
    Escape a string and wrap it in quote characters.

    Args:
        value: String to quote

    Returns:
        Quoted and escaped string
    """
    return QUOTE + escape_string(value) + QUOTE
129 | 
130 | 
def is_primitive(value: Any) -> bool:
    """
    Tell whether a value is None or a str, int, float, bool, datetime or date.

    Args:
        value: Value to check

    Returns:
        True if primitive, False otherwise
    """
    if value is None:
        return True
    scalar_types = (str, int, float, bool, datetime, date)
    return isinstance(value, scalar_types)
142 | 
143 | 
def is_array_of_objects(value: Any) -> bool:
    """
    Tell whether a value is a non-empty list whose items are all dicts.

    Args:
        value: Value to check

    Returns:
        True if array of objects, False otherwise
    """
    if not isinstance(value, list):
        return False
    if not value:
        # An empty list is not an array of objects.
        return False
    for item in value:
        if not isinstance(item, dict):
            return False
    return True
157 | 
158 | 
def is_uniform_array_of_objects(value: list) -> Optional[list]:
    """
    Return the shared field names of an array of flat, identically-keyed objects.

    The result is the key list of the first object, in its order, provided
    that every item is a dict, every dict has exactly that set of keys, and
    every value passes ``is_primitive``. In all other cases (empty array,
    non-dict items, objects without keys, differing keys, or any nested
    list/dict value) the result is None.

    Args:
        value: Array to check

    Returns:
        List of field names if uniform and all primitive, None otherwise
    """
    if not value:
        return None
    if any(not isinstance(item, dict) for item in value):
        return None

    # The first object defines the field names and their order.
    template = value[0]
    if not all(is_primitive(cell) for cell in template.values()):
        return None
    fields = list(template.keys())
    if not fields:
        return None

    expected_keys = set(fields)
    for row in value[1:]:
        # Same key set (order does not matter) ...
        if set(row.keys()) != expected_keys:
            return None
        # ... and nothing but primitive values.
        if not all(is_primitive(cell) for cell in row.values()):
            return None

    return fields
204 | 
205 | 
def get_indent(level: int, indent_size: int = 2) -> str:
    """
    Build the leading whitespace for a nesting level.

    Args:
        level: Indentation level
        indent_size: Number of spaces per level

    Returns:
        Indentation string
    """
    width = level * indent_size
    return SPACE * width
218 | 
219 | 
def parse_number(value: str) -> Any:
    """
    Convert a string to an int or float when possible.

    Text without a '.' and without an 'e'/'E' is tried as an int; everything
    else is tried as a float.

    Args:
        value: String to parse

    Returns:
        Parsed number or original string if not a number
    """
    looks_integral = '.' not in value and 'e' not in value.lower()
    convert = int if looks_integral else float
    try:
        return convert(value)
    except ValueError:
        return value
238 | 
239 | 
def parse_literal(value: str) -> Any:
    """
    Convert a bare token to True, False, None or a number when it is one.

    The boolean and null literals are matched case-insensitively; anything
    else is handed to ``parse_number``.

    Args:
        value: String to parse

    Returns:
        Parsed value or original string if not a literal
    """
    folded = value.lower()
    if folded == TRUE_LITERAL:
        return True
    if folded == FALSE_LITERAL:
        return False
    if folded == NULL_LITERAL:
        return None
    return parse_number(value)
259 | 
260 | 
def format_float(value: float) -> str:
    """
    Format a float without unnecessary scientific notation.

    Values whose ``str()`` form has no exponent are returned with trailing
    zeros (and a then-dangling decimal point) removed. Values that Python
    would print with an exponent are expanded to plain decimal notation as
    long as their magnitude lies in [1e-100, 1e100); the expansion uses the
    digits of the shortest round-tripping representation, so no precision is
    lost. Magnitudes outside that range keep scientific notation.

    Args:
        value: Float value to format

    Returns:
        Formatted string representation
    """
    # Imported locally: only this function needs it.
    from decimal import Decimal

    if value == 0:
        return '0'

    text = str(value)
    if 'e' not in text.lower():
        # Already in decimal format, but strip unnecessary trailing zeros
        if '.' in text:
            return text.rstrip('0').rstrip('.')
        return text

    magnitude = abs(value)

    # Keep scientific notation for very large or very small numbers
    if magnitude < 1e-100 or magnitude >= 1e100:
        return text

    # Expand the exponent exactly. A fixed number of decimal places would
    # silently truncate small values (1e-20 would collapse to '0').
    expanded = format(Decimal(text), 'f')

    # Only strip zeros after a decimal point; the zeros of an integer-valued
    # expansion such as '10000000000000000' are significant.
    if '.' in expanded:
        expanded = expanded.rstrip('0').rstrip('.')

    return expanded
305 | 


--------------------------------------------------------------------------------
/tests/test_structure_generator.py:
--------------------------------------------------------------------------------
  1 | """Tests for TOON structure generator."""
  2 | import pytest
  3 | from toon import generate_structure
  4 | 
  5 | 
  6 | def test_generate_simple_structure():
  7 |     """Test generating a simple object structure."""
  8 |     schema = {
  9 |         "name": "name of the person",
 10 |         "age": "age of the person",
 11 |         "occupation": "job description of the person"
 12 |     }
 13 |     
 14 |     result = generate_structure(schema)
 15 |     
 16 |     assert "name: <name of the person>" in result
 17 |     assert "age: <age of the person>" in result
 18 |     assert "occupation: <job description of the person>" in result
 19 | 
 20 | 
 21 | def test_generate_nested_structure():
 22 |     """Test generating a nested object structure."""
 23 |     schema = {
 24 |         "user": {
 25 |             "id": "user identifier",
 26 |             "profile": {
 27 |                 "name": "user name",
 28 |                 "email": "user email"
 29 |             }
 30 |         }
 31 |     }
 32 |     
 33 |     result = generate_structure(schema)
 34 |     
 35 |     assert "user:" in result
 36 |     assert "id: <user identifier>" in result
 37 |     assert "profile:" in result
 38 |     assert "name: <user name>" in result
 39 |     assert "email: <user email>" in result
 40 | 
 41 | 
 42 | def test_generate_array_structure():
 43 |     """Test generating a structure with arrays."""
 44 |     schema = {
 45 |         "users": [{
 46 |             "id": "user id",
 47 |             "name": "user name",
 48 |             "email": "user email"
 49 |         }]
 50 |     }
 51 |     
 52 |     result = generate_structure(schema)
 53 |     
 54 |     assert "users[N]{id,name,email}:" in result
 55 |     assert "<user id>" in result
 56 |     assert "<user name>" in result
 57 |     assert "<user email>" in result
 58 |     assert "..." in result
 59 | 
 60 | 
 61 | def test_generate_root_array_structure():
 62 |     """Test generating a structure for root-level array."""
 63 |     schema = [{
 64 |         "id": "product id",
 65 |         "name": "product name",
 66 |         "price": "product price"
 67 |     }]
 68 |     
 69 |     result = generate_structure(schema)
 70 |     
 71 |     assert "[N]{id,name,price}:" in result
 72 |     assert "<product id>" in result
 73 |     assert "<product name>" in result
 74 |     assert "<product price>" in result
 75 |     assert "..." in result
 76 | 
 77 | 
 78 | def test_generate_empty_structures():
 79 |     """Test generating empty structures."""
 80 |     # Empty object
 81 |     result = generate_structure({})
 82 |     assert result == "{}"
 83 |     
 84 |     # Object with empty nested object
 85 |     schema = {"data": {}}
 86 |     result = generate_structure(schema)
 87 |     assert "data: {}" in result
 88 |     
 89 |     # Object with empty array
 90 |     schema = {"items": []}
 91 |     result = generate_structure(schema)
 92 |     assert "items: []" in result
 93 | 
 94 | 
 95 | def test_generate_with_tab_delimiter():
 96 |     """Test generating structure with tab delimiter."""
 97 |     schema = [{
 98 |         "id": "user id",
 99 |         "name": "user name"
100 |     }]
101 |     
102 |     result = generate_structure(schema, {"delimiter": "\t"})
103 |     
104 |     # Should show tab indicator in header
105 |     assert "[N\t]{id,name}:" in result
106 |     # Should use tab as delimiter in sample row
107 |     assert "<user id>\t<user name>" in result
108 | 
109 | 
110 | def test_generate_with_pipe_delimiter():
111 |     """Test generating structure with pipe delimiter."""
112 |     schema = [{
113 |         "id": "user id",
114 |         "name": "user name"
115 |     }]
116 |     
117 |     result = generate_structure(schema, {"delimiter": "|"})
118 |     
119 |     # Should show pipe indicator in header
120 |     assert "[N|]{id,name}:" in result
121 |     # Should use pipe as delimiter in sample row
122 |     assert "<user id>|<user name>" in result
123 | 
124 | 
125 | def test_generate_with_custom_indent():
126 |     """Test generating structure with custom indentation."""
127 |     schema = {
128 |         "user": {
129 |             "name": "user name"
130 |         }
131 |     }
132 |     
133 |     result = generate_structure(schema, {"indent": 4})
134 |     
135 |     lines = result.split("\n")
136 |     # First level should have no indent
137 |     assert lines[0] == "user:"
138 |     # Second level should have 4 spaces
139 |     assert lines[1].startswith("    ")
140 | 
141 | 
142 | def test_generate_mixed_structure():
143 |     """Test generating a complex mixed structure."""
144 |     schema = {
145 |         "title": "document title",
146 |         "metadata": {
147 |             "author": "author name",
148 |             "created": "creation date"
149 |         },
150 |         "tags": ["tag name"],
151 |         "sections": [{
152 |             "heading": "section heading",
153 |             "content": "section content"
154 |         }]
155 |     }
156 |     
157 |     result = generate_structure(schema)
158 |     
159 |     assert "title: <document title>" in result
160 |     assert "metadata:" in result
161 |     assert "author: <author name>" in result
162 |     assert "created: <creation date>" in result
163 |     assert "tags: [<tag name>,...]" in result
164 |     assert "sections[N]{heading,content}:" in result
165 | 
166 | 
167 | def test_generate_primitive_array():
168 |     """Test generating structure for primitive arrays."""
169 |     schema = {
170 |         "numbers": ["numeric value"],
171 |         "names": ["name string"]
172 |     }
173 |     
174 |     result = generate_structure(schema)
175 |     
176 |     assert "numbers: [<numeric value>,...]" in result
177 |     assert "names: [<name string>,...]" in result
178 | 
179 | 
180 | # Pydantic-specific tests
181 | try:
182 |     from pydantic import BaseModel, Field
183 |     from toon import generate_structure_from_pydantic
184 |     
185 |     class SimpleUser(BaseModel):
186 |         """Simple user model."""
187 |         id: int = Field(description="user identifier")
188 |         name: str = Field(description="user full name")
189 |         email: str = Field(description="user email address")
190 |     
191 |     
192 |     class NestedUser(BaseModel):
193 |         """User model with nested profile."""
194 |         id: int = Field(description="user identifier")
195 |         name: str = Field(description="user name")
196 |         
197 |         class Profile(BaseModel):
198 |             bio: str = Field(description="user biography")
199 |             location: str = Field(description="user location")
200 |         
201 |         profile: Profile = Field(description="user profile")
202 |     
203 |     
204 |     def test_generate_from_pydantic_simple():
205 |         """Test generating structure from simple Pydantic model."""
206 |         result = generate_structure_from_pydantic(SimpleUser)
207 |         
208 |         assert "id: <user identifier>" in result
209 |         assert "name: <user full name>" in result
210 |         assert "email: <user email address>" in result
211 |     
212 |     
213 |     def test_generate_from_pydantic_without_descriptions():
214 |         """Test generating structure from Pydantic model without descriptions."""
215 |         class UserNoDesc(BaseModel):
216 |             id: int
217 |             name: str
218 |             active: bool
219 |         
220 |         result = generate_structure_from_pydantic(UserNoDesc, include_descriptions=False)
221 |         
222 |         # Should use type names as descriptions
223 |         assert "id: <integer>" in result or "id: <int>" in result
224 |         assert "name: <string>" in result or "name: <str>" in result
225 |         assert "active: <boolean>" in result or "active: <bool>" in result
226 |     
227 |     
228 |     def test_generate_from_pydantic_invalid_input():
229 |         """Test error handling for invalid Pydantic input."""
230 |         class NotAModel:
231 |             pass
232 |         
233 |         with pytest.raises(TypeError):
234 |             generate_structure_from_pydantic(NotAModel)
235 |     
236 |     
237 |     def test_generate_from_pydantic_with_options():
238 |         """Test generating structure from Pydantic with custom options."""
239 |         result = generate_structure_from_pydantic(
240 |             SimpleUser,
241 |             options={"indent": 4}
242 |         )
243 |         
244 |         # Check that custom indentation is used (though this simple model has no nesting)
245 |         assert "id: <user identifier>" in result
246 | 
247 | except ImportError:
248 |     # Pydantic tests will be skipped if pydantic is not installed
249 |     pass
250 | 
251 | 
252 | def test_generate_structure_ordering():
253 |     """Test that field ordering is preserved."""
254 |     schema = {
255 |         "field_a": "first field",
256 |         "field_b": "second field",
257 |         "field_c": "third field"
258 |     }
259 |     
260 |     result = generate_structure(schema)
261 |     lines = result.split("\n")
262 |     
263 |     # Find positions of each field
264 |     pos_a = next(i for i, line in enumerate(lines) if "field_a" in line)
265 |     pos_b = next(i for i, line in enumerate(lines) if "field_b" in line)
266 |     pos_c = next(i for i, line in enumerate(lines) if "field_c" in line)
267 |     
268 |     # Check ordering
269 |     assert pos_a < pos_b < pos_c
270 | 
271 | 
272 | def test_generate_structure_realistic_example():
273 |     """Test with a realistic example similar to the issue description."""
274 |     schema = {
275 |         "name": "name of the person",
276 |         "age": "age of the person",
277 |         "occupation": "job description of the person"
278 |     }
279 |     
280 |     result = generate_structure(schema)
281 |     
282 |     # This should produce a clean template for LLM prompts
283 |     expected_lines = [
284 |         "name: <name of the person>",
285 |         "age: <age of the person>",
286 |         "occupation: <job description of the person>"
287 |     ]
288 |     
289 |     for line in expected_lines:
290 |         assert line in result
291 |     
292 |     # Should not contain actual data values
293 |     assert "Alice" not in result
294 |     assert "30" not in result
295 |     assert "Engineer" not in result
296 | 


--------------------------------------------------------------------------------
/benchmark/RESULTS.md:
--------------------------------------------------------------------------------
  1 | # 🚀 TOON vs JSON: Benchmark Results
  2 | 
  3 | ## Executive Summary
  4 | 
  5 | **TOON achieves MASSIVE memory and token savings compared to JSON across 50 diverse, real-world datasets.**
  6 | 
  7 | ```
  8 | ┌────────────────────────────────────────────────────────────────┐
  9 | │                                                                │
 10 | │                    ⚡ HEADLINE RESULTS ⚡                       │
 11 | │                                                                │
 12 | │     📉  63.9% SMALLER file sizes                               │
 13 | │     📉  54.1% FEWER tokens for LLM APIs                        │
 14 | │     💾  35.81KB saved across 50 test datasets                  │
 15 | │     🎯  10,735 tokens saved                                    │
 16 | │                                                                │
 17 | │                  FOR HIGH-VOLUME APPLICATIONS:                 │
 18 | │     💰  $2,147 saved per million API requests                  │
 19 | │     💰  $5,408 saved per billion tokens                        │
 20 | │                                                                │
 21 | └────────────────────────────────────────────────────────────────┘
 22 | ```
 23 | 
 24 | ## Why This Matters
 25 | 
 26 | ### For LLM API Users
 27 | 
 28 | If you're sending structured data to LLM APIs (GPT-4, Claude, etc.), **you're paying for every token**. TOON can cut your token usage by **MORE THAN HALF** (54.1% average), translating directly to:
 29 | 
 30 | - **54% lower API costs**
 31 | - **Faster API responses** (less data to transmit)
 32 | - **More content in context windows** (fit more data within token limits)
 33 | 
 34 | ### Real-World Cost Impact
 35 | 
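 36 | At typical GPT-4 pricing ($10 per 1M tokens), assuming each request carries an average benchmark payload (~397 JSON tokens vs. ~182 TOON tokens):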
 37 | 
 38 | | Usage Volume | JSON Cost | TOON Cost | **Savings** |
 39 | |--------------|-----------|-----------|-------------|
 40 | | **1,000 requests** | $3.97 | $1.82 | **$2.15** |
 41 | | **1M requests/year** | $3,970 | $1,823 | **$2,147** |
 42 | | **1B tokens** | $10,000 | $4,592 | **$5,408** |
 43 | 
 44 | ## Detailed Results
 45 | 
 46 | ### Tested Across 50 Real-World Datasets
 47 | 
 48 | We benchmarked TOON against JSON using 50 diverse, production-ready datasets representing common use cases:
 49 | 
 50 | - E-commerce (products, orders, inventory)
 51 | - Databases (query results, employee records)
 52 | - APIs (responses, logs, requests)
 53 | - Analytics (metrics, A/B tests, surveys)
 54 | - IoT (sensor data, time series)
 55 | - Social media (posts, profiles, comments)
 56 | - Finance (transactions, stock data)
 57 | - And much more...
 58 | 
 59 | ### Performance Distribution
 60 | 
 61 | ```
 62 | 🔥 EXCELLENT (≥60% savings):  30 datasets (60%)
 63 | ✅ GOOD (40-60% savings):     19 datasets (38%)
 64 | 📊 MODERATE (<40% savings):    1 dataset  (2%)
 65 | ```
 66 | 
 67 | **98% of tested datasets achieved 40%+ size savings!**
 68 | 
 69 | ### Top Performers
 70 | 
 71 | #### 🥇 Best Overall: Survey Responses
 72 | - **73.4% size reduction** (935B → 249B)
 73 | - **63.4% token reduction** (287 → 105 tokens)
 74 | 
 75 | #### 🥈 Other Champions (~70% size savings):
 76 | - ML Training Data: **71.2%** size, **61.9%** tokens
 77 | - Large Inventory (100 items): **71.2%** size, **57.7%** tokens
 78 | - Student Grades: **71.2%** size, **61.9%** tokens
 79 | - Customer Reviews: **69.1%** size, **61.0%** tokens
 80 | - Weather Forecast: **69.0%** size, **55.9%** tokens
 81 | 
 82 | ### Category Breakdown
 83 | 
 84 | | Category | Datasets | Avg Size Savings | Avg Token Savings |
 85 | |----------|----------|------------------|-------------------|
 86 | | **Tabular Data** (databases, spreadsheets) | 12 | **69.2%** | **59.8%** |
 87 | | **E-commerce** (products, orders) | 8 | **66.1%** | **56.4%** |
 88 | | **Analytics** (metrics, surveys) | 7 | **65.7%** | **55.2%** |
 89 | | **API Data** (responses, logs) | 10 | **58.3%** | **48.9%** |
 90 | | **IoT/Sensors** (time series) | 5 | **60.0%** | **43.7%** |
 91 | | **Social/Content** (posts, profiles) | 8 | **61.5%** | **52.1%** |
 92 | 
 93 | ## Complete Results Table
 94 | 
 95 | | # | Dataset | JSON Size | TOON Size | Size Savings | Token Savings |
 96 | |---|---------|-----------|-----------|--------------|---------------|
 97 | | 01 | E-commerce Products | 1.57KB | 542B | **66.3%** | **58.2%** |
 98 | | 02 | API Response | 934B | 501B | **46.4%** | **39.7%** |
 99 | | 03 | Database Results | 1.52KB | 582B | **62.5%** | **56.5%** |
100 | | 04 | ML Training Data | 1.85KB | 545B | **71.2%** | **61.9%** |
101 | | 05 | Server Configuration | 1016B | 719B | **29.2%** | **28.4%** |
102 | | 06 | Analytics Data | 1.40KB | 526B | **63.3%** | **49.4%** |
103 | | 07 | Large Inventory (100 items) | 13.55KB | 3.90KB | **71.2%** | **57.7%** |
104 | | 08 | Customer Reviews | 828B | 256B | **69.1%** | **61.0%** |
105 | | 09 | Social Media Posts | 849B | 282B | **66.8%** | **52.1%** |
106 | | 10 | Weather Forecast | 777B | 241B | **69.0%** | **55.9%** |
107 | | 11 | Stock Market Data | - | - | **59.8%** | **44.2%** |
108 | | 12 | Restaurant Menu | - | - | **66.4%** | **61.5%** |
109 | | 13 | Hotel Bookings | - | - | **64.2%** | **52.1%** |
110 | | 14 | Flight Schedule | - | - | **68.9%** | **59.9%** |
111 | | 15 | Medical Records | - | - | **59.3%** | **50.6%** |
112 | | 16 | Student Grades | - | - | **71.2%** | **61.9%** |
113 | | 17 | Sports Statistics | - | - | **66.3%** | **54.6%** |
114 | | 18 | Movie Catalog | - | - | **68.5%** | **59.8%** |
115 | | 19 | Music Playlist | - | - | **62.5%** | **56.7%** |
116 | | 20 | Real Estate Listings | - | - | **66.5%** | **58.7%** |
117 | | 21-50 | ... (see full benchmark output) | - | - | **60%+ avg** | **50%+ avg** |
118 | 
119 | ## Aggregate Statistics
120 | 
121 | ### Total Across All 50 Datasets
122 | 
123 | ```
124 | JSON TOTAL:   56.00KB  (57,349 bytes)
125 | TOON TOTAL:   20.20KB  (20,680 bytes)
126 | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
127 | SAVED:        35.81KB  (36,669 bytes)  ⬇ 63.9%
128 | 
129 | JSON TOKENS:  19,851 tokens
130 | TOON TOKENS:   9,116 tokens
131 | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
132 | SAVED:        10,735 tokens            ⬇ 54.1%
133 | ```
134 | 
135 | ### Visual Savings
136 | 
137 | ```
138 | Size Reduction:  ████████████████████████████████░░░░░░░░░░░░░░░░░░  63.9%
139 | Token Reduction: ███████████████████████████░░░░░░░░░░░░░░░░░░░░░░░  54.1%
140 | ```
141 | 
142 | ## Why TOON Saves So Much Memory
143 | 
144 | ### 1. **Eliminates Repeated Keys in Arrays**
145 | 
146 | **JSON** repeats keys for every object:
147 | ```json
148 | [
149 |   {"id": 1, "name": "Laptop", "price": 999},
150 |   {"id": 2, "name": "Mouse", "price": 29},
151 |   {"id": 3, "name": "Keyboard", "price": 149}
152 | ]
153 | ```
154 | **134 bytes, 48 tokens**
155 | 
156 | **TOON** declares headers once:
157 | ```toon
158 | [3]{id,name,price}:
159 |   1,Laptop,999
160 |   2,Mouse,29
161 |   3,Keyboard,149
162 | ```
163 | **52 bytes, 23 tokens** → **61% smaller, 52% fewer tokens!**
164 | 
165 | ### 2. **Minimal Syntax Overhead**
166 | 
167 | JSON requires:
168 | - Braces `{}` and brackets `[]` everywhere
169 | - Quotes around all keys
170 | - Quotes around string values
171 | - Commas between all elements
172 | 
173 | TOON uses:
174 | - Indentation for structure (like YAML)
175 | - Colons for key-value pairs
176 | - Quotes only when necessary
177 | - Headers for uniform arrays
178 | 
179 | ### 3. **Intelligent Type Handling**
180 | 
181 | TOON automatically detects when quotes aren't needed and preserves types (numbers, booleans, null) while maintaining human readability.
182 | 
183 | ## Use Case Recommendations
184 | 
185 | ### ✅ **PERFECT FOR TOON:**
186 | 
187 | 1. **LLM API Payloads** - Cut token costs in half
188 | 2. **Database Query Results** - Tabular data compression
189 | 3. **Analytics & Metrics** - Time series, aggregates
190 | 4. **E-commerce Data** - Product catalogs, inventory
191 | 5. **IoT Sensor Data** - Regular readings
192 | 6. **API Logs & Traces** - Structured log entries
193 | 7. **ML Training Data** - Feature vectors, labels
194 | 
195 | ### ⚠️ **LESS OPTIMAL:**
196 | 
197 | - Highly irregular/nested data (still saves 20-40%)
198 | - Maximum compatibility required (JSON is universal)
199 | - Microsecond-level performance critical (TOON is fast, but JSON is faster)
200 | 
201 | ## Methodology
202 | 
203 | ### Test Environment
204 | - **50 diverse datasets** representing real-world use cases
205 | - **Accurate token counting** using tiktoken (GPT-4 encoding)
206 | - **Multiple iterations** (100+) for performance measurements
207 | - **Production-ready data** (not synthetic/trivial examples)
208 | 
209 | ### Datasets Include:
210 | - E-commerce: products, orders, inventory, reviews
211 | - Databases: employee records, query results
212 | - APIs: responses, logs, requests, errors
213 | - Analytics: metrics, A/B tests, surveys, time series
214 | - IoT: sensor readings, device data
215 | - Social: posts, profiles, comments, messages
216 | - Finance: transactions, stock prices
217 | - Media: videos, music, blogs
218 | - System: logs, audit trails, notifications
219 | - And many more...
220 | 
221 | ### Token Counting
222 | All token counts use tiktoken with GPT-4 encoding for accuracy. Results are exact for models that share this encoding and approximate for models that use a different tokenizer:
223 | - GPT-4 / GPT-4 Turbo
224 | - GPT-3.5 Turbo
225 | - Claude (similar tokenization)
226 | - Other modern LLMs
227 | 
228 | ## Running the Benchmarks Yourself
229 | 
230 | ```bash
231 | # Clone the repo
232 | git clone https://github.com/ScrapeGraphAI/toonify.git
233 | cd toonify
234 | 
235 | # Install dependencies
236 | pip install -e .
237 | pip install tiktoken
238 | 
239 | # Run benchmarks
240 | python benchmark/compare_formats.py
241 | python benchmark/memory_benchmark.py
242 | 
243 | # Or run all at once
244 | python benchmark/run_all.py
245 | ```
246 | 
247 | ## Conclusion
248 | 
249 | **TOON delivers massive, consistent savings across diverse real-world datasets:**
250 | 
251 | - ✅ **63.9% average size reduction**
252 | - ✅ **54.1% average token reduction**
253 | - ✅ **98% of datasets achieve 40%+ size savings**
254 | - ✅ **60% of datasets achieve 60%+ size savings**
255 | - ✅ **Thousands of dollars saved** in LLM API costs for high-volume applications
256 | 
257 | **The results speak for themselves: If you're working with structured data and LLM APIs, TOON can cut your costs in half while maintaining full data fidelity and readability.**
258 | 
259 | ---
260 | 
261 | **Want to see the live output?** Run `python benchmark/compare_formats.py` to see the full, interactive benchmark results!
262 | 


--------------------------------------------------------------------------------
/examples/structure_template_usage.py:
--------------------------------------------------------------------------------
  1 | """Examples demonstrating TOON structure template generation for LLM prompts."""
  2 | from toon import generate_structure
  3 | 
  4 | try:
  5 |     from pydantic import BaseModel, Field
  6 |     from toon import generate_structure_from_pydantic
  7 |     PYDANTIC_AVAILABLE = True
  8 | except ImportError:
  9 |     PYDANTIC_AVAILABLE = False
 10 |     print("Note: Pydantic examples will be skipped (pydantic not installed)")
 11 | 
 12 | 
 13 | def example_simple_response_structure():
 14 |     """Example: Generate a simple response structure template."""
 15 |     print("=== Simple Response Structure ===")
 16 |     print("Use case: Telling an LLM what format to return data in\n")
 17 |     
 18 |     schema = {
 19 |         "name": "name of the person",
 20 |         "age": "age of the person",
 21 |         "occupation": "job description of the person"
 22 |     }
 23 |     
 24 |     structure = generate_structure(schema)
 25 |     
 26 |     print("Schema definition:")
 27 |     print(schema)
 28 |     print("\nGenerated TOON structure template:")
 29 |     print(structure)
 30 |     print("\nHow to use in LLM prompt:")
 31 |     print('  "Please extract person information and return it in this format:')
 32 |     print(f'   {structure}"')
 33 |     print()
 34 | 
 35 | 
 36 | def example_nested_response_structure():
 37 |     """Example: Generate a nested response structure."""
 38 |     print("=== Nested Response Structure ===")
 39 |     print("Use case: Complex data with nested objects\n")
 40 |     
 41 |     schema = {
 42 |         "company": {
 43 |             "name": "company name",
 44 |             "location": {
 45 |                 "city": "city name",
 46 |                 "country": "country name"
 47 |             }
 48 |         },
 49 |         "employee_count": "number of employees"
 50 |     }
 51 |     
 52 |     structure = generate_structure(schema)
 53 |     
 54 |     print("Generated structure template:")
 55 |     print(structure)
 56 |     print()
 57 | 
 58 | 
 59 | def example_array_response_structure():
 60 |     """Example: Generate structure for array responses."""
 61 |     print("=== Array Response Structure ===")
 62 |     print("Use case: Extracting multiple items in tabular format\n")
 63 |     
 64 |     schema = {
 65 |         "products": [{
 66 |             "id": "product identifier",
 67 |             "name": "product name",
 68 |             "price": "product price in USD",
 69 |             "in_stock": "availability status"
 70 |         }]
 71 |     }
 72 |     
 73 |     structure = generate_structure(schema)
 74 |     
 75 |     print("Generated structure template:")
 76 |     print(structure)
 77 |     print("\nHow to use in LLM prompt:")
 78 |     print('  "Extract all products from the page and return them in this format:')
 79 |     print(f'   {structure}"')
 80 |     print()
 81 | 
 82 | 
 83 | def example_list_response_structure():
 84 |     """Example: Generate structure for list of items."""
 85 |     print("=== Root-Level Array Structure ===")
 86 |     print("Use case: Returning an array of similar objects\n")
 87 |     
 88 |     schema = [{
 89 |         "title": "article title",
 90 |         "author": "article author",
 91 |         "date": "publication date",
 92 |         "summary": "brief summary"
 93 |     }]
 94 |     
 95 |     structure = generate_structure(schema)
 96 |     
 97 |     print("Generated structure template:")
 98 |     print(structure)
 99 |     print()
100 | 
101 | 
102 | def example_mixed_response_structure():
103 |     """Example: Complex structure with mixed types."""
104 |     print("=== Mixed Response Structure ===")
105 |     print("Use case: Complex extraction with various data types\n")
106 |     
107 |     schema = {
108 |         "page_title": "title of the page",
109 |         "metadata": {
110 |             "published": "publication date",
111 |             "author": "author name"
112 |         },
113 |         "tags": ["tag name"],
114 |         "sections": [{
115 |             "heading": "section heading",
116 |             "word_count": "number of words"
117 |         }]
118 |     }
119 |     
120 |     structure = generate_structure(schema)
121 |     
122 |     print("Generated structure template:")
123 |     print(structure)
124 |     print()
125 | 
126 | 
127 | def example_delimiter_options():
128 |     """Example: Using different delimiters."""
129 |     print("=== Custom Delimiters ===")
130 |     print("Use case: When data might contain commas\n")
131 |     
132 |     schema = [{
133 |         "address": "full address (may contain commas)",
134 |         "city": "city name",
135 |         "zipcode": "zip code"
136 |     }]
137 |     
138 |     print("With pipe delimiter (recommended for addresses):")
139 |     structure_pipe = generate_structure(schema, {"delimiter": "|"})
140 |     print(structure_pipe)
141 |     
142 |     print("\nWith tab delimiter (good for spreadsheet-like data):")
143 |     structure_tab = generate_structure(schema, {"delimiter": "\t"})
144 |     print(structure_tab)
145 |     print()
146 | 
147 | 
148 | # Pydantic-specific examples
149 | if PYDANTIC_AVAILABLE:
150 |     
151 |     class Person(BaseModel):
152 |         """Person information model."""
153 |         id: int = Field(description="unique identifier")
154 |         name: str = Field(description="full name")
155 |         email: str = Field(description="email address")
156 |         age: int = Field(description="age in years")
157 |         occupation: str = Field(description="job title or profession")
158 |     
159 |     
160 |     class Article(BaseModel):
161 |         """Article model."""
162 |         title: str = Field(description="article title")
163 |         author: str = Field(description="author name")
164 |         published_date: str = Field(description="publication date in YYYY-MM-DD format")
165 |         tags: list[str] = Field(description="article tags")
166 |         word_count: int = Field(description="number of words")
167 |     
168 |     
169 |     def example_pydantic_simple_model():
170 |         """Example: Generate structure from Pydantic model."""
171 |         print("=== Pydantic Model Structure ===")
172 |         print("Use case: Generate structure from existing data models\n")
173 |         
174 |         structure = generate_structure_from_pydantic(Person)
175 |         
176 |         print(f"Model: {Person.__name__}")
177 |         print("\nGenerated structure template:")
178 |         print(structure)
179 |         print()
180 |     
181 |     
182 |     def example_pydantic_for_llm_prompt():
183 |         """Example: Using Pydantic structure in LLM prompts."""
184 |         print("=== Complete LLM Prompt Example ===")
185 |         print("Use case: Full example of using structure in a prompt\n")
186 |         
187 |         structure = generate_structure_from_pydantic(Article)
188 |         
189 |         prompt = f"""Extract the article information from the following text and return it in TOON format.
190 | 
191 | Expected structure:
192 | {structure}
193 | 
194 | Text to extract from:
195 | [Article content would go here...]
196 | 
197 | Please return only the TOON formatted data."""
198 |         
199 |         print("Complete prompt:")
200 |         print("-" * 60)
201 |         print(prompt)
202 |         print("-" * 60)
203 |         print()
204 |     
205 |     
206 |     def example_pydantic_array_structure():
207 |         """Example: Array of Pydantic models."""
208 |         print("=== Pydantic Array Structure ===")
209 |         print("Use case: Extracting multiple items of the same type\n")
210 |         
211 |         # To generate array structure, we pass a list schema
212 |         schema = [{
213 |             "title": "article title",
214 |             "author": "author name",
215 |             "published_date": "publication date",
216 |             "word_count": "word count"
217 |         }]
218 |         
219 |         structure = generate_structure(schema)
220 |         
221 |         print("Generated structure for array of articles:")
222 |         print(structure)
223 |         print()
224 | 
225 | 
226 | def example_real_world_use_case():
227 |     """Example: Real-world use case for web scraping."""
228 |     print("=== Real-World Use Case: Product Scraping ===")
229 |     print("Use case: Instructing an LLM to extract product data\n")
230 |     
231 |     schema = {
232 |         "products": [{
233 |             "name": "product name",
234 |             "sku": "product SKU or ID",
235 |             "price": "price in USD",
236 |             "rating": "average rating (1-5)",
237 |             "reviews_count": "number of reviews",
238 |             "availability": "in stock or out of stock"
239 |         }]
240 |     }
241 |     
242 |     structure = generate_structure(schema)
243 |     
244 |     prompt = f"""You are a web scraping assistant. Extract all product information from the HTML and return it in TOON format.
245 | 
246 | Return the data in this exact structure:
247 | {structure}
248 | 
249 | Important notes:
250 | - Extract ALL products from the page
251 | - Price should be numeric (remove currency symbols)
252 | - Rating should be a number between 1 and 5
253 | - If a field is missing, use null
254 | 
255 | HTML content:
256 | [HTML content would go here...]"""
257 |     
258 |     print("Complete prompt for web scraping:")
259 |     print("=" * 60)
260 |     print(prompt)
261 |     print("=" * 60)
262 |     print()
263 | 
264 | 
def main():
    """Run every structure-template example in order, then print a summary.

    The Pydantic-based examples are executed only when PYDANTIC_AVAILABLE
    is truthy.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("  TOON STRUCTURE TEMPLATE EXAMPLES")
    print("  Generate response structures for LLM prompts")
    print(rule + "\n")

    # Examples driven by plain dict/list schemas.
    for example in (
        example_simple_response_structure,
        example_nested_response_structure,
        example_array_response_structure,
        example_list_response_structure,
        example_mixed_response_structure,
        example_delimiter_options,
    ):
        example()

    # Examples that need pydantic installed.
    if PYDANTIC_AVAILABLE:
        for example in (
            example_pydantic_simple_model,
            example_pydantic_for_llm_prompt,
            example_pydantic_array_structure,
        ):
            example()

    example_real_world_use_case()

    print(rule)
    print("  Summary")
    print(rule)
    for takeaway in (
        "✨ Use generate_structure() to create response templates",
        "✨ Perfect for LLM prompts - no need to provide examples",
        "✨ Supports nested objects, arrays, and custom delimiters",
        "✨ Works with Pydantic models for type-safe schemas",
        "✨ Reduces token usage while maintaining clarity",
    ):
        print(takeaway)
    print()
295 | 
296 | 
297 | if __name__ == "__main__":
298 |     main()
299 | 


--------------------------------------------------------------------------------
/assets/README.zh-CN.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <img src="toonify.png" alt="Toonify Logo" width="400">
  3 | </p>
  4 | 
  5 | # TOON（面向Token的对象表示法）
  6 | 
  7 | [English](../README.md) | [中文](README.zh-CN.md) | [한국어](README.ko.md)
  8 | 
  9 | 一种紧凑、人类可读的序列化格式，专为向大型语言模型传递结构化数据而设计，显著减少Token使用量。
 10 | 
 11 | [![Python Version](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
 12 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 13 | 
 14 | ## 概述
 15 | 
 16 | TOON在实现**CSV般的紧凑性**的同时增加了**明确的结构**，非常适合：
 17 | - 降低LLM API调用的Token成本
 18 | - 提高上下文窗口效率
 19 | - 保持人类可读性
 20 | - 保留数据结构和类型
 21 | 
 22 | ### 主要特性
 23 | 
 24 | - ✅ **紧凑**：平均比JSON**小64%**（在50个数据集上测试）
 25 | - ✅ **可读**：简洁、基于缩进的语法
 26 | - ✅ **结构化**：保留嵌套对象和数组
 27 | - ✅ **类型安全**：支持字符串、数字、布尔值、null
 28 | - ✅ **灵活**：多种分隔符选项（逗号、制表符、竖线）
 29 | - ✅ **智能**：对统一数组自动使用表格格式
 30 | - ✅ **高效**：支持对深层嵌套对象进行键折叠
 31 | 
 32 | ## 安装
 33 | 
 34 | ```bash
 35 | pip install toonify
 36 | ```
 37 | 
 38 | 开发环境安装：
 39 | ```bash
 40 | pip install toonify[dev]
 41 | ```
 42 | 
 43 | 支持Pydantic：
 44 | ```bash
 45 | pip install toonify[pydantic]
 46 | ```
 47 | 
 48 | ## 快速开始
 49 | 
 50 | ### Python API
 51 | 
 52 | ```python
 53 | from toon import encode, decode
 54 | 
 55 | # 将Python字典编码为TOON
 56 | data = {
 57 |     'products': [
 58 |         {'sku': 'LAP-001', 'name': 'Gaming Laptop', 'price': 1299.99},
 59 |         {'sku': 'MOU-042', 'name': 'Wireless Mouse', 'price': 29.99}
 60 |     ]
 61 | }
 62 | 
 63 | toon_string = encode(data)
 64 | print(toon_string)
 65 | # 输出：
 66 | # products[2]{sku,name,price}:
 67 | #   LAP-001,Gaming Laptop,1299.99
 68 | #   MOU-042,Wireless Mouse,29.99
 69 | 
 70 | # 将TOON解码回Python
 71 | result = decode(toon_string)
 72 | assert result == data
 73 | ```
 74 | 
 75 | ### 命令行
 76 | 
 77 | ```bash
 78 | # 将JSON编码为TOON
 79 | toon input.json -o output.toon
 80 | 
 81 | # 将TOON解码为JSON
 82 | toon input.toon -o output.json
 83 | 
 84 | # 使用管道
 85 | cat data.json | toon -e > data.toon
 86 | 
 87 | # 显示Token统计信息
 88 | toon data.json --stats
 89 | ```
 90 | 
 91 | ### Pydantic集成
 92 | 
 93 | TOON支持直接从Pydantic模型转换：
 94 | 
 95 | ```python
 96 | from pydantic import BaseModel
 97 | from toon import encode_pydantic, decode_to_pydantic
 98 | 
 99 | # 定义Pydantic模型
100 | class User(BaseModel):
101 |     id: int
102 |     name: str
103 |     email: str
104 | 
105 | # 将Pydantic模型编码为TOON
106 | users = [
107 |     User(id=1, name='Alice', email='alice@example.com'),
108 |     User(id=2, name='Bob', email='bob@example.com')
109 | ]
110 | 
111 | toon = encode_pydantic(users)
112 | print(toon)
113 | # 输出：
114 | # [2]{id,name,email}:
115 | #   1,Alice,alice@example.com
116 | #   2,Bob,bob@example.com
117 | 
118 | # 将TOON解码回Pydantic模型
119 | decoded_users = decode_to_pydantic(toon, User)
120 | assert all(isinstance(u, User) for u in decoded_users)
121 | ```
122 | 
123 | **特性：**
124 | - ✅ 直接从Pydantic模型转换（支持v1和v2）
125 | - ✅ 支持嵌套模型
126 | - ✅ 排除未设置、None或默认值
127 | - ✅ 支持字段别名
128 | - ✅ 解码时完全验证
129 | - ✅ 往返转换
130 | 
131 | 详见[examples/pydantic_usage.py](../examples/pydantic_usage.py)。
132 | 
133 | ## TOON格式规范
134 | 
135 | ### 基本语法
136 | 
137 | ```toon
138 | # 简单的键值对
139 | title: Machine Learning Basics
140 | chapters: 12
141 | published: true
142 | ```
143 | 
144 | ### 数组
145 | 
146 | **原始数组**（内联）：
147 | ```toon
148 | temperatures: [72.5,68.3,75.1,70.8,73.2]
149 | categories: [electronics,computers,accessories]
150 | ```
151 | 
152 | **表格数组**（具有标题的统一对象）：
153 | ```toon
154 | inventory[3]{sku,product,stock}:
155 |   KB-789,Mechanical Keyboard,45
156 |   MS-456,RGB Mouse Pad,128
157 |   HD-234,USB Headset,67
158 | ```
159 | 
160 | **列表数组**（非统一或嵌套）：
161 | ```toon
162 | tasks[2]:
163 |   Complete documentation
164 |   Review pull requests
165 | ```
166 | 
167 | ### 嵌套对象
168 | 
169 | ```toon
170 | server:
171 |   hostname: api-prod-01
172 |   config:
173 |     port: 8080
174 |     region: us-east
175 | ```
176 | 
177 | ### 引号规则
178 | 
179 | 字符串仅在必要时使用引号：
180 | - 包含特殊字符（`,`、`:`、`"`、换行符）
181 | - 有前导/尾随空格
182 | - 看起来像字面量（`true`、`false`、`null`）
183 | - 为空字符串
184 | 
185 | ```toon
186 | simple: ProductName
187 | quoted: "Product, Description"
188 | escaped: "Size: 15\" display"
189 | multiline: "First feature\nSecond feature"
190 | ```
191 | 
192 | ## API参考
193 | 
194 | ### `encode(data, options=None)`
195 | 
196 | 将Python对象转换为TOON字符串。
197 | 
198 | **参数：**
199 | - `data`：Python字典或列表
200 | - `options`：可选字典，包含：
201 |   - `delimiter`：`'comma'`（默认）、`'tab'`或`'pipe'`
202 |   - `indent`：每级缩进的空格数（默认：2）
203 |   - `key_folding`：`'off'`（默认）或`'safe'`
204 |   - `flatten_depth`：键折叠的最大深度（默认：None）
205 | 
206 | **示例：**
207 | ```python
208 | toon = encode(data, {
209 |     'delimiter': 'tab',
210 |     'indent': 4,
211 |     'key_folding': 'safe'
212 | })
213 | ```
214 | 
215 | ### `decode(toon_string, options=None)`
216 | 
217 | 将TOON字符串转换为Python对象。
218 | 
219 | **参数：**
220 | - `toon_string`：TOON格式字符串
221 | - `options`：可选字典，包含：
222 |   - `strict`：严格验证结构（默认：True）
223 |   - `expand_paths`：`'off'`（默认）或`'safe'`
224 |   - `default_delimiter`：默认分隔符（默认：`','`）
225 | 
226 | **示例：**
227 | ```python
228 | data = decode(toon_string, {
229 |     'expand_paths': 'safe',
230 |     'strict': False
231 | })
232 | ```
233 | 
234 | ### `encode_pydantic(model, options=None, exclude_unset=False, exclude_none=False, exclude_defaults=False, by_alias=False)`
235 | 
236 | 将Pydantic模型转换为TOON字符串。
237 | 
238 | **参数：**
239 | - `model`：Pydantic模型实例或模型实例列表
240 | - `options`：与`encode()`函数相同
241 | - `exclude_unset`：如果为True，排除未明确设置的字段
242 | - `exclude_none`：如果为True，排除None值字段
243 | - `exclude_defaults`：如果为True，排除具有默认值的字段
244 | - `by_alias`：如果为True，使用字段别名而不是字段名称
245 | 
246 | **示例：**
247 | ```python
248 | from pydantic import BaseModel
249 | from toon import encode_pydantic
250 | 
251 | class User(BaseModel):
252 |     id: int
253 |     name: str
254 |     email: str | None = None  # `X | None`语法需要Python 3.10+；更早版本请使用Optional[str]
255 | 
256 | user = User(id=1, name='Alice')
257 | toon = encode_pydantic(user, exclude_none=True)
258 | ```
259 | 
260 | ### `decode_to_pydantic(toon_string, model_class, options=None)`
261 | 
262 | 将TOON字符串解码为Pydantic模型。
263 | 
264 | **参数：**
265 | - `toon_string`：TOON格式字符串
266 | - `model_class`：要实例化的Pydantic模型类
267 | - `options`：与`decode()`函数相同
268 | 
269 | **返回：**
270 | - Pydantic模型实例或实例列表（取决于输入）
271 | 
272 | **示例：**
273 | ```python
274 | from pydantic import BaseModel
275 | from toon import decode_to_pydantic
276 | 
277 | class User(BaseModel):
278 |     id: int
279 |     name: str
280 | 
281 | toon = "id: 1\nname: Alice"
282 | user = decode_to_pydantic(toon, User)
283 | ```
284 | 
285 | ## CLI使用
286 | 
287 | ```
288 | 用法：toon [-h] [-o OUTPUT] [-e] [-d] [--delimiter {comma,tab,pipe}]
289 |             [--indent INDENT] [--stats] [--no-strict]
290 |             [--key-folding {off,safe}] [--flatten-depth DEPTH]
291 |             [--expand-paths {off,safe}]
292 |             [input]
293 | 
294 | TOON (Token-Oriented Object Notation) - 在JSON和TOON格式之间转换
295 | 
296 | 位置参数：
297 |   input                 输入文件路径（或"-"表示stdin）
298 | 
299 | 可选参数：
300 |   -h, --help            显示帮助信息并退出
301 |   -o, --output OUTPUT   输出文件路径（默认：stdout）
302 |   -e, --encode          强制编码模式（JSON到TOON）
303 |   -d, --decode          强制解码模式（TOON到JSON）
304 |   --delimiter {comma,tab,pipe}
305 |                         数组分隔符（默认：comma）
306 |   --indent INDENT       缩进大小（默认：2）
307 |   --stats               显示Token统计信息
308 |   --no-strict           禁用严格验证（仅解码）
309 |   --key-folding {off,safe}
310 |                         键折叠模式（仅编码）
311 |   --flatten-depth DEPTH 最大键折叠深度（仅编码）
312 |   --expand-paths {off,safe}
313 |                         路径扩展模式（仅解码）
314 | ```
315 | 
316 | ## 高级特性
317 | 
318 | ### 键折叠
319 | 
320 | 将单键链折叠为点分隔路径：
321 | 
322 | ```python
323 | data = {
324 |     'api': {
325 |         'response': {
326 |             'product': {
327 |                 'title': 'Wireless Keyboard'
328 |             }
329 |         }
330 |     }
331 | }
332 | 
333 | # 使用key_folding='safe'
334 | toon = encode(data, {'key_folding': 'safe'})
335 | # 输出：api.response.product.title: Wireless Keyboard
336 | ```
337 | 
338 | ### 路径扩展
339 | 
340 | 将点分隔的键扩展为嵌套对象：
341 | 
342 | ```python
343 | toon = 'store.location.zipcode: 10001'
344 | 
345 | # 使用expand_paths='safe'
346 | data = decode(toon, {'expand_paths': 'safe'})
347 | # 结果：{'store': {'location': {'zipcode': 10001}}}
348 | ```
349 | 
350 | ### 自定义分隔符
351 | 
352 | 选择最适合您数据的分隔符：
353 | 
354 | ```python
355 | # 制表符分隔符（更适合类似电子表格的数据）
356 | toon = encode(data, {'delimiter': 'tab'})
357 | 
358 | # 竖线分隔符（当数据包含逗号时）
359 | toon = encode(data, {'delimiter': 'pipe'})
360 | ```
361 | 
362 | ## 格式比较
363 | 
364 | ### JSON vs TOON
365 | 
366 | **JSON**（247字节）：
367 | ```json
368 | {
369 |   "products": [
370 |     {"id": 101, "name": "Laptop Pro", "price": 1299},
371 |     {"id": 102, "name": "Magic Mouse", "price": 79},
372 |     {"id": 103, "name": "USB-C Cable", "price": 19}
373 |   ]
374 | }
375 | ```
376 | 
377 | **TOON**（98字节，**减少60%**）：
378 | ```toon
379 | products[3]{id,name,price}:
380 |   101,Laptop Pro,1299
381 |   102,Magic Mouse,79
382 |   103,USB-C Cable,19
383 | ```
384 | 
385 | ### 何时使用TOON
386 | 
387 | **使用TOON的场景：**
388 | - ✅ 向LLM API传递数据（降低Token成本）
389 | - ✅ 处理统一的表格数据
390 | - ✅ 上下文窗口受限
391 | - ✅ 重视人类可读性
392 | 
393 | **使用JSON的场景：**
394 | - ❌ 需要最大兼容性
395 | - ❌ 数据高度不规则/嵌套
396 | - ❌ 使用仅支持JSON的现有工具
397 | 
398 | ## 开发
399 | 
400 | ### 设置
401 | 
402 | ```bash
403 | git clone https://github.com/ScrapeGraphAI/toonify.git
404 | cd toonify
405 | pip install -e .[dev]
406 | ```
407 | 
408 | ### 运行测试
409 | 
410 | ```bash
411 | pytest
412 | pytest --cov=toon --cov-report=term-missing
413 | ```
414 | 
415 | ### 运行示例
416 | 
417 | ```bash
418 | python examples/basic_usage.py
419 | python examples/advanced_features.py
420 | ```
421 | 
422 | ## 性能
423 | 
424 | **在50个多样化的真实数据集上进行基准测试：**
425 | 
426 | - 与JSON相比，结构化数据**平均减少63.9%的大小**
427 | - **平均减少54.1%的Token**（直接降低LLM API成本）
428 | - 最优使用场景**最高节省73.4%**（表格数据、调查、分析）
429 | - **98%的数据集实现40%以上的节省**
430 | - 编码/解码**开销极小**（典型负载耗时<1ms）
431 | 
432 | **💰 成本影响：** 按GPT-4定价计算，TOON每百万次API请求**节省$2,147**，每十亿Token**节省$5,408**。
433 | 
434 | **[📊 查看完整基准测试结果 →](../benchmark/RESULTS.md)**
435 | 
436 | ## 贡献
437 | 
438 | 欢迎贡献！请：
439 | 
440 | 1. Fork仓库
441 | 2. 创建功能分支（`git checkout -b feature/amazing-feature`）
442 | 3. 进行更改并编写测试
443 | 4. 运行测试（`pytest`）
444 | 5. 提交更改（`git commit -m 'Add amazing feature'`）
445 | 6. 推送到分支（`git push origin feature/amazing-feature`）
446 | 7. 打开Pull Request
447 | 
448 | ## 许可证
449 | 
450 | MIT许可证 - 详情请参见[LICENSE](../LICENSE)文件。
451 | 
452 | ## 致谢
453 | 
454 | Python实现受[toon-format/toon](https://github.com/toon-format/toon)的TypeScript TOON库启发。
455 | 
456 | ## 链接
457 | 
458 | - **GitHub**：https://github.com/ScrapeGraphAI/toonify
459 | - **PyPI**：https://pypi.org/project/toonify/
460 | - **文档**：https://github.com/ScrapeGraphAI/toonify#readme
461 | - **格式规范**：https://github.com/toon-format/toon
462 | 
463 | ---
464 | 
465 | 由[ScrapeGraph团队](https://scrapegraphai.com)用心制作
466 | 
467 | <p align="center">
468 |   <img src="https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/assets/scrapegraphai_logo.png" alt="ScrapeGraphAI Logo" width="250">
469 | </p>
470 | 
471 | 


--------------------------------------------------------------------------------
/toon/structure_generator.py:
--------------------------------------------------------------------------------
  1 | """TOON structure generator - create response templates for LLM prompts."""
  2 | from typing import Any, Dict, List, Optional, Union
  3 | from .constants import (
  4 |     COMMA, COLON, NEWLINE,
  5 |     DEFAULT_DELIMITER, DEFAULT_INDENT,
  6 |     LEFT_BRACKET, RIGHT_BRACKET, LEFT_BRACE, RIGHT_BRACE
  7 | )
  8 | from .utils import get_indent
  9 | 
 10 | 
 11 | def generate_structure(
 12 |     schema: Union[Dict[str, Any], List[Dict[str, Any]]], 
 13 |     options: Optional[Dict[str, Any]] = None
 14 | ) -> str:
 15 |     """
 16 |     Generate a TOON structure template from a schema definition.
 17 |     
 18 |     This function creates a response structure template that can be included
 19 |     in LLM prompts to specify the expected output format without examples.
 20 |     
 21 |     Args:
 22 |         schema: Schema definition as a dict or list of dicts
 23 |             - For simple fields: {"field_name": "description"}
 24 |             - For nested objects: {"field_name": {"nested_field": "description"}}
 25 |             - For arrays: {"field_name": [{"array_field": "description"}]}
 26 |             - List at root level creates a tabular array template
 27 |         options: Optional encoding options
 28 |             - delimiter: ',' (default), '\t', or '|'
 29 |             - indent: int (default 2)
 30 |             
 31 |     Returns:
 32 |         TOON formatted structure template string
 33 |         
 34 |     Examples:
 35 |         >>> schema = {
 36 |         ...     "name": "name of the person",
 37 |         ...     "age": "age of the person",
 38 |         ...     "occupation": "job description of the person"
 39 |         ... }
 40 |         >>> print(generate_structure(schema))
 41 |         name: <name of the person>
 42 |         age: <age of the person>
 43 |         occupation: <job description of the person>
 44 |         
 45 |         >>> schema = [{"id": "user id", "name": "user name"}]
 46 |         >>> print(generate_structure(schema))
 47 |         [N]{id,name}:
 48 |           <user id>,<user name>
 49 |           ...
 50 |     """
 51 |     if options is None:
 52 |         options = {}
 53 |     
 54 |     delimiter = options.get('delimiter', DEFAULT_DELIMITER)
 55 |     indent = options.get('indent', DEFAULT_INDENT)
 56 |     
 57 |     if isinstance(schema, list):
 58 |         return _generate_array_structure(schema, 0, delimiter, indent)
 59 |     elif isinstance(schema, dict):
 60 |         return _generate_object_structure(schema, 0, delimiter, indent)
 61 |     else:
 62 |         return "<value>"
 63 | 
 64 | 
 65 | def _generate_object_structure(
 66 |     schema: Dict[str, Any], 
 67 |     level: int, 
 68 |     delimiter: str, 
 69 |     indent_size: int
 70 | ) -> str:
 71 |     """Generate structure template for an object."""
 72 |     if not schema:
 73 |         return "{}"
 74 |     
 75 |     lines = []
 76 |     indent = get_indent(level, indent_size)
 77 |     
 78 |     for key, value in schema.items():
 79 |         if isinstance(value, str):
 80 |             # Simple field with description
 81 |             lines.append(f'{indent}{key}{COLON} <{value}>')
 82 |         elif isinstance(value, dict):
 83 |             # Nested object
 84 |             if not value:
 85 |                 lines.append(f'{indent}{key}{COLON} {{}}')
 86 |             else:
 87 |                 nested = _generate_object_structure(value, level + 1, delimiter, indent_size)
 88 |                 lines.append(f'{indent}{key}{COLON}')
 89 |                 lines.append(nested)
 90 |         elif isinstance(value, list):
 91 |             # Array field
 92 |             if not value:
 93 |                 lines.append(f'{indent}{key}{COLON} []')
 94 |             else:
 95 |                 array_template = _generate_array_structure(value, level, delimiter, indent_size, key=key)
 96 |                 lines.append(array_template)
 97 |         else:
 98 |             lines.append(f'{indent}{key}{COLON} <value>')
 99 |     
100 |     return NEWLINE.join(lines)
101 | 
102 | 
103 | def _generate_array_structure(
104 |     schema: List[Any], 
105 |     level: int, 
106 |     delimiter: str, 
107 |     indent_size: int,
108 |     key: Optional[str] = None
109 | ) -> str:
110 |     """Generate structure template for an array."""
111 |     if not schema:
112 |         return "[]"
113 |     
114 |     indent = get_indent(level, indent_size)
115 |     
116 |     # Check if it's an array of objects (tabular format)
117 |     if isinstance(schema[0], dict):
118 |         return _generate_tabular_array_structure(
119 |             schema[0], level, delimiter, indent_size, key
120 |         )
121 |     elif isinstance(schema[0], str):
122 |         # Array of primitive descriptions
123 |         if key:
124 |             return f'{indent}{key}{COLON} [<{schema[0]}>,...]'
125 |         else:
126 |             return f'[<{schema[0]}>,...]'
127 |     else:
128 |         # Generic array
129 |         if key:
130 |             return f'{indent}{key}{COLON} [...]'
131 |         else:
132 |             return '[...]'
133 | 
134 | 
def _generate_tabular_array_structure(
    field_schema: Dict[str, Any],
    level: int,
    delimiter: str,
    indent_size: int,
    key: Optional[str] = None
) -> str:
    """Generate structure template for a tabular array.

    The result has three lines: a header of the form
    ``key[N]{field1,field2,...}:`` (or ``[N]{...}:`` without a key), one
    sample row of ``<description>`` placeholders joined by ``delimiter``,
    and a ``...`` continuation row.

    Args:
        field_schema: Mapping of column name to description. Non-string
            descriptions are rendered as ``<value>``.
        level: Nesting level of the header line.
        delimiter: Separator placed between the row placeholders. A tab or
            pipe delimiter is also echoed inside the ``[N]`` marker.
        indent_size: Number of spaces per nesting level.
        key: Field name the array belongs to, or None for a root-level array.

    Returns:
        The header, sample row and continuation row joined by newlines.
    """
    indent = get_indent(level, indent_size)
    # Rows sit one nesting level below the header. Derive that from
    # indent_size (as nested objects do) rather than assuming two spaces.
    row_indent = get_indent(level + 1, indent_size)

    # Delimiter indicator for non-comma delimiters
    delimiter_indicator = delimiter if delimiter in ('\t', '|') else ''

    # Header: key[N]{field1,field2,...}: or [N]{field1,field2,...}:
    columns = COMMA.join(field_schema.keys())
    header = f'[N{delimiter_indicator}]{LEFT_BRACE}{columns}{RIGHT_BRACE}{COLON}'
    if key:
        header = f'{indent}{key}{header}'

    # Create a sample row with descriptions
    row = delimiter.join(
        f'<{description}>' if isinstance(description, str) else '<value>'
        for description in field_schema.values()
    )

    return NEWLINE.join([header, f'{row_indent}{row}', f'{row_indent}...'])
176 | 
177 | 
def generate_structure_from_pydantic(
    model_class,
    options: Optional[Dict[str, Any]] = None,
    include_descriptions: bool = True
) -> str:
    """
    Generate a TOON structure template from a Pydantic model class.

    This function creates a response structure template from a Pydantic model
    that can be included in LLM prompts.

    Args:
        model_class: Pydantic model class (BaseModel subclass)
        options: Optional encoding options, passed to generate_structure()
            - delimiter: ',' (default), a tab character, or '|'
            - indent: int (default 2)
        include_descriptions: If True, use each field's ``description``
            (e.g. from ``Field(description=...)``) as its placeholder text

    Returns:
        TOON formatted structure template string

    Raises:
        ImportError: If pydantic is not installed.
        TypeError: If model_class is not a class derived from BaseModel.

    Examples:
        >>> from pydantic import BaseModel, Field
        >>> class User(BaseModel):
        ...     id: int = Field(description="user identifier")
        ...     name: str = Field(description="user full name")
        ...     email: str = Field(description="user email address")
        >>> print(generate_structure_from_pydantic(User))
        id: <user identifier>
        name: <user full name>
        email: <user email address>
    """
    try:
        from pydantic import BaseModel
    except ImportError as exc:
        raise ImportError(
            "generate_structure_from_pydantic requires pydantic to be installed. "
            "Please install pydantic to use this feature."
        ) from exc

    # issubclass() itself raises an unrelated-looking TypeError when handed a
    # non-class (e.g. a model instance), so test for "is a class" first and
    # report every misuse with the same message.
    if not (isinstance(model_class, type) and issubclass(model_class, BaseModel)):
        raise TypeError("model_class must be a Pydantic BaseModel subclass")

    schema = _extract_schema_from_pydantic(model_class, include_descriptions)
    return generate_structure(schema, options)
223 | 
224 | 
225 | def _extract_schema_from_pydantic(
226 |     model_class,
227 |     include_descriptions: bool
228 | ) -> Dict[str, Any]:
229 |     """Extract schema from Pydantic model."""
230 |     try:
231 |         # Pydantic v2
232 |         if hasattr(model_class, 'model_fields'):
233 |             fields = model_class.model_fields
234 |             schema = {}
235 |             
236 |             for field_name, field_info in fields.items():
237 |                 if include_descriptions and field_info.description:
238 |                     schema[field_name] = field_info.description
239 |                 else:
240 |                     # Use type annotation as description
241 |                     annotation = field_info.annotation
242 |                     type_name = _get_type_name(annotation)
243 |                     schema[field_name] = type_name
244 |             
245 |             return schema
246 |         # Pydantic v1
247 |         elif hasattr(model_class, '__fields__'):
248 |             fields = model_class.__fields__
249 |             schema = {}
250 |             
251 |             for field_name, field_info in fields.items():
252 |                 if include_descriptions and field_info.field_info.description:
253 |                     schema[field_name] = field_info.field_info.description
254 |                 else:
255 |                     # Use type annotation as description
256 |                     type_name = _get_type_name(field_info.outer_type_)
257 |                     schema[field_name] = type_name
258 |             
259 |             return schema
260 |         else:
261 |             raise ValueError("Unable to extract fields from Pydantic model")
262 |     except Exception as e:
263 |         raise ValueError(f"Error extracting schema from Pydantic model: {e}")
264 | 
265 | 
266 | def _get_type_name(annotation) -> str:
267 |     """Get a readable type name from a type annotation."""
268 |     if hasattr(annotation, '__name__'):
269 |         return annotation.__name__.lower()
270 |     
271 |     # Handle typing generics
272 |     type_str = str(annotation)
273 |     
274 |     # Simplify common types
275 |     if 'int' in type_str.lower():
276 |         return 'integer'
277 |     elif 'str' in type_str.lower():
278 |         return 'string'
279 |     elif 'float' in type_str.lower():
280 |         return 'number'
281 |     elif 'bool' in type_str.lower():
282 |         return 'boolean'
283 |     elif 'list' in type_str.lower():
284 |         return 'array'
285 |     elif 'dict' in type_str.lower():
286 |         return 'object'
287 |     else:
288 |         return 'value'
289 | 


--------------------------------------------------------------------------------
/tests/test_pydantic.py:
--------------------------------------------------------------------------------
  1 | """Tests for Pydantic model conversion."""
  2 | import pytest
  3 | 
  4 | # Check if pydantic is available
  5 | try:
  6 |     from pydantic import BaseModel, Field
  7 |     from toon import encode_pydantic, decode_to_pydantic
  8 |     PYDANTIC_AVAILABLE = True
  9 | except ImportError:
 10 |     PYDANTIC_AVAILABLE = False
 11 | 
 12 | 
 13 | @pytest.mark.skipif(not PYDANTIC_AVAILABLE, reason="pydantic not installed")
 14 | class TestPydanticEncoder:
 15 |     """Tests for encode_pydantic function."""
 16 |     
 17 |     def test_simple_model(self):
 18 |         """Test encoding a simple Pydantic model."""
 19 |         class User(BaseModel):
 20 |             id: int
 21 |             name: str
 22 |             email: str
 23 |         
 24 |         user = User(id=1, name='Alice', email='alice@example.com')
 25 |         toon = encode_pydantic(user)
 26 |         
 27 |         assert 'id: 1' in toon
 28 |         assert 'name: Alice' in toon
 29 |         assert 'email: alice@example.com' in toon
 30 |     
 31 |     def test_list_of_models_tabular(self):
 32 |         """Test encoding a list of uniform Pydantic models (tabular format)."""
 33 |         class Product(BaseModel):
 34 |             sku: str
 35 |             name: str
 36 |             price: float
 37 |         
 38 |         products = [
 39 |             Product(sku='LAP-001', name='Gaming Laptop', price=1299.99),
 40 |             Product(sku='MOU-042', name='Wireless Mouse', price=29.99)
 41 |         ]
 42 |         
 43 |         toon = encode_pydantic(products)
 44 |         
 45 |         # Should use tabular format
 46 |         assert '[2]{sku,name,price}:' in toon
 47 |         assert 'LAP-001,Gaming Laptop,1299.99' in toon
 48 |         assert 'MOU-042,Wireless Mouse,29.99' in toon
 49 |     
 50 |     def test_nested_models(self):
 51 |         """Test encoding nested Pydantic models."""
 52 |         class Address(BaseModel):
 53 |             street: str
 54 |             city: str
 55 |             zipcode: str
 56 |         
 57 |         class Person(BaseModel):
 58 |             name: str
 59 |             age: int
 60 |             address: Address
 61 |         
 62 |         person = Person(
 63 |             name='Bob',
 64 |             age=35,
 65 |             address=Address(street='123 Main St', city='Boston', zipcode='02101')
 66 |         )
 67 |         
 68 |         toon = encode_pydantic(person)
 69 |         
 70 |         assert 'name: Bob' in toon
 71 |         assert 'age: 35' in toon
 72 |         assert 'address:' in toon
 73 |         assert 'street: 123 Main St' in toon
 74 |         assert 'city: Boston' in toon
 75 |         assert 'zipcode: 02101' in toon
 76 |     
 77 |     def test_exclude_unset(self):
 78 |         """Test excluding unset fields."""
 79 |         class Config(BaseModel):
 80 |             host: str
 81 |             port: int = 8080
 82 |             debug: bool = False
 83 |         
 84 |         config = Config(host='localhost')
 85 |         
 86 |         # With exclude_unset=False (default)
 87 |         toon_all = encode_pydantic(config, exclude_unset=False)
 88 |         assert 'port: 8080' in toon_all
 89 |         assert 'debug: false' in toon_all
 90 |         
 91 |         # With exclude_unset=True
 92 |         toon_set = encode_pydantic(config, exclude_unset=True)
 93 |         assert 'host: localhost' in toon_set
 94 |         assert 'port' not in toon_set
 95 |         assert 'debug' not in toon_set
 96 |     
 97 |     def test_exclude_none(self):
 98 |         """Test excluding None values."""
 99 |         class User(BaseModel):
100 |             id: int
101 |             name: str
102 |             email: str | None = None  # Use | syntax for Python 3.10+
103 |         
104 |         user = User(id=1, name='Alice', email=None)
105 |         
106 |         # With exclude_none=False (default)
107 |         toon_all = encode_pydantic(user, exclude_none=False)
108 |         assert 'email: null' in toon_all
109 |         
110 |         # With exclude_none=True
111 |         toon_no_none = encode_pydantic(user, exclude_none=True)
112 |         assert 'email' not in toon_no_none
113 |     
114 |     def test_by_alias(self):
115 |         """Test using field aliases."""
116 |         class User(BaseModel):
117 |             user_id: int = Field(alias='id')
118 |             user_name: str = Field(alias='name')
119 |         
120 |         user = User(id=1, name='Alice')
121 |         
122 |         # Without alias
123 |         toon_no_alias = encode_pydantic(user, by_alias=False)
124 |         assert 'user_id: 1' in toon_no_alias
125 |         assert 'user_name: Alice' in toon_no_alias
126 |         
127 |         # With alias
128 |         toon_alias = encode_pydantic(user, by_alias=True)
129 |         assert 'id: 1' in toon_alias
130 |         assert 'name: Alice' in toon_alias
131 |     
132 |     def test_with_encoding_options(self):
133 |         """Test encoding with custom TOON options."""
134 |         class Item(BaseModel):
135 |             id: int
136 |             tags: list[str]
137 |         
138 |         item = Item(id=1, tags=['tag1', 'tag2', 'tag3'])
139 |         
140 |         # Tab delimiter
141 |         toon_tab = encode_pydantic(item, options={'delimiter': 'tab'})
142 |         assert 'tags: [tag1\ttag2\ttag3]' in toon_tab
143 |         
144 |         # Pipe delimiter
145 |         toon_pipe = encode_pydantic(item, options={'delimiter': 'pipe'})
146 |         assert 'tags: [tag1|tag2|tag3]' in toon_pipe
147 |     
148 |     def test_invalid_input(self):
149 |         """Test error handling for invalid input."""
150 |         with pytest.raises(ValueError, match="Expected Pydantic BaseModel"):
151 |             encode_pydantic({'not': 'a model'})
152 |         
153 |         with pytest.raises(ValueError, match="Expected Pydantic BaseModel"):
154 |             encode_pydantic("string")
155 | 
156 | 
157 | @pytest.mark.skipif(not PYDANTIC_AVAILABLE, reason="pydantic not installed")
158 | class TestPydanticDecoder:
159 |     """Tests for decode_to_pydantic function."""
160 |     
161 |     def test_decode_simple_model(self):
162 |         """Test decoding TOON to a simple Pydantic model."""
163 |         class User(BaseModel):
164 |             id: int
165 |             name: str
166 |             email: str
167 |         
168 |         toon = """id: 1
169 | name: Alice
170 | email: alice@example.com"""
171 |         
172 |         user = decode_to_pydantic(toon, User)
173 |         
174 |         assert isinstance(user, User)
175 |         assert user.id == 1
176 |         assert user.name == 'Alice'
177 |         assert user.email == 'alice@example.com'
178 |     
179 |     def test_decode_list_of_models(self):
180 |         """Test decoding TOON to a list of Pydantic models."""
181 |         class Product(BaseModel):
182 |             sku: str
183 |             name: str
184 |             price: float
185 |         
186 |         toon = """[2]{sku,name,price}:
187 |   LAP-001,Gaming Laptop,1299.99
188 |   MOU-042,Wireless Mouse,29.99"""
189 |         
190 |         products = decode_to_pydantic(toon, Product)
191 |         
192 |         assert isinstance(products, list)
193 |         assert len(products) == 2
194 |         assert all(isinstance(p, Product) for p in products)
195 |         assert products[0].sku == 'LAP-001'
196 |         assert products[0].name == 'Gaming Laptop'
197 |         assert products[0].price == 1299.99
198 |         assert products[1].sku == 'MOU-042'
199 |     
200 |     def test_decode_nested_models(self):
201 |         """Test decoding nested Pydantic models."""
202 |         class Address(BaseModel):
203 |             street: str
204 |             city: str
205 |             zipcode: str
206 |         
207 |         class Person(BaseModel):
208 |             name: str
209 |             age: int
210 |             address: Address
211 |         
212 |         toon = """name: Bob
213 | age: 35
214 | address:
215 |   street: 123 Main St
216 |   city: Boston
217 |   zipcode: "02101\""""
218 |         
219 |         person = decode_to_pydantic(toon, Person)
220 |         
221 |         assert isinstance(person, Person)
222 |         assert person.name == 'Bob'
223 |         assert person.age == 35
224 |         assert isinstance(person.address, Address)
225 |         assert person.address.street == '123 Main St'
226 |         assert person.address.city == 'Boston'
227 |         assert person.address.zipcode == '02101'
228 |     
229 |     def test_decode_with_validation(self):
230 |         """Test that Pydantic validation works during decoding."""
231 |         class User(BaseModel):
232 |             id: int
233 |             age: int
234 |         
235 |         # Valid data
236 |         toon_valid = """id: 1
237 | age: 25"""
238 |         user = decode_to_pydantic(toon_valid, User)
239 |         assert user.id == 1
240 |         assert user.age == 25
241 |         
242 |         # Invalid data (string for int field)
243 |         toon_invalid = """id: 1
244 | age: not_a_number"""
245 |         
246 |         with pytest.raises(Exception):  # Pydantic validation error
247 |             decode_to_pydantic(toon_invalid, User)
248 |     
249 |     def test_invalid_model_class(self):
250 |         """Test error handling for invalid model class."""
251 |         with pytest.raises(ValueError, match="Expected Pydantic BaseModel class"):
252 |             decode_to_pydantic("data: value", dict)
253 |         
254 |         with pytest.raises(ValueError, match="Expected Pydantic BaseModel class"):
255 |             decode_to_pydantic("data: value", "not a class")
256 |     
257 |     def test_roundtrip(self):
258 |         """Test encoding and decoding round-trip."""
259 |         class User(BaseModel):
260 |             id: int
261 |             name: str
262 |             email: str
263 |             active: bool
264 |         
265 |         original = User(id=42, name='Charlie', email='charlie@example.com', active=True)
266 |         
267 |         # Encode to TOON
268 |         toon = encode_pydantic(original)
269 |         
270 |         # Decode back to Pydantic
271 |         decoded = decode_to_pydantic(toon, User)
272 |         
273 |         # Verify equality
274 |         assert decoded.id == original.id
275 |         assert decoded.name == original.name
276 |         assert decoded.email == original.email
277 |         assert decoded.active == original.active
278 |     
279 |     def test_list_roundtrip(self):
280 |         """Test encoding and decoding round-trip with list."""
281 |         class Item(BaseModel):
282 |             id: int
283 |             name: str
284 |             price: float
285 |         
286 |         original = [
287 |             Item(id=1, name='Item 1', price=19.99),
288 |             Item(id=2, name='Item 2', price=29.99),
289 |             Item(id=3, name='Item 3', price=39.99)
290 |         ]
291 |         
292 |         # Encode to TOON
293 |         toon = encode_pydantic(original)
294 |         
295 |         # Decode back to Pydantic
296 |         decoded = decode_to_pydantic(toon, Item)
297 |         
298 |         # Verify equality
299 |         assert len(decoded) == len(original)
300 |         for orig, dec in zip(original, decoded):
301 |             assert dec.id == orig.id
302 |             assert dec.name == orig.name
303 |             assert dec.price == orig.price
304 | 
305 | 
@pytest.mark.skipif(PYDANTIC_AVAILABLE, reason="test for when pydantic is not installed")
def test_pydantic_not_installed():
    """Check that both Pydantic helpers are exported as ``None`` when pydantic is absent."""
    from toon import decode_to_pydantic, encode_pydantic

    # Without pydantic the package is expected to expose None placeholders.
    for helper in (encode_pydantic, decode_to_pydantic):
        assert helper is None
314 | 
315 | 


--------------------------------------------------------------------------------
/benchmark/compare_formats.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Benchmark script comparing TOON vs JSON in terms of:
  4 | - File size (bytes)
  5 | - Token count (using tiktoken)
  6 | - Memory usage
  7 | - Encoding/decoding performance
  8 | """
  9 | 
 10 | import json
 11 | import sys
 12 | import time
 13 | from pathlib import Path
 14 | 
 15 | # Add parent directory to path for imports
 16 | sys.path.insert(0, str(Path(__file__).parent.parent))
 17 | 
 18 | import tiktoken
 19 | from toon import encode, decode
 20 | 
 21 | 
 22 | def count_tokens(text: str, model: str = "gpt-4") -> int:
 23 |     """Count tokens using tiktoken."""
 24 |     encoding = tiktoken.encoding_for_model(model)
 25 |     return len(encoding.encode(text))
 26 | 
 27 | 
 28 | def format_size(size_bytes: int) -> str:
 29 |     """Format bytes in human-readable format."""
 30 |     if size_bytes < 1024:
 31 |         return f"{size_bytes}B"
 32 |     elif size_bytes < 1024 * 1024:
 33 |         return f"{size_bytes / 1024:.2f}KB"
 34 |     else:
 35 |         return f"{size_bytes / (1024 * 1024):.2f}MB"
 36 | 
 37 | 
 38 | def calculate_savings(original: int, compressed: int) -> float:
 39 |     """Calculate percentage savings."""
 40 |     if original == 0:
 41 |         return 0.0
 42 |     return ((original - compressed) / original) * 100
 43 | 
 44 | 
 45 | def print_header(title: str, width: int = 80):
 46 |     """Print a fancy header."""
 47 |     print("\n" + "=" * width)
 48 |     print(f"{title:^{width}}")
 49 |     print("=" * width)
 50 | 
 51 | 
 52 | def print_savings_bar(label: str, percentage: float, width: int = 50):
 53 |     """Print a visual savings bar."""
 54 |     filled = int((percentage / 100) * width)
 55 |     bar = "█" * filled + "░" * (width - filled)
 56 |     print(f"  {label:<20} {bar} {percentage:>6.1f}%")
 57 | 
 58 | 
 59 | def benchmark_dataset(name: str, data: dict, iterations: int = 1000, verbose: bool = True) -> dict:
 60 |     """Benchmark a single dataset."""
 61 |     # Generate JSON
 62 |     json_str = json.dumps(data, indent=2)
 63 |     json_size = len(json_str.encode('utf-8'))
 64 |     json_tokens = count_tokens(json_str)
 65 | 
 66 |     # Generate TOON
 67 |     toon_str = encode(data)
 68 |     toon_size = len(toon_str.encode('utf-8'))
 69 |     toon_tokens = count_tokens(toon_str)
 70 | 
 71 |     # Calculate savings
 72 |     size_savings = calculate_savings(json_size, toon_size)
 73 |     token_savings = calculate_savings(json_tokens, toon_tokens)
 74 | 
 75 |     if verbose:
 76 |         print(f"\n{'─'*80}")
 77 |         print(f"📊 {name}")
 78 |         print(f"{'─'*80}")
 79 | 
 80 |         # Size comparison
 81 |         print(f"\n  SIZE:   JSON {format_size(json_size):>10}  →  TOON {format_size(toon_size):>10}  "
 82 |               f"({'-' if size_savings > 0 else '+'}{abs(size_savings):.1f}%)")
 83 |         print(f"  TOKENS: JSON {json_tokens:>6} tokens  →  TOON {toon_tokens:>6} tokens  "
 84 |               f"({'-' if token_savings > 0 else '+'}{abs(token_savings):.1f}%)")
 85 | 
 86 |     # Benchmark encoding performance (reduced iterations for speed)
 87 |     start = time.perf_counter()
 88 |     for _ in range(min(iterations, 100)):
 89 |         encode(data)
 90 |     toon_encode_time = (time.perf_counter() - start) / min(iterations, 100) * 1000
 91 | 
 92 |     start = time.perf_counter()
 93 |     for _ in range(min(iterations, 100)):
 94 |         json.dumps(data)
 95 |     json_encode_time = (time.perf_counter() - start) / min(iterations, 100) * 1000
 96 | 
 97 |     # Benchmark decoding performance
 98 |     start = time.perf_counter()
 99 |     for _ in range(min(iterations, 100)):
100 |         decode(toon_str)
101 |     toon_decode_time = (time.perf_counter() - start) / min(iterations, 100) * 1000
102 | 
103 |     start = time.perf_counter()
104 |     for _ in range(min(iterations, 100)):
105 |         json.loads(json_str)
106 |     json_decode_time = (time.perf_counter() - start) / min(iterations, 100) * 1000
107 | 
108 |     return {
109 |         'name': name,
110 |         'json_size': json_size,
111 |         'toon_size': toon_size,
112 |         'size_savings': size_savings,
113 |         'json_tokens': json_tokens,
114 |         'toon_tokens': toon_tokens,
115 |         'token_savings': token_savings,
116 |         'json_encode_time': json_encode_time,
117 |         'toon_encode_time': toon_encode_time,
118 |         'json_decode_time': json_decode_time,
119 |         'toon_decode_time': toon_decode_time,
120 |     }
121 | 
122 | 
def main():
    """Run all benchmarks and print the full report.

    Benchmarks every entry of ``sample_datasets.DATASETS``: the first ten are
    reported verbosely, the rest as one summary line each. Afterwards prints
    aggregate size/token totals, savings bars, an example cost calculation,
    the best-performing datasets and a grouped summary.

    All dataset counts in the output are derived from ``DATASETS`` so the
    report stays correct when datasets are added or removed. If ``DATASETS``
    is empty the function prints a notice and returns, because the aggregate
    section needs at least one result.
    """
    from sample_datasets import DATASETS

    total_datasets = len(DATASETS)
    if total_datasets == 0:
        # max() and the per-request averages below are undefined without results.
        print("No datasets found in sample_datasets.DATASETS; nothing to benchmark.")
        return

    # Number of datasets that get the verbose, multi-line report.
    detailed_limit = 10
    detailed_count = min(detailed_limit, total_datasets)

    print_header("🚀 TOON vs JSON: THE ULTIMATE SHOWDOWN 🚀", 80)
    print("\n" + " " * 10 + f"Testing across {total_datasets} diverse, real-world datasets")
    print(" " * 10 + "Measuring size, tokens, and performance\n")

    results = []

    # Show detailed output for the first datasets
    print("\n" + "▼" * 80)
    print(f"DETAILED RESULTS (First {detailed_count} Datasets)")
    print("▼" * 80)

    for i, (dataset_name, dataset) in enumerate(DATASETS.items()):
        if i < detailed_limit:
            result = benchmark_dataset(dataset_name, dataset, verbose=True)
        else:
            # Silent benchmarking for remaining datasets
            if i == detailed_limit:
                print("\n" + "⚡" * 80)
                print(f"  Processing remaining {total_datasets - detailed_limit} datasets...")
                print("⚡" * 80)
            result = benchmark_dataset(dataset_name, dataset, verbose=False)
            print(f"  ✓ {dataset_name:<50} ({result['size_savings']:.1f}% size, {result['token_savings']:.1f}% tokens)")
        results.append(result)

    # Calculate aggregate statistics
    total_json_size = sum(r['json_size'] for r in results)
    total_toon_size = sum(r['toon_size'] for r in results)
    total_json_tokens = sum(r['json_tokens'] for r in results)
    total_toon_tokens = sum(r['toon_tokens'] for r in results)

    # Savings are computed over the summed totals, not as a mean of per-dataset percentages.
    avg_size_savings = calculate_savings(total_json_size, total_toon_size)
    avg_token_savings = calculate_savings(total_json_tokens, total_toon_tokens)

    total_size_saved = total_json_size - total_toon_size
    total_tokens_saved = total_json_tokens - total_toon_tokens

    # Best performers
    best_size = max(results, key=lambda x: x['size_savings'])
    best_tokens = max(results, key=lambda x: x['token_savings'])

    # Print epic summary
    print_header(f"📈 AGGREGATE RESULTS ACROSS ALL {total_datasets} DATASETS 📈", 80)

    print(f"\n{'┌' + '─'*78 + '┐'}")
    print(f"│{'TOTAL DATA SIZE':^78}│")
    print(f"│{' '*78}│")
    print(f"│  JSON:     {format_size(total_json_size):>15}  ({total_json_size:,} bytes){' '*(32-len(str(total_json_size)))}│")
    print(f"│  TOON:     {format_size(total_toon_size):>15}  ({total_toon_size:,} bytes){' '*(32-len(str(total_toon_size)))}│")
    print(f"│  SAVED:    {format_size(total_size_saved):>15}  (⬇ {avg_size_savings:.1f}%){' '*(41-len(f'{avg_size_savings:.1f}'))}│")
    print(f"{'└' + '─'*78 + '┘'}")

    print(f"\n{'┌' + '─'*78 + '┐'}")
    print(f"│{'TOTAL TOKEN COUNT (GPT-4)':^78}│")
    print(f"│{' '*78}│")
    print(f"│  JSON:     {total_json_tokens:>10,} tokens{' '*(50-len(f'{total_json_tokens:,}'))}│")
    print(f"│  TOON:     {total_toon_tokens:>10,} tokens{' '*(50-len(f'{total_toon_tokens:,}'))}│")
    print(f"│  SAVED:    {total_tokens_saved:>10,} tokens  (⬇ {avg_token_savings:.1f}%){' '*(39-len(f'{total_tokens_saved:,}')-len(f'{avg_token_savings:.1f}'))}│")
    print(f"{'└' + '─'*78 + '┘'}")

    # Visual savings bars
    print("\n" + "━" * 80)
    print("  MEMORY & TOKEN SAVINGS VISUALIZATION")
    print("━" * 80)
    print_savings_bar("Size Reduction", avg_size_savings)
    print_savings_bar("Token Reduction", avg_token_savings)

    # Cost analysis
    print("\n" + "╔" + "═"*78 + "╗")
    print("║" + "💰 REAL-WORLD COST IMPACT FOR LLM APIs 💰".center(78) + "║")
    print("╠" + "═"*78 + "╣")

    # Calculate costs at different price points
    cost_per_1m_tokens = 10.00  # Example: $10 per 1M tokens (GPT-4 pricing)

    # One "request" is modeled as one average-sized dataset.
    json_cost_per_request = (total_json_tokens / len(results)) * (cost_per_1m_tokens / 1_000_000)
    toon_cost_per_request = (total_toon_tokens / len(results)) * (cost_per_1m_tokens / 1_000_000)
    savings_per_request = json_cost_per_request - toon_cost_per_request

    print(f"║  At ${cost_per_1m_tokens}/1M tokens (typical GPT-4 pricing):".ljust(79) + "║")
    print(f"║".ljust(79) + "║")
    print(f"║    • Average JSON request:  ${json_cost_per_request:.6f}".ljust(79) + "║")
    print(f"║    • Average TOON request:  ${toon_cost_per_request:.6f}".ljust(79) + "║")
    print(f"║    • Savings per request:   ${savings_per_request:.6f}  ({avg_token_savings:.1f}% less!)".ljust(79) + "║")
    print(f"║".ljust(79) + "║")
    print(f"║  ANNUAL SAVINGS (at 1M requests/year):".ljust(79) + "║")
    print(f"║    💵 ${savings_per_request * 1_000_000:>15,.2f}".ljust(79) + "║")
    print(f"║".ljust(79) + "║")
    print(f"║  For 1 BILLION tokens (e.g., high-volume API):".ljust(79) + "║")
    # 1 billion tokens = 1000 x 1M tokens, hence the factor of 1000.
    print(f"║    💵 ${(avg_token_savings / 100) * cost_per_1m_tokens * 1000:>15,.2f} saved!".ljust(79) + "║")
    print("╚" + "═"*78 + "╝")

    # Champion datasets
    print("\n" + "🏆" * 80)
    print("  CHAMPION PERFORMERS")
    print("🏆" * 80)
    print(f"\n  🥇 Best Size Savings:  {best_size['name']}")
    print(f"     {best_size['size_savings']:.1f}% reduction  ({format_size(best_size['json_size'])} → {format_size(best_size['toon_size'])})")
    print(f"\n  🥇 Best Token Savings: {best_tokens['name']}")
    print(f"     {best_tokens['token_savings']:.1f}% reduction  ({best_tokens['json_tokens']} → {best_tokens['toon_tokens']} tokens)")

    # Summary table
    print("\n" + "╔" + "═"*78 + "╗")
    print("║" + " DATASET PERFORMANCE SUMMARY ".center(78) + "║")
    print("╠" + "═"*78 + "╣")

    # Group by performance (size savings thresholds: >=60, 40-60, <40)
    excellent = [r for r in results if r['size_savings'] >= 60]
    good = [r for r in results if 40 <= r['size_savings'] < 60]
    moderate = [r for r in results if r['size_savings'] < 40]

    print(f"║  🔥 Excellent (≥60% savings):  {len(excellent):>2} datasets".ljust(79) + "║")
    print(f"║  ✅ Good (40-60% savings):     {len(good):>2} datasets".ljust(79) + "║")
    print(f"║  📊 Moderate (<40% savings):   {len(moderate):>2} datasets".ljust(79) + "║")
    print("╚" + "═"*78 + "╝")

    # Final verdict
    print("\n" + "⭐" * 80)
    print_header("⚡ THE VERDICT ⚡")
    print("\n  TOON FORMAT DELIVERS:")
    print(f"    • {avg_size_savings:.1f}% SMALLER file sizes")
    print(f"    • {avg_token_savings:.1f}% FEWER tokens for LLM APIs")
    print(f"    • {format_size(total_size_saved)} TOTAL memory saved across {total_datasets} datasets")
    print(f"    • {total_tokens_saved:,} TOTAL tokens saved")
    print(f"    • 💰 MASSIVE cost savings for high-volume applications")
    print("\n  Perfect for:")
    print("    ✓ LLM API calls (reduce token costs)")
    print("    ✓ Database exports (tabular data)")
    print("    ✓ Analytics & metrics")
    print("    ✓ E-commerce & inventory")
    print("    ✓ Any structured, uniform data")
    print("\n" + "⭐" * 80)

    print("\n✅ Benchmark completed successfully!\n")
260 | 
261 | 
# Run the benchmark suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
264 | 


--------------------------------------------------------------------------------
/assets/README.ko.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <img src="toonify.png" alt="Toonify Logo" width="400">
  3 | </p>
  4 | 
  5 | # TOON (Token-Oriented Object Notation)
  6 | 
  7 | [English](../README.md) | [中文](README.zh-CN.md) | [한국어](README.ko.md)
  8 | 
  9 | 구조화된 데이터를 대규모 언어 모델에 전달할 때 토큰 사용량을 크게 줄이도록 설계된 간결하고 사람이 읽을 수 있는 직렬화 형식입니다.
 10 | 
 11 | [![Python Version](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
 12 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 13 | 
 14 | ## 개요
 15 | 
 16 | TOON은 **CSV 수준의 간결함**을 달성하면서 **명시적인 구조**를 추가하여 다음과 같은 용도에 이상적입니다:
 17 | - LLM API 호출 시 토큰 비용 절감
 18 | - 컨텍스트 윈도우 효율성 향상
 19 | - 사람이 읽을 수 있는 형식 유지
 20 | - 데이터 구조 및 타입 보존
 21 | 
 22 | ### 주요 기능
 23 | 
 24 | - ✅ **간결함**: 평균적으로 JSON보다 **64% 작음** (50개 데이터셋 테스트 결과)
 25 | - ✅ **가독성**: 깔끔하고 들여쓰기 기반의 구문
 26 | - ✅ **구조화**: 중첩된 객체와 배열 보존
 27 | - ✅ **타입 안전성**: 문자열, 숫자, 불리언, null 지원
 28 | - ✅ **유연성**: 다양한 구분자 옵션 (쉼표, 탭, 파이프)
 29 | - ✅ **스마트**: 균일한 배열을 위한 자동 테이블 형식
 30 | - ✅ **효율성**: 깊게 중첩된 객체를 위한 키 폴딩
 31 | 
 32 | ## 설치
 33 | 
 34 | ```bash
 35 | pip install toonify
 36 | ```
 37 | 
 38 | 개발 환경:
 39 | ```bash
 40 | pip install toonify[dev]
 41 | ```
 42 | 
 43 | Pydantic 지원:
 44 | ```bash
 45 | pip install toonify[pydantic]
 46 | ```
 47 | 
 48 | ## 빠른 시작
 49 | 
 50 | ### Python API
 51 | 
 52 | ```python
 53 | from toon import encode, decode
 54 | 
 55 | # Python dict를 TOON으로 인코딩
 56 | data = {
 57 |     'products': [
 58 |         {'sku': 'LAP-001', 'name': 'Gaming Laptop', 'price': 1299.99},
 59 |         {'sku': 'MOU-042', 'name': 'Wireless Mouse', 'price': 29.99}
 60 |     ]
 61 | }
 62 | 
 63 | toon_string = encode(data)
 64 | print(toon_string)
 65 | # 출력:
 66 | # products[2]{sku,name,price}:
 67 | #   LAP-001,Gaming Laptop,1299.99
 68 | #   MOU-042,Wireless Mouse,29.99
 69 | 
 70 | # TOON을 다시 Python으로 디코딩
 71 | result = decode(toon_string)
 72 | assert result == data
 73 | ```
 74 | 
 75 | ### 명령줄
 76 | 
 77 | ```bash
 78 | # JSON을 TOON으로 인코딩
 79 | toon input.json -o output.toon
 80 | 
 81 | # TOON을 JSON으로 디코딩
 82 | toon input.toon -o output.json
 83 | 
 84 | # 파이프 사용
 85 | cat data.json | toon -e > data.toon
 86 | 
 87 | # 토큰 통계 표시
 88 | toon data.json --stats
 89 | ```
 90 | 
 91 | ### Pydantic 통합
 92 | 
 93 | TOON은 Pydantic 모델에서 직접 변환을 지원합니다:
 94 | 
 95 | ```python
 96 | from pydantic import BaseModel
 97 | from toon import encode_pydantic, decode_to_pydantic
 98 | 
 99 | # Pydantic 모델 정의
100 | class User(BaseModel):
101 |     id: int
102 |     name: str
103 |     email: str
104 | 
105 | # Pydantic 모델을 TOON으로 인코딩
106 | users = [
107 |     User(id=1, name='Alice', email='alice@example.com'),
108 |     User(id=2, name='Bob', email='bob@example.com')
109 | ]
110 | 
111 | toon = encode_pydantic(users)
112 | print(toon)
113 | # 출력:
114 | # [2]{id,name,email}:
115 | #   1,Alice,alice@example.com
116 | #   2,Bob,bob@example.com
117 | 
118 | # TOON을 다시 Pydantic 모델로 디코딩
119 | decoded_users = decode_to_pydantic(toon, User)
120 | assert all(isinstance(u, User) for u in decoded_users)
121 | ```
122 | 
123 | **기능:**
124 | - ✅ Pydantic 모델에서 직접 변환 (v1 및 v2)
125 | - ✅ 중첩된 모델 지원
126 | - ✅ 설정되지 않은 값, None 또는 기본값 제외
127 | - ✅ 필드 별칭 지원
128 | - ✅ 디코딩 시 전체 검증
129 | - ✅ 왕복 변환
130 | 
131 | 자세한 예제는 [examples/pydantic_usage.py](../examples/pydantic_usage.py)를 참조하세요.
132 | 
133 | ## TOON 형식 사양
134 | 
135 | ### 기본 구문
136 | 
137 | ```toon
138 | # 간단한 키-값 쌍
139 | title: Machine Learning Basics
140 | chapters: 12
141 | published: true
142 | ```
143 | 
144 | ### 배열
145 | 
146 | **기본 배열** (인라인):
147 | ```toon
148 | temperatures: [72.5,68.3,75.1,70.8,73.2]
149 | categories: [electronics,computers,accessories]
150 | ```
151 | 
152 | **테이블 배열** (헤더가 있는 균일한 객체):
153 | ```toon
154 | inventory[3]{sku,product,stock}:
155 |   KB-789,Mechanical Keyboard,45
156 |   MS-456,RGB Mouse Pad,128
157 |   HD-234,USB Headset,67
158 | ```
159 | 
160 | **리스트 배열** (불균일하거나 중첩된):
161 | ```toon
162 | tasks[2]:
163 |   Complete documentation
164 |   Review pull requests
165 | ```
166 | 
167 | ### 중첩 객체
168 | 
169 | ```toon
170 | server:
171 |   hostname: api-prod-01
172 |   config:
173 |     port: 8080
174 |     region: us-east
175 | ```
176 | 
177 | ### 따옴표 규칙
178 | 
179 | 문자열은 필요한 경우에만 따옴표로 묶습니다:
180 | - 특수 문자 포함 (`,`, `:`, `"`, 줄바꿈)
181 | - 앞/뒤 공백 있음
182 | - 리터럴처럼 보임 (`true`, `false`, `null`)
183 | - 비어있음
184 | 
185 | ```toon
186 | simple: ProductName
187 | quoted: "Product, Description"
188 | escaped: "Size: 15\" display"
189 | multiline: "First feature\nSecond feature"
190 | ```
191 | 
192 | ## API 레퍼런스
193 | 
194 | ### `encode(data, options=None)`
195 | 
196 | Python 객체를 TOON 문자열로 변환합니다.
197 | 
198 | **매개변수:**
199 | - `data`: Python dict 또는 list
200 | - `options`: 선택적 dict:
201 |   - `delimiter`: `'comma'` (기본값), `'tab'`, 또는 `'pipe'`
202 |   - `indent`: 레벨당 공백 수 (기본값: 2)
203 |   - `key_folding`: `'off'` (기본값) 또는 `'safe'`
204 |   - `flatten_depth`: 키 폴딩의 최대 깊이 (기본값: None)
205 | 
206 | **예제:**
207 | ```python
208 | toon = encode(data, {
209 |     'delimiter': 'tab',
210 |     'indent': 4,
211 |     'key_folding': 'safe'
212 | })
213 | ```
214 | 
215 | ### `decode(toon_string, options=None)`
216 | 
217 | TOON 문자열을 Python 객체로 변환합니다.
218 | 
219 | **매개변수:**
220 | - `toon_string`: TOON 형식 문자열
221 | - `options`: 선택적 dict:
222 |   - `strict`: 구조를 엄격하게 검증 (기본값: True)
223 |   - `expand_paths`: `'off'` (기본값) 또는 `'safe'`
224 |   - `default_delimiter`: 기본 구분자 (기본값: `','`)
225 | 
226 | **예제:**
227 | ```python
228 | data = decode(toon_string, {
229 |     'expand_paths': 'safe',
230 |     'strict': False
231 | })
232 | ```
233 | 
234 | ### `encode_pydantic(model, options=None, exclude_unset=False, exclude_none=False, exclude_defaults=False, by_alias=False)`
235 | 
236 | Pydantic 모델을 TOON 문자열로 변환합니다.
237 | 
238 | **매개변수:**
239 | - `model`: Pydantic 모델 인스턴스 또는 모델 인스턴스 리스트
240 | - `options`: `encode()` 함수와 동일
241 | - `exclude_unset`: True인 경우 명시적으로 설정되지 않은 필드 제외
242 | - `exclude_none`: True인 경우 None 값을 가진 필드 제외
243 | - `exclude_defaults`: True인 경우 기본값을 가진 필드 제외
244 | - `by_alias`: True인 경우 필드 이름 대신 필드 별칭 사용
245 | 
246 | **예제:**
247 | ```python
248 | from pydantic import BaseModel
249 | from toon import encode_pydantic
250 | 
251 | class User(BaseModel):
252 |     id: int
253 |     name: str
254 |     email: str | None = None  # Python 3.10+ 문법 (3.8/3.9에서는 Optional[str] 사용)
255 | 
256 | user = User(id=1, name='Alice')
257 | toon = encode_pydantic(user, exclude_none=True)
258 | ```
259 | 
260 | ### `decode_to_pydantic(toon_string, model_class, options=None)`
261 | 
262 | TOON 문자열을 Pydantic 모델로 디코딩합니다.
263 | 
264 | **매개변수:**
265 | - `toon_string`: TOON 형식 문자열
266 | - `model_class`: 인스턴스화할 Pydantic 모델 클래스
267 | - `options`: `decode()` 함수와 동일
268 | 
269 | **반환값:**
270 | - Pydantic 모델 인스턴스 또는 인스턴스 리스트 (입력에 따라 다름)
271 | 
272 | **예제:**
273 | ```python
274 | from pydantic import BaseModel
275 | from toon import decode_to_pydantic
276 | 
277 | class User(BaseModel):
278 |     id: int
279 |     name: str
280 | 
281 | toon = "id: 1\nname: Alice"
282 | user = decode_to_pydantic(toon, User)
283 | ```
284 | 
285 | ## CLI 사용법
286 | 
287 | ```
288 | usage: toon [-h] [-o OUTPUT] [-e] [-d] [--delimiter {comma,tab,pipe}]
289 |             [--indent INDENT] [--stats] [--no-strict]
290 |             [--key-folding {off,safe}] [--flatten-depth DEPTH]
291 |             [--expand-paths {off,safe}]
292 |             [input]
293 | 
294 | TOON (Token-Oriented Object Notation) - JSON과 TOON 형식 간 변환
295 | 
296 | positional arguments:
297 |   input                 입력 파일 경로 (또는 stdin의 경우 "-")
298 | 
299 | optional arguments:
300 |   -h, --help            도움말 메시지 표시
301 |   -o, --output OUTPUT   출력 파일 경로 (기본값: stdout)
302 |   -e, --encode          인코딩 모드 강제 (JSON에서 TOON으로)
303 |   -d, --decode          디코딩 모드 강제 (TOON에서 JSON으로)
304 |   --delimiter {comma,tab,pipe}
305 |                         배열 구분자 (기본값: comma)
306 |   --indent INDENT       들여쓰기 크기 (기본값: 2)
307 |   --stats               토큰 통계 표시
308 |   --no-strict           엄격한 검증 비활성화 (디코딩만)
309 |   --key-folding {off,safe}
310 |                         키 폴딩 모드 (인코딩만)
311 |   --flatten-depth DEPTH 최대 키 폴딩 깊이 (인코딩만)
312 |   --expand-paths {off,safe}
313 |                         경로 확장 모드 (디코딩만)
314 | ```
315 | 
316 | ## 고급 기능
317 | 
318 | ### 키 폴딩
319 | 
320 | 단일 키 체인을 점으로 구분된 경로로 축소합니다:
321 | 
322 | ```python
323 | data = {
324 |     'api': {
325 |         'response': {
326 |             'product': {
327 |                 'title': 'Wireless Keyboard'
328 |             }
329 |         }
330 |     }
331 | }
332 | 
333 | # key_folding='safe' 사용
334 | toon = encode(data, {'key_folding': 'safe'})
335 | # 출력: api.response.product.title: Wireless Keyboard
336 | ```
337 | 
338 | ### 경로 확장
339 | 
340 | 점으로 구분된 키를 중첩 객체로 확장합니다:
341 | 
342 | ```python
343 | toon = 'store.location.zipcode: 10001'
344 | 
345 | # expand_paths='safe' 사용
346 | data = decode(toon, {'expand_paths': 'safe'})
347 | # 결과: {'store': {'location': {'zipcode': 10001}}}
348 | ```
349 | 
350 | ### 사용자 정의 구분자
351 | 
352 | 데이터에 가장 적합한 구분자를 선택하세요:
353 | 
354 | ```python
355 | # 탭 구분자 (스프레드시트 같은 데이터에 더 좋음)
356 | toon = encode(data, {'delimiter': 'tab'})
357 | 
358 | # 파이프 구분자 (데이터에 쉼표가 포함된 경우)
359 | toon = encode(data, {'delimiter': 'pipe'})
360 | ```
361 | 
362 | ## 형식 비교
363 | 
364 | ### JSON vs TOON
365 | 
366 | **JSON** (247 바이트):
367 | ```json
368 | {
369 |   "products": [
370 |     {"id": 101, "name": "Laptop Pro", "price": 1299},
371 |     {"id": 102, "name": "Magic Mouse", "price": 79},
372 |     {"id": 103, "name": "USB-C Cable", "price": 19}
373 |   ]
374 | }
375 | ```
376 | 
377 | **TOON** (98 바이트, **60% 감소**):
378 | ```toon
379 | products[3]{id,name,price}:
380 |   101,Laptop Pro,1299
381 |   102,Magic Mouse,79
382 |   103,USB-C Cable,19
383 | ```
384 | 
385 | ### TOON 사용 시기
386 | 
387 | **TOON 사용:**
388 | - ✅ LLM API에 데이터 전달 시 (토큰 비용 절감)
389 | - ✅ 균일한 테이블 데이터 작업
390 | - ✅ 컨텍스트 윈도우가 제한적일 때
391 | - ✅ 사람이 읽을 수 있어야 할 때
392 | 
393 | **JSON 사용:**
394 | - ❌ 최대 호환성이 필요할 때
395 | - ❌ 데이터가 매우 불규칙하거나 중첩될 때
396 | - ❌ 기존 JSON 전용 도구와 작업할 때
397 | 
398 | ## 개발
399 | 
400 | ### 설정
401 | 
402 | ```bash
403 | git clone https://github.com/ScrapeGraphAI/toonify.git
404 | cd toonify
405 | pip install -e .[dev]
406 | ```
407 | 
408 | ### 테스트 실행
409 | 
410 | ```bash
411 | pytest
412 | pytest --cov=toon --cov-report=term-missing
413 | ```
414 | 
415 | ### 예제 실행
416 | 
417 | ```bash
418 | python examples/basic_usage.py
419 | python examples/advanced_features.py
420 | ```
421 | 
422 | ## 성능
423 | 
424 | **50개의 다양한 실제 데이터셋에서 벤치마크 테스트:**
425 | 
426 | - 구조화된 데이터의 경우 JSON 대비 **평균 63.9% 크기 감소**
427 | - **평균 54.1% 토큰 감소** (LLM API 비용 직접 절감)
428 | - 최적 사용 사례에서 **최대 73.4% 절감** (테이블 데이터, 설문조사, 분석)
429 | - **98%의 데이터셋에서 40% 이상 절감 달성**
430 | - 인코딩/디코딩 시 **최소 오버헤드** (일반적인 페이로드의 경우 <1ms)
431 | 
432 | **💰 비용 영향:** GPT-4 가격 기준으로, TOON은 백만 건의 API 요청당 **$2,147 절감**, 10억 토큰당 **$5,408 절감**.
433 | 
434 | **[📊 전체 벤치마크 결과 보기 →](../benchmark/RESULTS.md)**
435 | 
436 | ## 기여
437 | 
438 | 기여를 환영합니다! 다음 단계를 따라주세요:
439 | 
440 | 1. 저장소 포크
441 | 2. 기능 브랜치 생성 (`git checkout -b feature/amazing-feature`)
442 | 3. 테스트와 함께 변경 사항 작성
443 | 4. 테스트 실행 (`pytest`)
444 | 5. 변경 사항 커밋 (`git commit -m 'Add amazing feature'`)
445 | 6. 브랜치에 푸시 (`git push origin feature/amazing-feature`)
446 | 7. Pull Request 열기
447 | 
448 | ## 라이선스
449 | 
450 | MIT 라이선스 - 자세한 내용은 [LICENSE](../LICENSE) 파일을 참조하세요.
451 | 
452 | ## 크레딧
453 | 
454 | Python 구현은 [toon-format/toon](https://github.com/toon-format/toon)의 TypeScript TOON 라이브러리에서 영감을 받았습니다.
455 | 
456 | ## 링크
457 | 
458 | - **GitHub**: https://github.com/ScrapeGraphAI/toonify
459 | - **PyPI**: https://pypi.org/project/toonify/
460 | - **문서**: https://github.com/ScrapeGraphAI/toonify#readme
461 | - **형식 사양**: https://github.com/toon-format/toon
462 | 
463 | ---
464 | 
465 | [ScrapeGraph 팀](https://scrapegraphai.com)이 ❤️으로 만들었습니다
466 | 
467 | <p align="center">
468 |   <img src="https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/assets/scrapegraphai_logo.png" alt="ScrapeGraphAI Logo" width="250">
469 | </p>
470 | 
471 | 


--------------------------------------------------------------------------------
/tests/test_roundtrip.py:
--------------------------------------------------------------------------------
  1 | """Tests for round-trip encoding and decoding."""
  2 | import pytest
  3 | from toon import encode, decode
  4 | 
  5 | 
  6 | def test_roundtrip_simple_object():
  7 |     """Test round-trip of simple object."""
  8 |     original = {
  9 |         'name': 'Alice',
 10 |         'age': 30,
 11 |         'active': True
 12 |     }
 13 |     
 14 |     toon = encode(original)
 15 |     result = decode(toon)
 16 |     
 17 |     assert result == original
 18 | 
 19 | 
 20 | def test_roundtrip_nested_object():
 21 |     """Test round-trip of nested object."""
 22 |     original = {
 23 |         'user': {
 24 |             'name': 'Alice',
 25 |             'profile': {
 26 |                 'age': 30,
 27 |                 'city': 'NYC'
 28 |             }
 29 |         }
 30 |     }
 31 |     
 32 |     toon = encode(original)
 33 |     result = decode(toon)
 34 |     
 35 |     assert result == original
 36 | 
 37 | 
 38 | def test_roundtrip_primitive_array():
 39 |     """Test round-trip of primitive arrays."""
 40 |     original = {
 41 |         'numbers': [1, 2, 3, 4, 5],
 42 |         'names': ['Alice', 'Bob', 'Charlie'],
 43 |         'mixed': [1, 'text', True, None]
 44 |     }
 45 |     
 46 |     toon = encode(original)
 47 |     result = decode(toon)
 48 |     
 49 |     assert result == original
 50 | 
 51 | 
 52 | def test_roundtrip_tabular_array():
 53 |     """Test round-trip of tabular array."""
 54 |     original = {
 55 |         'users': [
 56 |             {'id': 1, 'name': 'Alice', 'role': 'admin'},
 57 |             {'id': 2, 'name': 'Bob', 'role': 'user'},
 58 |             {'id': 3, 'name': 'Charlie', 'role': 'guest'}
 59 |         ]
 60 |     }
 61 |     
 62 |     toon = encode(original)
 63 |     result = decode(toon)
 64 |     
 65 |     assert result == original
 66 | 
 67 | 
 68 | def test_roundtrip_empty_structures():
 69 |     """Test round-trip of empty structures."""
 70 |     original = {
 71 |         'empty_object': {},
 72 |         'empty_array': [],
 73 |         'nested': {
 74 |             'also_empty': {}
 75 |         }
 76 |     }
 77 |     
 78 |     toon = encode(original)
 79 |     result = decode(toon)
 80 |     
 81 |     assert result == original
 82 | 
 83 | 
 84 | def test_roundtrip_special_strings():
 85 |     """Test round-trip of strings requiring quotes."""
 86 |     original = {
 87 |         'comma': 'hello, world',
 88 |         'colon': 'key: value',
 89 |         'quote': 'He said "hello"',
 90 |         'newline': 'line1\nline2',
 91 |         'spaces': '  padded  ',
 92 |         'looks_like_bool': 'true',
 93 |         'looks_like_null': 'null'
 94 |     }
 95 |     
 96 |     toon = encode(original)
 97 |     result = decode(toon)
 98 |     
 99 |     assert result == original
100 | 
101 | 
def test_roundtrip_complex_structure():
    """A document mixing scalars, a primitive array, a tabular array and nested objects survives encode -> decode."""
    users = [
        {'id': 1, 'name': 'Alice', 'active': True},
        {'id': 2, 'name': 'Bob', 'active': False},
    ]
    metadata = {
        'created': '2024-01-01',
        'author': 'TOON Contributors',
        'stats': {'files': 10, 'lines': 1000},
    }
    payload = {
        'project': 'TOON',
        'version': '1.0.0',
        'description': 'A token-efficient format',
        'features': ['compact', 'readable', 'structured'],
        'users': users,
        'metadata': metadata,
    }

    restored = decode(encode(payload))

    assert restored == payload
127 | 
128 | 
def test_roundtrip_with_delimiters():
    """Data encoded with the tab or pipe delimiter decodes back to the input."""
    original = {
        'values': [1, 2, 3],
        'users': [
            {'id': 1, 'name': 'Alice'},
            {'id': 2, 'name': 'Bob'},
        ],
    }

    # Same check for each non-default delimiter, tab first and then pipe.
    for delimiter_name in ('tab', 'pipe'):
        encoded = encode(original, {'delimiter': delimiter_name})
        assert decode(encoded) == original
148 | 
149 | 
def test_roundtrip_key_folding_and_expansion():
    """Encoding with safe key folding and decoding with safe path expansion
    restores the original nesting."""
    nested = {'data': {'metadata': {'items': [1, 2, 3]}}}

    folded = encode(nested, {'key_folding': 'safe'})
    expanded = decode(folded, {'expand_paths': 'safe'})

    assert expanded == nested
167 | 
168 | 
def test_roundtrip_multiple_iterations():
    """Three consecutive encode -> decode cycles yield the same data and the
    same TOON text every time."""
    seed = {
        'users': [
            {'id': 1, 'name': 'Alice'},
            {'id': 2, 'name': 'Bob'},
        ],
        'count': 2,
    }

    # Feed each cycle's decoded output into the next cycle.
    encoded_forms = []
    decoded_forms = []
    current = seed
    for _ in range(3):
        text = encode(current)
        current = decode(text)
        encoded_forms.append(text)
        decoded_forms.append(current)

    # Every decoded result equals the seed, and the text never drifts.
    assert decoded_forms[0] == seed
    assert decoded_forms[1] == seed
    assert decoded_forms[2] == seed
    assert encoded_forms[0] == encoded_forms[1] == encoded_forms[2]
196 | 
197 | 
def test_roundtrip_delimiter_indicators():
    """Tabular headers carry a delimiter indicator for tab and pipe, none for
    comma, and every variant decodes back to the input."""
    original = {
        'tab_data': [
            {'id': 1, 'value': 'A'},
            {'id': 2, 'value': 'B'},
        ],
        'pipe_data': [
            {'code': 'X', 'qty': 10},
            {'code': 'Y', 'qty': 20},
        ],
    }

    # Tab: the indicator appears inside the length brackets.
    with_tabs = encode(original, {'delimiter': 'tab'})
    assert '[2\t]{id,value}:' in with_tabs
    assert decode(with_tabs) == original

    # Pipe: at least one of the two tabular headers carries the indicator.
    with_pipes = encode(original, {'delimiter': 'pipe'})
    assert '[2|]{id,value}:' in with_pipes or '[2|]{code,qty}:' in with_pipes
    assert decode(with_pipes) == original

    # Comma: plain header, and no trace of the other two indicators.
    with_commas = encode(original, {'delimiter': 'comma'})
    assert '[2]{id,value}:' in with_commas
    assert '\t' not in with_commas
    assert '[2|]' not in with_commas
    assert decode(with_commas) == original
230 | 
231 | 
def test_roundtrip_list_array_with_dashes():
    """Mixed and non-uniform arrays are written with dash markers and survive
    two consecutive round-trips."""
    original = {
        'mixed': ['text', 42, {'nested': 'object'}, True],
        # The second object has an extra field, so the array is non-uniform.
        'list_with_objects': [
            {'id': 1},
            {'id': 2, 'name': 'extra field'},
        ],
    }

    first_text = encode(original)

    # Each element of the mixed array shows up behind a dash marker.
    for marker in ('- text', '- 42', '- nested: object', '- true'):
        assert marker in first_text

    first_result = decode(first_text)
    assert first_result == original

    # A second pass over the decoded data must still match the input.
    second_result = decode(encode(first_result))
    assert second_result == original
266 | 
267 | 
def test_roundtrip_datetime_objects():
    """datetime/date values are written as ISO strings, decode as plain
    strings, and re-encode to the same TOON text."""
    from datetime import datetime, date

    original = {
        'created': datetime(2024, 1, 1, 12, 30, 45),
        'birth_date': date(1990, 5, 20),
        'events': [
            {'name': 'Start', 'time': datetime(2024, 1, 1, 0, 0, 0)},
            {'name': 'End', 'time': datetime(2024, 12, 31, 23, 59, 59)},
        ],
    }
    # What the decoder is expected to hand back: the same shape with every
    # temporal value replaced by its ISO string.
    as_strings = {
        'created': '2024-01-01T12:30:45',
        'birth_date': '1990-05-20',
        'events': [
            {'name': 'Start', 'time': '2024-01-01T00:00:00'},
            {'name': 'End', 'time': '2024-12-31T23:59:59'},
        ],
    }

    first_text = encode(original)
    assert '2024-01-01T12:30:45' in first_text
    assert '1990-05-20' in first_text

    first_result = decode(first_text)
    assert first_result == as_strings

    # Encoding the string form must reproduce the original text exactly.
    second_text = encode(first_result)
    assert first_text == second_text

    # Decoding again is stable.
    assert decode(second_text) == as_strings
310 | 
311 | 
def test_roundtrip_scientific_notation_suppression():
    """Moderately small and large floats are written in plain decimal form
    and keep their exact value through a round-trip."""
    original = {
        'small': 0.000001,
        'smaller': 0.0000001,
        'large': 15000000000000000.0,
        'very_large': 1.23e20,
        'normal': 3.14159,
        'values': [0.000001, 1.5e16],
    }

    first_text = encode(original)

    # The plain decimal spellings must appear in the output.
    expected_literals = (
        '0.000001',
        '0.0000001',
        '15000000000000000',
        '123000000000000000000',
    )
    for literal in expected_literals:
        assert literal in first_text

    first_result = decode(first_text)

    # Scalar fields compare exactly equal to what went in.
    for field in ('small', 'smaller', 'large', 'very_large', 'normal'):
        assert first_result[field] == original[field]

    # Re-encoding the decoded data reproduces the same text ...
    second_text = encode(first_result)
    assert first_text == second_text

    # ... and decoding that text reproduces the same data.
    assert decode(second_text) == first_result
349 | 
350 | 
def test_roundtrip_extreme_scientific_notation():
    """Test round-trip with extreme values that keep scientific notation.

    Checks that both extreme magnitudes are written with an exponent, decode
    to exactly the same floats, and re-encode to the same TOON text.
    """
    original = {
        'very_small': 1.23e-150,
        'very_large': 1.23e150
    }

    # First encode
    toon = encode(original)

    # Extreme values should keep scientific notation. Only the value side of
    # each `key: value` line is inspected: both key names contain the letter
    # 'e', so searching the whole document would pass no matter how the
    # numbers were formatted.
    encoded_values = [line.split(':', 1)[1] for line in toon.splitlines()]
    assert len(encoded_values) == len(original)
    assert all('e' in value.lower() for value in encoded_values)

    # Decode
    result = decode(toon)

    # Values should match
    assert result['very_small'] == 1.23e-150
    assert result['very_large'] == 1.23e150

    # Second encode should produce same TOON
    toon2 = encode(result)
    assert toon == toon2
374 | 
375 | 
def test_roundtrip_root_inline_array():
    """A top-level list of primitives is written inline and round-trips
    to the same data and the same text."""
    numbers = [1, 2, 3, 4, 5]

    first_text = encode(numbers)
    assert first_text == '[1,2,3,4,5]'

    decoded = decode(first_text)
    assert decoded == numbers

    # Encoding the decoded list gives back the identical text.
    assert encode(decoded) == first_text
391 | 
392 | 
def test_roundtrip_root_tabular_array():
    """A top-level list of uniform objects is written with a tabular header
    and round-trips to the same data and the same text."""
    rows = [
        {'id': 1, 'name': 'Alice', 'active': True},
        {'id': 2, 'name': 'Bob', 'active': False},
        {'id': 3, 'name': 'Charlie', 'active': True},
    ]

    first_text = encode(rows)
    assert '[3]{id,name,active}:' in first_text

    decoded = decode(first_text)
    assert decoded == rows

    # Encoding the decoded rows gives back the identical text.
    assert encode(decoded) == first_text
412 | 
413 | 
def test_roundtrip_root_list_array():
    """A top-level list mixing primitives, an object and a nested list is
    written in list form and round-trips to the same data and text."""
    mixed = [1, 'text', True, {'nested': 'object'}, [1, 2, 3]]

    first_text = encode(mixed)

    # Length header plus dash-marked entries.
    for fragment in ('[5]:', '- 1', '- nested: object'):
        assert fragment in first_text

    decoded = decode(first_text)
    assert decoded == mixed

    # Encoding the decoded list gives back the identical text.
    assert encode(decoded) == first_text
437 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing to Toonify
  2 | 
  3 | Thank you for your interest in contributing to Toonify! We welcome contributions from the community and are excited to work with you.
  4 | 
  5 | ## Table of Contents
  6 | 
  7 | - [Code of Conduct](#code-of-conduct)
  8 | - [Getting Started](#getting-started)
  9 | - [Development Setup](#development-setup)
 10 | - [How to Contribute](#how-to-contribute)
 11 | - [Coding Standards](#coding-standards)
 12 | - [Testing](#testing)
 13 | - [Pull Request Process](#pull-request-process)
 14 | - [Commit Message Guidelines](#commit-message-guidelines)
 15 | - [Reporting Issues](#reporting-issues)
 16 | - [Documentation](#documentation)
 17 | - [Community](#community)
 18 | 
 19 | ## Code of Conduct
 20 | 
 21 | We are committed to providing a welcoming and inclusive environment for everyone. Please be respectful and professional in all interactions. Key principles:
 22 | 
 23 | - **Be respectful**: Value differing viewpoints and experiences
 24 | - **Be constructive**: Provide helpful feedback and criticism
 25 | - **Be collaborative**: Work together to improve the project
 26 | - **Be patient**: Remember that everyone was once a beginner
 27 | 
 28 | Unacceptable behavior includes harassment, trolling, personal attacks, or any conduct that would be inappropriate in a professional setting.
 29 | 
 30 | ## Getting Started
 31 | 
 32 | 1. **Fork the repository** on GitHub
 33 | 2. **Clone your fork** locally:
 34 |    ```bash
 35 |    git clone https://github.com/YOUR_USERNAME/toonify.git
 36 |    cd toonify
 37 |    ```
 38 | 3. **Add the upstream repository** as a remote:
 39 |    ```bash
 40 |    git remote add upstream https://github.com/ScrapeGraphAI/toonify.git
 41 |    ```
 42 | 4. **Create a branch** for your changes:
 43 |    ```bash
 44 |    git checkout -b feature/your-feature-name
 45 |    ```
 46 | 
 47 | ## Development Setup
 48 | 
 49 | ### Prerequisites
 50 | 
 51 | - Python 3.8 or higher
 52 | - pip or uv package manager
 53 | - Git
 54 | 
 55 | ### Installation
 56 | 
 57 | 1. **Install in development mode with all dependencies**:
 58 |    ```bash
 59 |    pip install -e .[dev,pydantic]
 60 |    ```
 61 | 
 62 |    Or using `uv` (recommended):
 63 |    ```bash
 64 |    uv pip install -e .[dev,pydantic]
 65 |    ```
 66 | 
 67 | 2. **Verify the installation**:
 68 |    ```bash
 69 |    python -c "import toon; print(toon.__version__)"
 70 |    pytest --version
 71 |    ```
 72 | 
 73 | ### Project Structure
 74 | 
 75 | ```
 76 | toonify/
 77 | ├── toon/               # Main package
 78 | │   ├── encoder.py      # TOON encoding logic
 79 | │   ├── decoder.py      # TOON decoding logic
 80 | │   ├── cli.py          # Command-line interface
 81 | │   ├── pydantic_converter.py  # Pydantic integration
 82 | │   └── utils.py        # Utility functions
 83 | ├── tests/              # Test suite
 84 | ├── examples/           # Example scripts
 85 | ├── benchmark/          # Performance benchmarks
 86 | └── docs/               # Documentation
 87 | ```
 88 | 
 89 | ## How to Contribute
 90 | 
 91 | ### Types of Contributions
 92 | 
 93 | We welcome many types of contributions:
 94 | 
 95 | - 🐛 **Bug fixes**: Fix issues reported in GitHub Issues
 96 | - ✨ **New features**: Add new functionality or improve existing features
 97 | - 📝 **Documentation**: Improve README, docstrings, or examples
 98 | - 🧪 **Tests**: Add or improve test coverage
 99 | - 🚀 **Performance**: Optimize code for better performance
100 | - 🌐 **Internationalization**: Add translations or improve i18n support
101 | - 🎨 **Examples**: Create new examples or improve existing ones
102 | 
103 | ### Finding Issues to Work On
104 | 
105 | - Check the [Issues page](https://github.com/ScrapeGraphAI/toonify/issues)
106 | - Look for issues labeled `good first issue` or `help wanted`
107 | - Comment on an issue to let others know you're working on it
108 | 
109 | ## Coding Standards
110 | 
111 | ### Python Style Guide
112 | 
 113 | We follow [PEP 8](https://pep8.org/) with the following project-specific conventions:
114 | 
115 | - **Line length**: Maximum 100 characters (not 79)
116 | - **Imports**: Group standard library, third-party, and local imports
117 | - **Docstrings**: Use Google-style docstrings for all public functions/classes
118 | - **Type hints**: Add type hints to function signatures when practical
119 | 
120 | ### Example Code Style
121 | 
122 | ```python
 123 | from typing import Any, Dict, List, Optional, Union
 124 | 
 125 | 
 126 | def encode_array(
 127 |     items: List[Union[str, int, float, bool, None]],
 128 |     options: Optional[Dict[str, Any]] = None
129 | ) -> str:
130 |     """Encode a Python list to TOON array format.
131 |     
132 |     Args:
133 |         items: List of values to encode
134 |         options: Optional encoding configuration with:
135 |             - delimiter: 'comma', 'tab', or 'pipe' (default: 'comma')
136 |             - indent: Number of spaces per level (default: 2)
137 |     
138 |     Returns:
139 |         TOON-formatted array string
140 |         
141 |     Raises:
142 |         ValueError: If items list is empty or contains unsupported types
143 |         
144 |     Example:
145 |         >>> encode_array([1, 2, 3], {'delimiter': 'comma'})
146 |         '[1,2,3]'
147 |     """
148 |     if not items:
149 |         raise ValueError("Cannot encode empty array")
150 |     
151 |     # Implementation...
152 |     pass
153 | ```
154 | 
155 | ### Best Practices
156 | 
157 | - ✅ Write self-documenting code with clear variable names
158 | - ✅ Keep functions small and focused on a single responsibility
159 | - ✅ Add comments for complex logic, but prefer clear code over comments
160 | - ✅ Handle edge cases and validate inputs
161 | - ✅ Use meaningful error messages
162 | - ❌ Don't leave commented-out code in PRs
163 | - ❌ Don't use wildcard imports (`from module import *`)
164 | 
165 | ## Testing
166 | 
167 | ### Running Tests
168 | 
169 | Run all tests:
170 | ```bash
171 | pytest
172 | ```
173 | 
174 | Run with coverage report:
175 | ```bash
176 | pytest --cov=toon --cov-report=term-missing
177 | ```
178 | 
179 | Run specific test file:
180 | ```bash
181 | pytest tests/test_encoder.py
182 | ```
183 | 
184 | Run specific test:
185 | ```bash
186 | pytest tests/test_encoder.py::test_encode_simple_dict
187 | ```
188 | 
189 | ### Writing Tests
190 | 
191 | - **Location**: Place test files in the `tests/` directory
192 | - **Naming**: Name test files `test_*.py` and test functions `test_*`
193 | - **Coverage**: Aim for >90% code coverage for new features
194 | - **Structure**: Use AAA pattern (Arrange, Act, Assert)
195 | 
196 | Example test:
197 | ```python
198 | def test_encode_nested_object():
199 |     """Test encoding of nested objects."""
200 |     # Arrange
201 |     data = {
202 |         'user': {
203 |             'name': 'Alice',
204 |             'profile': {
205 |                 'age': 30
206 |             }
207 |         }
208 |     }
209 |     
210 |     # Act
211 |     result = encode(data)
212 |     
213 |     # Assert
214 |     assert 'user:' in result
215 |     assert 'name: Alice' in result
216 |     assert 'age: 30' in result
217 |     
218 |     # Verify round-trip
219 |     decoded = decode(result)
220 |     assert decoded == data
221 | ```
222 | 
223 | ### Test Types
224 | 
225 | 1. **Unit tests**: Test individual functions and methods
226 | 2. **Integration tests**: Test component interactions
227 | 3. **Round-trip tests**: Ensure encode/decode consistency
228 | 4. **Edge case tests**: Test boundary conditions and error handling
229 | 
230 | ### Running Examples
231 | 
232 | Test example scripts to ensure they work:
233 | ```bash
234 | python examples/basic_usage.py
235 | python examples/advanced_features.py
236 | python examples/pydantic_usage.py
237 | ```
238 | 
239 | ## Pull Request Process
240 | 
241 | ### Before Submitting
242 | 
243 | 1. ✅ Ensure all tests pass: `pytest`
244 | 2. ✅ Add tests for new functionality
245 | 3. ✅ Update documentation if needed
246 | 4. ✅ Run examples to verify they still work
247 | 5. ✅ Write clear commit messages
248 | 6. ✅ Update CHANGELOG.md if applicable
249 | 
250 | ### Submitting a Pull Request
251 | 
252 | 1. **Push your branch** to your fork:
253 |    ```bash
254 |    git push origin feature/your-feature-name
255 |    ```
256 | 
257 | 2. **Open a Pull Request** on GitHub with:
258 |    - **Clear title**: Summarize the change in one line
259 |    - **Description**: Explain what changed and why
260 |    - **Issue reference**: Link to related issues (e.g., "Fixes #123")
261 |    - **Testing**: Describe how you tested the changes
262 |    - **Breaking changes**: Note any breaking changes
263 | 
264 | 3. **PR Template** (use this format):
265 |    ```markdown
266 |    ## Description
267 |    Brief description of what this PR does.
268 |    
269 |    ## Related Issue
270 |    Fixes #123
271 |    
272 |    ## Type of Change
273 |    - [ ] Bug fix (non-breaking change which fixes an issue)
274 |    - [ ] New feature (non-breaking change which adds functionality)
275 |    - [ ] Breaking change (fix or feature that would cause existing functionality to change)
276 |    - [ ] Documentation update
277 |    
278 |    ## Testing
279 |    - [ ] All tests pass
280 |    - [ ] Added new tests for the changes
281 |    - [ ] Tested manually with examples
282 |    
283 |    ## Checklist
284 |    - [ ] Code follows the project's style guidelines
285 |    - [ ] Self-review completed
286 |    - [ ] Documentation updated
287 |    - [ ] No new warnings or errors introduced
288 |    ```
289 | 
290 | ### Review Process
291 | 
292 | - A maintainer will review your PR within 3-5 business days
293 | - Address any feedback or requested changes
294 | - Once approved, a maintainer will merge your PR
295 | - Your contribution will be credited in the release notes!
296 | 
297 | ## Commit Message Guidelines
298 | 
299 | We follow the [Conventional Commits](https://www.conventionalcommits.org/) specification:
300 | 
301 | ### Format
302 | 
303 | ```
304 | <type>(<scope>): <subject>
305 | 
306 | <body>
307 | 
308 | <footer>
309 | ```
310 | 
311 | ### Types
312 | 
313 | - `feat`: New feature
314 | - `fix`: Bug fix
315 | - `docs`: Documentation changes
316 | - `test`: Adding or updating tests
317 | - `refactor`: Code refactoring
318 | - `perf`: Performance improvements
319 | - `style`: Code style changes (formatting, etc.)
320 | - `chore`: Maintenance tasks
321 | - `ci`: CI/CD changes
322 | 
323 | ### Examples
324 | 
325 | ```bash
326 | # Feature
327 | feat(encoder): add support for custom delimiters
328 | 
329 | # Bug fix
330 | fix(decoder): handle escaped quotes in strings
331 | 
332 | # Documentation
333 | docs(readme): update installation instructions
334 | 
335 | # Breaking change
336 | feat(encoder)!: change default delimiter to tab
337 | 
338 | BREAKING CHANGE: The default delimiter has changed from comma to tab.
339 | Update your code if you rely on the default behavior.
340 | ```
341 | 
342 | ### Scope
343 | 
344 | Use these scopes when applicable:
345 | - `encoder`: Encoding logic
346 | - `decoder`: Decoding logic
347 | - `cli`: Command-line interface
348 | - `pydantic`: Pydantic integration
349 | - `utils`: Utility functions
350 | - `tests`: Test suite
351 | - `docs`: Documentation
352 | 
353 | ## Reporting Issues
354 | 
355 | ### Bug Reports
356 | 
357 | When reporting a bug, include:
358 | 
359 | 1. **Description**: Clear description of the bug
360 | 2. **Steps to reproduce**: Minimal code example
361 | 3. **Expected behavior**: What should happen
362 | 4. **Actual behavior**: What actually happens
363 | 5. **Environment**:
364 |    - Python version
365 |    - Toonify version
366 |    - Operating system
367 | 
368 | **Bug Report Template**:
 369 | ````markdown
 370 | ## Description
 371 | Brief description of the bug
 372 | 
 373 | ## Steps to Reproduce
 374 | ```python
 375 | from toon import encode
 376 | 
 377 | data = {...}
 378 | result = encode(data)
 379 | ```
 380 | 
 381 | ## Expected Behavior
 382 | The output should be...
 383 | 
 384 | ## Actual Behavior
 385 | But instead it is...
 386 | 
 387 | ## Environment
 388 | - Python version: 3.11.5
 389 | - Toonify version: 0.0.2
 390 | - OS: macOS 14.0
 391 | ````
392 | 
393 | ### Feature Requests
394 | 
395 | When suggesting a feature:
396 | 
397 | 1. **Use case**: Describe the problem you're trying to solve
398 | 2. **Proposed solution**: Your idea for solving it
399 | 3. **Alternatives**: Other solutions you've considered
400 | 4. **Additional context**: Examples, mockups, or references
401 | 
402 | ## Documentation
403 | 
404 | ### Types of Documentation
405 | 
406 | - **README.md**: Project overview and quick start
407 | - **API docs**: Function/class docstrings (Google style)
408 | - **Examples**: Working code examples in `examples/`
409 | - **Inline comments**: Complex logic explanations
410 | 
411 | ### Documentation Standards
412 | 
413 | - Use clear, concise language
414 | - Include code examples when helpful
415 | - Keep formatting consistent
416 | - Update docs when changing functionality
417 | - Add examples for new features
418 | 
419 | ### Building Documentation Locally
420 | 
421 | ```bash
422 | # Install documentation dependencies (if applicable in future)
423 | pip install -e .[docs]
424 | 
425 | # Run examples as documentation tests
426 | python examples/basic_usage.py
427 | ```
428 | 
429 | ## Community
430 | 
431 | ### Getting Help
432 | 
433 | - **GitHub Issues**: For bug reports and feature requests
434 | - **Discussions**: For questions and general discussion
 435 | - **Contact**: Reach the ScrapeGraph team through [scrapegraphai.com](https://scrapegraphai.com)
436 | 
437 | ### Stay Connected
438 | 
439 | - **GitHub**: [ScrapeGraphAI/toonify](https://github.com/ScrapeGraphAI/toonify)
440 | - **Website**: [scrapegraphai.com](https://scrapegraphai.com)
441 | - **TOON Format Spec**: [toon-format/toon](https://github.com/toon-format/toon)
442 | 
443 | ## Recognition
444 | 
445 | Contributors are recognized in:
446 | - GitHub contributors list
447 | - Release notes
448 | - CHANGELOG.md
449 | 
450 | Thank you for contributing to Toonify! 🎉
451 | 
452 | ---
453 | 
454 | **Questions?** Feel free to open a [GitHub Discussion](https://github.com/ScrapeGraphAI/toonify/discussions) or create an issue.
455 | 
456 | Made with ❤️ by the [ScrapeGraph team](https://scrapegraphai.com)
457 | 
458 | 


--------------------------------------------------------------------------------
/toon/encoder.py:
--------------------------------------------------------------------------------
  1 | """TOON encoder - convert Python objects to TOON format."""
  2 | from typing import Any, Dict, List, Optional
  3 | from datetime import datetime, date
  4 | from .constants import (
  5 |     COMMA, TAB, PIPE, COLON, NEWLINE,
  6 |     DEFAULT_DELIMITER, DEFAULT_INDENT,
  7 |     KEY_FOLDING_OFF, KEY_FOLDING_SAFE,
  8 |     DELIMITER_TAB, DELIMITER_PIPE, DELIMITER_COMMA,
  9 |     LEFT_BRACKET, RIGHT_BRACKET, LEFT_BRACE, RIGHT_BRACE
 10 | )
 11 | from .utils import (
 12 |     needs_quoting, quote_string, is_primitive,
 13 |     is_uniform_array_of_objects, get_indent, format_float
 14 | )
 15 | 
 16 | 
 17 | class EncoderOptions:
 18 |     """Options for TOON encoding."""
 19 |     
 20 |     def __init__(
 21 |         self,
 22 |         delimiter: str = DEFAULT_DELIMITER,
 23 |         indent: int = DEFAULT_INDENT,
 24 |         key_folding: str = KEY_FOLDING_OFF,
 25 |         flatten_depth: Optional[int] = None
 26 |     ):
 27 |         """
 28 |         Initialize encoder options.
 29 |         
 30 |         Args:
 31 |             delimiter: Array value delimiter (',' | '\t' | '|')
 32 |             indent: Number of spaces per indentation level
 33 |             key_folding: Key folding mode ('off' | 'safe')
 34 |             flatten_depth: Maximum depth for key folding (None = unlimited)
 35 |         """
 36 |         # Normalize delimiter names
 37 |         if delimiter == DELIMITER_TAB:
 38 |             delimiter = TAB
 39 |         elif delimiter == DELIMITER_PIPE:
 40 |             delimiter = PIPE
 41 |         elif delimiter == DELIMITER_COMMA:
 42 |             delimiter = COMMA
 43 |         
 44 |         self.delimiter = delimiter
 45 |         self.indent = indent
 46 |         self.key_folding = key_folding
 47 |         self.flatten_depth = flatten_depth
 48 | 
 49 | 
 50 | def encode(data: Any, options: Optional[Dict[str, Any]] = None) -> str:
 51 |     """
 52 |     Encode Python data structure to TOON format.
 53 |     
 54 |     Args:
 55 |         data: Python object to encode (dict or list)
 56 |         options: Encoding options
 57 |             - delimiter: ',' (default), '\t', or '|'
 58 |             - indent: int (default 2)
 59 |             - key_folding: 'off' (default) or 'safe'
 60 |             - flatten_depth: int or None
 61 |             
 62 |     Returns:
 63 |         TOON formatted string
 64 |         
 65 |     Example:
 66 |         >>> data = {'users': [{'id': 1, 'name': 'Alice'}]}
 67 |         >>> print(encode(data))
 68 |         users[1]{id,name}:
 69 |           1,Alice
 70 |     """
 71 |     if options is None:
 72 |         options = {}
 73 |     
 74 |     opts = EncoderOptions(
 75 |         delimiter=options.get('delimiter', DEFAULT_DELIMITER),
 76 |         indent=options.get('indent', DEFAULT_INDENT),
 77 |         key_folding=options.get('key_folding', KEY_FOLDING_OFF),
 78 |         flatten_depth=options.get('flatten_depth')
 79 |     )
 80 |     
 81 |     return _encode_value(data, 0, opts)
 82 | 
 83 | 
 84 | def _encode_value(value: Any, level: int, opts: EncoderOptions) -> str:
 85 |     """Encode a value at a given indentation level."""
 86 |     if value is None:
 87 |         return 'null'
 88 |     elif isinstance(value, bool):
 89 |         return 'true' if value else 'false'
 90 |     elif isinstance(value, datetime):
 91 |         # Convert datetime to ISO 8601 string
 92 |         iso_string = value.isoformat()
 93 |         if needs_quoting(iso_string):
 94 |             return quote_string(iso_string)
 95 |         return iso_string
 96 |     elif isinstance(value, date):
 97 |         # Convert date to ISO 8601 date string
 98 |         iso_string = value.isoformat()
 99 |         if needs_quoting(iso_string):
100 |             return quote_string(iso_string)
101 |         return iso_string
102 |     elif isinstance(value, (int, float)):
103 |         # Handle special float values
104 |         if isinstance(value, float):
105 |             if value != value:  # NaN
106 |                 return 'null'
107 |             elif value == float('inf') or value == float('-inf'):
108 |                 return 'null'
109 |             # Use format_float to suppress scientific notation
110 |             return format_float(value)
111 |         return str(value)
112 |     elif isinstance(value, str):
113 |         if needs_quoting(value):
114 |             return quote_string(value)
115 |         return value
116 |     elif isinstance(value, list):
117 |         return _encode_array(value, level, opts)
118 |     elif isinstance(value, dict):
119 |         return _encode_object(value, level, opts)
120 |     else:
121 |         # Handle other types as null
122 |         return 'null'
123 | 
124 | 
def _encode_object(obj: dict, level: int, opts: EncoderOptions) -> str:
    """Render a dictionary as TOON text, one entry after another.

    Args:
        obj: The dictionary to render
        level: Indentation level of this object's own keys
        opts: Encoder options (indent width, key folding, ...)

    Returns:
        ``'{}'`` for an empty dictionary, otherwise the rendered entries
        joined with NEWLINE.
    """
    if not obj:
        return '{}'

    # With safe key folding enabled, fold the keys before rendering.
    if opts.key_folding == KEY_FOLDING_SAFE:
        obj = _apply_key_folding(obj, opts.flatten_depth)

    prefix = get_indent(level, opts.indent)
    rendered_lines = []

    for name, item in obj.items():
        if isinstance(item, list):
            # A multi-line array result is appended as-is; a single-line
            # result still needs the "key: " prefix added here.
            array_text = _encode_array_with_key(name, item, level, opts)
            if NEWLINE not in array_text:
                array_text = f'{prefix}{name}{COLON} {array_text}'
            rendered_lines.append(array_text)
            continue

        if isinstance(item, dict):
            if item:
                # Non-empty object: key line, then the body one level deeper.
                body = _encode_value(item, level + 1, opts)
                rendered_lines.append(f'{prefix}{name}{COLON}')
                rendered_lines.append(body)
            else:
                # Empty object stays on the key's line.
                rendered_lines.append(f'{prefix}{name}{COLON} {{}}')
            continue

        # Everything else is rendered on the key's line.
        scalar_text = _encode_value(item, level + 1, opts)
        rendered_lines.append(f'{prefix}{name}{COLON} {scalar_text}')

    return NEWLINE.join(rendered_lines)
161 | 
162 | 
def _encode_array(arr: list, level: int, opts: EncoderOptions) -> str:
    """Render a list that has no key of its own.

    The representation is chosen in this order: ``'[]'`` for an empty list,
    tabular form when ``is_uniform_array_of_objects`` reports a field list,
    inline form when every element is a primitive, list form otherwise.

    Args:
        arr: The list to render
        level: Indentation level forwarded to the tabular/list encoders
        opts: Encoder options

    Returns:
        The TOON text for ``arr``.
    """
    if not arr:
        return '[]'

    # A truthy field list means the array can be written as a table.
    columns = is_uniform_array_of_objects(arr)
    if columns:
        return _encode_tabular_array(arr, columns, level, opts, key=None)

    only_primitives = all(is_primitive(element) for element in arr)
    if only_primitives:
        return _encode_primitive_array(arr, opts)

    # Mixed content falls back to the list form.
    return _encode_list_array(arr, level, opts, key=None)
179 | 
180 | 
def _encode_array_with_key(key: str, arr: list, level: int, opts: EncoderOptions) -> str:
    """Encode an array that is the value of ``key`` inside an object.

    The representation is chosen in this order: ``'[]'`` for an empty list,
    tabular form when ``is_uniform_array_of_objects`` reports a field list,
    inline form when every element is a primitive, list form otherwise.
    ``key`` is handed to the tabular and list encoders; the empty and inline
    forms are returned without it.

    Args:
        key: The object key this array belongs to
        arr: The list to encode
        level: Indentation level forwarded to the tabular/list encoders
        opts: Encoder options

    Returns:
        The TOON text for ``arr``.
    """
    if not arr:
        return '[]'

    # Check if it's a uniform array of objects (tabular format)
    fields = is_uniform_array_of_objects(arr)
    if fields:
        return _encode_tabular_array(arr, fields, level, opts, key=key)

    # Check if all elements are primitives (inline format)
    if all(is_primitive(item) for item in arr):
        return _encode_primitive_array(arr, opts)

    # Mixed array (list format)
    return _encode_list_array(arr, level, opts, key=key)
199 | 
200 | 
201 | 
def _encode_primitive_array(arr: list, opts: EncoderOptions) -> str:
    """Encode an array of primitives as an inline ``[v1,v2,...]`` value.

    Each item is encoded with ``_encode_primitive_value`` so inline arrays
    follow the same rules as tabular rows, and so every input item produces
    exactly one output entry: values that helper does not recognise become
    ``null`` instead of being left out of the array.

    Args:
        arr: Items to encode.
        opts: Encoder options; ``opts.delimiter`` separates the entries.

    Returns:
        The encoded entries joined by the delimiter and wrapped in brackets.
    """
    encoded_values = [_encode_primitive_value(item) for item in arr]
    return f'[{opts.delimiter.join(encoded_values)}]'
226 | 
227 | 
def _encode_tabular_array(arr: list, fields: list, level: int, opts: EncoderOptions, key: Optional[str] = None) -> str:
    """Encode a uniform array of objects as a header line plus one row per object.

    The header is ``[N]{f1,f2,...}:``, prefixed by the indent and ``key`` when
    a key is given. For tab and pipe delimiters the delimiter character is
    placed after ``N`` inside the brackets; comma gets no marker. Field names
    in the header are always comma-joined, while row values are joined with
    ``opts.delimiter``.
    """
    pad = get_indent(level, opts.indent)
    separator = opts.delimiter

    # Non-comma delimiters are announced in the header
    if separator == TAB:
        marker = '\t'
    elif separator == PIPE:
        marker = '|'
    else:
        marker = ''

    prefix = f'{pad}{key}' if key else ''
    header = f'{prefix}[{len(arr)}{marker}]{LEFT_BRACE}{COMMA.join(fields)}{RIGHT_BRACE}{COLON}'

    # One row per object, two spaces deeper than the header
    rows = [
        f'{pad}  {separator.join(_encode_primitive_value(record.get(name)) for name in fields)}'
        for record in arr
    ]

    return NEWLINE.join([header, *rows])
260 | 
261 | 
def _encode_primitive_value(value: Any) -> str:
    """Encode a single primitive for use inside an array.

    ``None`` becomes ``null`` and booleans become ``true``/``false``.
    ``datetime``/``date`` values are written as their ``isoformat()`` string
    and plain strings as themselves, quoted when ``needs_quoting`` says so.
    NaN and infinite floats become ``null``; other floats go through
    ``format_float`` and integers through ``str``. Any other type is
    encoded as ``null``.
    """
    if value is None:
        return 'null'

    # bool must be tested before int, since bool is an int subclass
    if isinstance(value, bool):
        return 'true' if value else 'false'

    if isinstance(value, (datetime, date)):
        # Both types serialise to ISO 8601 text
        text = value.isoformat()
        return quote_string(text) if needs_quoting(text) else text

    if isinstance(value, float):
        not_finite = value != value or value in (float('inf'), float('-inf'))
        # format_float suppresses scientific notation
        return 'null' if not_finite else format_float(value)

    if isinstance(value, int):
        return str(value)

    if isinstance(value, str):
        return quote_string(value) if needs_quoting(value) else value

    return 'null'
293 | 
294 | 
def _encode_list_array(arr: list, level: int, opts: EncoderOptions, key: Optional[str] = None) -> str:
    """Encode a non-uniform array as a ``[N]:`` header followed by dash items.

    The header is prefixed by the indent and ``key`` when a key is given.
    Every item is written two spaces deeper than the header, behind a ``- ``
    marker.
    """
    pad = get_indent(level, opts.indent)
    prefix = f'{pad}{key}' if key else ''
    bullet = f'{pad}  - '

    out = [f'{prefix}[{len(arr)}]{COLON}']

    for element in arr:
        if not (isinstance(element, dict) and element):
            # Anything but a non-empty object goes on the marker line as-is
            out.append(f'{bullet}{_encode_value(element, level + 1, opts)}')
            continue

        # Non-empty object: encoded two levels deeper; its first line is
        # stripped and moved behind the dash, the rest is kept untouched.
        head, *rest = _encode_value(element, level + 2, opts).split(NEWLINE)
        out.append(f'{bullet}{head.lstrip()}')
        out.extend(rest)

    return NEWLINE.join(out)
325 | 
326 | 
def _apply_key_folding(obj: dict, max_depth: Optional[int] = None) -> dict:
    """
    Apply key folding to collapse single-key chains into dotted paths.

    Only the entries of ``obj`` itself are examined: while an entry's value is
    a dict with exactly one key, that key is appended to the entry's key with
    a dot and the value is replaced by the nested value. Values that are not
    single-key dicts are stored unchanged.

    Args:
        obj: Object to fold
        max_depth: Maximum number of nested levels merged into one key
            (None = unlimited, 0 or less = no folding). When a chain is
            longer than the limit, the remainder stays nested under the
            partially folded key.

    Returns:
        Folded object
    """
    result = {}

    for key, value in obj.items():
        folded_key = key
        folds = 0

        # Walk down the single-key chain, stopping at the depth limit so the
        # limit applies to the whole chain rather than to each step.
        while (
            isinstance(value, dict)
            and len(value) == 1
            and (max_depth is None or folds < max_depth)
        ):
            nested_key, value = next(iter(value.items()))
            folded_key = f'{folded_key}.{nested_key}'
            folds += 1

        result[folded_key] = value

    return result
369 | 


--------------------------------------------------------------------------------
/tests/test_decoder.py:
--------------------------------------------------------------------------------
  1 | """Tests for TOON decoder."""
  2 | import pytest
  3 | from toon import decode
  4 | 
  5 | 
  6 | def test_decode_primitive_types():
  7 |     """Test decoding of primitive types."""
  8 |     # String
  9 |     assert decode('name: Alice') == {'name': 'Alice'}
 10 |     
 11 |     # Number
 12 |     assert decode('age: 30') == {'age': 30}
 13 |     assert decode('price: 19.99') == {'price': 19.99}
 14 |     
 15 |     # Boolean
 16 |     assert decode('active: true') == {'active': True}
 17 |     assert decode('disabled: false') == {'disabled': False}
 18 |     
 19 |     # Null
 20 |     assert decode('value: null') == {'value': None}
 21 | 
 22 | 
 23 | def test_decode_quoted_strings():
 24 |     """Test decoding of quoted strings."""
 25 |     # Simple quoted string
 26 |     assert decode('name: "Alice"') == {'name': 'Alice'}
 27 |     
 28 |     # String with comma
 29 |     assert decode('text: "Hello, World"') == {'text': 'Hello, World'}
 30 |     
 31 |     # String with colon
 32 |     assert decode('text: "key: value"') == {'text': 'key: value'}
 33 |     
 34 |     # String with spaces
 35 |     assert decode('text: " padded "') == {'text': ' padded '}
 36 |     
 37 |     # Empty string
 38 |     assert decode('text: ""') == {'text': ''}
 39 | 
 40 | 
 41 | def test_decode_escaped_strings():
 42 |     """Test decoding of escaped strings."""
 43 |     # Escaped quotes
 44 |     assert decode('text: "He said \\"hello\\""') == {'text': 'He said "hello"'}
 45 |     
 46 |     # Escaped newline
 47 |     assert decode('text: "line1\\nline2"') == {'text': 'line1\nline2'}
 48 |     
 49 |     # Escaped backslash
 50 |     assert decode('text: "path\\\\to\\\\file"') == {'text': 'path\\to\\file'}
 51 |     
 52 |     # Escaped tab
 53 |     assert decode('text: "col1\\tcol2"') == {'text': 'col1\tcol2'}
 54 | 
 55 | 
 56 | def test_decode_empty_structures():
 57 |     """Test decoding of empty structures."""
 58 |     # Empty object
 59 |     result = decode('data: {}')
 60 |     assert result == {'data': {}}
 61 |     
 62 |     # Empty array
 63 |     assert decode('items: []') == {'items': []}
 64 | 
 65 | 
 66 | def test_decode_primitive_array():
 67 |     """Test decoding of primitive arrays."""
 68 |     # Number array
 69 |     assert decode('numbers: [1,2,3]') == {'numbers': [1, 2, 3]}
 70 |     
 71 |     # String array
 72 |     assert decode('names: [Alice,Bob]') == {'names': ['Alice', 'Bob']}
 73 |     
 74 |     # Mixed array
 75 |     assert decode('mixed: [1,text,true,null]') == {'mixed': [1, 'text', True, None]}
 76 |     
 77 |     # Array with quoted strings
 78 |     result = decode('items: [hello,"world, test",foo]')
 79 |     assert result == {'items': ['hello', 'world, test', 'foo']}
 80 | 
 81 | 
 82 | def test_decode_array_delimiters():
 83 |     """Test decoding with different delimiters."""
 84 |     # Tab delimiter
 85 |     assert decode('numbers: [1\t2\t3]') == {'numbers': [1, 2, 3]}
 86 |     
 87 |     # Pipe delimiter
 88 |     assert decode('numbers: [1|2|3]') == {'numbers': [1, 2, 3]}
 89 | 
 90 | 
 91 | def test_decode_tabular_array():
 92 |     """Test decoding of tabular arrays."""
 93 |     toon = """users[2]{id,name,role}:
 94 |   1,Alice,admin
 95 |   2,Bob,user"""
 96 |     
 97 |     result = decode(toon)
 98 |     
 99 |     expected = {
100 |         'users': [
101 |             {'id': 1, 'name': 'Alice', 'role': 'admin'},
102 |             {'id': 2, 'name': 'Bob', 'role': 'user'}
103 |         ]
104 |     }
105 |     
106 |     assert result == expected
107 | 
108 | 
def test_decode_tabular_array_with_tab():
    """Tab-separated rows decode when the header carries the tab marker."""
    # The tab after the length inside the brackets announces the delimiter
    source = (
        'users[2\t]{id,name}:\n'
        '  1\tAlice\n'
        '  2\tBob'
    )

    decoded = decode(source)

    assert decoded == {
        'users': [
            {'id': 1, 'name': 'Alice'},
            {'id': 2, 'name': 'Bob'},
        ]
    }
126 | 
127 | 
def test_decode_list_array():
    """List-array items written without dash markers are decoded."""
    source = '\n'.join(['items[3]:', '  value1', '  value2', '  value3'])

    assert decode(source) == {'items': ['value1', 'value2', 'value3']}
137 | 
138 | 
def test_decode_nested_objects():
    """Indentation nests objects inside their parent key."""
    source = '\n'.join([
        'user:',
        '  name: Alice',
        '  profile:',
        '    age: 30',
        '    city: NYC',
    ])

    profile = {'age': 30, 'city': 'NYC'}
    want = {'user': {'name': 'Alice', 'profile': profile}}

    assert decode(source) == want
160 | 
161 | 
def test_decode_path_expansion():
    """Dotted keys stay flat with ``off`` and become nested with ``safe``."""
    source = 'data.metadata.items: [1,2,3]'

    # Expansion disabled: the dotted key is kept verbatim
    flat = decode(source, {'expand_paths': 'off'})
    assert flat == {'data.metadata.items': [1, 2, 3]}

    # Expansion enabled: each segment becomes a level of nesting
    nested = decode(source, {'expand_paths': 'safe'})
    assert nested == {'data': {'metadata': {'items': [1, 2, 3]}}}
180 | 
181 | 
def test_decode_complex_structure():
    """A document mixing scalars, a tabular array and a nested object decodes."""
    source = '\n'.join([
        'project: TOON',
        'version: 1.0.0',
        'users[2]{id,name,active}:',
        '  1,Alice,true',
        '  2,Bob,false',
        'metadata:',
        '  created: 2024-01-01',
        '  tags: [format,serialization,llm]',
    ])

    users = [
        {'id': 1, 'name': 'Alice', 'active': True},
        {'id': 2, 'name': 'Bob', 'active': False},
    ]
    metadata = {
        'created': '2024-01-01',
        'tags': ['format', 'serialization', 'llm'],
    }

    assert decode(source) == {
        'project': 'TOON',
        'version': '1.0.0',
        'users': users,
        'metadata': metadata,
    }
209 | 
210 | 
def test_decode_empty_lines():
    """Blank lines between entries do not affect the decoded object."""
    source = '\n'.join(['name: Alice', '', 'age: 30', '', 'active: true'])

    assert decode(source) == {'name': 'Alice', 'age': 30, 'active': True}
221 | 
222 | 
def test_decode_number_formats():
    """Integers, floats, negatives and exponent notation decode as numbers."""
    decoded = decode('\n'.join([
        'int: 42',
        'float: 3.14',
        'negative: -10',
        'scientific: 1.5e10',
    ]))

    # Each key is checked individually
    checks = [
        ('int', 42),
        ('float', 3.14),
        ('negative', -10),
        ('scientific', 1.5e10),
    ]
    for name, want in checks:
        assert decoded[name] == want
236 | 
237 | 
def test_decode_quoted_field_values():
    """Quoted cells in tabular rows may contain the delimiter."""
    source = '\n'.join([
        'items[2]{id,description}:',
        '  1,"Item with, comma"',
        '  2,"Normal item"',
    ])

    assert decode(source) == {
        'items': [
            {'id': 1, 'description': 'Item with, comma'},
            {'id': 2, 'description': 'Normal item'},
        ]
    }
254 | 
255 | 
def test_decode_tabular_array_with_tab_indicator():
    """A ``[N<tab>]`` header makes the rows split on tabs."""
    header = 'users[2\t]{id,name}:'
    rows = ['  1\tAlice', '  2\tBob']

    decoded = decode('\n'.join([header, *rows]))

    want_users = [
        {'id': 1, 'name': 'Alice'},
        {'id': 2, 'name': 'Bob'},
    ]
    assert decoded == {'users': want_users}
272 | 
273 | 
def test_decode_tabular_array_with_pipe_indicator():
    """A ``[N|]`` header makes the rows split on pipes."""
    source = '\n'.join([
        'products[2|]{sku,price}:',
        '  A001|29.99',
        '  B002|49.99',
    ])

    assert decode(source) == {
        'products': [
            {'sku': 'A001', 'price': 29.99},
            {'sku': 'B002', 'price': 49.99},
        ]
    }
290 | 
291 | 
def test_decode_tabular_array_comma_no_indicator():
    """Without a delimiter marker in the header, rows are split on commas."""
    source = '\n'.join([
        'items[2]{code,count}:',
        '  X,5',
        '  Y,10',
    ])

    assert decode(source) == {
        'items': [
            {'code': 'X', 'count': 5},
            {'code': 'Y', 'count': 10},
        ]
    }
308 | 
309 | 
def test_decode_list_array_with_dash_markers():
    """Items introduced by ``- `` decode without the marker."""
    source = '\n'.join([
        'items[3]:',
        '  - apple',
        '  - banana',
        '  - cherry',
    ])

    assert decode(source) == {'items': ['apple', 'banana', 'cherry']}
324 | 
325 | 
def test_decode_mixed_types_with_dash_markers():
    """Dash items may be a string, a number or an inline ``key: value`` object."""
    source = '\n'.join([
        'mixed[3]:',
        '  - string value',
        '  - 42',
        '  - key: value',
    ])

    assert decode(source) == {'mixed': ['string value', 42, {'key': 'value'}]}
340 | 
341 | 
def test_decode_datetime_string():
    """Quoted ISO datetime text decodes to plain strings."""
    source = '\n'.join([
        'created: "2024-01-01T12:30:45"',
        'updated: "2024-06-15T09:00:00"',
    ])

    assert decode(source) == {
        'created': '2024-01-01T12:30:45',
        'updated': '2024-06-15T09:00:00',
    }
355 | 
356 | 
def test_decode_scientific_notation():
    """Numbers written with an exponent decode to the equivalent float."""
    source = '\n'.join([
        'small: 1e-06',
        'smaller: 1e-07',
        'large: 1.5e+16',
        'very_large: 1.23e20',
        'normal: 3.14159',
    ])

    assert decode(source) == {
        'small': 1e-06,
        'smaller': 1e-07,
        'large': 1.5e+16,
        'very_large': 1.23e20,
        'normal': 3.14159,
    }
376 | 
377 | 
def test_decode_decimal_notation():
    """Numbers written out in full decimal form decode to the same values."""
    source = '\n'.join([
        'small: 0.000001',
        'smaller: 0.0000001',
        'large: 15000000000000000',
        'normal: 3.14159',
        'integer: 42',
    ])

    assert decode(source) == {
        'small': 0.000001,
        'smaller': 0.0000001,
        'large': 15000000000000000.0,
        'normal': 3.14159,
        'integer': 42,
    }
397 | 
398 | 
def test_decode_float_array_with_scientific():
    """Exponent-notation numbers are decoded inside inline arrays too."""
    decoded = decode('values: [1e-06,1e-07,1.5e+16,3.14159]')

    assert decoded == {'values': [1e-06, 1e-07, 1.5e+16, 3.14159]}
410 | 
411 | 
def test_decode_root_inline_array():
    """An inline array at the root decodes to a list, not a dict."""
    assert decode('[1,2,3,4,5]') == [1, 2, 3, 4, 5]
421 | 
422 | 
def test_decode_root_tabular_array():
    """A keyless tabular array at the root decodes to a list of dicts."""
    source = '\n'.join([
        '[3]{id,name}:',
        '  1,Alice',
        '  2,Bob',
        '  3,Charlie',
    ])

    assert decode(source) == [
        {'id': 1, 'name': 'Alice'},
        {'id': 2, 'name': 'Bob'},
        {'id': 3, 'name': 'Charlie'},
    ]
439 | 
440 | 
def test_decode_root_list_array():
    """A keyless list array at the root may hold scalars, objects and arrays."""
    source = '\n'.join([
        '[4]:',
        '  - 1',
        '  - text',
        '  - nested: object',
        '  - [1,2,3]',
    ])

    assert decode(source) == [1, 'text', {'nested': 'object'}, [1, 2, 3]]
459 | 
460 | 
def test_decode_4space_indent():
    """Four-space indentation is handled without passing an ``indent`` option."""
    source = '\n'.join([
        'user:',
        '    name: Alice',
        '    age: 30',
        '    profile:',
        '        city: NYC',
        '        country: USA',
    ])

    profile = {'city': 'NYC', 'country': 'USA'}

    assert decode(source) == {
        'user': {'name': 'Alice', 'age': 30, 'profile': profile},
    }
484 | 
485 | 
def test_decode_explicit_indent_override():
    """An explicit ``indent`` option lets an unusual width be decoded."""
    # Three spaces per level, declared through the option
    source = '\n'.join([
        'data:',
        '   value: 123',
        '   nested:',
        '      item: test',
    ])

    decoded = decode(source, {'indent': 3})

    assert decoded == {'data': {'value': 123, 'nested': {'item': 'test'}}}
506 | 
507 | 
def test_decode_array_with_custom_indent():
    """Tabular rows indented by four spaces are decoded."""
    source = '\n'.join([
        'users[2]{id,name}:',
        '    1,Alice',
        '    2,Bob',
    ])

    assert decode(source) == {
        'users': [
            {'id': 1, 'name': 'Alice'},
            {'id': 2, 'name': 'Bob'},
        ]
    }
524 | 
525 | 
def test_decode_strict_mode_correct_count():
    """Strict mode accepts an array whose row count matches the header."""
    source = '\n'.join([
        'users[2]{id,name}:',
        '  1,Alice',
        '  2,Bob',
    ])

    decoded = decode(source, {'strict': True})

    assert decoded == {
        'users': [
            {'id': 1, 'name': 'Alice'},
            {'id': 2, 'name': 'Bob'},
        ]
    }
542 | 
543 | 
def test_decode_strict_mode_too_few_items():
    """Test strict mode raises error when array has fewer items than declared."""
    # Header declares 3 rows but only 2 follow
    toon = """users[3]{id,name}:
  1,Alice
  2,Bob"""

    # pytest.raises fails the test by itself when nothing is raised, so no
    # manual "assert False" sentinel is needed.
    with pytest.raises(ValueError) as exc_info:
        decode(toon, {'strict': True})

    message = str(exc_info.value)
    assert 'Array length mismatch' in message
    assert 'expected 3, got 2' in message
556 | 
557 | 
def test_decode_non_strict_mode_too_few_items():
    """With ``strict`` off, fewer rows than declared are returned as-is."""
    # Header declares 5 rows but only 2 follow
    source = '\n'.join([
        'users[5]{id,name}:',
        '  1,Alice',
        '  2,Bob',
    ])

    decoded = decode(source, {'strict': False})

    assert decoded == {
        'users': [
            {'id': 1, 'name': 'Alice'},
            {'id': 2, 'name': 'Bob'},
        ]
    }
574 | 
575 | 
def test_decode_strict_mode_list_array():
    """Strict mode accepts a list array whose item count matches the header."""
    source = '\n'.join(['items[2]:', '  - item1', '  - item2'])

    decoded = decode(source, {'strict': True})

    assert decoded == {'items': ['item1', 'item2']}
589 | 
590 | 
def test_decode_strict_mode_list_array_mismatch():
    """Test strict mode raises error for list array length mismatch."""
    # Header declares 4 items but only 2 follow
    toon = """items[4]:
  - item1
  - item2"""

    # pytest.raises fails the test by itself when nothing is raised, so no
    # manual "assert False" sentinel is needed.
    with pytest.raises(ValueError) as exc_info:
        decode(toon, {'strict': True})

    message = str(exc_info.value)
    assert 'Array length mismatch' in message
    assert 'expected 4, got 2' in message
603 | 


--------------------------------------------------------------------------------