├── CNAME ├── docs ├── CNAME ├── robots.txt ├── schema_reference │ ├── foreign_keys.md │ └── special_field_types.md ├── examples │ └── miscellaneous.md ├── index.md └── deep_dive │ ├── output_options.md │ └── foreign_keys.md ├── tests ├── __init__.py ├── test_templates.py └── test_custom_generators.py ├── examples ├── invalid_schemas │ ├── output │ │ ├── invalid_missing_name.csv │ │ ├── valid_customer.csv │ │ └── valid_product.csv │ ├── schemas │ │ ├── invalid_field_type.yml │ │ ├── invalid_missing_name.yml │ │ ├── invalid_template_type.yml │ │ ├── invalid_foreign_key.yml │ │ ├── valid_customer.yml │ │ └── valid_product.yml │ └── example_schema_validation.py ├── unstructured_only │ └── healthcare_yml │ │ ├── output │ │ ├── LabResult │ │ │ ├── document_1.pdf │ │ │ ├── document_2.pdf │ │ │ ├── document_3.pdf │ │ │ ├── document_4.pdf │ │ │ └── document_5.pdf │ │ └── MedicalReport │ │ │ ├── document_1.pdf │ │ │ ├── document_2.pdf │ │ │ ├── document_3.pdf │ │ │ ├── document_4.pdf │ │ │ └── document_5.pdf │ │ ├── schemas │ │ ├── lab_result.yml │ │ └── medical_report.yml │ │ ├── README.md │ │ ├── templates │ │ ├── lab_result_template.html │ │ └── medical_report_template.html │ │ └── generate_healthcare_data.py ├── structured_and_unstructured │ ├── retail_yml │ │ ├── output │ │ │ ├── Receipt │ │ │ │ ├── document_1.pdf │ │ │ │ ├── document_2.pdf │ │ │ │ ├── document_3.pdf │ │ │ │ ├── document_4.pdf │ │ │ │ └── document_5.pdf │ │ │ ├── category.csv │ │ │ ├── customer.csv │ │ │ └── transaction.csv │ │ └── schemas │ │ │ ├── category.yml │ │ │ ├── customer.yml │ │ │ ├── product.yml │ │ │ ├── transaction.yml │ │ │ └── receipt.yml │ └── crm_sqlalchemy │ │ ├── output │ │ ├── contract_documents │ │ │ ├── document_1.pdf │ │ │ └── document_2.pdf │ │ ├── proposal_documents │ │ │ ├── document_1.pdf │ │ │ ├── document_2.pdf │ │ │ └── document_3.pdf │ │ ├── customers.csv │ │ ├── contacts.csv │ │ └── opportunities.csv │ │ └── templates │ │ ├── proposal.html │ │ └── contract.html ├── 
structured_only │ ├── hr_employee_example │ │ ├── output │ │ │ ├── department.csv │ │ │ ├── position.csv │ │ │ ├── employee.csv │ │ │ └── performance_review.csv │ │ ├── schemas │ │ │ ├── position.yml │ │ │ ├── department.yml │ │ │ ├── performance_review.yml │ │ │ └── employee.yml │ │ └── README.md │ ├── schema_files │ │ ├── yaml │ │ │ ├── category.yml │ │ │ ├── supplier.yml │ │ │ ├── inventory.yml │ │ │ └── product.yml │ │ └── json │ │ │ ├── user.json │ │ │ ├── post.json │ │ │ └── comment.json │ ├── output │ │ ├── example_dict_schemas │ │ │ └── ecommerce │ │ │ │ ├── customer.csv │ │ │ │ ├── orderitem.csv │ │ │ │ ├── order.csv │ │ │ │ └── product.csv │ │ ├── example_sqlalchemy_models │ │ │ └── crm_data │ │ │ │ ├── customers.csv │ │ │ │ ├── orders.csv │ │ │ │ ├── order_items.csv │ │ │ │ ├── contacts.csv │ │ │ │ └── products.csv │ │ ├── example_yaml_schemas │ │ │ └── inventory_data │ │ │ │ ├── category.csv │ │ │ │ ├── supplier.csv │ │ │ │ └── inventory.csv │ │ └── example_json_schemas │ │ │ └── blog_data │ │ │ └── user.csv │ └── example_proxy_configuration.py ├── quickstart_output_data │ ├── products.csv │ └── categories.csv ├── model_selection │ ├── output │ │ ├── test_openai_models │ │ │ └── gpt-4o │ │ │ │ ├── patient.csv │ │ │ │ └── claim.csv │ │ ├── test_gemini_models │ │ │ ├── flash-2-5 │ │ │ │ ├── claim.csv │ │ │ │ └── patient.csv │ │ │ └── flash-2-0 │ │ │ │ └── patient.csv │ │ ├── test_claude_models │ │ │ └── haiku-3-5 │ │ │ │ ├── patient.csv │ │ │ │ └── claim.csv │ │ └── test_grok_models │ │ │ ├── grok-3 │ │ │ ├── companies.csv │ │ │ └── products.csv │ │ │ └── grok-4 │ │ │ ├── companies.csv │ │ │ └── products.csv │ ├── example_openai_models.py │ ├── example_claude_models.py │ ├── example_gemini_models.py │ └── example_azureopenai_models.py └── quickstart.py ├── setup.py ├── .gitignore ├── requirements.txt ├── MANIFEST.in ├── syda ├── __init__.py ├── output.py └── utils.py ├── DCO ├── LICENSE ├── CITATION.cff ├── CHANGELOG.md └── mkdocs.yml /CNAME: 
-------------------------------------------------------------------------------- 1 | python.syda.ai -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | python.syda.ai 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test package for Syda library. 3 | """ 4 | -------------------------------------------------------------------------------- /examples/invalid_schemas/output/invalid_missing_name.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 1,Sample Record 3 | -------------------------------------------------------------------------------- /docs/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | 4 | Sitemap: https://python.syda.ai/sitemap.xml -------------------------------------------------------------------------------- /examples/invalid_schemas/output/valid_customer.csv: -------------------------------------------------------------------------------- 1 | id,name,email 2 | 1,John Doe,john.doe@example.com 3 | -------------------------------------------------------------------------------- /examples/invalid_schemas/output/valid_product.csv: -------------------------------------------------------------------------------- 1 | id,name,price 2 | 1,Wireless Bluetooth Headphones,79.99 3 | -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/LabResult/document_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/LabResult/document_1.pdf 
-------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/LabResult/document_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/LabResult/document_2.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/LabResult/document_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/LabResult/document_3.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/LabResult/document_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/LabResult/document_4.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/LabResult/document_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/LabResult/document_5.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/Receipt/document_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/retail_yml/output/Receipt/document_1.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/Receipt/document_2.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/retail_yml/output/Receipt/document_2.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/Receipt/document_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/retail_yml/output/Receipt/document_3.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/Receipt/document_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/retail_yml/output/Receipt/document_4.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/Receipt/document_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/retail_yml/output/Receipt/document_5.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/MedicalReport/document_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/MedicalReport/document_1.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/MedicalReport/document_2.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/MedicalReport/document_2.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/MedicalReport/document_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/MedicalReport/document_3.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/MedicalReport/document_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/MedicalReport/document_4.pdf -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/output/MedicalReport/document_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/unstructured_only/healthcare_yml/output/MedicalReport/document_5.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/contract_documents/document_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/crm_sqlalchemy/output/contract_documents/document_1.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/contract_documents/document_2.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/crm_sqlalchemy/output/contract_documents/document_2.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/proposal_documents/document_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/crm_sqlalchemy/output/proposal_documents/document_1.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/proposal_documents/document_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/crm_sqlalchemy/output/proposal_documents/document_2.pdf -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/proposal_documents/document_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syda-ai/syda/HEAD/examples/structured_and_unstructured/crm_sqlalchemy/output/proposal_documents/document_3.pdf -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimal setup.py for backward compatibility. 
3 | The actual configuration is in pyproject.toml 4 | """ 5 | 6 | from setuptools import setup 7 | 8 | # For backward compatibility, keep a minimal setup.py 9 | # All configuration is now in pyproject.toml 10 | setup() 11 | -------------------------------------------------------------------------------- /examples/invalid_schemas/schemas/invalid_field_type.yml: -------------------------------------------------------------------------------- 1 | __name__: InvalidFieldType 2 | __description__: Invalid field type schema 3 | 4 | id: 5 | type: integer 6 | primary_key: true 7 | not_null: true 8 | min: 1 9 | max: 10000 10 | description: Unique ID 11 | 12 | name: 13 | type: string 14 | length: 50 15 | not_null: true 16 | description: Name 17 | 18 | age: 19 | type: non_existent_type # Invalid field type 20 | min: 18 21 | max: 100 22 | description: Age 23 | -------------------------------------------------------------------------------- /examples/invalid_schemas/schemas/invalid_missing_name.yml: -------------------------------------------------------------------------------- 1 | # Invalid schema - missing required __name__ field 2 | __description__: Schema with missing name field 3 | 4 | id: 5 | type: integer 6 | primary_key: true 7 | not_null: true 8 | min: 1 9 | max: 10000 10 | description: Unique ID 11 | 12 | title: 13 | type: string 14 | length: 100 15 | not_null: true 16 | description: Sample title 17 | 18 | content: 19 | type: text 20 | length: 500 21 | description: Sample content 22 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/category.csv: -------------------------------------------------------------------------------- 1 | id,name,parent_id,description,display_order,active 2 | 1,Electronics,0,Main category for all electronic devices and gadgets,1, 3 | 2,Smartphones,1,Subcategory of electronics featuring mobile phones and smart devices,2, 4 | 3,Computers,1,"Subcategory of 
electronics including laptops, desktops, and computer accessories",3, 5 | 4,Clothing,0,Main category for all clothing and fashion items,4, 6 | 5,Men's Clothing,4,Subcategory of clothing specifically for men's fashion,5, 7 | -------------------------------------------------------------------------------- /examples/invalid_schemas/schemas/invalid_template_type.yml: -------------------------------------------------------------------------------- 1 | __name__: InvalidTemplate 2 | __description__: Invalid template schema - wrong template field type 3 | __template__: "not-a-boolean" # Invalid type, should be boolean true/false 4 | 5 | first_name: 6 | type: string 7 | length: 50 8 | description: First name 9 | 10 | last_name: 11 | type: string 12 | length: 50 13 | description: Last name 14 | 15 | # Missing required fields for a template 16 | # Missing __template_source__, __input_file_type__, __output_file_type__ 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | .venv 25 | .venvtest 26 | venv/ 27 | env/ 28 | 29 | # Environment variables 30 | .env 31 | 32 | # Database 33 | *.sqlite 34 | *.sqlite3 35 | 36 | # Alembic 37 | service/alembic/versions/* 38 | 39 | # IDE 40 | .idea/ 41 | .vscode/ 42 | *.swp 43 | *.swo 44 | *~ 45 | logs/ 46 | .windsurf/ 47 | .cursor/ 48 | site/ 49 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/output/department.csv: -------------------------------------------------------------------------------- 1 | 
id,name,location,budget,manager_id,created_date 2 | 1,Human Resources,"Main Campus, Building A, 3rd Floor",750000,,2015-03-15 3 | 2,Information Technology,"Tech Wing, Building B, 2nd Floor",1250000,,2010-11-01 4 | 3,Finance,"Executive Tower, 5th Floor",950000,,2012-06-22 5 | 4,Marketing,"Creative Center, Building C, 4th Floor",650000,,2017-09-05 6 | 5,Sales,"Sales Plaza, Ground Floor",1500000,,2008-02-14 7 | 6,Research & Development,"Innovation Center, Building D, 6th Floor",2000000,,2013-11-30 8 | 7,Customer Support,"Service Hub, Building E, 1st Floor",500000,,2019-01-10 9 | 8,Legal,"Corporate Headquarters, 7th Floor",850000,,2016-07-18 10 | -------------------------------------------------------------------------------- /examples/invalid_schemas/schemas/invalid_foreign_key.yml: -------------------------------------------------------------------------------- 1 | __name__: InvalidForeignKey 2 | __description__: Invalid foreign key schema 3 | 4 | id: 5 | type: integer 6 | primary_key: true 7 | not_null: true 8 | min: 1 9 | max: 10000 10 | description: Unique ID 11 | 12 | name: 13 | type: string 14 | length: 50 15 | not_null: true 16 | description: Name 17 | 18 | # Invalid foreign key reference - wrong format 19 | customer_id: 20 | type: integer 21 | not_null: true 22 | min: 1 23 | description: Reference to customer 24 | references: Customer # Should be [table_name, column_name] format 25 | 26 | __foreign_keys__: 27 | invalid_column: NonExistentTable # References a non-existent table 28 | -------------------------------------------------------------------------------- /examples/structured_only/schema_files/yaml/category.yml: -------------------------------------------------------------------------------- 1 | __table_description__: Product categories in the inventory system 2 | 3 | id: 4 | type: number 5 | description: Unique identifier for the category 6 | constraints: 7 | primary_key: true 8 | 9 | name: 10 | type: text 11 | description: Name of the category 12 | 
constraints: 13 | unique: true 14 | max_length: 100 15 | 16 | description: 17 | type: text 18 | description: Description of the category and what types of products it contains 19 | constraints: 20 | max_length: 1000 21 | 22 | parent_id: 23 | type: number 24 | description: Reference to the parent category (for hierarchical categories), indicate 0 if it is a parent category 25 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_dict_schemas/ecommerce/customer.csv: -------------------------------------------------------------------------------- 1 | id,name,email,signup_date,loyalty_tier 2 | 1,Emily Rodriguez,emily.rodriguez@example.com,2022-03-15,Gold 3 | 2,Michael Chen,michael.chen@example.com,2021-11-22,Platinum 4 | 3,Sarah Thompson,sarah.thompson@example.com,2023-01-05,Gold 5 | 4,David Kim,david.kim@example.com,2022-07-30,Silver 6 | 5,Jessica Martinez,jessica.martinez@example.com,2021-05-18,Gold 7 | 6,Alex Johnson,alex.johnson@example.com,2022-09-12,Silver 8 | 7,Rachel Wong,rachel.wong@example.com,2023-02-28,Silver 9 | 8,Daniel Garcia,daniel.garcia@example.com,2021-12-05,Silver 10 | 9,Emma Anderson,emma.anderson@example.com,2022-06-17,Bronze 11 | 10,Ryan Nakamura,ryan.nakamura@example.com,2021-08-03,Platinum 12 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/output/position.csv: -------------------------------------------------------------------------------- 1 | id,title,level,min_salary,max_salary,is_management 2 | 1,Software Engineer,4,75000,120000, 3 | 2,Senior Product Manager,7,110000,180000, 4 | 3,Marketing Specialist,3,55000,85000, 5 | 4,IT Director,9,150000,250000, 6 | 5,Data Analyst,2,45000,75000, 7 | 6,Human Resources Manager,6,90000,140000, 8 | 7,Sales Representative,3,50000,90000, 9 | 8,Chief Financial Officer,10,200000,300000, 10 | 9,Customer Support Specialist,2,40000,65000, 11 | 10,Operations 
Manager,8,120000,190000, 12 | 11,UX Designer,4,65000,110000, 13 | 12,Network Administrator,5,70000,120000, 14 | 13,Project Coordinator,3,55000,85000, 15 | 14,Senior Software Architect,8,130000,220000, 16 | 15,Business Development Manager,7,100000,170000, 17 | -------------------------------------------------------------------------------- /examples/quickstart_output_data/products.csv: -------------------------------------------------------------------------------- 1 | id,name,category_id,price 2 | 1,Wireless Bluetooth Headphones,4,129.99 3 | 2,Organic Cotton T-Shirt,5,24.5 4 | 3,Stainless Steel Water Bottle,5,19.99 5 | 4,Smart Home Security Camera,2,79.99 6 | 5,Ergonomic Office Chair,2,249.99 7 | 6,Non-Stick Cookware Set,3,89.95 8 | 7,Hiking Backpack,2,64.5 9 | 8,Fitness Tracker Watch,1,99.99 10 | 9,Leather Messenger Bag,1,139.0 11 | 10,Noise-Cancelling Earbuds,5,159.5 12 | 11,Memory Foam Pillow,2,45.75 13 | 12,Running Shoes,1,89.99 14 | 13,Smart Coffee Maker,3,129.5 15 | 14,Portable Bluetooth Speaker,5,59.99 16 | 15,Winter Down Jacket,5,199.99 17 | 16,Gaming Mouse,3,49.99 18 | 17,Cast Iron Skillet,3,34.5 19 | 18,Minimalist Wall Clock,1,55.0 20 | 19,Yoga Mat,2,39.99 21 | 20,Designer Sunglasses,3,179.5 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | pydantic>=2.4.2 3 | python-dotenv>=1.0.0 4 | 5 | # Database 6 | sqlalchemy>=2.0.23 7 | 8 | # Data processing 9 | pandas>=2.0.3 10 | numpy>=1.24.3 11 | networkx>=3.1 12 | jsonref>=1.1.0 13 | 14 | # AI/ML 15 | openai>=1.0.0 16 | anthropic>=0.7.0 17 | instructor>=1.0.0 18 | google-genai>=1.30.0 19 | 20 | # File processing 21 | python-magic-bin>=0.4.14 22 | python-docx>=1.0.0 23 | openpyxl>=3.1.2 24 | weasyprint>=65.1 25 | 26 | # Configuration 27 | pyyaml>=6.0.1 28 | 29 | # Testing 30 | pytest>=7.4.0 31 | 32 | # Cloud and file processing 33 | boto3>=1.28.0 34 | 
azure-storage-blob>=12.19.0 35 | pdfplumber>=0.10.3 36 | pillow>=10.0.1 37 | pytesseract>=0.3.10 38 | sqlalchemy-utils>=0.41.1 39 | 40 | # Documentation 41 | mkdocs-material>=9.6.15 42 | mkdocs>=1.6.1 43 | mkdocs-macros-plugin>=1.3.7 -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/customers.csv: -------------------------------------------------------------------------------- 1 | id,name,industry,annual_revenue,employees,website,address,city,state,zip_code,status 2 | 1,TechInnovate Solutions,Information Technology,5750000.5,250,https://techinnovate.com,1234 Silicon Valley Drive,San Jose,CA,95110,Active 3 | 2,GreenEnergy Dynamics,Renewable Energy,12300000.75,500,https://greenenergydynamics.com,5678 Sustainability Lane,Austin,TX,78701,Prospect 4 | 3,HealthCare Innovations Inc.,Healthcare,8900000.25,375,https://healthcareinnovations.org,9012 Medical Research Park,Boston,MA,02115,Active 5 | 4,GlobalLogistics Network,Logistics,15600000.0,750,https://globallogisticsnetwork.com,3456 Trade Boulevard,Chicago,IL,60601,Inactive 6 | 5,FinTech Innovators,Financial Services,6800000.5,200,https://fintechinnovators.com,7890 Financial District Road,New York,NY,10007,Prospect 7 | -------------------------------------------------------------------------------- /docs/schema_reference/foreign_keys.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Foreign Key Schema Reference | Syda Documentation 3 | description: Complete guide to defining foreign key relationships in Syda schemas - syntax, examples, and best practices for maintaining referential integrity. 4 | keywords: 5 | - foreign key schema 6 | - schema relationships 7 | - referential integrity 8 | - foreign key syntax 9 | - database relationships 10 | --- 11 | 12 | # Multiple Ways to Define Foreign Keys 13 | 14 | Foreign keys can be defined in 2 ways: 15 | 16 | 1. 
Using the `__foreign_keys__` special section (recommended): 17 | ```yaml 18 | __foreign_keys__: 19 | user_id: [User, id] 20 | user_id: foreign_key 21 | ``` 22 | 23 | 2. Using the field definition with `references`: 24 | ```yaml 25 | user_id: 26 | type: foreign_key 27 | references: 28 | schema: User 29 | field: id 30 | ``` -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include README.md 3 | include LICENSE 4 | include CONTRIBUTING.md 5 | include DCO 6 | include requirements.txt 7 | include pyproject.toml 8 | include MANIFEST.in 9 | include .gitignore 10 | 11 | # Include documentation 12 | recursive-include docs *.md *.yml *.yaml *.html *.css *.js 13 | recursive-include docs *.png *.jpg *.gif *.svg 14 | 15 | # Include examples 16 | recursive-include examples *.py *.md *.yml *.yaml *.json *.html *.txt 17 | 18 | # Include schema files 19 | recursive-include syda/schema_versions *.json *.yaml *.yml 20 | 21 | # Exclude unwanted files 22 | recursive-exclude * __pycache__ 23 | recursive-exclude * *.py[co] 24 | recursive-exclude * *.so 25 | recursive-exclude * .DS_Store 26 | recursive-exclude * .git* 27 | recursive-exclude * *.egg-info 28 | recursive-exclude .venv * 29 | recursive-exclude build * 30 | recursive-exclude dist * 31 | recursive-exclude site * 32 | recursive-exclude .pytest_cache * -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/contacts.csv: -------------------------------------------------------------------------------- 1 | id,customer_id,first_name,last_name,email,phone,position,is_primary 2 | 1,4,John,Smith,john.smith@acmecorp.com,+1-555-123-4567,Chief Executive Officer, 3 | 2,3,Emily,Johnson,emily.johnson@techsolutions.net,+1-555-987-6543,IT Director, 4 | 3,2,Michael,Wong,michael.wong@globalinc.org,+1-555-246-8135,Sales Manager, 5 
| 4,3,Sarah,Martinez,sarah.martinez@innovate.com,+1-555-369-2580,Marketing Director, 6 | 5,4,David,Kim,david.kim@enterprisetech.co,+1-555-147-2589,Operations Manager, 7 | 6,2,Rachel,Patel,rachel.patel@cloudservices.io,+1-555-753-9514,Chief Technology Officer, 8 | 7,2,Alex,Rodriguez,alex.rodriguez@manufacturinggroup.com,+1-555-852-7410,Supply Chain Manager, 9 | 8,5,Jennifer,Lee,jennifer.lee@financialservices.net,+1-555-963-8520,Finance Director, 10 | 9,2,Robert,Chen,robert.chen@researchinstitute.org,+1-555-456-7890,Research Lead, 11 | 10,3,Amanda,Nguyen,amanda.nguyen@consultingfirm.com,+1-555-321-6540,Senior Consultant, 12 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_sqlalchemy_models/crm_data/customers.csv: -------------------------------------------------------------------------------- 1 | id,name,industry,website,status,created_at 2 | 1,TechInnovate Solutions,Information Technology,https://techinnovate.com,Prospect,2023-01-15 3 | 2,MediCare Health Systems,Healthcare,https://medicarehealth.org,Active,2023-02-22 4 | 3,FinPro Consulting Group,Financial Services,https://finproconsulting.com,Prospect,2023-03-10 5 | 4,GreenEnergy Innovations,Renewable Energy,https://greenenergy.io,Prospect,2023-04-05 6 | 5,EduTech Learning Solutions,Education Technology,https://edutech.com,Active,2023-05-18 7 | 6,RetailPro Management,Retail,https://retailpro.net,Prospect,2023-06-30 8 | 7,ManufactureTech Solutions,Manufacturing,https://manufacturetechsolutions.com,Prospect,2023-07-12 9 | 8,LogiCore Transportation,Logistics,https://logicore.com,Active,2023-08-25 10 | 9,AgriTech Innovations,Agriculture,https://agritech.io,Inactive,2023-09-07 11 | 10,MediaPro Communications,Media & Entertainment,https://mediapro.com,Prospect,2023-10-20 12 | -------------------------------------------------------------------------------- /examples/invalid_schemas/schemas/valid_customer.yml: 
-------------------------------------------------------------------------------- 1 | __description__: Valid customer schema example 2 | __name__: Customer 3 | 4 | id: 5 | type: integer 6 | primary_key: true 7 | not_null: true 8 | min: 1 9 | max: 10000 10 | description: Unique customer ID 11 | 12 | first_name: 13 | type: string 14 | length: 50 15 | not_null: true 16 | description: Customer's first name 17 | 18 | last_name: 19 | type: string 20 | length: 50 21 | not_null: true 22 | description: Customer's last name 23 | 24 | email: 25 | type: email 26 | length: 100 27 | unique: true 28 | not_null: true 29 | description: Customer's email address 30 | 31 | phone: 32 | type: string 33 | pattern: '^\+\d{1,3}-\d{3}-\d{3}-\d{4}$' 34 | length: 20 35 | description: Customer's phone number 36 | 37 | address: 38 | type: text 39 | length: 200 40 | description: Customer's full address 41 | 42 | join_date: 43 | type: date 44 | format: YYYY-MM-DD 45 | not_null: true 46 | description: Date when the customer joined 47 | -------------------------------------------------------------------------------- /examples/invalid_schemas/schemas/valid_product.yml: -------------------------------------------------------------------------------- 1 | __description__: Valid product schema example 2 | __name__: Product 3 | 4 | id: 5 | type: integer 6 | primary_key: true 7 | not_null: true 8 | min: 1 9 | max: 10000 10 | description: Unique product ID 11 | 12 | name: 13 | type: string 14 | length: 100 15 | unique: true 16 | not_null: true 17 | description: Product name 18 | 19 | category: 20 | type: string 21 | length: 50 22 | not_null: true 23 | description: Product category 24 | 25 | price: 26 | type: float 27 | min: 0.01 28 | max: 10000.00 29 | decimals: 2 30 | not_null: true 31 | description: Product price in USD 32 | 33 | description: 34 | type: text 35 | length: 500 36 | description: Detailed product description 37 | 38 | in_stock: 39 | type: boolean 40 | not_null: true 41 | description: Whether 
the product is in stock 42 | 43 | sku: 44 | type: string 45 | pattern: '^SKU-\d{6}-[A-Z]{2}$' 46 | length: 13 47 | unique: true 48 | not_null: true 49 | description: Stock keeping unit identifier 50 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/schemas/position.yml: -------------------------------------------------------------------------------- 1 | __table_name__: Position 2 | __description__: Job positions within the organization 3 | 4 | id: 5 | type: integer 6 | description: Unique identifier for the position 7 | constraints: 8 | primary_key: true 9 | min: 1 10 | max: 50 11 | 12 | title: 13 | type: text 14 | description: Job title of the position 15 | constraints: 16 | not_null: true 17 | min_length: 3 18 | max_length: 100 19 | 20 | level: 21 | type: integer 22 | description: Job level (1-10 where 10 is highest) 23 | constraints: 24 | min: 1 25 | max: 10 26 | 27 | min_salary: 28 | type: number 29 | description: Minimum salary range for the position 30 | constraints: 31 | min: 30000 32 | max: 200000 33 | 34 | max_salary: 35 | type: number 36 | description: Maximum salary range for the position 37 | constraints: 38 | min: 50000 39 | max: 300000 40 | 41 | is_management: 42 | type: boolean 43 | description: Whether the position has management responsibilities 44 | -------------------------------------------------------------------------------- /syda/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Syda - Synthetic Data Generation Library 3 | 4 | A Python library for AI-powered synthetic data generation with referential integrity. 5 | Supports multiple AI providers (OpenAI, Anthropic) and various schema formats. 
6 | """ 7 | 8 | from .generate import SyntheticDataGenerator 9 | from .schemas import ModelConfig 10 | 11 | __all__ = [ 12 | 'SyntheticDataGenerator', 13 | 'ModelConfig' 14 | ] 15 | 16 | __version__ = '0.0.4' 17 | __author__ = 'Rama Krishna Kumar Lingamgunta' 18 | __email__ = 'ramkumar2606@gmail.com' 19 | __license__ = 'MIT' 20 | __description__ = 'Seamlessly generates realistic synthetic test data—including structured, unstructured, PDF, and HTML—using AI and large language models. It preserves referential integrity, maintains privacy compliance, and accelerates development workflows. SYDA enables both highly regulated industries such as healthcare and banking, as well as non-regulated environments like software testing, research, and analytics, to safely simulate diverse data scenarios without exposing sensitive information.' 21 | -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/schemas/lab_result.yml: -------------------------------------------------------------------------------- 1 | __template__: true 2 | __description__: Laboratory test result template 3 | __name__: LabResult 4 | __foreign_keys__: {} 5 | __template_source__: /home/ramkumar2606/syda-fresh/examples/unstructured_only/healthcare_yml/templates/lab_result_template.html 6 | __input_file_type__: html 7 | __output_file_type__: pdf 8 | 9 | # Report information 10 | lab_id: 11 | type: string 12 | format: uuid 13 | 14 | patient_id: 15 | type: string 16 | format: uuid 17 | 18 | collection_date: 19 | type: string 20 | format: date-time 21 | 22 | report_date: 23 | type: string 24 | format: date-time 25 | 26 | # Test information 27 | test_name: 28 | type: string 29 | 30 | test_category: 31 | type: string 32 | 33 | result_value: 34 | type: string 35 | 36 | reference_range: 37 | type: string 38 | 39 | unit: 40 | type: string 41 | 42 | interpretation: 43 | type: string 44 | 45 | ordering_physician: 46 | type: string 47 | 48 | 
performing_lab: 49 | type: string 50 | 51 | is_flagged: 52 | type: boolean 53 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/schemas/department.yml: -------------------------------------------------------------------------------- 1 | __table_name__: Department 2 | __description__: Department information for the organization 3 | 4 | id: 5 | type: integer 6 | description: Unique identifier for the department 7 | constraints: 8 | primary_key: true 9 | min: 1 10 | max: 100 11 | 12 | name: 13 | type: text 14 | description: Name of the department 15 | constraints: 16 | not_null: true 17 | min_length: 3 18 | max_length: 50 19 | 20 | location: 21 | type: text 22 | description: Physical location of the department 23 | constraints: 24 | max_length: 100 25 | 26 | budget: 27 | type: number 28 | description: Annual budget allocated to the department 29 | constraints: 30 | min: 10000 31 | max: 10000000 32 | 33 | manager_id: 34 | type: integer 35 | description: Employee ID of the department manager 36 | constraints: 37 | min: 1 38 | max: 1000 39 | 40 | created_date: 41 | type: date 42 | description: Date when the department was created 43 | constraints: 44 | format: YYYY-MM-DD 45 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/schemas/category.yml: -------------------------------------------------------------------------------- 1 | __table_name__: Category 2 | __description__: Retail product categories 3 | 4 | id: 5 | type: integer 6 | description: Unique category ID 7 | constraints: 8 | primary_key: true 9 | not_null: true 10 | min: 1 11 | max: 1000 12 | 13 | name: 14 | type: string 15 | description: Category name 16 | constraints: 17 | not_null: true 18 | length: 50 19 | unique: true 20 | 21 | parent_id: 22 | type: integer 23 | description: Parent category ID for hierarchical categories, if it is a parent category, 
this field should be 0 24 | constraints: 25 | min: 0 26 | max: 1000 27 | 28 | description: 29 | type: text 30 | description: Detailed category description 31 | constraints: 32 | length: 500 33 | 34 | display_order: 35 | type: integer 36 | description: Order for displaying the category 37 | constraints: 38 | min: 1 39 | max: 100 40 | 41 | active: 42 | type: boolean 43 | description: Whether the category is active 44 | constraints: 45 | not_null: true 46 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | By making a contribution to this project, I certify that: 5 | 6 | (1) The contribution was created in whole or in part by me and I have 7 | the right to submit it under the open-source license indicated in 8 | the file; or 9 | 10 | (2) The contribution is based upon previous work that, to the best of my 11 | knowledge, is covered under an appropriate open-source license and I 12 | have the right under that license to submit that work under the same 13 | open-source license (unless I am permitted to submit under a 14 | different license), as indicated in the file; or 15 | 16 | (3) The contribution was provided directly to me by some other person 17 | who certified (1) or (2) and I have not modified it. 18 | 19 | I understand and agree that this project and the contribution are 20 | public and that a record of the contribution (including all personal 21 | information I submit with it) is maintained indefinitely and may be 22 | redistributed consistent with this project or the open-source license(s) 23 | involved. 
24 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_dict_schemas/ecommerce/orderitem.csv: -------------------------------------------------------------------------------- 1 | id,order_id,product_id,quantity,unit_price 2 | 1,13,6,5,19.99 3 | 2,23,14,2,45.5 4 | 3,18,6,3,12.75 5 | 4,12,7,1,89.99 6 | 5,7,14,6,7.25 7 | 6,15,5,4,33.4 8 | 7,18,3,2,55.2 9 | 8,8,1,3,22.99 10 | 9,22,7,1,149.99 11 | 10,5,7,5,8.5 12 | 11,12,11,4,41.25 13 | 12,16,11,2,67.8 14 | 13,13,12,3,15.6 15 | 14,11,9,6,9.99 16 | 15,14,11,2,88.5 17 | 16,25,14,1,129.75 18 | 17,20,2,5,18.25 19 | 18,18,1,3,36.4 20 | 19,17,5,4,52.99 21 | 20,25,4,2,75.6 22 | 21,17,4,1,199.99 23 | 22,20,3,6,11.5 24 | 23,19,15,3,44.75 25 | 24,1,6,2,63.2 26 | 25,7,15,5,16.99 27 | 26,11,3,4,29.5 28 | 27,23,15,2,95.4 29 | 28,25,14,1,169.99 30 | 29,10,10,6,7.8 31 | 30,8,12,3,42.25 32 | 31,20,3,4,37.6 33 | 32,2,15,2,81.5 34 | 33,24,3,1,249.99 35 | 34,11,9,5,14.75 36 | 35,13,2,3,56.2 37 | 36,6,15,6,9.5 38 | 37,25,6,2,67.99 39 | 38,5,8,4,25.4 40 | 39,1,6,1,179.5 41 | 40,4,15,5,13.25 42 | 41,3,4,3,48.6 43 | 42,17,12,2,92.75 44 | 43,21,11,6,8.99 45 | 44,20,11,4,35.5 46 | 45,9,1,2,76.25 47 | 46,20,15,1,189.99 48 | 47,6,4,5,17.6 49 | 48,14,10,3,43.75 50 | 49,16,9,4,59.5 51 | 50,4,7,2,84.99 52 | -------------------------------------------------------------------------------- /examples/quickstart_output_data/categories.csv: -------------------------------------------------------------------------------- 1 | id,name,description 2 | 1,Electronics,"A comprehensive category featuring the latest consumer electronics, including smartphones, laptops, tablets, smart home devices, audio equipment, and cutting-edge gadgets for tech enthusiasts." 3 | 2,Home Decor,"Stylish and functional items to enhance living spaces, including furniture, wall art, decorative accessories, lighting fixtures, rugs, and home accent pieces that help create personalized and comfortable environments." 
4 | 3,Sports & Fitness,"Athletic equipment, workout gear, and sporting goods for various activities including team sports, individual fitness, outdoor recreation, exercise accessories, and performance apparel for athletes of all levels." 5 | 4,Kitchen & Dining,"Essential and innovative kitchenware, cooking tools, appliances, dining sets, cookware, bakeware, kitchen gadgets, and culinary accessories for home chefs and cooking enthusiasts." 6 | 5,Fashion & Accessories,"Trendy clothing, shoes, jewelry, handbags, watches, and personal accessories for men, women, and children, covering casual wear, formal attire, seasonal collections, and style essentials." 7 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_sqlalchemy_models/crm_data/orders.csv: -------------------------------------------------------------------------------- 1 | id,customer_id,order_date,status,total_amount 2 | 1,2,2023-06-15,Delivered,248.75 3 | 2,1,2023-06-16,Processing,129.5 4 | 3,7,2023-06-14,Shipped,345.2 5 | 4,6,2023-06-17,New,87.3 6 | 5,4,2023-06-13,Delivered,456.9 7 | 6,5,2023-06-15,Processing,212.45 8 | 7,2,2023-06-16,Shipped,167.8 9 | 8,9,2023-06-14,New,93.6 10 | 9,9,2023-06-17,Delivered,276.4 11 | 10,7,2023-06-15,Cancelled,55.2 12 | 11,7,2023-06-13,Processing,189.75 13 | 12,2,2023-06-16,Shipped,422.3 14 | 13,10,2023-06-14,New,76.5 15 | 14,6,2023-06-17,Delivered,534.6 16 | 15,3,2023-06-15,Processing,145.9 17 | 16,7,2023-06-13,Shipped,267.4 18 | 17,8,2023-06-16,New,102.75 19 | 18,9,2023-06-14,Delivered,389.2 20 | 19,6,2023-06-17,Cancelled,67.3 21 | 20,7,2023-06-15,Processing,224.6 22 | 21,3,2023-06-13,Shipped,298.75 23 | 22,4,2023-06-16,New,115.4 24 | 23,1,2023-06-14,Delivered,467.9 25 | 24,7,2023-06-17,Processing,156.3 26 | 25,8,2023-06-15,Shipped,321.6 27 | 26,7,2023-06-13,New,89.75 28 | 27,4,2023-06-16,Delivered,512.4 29 | 28,10,2023-06-14,Cancelled,62.9 30 | 29,4,2023-06-17,Processing,203.5 31 | 
30,3,2023-06-15,Shipped,376.8 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Rama Krishna Kumar Lingamgunta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/examples/miscellaneous.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Miscellaneous Examples & Use Cases | Syda Documentation 3 | description: Explore additional Syda examples and use cases - advanced patterns, integrations, and real-world applications for synthetic data generation. 
4 | keywords: 5 | - Syda examples 6 | - use cases 7 | - advanced patterns 8 | - integration examples 9 | - real-world applications 10 | --- 11 | 12 | # Miscellaneous Examples 13 | 14 | This page provides links to additional examples that showcase various features and use cases of SYDA beyond the main documented examples. 15 | 16 | ## GitHub Examples Repository 17 | 18 | For a complete collection of examples, please visit the [SYDA GitHub examples directory](https://github.com/syda-ai/syda/tree/main/examples). 19 | 20 | 21 | 22 | ## Contributing Examples 23 | 24 | We welcome contributions of new examples! If you've created an interesting use case or implementation with SYDA, please consider submitting it to our GitHub repository. 25 | 26 | --- 27 | 28 | > For the most up-to-date and complete collection of examples, always refer to the [official GitHub repository](https://github.com/syda-ai/syda/tree/main/examples). 29 | -------------------------------------------------------------------------------- /examples/structured_only/schema_files/json/user.json: -------------------------------------------------------------------------------- 1 | { 2 | "__table_description__": "User accounts for the blog system", 3 | "id": { 4 | "type": "number", 5 | "description": "Unique identifier for the user", 6 | "constraints": { 7 | "primary_key": true 8 | } 9 | }, 10 | "username": { 11 | "type": "text", 12 | "description": "User's login name", 13 | "constraints": { 14 | "unique": true, 15 | "min_length": 3, 16 | "max_length": 50 17 | } 18 | }, 19 | "email": { 20 | "type": "email", 21 | "description": "User's email address", 22 | "constraints": { 23 | "unique": true, 24 | "max_length": 150 25 | } 26 | }, 27 | "full_name": { 28 | "type": "text", 29 | "description": "User's full name", 30 | "constraints": { 31 | "max_length": 100 32 | } 33 | }, 34 | "join_date": { 35 | "type": "date", 36 | "description": "Date when the user registered" 37 | }, 38 | "bio": { 39 | "type": "text", 40 | 
"description": "User's biographical information" 41 | }, 42 | "is_admin": { 43 | "type": "boolean", 44 | "description": "Whether the user has administrator privileges" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_sqlalchemy_models/crm_data/order_items.csv: -------------------------------------------------------------------------------- 1 | id,order_id,product_id,quantity,unit_price 2 | 1,10,9,5,24.99 3 | 2,30,11,8,49.5 4 | 3,22,13,8,99.99 5 | 4,30,2,1,12.75 6 | 5,5,1,5,79.99 7 | 6,20,15,9,34.5 8 | 7,10,7,2,15.99 9 | 8,14,11,10,89.99 10 | 9,19,3,1,44.75 11 | 10,8,13,3,62.5 12 | 11,4,6,4,129.99 13 | 12,7,5,5,22.25 14 | 13,26,9,1,54.99 15 | 14,7,6,9,39.5 16 | 15,12,5,10,19.99 17 | 16,9,6,2,99.75 18 | 17,11,2,9,67.5 19 | 18,4,10,5,49.5 20 | 19,10,1,8,149.99 21 | 20,21,3,9,28.75 22 | 21,14,8,7,59.99 23 | 22,10,15,8,44.75 24 | 23,6,7,10,17.5 25 | 24,4,11,6,89.99 26 | 25,30,10,1,72.25 27 | 26,6,9,7,62.5 28 | 27,7,6,10,119.99 29 | 28,29,10,2,22.25 30 | 29,26,10,3,64.5 31 | 30,12,1,8,39.5 32 | 31,27,10,7,29.99 33 | 32,3,15,6,99.75 34 | 33,24,4,10,67.5 35 | 34,3,7,3,49.5 36 | 35,21,5,9,149.99 37 | 36,10,15,3,28.75 38 | 37,26,6,3,59.99 39 | 38,4,3,7,44.75 40 | 39,1,2,9,17.5 41 | 40,11,12,5,89.99 42 | 41,1,5,3,72.25 43 | 42,25,3,10,62.5 44 | 43,1,8,7,119.99 45 | 44,6,11,3,22.25 46 | 45,29,14,2,64.5 47 | 46,24,6,5,39.5 48 | 47,30,12,4,29.99 49 | 48,25,6,7,99.75 50 | 49,3,1,9,67.5 51 | 50,25,8,10,49.5 52 | 51,11,8,2,149.99 53 | 52,23,4,6,28.75 54 | 53,8,8,3,59.99 55 | 54,19,8,4,44.75 56 | 55,8,9,2,17.5 57 | 56,22,6,3,89.99 58 | 57,30,5,1,72.25 59 | 58,4,15,4,62.5 60 | 59,20,9,1,119.99 61 | 60,4,12,1,22.25 62 | -------------------------------------------------------------------------------- /examples/structured_only/schema_files/yaml/supplier.yml: -------------------------------------------------------------------------------- 1 | __table_description__: Suppliers who provide products 
to the inventory system 2 | 3 | id: 4 | type: number 5 | description: Unique identifier for the supplier 6 | constraints: 7 | primary_key: true 8 | 9 | company_name: 10 | type: text 11 | description: Name of the supplier company 12 | constraints: 13 | unique: true 14 | max_length: 200 15 | 16 | contact_name: 17 | type: text 18 | description: Name of the primary contact person at the supplier 19 | constraints: 20 | max_length: 100 21 | 22 | email: 23 | type: email 24 | description: Email address for the supplier 25 | constraints: 26 | unique: true 27 | max_length: 150 28 | 29 | phone: 30 | type: text 31 | description: Phone number for the supplier 32 | constraints: 33 | max_length: 30 34 | 35 | address: 36 | type: text 37 | description: Physical address of the supplier 38 | constraints: 39 | max_length: 300 40 | 41 | website: 42 | type: text 43 | description: Supplier's website URL 44 | constraints: 45 | max_length: 200 46 | 47 | payment_terms: 48 | type: text 49 | description: Payment terms for this supplier (e.g., Net 30, Net 60) 50 | constraints: 51 | max_length: 100 52 | 53 | active: 54 | type: boolean 55 | description: Whether the supplier relationship is currently active 56 | -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/schemas/medical_report.yml: -------------------------------------------------------------------------------- 1 | __template__: true 2 | __description__: Medical report template for patient visits 3 | __name__: MedicalReport 4 | __foreign_keys__: {} 5 | __template_source__: /home/ramkumar2606/syda-fresh/examples/unstructured_only/healthcare_yml/templates/medical_report_template.html 6 | __input_file_type__: html 7 | __output_file_type__: pdf 8 | 9 | # Patient information 10 | patient_id: 11 | type: string 12 | format: uuid 13 | 14 | patient_name: 15 | type: string 16 | 17 | date_of_birth: 18 | type: string 19 | format: date 20 | 21 | visit_date: 22 | type: string 23 | 
format: date-time 24 | 25 | chief_complaint: 26 | type: string 27 | 28 | medical_history: 29 | type: string 30 | 31 | # Vital signs 32 | blood_pressure: 33 | type: string 34 | 35 | heart_rate: 36 | type: integer 37 | 38 | respiratory_rate: 39 | type: integer 40 | 41 | temperature: 42 | type: number 43 | 44 | oxygen_saturation: 45 | type: integer 46 | 47 | # Clinical information 48 | assessment: 49 | type: string 50 | 51 | diagnosis: 52 | type: array 53 | items: 54 | type: string 55 | 56 | treatment_plan: 57 | type: string 58 | 59 | medications: 60 | type: array 61 | items: 62 | type: string 63 | description: List of medications in format "Name - Dosage - Frequency" 64 | 65 | follow_up: 66 | type: string 67 | -------------------------------------------------------------------------------- /examples/structured_only/schema_files/json/post.json: -------------------------------------------------------------------------------- 1 | { 2 | "__table_description__": "Blog posts created by users", 3 | "id": { 4 | "type": "number", 5 | "description": "Unique identifier for the post", 6 | "constraints": { 7 | "primary_key": true 8 | } 9 | }, 10 | "author_id": { 11 | "type": "foreign_key", 12 | "description": "Reference to the user who created the post", 13 | "references": { 14 | "schema": "User", 15 | "field": "id" 16 | } 17 | }, 18 | "title": { 19 | "type": "text", 20 | "description": "Title of the blog post", 21 | "constraints": { 22 | "unique": true, 23 | "max_length": 200 24 | } 25 | }, 26 | "content": { 27 | "type": "text", 28 | "description": "Full content of the blog post", 29 | "constraints": { 30 | "max_length": 50000 31 | } 32 | }, 33 | "publish_date": { 34 | "type": "date", 35 | "description": "Date when the post was published" 36 | }, 37 | "category": { 38 | "type": "text", 39 | "description": "Category of the blog post", 40 | "constraints": { 41 | "max_length": 50 42 | } 43 | }, 44 | "tags": { 45 | "type": "text", 46 | "description": "Comma-separated list of tags 
for the post", 47 | "constraints": { 48 | "max_length": 500 49 | } 50 | }, 51 | "status": { 52 | "type": "text", 53 | "description": "Publication status of the post (draft, published, archived)" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /examples/structured_only/schema_files/json/comment.json: -------------------------------------------------------------------------------- 1 | { 2 | "__table_description__": "Comments on blog posts by users", 3 | "__foreign_keys__": { 4 | "post_id": ["Post", "id"], 5 | "user_id": ["User", "id"] 6 | }, 7 | "id": { 8 | "type": "number", 9 | "description": "Unique identifier for the comment", 10 | "constraints": { 11 | "primary_key": true 12 | } 13 | }, 14 | "post_id": { 15 | "type": "foreign_key", 16 | "description": "Reference to the post being commented on", 17 | "references": { 18 | "schema": "Post", 19 | "field": "id" 20 | } 21 | }, 22 | "user_id": { 23 | "type": "foreign_key", 24 | "description": "Reference to the user who wrote the comment", 25 | "references": { 26 | "schema": "User", 27 | "field": "id" 28 | } 29 | }, 30 | "content": { 31 | "type": "text", 32 | "description": "Content of the comment", 33 | "constraints": { 34 | "max_length": 1000 35 | } 36 | }, 37 | "created_at": { 38 | "type": "date", 39 | "description": "Date and time when the comment was created" 40 | }, 41 | "is_approved": { 42 | "type": "boolean", 43 | "description": "Whether the comment has been approved by moderators" 44 | }, 45 | "parent_comment_id": { 46 | "type": "number", 47 | "description": "Reference to the parent comment (for threaded comments); use 0 if this is a top-level comment made directly on the post" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_yaml_schemas/inventory_data/category.csv: -------------------------------------------------------------------------------- 1 | 
id,name,description,parent_id 2 | 1,Electronics,"Comprehensive category for all electronic devices, including computers, mobile devices, audio equipment, and home entertainment systems.",0 3 | 2,Computers,"Desktop computers, laptops, workstations, and computer components including processors, graphics cards, and memory modules.",1 4 | 3,Mobile Devices,"Smartphones, tablets, smartwatches, and related mobile accessories and communication devices.",1 5 | 4,Home Appliances,"Kitchen appliances, cleaning equipment, climate control devices, and other household electrical machines.",0 6 | 5,Kitchen Appliances,"Cooking equipment, refrigeration units, small countertop appliances, and food preparation tools.",4 7 | 6,Cleaning Equipment,"Vacuum cleaners, steam mops, robotic cleaners, and other home cleaning and maintenance devices.",4 8 | 7,Clothing,"Apparel and fashion items for men, women, and children, including seasonal and everyday wear.",0 9 | 8,Men's Clothing,"Shirts, pants, suits, casual wear, and accessories specifically designed for men.",7 10 | 9,Women's Clothing,"Dresses, blouses, skirts, professional attire, and fashion accessories for women.",7 11 | 10,Sports & Outdoors,"Athletic equipment, outdoor gear, fitness accessories, and recreational products for various sports and activities.",0 12 | 11,Fitness Equipment,"Gym machines, weights, exercise mats, cardio equipment, and personal training accessories.",10 13 | 12,Outdoor Gear,"Camping equipment, hiking gear, tents, sleeping bags, navigation tools, and adventure accessories.",10 14 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/customer.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,email,phone,address,city,state,postal_code,country,date_registered,loyalty_points,last_purchase_date 2 | 1245,Emily,Rodriguez,emily.rodriguez@gmail.com,+1-555-234-5678,3721 
Maple Street,Phoenix,Arizona,85001,United States,2022-03-15,2350,2023-06-22 3 | 2367,Michael,Chen,michael.chen@outlook.com,+1-555-789-0123,1592 Oak Avenue,Toronto,Ontario,M5V 2T6,Canada,2021-11-05,5670,2023-07-01 4 | 3489,Sophia,Martinez,sophia.martinez@yahoo.com,+1-555-456-7890,8246 Pine Road,Houston,Texas,77001,United States,2023-01-20,1125,2023-06-30 5 | 4612,Daniel,Wilson,daniel.wilson@hotmail.com,+1-555-901-2345,5713 Cedar Lane,Melbourne,Victoria,3000,Australia,2022-08-12,4890,2023-07-05 6 | 5734,Isabella,Garcia,isabella.garcia@gmail.com,+1-555-345-6789,2189 Birch Street,London,Greater London,SW1A 1AA,United Kingdom,2022-05-30,3675,2023-06-25 7 | 6856,David,Johnson,david.johnson@outlook.com,+1-555-678-9012,7465 Elm Drive,Mexico City,Mexico City,06000,Mexico,2021-09-18,6240,2023-07-03 8 | 7978,Emma,Smith,emma.smith@yahoo.com,+1-555-012-3456,4932 Willow Court,Chicago,Illinois,60601,United States,2023-02-14,890,2023-06-28 9 | 9100,Alexander,Lee,alexander.lee@hotmail.com,+1-555-234-5678,6217 Sycamore Road,Vancouver,British Columbia,V6B 1G1,Canada,2022-10-05,3210,2023-07-02 10 | 10222,Olivia,Brown,olivia.brown@gmail.com,+1-555-567-8901,1876 Magnolia Avenue,Sydney,New South Wales,2000,Australia,2022-06-22,5520,2023-06-29 11 | 11344,James,Taylor,james.taylor@outlook.com,+1-555-890-1234,9543 Redwood Street,Manchester,Greater Manchester,M1 1AE,United Kingdom,2021-12-10,4100,2023-07-04 12 | -------------------------------------------------------------------------------- /examples/structured_only/schema_files/yaml/inventory.yml: -------------------------------------------------------------------------------- 1 | __table_description__: Current inventory levels for products 2 | 3 | id: 4 | type: number 5 | description: Unique identifier for the inventory record 6 | constraints: 7 | primary_key: true 8 | 9 | product_id: 10 | type: foreign_key 11 | description: Reference to the product 12 | references: 13 | schema: Product 14 | field: id 15 | 16 | quantity: 17 | type: number 
18 | description: Current quantity in stock, typically between 0-200 units with higher volume products having larger quantities 19 | 20 | warehouse_location: 21 | type: text 22 | description: Location code within the warehouse in format [Zone]-[Aisle]-[Shelf] (e.g., 'A-3-12', where A is the zone, 3 is the aisle, and 12 is the shelf position) 23 | constraints: 24 | max_length: 50 25 | 26 | last_checked: 27 | type: date 28 | description: Date when inventory was last verified, typically within the last 90 days from current date 29 | 30 | batch_number: 31 | type: text 32 | description: Batch or lot number for inventory tracking in format 'B#####-X#' where # is a digit and X is a letter (e.g., 'B12345-A2') 33 | constraints: 34 | max_length: 100 35 | 36 | expiry_date: 37 | type: date 38 | description: Expiration date for the product (if applicable), typically 1-24 months in the future for perishable items, with approximately 50% of products having expiry dates 39 | 40 | purchase_order_id: 41 | type: text 42 | description: Reference to the purchase order that added this inventory, typically in format 'PO-######' where # is a digit 43 | constraints: 44 | max_length: 50 45 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_openai_models/gpt-4o/patient.csv: -------------------------------------------------------------------------------- 1 | patient_id,diagnosis_code,email,visit_date,notes 2 | 1,J20.9,jdoe@example.com,2023-09-12,Patient presented with symptoms of acute bronchitis. Prescribed rest and fluids. 3 | 2,E11.9,asmith@example.com,2023-08-23,Routine check-up for type 2 diabetes. Blood sugar levels stable. 4 | 3,I10,mbrown@example.com,2023-10-01,Hypertension follow-up. Blood pressure within normal range. 5 | 4,F41.1,ljohnson@example.com,2023-07-15,Patient experiencing generalized anxiety disorder. Referred to counseling. 6 | 5,M54.5,dlee@example.com,2023-09-30,Chronic low back pain. 
Recommended physical therapy exercises. 7 | 6,N39.0,kpatel@example.com,2023-09-18,Treated for urinary tract infection. Prescribed antibiotics. 8 | 7,E66.9,rwilson@example.com,2023-08-05,Obesity management consultation. Discussed dietary changes. 9 | 8,J45.909,gharris@example.com,2023-07-24,Asthma exacerbation. Adjusted medication dosage. 10 | 9,K21.9,dclark@example.com,2023-09-02,GERD symptoms reported. Advised lifestyle modifications. 11 | 10,L98.9,amartin@example.com,2023-10-10,Skin rash of unknown etiology. Referred to dermatologist. 12 | 11,G43.909,bwhite@example.com,2023-08-19,Migraine management. Prescribed triptans for acute attacks. 13 | 12,F32.9,csanchez@example.com,2023-09-27,"Major depressive disorder, recurrent episode. Medication adjusted." 14 | 13,R53.81,pjones@example.com,2023-08-15,Chronic fatigue syndrome. Discussed energy conservation techniques. 15 | 14,H52.4,mking@example.com,2023-07-29,Presbyopia. Fitted for reading glasses. 16 | 15,M79.7,lrodriguez@example.com,2023-09-05,Fibromyalgia pain management. Initiated low-impact aerobic exercises. 17 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_openai_models/gpt-4o/claim.csv: -------------------------------------------------------------------------------- 1 | claim_id,patient_id,diagnosis_code,email,visit_date,notes 2 | 1,13,E11.9,john.doe@example.com,2023-03-15,Patient presented with elevated blood sugar levels. Prescribed metformin. 3 | 2,14,I10,jane.smith@example.com,2023-03-16,Routine check-up. Blood pressure slightly elevated. Advised lifestyle changes. 4 | 3,8,J45.909,mike.jones@example.com,2023-03-17,Asthma flare-up. Prescribed inhaler. 5 | 4,12,F32.9,sarah.connor@example.com,2023-03-18,Patient experiencing depressive symptoms. Referred to counseling. 6 | 5,11,M54.5,david.brown@example.com,2023-03-19,Patient reports lower back pain. Recommended physical therapy. 
7 | 6,5,K21.9,linda.green@example.com,2023-03-20,Symptoms consistent with GERD. Advised dietary changes. 8 | 7,15,N39.0,steven.white@example.com,2023-03-21,Diagnosis of urinary tract infection. Prescribed antibiotics. 9 | 8,2,L20.9,emily.davis@example.com,2023-03-22,Eczema symptoms observed. Provided topical treatment. 10 | 9,12,G43.909,matthew.wilson@example.com,2023-03-23,Migraine headaches reported. Prescribed sumatriptan. 11 | 10,1,R51,olivia.taylor@example.com,2023-03-24,Patient complained of headache. Recommended hydration and rest. 12 | 11,15,E78.5,chris.martin@example.com,2023-03-25,Elevated cholesterol levels. Advised low-fat diet. 13 | 12,14,B34.9,anna.moore@example.com,2023-03-26,"Patient with viral infection, advised rest and fluids." 14 | 13,4,H52.4,paul.jackson@example.com,2023-03-27,Patient experiencing presbyopia. Recommended reading glasses. 15 | 14,6,Z00.00,megan.thompson@example.com,2023-03-28,General adult medical examination. All vitals normal. 16 | 15,8,R53.83,luke.anderson@example.com,2023-03-29,Patient reports fatigue. Recommended balanced diet and exercise. 
17 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/schemas/performance_review.yml: -------------------------------------------------------------------------------- 1 | __table_name__: PerformanceReview 2 | __description__: Employee performance reviews 3 | __foreign_keys__: 4 | employee_id: [Employee, id] 5 | 6 | id: 7 | type: integer 8 | description: Unique identifier for the performance review 9 | constraints: 10 | primary_key: true 11 | min: 1 12 | max: 5000 13 | 14 | employee_id: 15 | type: integer 16 | description: Employee ID for whom the review is written 17 | constraints: 18 | not_null: true 19 | min: 1 20 | max: 1000 21 | 22 | review_date: 23 | type: date 24 | description: Date when the review was conducted 25 | constraints: 26 | format: YYYY-MM-DD 27 | 28 | performance_score: 29 | type: number 30 | description: Overall performance score (1-5 where 5 is best) 31 | constraints: 32 | min: 1.0 33 | max: 5.0 34 | 35 | review_period_start: 36 | type: date 37 | description: Start date of the review period 38 | constraints: 39 | format: YYYY-MM-DD 40 | 41 | review_period_end: 42 | type: date 43 | description: End date of the review period 44 | constraints: 45 | format: YYYY-MM-DD 46 | 47 | strengths: 48 | type: text 49 | description: Employee strengths noted in the review 50 | constraints: 51 | max_length: 500 52 | 53 | areas_for_improvement: 54 | type: text 55 | description: Areas where the employee can improve 56 | constraints: 57 | max_length: 500 58 | 59 | goals_set: 60 | type: text 61 | description: Goals set for the next review period 62 | constraints: 63 | max_length: 500 64 | 65 | reviewer_comments: 66 | type: text 67 | description: Additional comments from the reviewer 68 | constraints: 69 | max_length: 1000 70 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/schemas/employee.yml: 
-------------------------------------------------------------------------------- 1 | __table_name__: Employee 2 | __description__: Employee information for the organization 3 | __foreign_keys__: 4 | department_id: [Department, id] 5 | position_id: [Position, id] 6 | 7 | id: 8 | type: integer 9 | description: Unique identifier for the employee 10 | constraints: 11 | primary_key: true 12 | min: 1 13 | max: 1000 14 | 15 | first_name: 16 | type: text 17 | description: First name of the employee 18 | constraints: 19 | not_null: true 20 | min_length: 2 21 | max_length: 50 22 | 23 | last_name: 24 | type: text 25 | description: Last name of the employee 26 | constraints: 27 | not_null: true 28 | min_length: 2 29 | max_length: 50 30 | 31 | email: 32 | type: text 33 | description: Email address of the employee 34 | constraints: 35 | format: email 36 | unique: true 37 | 38 | phone: 39 | type: text 40 | description: Phone number of the employee 41 | constraints: 42 | format: phone 43 | 44 | hire_date: 45 | type: date 46 | description: Date when the employee was hired 47 | constraints: 48 | format: YYYY-MM-DD 49 | 50 | department_id: 51 | type: integer 52 | description: Department ID where the employee works 53 | constraints: 54 | not_null: true 55 | min: 1 56 | max: 100 57 | 58 | position_id: 59 | type: integer 60 | description: Position ID of the employee 61 | constraints: 62 | not_null: true 63 | min: 1 64 | max: 50 65 | 66 | salary: 67 | type: number 68 | description: Annual salary of the employee 69 | constraints: 70 | min: 30000 71 | max: 300000 72 | 73 | is_active: 74 | type: boolean 75 | description: Whether the employee is currently active 76 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/output/opportunities.csv: -------------------------------------------------------------------------------- 1 | id,customer_id,name,value,stage,probability,expected_close_date,description 2 | 
1,1,Enterprise Cloud Migration Project,250000.5,Proposal,65.5,2024-03-15,"Large-scale cloud infrastructure migration for a financial services company, involving comprehensive assessment, strategy, and implementation of AWS cloud services." 3 | 2,4,AI-Powered Customer Support Platform,125000.75,Qualification,45.0,2024-04-30,Development of an intelligent chatbot and machine learning-driven customer support solution for a leading e-commerce platform. 4 | 3,3,Cybersecurity Enhanced Network Infrastructure,345000.25,Negotiation,82.5,2024-02-28,"Comprehensive cybersecurity upgrade for a multinational healthcare organization, including advanced threat detection and network segmentation." 5 | 4,5,Data Analytics and Visualization Suite,180000.0,Proposal,55.5,2024-05-10,Implementation of advanced data analytics tools and custom visualization dashboards for a retail marketing agency. 6 | 5,1,IoT Smart Manufacturing Solution,425000.75,Negotiation,75.0,2024-03-05,"End-to-end Internet of Things (IoT) solution for optimizing manufacturing processes, including sensor integration and real-time monitoring systems." 7 | 6,4,Enterprise Resource Planning Modernization,295000.5,Qualification,38.5,2024-04-15,Comprehensive ERP system upgrade and digital transformation project for a mid-sized manufacturing company. 8 | 7,4,Blockchain Supply Chain Management,210000.25,Lead,25.0,2024-06-20,Blockchain-based supply chain tracking and transparency solution for a global logistics and shipping company. 9 | 8,4,Healthcare Data Integration Platform,385000.0,Proposal,60.5,2024-04-25,Secure healthcare data integration and interoperability platform connecting multiple medical systems and ensuring HIPAA compliance. 
10 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: Syda - AI-Powered Synthetic Data Generation 3 | message: >- 4 | If you use this software, please cite it using the 5 | metadata from this file 6 | type: software 7 | authors: 8 | - family-names: Lingamgunta 9 | given-names: Rama Krishna Kumar 10 | email: ramkumar2606@gmail.com 11 | orcid: 'https://orcid.org/0009-0001-6201-7620' 12 | identifiers: 13 | - type: doi 14 | value: 10.5281/zenodo.17345575 15 | repository-code: 'https://github.com/syda-ai/syda' 16 | url: 'https://python.syda.ai' 17 | repository-artifact: 'https://pypi.org/project/syda' 18 | abstract: > 19 | SYDA seamlessly generates realistic synthetic test 20 | data—including structured, unstructured, PDF, and 21 | HTML—using AI and large language models. It preserves 22 | referential integrity, maintains privacy compliance, and 23 | accelerates development workflows. SYDA enables both 24 | highly regulated industries such as healthcare and 25 | banking, as well as non-regulated environments like 26 | software testing, research, and analytics, to safely 27 | simulate diverse data scenarios without exposing sensitive 28 | information. 
29 | keywords: 30 | - synthetic data 31 | - structured data generation 32 | - unstructured data generation 33 | - PDF generation 34 | - HTML generation 35 | - CSV generation 36 | - data simulation 37 | - AI 38 | - artificial intelligence 39 | - large language models 40 | - LLM 41 | - data governance 42 | - data validation 43 | - python 44 | - testing 45 | - privacy 46 | - SQLAlchemy 47 | - OpenAI GPT 48 | - Anthropic Claude 49 | - Google Gemini 50 | - xAI Grok 51 | - YAML 52 | - Dict 53 | - JSON 54 | commit: '52cd5357f776989940afd5511212453d7e9765b1' 55 | license: MIT 56 | version: 0.0.4 57 | date-released: '2025-10-10' 58 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | 9 | ## [0.0.3] - 2025-09-21 10 | 11 | ### Added 12 | - Azure OpenAI provider support for enterprise deployments 13 | - Advanced configuration with `extra_kwargs` parameter for all providers 14 | - AI gateway integration support (LiteLLM, Portkey, Kong, and custom gateways) 15 | - Comprehensive Azure OpenAI documentation and examples 16 | - Enhanced model configuration guide with `extra_kwargs` reference 17 | - Support for custom endpoints, authentication headers, and timeouts 18 | - Enterprise-grade features for production deployments 19 | 20 | ### Changed 21 | - Development status upgraded from Beta to Production/Stable 22 | - Enhanced documentation with AI gateway integration examples 23 | - Improved error handling and troubleshooting guidance 24 | - Updated model configuration documentation with provider-specific examples 25 | 26 | ### Fixed 27 | - Enhanced provider-specific parameter handling 28 | - Better error messages for configuration issues 29 | 30 | 31 | ## [0.0.2] - 2025-08-23 32 | 33 | ### Added 34 | - Support for Google Gemini Models 35 | 36 | ### Changed 37 | - Documentation Fixes 38 | 39 | 40 | ## [0.0.1] - 2025-08-11 41 | 42 | ### Added 43 | - Modern packaging with pyproject.toml 44 | - Support for multiple AI providers (OpenAI, Anthropic Claude) 45 | - Comprehensive schema formats (SQLAlchemy, YAML, JSON, Dict) 46 | - Foreign key relationship handling with referential integrity 47 | - Unstructured document generation with templates 48 | - Custom generators for domain-specific data 49 | - Multi-provider AI integration with consistent interface 50 | - Automatic dependency resolution via topological sorting -------------------------------------------------------------------------------- /examples/structured_only/schema_files/yaml/product.yml: -------------------------------------------------------------------------------- 1 | __table_description__: Products in the inventory management system 2 | 3 | id: 4 | type: number 5 | description: Unique 
identifier for the product 6 | constraints: 7 | primary_key: true 8 | 9 | name: 10 | type: text 11 | description: Name of the product 12 | constraints: 13 | unique: true 14 | max_length: 150 15 | 16 | category_id: 17 | type: foreign_key 18 | description: Reference to the product category 19 | references: 20 | schema: Category 21 | field: id 22 | 23 | sku: 24 | type: text 25 | description: Stock Keeping Unit - unique product identifier 26 | constraints: 27 | unique: true 28 | max_length: 50 29 | 30 | price: 31 | type: number 32 | description: Current price of the product in USD, typically ranging from $5 to $500 with prices clustered around common price points 33 | 34 | description: 35 | type: text 36 | description: Detailed description of the product 37 | constraints: 38 | max_length: 5000 39 | 40 | weight: 41 | type: number 42 | description: Weight of the product in kg, typically between 0.1kg for small items and up to 50kg for larger items 43 | 44 | dimensions: 45 | type: text 46 | constraints: 47 | max_length: 100 48 | description: Dimensions of the product in format LxWxH cm (e.g., '25x40x10 cm'), with measurements typically between 1-100cm per dimension 49 | 50 | in_stock: 51 | type: boolean 52 | description: Whether the product is currently in stock, with approximately 80% of products typically being in stock 53 | 54 | reorder_level: 55 | type: number 56 | description: Minimum stock level before reordering, typically set between 5-50 units depending on product demand 57 | 58 | supplier_id: 59 | type: foreign_key 60 | description: Reference to the supplier of this product 61 | references: 62 | schema: Supplier 63 | field: id 64 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_yaml_schemas/inventory_data/supplier.csv: -------------------------------------------------------------------------------- 1 | id,company_name,contact_name,email,phone,address,website,payment_terms,active 2 | 
1,GlobalTech Electronics,Michael Chen,michael.chen@globaltech.com,+1-408-555-7890,"1234 Innovation Drive, San Jose, CA 95110",www.globaltechelectronics.com,Net 15,True 3 | 2,Green Harvest Organics,Sarah Rodriguez,sarah.rodriguez@greenharvest.org,+1-650-555-2345,"567 Farm Road, Salinas, CA 93901",www.greenharvestorganics.com,Net 15,True 4 | 3,Precision Machinery Parts,Robert Williams,robert.williams@precisionparts.com,+1-312-555-9876,"890 Industrial Park Way, Chicago, IL 60601",www.precisionmachineryparts.com,Net 60,True 5 | 4,Textile Solutions Inc.,Emily Thompson,emily.thompson@textilesolutions.com,+1-212-555-4567,"456 Fashion Avenue, New York, NY 10016",www.textilesolutionsinc.com,Net 30,True 6 | 5,Ocean Seafood Distributors,David Kim,david.kim@oceanseafood.com,+1-206-555-3210,"789 Fisherman's Wharf, Seattle, WA 98101",www.oceanseafooddistributors.com,Net 30,True 7 | 6,Quantum Pharmaceuticals,Jennifer Martinez,jennifer.martinez@quantumpharma.com,+1-617-555-8901,"234 Medical Research Park, Boston, MA 02115",www.quantumpharmaceuticals.com,Net 60,True 8 | 7,Eco-Friendly Packaging Solutions,Alex Johnson,alex.johnson@ecopackaging.com,+1-415-555-6543,"678 Green Boulevard, San Francisco, CA 94105",www.ecofriendlypackaging.com,COD,True 9 | 8,Automotive Components Direct,Mark Anderson,mark.anderson@autocomponents.com,+1-248-555-1234,"345 Motor City Lane, Detroit, MI 48201",www.automotivecomponentsdirect.com,Net 60,True 10 | 9,Gourmet Ingredients Wholesaler,Lisa Chang,lisa.chang@gourmetingredients.com,+1-702-555-9012,"901 Culinary Street, Las Vegas, NV 89101",www.gourmetingredientswholesaler.com,COD,True 11 | 10,Advanced Robotics Systems,Daniel Smith,daniel.smith@advancedrobotics.com,+1-503-555-6789,"567 Tech Innovation Road, Portland, OR 97201",www.advancedroboticsystems.com,Prepaid,False 12 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/output/transaction.csv: 
-------------------------------------------------------------------------------- 1 | id,customer_id,transaction_date,transaction_time,product_id,quantity,unit_price,discount_amount,total_amount,payment_method,payment_status,store_id,employee_id 2 | 34567,7978,2023-06-15,14:23:45,7,2,49.99,5.0,94.98,Credit Card,Completed,456,7890 3 | 34568,4612,2023-06-15,10:15:30,5,1,129.99,10.5,119.49,Debit Card,Completed,234,5678 4 | 34569,5734,2023-06-15,16:45:12,17,3,24.5,0.0,73.5,Cash,Completed,789,2345 5 | 34570,1245,2023-06-15,11:32:54,11,1,299.99,30.0,269.99,Gift Card,Pending,345,6789 6 | 34571,1245,2023-06-15,13:50:23,13,4,12.99,2.5,49.96,Mobile Payment,Completed,567,3456 7 | 34572,10222,2023-06-15,09:05:41,4,2,79.99,15.0,144.98,Store Credit,Completed,678,4567 8 | 34573,4612,2023-06-15,17:22:36,24,1,199.5,20.0,179.5,Credit Card,Failed,890,5678 9 | 34574,1245,2023-06-15,15:40:17,1,3,34.75,5.25,99.0,Debit Card,Partially Refunded,123,6789 10 | 34575,7978,2023-06-15,12:18:09,20,1,149.99,0.0,149.99,Cash,Completed,456,7890 11 | 34576,7978,2023-06-15,08:55:30,21,5,9.99,7.5,42.45,Mobile Payment,Completed,234,2345 12 | 34577,7978,2023-06-15,16:03:22,9,2,89.5,17.9,161.1,Gift Card,Completed,567,3456 13 | 34578,7978,2023-06-15,10:47:55,2,1,399.99,50.0,349.99,Credit Card,Refunded,789,4567 14 | 34579,1245,2023-06-15,14:56:41,15,3,22.5,2.25,65.25,Debit Card,Completed,345,5678 15 | 34580,6856,2023-06-15,11:20:15,11,1,249.99,25.0,224.99,Store Credit,Pending,678,6789 16 | 34581,11344,2023-06-15,13:37:48,15,4,15.99,6.4,57.56,Mobile Payment,Completed,890,7890 17 | 34582,9100,2023-06-15,09:33:07,5,2,69.75,13.95,125.55,Cash,Completed,123,2345 18 | 34583,3489,2023-06-15,15:14:29,15,1,179.5,17.95,161.55,Credit Card,Completed,456,3456 19 | 34584,5734,2023-06-15,12:05:53,16,3,44.25,4.43,128.32,Debit Card,Partially Refunded,234,4567 20 | 34585,5734,2023-06-15,17:50:36,5,1,99.99,10.0,89.99,Gift Card,Completed,567,5678 21 | 34586,11344,2023-06-15,10:02:44,15,2,59.5,11.9,107.1,Store 
Credit,Completed,789,6789 22 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_dict_schemas/ecommerce/order.csv: -------------------------------------------------------------------------------- 1 | id,customer_id,order_date,status,total_amount,shipping_address 2 | 1,3,2023-11-15,Delivered,129.99,"123 Maple Street, Springfield, IL 62701" 3 | 2,1,2023-11-16,Delivered,249.5,"456 Oak Avenue, Chicago, IL 60601" 4 | 3,10,2023-11-17,Delivered,89.75,"789 Pine Road, Indianapolis, IN 46201" 5 | 4,4,2023-11-18,Delivered,199.25,"321 Birch Lane, Columbus, OH 43215" 6 | 5,7,2023-11-19,Delivered,59.99,"654 Cedar Street, Detroit, MI 48201" 7 | 6,1,2023-11-20,Shipped,349.99,"987 Elm Court, Milwaukee, WI 53201" 8 | 7,1,2023-11-21,Shipped,179.5,"234 Willow Drive, Cincinnati, OH 45201" 9 | 8,2,2023-11-22,Delivered,99.75,"567 Spruce Boulevard, Louisville, KY 40201" 10 | 9,5,2023-11-23,Delivered,279.99,"890 Redwood Street, Cleveland, OH 44101" 11 | 10,4,2023-11-24,Delivered,149.25,"432 Magnolia Avenue, Pittsburgh, PA 15201" 12 | 11,3,2023-11-25,Delivered,69.5,"765 Sycamore Road, Nashville, TN 37201" 13 | 12,10,2023-11-26,Delivered,299.75,"210 Hickory Lane, Kansas City, MO 64101" 14 | 13,1,2023-11-27,Pending,39.99,"543 Juniper Street, St. Louis, MO 63101" 15 | 14,1,2023-11-28,Delivered,199.5,"876 Aspen Court, Memphis, TN 38101" 16 | 15,4,2023-11-29,Delivered,79.25,"109 Dogwood Drive, Indianapolis, IN 46201" 17 | 16,8,2023-11-30,Delivered,259.99,"432 Chestnut Boulevard, Cincinnati, OH 45201" 18 | 17,8,2023-12-01,Processing,119.75,"765 Poplar Street, Columbus, OH 43215" 19 | 18,8,2023-12-02,Shipped,399.5,"210 Maple Avenue, Chicago, IL 60601" 20 | 19,6,2023-12-03,Delivered,49.99,"543 Pine Road, Milwaukee, WI 53201" 21 | 20,8,2023-12-04,Processing,169.25,"876 Oak Street, Detroit, MI 48201" 22 | 21,7,2023-12-05,Pending,89.5,"109 Cedar Lane, St. 
Louis, MO 63101" 23 | 22,1,2023-12-06,Delivered,279.75,"432 Elm Court, Kansas City, MO 64101" 24 | 23,7,2023-12-07,Delivered,139.99,"765 Willow Drive, Nashville, TN 37201" 25 | 24,2,2023-12-08,Pending,229.5,"210 Spruce Boulevard, Memphis, TN 38101" 26 | 25,8,2023-12-09,Cancelled,189.25,"543 Redwood Street, Pittsburgh, PA 15201" 27 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_gemini_models/flash-2-5/claim.csv: -------------------------------------------------------------------------------- 1 | claim_id,patient_id,diagnosis_code,email,visit_date,notes 2 | 101,109,J10.0,alice.smith@example.com,2023-01-15,"Patient presented with flu-like symptoms, fever, and body aches. Prescribed rest and fluids." 3 | 102,114,M54.5,bob.johnson@example.com,2023-02-20,"Chronic low back pain, exacerbated after heavy lifting. Advised physical therapy." 4 | 103,104,E11.9,carol.white@example.com,2023-03-10,Follow-up for uncontrolled type 2 diabetes. Adjusted medication dosage. 5 | 104,113,I10,david.brown@example.com,2023-04-05,"Routine check-up, blood pressure elevated. Discussed lifestyle modifications." 6 | 105,105,J30.9,eva.green@example.com,2023-05-12,"Seasonal allergies, sneezing, and watery eyes. Recommended antihistamines." 7 | 106,109,K21.9,frank.black@example.com,2023-06-01,"Symptoms of GERD, heartburn, and regurgitation. Prescribed PPI." 8 | 107,111,R51,grace.taylor@example.com,2023-07-08,"Persistent headaches, throbbing pain. Advised to keep a headache diary." 9 | 108,101,L23.9,henry.wilson@example.com,2023-08-19,"Contact dermatitis on hands, likely from new soap. Recommended avoiding irritants." 10 | 109,107,N39.0,ivy.moore@example.com,2023-09-25,"Urinary tract infection symptoms. Sent urine for culture, prescribed antibiotics." 11 | 110,101,Z00.00,jack.martin@example.com,2023-10-30,"Annual physical examination. All vitals normal, no new concerns." 
12 | 111,107,F32.9,karen.davis@example.com,2023-11-14,"Symptoms of depression, low mood, and fatigue. Discussed therapy options." 13 | 112,102,G47.00,liam.roberts@example.com,2023-12-01,"Insomnia, difficulty falling and staying asleep. Advised on sleep hygiene." 14 | 113,111,J02.9,mia.thomas@example.com,2024-01-07,Sore throat and mild fever. Diagnosed with acute pharyngitis. 15 | 114,112,S93.401A,noah.jackson@example.com,2024-02-18,Sprained ankle from sports injury. Advised RICE protocol. 16 | 115,103,H61.20,olivia.harris@example.com,2024-03-22,"Impacted earwax, causing hearing loss. Performed ear irrigation." 17 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_claude_models/haiku-3-5/patient.csv: -------------------------------------------------------------------------------- 1 | patient_id,diagnosis_code,email,visit_date,notes 2 | 1001,E11.9,john.smith@example.com,2023-06-15,Type 2 diabetes mellitus without complications. Recommended diet and exercise plan. 3 | 1002,I10,maria.garcia@example.net,2023-06-12,Essential (primary) hypertension. Prescribed blood pressure medication. 4 | 1003,J45.909,david.lee@example.org,2023-06-18,Unspecified asthma with acute exacerbation. Renewed inhaler prescription. 5 | 1004,M25.50,sarah.johnson@example.com,2023-06-10,Pain in unspecified knee. Recommended physical therapy and anti-inflammatory medication. 6 | 1005,F32.9,michael.brown@example.net,2023-06-14,"Major depressive disorder, unspecified. Referred to mental health counseling." 7 | 1006,K35.2,emily.wilson@example.org,2023-06-16,Acute appendicitis with generalized peritonitis. Immediate surgical intervention required. 8 | 1007,N39.0,robert.taylor@example.com,2023-06-11,"Urinary tract infection, site not specified. Prescribed antibiotics." 9 | 1008,L02.91,jennifer.martinez@example.net,2023-06-13,"Cutaneous abscess of unspecified site. Cleaned and dressed wound, started antibiotics." 
10 | 1009,H10.9,christopher.anderson@example.org,2023-06-17,Unspecified conjunctivitis. Prescribed eye drops and recommended follow-up. 11 | 1010,A09,amanda.thomas@example.com,2023-06-19,"Infectious gastroenteritis and colitis, unspecified. Advised hydration and rest." 12 | 1011,S52.50XA,daniel.jackson@example.net,2023-06-20,"Fracture of lower end of radius, unspecified arm, initial encounter. Referred for X-ray and orthopedic consultation." 13 | 1012,R10.2,olivia.white@example.org,2023-06-09,Pelvic and perineal pain. Ordered additional diagnostic tests. 14 | 1013,Z23,william.martin@example.com,2023-06-21,Encounter for immunization. Administered seasonal flu vaccine. 15 | 1014,B02.9,sophia.thompson@example.net,2023-06-22,Herpes zoster without complications. Prescribed antiviral medication. 16 | 1015,R53.83,ethan.davis@example.org,2023-06-23,Other fatigue. Conducted comprehensive health screening and blood tests. 17 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/schemas/customer.yml: -------------------------------------------------------------------------------- 1 | __table_name__: Customer 2 | __description__: Retail customers 3 | 4 | id: 5 | type: integer 6 | description: Unique customer ID 7 | constraints: 8 | primary_key: true 9 | not_null: true 10 | min: 1 11 | max: 50000 12 | 13 | first_name: 14 | type: string 15 | description: Customer's first name 16 | constraints: 17 | not_null: true 18 | length: 50 19 | 20 | last_name: 21 | type: string 22 | description: Customer's last name 23 | constraints: 24 | not_null: true 25 | length: 50 26 | 27 | email: 28 | type: email 29 | description: Customer's email address 30 | constraints: 31 | not_null: true 32 | unique: true 33 | length: 100 34 | 35 | phone: 36 | type: string 37 | description: Customer's phone number 38 | constraints: 39 | pattern: '^\+\d{1,3}-\d{3}-\d{3}-\d{4}$' 40 | length: 20 41 | 42 | address: 43 | type: address 44 | 
description: Customer's address 45 | constraints: 46 | length: 200 47 | 48 | city: 49 | type: string 50 | description: Customer's city 51 | constraints: 52 | length: 50 53 | 54 | state: 55 | type: string 56 | description: Customer's state/province 57 | constraints: 58 | length: 50 59 | 60 | postal_code: 61 | type: string 62 | description: Customer's postal/zip code 63 | constraints: 64 | pattern: '^\d{5}(-\d{4})?$' 65 | length: 10 66 | 67 | country: 68 | type: string 69 | description: Customer's country 70 | constraints: 71 | length: 50 72 | enum: ["United States", "Canada", "Mexico", "United Kingdom", "Australia"] 73 | 74 | date_registered: 75 | type: date 76 | description: Date when the customer registered 77 | constraints: 78 | format: YYYY-MM-DD 79 | 80 | loyalty_points: 81 | type: integer 82 | description: Customer's loyalty program points 83 | constraints: 84 | min: 0 85 | max: 100000 86 | 87 | last_purchase_date: 88 | type: date 89 | description: Date of customer's last purchase 90 | constraints: 91 | format: YYYY-MM-DD 92 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_gemini_models/flash-2-5/patient.csv: -------------------------------------------------------------------------------- 1 | patient_id,diagnosis_code,email,visit_date,notes 2 | 101,I10,john.doe@example.com,2023-01-15,Patient presented with elevated blood pressure. Advised lifestyle modifications and prescribed medication. 3 | 102,J45.909,jane.smith@example.com,2023-02-20,"Chronic asthma exacerbation. Inhaler technique reviewed, rescue medication refilled." 4 | 103,E11.9,robert.johnson@example.com,2023-03-10,Follow-up for uncontrolled type 2 diabetes. Diet and exercise adherence discussed. HbA1c ordered. 5 | 104,M54.5,emily.davis@example.com,2023-04-05,"Lower back pain, acute onset. X-ray negative. Prescribed muscle relaxants and advised rest." 
6 | 105,K21.9,michael.brown@example.com,2023-05-12,"GERD symptoms, persistent heartburn. PPI initiated. Lifestyle changes recommended." 7 | 106,F32.9,sarah.wilson@example.com,2023-06-01,Symptoms of depression. Counseling referral provided. Discussed medication options. 8 | 107,G43.909,david.moore@example.com,2023-07-08,Migraine with aura. Triptan prescribed. Advised on trigger avoidance. 9 | 108,N18.9,lisa.taylor@example.com,2023-08-14,"Chronic kidney disease, stage 3. Labs reviewed. Nephrology referral made." 10 | 109,R10.9,chris.anderson@example.com,2023-09-02,"Abdominal pain, generalized. Ultrasound ordered. Advised clear liquid diet." 11 | 110,H61.20,olivia.thomas@example.com,2023-10-11,"Cerumen impaction, left ear. Ear irrigation performed. Patient tolerated procedure well." 12 | 111,L20.9,daniel.jackson@example.com,2023-11-07,Atopic dermatitis flare-up. Topical steroids prescribed. Moisturizer use reinforced. 13 | 112,Z00.00,sophia.white@example.com,2023-12-01,Routine annual physical examination. No acute concerns. Labs drawn for screening. 14 | 113,I25.10,james.harris@example.com,2024-01-03,"Coronary artery disease, stable angina. Discussed medication adherence and exercise plan." 15 | 114,J02.9,mia.martin@example.com,2024-02-19,Acute pharyngitis. Rapid strep test negative. Symptomatic treatment advised. 16 | 115,M17.9,william.thompson@example.com,2024-03-25,"Osteoarthritis of knee, right. Pain management strategies discussed. PT referral provided." 
17 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_yaml_schemas/inventory_data/inventory.csv: -------------------------------------------------------------------------------- 1 | id,product_id,quantity,warehouse_location,last_checked,batch_number,expiry_date,purchase_order_id 2 | 1,9,150,A-3-12,2025-05-10,B00123-X1,2025-06-30,PO-000456 3 | 2,13,75,B-5-7,2025-05-12,B00234-X2,2025-03-15,PO-000789 4 | 3,23,200,C-2-15,2025-04-08,B00345-X3,2025-09-20,PO-001012 5 | 4,9,50,A-1-5,2025-05-05,B00456-X4,N/A,PO-001345 6 | 5,4,100,D-4-9,2025-06-19,B00567-X5,2025-05-12,PO-001678 7 | 6,5,180,B-6-3,2025-06-09,B00678-X6,2025-11-25,PO-002011 8 | 7,21,25,C-3-8,2025-06-02,B00789-X7,N/A,PO-002344 9 | 8,14,90,A-2-11,2025-06-04,B00890-X8,2025-04-18,PO-002677 10 | 9,1,60,D-5-6,2025-04-13,B00901-X9,2025-07-30,PO-003010 11 | 10,12,135,B-4-2,2025-04-08,B01012-X10,2025-10-05,PO-003343 12 | 11,9,80,C-1-7,2025-03-27,B01123-X11,N/A,PO-003676 13 | 12,18,170,A-5-13,2025-04-27,B01234-X12,2025-08-15,PO-004009 14 | 13,3,40,D-3-10,2025-05-06,B01345-X13,2025-03-22,PO-004342 15 | 14,7,110,B-2-6,2025-05-24,B01456-X14,N/A,PO-004675 16 | 15,7,95,C-5-4,2025-04-16,B01567-X15,N/A,PO-005008 17 | 16,16,65,A-4-9,2025-05-10,B01678-X16,2025-05-01,PO-005341 18 | 17,25,190,D-2-8,2025-04-19,B01789-X17,2025-11-12,PO-005674 19 | 18,7,30,B-1-5,2025-04-21,B01890-X18,N/A,PO-006007 20 | 19,6,145,C-4-11,2025-05-01,B01901-X19,N/A,PO-006340 21 | 20,18,85,A-6-2,2025-04-22,B02012-X20,2025-09-08,PO-006673 22 | 21,21,120,D-1-6,2025-06-06,B02123-X21,N/A,PO-007006 23 | 22,17,55,B-5-10,2025-04-23,B02234-X22,N/A,PO-007339 24 | 23,25,160,C-2-7,2025-05-01,B02345-X23,2025-10-30,PO-007672 25 | 24,23,70,A-3-13,2025-06-09,B02456-X24,2025-03-10,PO-008005 26 | 25,21,105,D-4-5,2025-05-20,B02567-X25,N/A,PO-008338 27 | 26,12,45,B-3-9,2025-05-09,B02678-X26,N/A,PO-008671 28 | 27,5,175,C-5-3,2025-05-25,B02789-X27,2025-08-22,PO-009004 29 | 
28,4,95,A-1-11,2025-05-03,B02890-X28,2025-05-15,PO-009337 30 | 29,9,130,D-6-7,2025-06-04,B02901-X29,N/A,PO-009670 31 | 30,7,60,B-4-12,2025-04-03,B03012-X30,N/A,PO-010003 32 | 31,17,140,C-1-5,2025-05-05,B03123-X31,2025-09-12,PO-010336 33 | 32,22,85,A-5-8,2025-04-20,B03234-X32,N/A,PO-010669 34 | 33,18,115,D-3-11,2025-04-22,B03345-X33,N/A,PO-011002 35 | 34,3,70,B-2-4,2025-05-29,B03456-X34,2025-07-15,PO-011335 36 | 35,18,165,C-4-6,2025-04-16,B03567-X35,2025-10-10,PO-011668 37 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_claude_models/haiku-3-5/claim.csv: -------------------------------------------------------------------------------- 1 | claim_id,patient_id,diagnosis_code,email,visit_date,notes 2 | 1,1015,E11.9,john.smith@example.com,2023-05-15,Type 2 diabetes mellitus without complications. Patient advised on diet and exercise management. 3 | 2,1014,I10,emily.jones@gmail.com,2023-04-22,Essential (primary) hypertension. Prescribed blood pressure medication and recommended lifestyle modifications. 4 | 3,1005,J45.90,michael.wong@outlook.com,2023-03-10,Mild intermittent asthma. Provided inhaler and asthma management plan. 5 | 4,1001,M17.9,sarah.miller@example.com,2023-02-18,Unspecified osteoarthritis of knee. Recommended physical therapy and pain management strategies. 6 | 5,1012,F32.9,david.garcia@gmail.com,2023-01-05,"Major depressive disorder, unspecified. Referred to mental health counseling and discussed potential medication options." 7 | 6,1004,K35.2,lisa.chen@outlook.com,2023-06-03,Acute appendicitis with generalized peritonitis. Immediate surgical consultation recommended. 8 | 7,1010,N39.0,robert.taylor@example.com,2023-05-01,"Urinary tract infection, site not specified. Prescribed antibiotics and hydration advice." 9 | 8,1007,L02.91,amanda.rodriguez@gmail.com,2023-04-12,"Cutaneous abscess of trunk. Performed incision and drainage, prescribed topical antibiotics." 
10 | 9,1015,H10.9,james.lee@outlook.com,2023-03-25,Unspecified conjunctivitis. Prescribed antibiotic eye drops and hygiene instructions. 11 | 10,1010,A09,emma.wilson@example.com,2023-02-07,"Infectious gastroenteritis. Advised on hydration, rest, and symptomatic treatment." 12 | 11,1008,S52.521A,alex.martinez@gmail.com,2023-01-20,"Torus fracture of lower end of right radius, initial encounter. Applied cast and scheduled follow-up." 13 | 12,1011,R05,olivia.kim@outlook.com,2023-06-15,Chronic cough. Conducted thorough respiratory examination and ordered chest X-ray. 14 | 13,1005,Z23,william.park@example.com,2023-05-10,Encounter for immunization. Administered seasonal flu vaccine. 15 | 14,1001,R53.83,sophia.nguyen@gmail.com,2023-04-05,Other fatigue. Performed comprehensive metabolic panel and discussed lifestyle factors. 16 | 15,1001,G43.909,ethan.brown@outlook.com,2023-03-15,"Migraine, unspecified, without mention of intractability. Prescribed preventive medication and trigger management strategies." 17 | -------------------------------------------------------------------------------- /tests/test_templates.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the templates module. 
3 | """ 4 | import pytest 5 | import os 6 | import re 7 | from unittest.mock import patch, mock_open, MagicMock 8 | 9 | 10 | # Skip all template processor tests due to import/circular dependency issues 11 | pytestmark = pytest.mark.skip(reason="TemplateProcessor tests temporarily disabled due to import issues") 12 | 13 | 14 | @pytest.mark.skip(reason="TemplateProcessor tests temporarily disabled due to import issues") 15 | class TestTemplateProcessor: 16 | """Tests for the TemplateProcessor class (all skipped).""" 17 | 18 | def test_initialization(self): 19 | """Test TemplateProcessor initialization.""" 20 | pass 21 | 22 | def test_extract_placeholders(self): 23 | """Test extracting placeholders from a template.""" 24 | pass 25 | 26 | def test_extract_placeholders_no_placeholders(self): 27 | """Test extracting placeholders from a template with no placeholders.""" 28 | pass 29 | 30 | def test_get_template_content(self): 31 | """Test getting content from a template file.""" 32 | pass 33 | 34 | def test_get_template_content_file_not_found(self): 35 | """Test error handling when template file is not found.""" 36 | pass 37 | 38 | def test_get_template_content_extraction_error(self): 39 | """Test error handling when template extraction fails.""" 40 | pass 41 | 42 | def test_replace_placeholders(self): 43 | """Test replacing placeholders in a template.""" 44 | pass 45 | 46 | def test_replace_placeholders_with_numbers(self): 47 | """Test replacing placeholders with number values.""" 48 | pass 49 | 50 | def test_create_schema_from_placeholders(self): 51 | """Test creating a schema from placeholders.""" 52 | pass 53 | 54 | def test_render_jinja2_template(self): 55 | """Test rendering a Jinja2 template.""" 56 | pass 57 | 58 | def test_process_template_with_data(self): 59 | """Test processing a template with data.""" 60 | pass 61 | 62 | def test_process_html_template(self): 63 | """Test processing an HTML template.""" 64 | pass 65 | 
-------------------------------------------------------------------------------- /examples/structured_only/output/example_sqlalchemy_models/crm_data/contacts.csv: -------------------------------------------------------------------------------- 1 | id,customer_id,first_name,last_name,email,phone,position,is_primary 2 | 1,6,John,Smith,john.smith@techcorp.com,555-123-4567,CEO, 3 | 2,4,Emily,Johnson,emily.johnson@marketingfirm.net,555-987-6543,Marketing Director, 4 | 3,10,Michael,Williams,michael.williams@financeinc.org,555-456-7890,Finance Manager, 5 | 4,8,Sarah,Brown,sarah.brown@retailgroup.com,555-234-5678,Operations Lead, 6 | 5,9,David,Miller,david.miller@techsolutions.io,555-345-6789,IT Director, 7 | 6,1,Jennifer,Davis,jennifer.davis@consultingfirm.com,555-678-9012,Senior Consultant, 8 | 7,5,Robert,Garcia,robert.garcia@manufacturingco.net,555-890-1234,Production Manager, 9 | 8,6,Lisa,Martinez,lisa.martinez@healthservices.org,555-567-8901,Healthcare Administrator, 10 | 9,5,Christopher,Rodriguez,christopher.rodriguez@educationgroup.com,555-456-7890,Education Coordinator, 11 | 10,1,Amanda,Lopez,amanda.lopez@energycompany.io,555-789-0123,Sustainability Manager, 12 | 11,1,Kevin,Wilson,kevin.wilson@foodindustry.net,555-234-5678,Supply Chain Director, 13 | 12,1,Rachel,Anderson,rachel.anderson@transportationco.org,555-345-6789,Logistics Coordinator, 14 | 13,6,Thomas,Taylor,thomas.taylor@agriculturefirm.com,555-678-9012,Agricultural Specialist, 15 | 14,2,Michelle,Thomas,michelle.thomas@constructiongroup.io,555-890-1234,Project Manager, 16 | 15,8,Daniel,Jackson,daniel.jackson@environmentalorg.net,555-567-8901,Environmental Consultant, 17 | 16,5,Elizabeth,White,elizabeth.white@pharmaceuticalco.com,555-456-7890,Research Director, 18 | 17,9,Mark,Harris,mark.harris@mediacompany.org,555-789-0123,Content Strategy Lead, 19 | 18,1,Jessica,Martin,jessica.martin@nonprofitorg.net,555-234-5678,Program Director, 20 | 19,1,Brian,Lee,brian.lee@softwarecompany.io,555-345-6789,Software 
Engineering Manager, 21 | 20,6,Nicole,Clark,nicole.clark@retailtech.com,555-678-9012,E-commerce Director, 22 | 21,7,Steven,Lewis,steven.lewis@sportsequipment.net,555-890-1234,Product Development Manager, 23 | 22,10,Lauren,Walker,lauren.walker@travelagency.com,555-567-8901,Customer Experience Manager, 24 | 23,3,Eric,Hall,eric.hall@entertainmentgroup.org,555-456-7890,Event Coordinator, 25 | 24,6,Karen,Allen,karen.allen@insurancecompany.net,555-789-0123,Claims Manager, 26 | 25,1,Ryan,Young,ryan.young@hospitalityinc.com,555-234-5678,Hospitality Operations Director, 27 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/schemas/product.yml: -------------------------------------------------------------------------------- 1 | __table_name__: Product 2 | __description__: Retail products 3 | __foreign_keys__: 4 | category_id: [Category, id] 5 | 6 | id: 7 | type: integer 8 | description: Unique product ID 9 | constraints: 10 | primary_key: true 11 | not_null: true 12 | min: 1 13 | max: 10000 14 | 15 | name: 16 | type: string 17 | description: Product name 18 | constraints: 19 | not_null: true 20 | length: 100 21 | unique: true 22 | 23 | category_id: 24 | type: integer 25 | description: Category ID for the product 26 | constraints: 27 | not_null: true 28 | min: 1 29 | max: 1000 30 | 31 | sku: 32 | type: string 33 | description: Stock Keeping Unit - unique product code 34 | constraints: 35 | not_null: true 36 | pattern: '^P[A-Z]{2}-\d{5}$' 37 | length: 10 38 | unique: true 39 | 40 | description: 41 | type: text 42 | description: Detailed product description 43 | constraints: 44 | length: 1000 45 | 46 | price: 47 | type: float 48 | description: Product price in USD 49 | constraints: 50 | not_null: true 51 | min: 0.99 52 | max: 9999.99 53 | decimals: 2 54 | 55 | cost: 56 | type: float 57 | description: Product cost in USD 58 | constraints: 59 | not_null: true 60 | min: 0.10 61 | max: 5000.00 62 | 
decimals: 2 63 | 64 | stock_quantity: 65 | type: integer 66 | description: Current stock level 67 | constraints: 68 | not_null: true 69 | min: 0 70 | max: 10000 71 | 72 | weight: 73 | type: float 74 | description: Product weight in kg 75 | constraints: 76 | min: 0.01 77 | max: 500.00 78 | decimals: 3 79 | 80 | dimensions: 81 | type: string 82 | description: Product dimensions in cm (LxWxH) 83 | constraints: 84 | pattern: '^\d+(\.\d+)?x\d+(\.\d+)?x\d+(\.\d+)?$' 85 | 86 | is_featured: 87 | type: boolean 88 | description: Whether the product is featured 89 | constraints: 90 | not_null: true 91 | 92 | date_added: 93 | type: date 94 | description: Date when the product was added to inventory 95 | constraints: 96 | format: YYYY-MM-DD 97 | 98 | last_updated: 99 | type: date 100 | description: Date when the product was last updated 101 | constraints: 102 | format: YYYY-MM-DD 103 | -------------------------------------------------------------------------------- /docs/schema_reference/special_field_types.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Special Field Types Reference | Syda Documentation 3 | description: Learn about Syda's special field types - table descriptions, foreign keys, template configurations, and metadata fields for advanced schema definition. 4 | keywords: 5 | - special field types 6 | - schema metadata 7 | - table descriptions 8 | - template fields 9 | - schema configuration 10 | --- 11 | 12 | # Special Field Types 13 | 14 | Special field types are prefixed with double underscores. These special sections are validated during schema validation: 15 | 16 | ### `__description__` 17 | 18 | It is used to identify the table description for the schema. 19 | 20 | ```yaml 21 | __description__: Customer information for e-commerce site 22 | ``` 23 | ### `__table_description__` 24 | 25 | It can also be used to identify the table description for the schema. 
26 | 27 | ```yaml 28 | __table_description__: Customer information for e-commerce site 29 | ``` 30 | 31 | ### `__foreign_keys__` 32 | 33 | Defines foreign key relationships: 34 | 35 | ```yaml 36 | __foreign_keys__: 37 | user_id: [User, id] 38 | product_id: [Product, id] 39 | ``` 40 | 41 | ### `__depends_on__` 42 | 43 | Specifies schema dependencies for generation order: 44 | 45 | ```yaml 46 | __depends_on__: [Product, Customer] 47 | ``` 48 | 49 | This ensures that Product and Customer data are generated before the current schema. 50 | 51 | 52 | ## Special Template-Related Fields 53 | 54 | For schemas that generate unstructured document outputs: 55 | 56 | ### `__template__` 57 | 58 | It can be set to `true` or a string value to enable template generation. 59 | 60 | ```yaml 61 | __template__: true 62 | ``` 63 | 64 | ### `__template_source__` 65 | 66 | It is used to specify the path to the template file. 67 | 68 | ```yaml 69 | __template_source__: /path/to/template.html 70 | ``` 71 | 72 | ### `__input_file_type__` 73 | 74 | It is used to specify the input file type. 75 | 76 | ```yaml 77 | __input_file_type__: html 78 | ``` 79 | 80 | ### `__output_file_type__` 81 | 82 | It is used to specify the output file type. 83 | 84 | ```yaml 85 | __output_file_type__: pdf 86 | ``` 87 | 88 | These fields enable document generation from templates with the synthetic data. 89 | 90 | > **Important**: When `__template__` is set to `true`, the `__template_source__` field is required. Schema validation will fail if this relationship is not maintained. 
-------------------------------------------------------------------------------- /examples/structured_only/output/example_json_schemas/blog_data/user.csv: -------------------------------------------------------------------------------- 1 | id,username,email,full_name,join_date,bio,is_admin 2 | 1,techwriter23,sarah.johnson@example.com,Sarah Johnson,2019-03-15,Professional technical writer passionate about clear communication and technology trends.,False 3 | 2,codingwizard,michael.chen@example.com,Michael Chen,2020-07-22,Software engineer and open-source enthusiast. Love solving complex coding challenges.,True 4 | 3,creativemind,emily.rodriguez@example.com,Emily Rodriguez,2018-11-05,Graphic designer and digital artist exploring the intersection of art and technology.,False 5 | 4,travelblogs,david.kim@example.com,David Kim,2021-02-14,"Globetrotter sharing adventures, travel tips, and cultural insights from around the world.",True 6 | 5,platformadmin,alex.wilson@example.com,Alex Wilson,2017-09-01,Lead administrator of the blog platform. Ensuring smooth operations and user experience.,False 7 | 6,foodblogger,olivia.martinez@example.com,Olivia Martinez,2020-05-30,Culinary explorer and recipe developer. 
Sharing delicious recipes and food photography.,False 8 | 7,techsupport,ryan.thompson@example.com,Ryan Thompson,2019-12-10,Technical support specialist helping users navigate and resolve platform issues.,False 9 | 8,healthwellness,jessica.lee@example.com,Jessica Lee,2021-08-17,Fitness trainer and wellness advocate sharing health tips and motivational content.,False 10 | 9,musicreviewer,daniel.garcia@example.com,Daniel Garcia,2018-06-25,Independent music critic and enthusiast reviewing indie and emerging artists.,True 11 | 10,contentmanager,sophia.nguyen@example.com,Sophia Nguyen,2017-04-12,Content strategy expert managing editorial guidelines and platform content.,False 12 | 11,environmentalist,ethan.brown@example.com,Ethan Brown,2020-11-08,Environmental researcher documenting climate change and sustainability efforts.,False 13 | 12,gamingzone,lucas.santos@example.com,Lucas Santos,2021-01-20,Professional gamer and esports enthusiast sharing gaming strategies and reviews.,False 14 | 13,financeblogger,isabella.taylor@example.com,Isabella Taylor,2019-07-03,Financial advisor providing investment tips and personal finance insights.,False 15 | 14,securityadmin,andrew.patel@example.com,Andrew Patel,2018-02-28,Cybersecurity expert managing platform security and user data protection.,False 16 | 15,sciencenerd,rachel.zhang@example.com,Rachel Zhang,2021-06-05,Science communicator breaking down complex scientific concepts for everyone.,False 17 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Syda - AI-Powered Synthetic Test Data Generation | Python Library 3 | description: Generate realistic synthetic test data with AI - structured, unstructured, PDF, and HTML data generation using OpenAI, Anthropic Claude, and Google Gemini while preserving referential integrity and maintaining privacy compliance. 
4 | keywords: 5 | - synthetic data generation 6 | - test data generation 7 | - AI data generation 8 | - large language models 9 | - OpenAI GPT 10 | - Anthropic Claude 11 | - Google Gemini 12 | - SQLAlchemy 13 | - referential integrity 14 | - privacy compliance 15 | - structured data 16 | - unstructured data 17 | - PDF generation 18 | - HTML generation 19 | - python library 20 | --- 21 | 22 | # What is Syda? 23 | 24 | Syda seamlessly generates realistic synthetic test data (structured, unstructured, PDF, and HTML) with AI and large language models, preserving referential integrity, maintaining privacy compliance, and accelerating development workflows. It works with OpenAI, Anthropic, Gemini, and Grok models. 25 | 26 | ## Key Features 27 | 28 | - **Multi-Provider AI Integration**: 29 | 30 | * Seamless integration with multiple AI providers 31 | * Support for OpenAI (GPT), Anthropic (Claude), Google (Gemini), and xAI (Grok). 32 | * Default model is Anthropic Claude model claude-3-5-haiku-20241022 33 | * Consistent interface across different providers 34 | * Provider-specific parameter optimization 35 | 36 | - **LLM-based Data Generation**: 37 | 38 | * AI-powered schema understanding and data creation 39 | * Contextually-aware synthetic records 40 | * Natural language prompt customization 41 | * Intelligent schema inference 42 | 43 | 44 | 45 | - **Multiple Schema Formats**: 46 | 47 | * YAML/JSON schema file support with full foreign key relationship handling 48 | * SQLAlchemy model integration with automatic metadata extraction 49 | * Python dictionary-based schema definitions 50 | 51 | - **Referential Integrity** 52 | 53 | * Automatic foreign key detection and resolution 54 | * Multi-model dependency analysis through topological sorting 55 | * Robust handling of related data with referential constraints 56 | 57 | - **Custom Generators** 58 | 59 | * Register column- or type-specific functions for domain-specific data 60 | * Contextual generators that adapt
to other fields (like ICD-10 codes based on demographics) 61 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/README.md: -------------------------------------------------------------------------------- 1 | # HR/Employee Data Generation Example 2 | 3 | This example demonstrates how to use SYDA to generate synthetic HR/employee data with interlinked tables and custom generators. 4 | 5 | ## Overview 6 | 7 | The HR example includes: 8 | 9 | 1. **Departments** - Company departments with budget and location information 10 | 2. **Positions** - Job positions with salary ranges and levels 11 | 3. **Employees** - Staff members linked to departments and positions 12 | 4. **Performance Reviews** - Regular employee evaluations with scores and feedback 13 | 14 | This example showcases: 15 | - Foreign key relationships across tables 16 | - Custom generators for realistic data 17 | - Generation dependencies to maintain data integrity 18 | - Realistic business rules (managers from same department, salary ranges based on position) 19 | 20 | ## Schema Structure 21 | 22 | The example defines four YAML schemas: 23 | 24 | - `department.yml` - Department information with budget and location 25 | - `position.yml` - Job positions with titles, levels, and salary ranges 26 | - `employee.yml` - Employee records with links to departments and positions 27 | - `performance_review.yml` - Employee reviews with scores and feedback 28 | 29 | ## Custom Generators 30 | 31 | The example implements several custom generators: 32 | 33 | 1. **Email Generator** - Creates realistic emails based on employee names 34 | 2. **Manager ID Generator** - Ensures managers belong to departments they manage 35 | 3. **Salary Generator** - Sets salary within range defined by position 36 | 4. **Review Date Generator** - Generates review dates after employee hire dates 37 | 5. 
**Review Period Generator** - Creates sensible review periods 38 | 39 | ## Running the Example 40 | 41 | To run this example: 42 | 43 | ```bash 44 | cd /path/to/syda-fresh 45 | python examples/structured_only/hr_employee_example/test_hr_schemas.py 46 | ``` 47 | 48 | The script will: 49 | - Generate 8 departments, 15 positions, 50 employees, and 75 performance reviews 50 | - Apply custom generators to create realistic values 51 | - Maintain referential integrity across tables 52 | - Save all data to CSV files in the `output` directory 53 | 54 | ## Output 55 | 56 | The generated data is saved to the `output` directory as CSV files: 57 | - `department.csv` 58 | - `position.csv` 59 | - `employee.csv` 60 | - `performance_review.csv` 61 | 62 | You can import these files into a database or analyze them to see the relationships between tables. 63 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/schemas/transaction.yml: -------------------------------------------------------------------------------- 1 | __table_name__: Transaction 2 | __description__: Retail transactions 3 | __foreign_keys__: 4 | customer_id: [Customer, id] 5 | product_id: [Product, id] 6 | 7 | id: 8 | type: integer 9 | description: Unique transaction ID 10 | constraints: 11 | primary_key: true 12 | not_null: true 13 | min: 1 14 | max: 500000 15 | 16 | customer_id: 17 | type: integer 18 | description: ID of the customer making the purchase 19 | constraints: 20 | not_null: true 21 | min: 1 22 | max: 50000 23 | 24 | transaction_date: 25 | type: date 26 | description: Date when the transaction occurred 27 | constraints: 28 | not_null: true 29 | format: YYYY-MM-DD 30 | 31 | transaction_time: 32 | type: string 33 | description: Time when the transaction occurred 34 | constraints: 35 | not_null: true 36 | pattern: '^([01]\d|2[0-3]):([0-5]\d):([0-5]\d)$' 37 | 38 | product_id: 39 | type: integer 40 | description: ID of the product 
purchased 41 | constraints: 42 | not_null: true 43 | min: 1 44 | max: 10000 45 | 46 | quantity: 47 | type: integer 48 | description: Quantity of the product purchased 49 | constraints: 50 | not_null: true 51 | min: 1 52 | max: 100 53 | 54 | unit_price: 55 | type: float 56 | description: Price per unit in USD 57 | constraints: 58 | not_null: true 59 | min: 0.99 60 | max: 9999.99 61 | decimals: 2 62 | 63 | discount_amount: 64 | type: float 65 | description: Discount amount applied in USD 66 | constraints: 67 | min: 0 68 | max: 5000 69 | decimals: 2 70 | 71 | total_amount: 72 | type: float 73 | description: Total amount for this line item in USD 74 | constraints: 75 | not_null: true 76 | min: 0.99 77 | max: 999999.99 78 | decimals: 2 79 | 80 | payment_method: 81 | type: string 82 | description: Method of payment 83 | constraints: 84 | not_null: true 85 | enum: ["Credit Card", "Debit Card", "Cash", "Gift Card", "Store Credit", "Mobile Payment"] 86 | 87 | payment_status: 88 | type: string 89 | description: Status of the payment 90 | constraints: 91 | not_null: true 92 | enum: ["Completed", "Pending", "Failed", "Refunded", "Partially Refunded"] 93 | 94 | store_id: 95 | type: integer 96 | description: ID of the store where the transaction occurred 97 | constraints: 98 | not_null: true 99 | min: 1 100 | max: 1000 101 | 102 | employee_id: 103 | type: integer 104 | description: ID of the employee who processed the transaction 105 | constraints: 106 | not_null: true 107 | min: 1 108 | max: 10000 109 | -------------------------------------------------------------------------------- /examples/quickstart.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Syda 30-Second Quick Start Example 4 | Demonstrates AI-powered synthetic data generation with perfect referential integrity 5 | """ 6 | 7 | from syda import SyntheticDataGenerator, ModelConfig 8 | from dotenv import load_dotenv 9 | 10 | # Load environment 
variables from .env file 11 | load_dotenv() 12 | 13 | print("🚀 Starting Syda 30-Second Quick Start...") 14 | 15 | # Configure AI model 16 | generator = SyntheticDataGenerator( 17 | model_config=ModelConfig( 18 | provider="anthropic", 19 | model_name="claude-3-5-haiku-20241022" 20 | ) 21 | ) 22 | 23 | # Define schemas with rich descriptions for better AI understanding 24 | schemas = { 25 | # Categories schema with table and column descriptions 26 | 'categories': { 27 | '__table_description__': 'Product categories for organizing items in the e-commerce catalog', 28 | 'id': { 29 | 'type': 'number', 30 | 'description': 'Unique identifier for the category', 31 | 'primary_key': True 32 | }, 33 | 'name': { 34 | 'type': 'text', 35 | 'description': 'Category name (Electronics, Home Decor, Sports, etc.)' 36 | }, 37 | 'description': { 38 | 'type': 'text', 39 | 'description': 'Detailed description of what products belong in this category' 40 | } 41 | }, 42 | 43 | # Products schema with table and column descriptions and foreign keys 44 | 'products': { 45 | '__table_description__': 'Individual products available for purchase with pricing and category assignment', 46 | '__foreign_keys__': { 47 | 'category_id': ['categories', 'id'] # products.category_id references categories.id 48 | }, 49 | 'id': { 50 | 'type': 'number', 51 | 'description': 'Unique product identifier', 52 | 'primary_key': True 53 | }, 54 | 'name': { 55 | 'type': 'text', 56 | 'description': 'Product name and title' 57 | }, 58 | 'category_id': { 59 | 'type': 'foreign_key', 60 | 'description': 'Reference to the category this product belongs to' 61 | }, 62 | 'price': { 63 | 'type': 'number', 64 | 'description': 'Product price in USD' 65 | } 66 | } 67 | } 68 | 69 | # Generate data with perfect referential integrity 70 | print("📊 Generating categories and products...") 71 | results = generator.generate_for_schemas( 72 | schemas=schemas, 73 | sample_sizes={"categories": 5, "products": 20}, 74 | 
output_dir="examples/quickstart_output_data" 75 | ) 76 | 77 | print("✅ Generated realistic data with perfect foreign key relationships!") 78 | print("📂 Check the 'data' folder for categories.csv and products.csv") 79 | -------------------------------------------------------------------------------- /examples/structured_only/output/example_sqlalchemy_models/crm_data/products.csv: -------------------------------------------------------------------------------- 1 | id,name,category,price,description 2 | 1,Enterprise Cloud Platform,API Services,4655.24,"Comprehensive cloud solution for large-scale enterprise operations, offering scalable infrastructure, advanced security, and seamless integration capabilities." 3 | 2,DevOps Acceleration Toolkit,Security Services,1813.53,"Complete DevOps suite with continuous integration, deployment tools, and workflow optimization solutions for modern software development teams." 4 | 3,Cybersecurity Shield Pro,Custom Development,3550.68,"Advanced cybersecurity package with real-time threat detection, AI-powered monitoring, and comprehensive protection for cloud and on-premise environments." 5 | 4,Cloud Migration Consulting,Business Intelligence,1101.5,"End-to-end cloud migration consulting service, including strategy development, implementation planning, and post-migration optimization." 6 | 5,Managed Cloud Support Platinum,Cloud Infrastructure,3237.18,"24/7 premium support package with dedicated technical experts, rapid response times, and proactive system monitoring." 7 | 6,Data Analytics Cloud Suite,Support Package,4215.07,"Comprehensive data analytics platform with machine learning capabilities, real-time reporting, and advanced visualization tools." 8 | 7,Hybrid Cloud Connector,Security Services,3140.18,"Seamless integration solution for connecting on-premise infrastructure with multi-cloud environments, ensuring smooth data transfer and management." 
9 | 8,AI-Powered Cloud Optimization,Custom Development,3562.81,"Strategic consulting service using AI to analyze and optimize cloud infrastructure, reducing costs and improving performance." 10 | 9,Cloud Compliance Framework,Support Package,393.82,Comprehensive compliance management tool ensuring adherence to industry regulations and standards across cloud environments. 11 | 10,Startup Cloud Accelerator,API Services,956.06,"Tailored cloud solution for startups, offering scalable infrastructure, development tools, and cost-effective pricing." 12 | 11,Enterprise API Management,Custom Development,2309.66,"Comprehensive API management platform with design, security, and analytics tools for complex enterprise integration." 13 | 12,Cloud Cost Optimization Service,Support Package,3232.99,Detailed analysis and implementation of cost-saving strategies for cloud infrastructure and resource allocation. 14 | 13,Disaster Recovery Cloud Solution,Data Analytics,1826.17,"Robust disaster recovery and business continuity platform with automated backup, rapid restoration, and multi-region redundancy." 15 | 14,Cloud Training and Certification,Security Services,2831.08,"Comprehensive training program covering cloud technologies, best practices, and professional certification preparation." 16 | 15,IoT Cloud Integration Platform,Data Analytics,2013.6,"Advanced IoT cloud platform enabling seamless device management, data processing, and scalable IoT ecosystem development." 17 | -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/README.md: -------------------------------------------------------------------------------- 1 | # Healthcare Unstructured Data Example 2 | 3 | This example demonstrates how to use SYDA for generating unstructured healthcare data using YAML schemas with PDF output. 4 | 5 | ## Overview 6 | 7 | This example shows how to: 8 | 9 | 1. 
Define healthcare data schemas with template metadata using YAML format 10 | 2. Reference HTML templates for document formatting 11 | 3. Generate synthetic healthcare documents in PDF format using the `generate_for_schemas` method 12 | 4. Process schema definitions with proper template integration 13 | 14 | ## Directory Structure 15 | 16 | - `schemas/`: Contains YAML schema definitions with template metadata 17 | - `medical_report.yml`: Schema for medical reports with template settings 18 | - `lab_result.yml`: Schema for lab results with template settings 19 | - `templates/`: HTML templates for medical reports and lab results 20 | - `medical_report_template.html`: Template for medical visit reports 21 | - `lab_result_template.html`: Template for laboratory test results 22 | - `output/`: Generated healthcare data files (PDF format and JSON data) 23 | - `generate_healthcare_data.py`: Main script to generate healthcare data using schemas 24 | 25 | ## Running the Example 26 | 27 | Execute the example script: 28 | 29 | ```bash 30 | python3 examples/unstructured_only/healthcare_yml/generate_healthcare_data.py 31 | ``` 32 | 33 | This will: 34 | - Initialize the `SyntheticDataGenerator` with model configuration 35 | - Load the YAML schema files with template references 36 | - Generate 5 synthetic medical reports and 5 lab results as PDF files 37 | - Save the synthetic data in both PDF format and structured JSON data 38 | 39 | ## Schema Definitions 40 | 41 | The example uses YAML files that define both the schema and template metadata: 42 | 43 | 1. **Medical Report** (`medical_report.yml`): Contains: 44 | - Template metadata (`__template__`, `__name__`, etc.) 45 | - Input/output format configuration (HTML to PDF) 46 | - Data fields for patient information, vital signs, diagnosis, etc. 47 | 48 | 2. 
**Lab Result** (`lab_result.yml`): Contains: 49 | - Template metadata and format configuration 50 | - Fields for laboratory test information, results, and reference ranges 51 | 52 | ## Templates 53 | 54 | HTML templates in the `templates/` directory define how medical reports and lab results are formatted. The schema files reference these templates and specify conversion to PDF format using the following special attributes: 55 | 56 | ```yaml 57 | __template__: true 58 | __template_source__: path/to/template.html 59 | __input_file_type__: html 60 | __output_file_type__: pdf 61 | ``` 62 | 63 | ## Using the Results 64 | 65 | After generation, the example produces: 66 | 67 | - PDF documents that can be viewed in any PDF reader 68 | - Structured JSON data representing the synthetic healthcare records 69 | 70 | These can be used for: 71 | - Training machine learning models 72 | - Testing document processing pipelines 73 | - Demonstrating healthcare data workflows 74 | - Prototyping healthcare applications 75 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_grok_models/grok-3/companies.csv: -------------------------------------------------------------------------------- 1 | id,name,industry,founded_year,employee_count,revenue_millions,is_public,headquarters 2 | 1,TechCorp,AI,2015,250,45.7,,"San Francisco, USA" 3 | 2,InnovateLabs,FinTech,2018,120,18.3,,"London, UK" 4 | 3,FutureSystems,HealthTech,2010,500,92.1,,"Boston, USA" 5 | 4,QuantumEdge,Quantum Computing,2020,80,5.6,,"Munich, Germany" 6 | 5,BioNexus,BioTech,2005,350,67.4,,"San Diego, USA" 7 | 6,EduVision,EdTech,2017,200,12.9,,"Toronto, Canada" 8 | 7,GreenTech Solutions,CleanTech,2012,180,25.5,,"Stockholm, Sweden" 9 | 8,DataSphere,Big Data,2014,300,38.2,,"Austin, USA" 10 | 9,CyberGuard,Cybersecurity,2008,450,78.9,,"Tel Aviv, Israel" 11 | 10,RoboTech,Robotics,2019,90,7.3,,"Tokyo, Japan" 12 | 11,CloudWave,Cloud Computing,2011,600,120.5,,"Seattle, USA"
13 | 12,NanoInnovate,Nanotechnology,2021,50,2.1,,"Zurich, Switzerland" 14 | 13,SmartGrid,EnergyTech,2009,400,55.8,,"Berlin, Germany" 15 | 14,VirtuaLabs,AR/VR,2016,150,19.4,,"Los Angeles, USA" 16 | 15,GenomicAI,HealthTech,2013,220,33.7,,"Cambridge, UK" 17 | 16,FinFuture,FinTech,2020,70,4.8,,Singapore 18 | 17,SpaceForge,SpaceTech,2018,130,10.2,,"Cape Canaveral, USA" 19 | 18,NeuralNet,AI,2015,280,41.6,,"Montreal, Canada" 20 | 19,EcoSynth,CleanTech,2007,320,60.3,,"Copenhagen, Denmark" 21 | 20,LearnSphere,EdTech,2019,110,8.5,,"Sydney, Australia" 22 | 21,SecureFlow,Cybersecurity,2010,390,72.1,,"Washington D.C., USA" 23 | 22,QuantumFlow,Quantum Computing,2022,40,1.5,,"Paris, France" 24 | 23,BioSynth,BioTech,2003,500,88.9,,"Basel, Switzerland" 25 | 24,DataPulse,Big Data,2017,190,22.4,,"Chicago, USA" 26 | 25,RoboSphere,Robotics,2014,260,35.7,,"Seoul, South Korea" 27 | 26,CloudNest,Cloud Computing,2009,700,150.2,,"Dublin, Ireland" 28 | 27,NanoSphere,Nanotechnology,2018,85,6.3,,Singapore 29 | 28,EnergyWave,EnergyTech,2011,340,48.6,,"Oslo, Norway" 30 | 29,VirtuaTech,AR/VR,2020,60,3.9,,"Stockholm, Sweden" 31 | 30,GenTech,HealthTech,2006,420,77.5,,"Houston, USA" 32 | 31,FinSphere,FinTech,2016,140,16.8,,Hong Kong 33 | 32,SpaceWave,SpaceTech,2021,75,5.2,,"Houston, USA" 34 | 33,AI Nexus,AI,2013,310,52.4,,"Palo Alto, USA" 35 | 34,EcoWave,CleanTech,2010,240,39.1,,"Amsterdam, Netherlands" 36 | 35,EduFuture,EdTech,2018,130,10.7,,"Melbourne, Australia" 37 | 36,CyberWave,Cybersecurity,2005,550,95.3,,"San Jose, USA" 38 | 37,QuantumNest,Quantum Computing,2019,95,8.1,,"Cambridge, USA" 39 | 38,BioWave,BioTech,2002,480,83.6,,"Oxford, UK" 40 | 39,DataNest,Big Data,2015,210,27.8,,"New York, USA" 41 | 40,RoboWave,Robotics,2012,290,42.3,,"Shanghai, China" 42 | 41,CloudFuture,Cloud Computing,2008,800,180.9,,"San Francisco, USA" 43 | 42,NanoFuture,Nanotechnology,2020,65,4.2,,"Tokyo, Japan" 44 | 43,EnergyFuture,EnergyTech,2007,370,61.5,,"Madrid, Spain" 45 | 
44,VirtuaFuture,AR/VR,2017,170,20.6,,"Berlin, Germany" 46 | 45,GenFuture,HealthTech,2011,330,54.9,,Singapore 47 | 46,FinNest,FinTech,2019,100,9.4,,"Zurich, Switzerland" 48 | 47,SpaceNest,SpaceTech,2022,55,3.3,,"Los Angeles, USA" 49 | 48,AI Future,AI,2014,270,46.8,,"Toronto, Canada" 50 | 49,EcoNest,CleanTech,2009,310,50.2,,"Helsinki, Finland" 51 | 50,EduNest,EdTech,2020,90,6.7,,"Bangalore, India" 52 | -------------------------------------------------------------------------------- /examples/structured_and_unstructured/retail_yml/schemas/receipt.yml: -------------------------------------------------------------------------------- 1 | __template__: true 2 | __description__: Retail receipt template 3 | __name__: Receipt 4 | __depends_on__: [Product, Transaction, Customer] 5 | __foreign_keys__: 6 | customer_name: [Customer, first_name] 7 | customer_id: [Customer, id] 8 | 9 | __template_source__: /home/ramkumar2606/syda-fresh/examples/structured_and_unstructured/retail_yml/templates/receipt.html 10 | __input_file_type__: html 11 | __output_file_type__: pdf 12 | 13 | # Receipt header 14 | store_name: 15 | type: string 16 | length: 50 17 | description: Name of the retail store 18 | 19 | store_address: 20 | type: address 21 | length: 150 22 | description: Full address of the store 23 | 24 | store_phone: 25 | type: string 26 | pattern: '^\+\d{1,3}-\d{3}-\d{3}-\d{4}$' 27 | length: 20 28 | description: Store phone number 29 | 30 | store_website: 31 | type: url 32 | length: 100 33 | description: Store website URL 34 | 35 | # Receipt details 36 | receipt_number: 37 | type: string 38 | pattern: '^RCP-\d{8}$' 39 | length: 12 40 | description: Unique receipt identifier 41 | 42 | transaction_date: 43 | type: date 44 | format: YYYY-MM-DD 45 | description: Date of the transaction 46 | 47 | transaction_time: 48 | type: string 49 | pattern: '^([01]\d|2[0-3]):([0-5]\d)$' 50 | length: 5 51 | description: Time of the transaction in 24-hour format 52 | 53 | # Customer information 54 | 
"""Generate synthetic Patient and Claim records with two OpenAI models."""
from syda.generate import SyntheticDataGenerator
from syda.schemas import ModelConfig
import os
from dotenv import load_dotenv

# Read settings from a local .env file into the environment
load_dotenv()


def _visit_fields():
    """Return a fresh copy of the four fields that Patient and Claim have in common."""
    return {
        'diagnosis_code': {'type': 'text', 'description': 'ICD-10 diagnosis code'},
        'email': {'type': 'email', 'description': 'Patient email address used for communication'},
        'visit_date': {'type': 'date', 'description': 'Date when the patient visited the clinic'},
        'notes': {'type': 'text', 'description': 'Clinical notes for the patient visit'},
    }


# Two related tables: every Claim row points back at a Patient row
schemas = {
    'Patient': {
        'patient_id': {'type': 'number', 'description': 'Unique identifier for the patient'},
        **_visit_fields(),
    },
    'Claim': {
        'claim_id': {'type': 'number', 'description': 'Unique identifier for the claim'},
        'patient_id': {
            'type': 'foreign_key',
            'description': 'Reference to the patient who made the claim',
            'references': {'schema': 'Patient', 'field': 'patient_id'},
        },
        **_visit_fields(),
    },
}

prompts = {
    'Patient': 'Generate realistic synthetic patient records with ICD-10 diagnosis codes, emails, visit dates, and clinical notes.',
    'Claim': 'Generate realistic synthetic claim records with ICD-10 diagnosis codes, emails, visit dates, and clinical notes.',
}

# One entry per model under test:
# (banner, output sub-folder, rows per table, ModelConfig keyword arguments)
_RUNS = (
    (
        "--------------Testing openai gpt-4o----------------",
        "gpt-4o",
        {'Patient': 15, 'Claim': 15},
        {
            'provider': "openai",
            'model_name': "gpt-4o-2024-08-06",
            'temperature': 0.7,
            'max_completion_tokens': 16000,  # Larger value for more complete responses
        },
    ),
    (
        "--------------Testing openai o3----------------",
        "o3",
        {'Patient': 100, 'Claim': 200},
        {
            # No temperature is passed for this model
            'provider': "openai",
            'model_name': "o3-2025-04-16",
            'max_completion_tokens': 100000,  # Larger value for more complete responses
        },
    ),
)

for _banner, _folder, sample_sizes, _config_kwargs in _RUNS:
    print(_banner)
    model_config = ModelConfig(**_config_kwargs)
    generator = SyntheticDataGenerator(model_config=model_config)

    # Results land next to this script: output/test_openai_models/<model>/
    output_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "output",
        "test_openai_models",
        _folder,
    )

    # Generate and save to CSV
    results = generator.generate_for_schemas(
        schemas=schemas,
        prompts=prompts,
        sample_sizes=sample_sizes,
        output_dir=output_dir,
    )
    print(f"Data saved to {output_dir}")
#!/usr/bin/env python3
"""
Schema Validation Testing Script

This script demonstrates schema validation with both valid and invalid schemas.
It showcases how validation errors are handled and reported to users.
"""

import logging
import os
import sys

# This file lives in <repo>/examples/invalid_schemas/, so the repository root
# (the directory that holds the ``syda`` package) is three levels up. Climbing
# only two levels lands on <repo>/examples, which does not contain ``syda``.
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)

from syda.generate import SyntheticDataGenerator

# Set up logging to see validation messages
logging.basicConfig(level=logging.INFO)


def test_schema_validation():
    """Run every example schema through the generator and report the outcome.

    Schemas listed as valid are expected to generate without raising; schemas
    listed as invalid are expected to raise. Results are printed, not returned,
    and no exception propagates to the caller.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    generator = SyntheticDataGenerator()
    schemas_dir = os.path.join(script_dir, "schemas")

    def generate_one_row(schema_file):
        """Generate a single row for *schema_file* and return its schema name."""
        schema_name = os.path.splitext(os.path.basename(schema_file))[0]
        # default_sample_size=1 keeps processing minimal; only validation matters here
        generator.generate_for_schemas(
            schemas={schema_name: os.path.join(schemas_dir, schema_file)},
            output_dir=output_dir,
            default_sample_size=1,
        )
        return schema_name

    # Test valid schemas
    valid_schemas = [
        "valid_customer.yml",
        "valid_product.yml",
    ]

    print("\n===== Testing Valid Schemas =====")
    for schema_file in valid_schemas:
        try:
            print(f"\nValidating schema: {schema_file}")
            schema_name = generate_one_row(schema_file)
            print(f"✅ Schema validation successful: {schema_file}")
            table_name = schema_name.replace("valid_", "")
            print(f" Schema name: {table_name}")
        except Exception as e:
            print(f"❌ Unexpected validation error: {e}")

    # Test invalid schemas
    invalid_schemas = [
        "invalid_missing_name.yml",
        "invalid_template_type.yml",
        "invalid_field_type.yml",
        "invalid_foreign_key.yml",
    ]

    print("\n===== Testing Invalid Schemas =====")
    for schema_file in invalid_schemas:
        try:
            print(f"\nValidating schema: {schema_file}")
            generate_one_row(schema_file)
            print(f"❓ Unexpected success: {schema_file} - validation should have failed!")
        except Exception as e:
            # This is expected for invalid schemas.
            # NOTE(review): every exception type is caught here, so a failure
            # unrelated to the schema is also reported as a validation failure.
            # Narrow this once the validation exception type is confirmed.
            print(f"✓ Validation correctly failed: {schema_file}")
            print(f" Error: {str(e)}")


if __name__ == "__main__":
    print("Schema Validation Test")
    print("=====================")
    test_schema_validation()
    print("\nTest complete!")
Cleans hardwood and carpet surfaces, with automatic charging and scheduling features.", 5 | 4,Bestselling Fantasy Novel Collection,Books,45.99,"Boxed set of the top 5 fantasy novels of the year, featuring intricate world-building, compelling characters, and epic storytelling. Perfect for avid readers and fantasy enthusiasts.", 6 | 5,Ergonomic Office Chair,Toys,249.99,"Professionally designed ergonomic office chair with lumbar support, adjustable armrests, and breathable mesh back. Provides ultimate comfort for long working hours.", 7 | 6,4K Ultra HD Smart Television,Beauty,599.0,"55-inch 4K Smart TV with HDR, built-in streaming apps, voice control, and stunning picture quality. Immersive entertainment experience with multiple HDMI and USB ports.", 8 | 7,Waterproof Running Shoes,Sports,89.99,"High-performance running shoes with waterproof exterior, responsive cushioning, and lightweight design. Ideal for outdoor athletes and fitness enthusiasts in various weather conditions.", 9 | 8,Stainless Steel Kitchen Knife Set,Sports,129.5,"Professional-grade 6-piece knife set with ergonomic handles, precision-forged blades, and wooden storage block. Includes chef's knife, bread knife, and specialized cutting tools.", 10 | 9,Science Fiction Graphic Novel Bundle,Electronics,59.99,Collector's edition graphic novel set featuring award-winning sci-fi stories with stunning artwork. Includes 3 complete series exploring futuristic worlds and complex narratives., 11 | 10,Smart Home Security Camera System,Beauty,199.0,"Wireless security camera system with night vision, motion detection, cloud storage, and smartphone app integration. Provides comprehensive home monitoring and protection.", 12 | 11,Luxury Leather Messenger Bag,Sports,179.99,"Premium genuine leather messenger bag with multiple compartments, padded laptop sleeve, and adjustable shoulder strap. 
Combines professional style with practical functionality.", 13 | 12,Programmable Coffee Maker,Electronics,89.99,"12-cup programmable coffee maker with built-in grinder, thermal carafe, and customizable brewing options. Ensures perfect temperature and flavor for coffee enthusiasts.", 14 | 13,Historical Biography Collection,Clothing,39.5,Curated set of biographical books exploring influential historical figures from various eras and disciplines. Comprehensive and engaging narratives for history lovers., 15 | 14,Memory Foam Mattress,Home & Kitchen,499.0,"Premium memory foam mattress with cooling gel technology, pressure-relieving support, and hypoallergenic cover. Provides exceptional comfort and restful sleep.", 16 | 15,Portable Bluetooth Speaker,Toys,79.99,"Waterproof wireless Bluetooth speaker with 360-degree sound, 12-hour battery life, and rugged design. Perfect for outdoor adventures and home entertainment.", 17 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_grok_models/grok-4/companies.csv: -------------------------------------------------------------------------------- 1 | id,name,industry,founded_year,employee_count,revenue_millions,is_public,headquarters 2 | 1,NeuroLink AI,AI,2018,150,25.5,,"San Francisco, USA" 3 | 2,Finova Tech,FinTech,2005,5000,800.0,,"New York, USA" 4 | 3,HealthNova Labs,HealthTech,2012,300,45.2,,"Boston, USA" 5 | 4,EduForge Innovations,EdTech,1995,1200,150.0,,"London, UK" 6 | 5,CleanEnergy Dynamics,CleanTech,2020,80,10.1,,"Berlin, Germany" 7 | 6,BioGenix Systems,BioTech,2000,2500,600.0,,"Cambridge, USA" 8 | 7,QuantumEdge Computing,Quantum Computing,2015,200,30.0,,"Toronto, Canada" 9 | 8,SecureNet Cyber,Cybersecurity,1998,4000,750.0,,"Tel Aviv, Israel" 10 | 9,RoboMech Industries,Robotics,2010,600,120.0,,"Tokyo, Japan" 11 | 10,DataSphere Analytics,Big Data,2008,1800,400.0,,"Seattle, USA" 12 | 11,VirtuaReal VR,Virtual Reality,2016,120,18.7,,"Los Angeles, USA" 13 | 
12,AgriTech Solutions,AgriTech,1992,900,200.0,,"Amsterdam, Netherlands" 14 | 13,NanoMed Therapeutics,NanoTech,2019,50,5.3,,"Singapore, Singapore" 15 | 14,SpaceForge Aerospace,SpaceTech,2002,3500,900.0,,"Houston, USA" 16 | 15,EcoDrive Mobility,AutoTech,2014,400,60.0,,"Munich, Germany" 17 | 16,GenomeX Biotech,Genomics,1997,1500,350.0,,"San Diego, USA" 18 | 17,CloudNova Services,Cloud Computing,2011,800,140.0,,"Dublin, Ireland" 19 | 18,SmartCity Innovations,IoT,2003,2200,500.0,,"Shanghai, China" 20 | 19,BlockSecure Chain,Blockchain,2017,90,12.4,,"Zurich, Switzerland" 21 | 20,TeleHealth Connect,Telemedicine,1994,1100,250.0,,"Paris, France" 22 | 21,AI Visionary Labs,AI,2021,40,3.2,,"Bangalore, India" 23 | 22,FinSecure Banking,FinTech,2007,2800,650.0,,"Sydney, Australia" 24 | 23,MedTech Pioneers,HealthTech,2013,250,35.0,,"Stockholm, Sweden" 25 | 24,LearnAI Education,EdTech,1999,1400,300.0,,"Seoul, South Korea" 26 | 25,GreenPower Renewables,CleanTech,2022,60,8.5,,"Copenhagen, Denmark" 27 | 26,SynthBio Innovations,BioTech,2004,1900,450.0,,"Oxford, UK" 28 | 27,QuantumSecure Net,Quantum Computing,2016,180,28.0,,"Beijing, China" 29 | 28,CyberShield Defenses,Cybersecurity,1996,3200,700.0,,"Washington D.C., USA" 30 | 29,AutoBot Robotics,Robotics,2009,700,160.0,,"Osaka, Japan" 31 | 30,BigData Insights,Big Data,2012,1000,220.0,,"Austin, USA" 32 | 31,AugmentReality Tech,Augmented Reality,2018,130,20.0,,"Madrid, Spain" 33 | 32,FarmSmart Agri,AgriTech,2001,1600,380.0,,"Sao Paulo, Brazil" 34 | 33,NanoFabricate,NanoTech,2020,70,9.8,,"Seoul, South Korea" 35 | 34,Orbital Dynamics,SpaceTech,1993,2700,550.0,,"Cape Canaveral, USA" 36 | 35,EVolve Mobility,AutoTech,2015,500,80.0,,"Stuttgart, Germany" 37 | 36,GeneEdit Solutions,Genomics,2006,2100,480.0,,"Heidelberg, Germany" 38 | 37,CloudForge Platforms,Cloud Computing,2010,900,180.0,,"Frankfurt, Germany" 39 | 38,IoT Urban Systems,IoT,1991,1300,290.0,,"New York, USA" 40 | 39,CryptoLedger Tech,Blockchain,2019,100,15.0,,"Dubai, UAE" 41 | 
"""
Output utilities for saving generated data to various formats.
"""

import os
import pandas as pd
from typing import Dict, Optional, Union, List

# Formats this module can write; each value doubles as the file extension.
_SUPPORTED_FORMATS = ('csv', 'json')


def _normalize_format(format: Optional[str]) -> Optional[str]:
    """Return *format* lower-cased, or None when no format was given.

    Raises:
        ValueError: If *format* is given but is not 'csv' or 'json'
            (compared case-insensitively).
    """
    if not format:
        return None
    normalized = format.lower()
    if normalized not in _SUPPORTED_FORMATS:
        raise ValueError(f"Unsupported format: {format}. Supported formats are 'csv' and 'json'.")
    return normalized


def save_dataframe(
    df: pd.DataFrame,
    file_path: str,
    format: Optional[str] = None
) -> str:
    """
    Save a single DataFrame to a file with format detection and validation.

    The output format is chosen from ``format`` when given, otherwise from the
    extension of ``file_path``; a path with neither is written as CSV. The
    matching extension is appended to the path when it is missing, and the
    parent directory is created if needed.

    Args:
        df: DataFrame to save
        file_path: Path where the file should be saved
        format: Optional format override ('csv' or 'json', case-insensitive)

    Returns:
        Path to the saved file (including any extension that was appended)

    Raises:
        ValueError: If the DataFrame is None, empty, or has no columns
        ValueError: If the specified format is not supported ('csv' or 'json')
    """
    # Validate DataFrame
    if df is None or df.empty or len(df.columns) == 0:
        raise ValueError(
            "Failed to generate valid data. The resulting DataFrame is empty or has no columns. "
            "This could be due to an issue with the AI model response or schema definition. "
            "Check your schema, model settings, and API keys."
        )

    # Normalise once so 'CSV' and 'csv' are treated the same everywhere below.
    # Using the raw value here used to yield paths such as 'name.CSV.csv'.
    fmt = _normalize_format(format)

    if fmt:
        # If format is explicitly provided, ensure path has correct extension
        if not file_path.endswith(f'.{fmt}'):
            file_path = f"{file_path}.{fmt}"
    elif not file_path.endswith(('.csv', '.json')):
        # Default to CSV if no recognized extension
        file_path = f"{file_path}.csv"

    # Ensure the output directory exists
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # At this point the path always ends in one of the supported extensions
    if file_path.endswith('.json'):
        df.to_json(file_path, orient='records')
    else:
        df.to_csv(file_path, index=False)

    print(f"[OK] Successfully wrote {len(df)} rows to {file_path}")
    return file_path


def save_dataframes(
    data_dict: Dict[str, pd.DataFrame],
    output_dir: str,
    format: str = 'csv',
    filenames: Optional[Dict[str, str]] = None
) -> List[str]:
    """
    Save multiple DataFrames to files in a directory.

    Args:
        data_dict: Dictionary mapping names to DataFrames
        output_dir: Directory where files should be saved
        format: File format to use ('csv' or 'json', case-insensitive);
            a falsy value falls back to 'csv'
        filenames: Optional dictionary mapping schema names to custom filenames
                  (without extension)

    Returns:
        List of paths to saved files, in the iteration order of ``data_dict``

    Raises:
        ValueError: If the format is not supported, or any DataFrame is empty
    """
    # Validate before touching the filesystem. An unsupported format used to be
    # written as CSV under a misleading name such as 'table.parquet.csv'.
    fmt = _normalize_format(format) or 'csv'

    os.makedirs(output_dir, exist_ok=True)
    custom_names = filenames or {}
    saved_paths = []

    for name, df in data_dict.items():
        # Use custom filename if provided, otherwise use schema name
        base_filename = custom_names.get(name, name.lower())
        file_path = os.path.join(output_dir, f"{base_filename}.{fmt}")
        saved_paths.append(save_dataframe(df, file_path, format=fmt))

    return saved_paths
65 |

LABORATORY TEST REPORT

66 |
67 | 68 |
69 |
REPORT INFORMATION
70 |

71 | Lab ID: {{ lab_id }}
72 | Collection Date: {{ collection_date }}
73 | Report Date: {{ report_date }} 74 |

75 |
76 | 77 |
78 |
PATIENT INFORMATION
79 |

80 | Patient ID: {{ patient_id }}
81 | Ordering Physician: {{ ordering_physician }} 82 |

83 |
84 | 85 |
86 |
TEST RESULTS
87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 |
TestCategoryResultReference RangeUnitsInterpretation
{{ test_name }}{{ test_category }}{{ result_value }}{{ reference_range }}{{ unit }}{{ interpretation }}
105 |
106 | 107 |
108 |
PERFORMING LABORATORY
109 |

{{ performing_lab }}

110 |
111 | 112 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Syda Documentation 2 | site_description: syda seamlessly generate realistic synthetic test data - structured, unstructured, PDF, and HTML data generation with AI and large language models while preserving referential integrity, maintaining privacy compliance, and accelerating development workflows using OpenAI, Anthropic, Gemini, and Grok models 3 | site_url: https://python.syda.ai/ 4 | repo_url: https://github.com/syda-ai/syda 5 | repo_name: syda-ai/syda 6 | 7 | theme: 8 | name: material 9 | palette: 10 | - media: "(prefers-color-scheme: light)" 11 | scheme: default 12 | primary: teal 13 | accent: amber 14 | toggle: 15 | icon: material/brightness-7 16 | name: Switch to dark mode 17 | - media: "(prefers-color-scheme: dark)" 18 | scheme: slate 19 | primary: teal 20 | accent: amber 21 | toggle: 22 | icon: material/brightness-4 23 | name: Switch to light mode 24 | features: 25 | - content.code.annotate 26 | - content.code.copy 27 | - navigation.instant 28 | - navigation.tracking 29 | - navigation.tabs 30 | - navigation.sections 31 | - navigation.top 32 | - navigation.indexes 33 | - search.highlight 34 | - search.share 35 | - search.suggest 36 | icon: 37 | repo: fontawesome/brands/github 38 | 39 | plugins: 40 | - search: 41 | separator: '[\s\-,:!=\[\]()"/]+|(?!\b)(?=[A-Z][a-z])|\.(?!\d)|&[lg]t;' 42 | - macros 43 | - minify: 44 | minify_html: true 45 | minify_css: true 46 | minify_js: true 47 | htmlmin_opts: 48 | remove_comments: true 49 | remove_empty_space: true 50 | cache_safe: true 51 | 52 | extra: 53 | sitemap: 54 | changefreq: weekly 55 | priority: 0.8 56 | generator: false # Remove "Made with MkDocs" footer 57 | 58 | markdown_extensions: 59 | - admonition 60 | - codehilite: 61 | guess_lang: false 62 | - toc: 63 | permalink: true 64 | 
permalink_title: Anchor link to this section 65 | - pymdownx.highlight: 66 | anchor_linenums: true 67 | - pymdownx.superfences 68 | - pymdownx.tabbed: 69 | alternate_style: true 70 | 71 | # Navigation structure 72 | nav: 73 | - Home: 74 | Introduction: index.md 75 | Quick Start: quickstart.md 76 | - Deep Dive: 77 | - Structured Data: 'deep_dive/structured_data.md' 78 | - Foreign Keys: 'deep_dive/foreign_keys.md' 79 | - Unstructured Documents: 'deep_dive/unstructured_documents.md' 80 | - Custom Generators: 'deep_dive/custom_generators.md' 81 | - Combined Data: 'deep_dive/combined_data.md' 82 | - Model Configuration: 'deep_dive/model_configuration.md' 83 | - Output Options: 'deep_dive/output_options.md' 84 | - Schema Reference: 85 | - Field Types: 'schema_reference/field_types.md' 86 | - Special Field Types: 'schema_reference/special_field_types.md' 87 | - Foreign Keys: 'schema_reference/foreign_keys.md' 88 | - Examples: 89 | - Model Selection: 90 | - Anthropic: 'examples/model_selection/anthropic.md' 91 | - OpenAI: 'examples/model_selection/openai.md' 92 | - Azure OpenAI: 'examples/model_selection/azureopenai.md' 93 | - Gemini: 'examples/model_selection/gemini.md' 94 | - Grok: 'examples/model_selection/grok.md' 95 | - Structured Only: 96 | - Dict Schemas: 'examples/structured_only/dict_schemas.md' 97 | - YAML Schemas: 'examples/structured_only/yaml_schemas.md' 98 | - JSON Schemas: 'examples/structured_only/json_schemas.md' 99 | - SQLAlchemy Models: 'examples/structured_only/sqlalchemy_models.md' 100 | - Structured and Unstructured Mixed: 101 | - YAML Schemas: 'examples/structured_and_unstructured_mixed/yaml_schemas.md' 102 | - SQLAlchemy Models: 'examples/structured_and_unstructured_mixed/sqlalchemy_models.md' 103 | - Miscellaneous: 'examples/miscellaneous.md' 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_gemini_models/flash-2-0/patient.csv: 
-------------------------------------------------------------------------------- 1 | patient_id,diagnosis_code,email,visit_date,notes 2 | 1,E11.9,patient1@example.com,2024-01-20,"Routine checkup, patient is stable." 3 | 2,I50.9,patient2@example.com,2024-01-15,Follow up for heart failure. Adjusted medication. 4 | 3,J45.909,patient3@example.com,2024-01-10,"Asthma exacerbation, prescribed inhaler." 5 | 4,M54.5,patient4@example.com,2024-02-01,"Lower back pain, referred to physical therapy." 6 | 5,R06.02,patient5@example.com,2024-02-05,"Shortness of breath, rule out cardiac issues." 7 | 6,Z00.00,patient6@example.com,2024-02-10,"Annual physical exam, healthy." 8 | 7,E78.0,patient7@example.com,2024-02-15,"High cholesterol, started statin." 9 | 8,G43.909,patient8@example.com,2024-02-20,"Migraine, prescribed triptan." 10 | 9,N40.0,patient9@example.com,2024-02-25,"BPH, started alpha-blocker." 11 | 10,H52.2,patient10@example.com,2024-03-01,"Astigmatism, referred to ophthalmologist." 12 | 11,E11.9,patient11@example.com,2024-03-05,"Diabetes follow up, A1c stable." 13 | 12,I50.9,patient12@example.com,2024-03-10,"Heart failure, increased diuretic dose." 14 | 13,J45.909,patient13@example.com,2024-03-15,"Asthma, added long-acting beta agonist." 15 | 14,M54.5,patient14@example.com,2024-03-20,"Chronic back pain, referred for injections." 16 | 15,R06.02,patient15@example.com,2024-03-25,"SOB, pulmonary function tests ordered." 17 | 16,Z00.00,patient16@example.com,2024-03-30,"Preventative care, updated vaccinations." 18 | 17,E78.0,patient17@example.com,2024-04-01,"Hyperlipidemia, diet counseling provided." 19 | 18,G43.909,patient18@example.com,2024-04-05,"Chronic migraines, consider Botox." 20 | 19,N40.0,patient19@example.com,2024-04-10,"BPH, scheduled for TURP." 21 | 20,H52.2,patient20@example.com,2024-04-15,"Vision changes, new glasses prescribed." 22 | 21,E11.9,patient21@example.com,2024-04-20,"Poorly controlled diabetes, insulin started." 
23 | 22,I50.9,patient22@example.com,2024-04-25,"Worsening heart failure, hospital admission." 24 | 23,J45.909,patient23@example.com,2024-04-30,"Severe asthma, ER visit." 25 | 24,M54.5,patient24@example.com,2024-05-01,"Back pain, MRI ordered." 26 | 25,R06.02,patient25@example.com,2024-05-05,"SOB, CT scan ordered." 27 | 26,Z00.00,patient26@example.com,2024-05-10,"Routine checkup, all normal." 28 | 27,E78.0,patient27@example.com,2024-05-15,"High cholesterol, increased statin dose." 29 | 28,G43.909,patient28@example.com,2024-05-20,"Migraines, started preventative medication." 30 | 29,N40.0,patient29@example.com,2024-05-25,"BPH, watchful waiting." 31 | 30,H52.2,patient30@example.com,2024-05-30,"Progressive vision loss, glaucoma suspect." 32 | 31,E11.9,patient31@example.com,2024-06-01,Diabetes education provided. 33 | 32,I50.9,patient32@example.com,2024-06-05,"Stable heart failure, continue current meds." 34 | 33,J45.909,patient33@example.com,2024-06-10,Well-controlled asthma. 35 | 34,M54.5,patient34@example.com,2024-06-15,Physical therapy helping back pain. 36 | 35,R06.02,patient35@example.com,2024-06-20,"SOB resolved, likely anxiety." 37 | 36,Z00.00,patient36@example.com,2024-06-25,Flu shot given. 38 | 37,E78.0,patient37@example.com,2024-06-30,Cholesterol improving with diet. 39 | 38,G43.909,patient38@example.com,2024-07-01,Migraine frequency decreased. 40 | 39,N40.0,patient39@example.com,2024-07-05,BPH symptoms stable. 41 | 40,H52.2,patient40@example.com,2024-07-10,Referred for cataract evaluation. 42 | 41,E11.9,patient41@example.com,2024-07-15,Insulin dose adjusted. 43 | 42,I50.9,patient42@example.com,2024-07-20,"Fluid retention, adjusted diuretics." 44 | 43,J45.909,patient43@example.com,2024-07-25,Asthma action plan reviewed. 45 | 44,M54.5,patient44@example.com,2024-07-30,Pain management strategies discussed. 46 | 45,R06.02,patient45@example.com,2024-08-01,Anxiety management techniques taught. 47 | 46,Z00.00,patient46@example.com,2024-08-05,Tdap booster given. 
48 | 47,E78.0,patient47@example.com,2024-08-10,Lipid panel ordered. 49 | 48,G43.909,patient48@example.com,2024-08-15,New migraine medication prescribed. 50 | 49,N40.0,patient49@example.com,2024-08-20,PSA level checked. 51 | 50,H52.2,patient50@example.com,2024-08-25,Prescription for new glasses sent. 52 | -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/templates/medical_report_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Medical Report 6 | 40 | 41 | 42 |
43 |

MEDICAL REPORT

44 |
45 | 46 |
47 |
PATIENT INFORMATION
48 |

49 | Patient ID: {{ patient_id }}
50 | Name: {{ patient_name }}
51 | Date of Birth: {{ date_of_birth }} 52 |

53 |
54 | 55 |
56 |
VISIT INFORMATION
57 |

58 | Visit Date: {{ visit_date }}
59 | Chief Complaint: {{ chief_complaint }} 60 |

61 |
62 | 63 |
64 |
MEDICAL HISTORY
65 |

{{ medical_history }}

66 |
67 | 68 |
69 |
VITAL SIGNS
70 |

71 | Blood Pressure: {{ blood_pressure }}
72 | Heart Rate: {{ heart_rate }} bpm
73 | Respiratory Rate: {{ respiratory_rate }} breaths/min
74 | Temperature: {{ temperature }}°F
75 | Oxygen Saturation: {{ oxygen_saturation }}% 76 |

77 |
78 | 79 |
80 |
ASSESSMENT
81 |

{{ assessment }}

82 |
83 | 84 |
85 |
DIAGNOSIS
86 | 95 |
96 | 97 |
98 |
TREATMENT PLAN
99 |

{{ treatment_plan }}

100 |
101 | 102 |
103 |
MEDICATIONS
104 | 105 | 106 | 107 | 108 | {% if medications %} 109 | {% for med in medications %} 110 | 111 | 112 | 113 | {% endfor %} 114 | {% else %} 115 | 116 | 117 | 118 | {% endif %} 119 |
Medication
{{ med }}
No medications prescribed
120 |
121 | 122 |
123 |
FOLLOW-UP
124 |

{{follow_up}}

125 |
126 | 127 | 128 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/output/employee.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,email,phone,hire_date,department_id,position_id,salary,is_active 2 | 1,John,Smith,jsmith@company.com,555-123-4567,2019-03-15,5,3,67000.0, 3 | 2,Emily,Johnson,johnson.emily@company.com,555-987-6543,2020-06-22,3,9,52000.0, 4 | 3,Michael,Williams,michaelw@company.com,555-456-7890,2018-01-10,4,15,103000.0, 5 | 4,Sarah,Brown,sarahb@company.com,555-234-5678,2021-09-05,3,3,79000.0, 6 | 5,David,Miller,miller.david@company.com,555-345-6789,2017-11-30,7,13,78000.0, 7 | 6,Jennifer,Davis,davis.jennifer@company.com,555-567-8901,2020-04-18,8,9,41000.0, 8 | 7,Robert,Garcia,robert.garcia@company.com,555-678-9012,2019-07-25,1,15,157000.0, 9 | 8,Lisa,Martinez,lmartinez@company.com,555-789-0123,2021-02-14,2,15,150000.0, 10 | 9,William,Rodriguez,wrodriguez@company.com,555-890-1234,2018-05-20,4,14,141000.0, 11 | 10,Jessica,Lopez,lopez.jessica@company.com,555-901-2345,2020-11-08,6,1,109000.0, 12 | 11,Christopher,Gonzalez,christopherg@company.com,555-012-3456,2019-08-17,6,12,104000.0, 13 | 12,Amanda,Wilson,wilson.amanda@company.com,555-123-4567,2021-05-30,4,13,81000.0, 14 | 13,Daniel,Anderson,daniel.anderson@company.com,555-234-5678,2017-12-05,3,13,60000.0, 15 | 14,Michelle,Taylor,michelle.taylor@company.com,555-345-6789,2020-07-12,2,12,99000.0, 16 | 15,Kevin,Thomas,kevin.thomas@company.com,555-456-7890,2018-09-22,8,13,79000.0, 17 | 16,Rachel,Moore,rmoore@company.com,555-567-8901,2021-01-15,2,13,82000.0, 18 | 17,Brian,Jackson,jackson.brian@company.com,555-678-9012,2019-06-03,1,13,74000.0, 19 | 18,Nicole,Martin,martin.nicole@company.com,555-789-0123,2020-10-20,6,11,75000.0, 20 | 19,Jason,Lee,jason.lee@company.com,555-890-1234,2018-04-07,3,11,103000.0, 21 | 
20,Lauren,Thompson,laurent@company.com,555-901-2345,2021-03-25,8,8,241000.0, 22 | 21,Ryan,White,white.ryan@company.com,555-012-3456,2019-02-14,2,9,63000.0, 23 | 22,Stephanie,Harris,harris.stephanie@company.com,555-123-4567,2020-08-30,4,14,199000.0, 24 | 23,Eric,Sanchez,esanchez@company.com,555-234-5678,2018-10-11,4,2,179000.0, 25 | 24,Megan,Clark,meganc@company.com,555-345-6789,2021-06-17,4,6,131000.0, 26 | 25,Justin,Ramirez,justinr@company.com,555-456-7890,2019-05-09,1,8,215000.0, 27 | 26,Ashley,Lewis,ashleyl@company.com,555-567-8901,2020-12-05,1,12,118000.0, 28 | 27,Andrew,Robinson,robinson.andrew@company.com,555-678-9012,2018-03-22,7,12,75000.0, 29 | 28,Brittany,Walker,bwalker@company.com,555-789-0123,2021-04-08,1,14,202000.0, 30 | 29,Matthew,Perez,matthewp@company.com,555-890-1234,2019-01-16,7,12,109000.0, 31 | 30,Katie,Hall,hall.katie@company.com,555-901-2345,2020-09-14,5,10,140000.0, 32 | 31,Steven,Young,young.steven@company.com,555-012-3456,2018-07-19,6,7,89000.0, 33 | 32,Kelly,Allen,kelly.allen@company.com,555-123-4567,2021-02-01,4,13,56000.0, 34 | 33,Mark,King,king.mark@company.com,555-234-5678,2019-10-25,3,14,150000.0, 35 | 34,Rebecca,Wright,rebecca.wright@company.com,555-345-6789,2020-05-07,7,3,77000.0, 36 | 35,Timothy,Scott,timothy.scott@company.com,555-456-7890,2018-02-28,5,15,103000.0, 37 | 36,Laura,Green,lgreen@company.com,555-567-8901,2021-07-22,2,8,284000.0, 38 | 37,Jeffrey,Baker,jeffrey.baker@company.com,555-678-9012,2019-04-13,1,11,105000.0, 39 | 38,Karen,Adams,kadams@company.com,555-789-0123,2020-11-30,6,5,67000.0, 40 | 39,Gregory,Nelson,gregoryn@company.com,555-890-1234,2018-06-07,1,15,118000.0, 41 | 40,Nancy,Hill,nancy.hill@company.com,555-901-2345,2021-01-09,6,6,122000.0, 42 | 41,Ronald,Ramirez,ronald.ramirez@company.com,555-012-3456,2019-09-16,5,13,79000.0, 43 | 42,Sandra,Campbell,sandra.campbell@company.com,555-123-4567,2020-06-05,6,9,41000.0, 44 | 43,Scott,Mitchell,scott.mitchell@company.com,555-234-5678,2018-11-20,8,3,77000.0, 45 | 
44,Carol,Roberts,croberts@company.com,555-345-6789,2021-03-12,8,10,151000.0, 46 | 45,Frank,Carter,carter.frank@company.com,555-456-7890,2019-05-28,8,9,56000.0, 47 | 46,Sharon,Phillips,sharon.phillips@company.com,555-567-8901,2020-12-17,8,9,52000.0, 48 | 47,Dennis,Evans,dennis.evans@company.com,555-678-9012,2018-08-09,1,1,92000.0, 49 | 48,Angela,Edwards,angela.edwards@company.com,555-789-0123,2021-05-01,3,10,143000.0, 50 | 49,Larry,Collins,larryc@company.com,555-890-1234,2019-02-26,1,1,79000.0, 51 | 50,Kimberly,Stewart,kimberly.stewart@company.com,555-901-2345,2020-10-09,8,7,55000.0, 52 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_grok_models/grok-4/products.csv: -------------------------------------------------------------------------------- 1 | id,name,company_id,category,launch_year,price_usd,is_ai_powered 2 | 1,iPhone 15,27,Hardware,2023,799.0, 3 | 2,iPad Pro,35,Hardware,2023,1099.0, 4 | 3,MacBook Air M2,23,Hardware,2022,1199.0, 5 | 4,Apple Watch Series 9,10,Hardware,2023,399.0, 6 | 5,AirPods Pro 2,16,Hardware,2022,249.0, 7 | 6,iOS 17,44,Software,2023,0.0, 8 | 7,macOS Sonoma,21,Software,2023,0.0, 9 | 8,Apple TV+,12,Service,2019,6.99, 10 | 9,Apple Arcade,45,Service,2019,4.99, 11 | 10,Apple Vision Pro,5,Hardware,2024,3499.0, 12 | 11,Pixel 8 Pro,12,Hardware,2023,999.0, 13 | 12,Google Gemini,27,Software,2023,0.0, 14 | 13,Google Cloud AI,2,Platform,2020,0.0, 15 | 14,Android Auto AI,21,Software,2024,0.0, 16 | 15,Google Maps AR,48,Service,2022,0.0, 17 | 16,YouTube AI Editor,44,Software,2024,0.0, 18 | 17,Nest Cam IQ,21,Hardware,2021,299.0, 19 | 18,Google Workspace,27,Platform,2020,6.0, 20 | 19,Chrome OS Flex,29,Software,2022,0.0, 21 | 20,Google Fiber,13,Service,2010,70.0, 22 | 21,Surface Pro 9,44,Hardware,2022,1099.0, 23 | 22,Windows 11,5,Software,2021,139.0, 24 | 23,Microsoft 365,36,Service,2017,6.99, 25 | 24,Azure AI,43,Platform,2010,0.0, 26 | 25,Xbox Series X,19,Hardware,2020,499.0, 27 | 
26,Copilot,48,Software,2023,0.0, 28 | 27,Teams,30,Software,2017,0.0, 29 | 28,Power BI,24,Software,2011,10.0, 30 | 29,HoloLens 2,14,Hardware,2019,3500.0, 31 | 30,Microsoft Edge,40,Software,2015,0.0, 32 | 31,Echo Dot,11,Hardware,2022,49.0, 33 | 32,Alexa,28,Software,2014,0.0, 34 | 33,Amazon Web Services,4,Platform,2006,0.0, 35 | 34,Prime Video,19,Service,2006,8.99, 36 | 35,Kindle Paperwhite,21,Hardware,2021,129.0, 37 | 36,Ring Doorbell,44,Hardware,2020,99.0, 38 | 37,Amazon Luna,41,Platform,2020,9.99, 39 | 38,Fire TV Stick,37,Hardware,2023,39.0, 40 | 39,Amazon SageMaker,14,Service,2017,0.0, 41 | 40,Audible,6,Service,1995,14.95, 42 | 41,Model 3,15,Hardware,2017,39990.0, 43 | 42,Cybertruck,37,Hardware,2023,60000.0, 44 | 43,Full Self-Driving,5,Software,2019,12000.0, 45 | 44,Powerwall,10,Hardware,2015,7000.0, 46 | 45,Solar Roof,9,Hardware,2016,20000.0, 47 | 46,Tesla App,7,Software,2012,0.0, 48 | 47,Autopilot,48,Software,2014,0.0, 49 | 48,Model Y,50,Hardware,2020,42990.0, 50 | 49,Supercharger Network,44,Service,2012,0.0, 51 | 50,Optimus Robot,33,Hardware,2024,20000.0, 52 | 51,Galaxy S24,27,Hardware,2024,799.0, 53 | 52,One UI 6,31,Software,2023,0.0, 54 | 53,Samsung SmartThings,35,Platform,2014,0.0, 55 | 54,QLED TV,36,Hardware,2023,1499.0, 56 | 55,Galaxy Watch 6,50,Hardware,2023,299.0, 57 | 56,Bixby,35,Software,2017,0.0, 58 | 57,Samsung Health,26,Service,2015,0.0, 59 | 58,Galaxy Buds 2 Pro,39,Hardware,2022,229.0, 60 | 59,DeX,1,Software,2017,0.0, 61 | 60,Samsung Pay,35,Service,2015,0.0, 62 | 61,Watson AI,28,Platform,2011,0.0, 63 | 62,IBM Cloud,42,Platform,2013,0.0, 64 | 63,Quantum System One,50,Hardware,2019,1000000.0, 65 | 64,IBM SPSS,35,Software,1968,99.0, 66 | 65,Red Hat OpenShift,16,Platform,2018,0.0, 67 | 66,IBM Maximo,24,Software,1985,0.0, 68 | 67,Watson Assistant,13,Service,2017,0.0, 69 | 68,IBM Blockchain,22,Platform,2016,0.0, 70 | 69,z16 Mainframe,27,Hardware,2022,1000000.0, 71 | 70,IBM Consulting,29,Service,1991,0.0, 72 | 71,ChatGPT,5,Service,2022,0.0, 73 | 
72,GPT-4,7,Software,2023,20.0, 74 | 73,DALL-E 3,50,Software,2023,0.0, 75 | 74,OpenAI API,25,Platform,2020,0.0, 76 | 75,Sora,40,Software,2024,0.0, 77 | 76,Codex,17,Software,2021,0.0, 78 | 77,Whisper,23,Software,2022,0.0, 79 | 78,Gym,31,Software,2016,0.0, 80 | 79,OpenAI Five,42,Software,2018,0.0, 81 | 80,Safety Gym,15,Software,2019,0.0, 82 | 81,Oculus Quest 3,11,Hardware,2023,499.0, 83 | 82,Facebook,26,Platform,2004,0.0, 84 | 83,Instagram,49,Platform,2010,0.0, 85 | 84,WhatsApp,17,Service,2009,0.0, 86 | 85,Meta Horizon Worlds,24,Platform,2021,0.0, 87 | 86,Llama 2,32,Software,2023,0.0, 88 | 87,Ray-Ban Meta Smart Glasses,37,Hardware,2023,299.0, 89 | 88,Threads,6,Service,2023,0.0, 90 | 89,Meta AI,22,Software,2023,0.0, 91 | 90,Workplace,31,Platform,2016,4.0, 92 | 91,GeForce RTX 4090,38,Hardware,2022,1599.0, 93 | 92,CUDA,11,Software,2007,0.0, 94 | 93,NVIDIA Omniverse,44,Platform,2020,0.0, 95 | 94,Jetson Nano,9,Hardware,2019,99.0, 96 | 95,DGX A100,50,Hardware,2020,199000.0, 97 | 96,DRIVE Platform,30,Platform,2015,0.0, 98 | 97,TensorRT,17,Software,2017,0.0, 99 | 98,NVIDIA Broadcast,42,Software,2020,0.0, 100 | 99,Grace CPU,38,Hardware,2023,0.0, 101 | 100,NVIDIA AI Enterprise,19,Software,2021,0.0, 102 | -------------------------------------------------------------------------------- /examples/model_selection/example_claude_models.py: -------------------------------------------------------------------------------- 1 | from syda.generate import SyntheticDataGenerator 2 | from syda.schemas import ModelConfig 3 | import os 4 | from dotenv import load_dotenv 5 | 6 | # Load environment variables 7 | load_dotenv() 8 | 9 | # Define schema for a single table 10 | schemas = { 11 | 'Patient': { 12 | 'patient_id': {'type': 'number', 'description': 'Unique identifier for the patient'}, 13 | 'diagnosis_code': {'type': 'text', 'description': 'ICD-10 diagnosis code'}, 14 | 'email': {'type': 'email', 'description': 'Patient email address used for communication'}, 15 | 'visit_date': {'type': 
'date', 'description': 'Date when the patient visited the clinic'}, 16 | 'notes': {'type': 'text', 'description': 'Clinical notes for the patient visit'} 17 | }, 18 | 'Claim': { 19 | 'claim_id': {'type': 'number', 'description': 'Unique identifier for the claim'}, 20 | 'patient_id': {'type': 'foreign_key', 'description': 'Reference to the patient who made the claim', 'references': {'schema': 'Patient', 'field': 'patient_id'}}, 21 | 'diagnosis_code': {'type': 'text', 'description': 'ICD-10 diagnosis code'}, 22 | 'email': {'type': 'email', 'description': 'Patient email address used for communication'}, 23 | 'visit_date': {'type': 'date', 'description': 'Date when the patient visited the clinic'}, 24 | 'notes': {'type': 'text', 'description': 'Clinical notes for the patient visit'} 25 | } 26 | } 27 | 28 | prompts={ 29 | 'Patient': 'Generate realistic synthetic patient records with ICD-10 diagnosis codes, emails, visit dates, and clinical notes.', 30 | 'Claim': 'Generate realistic synthetic claim records with ICD-10 diagnosis codes, emails, visit dates, and clinical notes.' 
31 | } 32 | sample_sizes={'Patient': 15, 'Claim': 15} 33 | 34 | print("--------------Testing Claude Haiku----------------") 35 | model_config = ModelConfig( 36 | provider="anthropic", 37 | model_name="claude-3-5-haiku-20241022", 38 | temperature=0.7, 39 | max_tokens=8192 # Larger value for more complete responses 40 | ) 41 | 42 | generator = SyntheticDataGenerator(model_config=model_config) 43 | # Define output directory 44 | output_dir = os.path.join( 45 | os.path.dirname(os.path.abspath(__file__)), 46 | "output", 47 | "test_claude_models", 48 | "haiku-3-5" 49 | ) 50 | # Generate and save to CSV 51 | results = generator.generate_for_schemas( 52 | schemas=schemas, 53 | prompts=prompts, 54 | sample_sizes=sample_sizes, 55 | output_dir=output_dir 56 | ) 57 | print(f"Data saved to {output_dir}") 58 | 59 | 60 | print("--------------Testing Claude Sonnet----------------") 61 | model_config = ModelConfig( 62 | provider="anthropic", 63 | model_name="claude-sonnet-4-20250514", 64 | temperature=0.7, 65 | max_tokens=64000 # Larger value for more complete responses 66 | ) 67 | 68 | generator = SyntheticDataGenerator(model_config=model_config) 69 | # Define output directory 70 | output_dir = os.path.join( 71 | os.path.dirname(os.path.abspath(__file__)), 72 | "output", 73 | "test_claude_models", 74 | "sonnet-4" 75 | ) 76 | sample_sizes={'Patient': 100, 'Claim': 200} 77 | # Generate and save to CSV 78 | results = generator.generate_for_schemas( 79 | schemas=schemas, 80 | prompts=prompts, 81 | sample_sizes=sample_sizes, 82 | output_dir=output_dir 83 | ) 84 | print(f"Data saved to {output_dir}") 85 | 86 | print("--------------Testing Claude Opus----------------") 87 | model_config = ModelConfig( 88 | provider="anthropic", 89 | model_name="claude-opus-4-20250514", 90 | temperature=0.7, 91 | max_tokens=32000 # Larger value for more complete responses 92 | ) 93 | 94 | generator = SyntheticDataGenerator(model_config=model_config) 95 | # Define output directory 96 | output_dir = 
os.path.join( 97 | os.path.dirname(os.path.abspath(__file__)), 98 | "output", 99 | "test_claude_models", 100 | "opus-4" 101 | ) 102 | sample_sizes={'Patient': 100, 'Claim': 200} 103 | # Generate and save to CSV 104 | results = generator.generate_for_schemas( 105 | schemas=schemas, 106 | prompts=prompts, 107 | sample_sizes=sample_sizes, 108 | output_dir=output_dir 109 | ) 110 | print(f"Data saved to {output_dir}") -------------------------------------------------------------------------------- /examples/model_selection/example_gemini_models.py: -------------------------------------------------------------------------------- 1 | from syda.generate import SyntheticDataGenerator 2 | from syda.schemas import ModelConfig 3 | import os 4 | from dotenv import load_dotenv 5 | 6 | # Load environment variables 7 | load_dotenv() 8 | 9 | # Define schema for a single table 10 | schemas = { 11 | 'Patient': { 12 | 'patient_id': {'type': 'number', 'description': 'Unique identifier for the patient'}, 13 | 'diagnosis_code': {'type': 'text', 'description': 'ICD-10 diagnosis code'}, 14 | 'email': {'type': 'email', 'description': 'Patient email address used for communication'}, 15 | 'visit_date': {'type': 'date', 'description': 'Date when the patient visited the clinic'}, 16 | 'notes': {'type': 'text', 'description': 'Clinical notes for the patient visit'} 17 | }, 18 | 'Claim': { 19 | 'claim_id': {'type': 'number', 'description': 'Unique identifier for the claim'}, 20 | 'patient_id': {'type': 'foreign_key', 'description': 'Reference to the patient who made the claim', 'references': {'schema': 'Patient', 'field': 'patient_id'}}, 21 | 'diagnosis_code': {'type': 'text', 'description': 'ICD-10 diagnosis code'}, 22 | 'email': {'type': 'email', 'description': 'Patient email address used for communication'}, 23 | 'visit_date': {'type': 'date', 'description': 'Date when the patient visited the clinic'}, 24 | 'notes': {'type': 'text', 'description': 'Clinical notes for the patient visit'} 25 | } 
26 | } 27 | 28 | prompts={ 29 | 'Patient': 'Generate realistic synthetic patient records with ICD-10 diagnosis codes, emails, visit dates, and clinical notes.', 30 | 'Claim': 'Generate realistic synthetic claim records with ICD-10 diagnosis codes, emails, visit dates, and clinical notes.' 31 | } 32 | sample_sizes={'Patient': 15, 'Claim': 15} 33 | 34 | print("--------------Testing Gemini Flash----------------") 35 | model_config = ModelConfig( 36 | provider="gemini", 37 | model_name="gemini-2.5-flash", 38 | temperature=0.7, 39 | max_tokens=8192 # Larger value for more complete responses 40 | ) 41 | 42 | generator = SyntheticDataGenerator(model_config=model_config) 43 | # Define output directory 44 | output_dir = os.path.join( 45 | os.path.dirname(os.path.abspath(__file__)), 46 | "output", 47 | "test_gemini_models", 48 | "flash-2-5" 49 | ) 50 | # Generate and save to CSV 51 | results = generator.generate_for_schemas( 52 | schemas=schemas, 53 | prompts=prompts, 54 | sample_sizes=sample_sizes, 55 | output_dir=output_dir 56 | ) 57 | print(f"Data saved to {output_dir}") 58 | 59 | 60 | print("--------------Testing Gemini 2.0 Flash----------------") 61 | model_config = ModelConfig( 62 | provider="gemini", 63 | model_name="gemini-2.0-flash", 64 | temperature=0.7, 65 | max_tokens=8192 # Larger value for more complete responses 66 | ) 67 | 68 | generator = SyntheticDataGenerator(model_config=model_config) 69 | # Define output directory 70 | output_dir = os.path.join( 71 | os.path.dirname(os.path.abspath(__file__)), 72 | "output", 73 | "test_gemini_models", 74 | "flash-2-0" 75 | ) 76 | sample_sizes={'Patient': 50, 'Claim': 75} 77 | # Generate and save to CSV 78 | results = generator.generate_for_schemas( 79 | schemas=schemas, 80 | prompts=prompts, 81 | sample_sizes=sample_sizes, 82 | output_dir=output_dir 83 | ) 84 | print(f"Data saved to {output_dir}") 85 | 86 | 87 | print("--------------Testing Gemini 2.5 Pro----------------") 88 | model_config = ModelConfig( 89 | 
provider="gemini", 90 | model_name="gemini-2.5-pro", 91 | temperature=0.7, 92 | max_tokens=64000 # Larger value for more complete responses 93 | ) 94 | 95 | generator = SyntheticDataGenerator(model_config=model_config) 96 | # Define output directory 97 | output_dir = os.path.join( 98 | os.path.dirname(os.path.abspath(__file__)), 99 | "output", 100 | "test_gemini_models", 101 | "pro-2-5" 102 | ) 103 | sample_sizes={'Patient': 100, 'Claim': 150} # Pro can handle larger datasets 104 | # Generate and save to CSV 105 | results = generator.generate_for_schemas( 106 | schemas=schemas, 107 | prompts=prompts, 108 | sample_sizes=sample_sizes, 109 | output_dir=output_dir 110 | ) 111 | print(f"Data saved to {output_dir}") -------------------------------------------------------------------------------- /examples/structured_and_unstructured/crm_sqlalchemy/templates/proposal.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 82 | 83 | 84 |
85 |
{{ title }}
86 |
{{ subtitle }}
87 |
Prepared for: {{ customer_name }}
88 |
Prepared by: {{ prepared_by }}
89 |
Date: {{ created_date }}
90 |
91 | 92 |
93 |
Executive Summary
94 |
95 |

This proposal outlines our recommended solutions to address the needs of {{ customer_name }} regarding {{ opportunity_name }}. The total investment for this solution is estimated at ${{ opportunity_value }}.

96 |

{{ opportunity_description }}

97 |
98 |
99 | 100 |
101 |
Proposed Solutions
102 |
103 | {{ proposed_solutions }} 104 |
105 |
106 | 107 |
108 |
Implementation Timeline
109 |
110 | {{ implementation_timeline }} 111 |
112 |
113 | 114 |
115 |
Pricing & Investment
116 |
117 | {{ pricing_details }} 118 |
119 |
120 | 121 |
122 |
Terms & Conditions
123 |
124 | {{ terms_and_conditions }} 125 |
126 |
127 | 128 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /examples/structured_only/hr_employee_example/output/performance_review.csv: -------------------------------------------------------------------------------- 1 | id,employee_id,review_date,performance_score,review_period_start,review_period_end,strengths,areas_for_improvement,goals_set,reviewer_comments 2 | 1234,46,2023-06-15,4.5,2022-12-01,2023-05-31,"Excellent communication skills, consistently meets project deadlines, and demonstrates strong leadership in team collaboration.",Could benefit from more advanced technical training in emerging software development methodologies.,"Complete advanced cloud computing certification, lead a cross-departmental innovation project, and mentor two junior team members.","John has been an exceptional performer this year, showing remarkable adaptability and commitment to team success." 3 | 2345,17,2023-07-01,3.8,2022-12-01,2023-06-30,"Strong analytical skills, detailed-oriented approach, and excellent problem-solving capabilities in data analysis.",Develop more confident presentation and public speaking skills for client meetings.,"Attend professional communication workshop, improve data visualization techniques, and take on more client-facing roles.",Sarah shows great potential and is consistently improving her professional capabilities. 4 | 3456,41,2023-05-20,4.2,2022-11-01,2023-04-30,"Innovative thinking, quick learner, and demonstrates exceptional creativity in marketing campaign design.",Need to improve time management and prioritization of multiple project tasks.,"Complete project management certification, implement personal productivity tracking system, and lead a major marketing initiative.",Michael has consistently brought fresh perspectives to our marketing strategies. 
5 | 4567,12,2023-06-30,4.7,2022-12-01,2023-06-15,"Exceptional technical expertise, quick problem solver, and highly reliable in critical IT infrastructure maintenance.",Enhance soft skills and team communication strategies.,"Participate in leadership communication training, document knowledge transfer processes, and mentor junior IT staff.",Emily is a technical powerhouse who is crucial to our IT operations' success. 6 | 5678,38,2023-05-15,3.5,2022-11-01,2023-04-30,"Customer-focused approach, empathetic listener, and consistently receives positive customer feedback.",Develop more advanced technical knowledge of product features.,"Complete product technical training, improve resolution time for complex customer issues, and create customer experience improvement plan.",David shows great potential in customer service but needs more technical depth. 7 | 6789,49,2023-07-10,4.9,2022-12-01,2023-06-30,"Exceptional financial analysis, strategic thinking, and ability to identify cost-saving opportunities.",Expand knowledge of emerging financial technologies and digital transformation.,"Attend fintech conference, lead digital transformation project, and develop comprehensive financial strategy report.",Rachel is an outstanding financial analyst who consistently delivers exceptional insights. 8 | 7890,4,2023-06-01,4.0,2022-11-15,2023-05-31,"Strong organizational skills, detail-oriented, and excellent in managing complex administrative tasks.",Develop more proactive communication and initiative in process improvements.,"Create process optimization proposal, improve interdepartmental communication, and lead an administrative efficiency project.",Lisa is a reliable team member with significant potential for growth. 
9 | 8901,13,2023-05-25,3.7,2022-11-01,2023-04-30,"Creative problem solver, adaptable to changing project requirements, and strong team collaborator.",Enhance technical skills in latest design software and methodologies.,"Complete advanced design certification, lead design innovation workshop, and develop comprehensive design skills portfolio.",Mark shows creativity and potential but needs to continue skill development. 10 | 9012,35,2023-06-20,4.3,2022-12-01,2023-05-31,"Exceptional research skills, deep analytical capabilities, and ability to translate complex data into actionable insights.",Develop more confident presentation and communication of research findings.,"Attend scientific communication workshop, publish research paper, and lead cross-functional research initiative.",Anna is a brilliant researcher with tremendous potential for scientific contributions. 11 | 4321,4,2023-07-05,4.1,2022-12-01,2023-06-30,"Strong interpersonal skills, excellent team motivator, and consistent in delivering high-quality sales results.",Develop more advanced strategic sales planning and digital sales techniques.,"Complete digital sales transformation course, develop comprehensive sales strategy, and mentor junior sales team members.",Tom is a valuable team member who consistently drives sales performance. 12 | -------------------------------------------------------------------------------- /examples/unstructured_only/healthcare_yml/generate_healthcare_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Healthcare Unstructured Data Example 4 | 5 | This script demonstrates how to generate synthetic healthcare data using YAML schemas 6 | and template processing with PDF output. 
7 | """ 8 | 9 | import os 10 | import sys 11 | from pathlib import Path 12 | from dotenv import load_dotenv 13 | 14 | # Load environment variables 15 | load_dotenv() 16 | 17 | # Add the parent directory to the path so we can import the SYDA module 18 | sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) 19 | 20 | from syda.generate import SyntheticDataGenerator 21 | from syda.schemas import ModelConfig 22 | 23 | # Define paths 24 | BASE_DIR = Path(__file__).parent 25 | SCHEMAS_DIR = BASE_DIR / "schemas" 26 | TEMPLATES_DIR = BASE_DIR / "templates" 27 | OUTPUT_DIR = BASE_DIR / "output" 28 | 29 | # Ensure output directory exists 30 | OUTPUT_DIR.mkdir(exist_ok=True) 31 | 32 | def main(): 33 | """Main function to generate healthcare data.""" 34 | print("=== Generating Healthcare Data with Templates ===") 35 | 36 | # Initialize generator with model config 37 | config = ModelConfig(provider="anthropic", model_name="claude-3-5-sonnet-20240620") 38 | generator = SyntheticDataGenerator(model_config=config) 39 | 40 | # Define schemas using YAML files 41 | schemas = { 42 | 'MedicalReport': str(SCHEMAS_DIR / "medical_report.yml"), 43 | 'LabResult': str(SCHEMAS_DIR / "lab_result.yml") 44 | } 45 | 46 | # Define sample sizes 47 | sample_sizes = { 48 | 'MedicalReport': 5, 49 | 'LabResult': 5 50 | } 51 | 52 | # Define custom prompts 53 | prompts = { 54 | 'MedicalReport': 'Generate synthetic medical reports for patients with various health conditions', 55 | 'LabResult': 'Generate synthetic laboratory test results for patients' 56 | } 57 | 58 | # Print information about template source paths 59 | for schema_name, schema_path in schemas.items(): 60 | print(f"\n📄 Processing schema: {schema_name} from {schema_path}") 61 | 62 | # Generate data for all schemas at once 63 | print("\n🔄 Generating data for healthcare templates...") 64 | print(" The system will automatically determine the right generation order") 65 | print(" and handle template processing for PDF 
generation\n") 66 | 67 | results = generator.generate_for_schemas( 68 | schemas=schemas, 69 | sample_sizes=sample_sizes, 70 | prompts=prompts, 71 | output_dir=str(OUTPUT_DIR) 72 | ) 73 | 74 | # Print summary of generated data 75 | print("\n✅ Data generation complete!") 76 | for schema_name, df in results.items(): 77 | if df is not None: 78 | print(f" {schema_name}: {len(df)} records") 79 | 80 | # Check for template output directories 81 | template_dir = OUTPUT_DIR / schema_name 82 | if template_dir.exists(): 83 | files = list(template_dir.iterdir()) 84 | pdf_files = [f for f in files if f.name.endswith('.pdf')] 85 | print(f" - Found {len(pdf_files)} PDF files in {template_dir}") 86 | for i, pdf_file in enumerate(pdf_files[:3]): 87 | print(f" - {pdf_file.name}") 88 | if len(pdf_files) > 3: 89 | print(f" - ... and {len(pdf_files) - 3} more") 90 | else: 91 | print(f" - Template directory for {schema_name} not found at {template_dir}") 92 | 93 | # Check for PDFs in all output directories 94 | print("\n🔍 Checking for generated PDF files:") 95 | total_pdfs = 0 96 | 97 | # Check main output directory first 98 | pdf_files = [f for f in OUTPUT_DIR.iterdir() if f.name.endswith('.pdf')] 99 | if pdf_files: 100 | total_pdfs += len(pdf_files) 101 | print(f" - Found {len(pdf_files)} PDFs in main output directory") 102 | 103 | # Check schema-specific directories 104 | for schema_name in schemas: 105 | schema_output_dir = OUTPUT_DIR / schema_name 106 | if schema_output_dir.exists(): 107 | pdf_files = [f for f in schema_output_dir.iterdir() if f.name.endswith('.pdf')] 108 | if pdf_files: 109 | total_pdfs += len(pdf_files) 110 | print(f" - Found {len(pdf_files)} PDFs in {schema_name} directory") 111 | 112 | print(f"\n📊 Total PDFs generated: {total_pdfs}") 113 | print(f"📂 Output directory: {OUTPUT_DIR}") 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- 
/examples/model_selection/example_azureopenai_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Azure OpenAI Model Configuration Example for Syda 3 | 4 | This example demonstrates how to configure and use Azure OpenAI models with Syda. 5 | Azure OpenAI requires additional configuration parameters that are passed via extra_kwargs. 6 | 7 | Prerequisites: 8 | 1. Azure OpenAI resource deployed in Azure 9 | 2. Model deployments created in Azure OpenAI Studio 10 | 3. API key and endpoint URL from your Azure OpenAI resource 11 | 12 | Environment Variables: 13 | - AZURE_OPENAI_API_KEY: Your Azure OpenAI API key 14 | - Or pass the API key directly to the SyntheticDataGenerator constructor 15 | 16 | Required extra_kwargs for Azure OpenAI: 17 | - azure_endpoint: Your Azure OpenAI endpoint URL 18 | - api_version: The API version to use (e.g., "2024-02-15-preview") 19 | - azure_deployment: Optional, can be set here or use model_name as deployment name 20 | """ 21 | 22 | from syda.generate import SyntheticDataGenerator 23 | from syda.schemas import ModelConfig 24 | import os 25 | from dotenv import load_dotenv 26 | 27 | # Load environment variables 28 | load_dotenv() 29 | 30 | # Define schema for healthcare data 31 | schemas = { 32 | 'Patient': { 33 | 'patient_id': {'type': 'number', 'description': 'Unique identifier for the patient'}, 34 | 'first_name': {'type': 'text', 'description': 'Patient first name'}, 35 | 'last_name': {'type': 'text', 'description': 'Patient last name'}, 36 | 'email': {'type': 'email', 'description': 'Patient email address'}, 37 | 'phone': {'type': 'phone', 'description': 'Patient contact phone number'}, 38 | 'date_of_birth': {'type': 'date', 'description': 'Patient date of birth'}, 39 | 'diagnosis_code': {'type': 'text', 'description': 'Primary ICD-10 diagnosis code'}, 40 | 'visit_date': {'type': 'date', 'description': 'Date of most recent clinic visit'}, 41 | 'notes': {'type': 'text', 'description': 
'Clinical notes and observations'} 42 | }, 43 | 'Appointment': { 44 | 'appointment_id': {'type': 'number', 'description': 'Unique identifier for the appointment'}, 45 | 'patient_id': { 46 | 'type': 'foreign_key', 47 | 'description': 'Reference to the patient', 48 | 'references': {'schema': 'Patient', 'field': 'patient_id'} 49 | }, 50 | 'appointment_date': {'type': 'datetime', 'description': 'Scheduled appointment date and time'}, 51 | 'appointment_type': {'type': 'text', 'description': 'Type of appointment (consultation, follow-up, etc.)'}, 52 | 'duration_minutes': {'type': 'number', 'description': 'Appointment duration in minutes'}, 53 | 'status': {'type': 'text', 'description': 'Appointment status (scheduled, completed, cancelled)'}, 54 | 'provider_name': {'type': 'text', 'description': 'Name of the healthcare provider'} 55 | } 56 | } 57 | 58 | prompts = { 59 | 'Patient': 'Generate realistic synthetic patient records for a general practice clinic with diverse demographics, common diagnoses, and clinical notes.', 60 | 'Appointment': 'Generate realistic appointment records with various appointment types, realistic scheduling patterns, and appropriate durations.' 
61 | } 62 | 63 | # Example 1: Azure OpenAI with GPT-4o 64 | print("--------------Testing Azure OpenAI GPT-4o----------------") 65 | 66 | # Configuration for Azure OpenAI GPT-4o 67 | model_config_gpt4o = ModelConfig( 68 | provider="azureopenai", 69 | model_name="gpt-4o", # This should match your deployment name in Azure 70 | temperature=0.7, 71 | max_tokens=4000, 72 | extra_kwargs={ 73 | # Required Azure OpenAI parameters 74 | "azure_endpoint": "https://your-resource-name.openai.azure.com/", # Replace with your endpoint 75 | "api_version": "2024-02-15-preview", # Use the latest API version 76 | } 77 | ) 78 | 79 | # Initialize generator with Azure OpenAI 80 | generator = SyntheticDataGenerator( 81 | model_config=model_config_gpt4o, 82 | # You can pass the API key directly or set AZURE_OPENAI_API_KEY environment variable 83 | # openai_api_key="your-azure-openai-api-key" 84 | ) 85 | 86 | # Define output directory 87 | output_dir = os.path.join( 88 | os.path.dirname(os.path.abspath(__file__)), 89 | "output", 90 | "test_azureopenai_models", 91 | "gpt-4o" 92 | ) 93 | 94 | sample_sizes = {'Patient': 20, 'Appointment': 30} 95 | 96 | # Generate and save to CSV 97 | 98 | results = generator.generate_for_schemas( 99 | schemas=schemas, 100 | prompts=prompts, 101 | sample_sizes=sample_sizes, 102 | output_dir=output_dir 103 | ) 104 | print(f"✅ GPT-4o data saved to {output_dir}") 105 | print(f"Generated {len(results['Patient'])} patients and {len(results['Appointment'])} appointments") 106 | -------------------------------------------------------------------------------- /examples/model_selection/output/test_grok_models/grok-3/products.csv: -------------------------------------------------------------------------------- 1 | id,name,company_id,category,launch_year,price_usd,is_ai_powered 2 | 1,QuantumCode AI,30,Software,2022,499.99, 3 | 2,NanoTech Drone,37,Hardware,2021,1299.99, 4 | 3,CloudSync Platform,1,Platform,2020,0.0, 5 | 4,SmartVision Glasses,23,Hardware,2023,799.99, 6 | 
5,DataSphere Analytics,3,Software,2019,299.99, 7 | 6,EcoBot Cleaner,40,Hardware,2022,349.99, 8 | 7,SecureNet VPN,5,Service,2021,9.99, 9 | 8,HoloMeet VR,32,Platform,2023,199.99, 10 | 9,AI Health Monitor,10,Hardware,2020,249.99, 11 | 10,CodeGen Pro,25,Software,2022,399.99, 12 | 11,SmartHome Hub,41,Hardware,2018,129.99, 13 | 12,CloudAI Optimizer,1,Software,2021,599.99, 14 | 13,RoboGuard Security,31,Hardware,2023,899.99, 15 | 14,StreamFlow Media,31,Platform,2020,14.99, 16 | 15,NeuroLink Interface,38,Hardware,2024,1999.99, 17 | 16,AI Design Studio,6,Software,2022,349.99, 18 | 17,EnergyGrid Manager,40,Software,2021,799.99, 19 | 18,VirtualSpace Host,18,Platform,2023,29.99, 20 | 19,BioScan Device,50,Hardware,2020,499.99, 21 | 20,AI Marketing Suite,14,Software,2022,199.99, 22 | 21,SmartWear Fitness,3,Hardware,2019,199.99, 23 | 22,DataVault Secure,23,Software,2021,99.99, 24 | 23,AutoPilot Drone,46,Hardware,2023,1499.99, 25 | 24,CloudGame Stream,31,Platform,2020,9.99, 26 | 25,AI Legal Advisor,2,Software,2022,399.99, 27 | 26,SmartLens Camera,6,Hardware,2021,699.99, 28 | 27,NetShield Firewall,4,Software,2019,49.99, 29 | 28,VR Training Sim,7,Platform,2023,299.99, 30 | 29,AI Supply Chain,9,Software,2021,999.99, 31 | 30,RoboArm Industrial,45,Hardware,2020,4999.99, 32 | 31,CloudMesh Network,24,Platform,2022,0.0, 33 | 32,SmartGrid Sensor,10,Hardware,2021,299.99, 34 | 33,AI Voice Assistant,15,Software,2020,99.99, 35 | 34,HoloDesk Display,1,Hardware,2023,1999.99, 36 | 35,DataFlow Engine,45,Software,2022,499.99, 37 | 36,AI Traffic System,15,Software,2021,1499.99, 38 | 37,NanoHealth Patch,31,Hardware,2023,199.99, 39 | 38,CloudEdu Platform,15,Platform,2020,19.99, 40 | 39,SmartLock Security,17,Hardware,2022,149.99, 41 | 40,AI Content Creator,47,Software,2021,299.99, 42 | 41,VR Fitness Coach,6,Platform,2023,24.99, 43 | 42,QuantumSecure Net,4,Software,2022,799.99, 44 | 43,SmartFarm Sensor,1,Hardware,2020,399.99, 45 | 44,AI Finance Planner,10,Software,2021,199.99, 46 | 45,CloudRetail 
Suite,46,Platform,2023,499.99, 47 | 46,RoboPet Companion,4,Hardware,2022,599.99, 48 | 47,AI Logistics Hub,45,Software,2020,999.99, 49 | 48,SmartMirror Display,31,Hardware,2021,349.99, 50 | 49,CloudHealth Portal,25,Platform,2023,29.99, 51 | 50,AI Code Debugger,16,Software,2022,199.99, 52 | 51,SmartEar Buds,30,Hardware,2020,129.99, 53 | 52,DataMind Analytics,6,Software,2021,599.99, 54 | 53,VR Travel Explorer,17,Platform,2023,14.99, 55 | 54,AI Energy Monitor,49,Software,2022,299.99, 56 | 55,SmartRoad Sensor,7,Hardware,2021,499.99, 57 | 56,CloudCreate Studio,3,Platform,2020,0.0, 58 | 57,AI Risk Assessor,30,Software,2023,799.99, 59 | 58,NanoBot Repair,3,Hardware,2022,999.99, 60 | 59,SmartCity Planner,8,Software,2021,1499.99, 61 | 60,VR WorkSpace,45,Platform,2023,49.99, 62 | 61,AI Music Composer,1,Software,2020,99.99, 63 | 62,SmartGlove Controller,18,Hardware,2022,199.99, 64 | 63,CloudSecure Backup,19,Platform,2021,9.99, 65 | 64,AI Fraud Detector,4,Software,2023,499.99, 66 | 65,RoboChef Kitchen,42,Hardware,2020,1299.99, 67 | 66,DataStream Processor,7,Software,2022,699.99, 68 | 67,SmartBoard Interactive,9,Hardware,2021,799.99, 69 | 68,CloudSocial Hub,47,Platform,2023,0.0, 70 | 69,AI Language Tutor,25,Software,2020,149.99, 71 | 70,SmartDrone Delivery,26,Hardware,2022,1999.99, 72 | 71,CloudAnalytics Dash,46,Platform,2021,299.99, 73 | 72,AI Image Enhancer,11,Software,2023,99.99, 74 | 73,SmartThermostat,23,Hardware,2020,129.99, 75 | 74,DataGuard Privacy,16,Software,2022,199.99, 76 | 75,VR Learning Lab,20,Platform,2021,399.99, 77 | 76,AI Inventory System,37,Software,2023,599.99, 78 | 77,SmartWatch Health,49,Hardware,2020,249.99, 79 | 78,CloudCollab Tools,16,Platform,2022,19.99, 80 | 79,AI Predictive Model,25,Software,2021,799.99, 81 | 80,RoboGarden Tool,46,Hardware,2023,399.99, 82 | 81,SmartAudio Speaker,21,Hardware,2020,99.99, 83 | 82,AI Chatbot Service,37,Software,2022,299.99, 84 | 83,CloudGaming Pro,49,Platform,2021,14.99, 85 | 84,SmartProjector AI,28,Hardware,2023,599.99, 
86 | 85,AI Customer Support,29,Software,2020,499.99, 87 | 86,VR Art Studio,1,Platform,2022,29.99, 88 | 87,SmartBike Tracker,1,Hardware,2021,149.99, 89 | 88,AI Data Cleaner,33,Software,2023,199.99, 90 | 89,CloudIoT Connect,41,Platform,2020,9.99, 91 | 90,SmartHelmet Safety,30,Hardware,2022,299.99, 92 | 91,AI Video Editor,28,Software,2021,99.99, 93 | 92,CloudFinance Tools,34,Platform,2023,399.99, 94 | 93,SmartFridge Monitor,41,Hardware,2020,499.99, 95 | 94,AI HR Assistant,38,Software,2022,599.99, 96 | 95,VR Event Space,42,Platform,2021,49.99, 97 | 96,SmartCar Interface,46,Hardware,2023,799.99, 98 | 97,AI Research Tool,32,Software,2020,299.99, 99 | 98,CloudMarket Insights,40,Platform,2022,199.99, 100 | 99,SmartDrone Camera,40,Hardware,2021,999.99, 101 | 100,AI Workflow Manager,39,Software,2023,499.99, 102 | -------------------------------------------------------------------------------- /docs/deep_dive/output_options.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data Output Options & Formats | Syda Deep Dive 3 | description: Learn about Syda's flexible output options - save synthetic data as CSV, JSON, databases, or custom formats with various configuration options. 4 | keywords: 5 | - synthetic data output 6 | - CSV data export 7 | - JSON data export 8 | - database output 9 | - data format options 10 | - file output configuration 11 | --- 12 | 13 | # Output Options 14 | 15 | SYDA offers flexible options for handling the output of generated data, allowing you to save results in various formats and locations. 
16 | 17 | ## Return Types 18 | 19 | By default, SYDA returns generated data as pandas DataFrames: 20 | 21 | ```python 22 | from syda import SyntheticDataGenerator, ModelConfig 23 | 24 | config = ModelConfig(provider="anthropic", model_name="claude-3-5-haiku-20241022") 25 | generator = SyntheticDataGenerator(model_config=config) 26 | 27 | # Generate data 28 | results = generator.generate_for_schemas( 29 | schemas={...}, 30 | sample_sizes={"Customer": 10} 31 | ) 32 | 33 | # Results is a dictionary of DataFrames 34 | customer_df = results["Customer"] 35 | 36 | # Work with the DataFrame 37 | print(f"Generated {len(customer_df)} customer records") 38 | print(customer_df.head()) 39 | ``` 40 | 41 | The returned `results` dictionary maps table names to pandas DataFrames, making it easy to analyze, transform, or further process the generated data. 42 | 43 | ## Saving to Files 44 | 45 | You can save generated data to files by specifying an output directory: 46 | 47 | ```python 48 | results = generator.generate_for_schemas( 49 | schemas={...}, 50 | sample_sizes={"Customer": 10, "Order": 25}, 51 | output_dir="output/crm_data" 52 | ) 53 | ``` 54 | 55 | When you provide an `output_dir`: 56 | 57 | 1. SYDA creates the directory if it doesn't exist 58 | 2. Each table's data is saved as a CSV file (e.g., `Customer.csv`, `Order.csv`) 59 | 3. 
The results dictionary still contains the DataFrames for immediate use 60 | 61 | ## Output Formats 62 | 63 | By default, SYDA saves data in CSV format, but you can specify other formats using the `output_formats` parameter: 64 | 65 | ```python 66 | results = generator.generate_for_schemas( 67 | schemas={...}, 68 | sample_sizes={"Customer": 10, "Order": 25}, 69 | output_dir="output/crm_data", 70 | output_formats=["csv", "json"] 71 | ) 72 | ``` 73 | 74 | Supported output formats include: 75 | 76 | - `csv`: Standard comma-separated values format 77 | - `json`: JSON format with records orientation 78 | 79 | ## Document Output 80 | 81 | When generating unstructured documents alongside structured data, SYDA saves the documents in their specified formats: 82 | 83 | ```python 84 | schemas = { 85 | 'Report': { 86 | '__template__': 'true', 87 | '__template_source__': 'templates/report.html', 88 | '__input_file_type__': 'html', 89 | '__output_file_type__': 'pdf', 90 | # ...other fields 91 | } 92 | } 93 | 94 | results = generator.generate_for_schemas( 95 | schemas=schemas, 96 | sample_sizes={"Report": 5}, 97 | output_dir="output/reports" 98 | ) 99 | ``` 100 | 101 | This creates: 102 | 103 | - A `Report` subdirectory with the generated documents (e.g., `Report_1.pdf`, `Report_2.pdf`, etc.) 104 | 105 | ## Output Directory Structure 106 | 107 | When using both structured data and document generation, SYDA creates an organized directory structure: 108 | 109 | ``` 110 | output/ 111 | ├── Customer.csv 112 | ├── Order.csv 113 | ├── OrderItem.csv 114 | ├── Invoice/ 115 | │ ├── Invoice_1.pdf 116 | │ ├── Invoice_2.pdf 117 | │ └── ... 118 | └── Report/ 119 | ├── Report_1.pdf 120 | ├── Report_2.pdf 121 | └── ... 122 | ``` 123 | 124 | This structure makes it easy to locate and manage both structured data and generated documents. 
125 | 126 | 127 | ## Working with Output Programmatically 128 | 129 | After generation, you can further process or transform the output data: 130 | 131 | ```python 132 | # Generate data 133 | results = generator.generate_for_schemas( 134 | schemas={...}, 135 | sample_sizes={"Customer": 10, "Order": 25} 136 | ) 137 | 138 | # Process Customer data 139 | customers = results["Customer"] 140 | vip_customers = customers[customers["annual_revenue"] > 1000000] 141 | 142 | # Process Order data 143 | orders = results["Order"] 144 | recent_orders = orders[orders["order_date"] > "2023-01-01"] 145 | 146 | # Join data for analysis 147 | merged = orders.merge(customers, left_on="customer_id", right_on="id") 148 | ``` 149 | 150 | ## Best Practices 151 | 152 | 1. **Use Descriptive Output Directories**: Create meaningful directory names for your output 153 | 2. **Choose Appropriate Formats**: Select output formats based on your downstream needs 154 | 3. **Process DataFrames Before Saving**: Apply transformations before writing to disk when needed 155 | 4. **Check Output Size**: Be mindful of output size for large generations 156 | 5. **Backup Results**: Keep the returned DataFrames for immediate use even when saving to disk 157 | 158 | ## Examples 159 | 160 | Explore [SQLAlchemy Example](../examples/structured_and_unstructured_mixed/sqlalchemy_models.md) and [Yaml Example](../examples/structured_and_unstructured_mixed/yaml_schemas.md) 161 | -------------------------------------------------------------------------------- /tests/test_custom_generators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the custom_generators module. 
"""
Tests for the custom_generators module.
"""
import pytest
import random
from unittest.mock import patch, MagicMock

from syda.custom_generators import GeneratorManager


def _constant_generator(value):
    """Return a ``(row, col_name)`` generator callable that always yields ``value``."""
    def generate(row, col_name):
        return value

    return generate


class TestGeneratorManager:
    """Registration and state round-trip checks for GeneratorManager."""

    def test_initialization(self):
        """A new manager exposes both registries, and both are dicts."""
        manager = GeneratorManager()
        registry_names = ("type_generators", "column_generators")

        # Both attributes must exist before their types are inspected.
        for registry_name in registry_names:
            assert hasattr(manager, registry_name)
        for registry_name in registry_names:
            assert isinstance(getattr(manager, registry_name), dict)

    def test_register_type_generator(self):
        """Registering without a column name stores the callable by type."""
        manager = GeneratorManager()
        generator = _constant_generator("test_value")

        manager.register_generator("test_type", generator)

        registered = manager.type_generators
        assert "test_type" in registered
        assert registered["test_type"] == generator

    def test_register_column_generator(self):
        """Registering with ``column_name`` stores the callable by column."""
        manager = GeneratorManager()
        generator = _constant_generator("test_value")

        manager.register_generator("test_type", generator, column_name="test_column")

        registered = manager.column_generators
        assert "test_column" in registered
        assert registered["test_column"] == generator

    def test_get_generator_state(self):
        """The state unpacks into the type registry and the column registry."""
        manager = GeneratorManager()
        type_gen = _constant_generator("type_value")
        column_gen = _constant_generator("column_value")

        manager.register_generator("custom_type", type_gen)
        manager.register_generator("custom_type2", column_gen, column_name="custom_column")

        # The state is expected to unpack as (type registry, column registry).
        type_generators, column_generators = manager.get_generator_state()

        assert "custom_type" in type_generators
        assert type_generators["custom_type"] == type_gen
        assert "custom_column" in column_generators
        assert column_generators["custom_column"] == column_gen

    def test_type_and_column_generators(self):
        """The same type name can hold a type-level and a column-level generator."""
        manager = GeneratorManager()
        type_gen = _constant_generator("type_value")
        column_gen = _constant_generator("column_value")

        # Same type name both times; only the second call targets a column.
        manager.register_generator("test_type", type_gen)
        manager.register_generator("test_type", column_gen, column_name="test_column")

        by_type = manager.type_generators
        assert "test_type" in by_type
        assert by_type["test_type"] == type_gen

        by_column = manager.column_generators
        assert "test_column" in by_column
        assert by_column["test_column"] == column_gen

    def test_restore_generator_state(self):
        """State captured from one manager can be restored into another."""
        source_manager = GeneratorManager()
        type_gen = _constant_generator("type_value")
        source_manager.register_generator("test_type", type_gen)

        snapshot = source_manager.get_generator_state()

        new_manager = GeneratorManager()
        new_manager.restore_generator_state(snapshot)

        restored = new_manager.type_generators
        assert "test_type" in restored
        assert restored["test_type"] == type_gen


# Additional tests for GeneratorManager functionality could be added here

# The following tests have been removed because they reference methods that don't exist
# in the actual GeneratorManager implementation:
# - test_generate_boolean
# - test_generate_number
# - test_generate_number_with_precision
71 |
Service Contract
72 |
Contract #: {{ contract_number }}
73 |
74 | 75 |
76 |

This Service Contract (the "Contract") is entered into as of {{ effective_date }} (the "Effective Date") by and between:

77 | 78 |

Service Provider: Our Company, with its principal place of business at 123 Business St, Business City, BS 12345

79 | 80 |

Client: {{ customer_name }}, with its principal place of business at {{ customer_address }}

81 |
82 | 83 |
84 |
1. SERVICES
85 |
86 |

The Service Provider agrees to provide to the Client the services described below (collectively, the "Services"):

87 |

{{ service_description }}

88 |
89 |
90 | 91 |
92 |
2. TERM
93 |
94 |

This Contract shall commence on the Effective Date and continue until {{ expiration_date }} (the "Term"), unless earlier terminated as provided herein.

95 |
96 |
97 | 98 |
99 |
3. COMPENSATION
100 |
101 |

In consideration for the Services, the Client shall pay the Service Provider a total fee of ${{ contract_value }}.

102 |

{{ payment_terms }}

103 |
104 |
105 | 106 |
107 |
4. RENEWAL
108 |
109 |

{{ renewal_terms }}

110 |
111 |
112 | 113 |
114 |
5. LEGAL TERMS
115 |
116 |

{{ legal_terms }}

117 |
118 |
119 | 120 |
121 |

IN WITNESS WHEREOF, the parties hereto have executed this Contract as of the Effective Date first above written.

122 | 123 |
124 |

Service Provider:

125 |
126 |

Name: ________________________

127 |

Title: ________________________

128 |

Date: ________________________

129 |
130 | 131 |
132 |

Client: {{ customer_name }}

133 |
134 |

Name: ________________________

135 |

Title: ________________________

136 |

Date: ________________________

137 |
138 |
#!/usr/bin/env python

"""
Example demonstrating how to use company AI proxies with the data generator.
This enables using the library in enterprise environments where AI API calls
are typically routed through internal proxy services.

⚠️ WARNING: The proxy configuration feature is experimental and has not been
thoroughly tested with actual enterprise proxy setups. You may need to adjust
the implementation to work with your specific company proxy.
"""

import os
import sys
from dotenv import load_dotenv

# Add the repository root to the path so we can import the syda package.
# This script lives in examples/structured_only/, so the root (the directory
# that contains syda/) is two levels up, not one.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))

from syda.generate import SyntheticDataGenerator
from syda.schemas import ModelConfig, ProxyConfig

# Load environment variables from .env file if it exists
load_dotenv()


def main():
    """Build three example proxy-backed generators and print usage notes.

    Nothing is sent to a model here: the generators are only constructed, and
    the one generation call is left commented out because it needs a real
    proxy endpoint.
    """
    # Simple schema for generating product data
    schema = {
        "product_id": "number",
        "product_name": "text",
        "description": "text",
        "price": "number",
        "category": "text",
        "in_stock": "boolean"
    }

    prompt = "Generate premium electronic products with detailed descriptions"

    # Example 1: Using OpenAI through a company proxy
    print("\n1. Using OpenAI with company proxy:")

    # Create a proxy configuration for your company's OpenAI proxy
    openai_proxy_config = ModelConfig(
        provider="openai",
        model_name="gpt-4",
        temperature=0.7,
        proxy=ProxyConfig(
            base_url="https://ai-proxy.company.com/v1",  # Replace with your company's proxy URL
            headers={
                "X-Company-Auth": "your-internal-token",  # Replace with actual auth if needed
                "X-Project-ID": "synthetic-data"
            },
            # Query parameters will be appended to the URL:
            # https://ai-proxy.company.com/v1?team=data-science&project=synthetic-data&track_usage=True&priority=1
            params={
                "team": "data-science",
                "project": "synthetic-data",
                "track_usage": True,  # Will be converted to string "True"
                "priority": 1  # Will be converted to string "1"
            }
        )
    )

    print("When configured with params, the actual URL used will be:")
    print("https://ai-proxy.company.com/v1?team=data-science&project=synthetic-data&track_usage=True&priority=1")

    # Create a generator that uses the proxy
    proxy_generator = SyntheticDataGenerator(model_config=openai_proxy_config)

    # Note: This will only work if your company's proxy is correctly configured
    # Uncomment the following lines to test with your actual proxy setup
    # proxy_data = proxy_generator.generate_data(
    #     schema_dict=schema,
    #     prompt=prompt,
    #     sample_size=2
    # )
    # print(proxy_data)
    print("(Example code - requires actual proxy configuration to run)")

    # Example 2: Using Anthropic models through a proxy
    print("\n2. Using Anthropic with company proxy:")

    anthropic_proxy_config = ModelConfig(
        provider="anthropic",
        model_name="claude-3-haiku-20240307",
        temperature=0.5,
        proxy=ProxyConfig(
            base_url="https://ai-proxy.company.com/anthropic",  # Replace with your company's Anthropic proxy
            headers={"X-API-Source": "synthetic-data-generator"}
        )
    )

    # Create generator with Anthropic proxy
    anthropic_proxy_generator = SyntheticDataGenerator(model_config=anthropic_proxy_config)

    # Note: This will only work if your company's Anthropic proxy is configured
    print("(Example code - requires actual proxy configuration to run)")

    # Example 3: Using a proxy with custom authentication
    print("\n3. Using proxy with custom authentication:")

    # Example of more complex proxy setup with custom auth
    complex_proxy_config = ModelConfig(
        provider="openai",
        model_name="gpt-4-turbo",
        temperature=0.8,
        proxy=ProxyConfig(
            base_url="https://ai-proxy.company.com/openai",
            headers={
                # Token comes from the environment; the literal is only a placeholder.
                "Authorization": f"Bearer {os.environ.get('COMPANY_PROXY_TOKEN', 'your-token-here')}",
                "X-Request-Source": "syda-library"
            },
            path_format="/proxy/llm/{provider}/completions"
        )
    )

    # Create generator with complex proxy setup
    complex_proxy_generator = SyntheticDataGenerator(model_config=complex_proxy_config)

    print("(Example code - requires actual proxy configuration to run)")

    print("\nNotes about proxy configuration:")
    print("- Replace the example URLs and tokens with your company's actual proxy settings")
    print("- Ensure you have the necessary permissions to access the proxy")
    print("- Check with your IT department for the correct headers and authentication method")
    print("- Consider storing proxy tokens in environment variables rather than hardcoding")


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /docs/deep_dive/foreign_keys.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Foreign Key Handling & Referential Integrity | Syda Deep Dive 3 | description: Master foreign key relationships in synthetic data generation - maintain perfect referential integrity across tables with AI-generated data using Syda's advanced dependency resolution. 4 | keywords: 5 | - foreign key relationships 6 | - referential integrity 7 | - database relationships 8 | - synthetic data relationships 9 | - AI data integrity 10 | - table dependencies 11 | - relational data generation 12 | --- 13 | 14 | # Foreign Key Handling 15 | 16 | One of SYDA's most powerful features is its ability to maintain referential integrity across multiple related tables. This document explains in detail how foreign key relationships are defined, detected, and handled during data generation. 17 | 18 | ## Foreign Key Definition Methods 19 | 20 | SYDA supports three different ways to define foreign key relationships: 21 | 22 | ### 1. Special `__foreign_keys__` Section 23 | 24 | You can explicitly define foreign keys using a special `__foreign_keys__` section in your schema: 25 | 26 | ```yaml 27 | # order.yaml 28 | id: 29 | type: integer 30 | primary_key: true 31 | customer_id: 32 | type: integer 33 | order_date: 34 | type: date 35 | 36 | __foreign_keys__: 37 | customer_id: [Customer, id] 38 | 39 | ``` 40 | 41 | 42 | ### 2. 
Field-Level `references` Property 43 | 44 | You can define foreign keys directly in field definitions using the `references` property: 45 | 46 | ```yaml 47 | # orderitem.yaml 48 | id: 49 | type: integer 50 | primary_key: true 51 | order_id: 52 | type: integer 53 | references: 54 | schema: Order 55 | field: id 56 | product_id: 57 | type: integer 58 | references: 59 | schema: Product 60 | field: id 61 | quantity: 62 | type: integer 63 | ``` 64 | 65 | This approach keeps the foreign key definition close to the field it applies to, making the schema more readable. 66 | 67 | ### 3. SQLAlchemy `ForeignKey` Definitions 68 | 69 | When using SQLAlchemy models, foreign keys are automatically detected from the `ForeignKey` definitions: 70 | 71 | ```python 72 | from sqlalchemy import Column, Integer, String, Date, ForeignKey 73 | from sqlalchemy.ext.declarative import declarative_base 74 | 75 | Base = declarative_base() 76 | 77 | class Order(Base): 78 | __tablename__ = 'orders' 79 | 80 | id = Column(Integer, primary_key=True) 81 | customer_id = Column(Integer, ForeignKey('customers.id')) 82 | order_date = Column(Date) 83 | ``` 84 | 85 | SYDA will automatically extract these relationships during schema analysis. 86 | 87 | 88 | ## Dependency Resolution 89 | 90 | Once foreign keys are defined, SYDA automatically determines the correct order for generating data: 91 | 92 | 1. **Dependency Graph**: SYDA builds a directed graph of dependencies between tables 93 | 2. **Topological Sort**: It performs a topological sort to determine the generation order 94 | 3. **Execution Order**: Tables are generated in an order that ensures all parent tables exist first 95 | 96 | For example, with these tables: 97 | 98 | - Customer (no dependencies) 99 | 100 | - Product (no dependencies) 101 | 102 | - Order (depends on Customer) 103 | 104 | - OrderItem (depends on Order and Product) 105 | 106 | SYDA would generate them in this order: 107 | 108 | 1. 
Customer and Product (can be generated in parallel) 109 | 110 | 2. Order (after Customer is available) 111 | 112 | 3. OrderItem (after both Order and Product are available) 113 | 114 | ## Foreign Key Value Assignment 115 | 116 | When generating data with foreign keys, SYDA ensures that each foreign key references a valid primary key in the parent table: 117 | 118 | 1. **Parent Table Access**: SYDA maintains access to all previously generated tables 119 | 2. **Random Selection**: By default, it randomly selects a valid foreign key value 120 | 3. **Consistent Foreign Keys**: When multiple columns in the same schema reference the same parent table, SYDA ensures they get the same parent record for consistency 121 | 122 | 123 | 124 | ## Best Practices for Foreign Key Handling 125 | 126 | 1. **Be Explicit**: Whenever possible, explicitly define foreign key relationships 127 | 2. **Consistent Naming**: Use consistent naming patterns (e.g., `table_id`) for foreign keys 128 | 3. **Handle Nullable Keys**: Specify whether foreign keys can be null 129 | 4. **Test Relationships**: Verify that generated data maintains proper referential integrity 130 | 5. **Document Dependencies**: Add comments or documentation about table dependencies 131 | 132 | ## Examples 133 | 134 | To see foreign key relationships in action, explore the example projects included with SYDA: 135 | 136 | 1. **SQLAlchemy Examples**: Check [sqlalchemy_models](../examples/structured_and_unstructured_mixed/sqlalchemy_models.md) for examples of foreign keys with SQLAlchemy models 137 | 2. **Dictionary Schema Examples**: See [Dictionary Examples](../examples/structured_only/dict_schemas.md) for dictionary-based foreign key handling 138 | 3. **YAML/JSON Schema Examples**: The [YAML Examples](../examples/structured_only/yaml_schemas.md) and [JSON Examples](../examples/structured_only/json_schemas.md) demonstrate foreign keys in file-based schemas 139 | 4. 
import inspect
import os
import yaml
import json
import pandas as pd
import random
import string
from datetime import datetime, date, timedelta
from sqlalchemy import inspect as sqla_inspect, Column
from typing import Dict, Optional, Any, Union

# Schema type name -> pandas dtype used for empty placeholder columns.
# Any type not listed here falls back to 'object'.
_PANDAS_DTYPES = {
    'integer': 'int64',
    'float': 'float64',
    'boolean': 'bool',
    'date': 'datetime64[ns]',
    'datetime': 'datetime64[ns]',
}

# Case-insensitive spellings recognised when coercing text to a boolean.
_TRUE_STRINGS = frozenset({'true', 't', 'yes', 'y', '1'})
_FALSE_STRINGS = frozenset({'false', 'f', 'no', 'n', '0', '', 'none', 'null', 'nan'})


def create_empty_dataframe(schema: Dict[str, str]) -> pd.DataFrame:
    """Create an empty pandas DataFrame with columns matching the schema types.

    Fields whose name both starts and ends with a double underscore are
    treated as metadata and skipped.

    Args:
        schema: Mapping of field name to schema type name.

    Returns:
        A zero-row DataFrame with one typed column per non-metadata field.
    """
    columns = {}
    for field, field_type in schema.items():
        # Skip metadata fields
        if field.startswith('__') and field.endswith('__'):
            continue
        # Non-string type specs (e.g. nested definitions) cannot be looked up;
        # they get a generic object column like any other unknown type.
        if isinstance(field_type, str):
            dtype = _PANDAS_DTYPES.get(field_type, 'object')
        else:
            dtype = 'object'
        columns[field] = pd.Series(dtype=dtype)

    return pd.DataFrame(columns)


def generate_random_value(field_type: str) -> Any:
    """Generate a random value based on field type for placeholder data.

    Args:
        field_type: Schema type name.

    Returns:
        - 'integer': int between 1 and 1000 inclusive.
        - 'float': float between 1.0 and 1000.0, rounded to 2 decimals.
        - 'boolean': True or False.
        - 'date': ISO-format date string within the last 5 * 365 days.
        - 'datetime': ISO-format datetime string within roughly the last
          5 * 365 days.
        - anything else: random ASCII-letter string of length 5 to 15.
    """
    if field_type == 'integer':
        return random.randint(1, 1000)
    if field_type == 'float':
        return round(random.uniform(1.0, 1000.0), 2)
    if field_type == 'boolean':
        return random.choice([True, False])
    if field_type == 'date':
        # Random date in last 5 years
        days = random.randint(0, 365 * 5)
        return (date.today() - timedelta(days=days)).isoformat()
    if field_type == 'datetime':
        # Random datetime in last 5 years
        days = random.randint(0, 365 * 5)
        hours = random.randint(0, 23)
        minutes = random.randint(0, 59)
        seconds = random.randint(0, 59)
        dt = datetime.now() - timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
        return dt.isoformat()

    # text or any other type: generate random string
    length = random.randint(5, 15)
    return ''.join(random.choice(string.ascii_letters) for _ in range(length))


def get_schema_prompt(
    schema: Dict[str, str],
    table_name: str,
    description: Optional[str] = None
) -> str:
    """Generate a prompt for the LLM based on schema information.

    Args:
        schema: Accepted for interface compatibility; the prompt text does
            not currently use it.
        table_name: Name inserted into the prompt.
        description: Optional text appended after a colon when non-empty.

    Returns:
        The prompt string.
    """
    prompt = f"Generate data for {table_name}"
    if description:
        prompt += f": {description}"
    return prompt


def _coerce_bool(value: Any) -> bool:
    """Convert one parsed JSON value to a bool without treating "false" as True."""
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _TRUE_STRINGS:
            return True
        if token in _FALSE_STRINGS:
            return False
        # Unrecognised non-empty text keeps plain truthiness.
        return bool(token)
    if value is None:
        return False
    # NaN marks a missing value after DataFrame construction; treat it as
    # False, mirroring the 0 / 0.0 fill used for missing numerics.
    if isinstance(value, float) and value != value:
        return False
    return bool(value)


def parse_dataframe_output(
    text: str,
    schema: Dict[str, str]
) -> pd.DataFrame:
    """Parse LLM output text into a pandas DataFrame based on schema.

    The text must be JSON: a list becomes one row per element and a single
    object becomes one row. Columns named in ``schema`` are then converted:
    integers and floats are coerced numerically with bad values replaced by
    0 / 0.0, booleans are parsed (so the strings "false", "no" and "0" become
    False), and dates/datetimes go through ``pd.to_datetime`` with
    unparseable values set to NaT.

    Args:
        text: Raw JSON text.
        schema: Mapping of column name to schema type name.

    Returns:
        The parsed DataFrame. On any parsing or conversion error the error is
        printed and an empty DataFrame matching ``schema`` is returned.
    """
    try:
        # Try to parse as JSON
        data = json.loads(text)

        # Convert to DataFrame
        if isinstance(data, list):
            df = pd.DataFrame(data)
        elif isinstance(data, dict):
            df = pd.DataFrame([data])
        else:
            # If not a valid JSON structure, raise error
            raise ValueError("Output is not a valid JSON structure")

        # Type conversion based on schema
        for col, dtype in schema.items():
            if col not in df.columns:
                continue
            if dtype == 'integer':
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
            elif dtype == 'float':
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
            elif dtype == 'boolean':
                # A bare astype(bool) maps every non-empty string to True.
                df[col] = df[col].map(_coerce_bool).astype(bool)
            elif dtype in ('date', 'datetime'):
                df[col] = pd.to_datetime(df[col], errors='coerce')

        return df
    except Exception as e:
        # Deliberate best-effort: callers get an empty, correctly-shaped
        # frame instead of an exception.
        print(f"Error parsing output: {e}")
        # Return empty DataFrame matching schema
        return create_empty_dataframe(schema)


def save_dataframe(
    df: pd.DataFrame,
    output_file: str
) -> str:
    """Save a dataframe to a file (CSV, Excel, JSON, or Parquet).

    The format comes from the file extension, case-insensitively. JSON is
    written as JSON Lines (``orient='records', lines=True``). An unsupported
    extension prints a warning and the data is written as CSV next to the
    requested path. The parent directory is created if needed.

    Args:
        df: Data to write.
        output_file: Destination path.

    Returns:
        The path actually written, which differs from ``output_file`` only in
        the CSV fallback case.
    """
    # Create output directory if it doesn't exist; exist_ok avoids a race
    # between checking and creating.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save based on file extension
    file_ext = os.path.splitext(output_file)[1].lower()
    written_file = output_file

    if file_ext == '.csv':
        df.to_csv(output_file, index=False)
    elif file_ext in ('.xls', '.xlsx'):
        df.to_excel(output_file, index=False)
    elif file_ext == '.json':
        df.to_json(output_file, orient='records', lines=True)
    elif file_ext == '.parquet':
        df.to_parquet(output_file, index=False)
    else:
        print(f"[WARNING] Unsupported file format: {file_ext}. Defaulting to CSV.")
        written_file = os.path.splitext(output_file)[0] + '.csv'
        df.to_csv(written_file, index=False)

    print(f"[OK] Successfully wrote {len(df)} rows to {written_file}")
    return written_file