├── github ├── __init__.py ├── features │ ├── fraud │ │ ├── __init__.py │ │ ├── github_event.chalk.sql │ │ └── prompts.py │ ├── groq │ │ ├── __init__.py │ │ └── groq.py │ ├── cerebras │ │ ├── __init__.py │ │ └── cerebras.py │ ├── github │ │ ├── __init__.py │ │ ├── github_repo_document_vector_database.py │ │ ├── github_archive.py │ │ ├── github_repo.py │ │ └── github_user.py │ ├── search │ │ ├── __init__.py │ │ ├── prompts.py │ │ └── github_search.py │ ├── __init__.py │ ├── named_queries.py │ └── github_feature_set.py └── sql │ ├── github_archive_stars.chalk.sql │ ├── github_archive_stars.sql │ ├── github_owner.sql │ └── github_repo.sql ├── 14_codegen ├── __init__.py ├── score_resolvers.py ├── models.py ├── README.md └── custom_model.py ├── call_recordings ├── __init__.py ├── features │ ├── fathom │ │ ├── __init__.py │ │ ├── fathom_meeting_insights_sales.py │ │ └── fathom_message_webhook.py │ └── __init__.py └── sql │ ├── fathom_call.chalk.sql │ ├── fathom_message.chalk.sql │ └── fathom_call_data.chalk.sql ├── full_examples ├── batch_ml │ ├── tests │ │ ├── __init__.py │ │ └── test_batch_prediction.py │ ├── models │ │ └── fraud_model.onnx │ ├── chalk.yaml │ ├── src │ │ ├── datasources.py │ │ ├── resolvers │ │ │ ├── sql │ │ │ │ ├── get_users.chalk.sql │ │ │ │ ├── get_users_offline.chalk.sql │ │ │ │ ├── get_transactions.chalk.sql │ │ │ │ └── get_transactions_offline.chalk.sql │ │ │ └── fraud_model.py │ │ ├── queries.py │ │ └── models.py │ └── pyproject.toml ├── image_processing │ ├── src │ │ ├── __init__.py │ │ ├── feature_sets.py │ │ └── resolvers.py │ ├── requirements.txt │ ├── chalk.yaml │ ├── pyproject.toml │ └── README.md ├── fraud_transactions_with_llm │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_denylisted.py │ ├── src │ │ ├── emailage │ │ │ ├── __init__.py │ │ │ └── client.py │ │ ├── datasources.py │ │ ├── users.chalk.sql │ │ ├── groq.py │ │ ├── __init__.py │ │ ├── transactions.chalk.sql │ │ ├── transactions_offline.chalk.sql │ │ ├── denylist.py │ │ ├── streaming.py │ │ └── experian │ │ │ └── __init__.py │ ├── requirements.txt │ ├── chalk.yaml │ ├── .gitignore │ ├── .chalkignore │ └── README.md ├── dynamic_pricing │ ├── requirements.txt │ ├── chalk.yaml │ ├── src │ │ ├── sql │ │ │ └── hotels.chalk.sql │ │ └── datasources.py │ └── README.md └── sagemaker │ ├── requirements.txt │ ├── src │ ├── datasources.py │ ├── resolvers │ │ ├── customers.chalk.sql │ │ └── transactions.chalk.sql │ └── models.py │ ├── steps │ ├── evaluate.py │ ├── training.py │ └── dataset.py │ ├── chalk_sagemaker_pipeline.py │ └── README.md ├── mypy.ini ├── 10_migrations └── README.md ├── requirements.txt ├── unstructured_data ├── requirements.txt ├── src │ ├── datasources.py │ ├── users.chalk.sql │ ├── __init__.py │ ├── transactions.chalk.sql │ ├── denylist.py │ ├── models.py │ └── resolvers.py ├── chalk.yaml ├── .gitignore └── .chalkignore ├── 11_sql ├── user_views.sql ├── 2_dataframes.py ├── README.md └── 1_scalars.py ├── 13_airflow ├── airflow.jpg ├── get_users.chalk.sql ├── features.py ├── shared_environment.py ├── isolated_environment.py ├── polling.py └── chalk_airflow.py ├── 12_model └── churn_model.skops ├── marketplace ├── item.chalk.sql ├── seller.chalk.sql ├── user.chalk.sql ├── interaction │ └── interaction_type.py ├── item_price.chalk.sql ├── interaction.chalk.sql ├── review.chalk.sql ├── __init__.py ├── resolvers.py ├── item_category │ └── item_category_value_enum.py ├── tests.py ├── lancedb.py └── named_queries.py ├── marketing ├── event_type.chalk.sql ├── product_area.chalk.sql ├── 
session.chalk.sql ├── user.chalk.sql ├── event.chalk.sql ├── requirements.txt ├── customer_interaction.chalk.sql └── __init__.py ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── .gitignore ├── 08_testing ├── 2_integration_tests.py ├── README.md └── 1_unit_tests.py ├── 06_dataframe ├── 3_projections.py ├── 5_aggregations.py ├── 4_filters_and_projections.py ├── 1_creating_dataframes.py ├── 2_filters.py ├── 6_self_joins.py └── README.md ├── SECURITY.md ├── 01_features ├── 6_has_one_has_many.py ├── 4_has_one.py ├── 3_primary_keys.py ├── 5_has_many.py ├── 1_feature_types.py ├── 7_feature_time.py ├── 8_constructing_features.py └── 2_custom_feature_types.py ├── 09_github_actions ├── 1_install_chalk_cli.yaml ├── 2_deploy_with_chalk.yaml ├── 3_deploy_preview.yaml └── README.md ├── 07_streaming ├── 1_mapping_stream.py ├── 4_continuous_aggregation.py ├── 3_window_sql.py ├── 2_window_dataframe.py └── README.md ├── 05_feature_discovery ├── 1_descriptions.py ├── 4_unified.py ├── 3_tags.py ├── 2_owners.py └── README.md ├── predictive_maintenance ├── 2_time_query.py ├── 3_keep_data_fresh.py ├── 4_customer_sensors.py ├── 1_device_data.py └── README.md ├── 03_caching ├── 7_prefetching.py ├── 2_lastest_value.py ├── 6_cache_busting.py ├── 4_override_max_staleness.py ├── 5_override_cache_values.py ├── 3_intermediates.py ├── 1_basic_caching.py └── README.md ├── credit ├── 4_aggregate_tradelines.py ├── 3_bureau_api.py ├── README.md └── 2_accounts.py ├── 04_scheduling ├── 3_sample_arguments.py ├── 1_cron.py ├── 2_filtered_cron.py └── README.md ├── fraud ├── 2_patterns.py ├── 3_identity.py ├── 5_account_takeover.py ├── 1_return.py └── README.md ├── 02_resolvers ├── 2_multiple_features_resolver.py ├── 3_downstream_scalars.py ├── 4_downstream_dataframes.py ├── 1_scalar_resolver.py ├── 5_tagged_resolvers.py └── 6_sharing_resolvers.py ├── ecommerce ├── 1_users_sellers.py ├── 2_interactions.py ├── README.md └── 3_streams.py └── mocks └── __init__.py /github/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /14_codegen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /call_recordings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/fraud/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/groq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/cerebras/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/github/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /call_recordings/features/fathom/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /full_examples/batch_ml/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /full_examples/image_processing/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = chalk.mypy_plugin 3 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /10_migrations/README.md: -------------------------------------------------------------------------------- 1 | # Migrations 2 | 3 | Examples to come! 4 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/emailage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy>=1.11.8 2 | pydantic 3 | cattrs 4 | requests -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[runtime,postgresql] 2 | -------------------------------------------------------------------------------- /full_examples/sagemaker/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy 2 | sagemaker 3 | scikit-learn 4 | -------------------------------------------------------------------------------- /unstructured_data/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[runtime,postgresql] 2 | google-generativeai 3 | -------------------------------------------------------------------------------- /11_sql/user_views.sql: -------------------------------------------------------------------------------- 1 | select sum(mins) as viewed_minutes from view_counts where uid = :uid 2 | -------------------------------------------------------------------------------- /13_airflow/airflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalk-ai/examples/HEAD/13_airflow/airflow.jpg -------------------------------------------------------------------------------- /call_recordings/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .fathom_feature_set import FathomCall, FathomMessage 2 | -------------------------------------------------------------------------------- /12_model/churn_model.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalk-ai/examples/HEAD/12_model/churn_model.skops -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/requirements.txt: 
-------------------------------------------------------------------------------- 1 | chalkpy[runtime,postgresql] 2 | google-generativeai 3 | -------------------------------------------------------------------------------- /unstructured_data/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | 3 | 4 | PostgreSQLSource(name="postgres") -------------------------------------------------------------------------------- /full_examples/image_processing/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[runtime] 2 | beautifulsoup4 3 | requests 4 | pillow 5 | CairoSVG==2.5.2 6 | -------------------------------------------------------------------------------- /full_examples/sagemaker/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | 3 | 4 | PostgreSQLSource(name="postgres") 5 | -------------------------------------------------------------------------------- /github/features/search/__init__.py: -------------------------------------------------------------------------------- 1 | from .github_search import GithubSearch 2 | 3 | __all__ = [ 4 | "GithubSearch", 5 | ] 6 | -------------------------------------------------------------------------------- /full_examples/batch_ml/models/fraud_model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalk-ai/examples/HEAD/full_examples/batch_ml/models/fraud_model.onnx -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | 3 | 4 | PostgreSQLSource(name="postgres") -------------------------------------------------------------------------------- /full_examples/batch_ml/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Sandbox 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: pyproject.toml 6 | -------------------------------------------------------------------------------- /unstructured_data/src/users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: User 2 | -- source: postgres 3 | select 4 | id, 5 | email, 6 | dob, 7 | name 8 | from usrs -------------------------------------------------------------------------------- /unstructured_data/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo Project 2 | environments: 3 | default: 4 | runtime: python312 5 | requirements: requirements.txt 6 | -------------------------------------------------------------------------------- /marketplace/item.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Item 2 | -- source: postgres 3 | select 4 | hid as id, 5 | title, 6 | description 7 | from marketplace_products 8 | -------------------------------------------------------------------------------- /13_airflow/get_users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- The features given to us by the user. 
2 | -- resolves: user 3 | -- source: postgres 4 | select id, full_name as name, email from users; 5 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: requirements.txt 6 | 7 | -------------------------------------------------------------------------------- /full_examples/image_processing/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: requirements.txt 6 | 7 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/src/sql/hotels.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Hotel 2 | -- source: postgres 3 | select 4 | id, 5 | num_rooms, 6 | location 7 | from 8 | hotels; 9 | -------------------------------------------------------------------------------- /marketing/event_type.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: EventType 2 | -- source: postgres 3 | select 4 | name, 5 | product_area_type, 6 | event_weight 7 | from event_types 8 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource, SnowflakeSource 2 | 3 | pg = PostgreSQLSource(name="pg") 4 | sf = SnowflakeSource(name="sf") 5 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: User 2 | -- source: postgres 3 | select 4 | id, 5 | email, 6 | dob, 7 | name 8 | from usrs -------------------------------------------------------------------------------- /marketing/product_area.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: ProductArea 2 | -- source: postgres 3 | select 4 | name as type, 5 | created_at, 6 | description 7 | from product_areas 8 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo Project 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: requirements.txt 6 | -------------------------------------------------------------------------------- /call_recordings/sql/fathom_call.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: FathomCall 3 | -- source: clickhouse 4 | select 5 | recording_id as id 6 | from "fathom-calls-etl-01" 7 | ; 8 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get users from postgres 2 | -- source: pg 3 | -- resolves: User 4 | SELECT 5 | id, 6 | name 7 | FROM 8 | users 9 | -------------------------------------------------------------------------------- 
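A note on how these `.chalk.sql` files work: the `-- resolves:` header names the feature class the query populates, `-- source:` names the datasource, and the selected columns map onto the class's fields. The resolver above resolves `User`, whose real definition lives in full_examples/batch_ml/src/models.py (not reproduced at this point in the listing); a minimal sketch, with assumed field types, would look like:

from chalk.features import features


@features
class User:
    # Mirrors the columns selected in get_users.chalk.sql;
    # the types here are assumptions, since models.py is not shown.
    id: int
    name: str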
/full_examples/image_processing/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "image-chalk" 3 | version = "0.1.0" 4 | description = "Image Chalk Demo" 5 | readme = "README.md" 6 | dependencies = [] 7 | -------------------------------------------------------------------------------- /github/features/groq/groq.py: -------------------------------------------------------------------------------- 1 | GROQ_API_KEY: str = "" 2 | GROQ_MODEL_PROVIDER: str = "openai" 3 | GROQ_MODEL: str = "llama3-8b-8192" 4 | GROQ_BASE_URL: str = "https://api.groq.com/openai/v1" 5 | -------------------------------------------------------------------------------- /github/sql/github_archive_stars.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubArchive 3 | -- source: postgres 4 | select id, name as path, url as api_url, stars from github_archive_stars 5 | -------------------------------------------------------------------------------- /marketing/session.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: Session 2 | -- source: postgres 3 | select 4 | id, 5 | created_at, 6 | end_at, 7 | duration, 8 | user_id 9 | from sessions 10 | -------------------------------------------------------------------------------- /unstructured_data/src/__init__.py: -------------------------------------------------------------------------------- 1 | import google.generativeai as genai 2 | 3 | 4 | # @before_all 5 | # def init_model(): 6 | # genai.configure(api_key="AIzaSyCEgFSw5mRj-POYuvhJJKhIfw76NJxaUo0") 7 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_users_offline.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get users from snowflake 2 | -- source: sf 3 | -- resolves: User 4 | SELECT 5 | id, 6 | name 7 | FROM 8 | "ML.USERS" 9 | -------------------------------------------------------------------------------- /github/features/cerebras/cerebras.py: -------------------------------------------------------------------------------- 1 | CEREBRAS_API_KEY: str = "" 2 | CEREBRAS_MODEL_PROVIDER: str = "openai" 3 | CEREBRAS_MODEL: str = "llama3.1-8b" 4 | CEREBRAS_BASE_URL: str = "https://api.cerebras.ai/v1" 5 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/groq.py: -------------------------------------------------------------------------------- 1 | GROQ_API_KEY: str ="" 2 | GROQ_MODEL_PROVIDER: str = "openai" 3 | GROQ_MODEL: str = "llama3-8b-8192" 4 | GROQ_BASE_URL: str = "https://api.groq.com/openai/v1" 5 | -------------------------------------------------------------------------------- /marketing/user.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: User 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | first_name, 7 | last_name, 8 | email, 9 | birthday 10 | from users 11 | -------------------------------------------------------------------------------- /unstructured_data/src/transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: postgres 3 | select 4 | id, 5 | amount, 6 | user_id, 7 | at, 8 | description as memo 9 | from txns 10 | 
-------------------------------------------------------------------------------- /full_examples/sagemaker/src/resolvers/customers.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Customer 2 | -- source: postgres 3 | select 4 | id, 5 | name, 6 | email, 7 | dob, 8 | age, 9 | income 10 | from 11 | users; 12 | -------------------------------------------------------------------------------- /marketing/event.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: Event 2 | -- source: postgres 3 | select 4 | id, 5 | created_at, 6 | name, 7 | product_area_type, 8 | user_id, 9 | session_id 10 | from events 11 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | from chalk.streams import KafkaSource 3 | 4 | postgres = PostgreSQLSource(name="pg") 5 | kafka_stream = KafkaSource(name="stream") 6 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/__init__.py: -------------------------------------------------------------------------------- 1 | import google.generativeai as genai 2 | 3 | 4 | # @before_all 5 | # def init_model(): 6 | # genai.configure(api_key="AIzaSyCEgFSw5mRj-POYuvhJJKhIfw76NJxaUo0") 7 | -------------------------------------------------------------------------------- /marketplace/seller.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Seller 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | name, 7 | zipcode, 8 | email, 9 | phone_number 10 | from marketplace_sellers 11 | -------------------------------------------------------------------------------- /marketplace/user.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: User 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | first_name, 7 | last_name, 8 | email, 9 | birthday 10 | from marketplace_users 11 | -------------------------------------------------------------------------------- /github/sql/github_archive_stars.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubArchive 3 | -- source: postgres 4 | select id, name as path, url as api_url, stars 5 | from github_archive_stars 6 | order by stars desc 7 | limit 100 8 | ; 9 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: postgres 3 | select 4 | id, 5 | amount, 6 | user_id, 7 | at, 8 | description as memo 9 | from txns 10 | -------------------------------------------------------------------------------- /full_examples/sagemaker/src/resolvers/transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: postgres 3 | select 4 | id, 5 | amt, 6 | customer_id, 7 | confirmed_fraud, 8 | created_at as at 9 | from 10 | transactions; 11 | -------------------------------------------------------------------------------- /marketplace/interaction/interaction_type.py: 
-------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class InteractionType(str, Enum): 5 | PRODUCT_INQUIRY = "productInquiry" 6 | ORDER_PLACEMENT = "orderPlacement" 7 | FEEDBACK_AND_REVIEWS = "feedbackAndReviews" 8 | -------------------------------------------------------------------------------- /marketplace/item_price.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: ItemPrice 2 | -- source: postgres 3 | select 4 | hid as id, 5 | price as value, 6 | created_at, 7 | product_hid as item_id, 8 | seller_hid as seller_id 9 | from marketplace_product_prices 10 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/transactions_offline.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: bigquery 3 | -- type: offline 4 | select 5 | id, 6 | amount, 7 | user_id, 8 | updated_at as at, 9 | description as memo 10 | from transactions_log 11 | -------------------------------------------------------------------------------- /unstructured_data/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Python artifacts 3 | venv 4 | *venv* 5 | virtualenv 6 | __pycache__ 7 | *.pyc 8 | *.py~ 9 | .eggs 10 | *.egg-info 11 | dist 12 | 13 | # VSCode 14 | .vscode 15 | 16 | # Intellij 17 | *.iml 18 | idea 19 | 20 | # Git artifacts 21 | .git 22 | .github 23 | -------------------------------------------------------------------------------- /full_examples/image_processing/README.md: -------------------------------------------------------------------------------- 1 | # Image Processing Example 2 | 3 | In this example, we show how to scrape images from websites and process them. 4 | 5 | The images are processed with the Python Pillow library and flagged by an image model 6 | hosted on a SageMaker endpoint.
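The example's scraping and processing resolvers live in src/ and are not reproduced in this excerpt, so here is a minimal sketch of the Pillow step the README describes; the function name, sizes, and JPEG output are illustrative assumptions, not the example's actual code:

from io import BytesIO

import requests
from PIL import Image


def to_model_input(url: str, size: tuple[int, int] = (224, 224)) -> bytes:
    # Download the image, normalize it to RGB, and shrink it to the
    # thumbnail size an image-classification endpoint typically expects.
    img = Image.open(BytesIO(requests.get(url, timeout=10).content)).convert("RGB")
    img.thumbnail(size)
    buf = BytesIO()
    img.save(buf, format="JPEG")
    return buf.getvalue()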
7 | -------------------------------------------------------------------------------- /marketplace/interaction.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Interaction 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | interaction_type, 7 | seller_hid as seller_id, 8 | user_hid as user_id, 9 | product_hid as item_id, 10 | price 11 | from marketplace_interactions 12 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get transactions from postgres 2 | -- source: pg 3 | -- resolves: Transaction 4 | SELECT 5 | transaction_id as id, 6 | user_id, 7 | merchant_id, 8 | amount, 9 | ts, 10 | category 11 | FROM 12 | transactions 13 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Python artifacts 3 | venv 4 | *venv* 5 | virtualenv 6 | __pycache__ 7 | *.pyc 8 | *.py~ 9 | .eggs 10 | *.egg-info 11 | dist 12 | 13 | # VSCode 14 | .vscode 15 | 16 | # Intellij 17 | *.iml 18 | idea 19 | 20 | # Git artifacts 21 | .git 22 | .github 23 | -------------------------------------------------------------------------------- /marketplace/review.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Review 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | star_rating, 7 | review_headline, 8 | review_body, 9 | product_hid as item_id, 10 | user_hid as user_id, 11 | seller_hid as seller_id 12 | from marketplace_reviews 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for a new example 4 | title: "[FEATURE]" 5 | labels: enhancement 6 | --- 7 | 8 | **Desired Example** 9 | A description of the example you'd like to see! 
10 | 11 | **Is this request related to a problem?** 12 | 13 | **Additional Context** 14 | -------------------------------------------------------------------------------- /13_airflow/features.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import features 3 | 4 | 5 | @features 6 | class User: 7 | id: int 8 | name: str 9 | email: str 10 | email_domain: str 11 | 12 | 13 | @online 14 | def get_email_domain(email: User.email) -> User.email_domain: 15 | return email.split("@")[1].lower() 16 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from chalk.client import ChalkClient 3 | 4 | 5 | @pytest.fixture(scope="session") 6 | def client(): 7 | # OPTION 2 8 | # chalk apply --branch 9 | # CHALK_CLIENT_ID 10 | # CHALK_CLIENT_SECRET 11 | return ChalkClient(branch=True) 12 | -------------------------------------------------------------------------------- /github/features/fraud/github_event.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubEvent 3 | -- source: postgres 4 | select 5 | event_id as id, 6 | event_type as type, 7 | created_at, 8 | public, 9 | payload_action, 10 | repo_id, 11 | repo_name, 12 | -- actor_id as user_id, 13 | actor_login as username 14 | from github_events 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dependencies 2 | /node_modules 3 | node_modules 4 | /.pnp 5 | .pnp.js 6 | 7 | # testing 8 | /coverage 9 | 10 | # misc 11 | .DS_Store 12 | *.pem 13 | 14 | # python 15 | __pycache__ 16 | *.pyc 17 | .eggs 18 | *.egg-info 19 | venv/ 20 | dist 21 | build 22 | engine.iml 23 | 24 | # VSCode 25 | .vscode/ 26 | 27 | # intellij 28 | *.iml 29 | .idea 30 | 31 | -------------------------------------------------------------------------------- /github/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .github.github_archive import GithubArchive 2 | from .github.github_repo import GithubRepo 3 | from .github.github_repo_document_vector_database import GithubRepoDocVDB 4 | from .github.github_user import GithubUser 5 | 6 | __all__ = [ 7 | "GithubArchive", 8 | "GithubRepo", 9 | "GithubRepoDocVDB", 10 | "GithubUser", 11 | ] 12 | -------------------------------------------------------------------------------- /marketing/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[bigquery,openai,postgresql,runtime,vertexai,clickhouse] 2 | google-generativeai 3 | httpx~=0.27.2 4 | lancedb~=0.24.1 5 | openai>=1.52.2 6 | orjson~=3.11.0 7 | pydantic~=1.10.22 8 | pygithub~=2.6.1 9 | requests~=2.32.3 10 | numpy~=1.26.4 11 | protobuf~=5.29.5 12 | pyarrow~=18.1.0 13 | marimo~=0.14.13 14 | pandas~=2.2.3 15 | pytest~=8.4.2 16 | -------------------------------------------------------------------------------- /marketing/customer_interaction.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: CustomerInteraction 2 | -- source: postgres 3 | select 4 | id, 5 | created_at, 6 | sentiment_rating, 7 | correspondence_subject, 8 | correspondence_body, 9 
| communication_channel, 10 | communication_direction, 11 | user_event_id, 12 | user_id, 13 | product_area_id 14 | from correspondences 15 | -------------------------------------------------------------------------------- /call_recordings/sql/fathom_message.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: FathomMessage 3 | -- source: clickhouse 4 | select 5 | id, 6 | recording_id, 7 | message_id, 8 | url, 9 | title, 10 | date, 11 | timestamp, 12 | speaker, 13 | organization as organization_raw, 14 | message, 15 | action_item, 16 | watch_link 17 | from "fathom-messages-etl-01" 18 | ; 19 | -------------------------------------------------------------------------------- /08_testing/2_integration_tests.py: -------------------------------------------------------------------------------- 1 | # You can apply changes with the `--no-promote` 2 | # flag to create a preview environment: 3 | # 4 | # > chalk apply --no-promote 5 | 6 | # Once your code has been deployed, you can query 7 | # against the resulting deployment id: 8 | # 9 | # > chalk query --deployment $DEPLOYMENT_ID \ 10 | # --in user.id=1 \ 11 | # --out user.id \ 12 | # --out user.email 13 | -------------------------------------------------------------------------------- /github/sql/github_owner.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubOwner 3 | -- source: postgres 4 | -- ELVIS: TODO: DISABLED USING THE ACTUAL PYTHON 5 | select 6 | id, 7 | hid, 8 | login, 9 | node_id, 10 | avatar_url, 11 | url, 12 | html_url, 13 | followers_url, 14 | following_url, 15 | starred_url, 16 | organizations_url, 17 | repos_url, 18 | type 19 | from github_owner 20 | -------------------------------------------------------------------------------- /unstructured_data/.chalkignore: -------------------------------------------------------------------------------- 1 | # .gitignore compatible file for ignoring files with chalk apply 2 | # Chalk also respects .gitignore 3 | 4 | # Ignore test files 5 | tests 6 | 7 | # Python artifacts 8 | venv 9 | *venv* 10 | virtualenv 11 | __pycache__ 12 | *.pyc 13 | *.py~ 14 | .eggs 15 | *.egg-info 16 | dist 17 | 18 | # VSCode 19 | .vscode 20 | 21 | # Intellij 22 | *.iml 23 | idea 24 | 25 | # Git artifacts 26 | .git 27 | .github 28 | -------------------------------------------------------------------------------- /06_dataframe/3_projections.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, features 2 | 3 | 4 | @features 5 | class Transaction: 6 | id: int 7 | user_id: "User.id" 8 | memo: str 9 | merchant: str 10 | amount: float 11 | 12 | 13 | @features 14 | class User: 15 | id: int 16 | txns: DataFrame[Transaction] 17 | 18 | 19 | # You can project the transactions down to any of the 20 | # columns on the transaction 21 | amounts = User.txns[Transaction.amount] -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_transactions_offline.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get transactions from snowflake 2 | -- source: sf 3 | -- resolves: Transaction 4 | -- tag: ['model_sample'] 5 | -- incremental: 6 | -- mode: row 7 | -- lookback_period: 60m 8 | -- incremental_column: ts 9 | SELECT 10 | transaction_id as id, 11 | user_id, 12 | merchant_id, 13 | amount, 14 | ts, 15 | category 16 | FROM 17 | "ML.TRANSACTIONS" 18 | WHERE category <> 'pending' 19 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Price Prediction with Chalk 2 | 3 | In this example, we show how to write dynamic pricing features in Chalk. The goal 4 | is to show how a company that dynamically prices hotels might define its features. This example assumes that data is defined in two places: 5 | - A Postgres database with a `hotel` table which contains basic features like `num_rooms` and `location`, 6 | - A Kafka stream which updates in real time with customer-hotel interaction information (a hedged sketch of this half follows below). 7 |
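The dynamic_pricing example ships only the Postgres half (hotels.chalk.sql and datasources.py); the Kafka half described in the README above is not included. A hedged sketch of what that streaming resolver could look like, modeled on 07_streaming/1_mapping_stream.py; the message schema and feature names are illustrative assumptions:

from datetime import datetime

from pydantic import BaseModel

from chalk import stream
from chalk.features import Features, features
from chalk.streams import KafkaSource


@features
class HotelInteraction:
    id: str
    hotel_id: str
    kind: str
    at: datetime


class InteractionMessage(BaseModel):
    id: str
    hotel_id: str
    kind: str
    at: datetime


# Matches the `KafkaSource(name="stream")` defined in src/datasources.py.
kafka_stream = KafkaSource(name="stream")


@stream(source=kafka_stream)
def map_interaction(
    msg: InteractionMessage,
) -> Features[
    HotelInteraction.id,
    HotelInteraction.hotel_id,
    HotelInteraction.kind,
    HotelInteraction.at,
]:
    # Map each Kafka message onto the HotelInteraction feature set.
    return HotelInteraction(id=msg.id, hotel_id=msg.hotel_id, kind=msg.kind, at=msg.at)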
-------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/.chalkignore: -------------------------------------------------------------------------------- 1 | # .gitignore compatible file for ignoring files with chalk apply 2 | # Chalk also respects .gitignore 3 | 4 | # Ignore test files 5 | tests 6 | 7 | # Python artifacts 8 | venv 9 | *venv* 10 | virtualenv 11 | __pycache__ 12 | *.pyc 13 | *.py~ 14 | .eggs 15 | *.egg-info 16 | dist 17 | 18 | # VSCode 19 | .vscode 20 | 21 | # Intellij 22 | *.iml 23 | idea 24 | 25 | # Git artifacts 26 | .git 27 | .github 28 | 29 | 30 | streaming.py 31 | transactions_offline.chalk.sql 32 | -------------------------------------------------------------------------------- /marketplace/__init__.py: -------------------------------------------------------------------------------- 1 | # Import all feature classes from models.py to make them available at package level 2 | from .models import ( 3 | Interaction, 4 | Item, 5 | ItemPrice, 6 | ItemSearch, 7 | Review, 8 | Seller, 9 | StructuredOutput, 10 | User, 11 | UserItem, 12 | ) 13 | 14 | __all__ = [ 15 | "Interaction", 16 | "Item", 17 | "ItemPrice", 18 | "ItemSearch", 19 | "Review", 20 | "Seller", 21 | "StructuredOutput", 22 | "User", 23 | "UserItem", 24 | ] 25 | -------------------------------------------------------------------------------- /github/features/github/github_repo_document_vector_database.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from chalk.features import ( 4 | Primary, 5 | features, 6 | ) 7 | 8 | if TYPE_CHECKING: 9 | from src.github.features.search import GithubSearch 10 | 11 | 12 | @features 13 | class GithubRepoDocVDB: 14 | # from vector database 15 | path: Primary[str] 16 | query: "GithubSearch.query" = "" 17 | url: str 18 | distance: float | None 19 | ai_summary: str 20 | query_type: str = "VECTOR" 21 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | If you believe you have found a security vulnerability in Chalk, please report it to us! 4 | 5 | ### Reporting Security Issues 6 | 7 | **Please do not report security vulnerabilities through public GitHub issues.** 8 | 9 | Please email security concerns to [security@chalk.ai](mailto:security@chalk.ai). 10 | 11 | ### Security Overview 12 | 13 | https://docs.chalk.ai/docs/security 14 | 15 | ### SOC-2 Report 16 | 17 | To request access to Chalk's SOC-2 report, please email [security@chalk.ai](mailto:security@chalk.ai).
18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report an issue with an example 4 | title: "[BUG]" 5 | labels: bug 6 | --- 7 | 8 | **Bug Description** 9 | 10 | **Reproduction** 11 | Steps to reproduce the behavior. 12 | 13 | **Expected Behavior** 14 | A description of what you expected to happen. 15 | 16 | **Screenshots** 17 | If applicable, add screenshots to help explain your problem. 18 | 19 | **Version** 20 | Version of the Chalk Python package. 21 | 22 | **Additional Context** 23 | Add any other context about the problem here. 24 | -------------------------------------------------------------------------------- /01_features/6_has_one_has_many.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, features, has_many 2 | 3 | 4 | @features 5 | class Book: 6 | id: str 7 | name: str 8 | page_count: int 9 | author_id: str 10 | # Here, we do not define the has_one relationship. 11 | # The relationship is assumed to be symmetric, and the join 12 | # condition is taken from the `has_many(...)` defined on `Author`. 13 | author: "Author" 14 | 15 | 16 | @features 17 | class Author: 18 | id: str 19 | books: DataFrame[Book] = has_many(lambda: Book.author_id == Author.id) -------------------------------------------------------------------------------- /github/features/github/github_archive.py: -------------------------------------------------------------------------------- 1 | import chalk.functions as F 2 | from chalk.features import Primary, _, features 3 | 4 | 5 | @features 6 | class GithubArchive: 7 | id: int 8 | path: Primary[str] 9 | api_url: str 10 | stars: int = -1 11 | is_valid_repo_path: bool = F.regexp_like( 12 | expr=_.path, 13 | pattern=r"^[a-zA-Z0-9_-]+\/[a-zA-Z0-9._-]+$", 14 | ) 15 | url: str | None = F.if_then_else( 16 | condition=_.is_valid_repo_path, 17 | if_true="https://github.com/" + _.path, 18 | if_false=None, 19 | ) 20 | -------------------------------------------------------------------------------- /06_dataframe/5_aggregations.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import DataFrame, features 3 | 4 | 5 | @features 6 | class Transaction: 7 | id: int 8 | user_id: "User.id" 9 | memo: str 10 | merchant: str 11 | amount: float 12 | 13 | 14 | @features 15 | class User: 16 | id: int 17 | txns: DataFrame[Transaction] 18 | num_credits: int 19 | 20 | 21 | # You can filter the transactions by any of the properties 22 | # on the transaction, then aggregate the filtered rows 23 | @online 24 | def get_num_credits(credits: User.txns[Transaction.amount < 0]) -> User.num_credits: 25 | return len(credits) 26 | -------------------------------------------------------------------------------- /unstructured_data/src/denylist.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from chalk import chalk_logger 3 | 4 | 5 | class Denylist: 6 | def __init__( 7 | self, 8 | source: str, 9 | ): 10 | self.source = source 11 | self.s = set() 12 | 13 | def load(self): 14 | try: 15 | self.s = set(pl.read_csv(self.source).to_series().to_list()) 16 | except Exception as e: 17 | chalk_logger.warning(f"Failed to load denylist: {e}", exc_info=True) 18 | 19 | def __contains__(self, email: str) -> bool: 20 | return email in self.s 21 |
-------------------------------------------------------------------------------- /06_dataframe/4_filters_and_projections.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, _, features 2 | 3 | 4 | @features 5 | class Transaction: 6 | id: int 7 | user_id: "User.id" 8 | memo: str 9 | merchant: str 10 | amount: float 11 | 12 | 13 | @features 14 | class User: 15 | id: int 16 | txns: DataFrame[Transaction] 17 | 18 | 19 | # You can filter down the transactions by any of the 20 | # properties on the transaction 21 | credits = User.txns[Transaction.amount < 0] 22 | 23 | # You can also use the '_' as an alias for the current namespace 24 | credits = User.txns[_.amount < 0] 25 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/denylist.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from chalk import chalk_logger 3 | 4 | 5 | class Denylist: 6 | def __init__( 7 | self, 8 | source: str, 9 | ): 10 | self.source = source 11 | self.s = set() 12 | 13 | def load(self): 14 | try: 15 | self.s = set(pl.read_csv(self.source).to_series().to_list()) 16 | except Exception as e: 17 | chalk_logger.warning(f"Failed to load denylist: {e}", exc_info=True) 18 | 19 | def __contains__(self, email: str) -> bool: 20 | return email in self.s 21 | -------------------------------------------------------------------------------- /09_github_actions/1_install_chalk_cli.yaml: -------------------------------------------------------------------------------- 1 | name: Install the Chalk CLI 2 | on: push 3 | 4 | jobs: 5 | test-with-chalk: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | 10 | - uses: chalk-ai/cli-action@v2 11 | with: 12 | client-id: ${{secrets.CHALK_CLIENT_ID}} 13 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 14 | 15 | - name: Use the Chalk CLI 16 | run: | 17 | # Print out the version 18 | chalk version 19 | # All commands are now authenticated with your client-id and client-secret 20 | chalk whoami 21 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/queries.py: -------------------------------------------------------------------------------- 1 | from chalk import ScheduledQuery 2 | from src.models import Transaction 3 | 4 | 5 | # Scheduled Queries allow you to compute a specified 6 | # set of features on a schedule, useful for persisting 7 | # values to the online and offline stores.
8 | # https://docs.chalk.ai/docs/scheduled-query 9 | 10 | sq = ScheduledQuery( 11 | name="run_fraud_model", 12 | schedule="0 0 * * *", # Every day at midnight 13 | output=[ 14 | Transaction.is_fraud, 15 | ], 16 | store_online=True, 17 | store_offline=True, 18 | tags=["model_sample"], 19 | incremental_resolvers=["get_transactions_offline"], 20 | ) 21 | -------------------------------------------------------------------------------- /marketing/__init__.py: -------------------------------------------------------------------------------- 1 | # Import all feature classes from models.py to make them available at package level 2 | from .models import ( 3 | CustomerInteraction, 4 | CustomerInteractionDocument, 5 | CustomerInteractionSearch, 6 | ProductArea, 7 | UserEventType, 8 | StructuredOutput, 9 | User, 10 | Event, 11 | UserEventAnalysis, 12 | EventType, 13 | ) 14 | 15 | __all__ = [ 16 | "CustomerInteraction", 17 | "CustomerInteractionDocument", 18 | "CustomerInteractionSearch", 19 | "ProductArea", 20 | "UserEventType", 21 | "StructuredOutput", 22 | "User", 23 | "Event", 24 | "UserEventAnalysis", 25 | "EventType", 26 | ] 27 | -------------------------------------------------------------------------------- /13_airflow/shared_environment.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import task 2 | from chalk.client import ChalkClient 3 | from airflow.exceptions import AirflowFailException 4 | 5 | 6 | @task 7 | def run_chalk_resolver() -> str: 8 | """ 9 | Trigger the get_users resolver 10 | """ 11 | 12 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables 13 | # are passed to airflow. 14 | client = ChalkClient() 15 | 16 | result = client.trigger_resolver_run( 17 | "get_users" 18 | ) 19 | if result.status == "failed": 20 | raise AirflowFailException(f"Resolver run failed: {result}") 21 | return result.id -------------------------------------------------------------------------------- /01_features/4_has_one.py: -------------------------------------------------------------------------------- 1 | from chalk.features import features, has_one 2 | 3 | 4 | @features 5 | class Author: 6 | id: str 7 | author_name: str 8 | 9 | 10 | @features 11 | class Book: 12 | id: str 13 | name: str 14 | author_id: str 15 | # The `has_one(...)` function takes a lambda function 16 | # that specifies the join condition between the classes. 17 | # We need to use a lambda function, not simply the join condition, 18 | # to allow for forward references to the `Author` class.
19 | author: Author = has_one(lambda: Book.author_id == Author.id) 20 | 21 | 22 | # You can reference features through this has-one relationship 23 | author_name_type = Book.author.author_name 24 | -------------------------------------------------------------------------------- /github/sql/github_repo.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubRepo 3 | -- source: postgres 4 | -- ELVIS: TODO: DISABLED USING THE ACTUAL PYTHON 5 | select 6 | id, 7 | hid, 8 | node_id, 9 | name, 10 | full_name, 11 | html_url, 12 | description, 13 | url, 14 | created_at, 15 | updated_at, 16 | pushed_at, 17 | homepage, 18 | size, 19 | stargazers_count, 20 | watchers_count, 21 | language, 22 | has_issues, 23 | forks_count, 24 | archived, 25 | open_issues_count, 26 | license, 27 | visibility, 28 | forks, 29 | open_issues, 30 | watchers, 31 | default_branch, 32 | owner as owner_id 33 | from github_repos_elvis 34 | ; 35 | -------------------------------------------------------------------------------- /07_streaming/1_mapping_stream.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from chalk import stream 4 | from chalk.features import Features, features 5 | from chalk.streams import KafkaSource 6 | 7 | 8 | @features 9 | class User: 10 | id: str 11 | favorite_color: str 12 | 13 | 14 | class UserUpdateBody(BaseModel): 15 | user_id: str 16 | favorite_color: str 17 | 18 | 19 | src = KafkaSource( 20 | bootstrap_server="kafka.website.com:9092", topic="user_favorite_color_updates" 21 | ) 22 | 23 | 24 | @stream(source=src) 25 | def fn(message: UserUpdateBody) -> Features[User.id, User.favorite_color]: 26 | return User( 27 | id=message.user_id, 28 | favorite_color=message.favorite_color, 29 | ) 30 | -------------------------------------------------------------------------------- /01_features/3_primary_keys.py: -------------------------------------------------------------------------------- 1 | from chalk import is_primary 2 | from chalk.features import Primary, feature, features 3 | 4 | 5 | # Feature classes have exactly one primary key, 6 | # which, by default, is taken to be the field with 7 | # the name `id`. 8 | @features 9 | class Book1: 10 | id: str 11 | 12 | 13 | # If you want to name your primary key something other than `id`, 14 | # you can explicitly assign it a primary key 15 | @features 16 | class Book2: 17 | book_id: Primary[str] 18 | 19 | 20 | # Alternatively, you can use the `feature(...)` function 21 | # to set a feature to primary 22 | @features 23 | class Book3: 24 | book_id: str = feature(primary=True) 25 | 26 | 27 | assert is_primary(Book2.book_id) 28 | assert is_primary(Book3.book_id) 29 | assert is_primary(Book1.id) 30 | -------------------------------------------------------------------------------- /05_feature_discovery/1_descriptions.py: -------------------------------------------------------------------------------- 1 | from chalk import description 2 | from chalk.features import feature, features 3 | 4 | 5 | @features 6 | class RocketShip1: 7 | id: int 8 | # Comments above a feature are assigned 9 | # to the feature they sit above. 10 | software_version: str 11 | 12 | 13 | @features 14 | class RocketShip2: 15 | software_version: str = feature( 16 | description=""" 17 | You can use explicit descriptions too!
Explicit descriptions 18 | take precedence over descriptions parsed from comments in 19 | the code (as above) 20 | """ 21 | ) 22 | 23 | 24 | # The function `chalk.features.description(...)` returns the description text 25 | print(description(RocketShip1.software_version)) 26 | -------------------------------------------------------------------------------- /11_sql/2_dataframes.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import DataFrame, features 3 | from chalk.sql import SQLiteInMemorySource 4 | 5 | 6 | @features 7 | class User: 8 | id: str 9 | viewed_minutes: float 10 | 11 | 12 | db = SQLiteInMemorySource() 13 | 14 | 15 | @online 16 | def get_views() -> DataFrame[User]: 17 | """ 18 | Chalk is able to perform push down filters on the returned type here, 19 | so even though we're returning the viewed minutes for every user, 20 | Chalk will only read the rows that it needs to serve queries. 21 | """ 22 | return db.query_string( 23 | """ 24 | select id, sum(mins) as viewed_minutes 25 | from view_counts 26 | group by id 27 | """, 28 | ).all() 29 | -------------------------------------------------------------------------------- /06_dataframe/1_creating_dataframes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from chalk.features import DataFrame, features 4 | 5 | 6 | @features 7 | class User: 8 | id: int 9 | email: str 10 | 11 | 12 | # Constructing an empty DataFrame 13 | df = DataFrame() 14 | 15 | # Constructing from a Python dictionary 16 | DataFrame.from_dict( 17 | { 18 | User.id: [1, 2], 19 | User.email: ["elliot@chalk.ai", "samantha@chalk.ai"], 20 | } 21 | ) 22 | 23 | # Constructing from a Pandas DataFrame 24 | pandas_df = pd.DataFrame( 25 | { 26 | User.id: [1, 2], 27 | User.email: ["elliot@chalk.ai", "samantha@chalk.ai"], 28 | } 29 | ) 30 | DataFrame(pandas_df) 31 | 32 | # Loading from a file (CSV or Parquet) 33 | DataFrame.read_csv("s3://...") 34 | DataFrame.read_parquet("s3://...") 35 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/emailage/client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | 5 | class EmailAgeClient: 6 | def get_email_score(self, email: str) -> str: 7 | domainname = email.split("@")[1] 8 | if "fraud" in email: 9 | return json.dumps( 10 | { 11 | "domainAge": 120, 12 | "domainname": domainname, 13 | "emailAge": random.randint(0, 30), 14 | } 15 | ) 16 | return json.dumps( 17 | { 18 | "domainAge": 10200, 19 | "domainname": domainname, 20 | "emailAge": random.randint(365, 5_000), 21 | } 22 | ) 23 | 24 | 25 | emailage_client = EmailAgeClient() 26 | -------------------------------------------------------------------------------- /01_features/5_has_many.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, features, has_many 2 | 3 | 4 | @features 5 | class Book: 6 | id: str 7 | name: str 8 | page_count: int 9 | author_id: str 10 | 11 | 12 | @features 13 | class Author: 14 | id: str 15 | # The `has_many(...)` function takes a lambda function 16 | # that specifies the join condition between the classes. 17 | # We need to use a lambda function, not simply the join condition, 18 | # to allow for forward references to the `Author` class.
19 | books: DataFrame[Book] = has_many(lambda: Book.author_id == Author.id) 20 | 21 | 22 | # You can reference the has-many relationship, and interact with the 23 | # dataframe type 24 | book_pages_df: DataFrame[Book.page_count] = Author.books[Book.page_count] 25 | -------------------------------------------------------------------------------- /predictive_maintenance/2_time_query.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from chalk.client import ChalkClient 4 | from chalk.features import DataFrame, has_many, features, FeatureTime 5 | 6 | 7 | @features 8 | class Measurement: 9 | device_id: str 10 | lat: float 11 | long: float 12 | voltage: float 13 | temp: float 14 | timestamp: FeatureTime 15 | 16 | 17 | @features 18 | class Sensor: 19 | id: str 20 | measurements: DataFrame[Measurement] = has_many(lambda: Measurement.device_id == Sensor.id) 21 | 22 | 23 | # `labels` is assumed to be a DataFrame of labeled device ids, defined elsewhere. 24 | ChalkClient().offline_query( 25 | input=labels[[Measurement.device_id]], 26 | input_times=[(datetime.now() - timedelta(days=30)).isoformat()], 27 | output=[Measurement.lat, Measurement.long, Measurement.temp], 28 | ) 29 | -------------------------------------------------------------------------------- /03_caching/7_prefetching.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import requests 4 | 5 | from chalk import online 6 | from chalk.features import feature, features 7 | 8 | 9 | @features 10 | class User: 11 | id: int 12 | name: str 13 | email: str 14 | last_login: datetime 15 | fico_score: int = feature(max_staleness="30d") 16 | 17 | 18 | # You can warm the cache by scheduling a resolver to run 19 | # more frequently than the max-staleness. 20 | # Here, the maximum-staleness for the FICO score is 30 days, 21 | # and the cron schedule means that this function will run 22 | # every 29 days and 11 hours. So, the cache will always be warm.
23 | @online(cron="29d 11h")
24 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score:
25 |     return requests.get("https://experian.com").json()["score"]
26 | 
--------------------------------------------------------------------------------
/13_airflow/isolated_environment.py:
--------------------------------------------------------------------------------
 1 | from airflow.decorators import task
 2 | from airflow.exceptions import AirflowFailException
 3 | 
 4 | 
 5 | @task.virtualenv(
 6 |     task_id="virtualenv_python", requirements=["chalkpy"], system_site_packages=False
 7 | )
 8 | def run_chalk_resolver() -> str:
 9 |     """
10 |     Trigger the `get_users` resolver in a virtual environment
11 |     """
12 |     from chalk.client import ChalkClient
13 | 
14 |     # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
15 |     # are passed to airflow.
16 |     client = ChalkClient()
17 | 
18 |     result = client.trigger_resolver_run(
19 |         "get_users"
20 |     )
21 |     if result.status == "failed":
22 |         raise AirflowFailException(f"Resolver run failed: {result}")
23 |     return result.id
--------------------------------------------------------------------------------
/full_examples/fraud_transactions_with_llm/src/streaming.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk import Features, stream
 4 | from chalk.streams import KafkaSource
 5 | from pydantic import BaseModel
 6 | 
 7 | from src.models import Transaction
 8 | 
 9 | transactions_topic = KafkaSource(name="transactions")
10 | 
11 | 
12 | class TransactionMessage(BaseModel):
13 |     id: str
14 |     memo: str
15 |     amount: float
16 |     at: datetime
17 | 
18 | 
19 | @stream(source=transactions_topic)
20 | def process_stream_message(
21 |     msg: TransactionMessage,
22 | ) -> Features[
23 |     Transaction.id,
24 |     Transaction.amount,
25 |     Transaction.at,
26 |     Transaction.memo,
27 | ]:
28 |     return Transaction(
29 |         id=msg.id,
30 |         amount=msg.amount,
31 |         at=msg.at,
32 |         memo=msg.memo,
33 |     )
34 | 
--------------------------------------------------------------------------------
/13_airflow/polling.py:
--------------------------------------------------------------------------------
 1 | from airflow.decorators import task
 2 | from airflow.sensors.base import PokeReturnValue
 3 | from chalk.client import ChalkClient
 4 | from airflow.exceptions import AirflowFailException
 5 | 
 6 | 
 7 | @task.sensor(poke_interval=30, timeout=60 * 5)
 8 | def poll_resolver_run(run_id: str) -> PokeReturnValue:
 9 |     """
10 |     Poll a running Chalk resolver run
11 |     """
12 |     # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
13 |     # are passed to airflow.
14 |     client = ChalkClient()
15 |     status = client.get_run_status(run_id).status
16 | 
17 |     if status == "succeeded":
18 |         return PokeReturnValue(True, run_id)
19 |     elif status == "failed":
20 |         raise AirflowFailException(f"Chalk resolver run failed: {run_id}")
21 |     return PokeReturnValue(False)
--------------------------------------------------------------------------------
/full_examples/batch_ml/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "batch_ml_example"
 3 | version = "1.0.0"
 4 | description = "Batch Machine Learning Example using Chalk"
 5 | readme = "README.md"
 6 | requires-python = ">=3.10"
 7 | dependencies = [
 8 |     "chalkpy[runtime,postgresql,snowflake]",
 9 |     "onnxruntime>=1.22.1",
10 | ]
11 | 
12 | [tool.uv]
13 | dev-dependencies = [
14 |     "pytest>=7.0",
15 |     "pytest-cov>=4.0",
16 |     "black>=23.0",
17 |     "isort>=5.0",
18 |     "flake8>=6.0",
19 |     "ipython>=8.37.0",
20 |     "jupyter>=1.1.1",
21 |     "ipykernel>=6.29.5",
22 | ]
23 | 
24 | [tool.pytest.ini_options]
25 | pythonpath = ["."]
26 | 
27 | [tool.ruff.lint.per-file-ignores]
28 | "*.ipynb" = ["F821","F401"]
29 | 
30 | [tool.pyright]
31 | reportUninitializedInstanceVariable = false
32 | reportAssignmentType = false
33 | reportInvalidTypeForm = false
34 | 
--------------------------------------------------------------------------------
/03_caching/2_lastest_value.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | from chalk import online
 4 | from chalk.client import ChalkClient
 5 | from chalk.features import feature, features
 6 | 
 7 | 
 8 | @features
 9 | class User:
10 |     id: int
11 |     name: str
12 | 
13 |     # Setting the maximum staleness to `infinity` means that this
14 |     # value is calculated once and then read from the online store
15 |     # for subsequent requests.
16 |     fico_score: int = feature(max_staleness="infinity")
17 | 
18 | 
19 | # Slow and expensive `User.fico_score` resolver from `1_basic_caching.py`
20 | @online
21 | def get_fico_score(name: User.name) -> User.fico_score:
22 |     return requests.get("...").json()["fico"]
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     ChalkClient().query(
27 |         input={User.name: "Katherine Johnson"},
28 |         output=[User.fico_score],
29 |     )
30 | 
--------------------------------------------------------------------------------
/predictive_maintenance/3_keep_data_fresh.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from pydantic import BaseModel
 3 | 
 4 | from chalk.features import DataFrame, features
 5 | from chalk.streams import stream, KafkaSource, Windowed, windowed
 6 | 
 7 | 
 8 | @features
 9 | class Sensor:
10 |     id: str
11 |     count_failed: Windowed[int] = windowed("10m", "20m")
12 | 
13 | 
14 | source = KafkaSource(name="sensor_stream")
15 | 
16 | 
17 | class Message(BaseModel):
18 |     device_id: str
19 |     timestamp: datetime
20 |     is_failing: bool
21 | 
22 | 
23 | @stream(source=source, mode="continuous")
24 | def process_measurements(df: DataFrame[Message]) -> DataFrame[Sensor]:
25 |     return f"""
26 |     select
27 |         count(*) as count_failed,
28 |         device_id as id
29 |     from {df}
30 |     where is_failing = TRUE
31 |     group by device_id
32 |     """
33 | 
--------------------------------------------------------------------------------
/07_streaming/4_continuous_aggregation.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from chalk import stream
 4 | from chalk.features import DataFrame, Features, features
 5 | from chalk.streams import KafkaSource, Windowed, windowed
 6 | 
 7 | src = KafkaSource(
 8 |     bootstrap_server='kafka.website.com:9092',
 9 |     topic='user_favorite_color_updates'
10 | )
11 | 
12 | 
13 | @features
14 | class User:
15 |     id: str
16 |     num_failed_logins: Windowed[int] = windowed("10m", "30m", "1d")
17 | 
18 | 
19 | class LoginMessage(BaseModel):
20 |     user_id: int
21 |     success: bool
22 | 
23 | 
24 | @stream(source=src, mode='continuous', keys={"user_id": User.id})
25 | def failed_logins(events: DataFrame[LoginMessage]) -> Features[
26 |     User.id,
27 |     User.num_failed_logins
28 | ]:
29 |     return User(id=events[0].user_id, num_failed_logins=sum(1 for e in events if not e.success))
30 | 
--------------------------------------------------------------------------------
/07_streaming/3_window_sql.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from chalk import stream
 4 | from chalk.features import DataFrame, features
 5 | from chalk.streams import KafkaSource, Windowed, windowed
 6 | 
 7 | src = KafkaSource(
 8 |     bootstrap_server="kafka.website.com:9092", topic="user_favorite_color_updates"
 9 | )
10 | 
11 | 
12 | @features
13 | class User:
14 |     id: str
15 |     num_failed_logins: Windowed[int] = windowed("10m", "30m", "1d")
16 | 
17 | 
18 | class LoginMessage(BaseModel):
19 |     user_id: int
20 |     failed: bool
21 | 
22 | 
23 | @stream(source=src)
24 | def failed_logins(
25 |     events: DataFrame[LoginMessage],
26 | ) -> DataFrame[User.id, User.num_failed_logins]:
27 |     return f"""
28 |     select
29 |         user_id as id,
30 |         count(*) as num_failed_logins
31 |     from {events}
32 |     where failed = 1
33 |     group by 1
34 |     """
35 | 
--------------------------------------------------------------------------------
/07_streaming/2_window_dataframe.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from chalk import stream
 4 | from chalk.features import Features, DataFrame
 5 | from chalk.features import features
 6 | from chalk.streams import KafkaSource
 7 | from chalk.streams import Windowed, windowed
 8 | 
 9 | 
10 | src = KafkaSource(
11 |     bootstrap_server='kafka.website.com:9092',
12 |     topic='user_favorite_color_updates'
13 | )
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: str
19 |     num_failed_logins: Windowed[int] = windowed("10m", "30m", "1d")
20 | 
21 | 
22 | class LoginMessage(BaseModel):
23 |     user_id: int
24 |     failed: bool
25 | 
26 | 
27 | @stream(source=src)
28 | def failed_logins(events: DataFrame[LoginMessage]) -> Features[
29 |     User.id,
30 |     User.num_failed_logins
31 | ]:
32 |     return User(
33 |         id=events["user_id"].max(),
34 |         num_failed_logins=events["failed"].sum(),
35 |     )
36 | 
--------------------------------------------------------------------------------
/09_github_actions/2_deploy_with_chalk.yaml:
--------------------------------------------------------------------------------
 1 | name: Deploy with Chalk
 2 | on: push
 3 | 
 4 | jobs:
 5 |   test-with-chalk:
 6 |     runs-on: ubuntu-latest
 7 |     steps:
 8 |       - uses: actions/checkout@v4
 9 | 
10 |       - name: Setup Python
11 |         uses: actions/setup-python@v4
12 |         with:
13 |           python-version: '3.10'
14 |           cache: 'pip'
15 | 
16 |       # The chalk-ai/deploy-action expects chalkpy to be installed
17 |       - name: Install dependencies
18 |         run: pip install -r requirements.txt
19 | 
20 |       - uses: chalk-ai/deploy-action@v2
21 |         with:
22 |           client-id: ${{secrets.CHALK_CLIENT_ID}}
23 |           client-secret: ${{secrets.CHALK_CLIENT_SECRET}}
24 |           # Waits for the deployment to succeed (Optional, default false)
25 |           await: true
26 | 
27 |       - name: Use the Chalk CLI
28 |         run: chalk query --in transaction.transaction_id=1 --out transaction.clean_memo
29 | 
--------------------------------------------------------------------------------
/03_caching/6_cache_busting.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | from chalk import realtime
 4 | from chalk.client import ChalkClient
 5 | from chalk.features import feature, features
 6 | 
 7 | 
 8 | @features
 9 | class User:
10 |     id: int
11 |     name: str
12 |     fico_score: int = feature(max_staleness="30d")
13 | 
14 | 
15 | @realtime
16 | def get_fico_score(name: User.name) -> User.fico_score:
17 |     return requests.get("https://experian.com").json()["score"]
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     # You can force cache invalidation by specifying a
22 |     # maximum staleness of 0 seconds at the time of making the query:
23 |     ChalkClient().query(
24 |         input={User.id: 1, User.name: "Katherine Johnson"},
25 |         output=[User.fico_score],
26 |         # Cache busting is a special case of providing an override
27 |         # max-staleness. See `4_override_max_staleness.py` for more information.
28 | staleness={User.fico_score: "0s"}, 29 | ) 30 | -------------------------------------------------------------------------------- /09_github_actions/3_deploy_preview.yaml: -------------------------------------------------------------------------------- 1 | name: Create a preview deployment 2 | # You might want to set up preview deployments for every pull request 3 | on: pull_request 4 | 5 | jobs: 6 | test-with-chalk: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | 11 | - name: Setup Python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: '3.10' 15 | cache: 'pip' 16 | 17 | # The chalk-ai/deploy-action expects chalkpy to be installed 18 | - name: Install dependencies 19 | run: pip install -r requirements.txt 20 | 21 | - uses: chalk-ai/deploy-action@v2 22 | with: 23 | client-id: ${{secrets.CHALK_CLIENT_ID}} 24 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 25 | # Creates a preview deployment with a unique deployment ID, 26 | # output by this step 27 | no-promote: true 28 | # Waits for the deployment to succeed 29 | await: true 30 | -------------------------------------------------------------------------------- /14_codegen/models.py: -------------------------------------------------------------------------------- 1 | import chalk.functions as F 2 | from chalk import _ 3 | from chalk.features import features 4 | from custom_model import CustomModel 5 | 6 | 7 | @features 8 | class User: 9 | id: int 10 | name: str 11 | email: str 12 | name_match_score: float = F.jaccard_similarity(_.email, _.name) 13 | score1: float 14 | score2: float 15 | 16 | 17 | model1 = CustomModel( 18 | url="https://internal.example.com/model1", 19 | dependencies={ 20 | "nms": User.name_match_score, 21 | "email": User.email, 22 | }, 23 | computes=User.score1, 24 | ) 25 | 26 | model2 = CustomModel( 27 | url="https://internal.example.com/model2", 28 | dependencies={ 29 | "nms": User.name_match_score, 30 | "email": User.email, 31 | }, 32 | computes=User.score2, 33 | ) 34 | 35 | 36 | if __name__ == "__main__": 37 | CustomModel.render_all( 38 | header="from models import *", 39 | path="./score_resolvers.py", 40 | models=[model1, model2], 41 | ) 42 | -------------------------------------------------------------------------------- /credit/4_aggregate_tradelines.py: -------------------------------------------------------------------------------- 1 | """An example of connecting Users to Tradelines. 2 | 3 | In particular, this example shows how to pass a 4 | filtered DataFrame of features to a resolver. 5 | """ 6 | from chalk import online 7 | from chalk.features import features, DataFrame, has_many 8 | 9 | 10 | @features 11 | class Tradeline: 12 | id: int 13 | user_id: "User.id" 14 | outstanding: float 15 | is_delinquent: bool 16 | 17 | 18 | @features 19 | class User: 20 | id: int 21 | delinquent_amount: float 22 | tradelines: DataFrame[Tradeline] 23 | 24 | 25 | @online 26 | def tradeline_rollup( 27 | accounts: User.tradelines[ 28 | # resolvers can request a subset of a DataFrame's rows as input 29 | # (https://docs.chalk.ai/docs/dataframe#filters). 30 | Tradeline.is_delinquent is True 31 | ] 32 | ) -> User.delinquent_amount: 33 | """ 34 | Sum the outstanding balances on tradelines that 35 | are marked as delinquent. 
36 | """ 37 | return accounts[Tradeline.outstanding].sum() 38 | -------------------------------------------------------------------------------- /04_scheduling/3_sample_arguments.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import requests 4 | 5 | from chalk import Cron, online 6 | from chalk.features import DataFrame, feature, features 7 | from chalk.sql import PostgreSQLSource 8 | 9 | 10 | @features 11 | class User: 12 | id: int 13 | name: str 14 | email: str 15 | last_login: datetime 16 | fico_score: int = feature(max_staleness="30d") 17 | 18 | 19 | session = PostgreSQLSource() 20 | 21 | 22 | def get_active_users() -> DataFrame[User.id]: 23 | return session.query_string( 24 | "select users.id from users where users.active = true", 25 | fields={"id": User.id}, 26 | ).all() 27 | 28 | 29 | # The sample function can pull the primary keys or any subset of 30 | # the arguments that you'd like to sample, and Chalk will sample 31 | # the other arguments. 32 | @online(cron=Cron(schedule="29d 11h", sample=get_active_users)) 33 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 34 | return requests.get("https://experian.com").json()["score"] 35 | -------------------------------------------------------------------------------- /call_recordings/sql/fathom_call_data.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: FathomCallData 3 | -- source: clickhouse 4 | -- incremental: 5 | -- mode: row 6 | -- incremental_column: meeting_scheduled_start_time 7 | select 8 | id, 9 | recording_id as call_id, 10 | 11 | CAST(meeting_scheduled_start_time AS DATETIME) as meeting_scheduled_start_time, 12 | CASE 13 | WHEN meeting_scheduled_end_time IS NOT NULL 14 | THEN CAST(meeting_scheduled_end_time AS DATETIME) 15 | ELSE NULL 16 | END as meeting_scheduled_end_time, 17 | 18 | meeting_has_external_invitees as has_external_attandees, 19 | 20 | meeting_invitees_name as attendee_name, 21 | meeting_invitees_email as attendee_email, 22 | meeting_invitees_is_external as attendee_is_external, 23 | meeting_external_domains_domain_name as company_domain, 24 | 25 | meeting_join_url, 26 | meeting_scheduled_duration_in_minutes, 27 | meeting_title, 28 | recording_duration_in_minutes, 29 | recording_url, 30 | transcript_plaintext 31 | from "fathom-calls-etl" 32 | ; 33 | -------------------------------------------------------------------------------- /github/features/named_queries.py: -------------------------------------------------------------------------------- 1 | from chalk.queries.named_query import NamedQuery 2 | 3 | from .github_feature_set import GithubProject 4 | 5 | NamedQuery( 6 | name="github_project", 7 | input=[GithubProject.path], 8 | output=[ 9 | GithubProject.project_is_valid_repo_path, 10 | GithubProject.project_url, 11 | GithubProject.username, 12 | GithubProject.repo.description, # project_description 13 | GithubProject.archive.stars, # project_stars_last_year_from_gh_archive 14 | GithubProject.repo.stargazers_count, # project_stars_from_api 15 | GithubProject.vdb.ai_summary, # project_summary_from_vdb 16 | GithubProject.repo.created_at, # repo_created_at 17 | GithubProject.repo.forks_count, # repo_forks 18 | GithubProject.repo.homepage, # repo_homepage_url 19 | GithubProject.repo.open_issues_count, # repo_issues 20 | GithubProject.repo.size, # repo_size_in_kb 21 | GithubProject.user.bio, # user_bio 22 | GithubProject.user.email, 
# user_email
23 |         GithubProject.user.location,  # user_location
24 |     ],
25 | )
26 | 
--------------------------------------------------------------------------------
/github/features/github/github_repo.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk.features import Primary, features
 4 | 
 5 | from .github_user import GithubUser
 6 | 
 7 | 
 8 | @features(max_staleness="28d")
 9 | class GithubRepo:
10 |     path: Primary[str]
11 |     full_name: str
12 |     id: int | None
13 |     name: str | None
14 |     html_url: str | None
15 |     description: str | None
16 |     url: str | None
17 |     created_at: datetime | None
18 |     updated_at: datetime | None
19 |     pushed_at: datetime | None
20 |     homepage: str | None
21 |     size: int | None
22 |     stargazers_count: int | None
23 |     watchers_count: int | None
24 |     language: str | None
25 |     has_issues: bool | None
26 |     forks_count: int | None
27 |     archived: bool | None
28 |     open_issues_count: int | None
29 |     license: str | None
30 |     visibility: str | None
31 |     forks: int | None
32 |     open_issues: int | None
33 |     watchers: int | None
34 |     default_branch: str | None
35 | 
36 |     owner_id: str | None
37 |     owner_login: GithubUser.login
38 |     user: GithubUser
39 | 
40 |     updated_at_chalk: datetime | None
41 | 
--------------------------------------------------------------------------------
/05_feature_discovery/4_unified.py:
--------------------------------------------------------------------------------
 1 | from chalk import tags, is_primary, owner, description
 2 | from chalk.features import features
 3 | 
 4 | 
 5 | @features(owner="shuttle@nasa.gov", tags="group:rocketry")
 6 | class SpaceShuttle:
 7 |     id: str
 8 | 
 9 |     # The SHA1 of the software deployed to the shuttle.
10 |     # Should align with a git commit on main.
11 |     #
12 |     # :owner: katherine.johnson@nasa.gov
13 |     software_version: str
14 | 
15 |     # The volume of this shuttle in cubic meters.
16 |     # :owner: architecture@nasa.gov
17 |     # :tags: zillow-fact, size
18 |     volume: str
19 | 
20 | 
21 | # Pulling the description programmatically
22 | assert len(description(SpaceShuttle.software_version)) > 0
23 | 
24 | # Pulling the tags for the feature class and features
25 | assert tags(SpaceShuttle) == ["group:rocketry"]
26 | assert tags(SpaceShuttle.volume) == ["zillow-fact", "size", "group:rocketry"]
27 | 
28 | # Pulling the owner for the feature class and features
29 | assert owner(SpaceShuttle) == "shuttle@nasa.gov"
30 | assert owner(SpaceShuttle.software_version) == "katherine.johnson@nasa.gov"
31 | 
32 | assert is_primary(SpaceShuttle.id)
33 | 
--------------------------------------------------------------------------------
/fraud/2_patterns.py:
--------------------------------------------------------------------------------
 1 | """An example of computing a transaction trend: the percentage change
 2 | in total transaction amount between 30-day windows.
 3 | """
 4 | 
 5 | from chalk import online
 6 | from chalk.features import features, DataFrame, before, after, FeatureTime
 7 | 
 8 | 
 9 | @features
10 | class Transaction:
11 |     id: int
12 |     amount: float
13 |     memo: str
14 |     on: FeatureTime
15 |     user_id: "User.id"
16 |     user: "User"
17 | 
18 | 
19 | @features
20 | class User:
21 |     id: int
22 |     transactions: DataFrame[Transaction]
23 | 
24 |     # percentage change: the last 30 days vs the prior 30 days
25 |     change_from_prior_period: float
26 | 
27 | 
28 | @online
29 | def get_transaction_trend(
30 |     this_period_txns: User.transactions[after(days_ago=30)],
31 |     last_period_txns: User.transactions[before(days_ago=30), after(days_ago=2*30)],
32 | ) -> User.change_from_prior_period:
33 |     """
34 |     Calculates the percentage change in total transaction amount between
35 |     30 day windows.
36 |     """
37 |     sum_last = last_period_txns[Transaction.amount].sum()
38 |     sum_this = this_period_txns[Transaction.amount].sum()
39 |     return (sum_this - sum_last) * 100 / sum_last
40 | 
--------------------------------------------------------------------------------
/02_resolvers/2_multiple_features_resolver.py:
--------------------------------------------------------------------------------
 1 | from mocks import user_service
 2 | 
 3 | from chalk import online
 4 | from chalk.client import ChalkClient
 5 | from chalk.features import Features, features
 6 | 
 7 | 
 8 | @features
 9 | class User:
10 |     id: int
11 |     name: str
12 |     email: str
13 | 
14 | 
15 | # Unlike with our scalar resolvers, here we need to wrap our output in
16 | # the class `Features[...]`.
17 | @online
18 | def get_user_details(uid: User.id) -> Features[User.name, User.email]:
19 |     details = user_service.get_identity(uid)
20 |     # Note that we don't need to supply all arguments to `User`.
21 |     # The field `id` on `User` is non-optional, and doesn't have a
22 |     # default value, but these classes accept partial application.
23 |     # See `01_features/8_constructing_features.py` for more info.
24 |     return User(
25 |         name=details.name,
26 |         email=details.email,
27 |     )
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     # We can then query features as we did in the previous example.
32 |     result = ChalkClient().query(
33 |         input={User.id: 4},
34 |         output=[User.name, User.email],
35 |     )
36 | 
--------------------------------------------------------------------------------
/fraud/3_identity.py:
--------------------------------------------------------------------------------
 1 | """An example of connecting Users to identity scores from a
 2 | third-party API (in this case Socure).
 3 | 
 4 | In this example, we use the requests library to get
 5 | a client's Socure score from the Socure REST API. This
 6 | example shows how you can run arbitrary Python code (and connect
 7 | to third-party APIs) in a Python resolver.
 8 | """
 9 | 
10 | import requests
11 | 
12 | from chalk import online
13 | from chalk.features import features, feature
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: str
19 | 
20 |     # the max staleness assignment on the feature means
21 |     # that a new socure score is only computed if one
22 |     # hasn't been computed in the last 30 days.
23 |     socure_score: float = feature(max_staleness="30d")
24 | 
25 | 
26 | @online
27 | def get_socure_score(uid: User.id) -> User.socure_score:
28 |     """This resolver approximates how one might make a REST
29 |     API call to Socure in a Python resolver for a specific
30 |     user.
31 |     """
32 |     return requests.get(
33 |         "https://api.socure.com",
34 |         json={
35 |             "id": uid,
36 |         },
37 |     ).json()["socure_score"]
38 | 
--------------------------------------------------------------------------------
/11_sql/README.md:
--------------------------------------------------------------------------------
 1 | # SQL
 2 | 
 3 | Chalk can ingest your data using a SQL interface from any
 4 | of the integrations that support it. You can describe your
 5 | queries using SQL strings or SQLAlchemy. Offline, event
 6 | tables can be ingested incrementally (see the sketch in section 2).
 7 | 
 8 | https://docs.chalk.ai/docs/sql
 9 | 
10 | ## 1. Query Scalars
11 | Query scalars with SQL files or strings.
12 | 
13 | **[1_scalars.py](1_scalars.py)**
14 | 
15 | ```python
16 | @online
17 | def get_views(user: User.id) -> User.viewed_minutes:
18 |     return db.query_string(
19 |         "select sum(mins) as viewed_minutes from view_counts where uid = :uid",
20 |         args=dict(uid=user),
21 |     ).one()
22 | ```
23 | https://docs.chalk.ai/docs/sql
24 | 
25 | ## 2. Query DataFrames
26 | Query many rows and take advantage of push-down filters.
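27 | 
28 | Offline, these queries can also ingest event tables incrementally. A minimal
29 | sketch, following the `.incremental(...)` usage in
30 | `predictive_maintenance/4_customer_sensors.py`; the `events` table, the
31 | `Event` feature class, and the `updated_at` column are illustrative, not
32 | defined in this folder's examples:
33 | 
34 | ```python
35 | @batch(cron="1h")
36 | def get_events() -> DataFrame[Event]:
37 |     # Only rows past the last ingested `updated_at` watermark are read
38 |     return db.query_string(
39 |         "select id, kind, updated_at from events",
40 |     ).incremental(incremental_column="updated_at", mode="row")
41 | ```
42 | 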
43 | 
44 | **[2_dataframes.py](2_dataframes.py)**
45 | 
46 | ```python
47 | @online
48 | def get_views() -> DataFrame[User]:
49 |     return db.query_string(
50 |         """
51 |         select id, sum(mins) as viewed_minutes
52 |         from view_counts
53 |         group by id
54 |         """,
55 |     ).all()
56 | ```
57 | 
58 | https://docs.chalk.ai/docs/sql
59 | 
--------------------------------------------------------------------------------
/ecommerce/1_users_sellers.py:
--------------------------------------------------------------------------------
 1 | from chalk import online
 2 | from chalk.features import features
 3 | 
 4 | 
 5 | @features
 6 | class Seller:
 7 |     id: str
 8 |     categories: set[str]
 9 | 
10 | 
11 | @features
12 | class User:
13 |     id: str
14 |     age: int
15 |     favorite_categories: set[str]
16 | 
17 | 
18 | @features
19 | class UserSeller:
20 |     id: str
21 |     user_id: User.id
22 |     user: User
23 |     seller_id: Seller.id
24 |     seller: Seller
25 |     favorites_match: bool
26 | 
27 | 
28 | @online
29 | def get_similarity(
30 |     fc: UserSeller.user.favorite_categories, fc2: UserSeller.seller.categories
31 | ) -> UserSeller.favorites_match:
32 |     return len(fc & fc2) > 0
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     from chalk.client import ChalkClient
37 | 
38 |     client = ChalkClient()
39 |     user_stores = client.query(
40 |         input=[
41 |             UserSeller(user_id="1", seller_id="456"),
42 |             UserSeller(user_id="1", seller_id="457"),
43 |             UserSeller(user_id="1", seller_id="458"),
44 |         ],
45 |         output=[UserSeller.user.id, UserSeller.seller.id, UserSeller.favorites_match],
46 |     )
47 |     print(user_stores)
48 | 
--------------------------------------------------------------------------------
/11_sql/1_scalars.py:
--------------------------------------------------------------------------------
 1 | from chalk import online
 2 | from chalk.features import features
 3 | from chalk.sql import SQLiteInMemorySource
 4 | 
 5 | 
 6 | @features
 7 | class User:
 8 |     id: str
 9 |     viewed_minutes: float
10 | 
11 | 
12 | db = SQLiteInMemorySource()
13 | 
14 | 
15 | @online
16 | def get_views(user: User.id) -> User.viewed_minutes:
17 |     return db.query_string(
18 |         "select sum(mins) as viewed_minutes from view_counts where uid = :uid",
19 |         args=dict(uid=user),
20 |         # Chalk lines up the name of your returned SQL columns
21 |         # with the features that your resolver says it returns.
22 |         # If they don't line up, you can explicitly map any
23 |         # of the columns with the line below:
24 |         # fields=dict(viewed_minutes=User.viewed_minutes),
25 |     ).one()
26 | 
27 | 
28 | @online
29 | def get_views_from_file(user: User.id) -> User.viewed_minutes:
30 |     """
31 |     This resolver executes the same query as above,
32 |     but moves the SQL string into the file `user_views.sql`.
33 | """ 34 | return db.query_sql_file( 35 | "user_views.sql", 36 | args=dict(uid=user), 37 | fields=dict(viewed_minutes=User.viewed_minutes), 38 | ).one() 39 | -------------------------------------------------------------------------------- /full_examples/sagemaker/steps/evaluate.py: -------------------------------------------------------------------------------- 1 | from sagemaker.workflow.function_step import step 2 | 3 | @step( 4 | name="model-evaluation", 5 | instance_type='ml.t3.medium', 6 | keep_alive_period_in_seconds=300, 7 | ) 8 | def evaluate(model, xtest_path: str, ytest_path: str, run_bucket: str) -> str: 9 | import pandas as pd 10 | from sklearn.metrics import ( 11 | accuracy_score, 12 | f1_score, 13 | precision_score, 14 | recall_score, 15 | ) 16 | import s3fs 17 | import json 18 | 19 | X_test = pd.read_parquet(xtest_path) 20 | y_test = pd.read_parquet(ytest_path) 21 | 22 | predictions = model.predict(X_test) 23 | 24 | results = { 25 | "accuracy": accuracy_score(y_test, predictions), 26 | "f1": f1_score(y_test, predictions), 27 | "precision": precision_score(y_test, predictions), 28 | "recall": recall_score(y_test, predictions), 29 | } 30 | 31 | # Upload evaluation report to s3 32 | s3_fs = s3fs.S3FileSystem() 33 | eval_src_s3 = f"{run_bucket}/evaluation/evaluation.json" 34 | 35 | with s3_fs.open(eval_src_s3, "wb") as file: 36 | file.write(json.dumps(results)) 37 | 38 | return eval_src_s3 39 | 40 | -------------------------------------------------------------------------------- /mocks/__init__.py: -------------------------------------------------------------------------------- 1 | import random 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | 5 | # A set of mocks for the examples. 6 | 7 | class AccountKind(Enum): 8 | plaid = "plaid" 9 | checking = "checking" 10 | savings = "savings" 11 | 12 | 13 | @dataclass 14 | class UserIdentity: 15 | name: str 16 | email: str 17 | 18 | 19 | class UserService: 20 | domains = ["gmail.com", "chalk.ai", "nasa.gov"] 21 | names = ["Monica", "Justine", "Sam", "Nikhil"] 22 | 23 | def get_identity(self, id: int) -> UserIdentity: 24 | random.seed(id) 25 | name = random.choice(self.names) 26 | return UserIdentity( 27 | name=name, 28 | email=f"{name.lower()}@{random.choice(self.domains)}", 29 | ) 30 | 31 | 32 | user_service = UserService() 33 | 34 | 35 | @dataclass 36 | class EmailRisk: 37 | age_years: float 38 | risk_score: float 39 | 40 | 41 | class LexusNexus: 42 | def get_email_risk(self, email: str) -> EmailRisk: 43 | random.seed(email) 44 | return EmailRisk( 45 | age_years=random.uniform(0, 10), 46 | risk_score=random.uniform(0, 1), 47 | ) 48 | 49 | 50 | lexus_nexus = LexusNexus() 51 | -------------------------------------------------------------------------------- /marketplace/resolvers.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | 3 | from src.marketplace import Review, User 4 | 5 | 6 | @online 7 | def get_normalized_rating( 8 | review_rating: Review.star_rating, 9 | review_count_across_all_books: Review.item.total_reviews, 10 | average_rating_across_all_books: Review.item.average_rating, 11 | ) -> Review.normalized_rating: 12 | minimum_reviews: float = review_count_across_all_books / 10 13 | return ( 14 | review_count_across_all_books 15 | / (review_count_across_all_books + minimum_reviews) 16 | ) * review_rating + ( 17 | minimum_reviews / (review_count_across_all_books + minimum_reviews) 18 | ) * average_rating_across_all_books 19 | 20 | 21 | @online 
22 | def is_positive_review_from_python_resolver( 23 | bayesian_normalized_rating: Review.normalized_rating, 24 | ) -> Review.is_positive_review_python_resolver: 25 | return bayesian_normalized_rating >= 3.5 26 | 27 | 28 | @online 29 | def get_username(email: User.email) -> User.username: 30 | # def get_username(email: str) -> str: 31 | username = email.split("@")[0] 32 | if "gmail.com" in email: 33 | username = username.split("+")[0].replace(".", "") 34 | 35 | return username.lower() 36 | -------------------------------------------------------------------------------- /02_resolvers/3_downstream_scalars.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.client import ChalkClient 3 | from chalk.features import features 4 | 5 | 6 | @features 7 | class User: 8 | id: int 9 | email: str 10 | email_domain: str 11 | banned_email: bool 12 | 13 | 14 | @online 15 | def get_email_domain(email: User.email) -> User.email_domain: 16 | return email.split("@")[1].lower() 17 | 18 | 19 | @online 20 | def is_banned_email(domain: User.email_domain) -> User.banned_email: 21 | return domain in { 22 | "pergi.id", 23 | "convoitucpa.com", 24 | "vshgl.com", 25 | "nieise.com", 26 | "bookue.site", 27 | "umaasa.com", 28 | } 29 | 30 | 31 | if __name__ == "__main__": 32 | client = ChalkClient() 33 | assert not client.query( 34 | input={User.email: "katherine.johnson@nasa.gov"}, 35 | # Requesting User.banned_email requires running 36 | # `get_email_domain` and then `is_banned_email` 37 | output=[User.banned_email], 38 | ).get_feature_value(User.banned_email) 39 | 40 | assert client.query( 41 | input={User.email: "attacker@vshgl.com"}, 42 | output=[User.banned_email], 43 | ).get_feature_value(User.banned_email) 44 | -------------------------------------------------------------------------------- /08_testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | Test your Chalk features and resolvers. 3 | 4 | ## 1. Unit tests 5 | Resolvers are just Python functions, so they are easy to unit test. 6 | 7 | Chalk lets you specify your feature pipelines using 8 | idiomatic Python. This means that you can unit test 9 | individual resolvers and combinations of resolvers. 10 | 11 | **[1_unit_tests.py](1_unit_tests.py)** 12 | 13 | ```python 14 | @realtime 15 | def get_home_data( 16 | hid: HomeFeatures.id, 17 | ) -> Features[HomeFeatures.price, HomeFeatures.sq_ft]: 18 | return HomeFeatures(price=200_000, sq_ft=2_000) 19 | 20 | 21 | def test_multiple_output(): 22 | assert get_home_data(2) == HomeFeatures( 23 | price=200_000, 24 | sq_ft=2_000, 25 | ) 26 | ``` 27 | https://docs.chalk.ai/docs/unit-tests 28 | 29 | ## 2. Integration tests 30 | Test interactions between resolvers with preview deployments. 
31 | 32 | **[2_integration_tests.py](2_integration_tests.py)** 33 | 34 | ```bash 35 | > chalk apply --no-promote 36 | ``` 37 | 38 | ```bash 39 | > chalk query --deployment $DEPLOYMENT_ID \ 40 | --in user.id=1 \ 41 | --out user.id \ 42 | --out user.email 43 | ``` 44 | https://docs.chalk.ai/docs/integration-tests 45 | -------------------------------------------------------------------------------- /predictive_maintenance/4_customer_sensors.py: -------------------------------------------------------------------------------- 1 | from chalk import batch 2 | from chalk.features import DataFrame, features, has_many, feature 3 | from chalk.sql import SnowflakeSource 4 | 5 | 6 | @features 7 | class Sensor: 8 | id: str 9 | customer_id: str 10 | is_failing: bool 11 | 12 | 13 | 14 | @features 15 | class Customer: 16 | id: str 17 | customer_needs_service: bool = feature(max_staleness="2h") 18 | sensors: DataFrame[Sensor] = has_many(lambda: Customer.id == Sensor.customer_id) 19 | 20 | 21 | snowflake = SnowflakeSource() 22 | 23 | 24 | @batch(cron="1h") 25 | def get_sensors() -> DataFrame[Sensor.id, Sensor.customer_id, Sensor.is_failing]: 26 | """ 27 | Incrementally ingest new sensors from our Snowflake warehouse 28 | as they become available. 29 | """ 30 | return snowflake.query_string( 31 | """ 32 | select id, customer_id, is_failing from sensors 33 | """ 34 | ).incremental(incremental_column="updated_at", mode='row') 35 | 36 | 37 | @batch(cron="1h") 38 | def get_customers_needing_service( 39 | bad_sensors: Customer.sensors[ 40 | Sensor.is_failing is True, 41 | Sensor.id 42 | ] 43 | ) -> Customer.customer_needs_service: 44 | return len(bad_sensors) > 0 45 | -------------------------------------------------------------------------------- /03_caching/4_override_max_staleness.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from chalk import online 4 | from chalk.client import ChalkClient 5 | from chalk.features import feature, features 6 | 7 | 8 | @features 9 | class User: 10 | id: int 11 | name: str 12 | fico_score: int = feature(max_staleness="30d") 13 | 14 | 15 | @online 16 | def get_fico_score(name: User.name) -> User.fico_score: 17 | return requests.get(...).json()["score"] 18 | 19 | 20 | if __name__ == "__main__": 21 | # By default, the staleness will be taken to be the value given 22 | # on the feature class. In this case, user.fico_score is cached 23 | # for 30 days. But if you have a model that needs fresher data, 24 | # you can specify the desired staleness at the time of making 25 | # the query. For example, here we request a staleness of only 26 | # 10 minutes. 
27 | ChalkClient().query( 28 | input={User.name: "Katherine Johnson"}, 29 | output=[User.fico_score], 30 | staleness={User.fico_score: "10m"}, 31 | ) 32 | 33 | # If you didn't specify the staleness, the default staleness 34 | # of 30 days would apply 35 | ChalkClient().query( 36 | input={User.name: "Katherine Johnson"}, 37 | output=[User.fico_score], 38 | ) 39 | -------------------------------------------------------------------------------- /02_resolvers/4_downstream_dataframes.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.client import ChalkClient 3 | from chalk.features import DataFrame, features, has_many 4 | 5 | 6 | @features 7 | class Email: 8 | id: str 9 | uid: str 10 | domain: str 11 | is_banned: bool 12 | value: str 13 | 14 | 15 | @features 16 | class User: 17 | id: str 18 | banned: bool 19 | emails: DataFrame[Email] = has_many(lambda: Email.uid == User.id) 20 | 21 | 22 | @online 23 | def is_banned_email(domain: Email.domain) -> Email.is_banned: 24 | return domain in { 25 | "pergi.id", 26 | "convoitucpa.com", 27 | "vshgl.com", 28 | "nieise.com", 29 | "bookue.site", 30 | "umaasa.com", 31 | } 32 | 33 | 34 | # Here, we say a user is banned if the user has any emails that are banned. 35 | # Note that all of this can be computed real-time, and Chalk will run the 36 | # `is_banned_email` resolver for each of the emails that the user has. 37 | @online 38 | def banned_user(domains: User.emails[Email.is_banned == True]) -> User.banned: 39 | return len(domains) > 0 40 | 41 | 42 | if __name__ == "__main__": 43 | result = ChalkClient().query( 44 | input={User.id: "1"}, 45 | output=[User.banned], 46 | ) 47 | -------------------------------------------------------------------------------- /predictive_maintenance/1_device_data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pydantic import BaseModel 3 | 4 | from chalk.features import DataFrame, has_many, features, FeatureTime 5 | from chalk.streams import stream, KafkaSource 6 | 7 | 8 | @features 9 | class Measurement: 10 | device_id: str 11 | lat: float 12 | long: float 13 | voltage: float 14 | temp: float 15 | 16 | timestamp: FeatureTime 17 | 18 | 19 | @features 20 | class Sensor: 21 | id: str 22 | measurements: DataFrame[Measurement] = has_many(lambda: Measurement.device_id == Sensor.id) 23 | 24 | 25 | source = KafkaSource(name="sensor_stream") 26 | 27 | 28 | class DeviceDataJson(BaseModel): 29 | latitude: float 30 | longitude: float 31 | voltage: float 32 | temperature: float 33 | 34 | 35 | class Message(BaseModel): 36 | device_id: str 37 | timestamp: datetime 38 | data: DeviceDataJson 39 | 40 | 41 | @stream(source=source) 42 | def read_message(message: Message) -> Measurement: 43 | return Measurement( 44 | device_id=message.device_id, 45 | timestamp=message.timestamp, 46 | lat=message.data.latitude, 47 | long=message.data.longitude, 48 | voltage=message.data.voltage, 49 | temp=message.data.temperature, 50 | ) 51 | -------------------------------------------------------------------------------- /05_feature_discovery/3_tags.py: -------------------------------------------------------------------------------- 1 | from chalk import tags 2 | from chalk.features import feature, features 3 | 4 | 5 | # Tags are assigned as code comments 6 | @features 7 | class RiskProfile1: 8 | id: int 9 | # :tags: group:risk 10 | email: str 11 | kyc_score: str 12 | 13 | 14 | # Or explicitly set via 
`feature(tags=...)` 15 | @features 16 | class RiskProfile2: 17 | id: int 18 | email: str = feature(tags="group:risk") 19 | kyc_score: str 20 | 21 | 22 | # A feature can have many tags 23 | @features 24 | class RiskProfile3: 25 | id: int 26 | # :tags: group:risk, pii 27 | email: str 28 | kyc_score: str 29 | 30 | 31 | # Tags assigned on the class will apply to each of its features 32 | @features(tags="group:risk") 33 | class RiskProfile4: 34 | id: str 35 | kyc_score: float 36 | email_age_days: int 37 | 38 | 39 | # Tags on the class add to tags on the feature 40 | @features(tags="group:risk") 41 | class RiskProfile5: 42 | id: str 43 | kyc_score: float 44 | email_age_days: int 45 | # :tags: pii 46 | email: str 47 | 48 | 49 | # The function `chalk.features.tags(...)` returns the tags for a feature 50 | assert tags(RiskProfile5) == ["group:risk"] 51 | assert tags(RiskProfile5.id) == ["group:risk"] 52 | assert tags(RiskProfile5.email) == ["pii", "group:risk"] 53 | -------------------------------------------------------------------------------- /01_features/1_feature_types.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | from enum import Enum 3 | 4 | from chalk.features import features 5 | 6 | 7 | class Genre(Enum): 8 | FICTION = "FICTION" 9 | NONFICTION = "NONFICTION" 10 | DRAMA = "DRAMA" 11 | POETRY = "POETRY" 12 | 13 | 14 | # The @features decorator creates a feature for each attribute 15 | # of the class. These feature classes work a lot like Python's 16 | # dataclasses, except that you can construct them with only 17 | # partial arguments. 18 | @features 19 | class Book: 20 | # Features can be any primitive Python type 21 | id: int 22 | name: str 23 | pages: int 24 | publish_date: date 25 | copyright_ended_at: datetime | None 26 | genre: Genre 27 | 28 | # Features can also be lists and sets of any primitive 29 | authors: list[str] 30 | categories: set[str] 31 | 32 | # Descriptions live as comments above features. 33 | # See 05_feature_discovery/4_descriptions.py for more information. 
34 |     jacket_copy: str
35 | 
36 | 
37 | # Note that we don't supply all the arguments to book here
38 | anna_karenina = Book(name="Anna Karenina", pages=864)
39 | 
40 | # Feature classes can be easily converted to dictionaries
41 | assert dict(anna_karenina) == {
42 |     "book.name": "Anna Karenina",
43 |     "book.pages": 864,
44 | }
45 | 
--------------------------------------------------------------------------------
/full_examples/sagemaker/src/models.py:
--------------------------------------------------------------------------------
 1 | from datetime import date
 2 | from dateutil.relativedelta import relativedelta
 3 | from chalk import online, DataFrame, FeatureTime, Windowed, _, windowed
 4 | from chalk.features import features
 5 | 
 6 | 
 7 | @features
 8 | class Transaction:
 9 |     id: int
10 |     amt: float
11 |     confirmed_fraud: bool
12 |     customer_id: "Customer.id"
13 |     customer: "Customer"
14 | 
15 |     # The time at which the transaction was created, for temporal consistency
16 |     at: FeatureTime
17 | 
18 | 
19 | @features
20 | class Customer:
21 |     id: int
22 |     name: str
23 |     email: str
24 |     dob: date
25 |     age: int
26 |     income: int
27 |     fico: int
28 | 
29 |     # The transactions, linked by the Customer.id type on the Transaction.customer_id field
30 |     transactions: DataFrame[Transaction]
31 | 
32 |     transaction_sum: Windowed[float] = windowed(
33 |         "30m",
34 |         "1h",
35 |         default=0,
36 |         expression=_.transactions[_.amt, _.at > _.chalk_window].sum(),
37 |     )
38 | 
39 | 
40 | @online
41 | async def get_fico(email: Customer.email) -> Customer.fico:
42 |     # Use your preferred FICO score API here
43 |     ...
44 | 
45 | 
46 | @online
47 | async def get_age(dob: Customer.dob) -> Customer.age:
48 |     return relativedelta(date.today(), dob).years
49 | 
--------------------------------------------------------------------------------
/full_examples/fraud_transactions_with_llm/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Chalk Quickstart
 3 | 
 4 | 1. Install Chalk
 5 | 
 6 |    Install the [Chalk command line tool](https://docs.chalk.ai/cli).
 7 |    The Chalk CLI allows you to create, update, and manage your feature
 8 |    pipelines directly from your terminal.
 9 | 
10 |    > curl -s -L https://api.chalk.ai/install.sh | sh
11 | 
12 | 2. Create and activate a virtual environment in your project directory
13 | 
14 |    Creating a virtual environment is a good practice to keep your project
15 |    dependencies isolated from your system dependencies.
16 | 
17 |    > python -m venv .venv
18 |    > source .venv/bin/activate
19 | 
20 | 3. Log in or sign up
21 | 
22 |    Log in or sign up with Chalk directly from the command line. The
23 |    [`chalk login`](https://docs.chalk.ai/cli/login) command will
24 |    open your browser and create an API token for your local development.
25 | 
26 | 4. Deploy your features
27 | 
28 |    Deploy your feature pipeline to production. After you've written some
29 |    features and resolvers, use the [`chalk apply`](https://docs.chalk.ai/cli/apply)
30 |    command to deploy your feature pipelines.
31 | 
32 | 5. Query your features
33 | 
34 |    Query your features directly from the command line with
35 |    [`chalk query`](https://docs.chalk.ai/cli/query) to see that they're
36 |    live and available.
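37 | 
38 |    For example, using the transaction features defined in this project
39 |    (the id value here is illustrative):
40 | 
41 |    > chalk query --in transaction.id=1 --out transaction.memo
42 | 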
43 | 
--------------------------------------------------------------------------------
/14_codegen/README.md:
--------------------------------------------------------------------------------
 1 | ## Codegen with Chalk
 2 | 
 3 | If you're working with external ML models or microservices, it can be helpful to generate
 4 | boilerplate code for calling those services—especially when those models operate on Chalk
 5 | features.
 6 | 
 7 | In this folder, the file `custom_model.py` defines a class `CustomModel` that, when
 8 | created, stores the information it needs to render the resolver definition for an HTTP
 9 | call to a service hosting a model score.
10 | 
11 | ```python
12 | CustomModel(
13 |     url="https://internal.example.com/model1",
14 |     dependencies={
15 |         "nms": User.name_match_score,
16 |         "email": User.email,
17 |     },
18 |     computes=User.score1,
19 | )
20 | ```
21 | 
22 | When executed (`python models.py`), the file `models.py` overwrites `score_resolvers.py`
23 | with the auto-generated resolver definitions for each custom model:
24 | 
25 | ```python
26 | @online
27 | def get_score1(nms: User.name_match_score, email: User.email) -> User.score1:
28 |     response = requests.post(
29 |         "https://internal.example.com/model1",
30 |         headers={"accept": "application/json"},
31 |         json={"nms": nms, "email": email},
32 |     )
33 |     return response.json().get("prediction")
34 | ```
35 | 
36 | If you find yourself repeating the same pattern across many of these resolvers, codegen
37 | can help keep your definitions DRY.
--------------------------------------------------------------------------------
/01_features/7_feature_time.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk import offline
 4 | from chalk.features import Features, features
 5 | 
 6 | 
 7 | @features
 8 | class Book:
 9 |     id: str
10 |     name: str
11 | 
12 |     # By default, Chalk marks the time a feature was
13 |     # created as the time that its resolver was run.
14 |     # However, you may want to provide a custom value
15 |     # for this time for data sources like events tables.
16 |     # You can inspect the time a feature was created
17 |     # and set the time for when a feature was created
18 |     # by creating a feature time feature.
19 |     # By default, a feature is a feature time feature if
20 |     # it has the name `ts` and a type of `datetime.datetime`:
21 |     ts: datetime
22 | 
23 |     # However, you may also explicitly set the feature time
24 |     # via the `chalk.features.FeatureTime` type:
25 |     #
26 |     #   timestamp: FeatureTime
27 |     #
28 | 
29 | 
30 | # To set the time a feature was created, assign the feature
31 | # when you resolve it:
32 | @offline
33 | def fn(book_id: Book.id) -> Features[Book.name, Book.ts]:
34 |     return Book(
35 |         name="Anna Karenina",
36 |         ts=datetime(month=9, day=12, year=1877),
37 |     )
38 | 
39 | 
40 | # Then, when you sample offline data, the name feature will
41 | # be treated as having been created at the provided date.
42 | 
--------------------------------------------------------------------------------
/02_resolvers/1_scalar_resolver.py:
--------------------------------------------------------------------------------
 1 | from chalk import online
 2 | from chalk.client import ChalkClient
 3 | from chalk.features import features
 4 | 
 5 | 
 6 | @features
 7 | class User:
 8 |     id: int
 9 |     email: str
10 |     email_domain: str
11 | 
12 | 
13 | # This resolver computes one feature, `User.email_domain`.
14 | # To compute that feature, it takes a data dependency on `User.email`.
15 | @online
16 | def get_email_domain(email: User.email) -> User.email_domain:
17 |     return email.split("@")[1].lower()
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     # Once you've deployed your features, you can query them by providing
22 |     # the data you know that's already in scope, and asking for any feature
23 |     # value that can be computed downstream from that data
24 |     result = ChalkClient().query(
25 |         # Here, we say that we know the email is `jessie@chalk.ai`.
26 |         # In practice, typically this is just the id of the entity
27 |         # that you care about
28 |         input={User.email: "jessie@chalk.ai"},
29 |         # Here we ask Chalk to compute `User.email_domain`, a feature
30 |         # downstream of that input
31 |         output=[User.email_domain],
32 |     )
33 | 
34 |     # From the resulting object, we can pull the `User.email_domain`
35 |     # feature, and see that it is in fact `chalk.ai`.
36 |     assert result.get_feature_value(User.email_domain) == "chalk.ai"
37 | 
--------------------------------------------------------------------------------
/06_dataframe/2_filters.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk.features import DataFrame, features
 4 | 
 5 | 
 6 | @features
 7 | class Transaction:
 8 |     id: int
 9 |     user_id: "User.id"
10 |     memo: str
11 |     merchant: str
12 |     amount: float
13 |     canceled_at: None | datetime
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: int
19 |     txns: DataFrame[Transaction]
20 | 
21 | 
22 | # You can filter down the transactions by any of the
23 | # properties on the transaction
24 | credits = User.txns[Transaction.amount < 0]
25 | 
26 | # The keyword `or` works much like `and`:
27 | rideshare_income = User.txns[
28 |     Transaction.amount < 0
29 |     and (Transaction.merchant in ("uber", "lyft") or "uberpmts" == Transaction.memo)
30 | ]
31 | 
32 | # You can also check for set or list membership with `in`:
33 | rideshare_txns = User.txns[Transaction.merchant in ("uber", "lyft")]
34 | 
35 | # Filters separated by commas function as `and` filters:
36 | rideshare_credits = User.txns[
37 |     Transaction.amount < 0, Transaction.merchant in ("uber", "lyft")
38 | ]
39 | 
40 | # Equivalently, you can use the keyword `and` instead of separating by commas
41 | rideshare_credits = User.txns[
42 |     Transaction.amount < 0 and Transaction.merchant in ("uber", "lyft")
43 | ]
44 | 
45 | 
46 | # Filters can also check for None the same way you check for None in Python
47 | canceled_txns = User.txns[Transaction.canceled_at is not None]
48 | 
--------------------------------------------------------------------------------
/08_testing/1_unit_tests.py:
--------------------------------------------------------------------------------
 1 | from chalk import realtime
 2 | from chalk.features import Features, features
 3 | 
 4 | 
 5 | # First, we'll define a set of features and resolvers:
 6 | @features
 7 | class HomeFeatures:
 8 |     id: int
 9 |     address: str
10 |     price: int
11 |     sq_ft: int
12 | 
13 | 
14 | @realtime
15 | def get_address(hid: HomeFeatures.id) -> HomeFeatures.address:
16 |     return "Bridge Street" if hid == 1 else "Filbert Street"
17 | 
18 | 
19 | @realtime
20 | def get_home_data(
21 |     hid: HomeFeatures.id,
22 | ) -> Features[HomeFeatures.price, HomeFeatures.sq_ft]:
23 |     return HomeFeatures(price=200_000, sq_ft=2_000)
24 | 
25 | 
26 | # Chalk lets you specify your feature pipelines using
27 | # idiomatic Python. This means that you can unit test
28 | # individual resolvers and combinations of resolvers,
29 | # since they're just Python functions.
30 | def test_single_output(): 31 | assert get_address(2) == "Filbert Street" 32 | 33 | 34 | # Dataclasses support equality, which can be used 35 | # to test resolvers which return multiple features. 36 | def test_multiple_output(): 37 | result = get_home_data(2) 38 | assert result.price == 200_000 39 | assert result.sq_ft == 2_000 40 | assert result != HomeFeatures( 41 | address="hello", 42 | price=200_000, 43 | sq_ft=2_000, 44 | ) 45 | assert result == HomeFeatures( 46 | price=200_000, 47 | sq_ft=2_000, 48 | ) 49 | -------------------------------------------------------------------------------- /full_examples/sagemaker/steps/training.py: -------------------------------------------------------------------------------- 1 | from sagemaker.workflow.function_step import step 2 | 3 | PARAM_GRID = { 4 | 'xgb__n_estimators': [20, 50, 100, 200], 5 | 'xgb__learning_rate': [0.01, 0.1, 0.2], 6 | 'xgb__max_depth': [3, 5, 7, 9], 7 | } 8 | 9 | @step( 10 | name="model-training", 11 | instance_type="ml.m5.xlarge", 12 | keep_alive_period_in_seconds=300, 13 | ) 14 | def train( 15 | xtrain_path: str, 16 | ytrain_path: str, 17 | num_rounds: int 18 | ): 19 | from sklearn.pipeline import Pipeline 20 | import pandas as pd 21 | from sklearn.preprocessing import StandardScaler 22 | from sklearn.impute import SimpleImputer 23 | from sklearn.ensemble import GradientBoostingClassifier 24 | from sklearn.model_selection import RandomizedSearchCV 25 | 26 | # read data files from S3 27 | X_train = pd.read_parquet(xtrain_path) 28 | y_train = pd.read_parquet(ytrain_path) 29 | 30 | pipeline = Pipeline( 31 | steps=[ 32 | ("impute", (SimpleImputer())), 33 | ("scaler", StandardScaler()), 34 | ("xgb", GradientBoostingClassifier()), 35 | ] 36 | ) 37 | rsc = RandomizedSearchCV( 38 | pipeline, 39 | param_distributions=PARAM_GRID, 40 | n_iter=num_rounds, 41 | cv=3, 42 | scoring="f1", 43 | n_jobs=-1, 44 | ) 45 | rsc.fit(X_train, y_train) 46 | 47 | return rsc.best_estimator_ 48 | -------------------------------------------------------------------------------- /github/features/github_feature_set.py: -------------------------------------------------------------------------------- 1 | import chalk.functions as F 2 | from chalk.features import ( 3 | Primary, 4 | _, 5 | features, 6 | has_one, 7 | ) 8 | 9 | from src.github.features import ( 10 | GithubArchive, 11 | GithubRepo, 12 | GithubRepoDocVDB, 13 | GithubUser, 14 | ) 15 | 16 | 17 | @features 18 | class GithubProject: 19 | path: Primary[str] 20 | project_is_valid_repo_path: bool = F.regexp_like( 21 | expr=_.path, 22 | pattern=r"^[a-zA-Z0-9_-]+\/[a-zA-Z0-9._-]+$", 23 | ) 24 | project_url: str | None = F.if_then_else( 25 | condition=_.project_is_valid_repo_path, 26 | if_true="https://github.com/" + _.path, 27 | if_false=None, 28 | ) 29 | 30 | username: GithubUser.name = F.split_part( 31 | expr=_.path, 32 | delimiter="/", 33 | index=0, 34 | ) 35 | 36 | user: GithubUser | None = has_one( 37 | lambda: GithubProject.username == GithubUser.name, 38 | ) 39 | 40 | repo: GithubRepo = has_one( 41 | lambda: GithubProject.path == GithubRepo.path, 42 | ) 43 | repo_language: str | None = F.coalesce( 44 | _.repo.language, 45 | "MISSING", 46 | ) 47 | 48 | archive: GithubArchive | None = has_one( 49 | lambda: GithubProject.path == GithubArchive.path, 50 | ) 51 | vdb: GithubRepoDocVDB | None = has_one( 52 | lambda: GithubProject.path == GithubRepoDocVDB.path, 53 | ) 54 | -------------------------------------------------------------------------------- /03_caching/5_override_cache_values.py: 
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | import requests
 4 | 
 5 | from chalk import realtime
 6 | from chalk.client import ChalkClient
 7 | from chalk.features import feature, features
 8 | 
 9 | 
10 | class FICOBucket(str, Enum):
11 |     HIGH = "HIGH"
12 |     MEDIUM = "MEDIUM"
13 |     LOW = "LOW"
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: int
19 |     name: str
20 |     fico_score: int = feature(max_staleness="30d")
21 |     fico_bucket: FICOBucket
22 | 
23 | 
24 | @realtime
25 | def get_fico_score(name: User.name) -> User.fico_score:
26 |     return requests.get(...).json()["score"]
27 | 
28 | 
29 | @realtime
30 | def discretize_fico_score(score: User.fico_score) -> User.fico_bucket:
31 |     if score > 700:
32 |         return FICOBucket.HIGH
33 |     if score > 600:
34 |         return FICOBucket.MEDIUM
35 |     return FICOBucket.LOW
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     # You can also override the cached _value_ (in addition to the cache
40 |     # duration, as described in 4_override_max_staleness) by providing
41 |     # the value as an input to the query.
42 |     #
43 |     # Here, we specify that the FICO score is 700. That value is passed
44 |     # to the resolver `discretize_fico_score` to compute `User.fico_bucket`,
45 |     # instead of running `get_fico_score`.
46 |     ChalkClient().query(
47 |         input={User.name: "Katherine Johnson", User.fico_score: 700},
48 |         output=[User.fico_bucket],
49 |     )
50 | 
--------------------------------------------------------------------------------
/03_caching/3_intermediates.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | import requests
 4 | 
 5 | from chalk import realtime
 6 | from chalk.client import ChalkClient
 7 | from chalk.features import feature, features
 8 | 
 9 | 
10 | class FICOBucket(str, Enum):
11 |     HIGH = "HIGH"
12 |     MEDIUM = "MEDIUM"
13 |     LOW = "LOW"
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: int
19 |     name: str
20 |     fico_score: int = feature(max_staleness="30d")
21 |     fico_bucket: FICOBucket
22 | 
23 | 
24 | @realtime
25 | def get_fico_score(name: User.name) -> User.fico_score:
26 |     return requests.get(...).json()["score"]
27 | 
28 | 
29 | @realtime
30 | def discretize_fico_score(score: User.fico_score) -> User.fico_bucket:
31 |     if score > 700:
32 |         return FICOBucket.HIGH
33 |     if score > 600:
34 |         return FICOBucket.MEDIUM
35 |     return FICOBucket.LOW
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     # Specifying the max-staleness value also holds when
40 |     # the cached feature is an intermediate result for your
41 |     # query, but not a desired output.
42 |     ChalkClient().query(
43 |         input={User.name: "Katherine Johnson"},
44 |         # User.fico_score is not requested in the output
45 |         output=[User.fico_bucket],
46 |         # ...but is necessary to compute User.fico_bucket.
47 |         # The requested feature `User.fico_bucket` is computed
48 |         # by running `discretize_fico_score`, which in turn
49 |         # depends on `User.fico_score`.
50 |     )
34 | flagged: bool = feature( 35 | max_staleness="infinity", 36 | underscore=F.sagemaker_predict( 37 | _.image_bytes, endpoint="image-model_1.0.1_2024-09-16" 38 | ), 39 | ) 40 | 41 | 42 | @features 43 | class Website: 44 | url: Primary[str] 45 | 46 | host: str = F.url_extract_host(_.url) 47 | 48 | # The html of the website 49 | html: str 50 | 51 | # The images associated with a given website 52 | images: DataFrame[Image] 53 | -------------------------------------------------------------------------------- /14_codegen/custom_model.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | from chalk.features import unwrap_feature 5 | 6 | 7 | class CustomModel: 8 | def __init__(self, url: str, dependencies: dict[str, Any], computes: Any): 9 | self.url = url 10 | self.dependencies = dependencies 11 | self.computes = unwrap_feature(computes) 12 | 13 | @classmethod 14 | def render_all(cls, *, header: str, path: Path | str, models: "list[CustomModel]"): 15 | children = "\n".join(model.render() for model in models) 16 | content = f"""{header} 17 | from chalk import online 18 | import requests 19 | 20 | 21 | {children} 22 | """ 23 | with open(path, "w") as f: 24 | f.write(content) 25 | 26 | def render(self): 27 | args: dict[str, str] = {} 28 | for k, v in self.dependencies.items(): 29 | f = unwrap_feature(v) 30 | args[k] = f"{f.features_cls.__name__}.{f.attribute_name}" 31 | json_body = ", ".join(f'"{k}": {k}' for k in args.keys()) 32 | returns = f"{self.computes.features_cls.__name__}.{self.computes.attribute_name}" 33 | 34 | return f""" 35 | @online 36 | def get_{self.computes.name}( 37 | {', '.join(f'{k}: {v}' for k, v in args.items())} 38 | ) -> {returns}: 39 | response = requests.post( 40 | "{self.url}", 41 | headers={{"accept": "application/json"}}, 42 | json={{{json_body}}}, 43 | ) 44 | return response.json().get("prediction") 45 | """ 46 | -------------------------------------------------------------------------------- /github/features/search/prompts.py: -------------------------------------------------------------------------------- 1 | SYSTEM_PROMPT: str = """ 2 | You are an AI assistant that processes GitHub repository descriptions and returns back the repository that best matches a given search query. 3 | Your goal is to evaluate, rank, and then return the repository that is most relevant based on the search query. 4 | Assess each repository's features, functionality, and use cases in relation to the search intent. 5 | Clearly explain why the repository is relevant and differentiate between highly relevant, partially relevant, and less relevant results. 6 | Maintain clarity and brevity while optimizing for accuracy and usefulness. 7 | """ 8 | 9 | USER_PROMPT: str = ( 10 | """ 11 | Given the following GitHub repositories: {{GithubSearch.urls_in}} 12 | 13 | Analyze them in relation to the search query: '{{GithubSearch.query}}' 14 | 15 | Here are the repository descriptions: {{GithubSearch.descriptions}} 16 | 17 | Give me back the one repository URL that is most relevant to the query. 18 | Clearly explain why each repository is relevant to the query, highlighting key features, functionality, and use cases. 19 | If a repository is only partially relevant, mention the relevant aspects while keeping the summary concise. 
20 | 21 | Generate a structured JSON response following this schema: 22 | 23 | ```json 24 | { 25 | "repo_url": "<str>", 26 | "confidence": <float>, 27 | "summary": "<str>" 28 | } 29 | ``` 30 | """ 31 | ) -------------------------------------------------------------------------------- /01_features/8_constructing_features.py: -------------------------------------------------------------------------------- 1 | from chalk.features import Features, features 2 | 3 | 4 | @features 5 | class Book: 6 | id: str 7 | name: str 8 | pages: int 9 | author: str 10 | 11 | 12 | # Feature classes function like data classes, except that they 13 | # are allowed to take only part of their arguments. 14 | # Here, we're not providing `author` or `id`, even though 15 | # they don't have default values or allow optional values. 16 | assert Book(name="Anna Karenina") == Book(name="Anna Karenina") 17 | assert Book(name="Anna Karenina") != Book(name="Anna Karenina", author="Leo Tolstoy") 18 | 19 | # Feature classes are a bag of `Features`. 20 | # If you use Chalk's mypy plugin, the types below will behave as you expect. 21 | x: Features[Book.author, Book.name] = Book(name="Anna Karenina", author="Leo Tolstoy") 22 | 23 | # `Features` is commutative, so `Features[A, B] == Features[B, A]` 24 | y: Features[Book.name, Book.author] = x 25 | 26 | # Features are iterable, and iterate as tuples of 27 | # (feature_name, feature_value) 28 | for feature_name, value in Book(name="Anna Karenina", pages=864): 29 | print(f"{feature_name=}, {value=}") 30 | 31 | # This iterable property means that features convert nicely into dicts 32 | assert dict(Book(name="Anna Karenina", pages=864)) == { 33 | "book.name": "Anna Karenina", 34 | "book.pages": 864, 35 | } 36 | 37 | # And also into lists 38 | assert [ 39 | ("book.name", "Anna Karenina"), 40 | ("book.pages", 864), 41 | ] == list(Book(name="Anna Karenina", pages=864)) 42 | -------------------------------------------------------------------------------- /03_caching/3_intermediates.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import requests 4 | 5 | from chalk import realtime 6 | from chalk.client import ChalkClient 7 | from chalk.features import feature, features 8 | 9 | 10 | class FICOBucket(str, Enum): 11 | HIGH = "HIGH" 12 | MEDIUM = "MEDIUM" 13 | LOW = "LOW" 14 | 15 | 16 | @features 17 | class User: 18 | id: int 19 | name: str 20 | fico_score: int = feature(max_staleness="30d") 21 | fico_bucket: FICOBucket 22 | 23 | 24 | @realtime 25 | def get_fico_score(name: User.name) -> User.fico_score: 26 | return requests.get(...).json()["score"] 27 | 28 | 29 | @realtime 30 | def discretize_fico_score(score: User.fico_score) -> User.fico_bucket: 31 | if score > 700: 32 | return FICOBucket.HIGH 33 | if score > 600: 34 | return FICOBucket.MEDIUM 35 | return FICOBucket.LOW 36 | 37 | 38 | if __name__ == "__main__": 39 | # Specifying the max-staleness value also holds when 40 | # the cached feature is an intermediate result for your 41 | # query, but not a desired output. 42 | ChalkClient().query( 43 | input={User.name: "Katherine Johnson"}, 44 | # User.fico_score is not requested in the output 45 | output=[User.fico_bucket], 46 | # ...but is necessary to compute User.fico_bucket. 47 | # The requested feature `User.fico_bucket` is computed 48 | # by running `discretize_fico_score`, which in turn 49 | # depends on `User.fico_score`.
50 | staleness={User.fico_score: "10m"}, 51 | ) 52 | -------------------------------------------------------------------------------- /04_scheduling/1_cron.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from chalk import online 4 | from chalk.features import features 5 | 6 | 7 | @features 8 | class User: 9 | id: str 10 | name: str 11 | email: str 12 | credit_score: str 13 | 14 | 15 | # By default, resolvers with a `cron` parameter will sample the latest 16 | # versions of the data. Imagine you had historically resolved the following 17 | # features: 18 | # 19 | # | Time | ID | Email | Name | 20 | # | :--: | :-: | -------------------- | ----------- | 21 | # | 0 | 1 | elliot@chalk.ai | Elliot | 22 | # | 1 | 2 | andy@chalk.ai | Andy | 23 | # | 2 | 1 | | Elliot Marx | 24 | # | 3 | 2 | elliot.marx@chalk.ai | | 25 | # 26 | # Then, we would sample the following pairs and invoke the resolver 27 | # with these arguments: 28 | # 29 | # | Email | Name | 30 | # | -------------------- | ----------- | 31 | # | elliot.marx@chalk.ai | Elliot Marx | 32 | # | andy@chalk.ai | Andy | 33 | # 34 | # Note that we don't sample (elliot.marx@chalk.ai, Elliot), 35 | # for example, as those features are not the latest values 36 | # for a given id. 37 | # 38 | # The argument to cron can use the Chalk duration type, 39 | # or take a crontab-formatted string: 40 | # e.g.: @online(cron="*/5 * * * *") 41 | 42 | 43 | @online(cron="30d") 44 | def get_credit_score(name: User.name, email: User.email) -> User.credit_score: 45 | return requests.get("https://experian.com").json()["score"] 46 | -------------------------------------------------------------------------------- /05_feature_discovery/2_owners.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | from chalk import owner 4 | from chalk.features import feature, features 5 | 6 | 7 | @features 8 | class User: 9 | id: str 10 | # Owners are specified via code comments: 11 | # :owner: katherine.johnson@nasa.gov 12 | name: str 13 | 14 | 15 | @features 16 | class User1: 17 | id: str 18 | # or explicitly with `feature(owner=...)`: 19 | name: str = feature(owner="katherine.johnson@nasa.gov") 20 | 21 | 22 | # Setting an owner through the `@features` decorator 23 | # determines the owner of all the features on the class 24 | @features(owner="katherine.johnson@nasa.gov") 25 | class User2: 26 | id: str # assigned the owner katherine.johnson@nasa.gov 27 | name: str # assigned the owner katherine.johnson@nasa.gov 28 | 29 | 30 | # Owners on features are more specific than owners 31 | # set via the `@features` decorator.
32 | @features(owner="katherine.johnson@nasa.gov") 33 | class User3: 34 | # Katherine is the owner of the id and dob feature, 35 | # because she is the owner set in the `@features` decorator 36 | id: str 37 | dob: date 38 | 39 | # Annie is the owner of this feature because she is set 40 | # as the owner at the feature level, which is more specific 41 | # than the owner from the feature class 42 | # :owner: annie.easley@nasa.gov 43 | name: str 44 | 45 | 46 | # The function `chalk.features.owner(...)` returns the owner of a feature 47 | assert owner(User3.name) == "annie.easley@nasa.gov" 48 | assert owner(User3.id) == "katherine.johnson@nasa.gov" 49 | -------------------------------------------------------------------------------- /marketplace/item_category/item_category_value_enum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ItemCategoryValueEnum(str, Enum): 5 | ARTS_AND_PHOTOGRAPHY = "Arts & Photography" 6 | BIOGRAPHIES_AND_MEMOIRS = "Biographies & Memoirs" 7 | BUSINESS_AND_MONEY = "Business & Money" 8 | CHILDRENS_BOOKS = "Children's Books" 9 | COMICS_AND_GRAPHIC_NOVELS = "Comics & Graphic Novels" 10 | COMPUTERS_AND_TECHNOLOGY = "Computers & Technology" 11 | COOKBOOKS_FOOD_AND_WINE = "Cookbooks, Food & Wine" 12 | CRAFTS_HOBBIES_AND_HOME = "Crafts, Hobbies & Home" 13 | EDUCATION_AND_TEACHING = "Education & Teaching" 14 | ENGINEERING_AND_TRANSPORTATION = "Engineering & Transportation" 15 | HEALTH_FITNESS_AND_DIETING = "Health, Fitness & Dieting" 16 | HISTORY = "History" 17 | HUMOR_AND_ENTERTAINMENT = "Humor & Entertainment" 18 | LAW = "Law" 19 | LITERATURE_AND_FICTION = "Literature & Fiction" 20 | MEDICAL_BOOKS = "Medical Books" 21 | MYSTERY_THRILLER_AND_SUSPENSE = "Mystery, Thriller & Suspense" 22 | PARENTING_AND_RELATIONSHIPS = "Parenting & Relationships" 23 | POLITICS_AND_SOCIAL_SCIENCES = "Politics & Social Sciences" 24 | REFERENCE = "Reference" 25 | RELIGION_AND_SPIRITUALITY = "Religion & Spirituality" 26 | ROMANCE = "Romance" 27 | SCIENCE_AND_MATH = "Science & Math" 28 | SCIENCE_FICTION_AND_FANTASY = "Science Fiction & Fantasy" 29 | SELF_HELP = "Self-Help" 30 | SPORTS_AND_OUTDOORS = "Sports & Outdoors" 31 | TEEN_AND_YOUNG_ADULT = "Teen & Young Adult" 32 | TEST_PREPARATION = "Test Preparation" 33 | TRAVEL = "Travel" 34 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/tests/test_denylisted.py: -------------------------------------------------------------------------------- 1 | from chalk.client import ChalkClient 2 | from src.models import Transaction, User 3 | 4 | 5 | def test_email_features(client: ChalkClient): 6 | client.check( 7 | input={ 8 | User.id: 1, 9 | User.email: "monica.1984+123@gmail.com", 10 | User.name: "Monica Geller", 11 | }, 12 | assertions={ 13 | User.email_username: "monica1984", 14 | User.domain_name: "gmail.com", 15 | User.name_email_match_score: 39.89, 16 | }, 17 | ) 18 | """ 19 | Chalk Feature Value Check Table 20 | ┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓ 21 | ┃ Kind ┃ Name ┃ Value ┃ 22 | ┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩ 23 | │ Match │ user.domain_name │ gmail.com │ 24 | │ Match │ user.email_username │ monica1984 │ 25 | │ Expect │ user.name_email_match_score │ 39.89 │ 26 | │ Actual │ user.name_email_match_score │ 62.5 │ 27 | └────────┴─────────────────────────────┴────────────┘ 28 | """ 29 | 30 | 31 | def test_transactions(client: ChalkClient): 32 | client.check( 33 | 
input={ 34 | User.id: 1, 35 | User.transactions: [ 36 | Transaction(id=1, amount=110.0), 37 | Transaction(id=2, amount=900.0), 38 | Transaction(id=3, amount=300.0), 39 | ], 40 | }, 41 | assertions={ 42 | User.total_spend: 1310.0, 43 | }, 44 | ) 45 | -------------------------------------------------------------------------------- /04_scheduling/2_filtered_cron.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import requests 4 | 5 | from chalk import Cron, Now, online 6 | from chalk.features import feature, features 7 | 8 | 9 | @features 10 | class User: 11 | id: int 12 | name: str 13 | email: str 14 | status: str 15 | last_login: datetime 16 | fico_score: int = feature(max_staleness="30d") 17 | 18 | 19 | # Filter functions can take in any features as arguments, and must 20 | # output True or False to indicate whether to consider a given entity 21 | # in a scheduled run 22 | def only_active_filter( 23 | last_login: User.last_login, status: User.status, now: Now 24 | ) -> bool: 25 | return status == "active" and last_login > (now - timedelta(days=30)) 26 | 27 | 28 | # You may want to run your cron jobs only on a subset of your userbase. 29 | # Imagine, for example, that you wanted to regularly pull credit scores 30 | # for only users who had logged in within the last 30 days. 31 | # 32 | # To do that, pass the keyword argument `cron` an instance of `Cron`, 33 | # and provide a filter function. The filter function should take as arguments 34 | # any feature values that it needs to output a boolean answer for whether 35 | # an entity should be considered for scheduled runs. 36 | # 37 | # Note that in this example, our active filter depends on two features 38 | # that are not part of our resolver's arguments. 39 | @online(cron=Cron(schedule="29d 11h", filter=only_active_filter)) 40 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 41 | return requests.get("https://experian.com").json()["score"] 42 | -------------------------------------------------------------------------------- /fraud/5_account_takeover.py: -------------------------------------------------------------------------------- 1 | """An example of using streams and windowed features to calculate 2 | the number of failed logins for a user, at different time slices. 3 | """ 4 | 5 | from enum import Enum 6 | from pydantic import BaseModel 7 | 8 | from chalk.features import ( 9 | DataFrame, 10 | features, 11 | ) 12 | from chalk.streams import KafkaSource, stream, windowed, Windowed 13 | 14 | 15 | @features 16 | class User: 17 | id: str 18 | 19 | # Chalk makes it easy to calculate time-windowed features; 20 | # below we calculate the number of failed logins in the 21 | # past 10 minutes, 30 minutes, and 1 hour. 22 | failed_logins: Windowed[int] = windowed("10m", "30m", "1h") 23 | 24 | 25 | # Set up a stream source; this can be configured from your Chalk 26 | # dashboard, in the Data Sources tab.
27 | source = KafkaSource(name="sensor_stream") 28 | 29 | 30 | class LoginStatus(Enum): 31 | success = "success" 32 | failed = "failed" 33 | 34 | 35 | class LoginMessage(BaseModel): 36 | user_id: str 37 | status: LoginStatus 38 | 39 | 40 | @stream(source=source, mode="continuous") 41 | def agg_logins(df: DataFrame[LoginMessage]) -> DataFrame[User]: 42 | # If a resolver's annotations say it takes and returns a DataFrame, 43 | # but the function body actually returns a string, Chalk treats that 44 | # string as a SQL query and executes it against the passed-in DataFrame. 45 | return f""" 46 | select 47 | count(*) as failed_logins, 48 | user_id as id 49 | from {df} 50 | where status = 'failed' 51 | group by id 52 | """ 53 | -------------------------------------------------------------------------------- /03_caching/1_basic_caching.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from chalk import realtime 4 | from chalk.client import ChalkClient 5 | from chalk.features import feature, features 6 | 7 | 8 | @features 9 | class User: 10 | id: int 11 | name: str 12 | 13 | # Here, we can specify the default maximum staleness 14 | # that we'll tolerate for a feature. 15 | # You can also override this setting when you go to fetch 16 | # the feature! See 4_override_max_staleness.py for more info 17 | fico_score: int = feature(max_staleness="30d") 18 | 19 | 20 | # This function is both slow and expensive to run, 21 | # but because we're caching the `User.fico_score` 22 | # feature, it won't run every time we need the feature! 23 | @realtime 24 | def get_fico_score(name: User.name) -> User.fico_score: 25 | response = requests.get( 26 | "https://experian.com/api/score", 27 | json={"name": name}, 28 | ).json() 29 | return response["fico"] 30 | 31 | 32 | if __name__ == "__main__": 33 | # The first time that we run this query, 34 | # `get_fico_score` will call out to Experian, 35 | # because the FICO score is not available. 36 | ChalkClient().query( 37 | input={User.name: "Katherine Johnson"}, 38 | output=[User.fico_score], 39 | ) 40 | 41 | # The second time that we run this query with 42 | # the same name, however, `get_fico_score` will 43 | # NOT call out to Experian, because we have computed 44 | # the FICO score for this user in the last 30 days. 45 | ChalkClient().query( 46 | input={User.name: "Katherine Johnson"}, 47 | output=[User.fico_score], 48 | ) 49 | -------------------------------------------------------------------------------- /05_feature_discovery/README.md: -------------------------------------------------------------------------------- 1 | # Feature Discovery 2 | Capture metadata to inform alerting, monitoring, and discovery. 3 | 4 | https://docs.chalk.ai/docs/feature-discovery 5 | 6 | ## 1. Descriptions 7 | Describe features at a feature class or feature level. 8 | 9 | **[1_descriptions.py](1_descriptions.py)** 10 | 11 | ```python 12 | @features 13 | class RocketLaunch: 14 | # Feature descriptions are parsed from your code! 15 | launched_at: datetime 16 | ``` 17 | https://docs.chalk.ai/docs/feature-discovery#description 18 | 19 | ## 2. Owners 20 | Assign owners to features for monitoring and alerting. 21 | 22 | **[2_owners.py](2_owners.py)** 23 | 24 | ```python 25 | @features(owner="default-owner@gmail.com") 26 | class RocketLaunch: 27 | # :owner: specific-owner@gmail.com 28 | launched_at: datetime 29 | ``` 30 | https://docs.chalk.ai/docs/feature-discovery#owner 31 | 32 | ## 3. Tags 33 | Tag related features.
34 | 35 | **[3_tags.py](3_tags.py)** 36 | 37 | ```python 38 | @features(tags="group:risk") 39 | class RiskReport: 40 | id: str 41 | risk_score: str 42 | # :tags: pii 43 | first_name: str 44 | ``` 45 | https://docs.chalk.ai/docs/feature-discovery#tags 46 | 47 | ## 4. Tags & Owners 48 | Assigning tags & owners to features. 49 | 50 | **[4_unified.py](4_unified.py)** 51 | 52 | ```python 53 | @features(owner="shuttle@nasa.gov", tags="group:rocketry") 54 | class SpaceShuttle: 55 | # The volume of this shuttle in square meters. 56 | # :owner: architecture@nasa.gov 57 | # :tags: zillow-fact, size 58 | volume: str 59 | 60 | assert tags(SpaceShuttle.volume) == ["zillow-fact", "size", "group:rocketry"] 61 | ``` 62 | -------------------------------------------------------------------------------- /09_github_actions/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions 2 | Deploy feature pipelines in GitHub Actions. 3 | 4 | Docs: https://docs.chalk.ai/docs/github-actions 5 | 6 | CLI Step: https://github.com/chalk-ai/cli-action 7 | 8 | Deploy Step: https://github.com/chalk-ai/deploy-action 9 | 10 | ## 1. Install Chalk CLI 11 | Install the Chalk CLI in a GitHub Action. 12 | 13 | **[1_install_chalk_cli.yaml](1_install_chalk_cli.yaml)** 14 | 15 | ```yaml 16 | - uses: chalk-ai/cli-action@v2 17 | with: 18 | client-id: ${{secrets.CHALK_CLIENT_ID}} 19 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 20 | ``` 21 | Docs: https://docs.chalk.ai/docs/github-actions 22 | 23 | Step: https://github.com/chalk-ai/cli-action 24 | 25 | ## 2. Deploy with Chalk 26 | Deploy to Chalk (either as a preview deployment or to production). 27 | 28 | **[2_deploy_with_chalk.yaml](2_deploy_with_chalk.yaml)** 29 | 30 | ```yaml 31 | - uses: chalk-ai/deploy-action@v2 32 | with: 33 | client-id: ${{secrets.CHALK_CLIENT_ID}} 34 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 35 | await: true 36 | ``` 37 | Docs: https://docs.chalk.ai/docs/github-actions 38 | 39 | Step: https://github.com/chalk-ai/deploy-action 40 | 41 | ## 3. Preview deployments 42 | Set up preview deployments for all PRs. 43 | 44 | **[3_deploy_preview.yaml](3_deploy_preview.yaml)** 45 | 46 | ```yaml 47 | - uses: chalk-ai/deploy-action@v2 48 | with: 49 | client-id: ${{secrets.CHALK_CLIENT_ID}} 50 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 51 | await: true 52 | no-promote: true 53 | ``` 54 | Docs: https://docs.chalk.ai/docs/github-actions 55 | 56 | Step: https://github.com/chalk-ai/deploy-action 57 | -------------------------------------------------------------------------------- /full_examples/batch_ml/tests/test_batch_prediction.py: -------------------------------------------------------------------------------- 1 | from chalk import DataFrame 2 | from datetime import datetime, timedelta 3 | from src.resolvers.fraud_model import run_fraud_model 4 | from src.models import User, Transaction 5 | 6 | # Chalk provides a simple interface for unit tests that works with 7 | # pytest or any other python testing framework: https://docs.chalk.ai/docs/unit-tests 8 | # since chalk resolvers are just python functions, you can test them 9 | # just like you'd unit test any other python function. 
10 | 11 | 12 | def test_fraud_model(): 13 | # call the python resolver and assert the result 14 | input = DataFrame( 15 | { 16 | Transaction.id: [1, 2, 3, 4], 17 | Transaction.amount: [10, 100, 50, 200], 18 | Transaction.user.time_since_last_transaction: [ 19 | timedelta(days=30).total_seconds(), 20 | timedelta(days=10).total_seconds(), 21 | timedelta(days=5).total_seconds(), 22 | timedelta(days=60).total_seconds(), 23 | ], 24 | Transaction.user.num_transactions["1d"]: [1, 0, 2, 0], 25 | Transaction.user.num_transactions["10d"]: [5, 2, 6, 0], 26 | Transaction.user.num_transactions["30d"]: [10, 4, 12, 1], 27 | Transaction.user.num_distinct_merchants_transacted["1d"]: [1, 0, 2, 0], 28 | Transaction.user.num_distinct_merchants_transacted["10d"]: [2, 1, 3, 0], 29 | Transaction.user.num_distinct_merchants_transacted["30d"]: [3, 2, 4, 1], 30 | } 31 | ) 32 | result = run_fraud_model(input) 33 | assert isinstance(result, DataFrame) 34 | assert len(result) == 4 35 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/experian/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import date 3 | 4 | from chalk import DataFrame 5 | from src.models import User 6 | 7 | 8 | class ExperianClient: 9 | def __init__(self, api_key: str): 10 | self.api_key = api_key 11 | 12 | def fetch_credit_report( 13 | self, 14 | name: str, 15 | dob: date, 16 | ): 17 | return DataFrame( 18 | { 19 | User.credit_report.id: [123], 20 | User.credit_report.raw: [ 21 | json.dumps( 22 | { 23 | "Tradelines": [ 24 | { 25 | "Id": 1, 26 | "OpenDate": "2021-01-01", 27 | "Balance": 7203.40, 28 | "Amount": 10000.0, 29 | "AmountPastDue": 0.0, 30 | "PaymentAmount": 200.0, 31 | }, 32 | { 33 | "Id": 2, 34 | "OpenDate": "2021-01-01", 35 | "Balance": 7203.40, 36 | "Amount": 10000.0, 37 | "AmountPastDue": 0.0, 38 | "PaymentAmount": 200.0, 39 | }, 40 | ], 41 | } 42 | ) 43 | ], 44 | } 45 | ) 46 | 47 | -------------------------------------------------------------------------------- /call_recordings/features/fathom/fathom_meeting_insights_sales.py: -------------------------------------------------------------------------------- 1 | # trunk-ignore-all(ruff/W291) 2 | from __future__ import annotations 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | class StructuredOutputCallInsights(BaseModel): 8 | """Minimal schema matching the simple JSON format in the prompt. 9 | Extend it if you need richer analytics and UI fields. 10 | """ 11 | 12 | reasons_for_meeting: str = Field( 13 | ..., 14 | description="""Plain-text reasons. Return "None" if there wasn't anything mentioned.""", 15 | ) 16 | risk_flag: str = Field( 17 | ..., 18 | description="""One-sentence risk summaries. Return "None" if there wasn't anything mentioned.""", 19 | ) 20 | 21 | 22 | def prompt_meeting_insights_sales_user() -> str: 23 | return """ 24 | You are analyzing a transcript of a B2B sales call between a representative from Chalk, a data infrastructure platform, and a prospective customer. 25 | Read the entire transcript carefully and extract the following insights in JSON format: 26 | 27 | 1. **Reasons for Meeting**: The stated reasons the prospect gave for agreeing to take the call or meet with Chalk. If none are stated explicitly, write `"None"`. 28 | 2. **Risk Flag**: Any risks raised by the prospect related to security, legal, or project timeline. If mentioned, summarize the risk in one sentence per item.
If none, write `"None"`. 29 | 30 | **Output format**: 31 | ```json 32 | { 33 | "reasons_for_meeting": "string" or "None", 34 | "risk_flag": "string" or "None" 35 | } 36 | ``` 37 | THE TRANSCRIPT TO ANALYZE: 38 | 39 | {{ FathomCall.transcript }} 40 | """ 41 | -------------------------------------------------------------------------------- /unstructured_data/src/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import date 3 | 4 | from chalk import DataFrame, FeatureTime, Windowed, _, feature, windowed 5 | from chalk.features import features 6 | 7 | default_completion = json.dumps( 8 | dict( 9 | category="unknown", 10 | is_nsf=False, 11 | is_ach=False, 12 | clean_memo="", 13 | ) 14 | ) 15 | 16 | 17 | @features 18 | class Transaction: 19 | id: int 20 | amount: float 21 | memo: str 22 | 23 | # :tags: genai 24 | clean_memo: str 25 | 26 | # The User.id type defines our join key implicitly 27 | user_id: "User.id" 28 | user: "User" 29 | 30 | # The time at which the transaction was created for temporal consistency 31 | at: FeatureTime 32 | 33 | completion: str = feature(max_staleness="infinity", default=default_completion) 34 | 35 | category: str = "unknown" 36 | is_nsf: bool = False 37 | is_ach: bool = False 38 | 39 | 40 | @features 41 | class User: 42 | # Features pulled from Postgres for the user 43 | id: int 44 | email: str 45 | name: str 46 | dob: date 47 | 48 | # Whether the user appears in a denylist in s3 49 | denylisted: bool 50 | 51 | # The transactions, linked by the User.id type on the Transaction.user_id field 52 | transactions: DataFrame[Transaction] 53 | 54 | # The number of payments made by the user in the last 1, 7, and 30 days 55 | # Uses the category pulled from Gemini to count payments 56 | count_payments: Windowed[int] = windowed( 57 | "1d", "7d", "30d", 58 | expression=_.transactions[ 59 | _.amount, 60 | _.at >= _.chalk_window, 61 | _.category == "payment" 62 | ].count(), 63 | ) 64 | -------------------------------------------------------------------------------- /04_scheduling/README.md: -------------------------------------------------------------------------------- 1 | # Scheduling 2 | Run resolvers on a schedule, sampling values 3 | for the inputs. 4 | 5 | https://docs.chalk.ai/docs/resolver-cron 6 | 7 | ## 1. Cron 8 | Run resolvers on a schedule with all possible arguments. 9 | 10 | **[1_cron.py](1_cron.py)** 11 | 12 | ```python 13 | @realtime(cron="30d") 14 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 15 | return ... 16 | ``` 17 | https://docs.chalk.ai/docs/resolver-cron 18 | 19 | ## 2. Filtered Cron 20 | Run resolvers on a schedule and filter down which examples to consider. 21 | 22 | **[2_filtered_cron.py](2_filtered_cron.py)** 23 | 24 | ```python 25 | def only_active_filter(last_login: User.last_login, status: User.status) -> bool: 26 | return status == "active" and last_login > datetime.now() - timedelta(days=30) 27 | 28 | @realtime(cron=Cron(schedule="29d 11h", filter=only_active_filter)) 29 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 30 | return requests.get("https://experian.com").json()["score"] 31 | ``` 32 | https://docs.chalk.ai/docs/resolver-cron#filtering-examples 33 | 34 | ## 3. Sampling Cron 35 | Pick exactly the examples that you’d like to run.
36 | 37 | **[3_sample_arguments.py](3_sample_arguments.py)** 38 | 39 | ```python 40 | def get_active_users() -> DataFrame[User.id]: 41 | return session.query_string( 42 | "select users.id from users where users.active = true", 43 | fields={"id": User.id}, 44 | ).all() 45 | 46 | @realtime(cron=Cron(schedule="29d 11h", sample=get_active_users)) 47 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 48 | return requests.get("https://experian.com").json()["score"] 49 | ``` 50 | https://docs.chalk.ai/docs/resolver-cron#custom-examples 51 | -------------------------------------------------------------------------------- /02_resolvers/5_tagged_resolvers.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | 3 | from mocks import lexus_nexus 4 | 5 | from chalk import online 6 | from chalk.client import ChalkClient, OnlineQueryContext 7 | from chalk.features import features 8 | 9 | 10 | @features 11 | class User: 12 | id: int 13 | email: str 14 | email_domain: str 15 | email_risk_score: float 16 | banned_email: bool 17 | 18 | 19 | # If a request for features is made under the tag 20 | # `mock`, then this resolver will run. 21 | @online(tags="mock") 22 | def mock_check_banned_email(domain: User.email_domain) -> User.banned_email: 23 | if domain == "chalk.ai": 24 | return False 25 | if domain == "fraudster.com": 26 | return True 27 | return random() < 0.1 28 | 29 | 30 | @online 31 | def get_email_risk_score(email: User.email) -> User.email_risk_score: 32 | return lexus_nexus.get_email_risk(email).risk_score 33 | 34 | 35 | # If a request for features is made _without_ the tag 36 | # `mock`, then this resolver will run. 37 | # 38 | # Note that the two resolvers that resolve the feature 39 | # User.banned_email require different features as input!
40 | @online 41 | def check_banned_email(score: User.email_risk_score) -> User.banned_email: 42 | return score >= 0.8 43 | 44 | 45 | if __name__ == "__main__": 46 | result = ChalkClient().query( 47 | input={User.email: "katherine.johnson@nasa.gov"}, 48 | output=[User.banned_email], 49 | ) 50 | assert result.get_feature_value(User.banned_email) == False 51 | 52 | result = ChalkClient().query( 53 | input={User.email: "attacker@fraudster.com"}, 54 | output=[User.banned_email], 55 | context=OnlineQueryContext(tags=["mock"]), 56 | ) 57 | assert result.get_feature_value(User.banned_email) == True 58 | -------------------------------------------------------------------------------- /credit/3_bureau_api.py: -------------------------------------------------------------------------------- 1 | """An example of connecting Users to Credit Reports from a 2 | third-party API. 3 | 4 | In this example, we are getting Credit Reports for our 5 | users through a third-party API. This example shows how 6 | you can run arbitrary python code (and connect to third-party 7 | APIs) in a python resolver. 8 | """ 9 | 10 | import os 11 | import requests 12 | 13 | from chalk import online 14 | 15 | from chalk.features import features, has_many, DataFrame, Primary 16 | 17 | 18 | @features 19 | class CreditReport: 20 | # if a feature doesn't have an id field, the Primary key must be specified 21 | report_id: Primary[str] 22 | user_id: "User.id" 23 | # The raw report, which we'll save as a plain string 24 | # to parse and extract later.
25 | report: str 26 | 27 | 28 | @features 29 | class User: 30 | id: int 31 | first_name: str 32 | last_name: str 33 | # Adds the pii tag to the ssn feature (https://docs.chalk.ai/docs/feature-discovery#tags) 34 | # :tags: pii 35 | ssn: str 36 | city: str 37 | state: str 38 | credit_report: DataFrame[CreditReport] 39 | 40 | 41 | # Inject a secret through the Chalk dashboard (https://docs.chalk.ai/docs/env-vars) 42 | url = os.getenv("MY_VENDOR_URL") 43 | 44 | 45 | @online 46 | def get_credit_report( 47 | first_name: User.first_name, 48 | last_name: User.last_name, 49 | city: User.city, 50 | state: User.state, 51 | id: User.id, 52 | ) -> CreditReport: 53 | """ 54 | This resolver populates the credit report feature for a user by making a request to 55 | a third party API. 56 | """ 57 | res = requests.get( 58 | f"{url}/transunion/credit-report", 59 | json={ 60 | "firstName": first_name, 61 | "lastName": last_name, 62 | "city": city, 63 | "state": state, 64 | }, 65 | ).json() 66 | return CreditReport(user_id=id, report_id=res["pdfReportId"], report=res["data"]) 67 | -------------------------------------------------------------------------------- /06_dataframe/6_self_joins.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import DataFrame, features, has_many, has_one 3 | 4 | 5 | @features 6 | class SeriesLink: 7 | id: int 8 | books: "DataFrame[Book]" 9 | 10 | 11 | @features 12 | class Author: 13 | id: int 14 | name: str 15 | books: "DataFrame[Book]" 16 | 17 | 18 | @features 19 | class PrequelLink: 20 | id: int 21 | prequel_id: int 22 | book: "Book" = has_one(lambda: Book.id == PrequelLink.prequel_id) 23 | 24 | 25 | @features 26 | class Book: 27 | id: int 28 | title: str 29 | author_id: Author.id 30 | prequel_id: PrequelLink.id | None 31 | prequel: PrequelLink | None = has_one(lambda: Book.id == PrequelLink.prequel_id) 32 | series_id: SeriesLink.id | None 33 | series: SeriesLink = has_one(lambda: SeriesLink.id == Book.series_id) 34 | 35 | 36 | @online 37 | def get_books() -> DataFrame[Book]: 38 | return DataFrame( 39 | [ 40 | Book( 41 | id=1, 42 | title="To the Lighthouse", 43 | author_id=1, 44 | series_id=None, 45 | prequel_id=None, 46 | ), 47 | Book( 48 | id=2, 49 | title="The Fellowship of the Ring", 50 | series_id=1, 51 | author_id=2, 52 | prequel_id=None, 53 | ), 54 | Book(id=3, title="The Two Towers", series_id=1, author_id=2, prequel_id=2), 55 | Book( 56 | id=4, 57 | title="The Return of the King", 58 | series_id=1, 59 | author_id=2, 60 | prequel_id=3, 61 | ), 62 | ] 63 | ) 64 | 65 | 66 | @online 67 | def get_prequel_links() -> DataFrame[PrequelLink]: 68 | return DataFrame([PrequelLink(id=3, prequel_id=2), PrequelLink(id=4, prequel_id=3)]) 69 | 70 | 71 | @online 72 | def get_series_links() -> DataFrame[SeriesLink]: 73 | return DataFrame([SeriesLink(id=1)]) 74 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/models.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from chalk.features import features, DataFrame, _ 3 | from chalk import windowed, Windowed 4 | import chalk.functions as F 5 | 6 | 7 | @features 8 | class User: 9 | id: int 10 | name: str 11 | created_at: datetime 12 | transactions: "DataFrame[Transaction]" 13 | 14 | # The amount of time since the user was created in seconds. 
15 | time_since_creation: int = F.total_seconds(_.chalk_now - _.created_at) 16 | 17 | # The number of transactions the user has made in the last 1, 10, and 30 days. 18 | num_transactions: Windowed[int] = windowed( 19 | "1d", 20 | "10d", 21 | "30d", 22 | expression=_.transactions[_.ts < _.chalk_now, _.ts >= _.chalk_window].count(), 23 | ) 24 | 25 | # The latest transaction timestamp for the user, considering only transactions before now. 26 | latest_transaction_timestamp: datetime = _.transactions[ 27 | _.ts, _.ts < _.chalk_now 28 | ].max() 29 | 30 | # The time since the last transaction in seconds. 31 | time_since_last_transaction: int = F.total_seconds( 32 | _.chalk_now - _.latest_transaction_timestamp 33 | ) 34 | 35 | # The number of distinct merchants the user has transacted with in the last 36 | # 1, 10, and 30 days. 37 | num_distinct_merchants_transacted: Windowed[int] = windowed( 38 | "1d", 39 | "10d", 40 | "30d", 41 | expression=_.transactions[ 42 | _.merchant_id, _.ts < _.chalk_now, _.ts >= _.chalk_window 43 | ].approx_count_distinct(), 44 | ) 45 | 46 | # The churn prediction is a float between 0 and 1, where 1 means the user is 47 | # predicted to churn. 48 | churn_prediction: float 49 | 50 | 51 | @features 52 | class Transaction: 53 | id: int 54 | amount: float 55 | merchant_id: int 56 | ts: datetime 57 | user_id: User.id 58 | user: User 59 | category: str 60 | 61 | # model score predicted by a scheduled job 62 | is_fraud: bool 63 | -------------------------------------------------------------------------------- /full_examples/image_processing/src/resolvers.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from cairosvg import svg2png 6 | from chalk import online 7 | from chalk.features import DataFrame, Features 8 | from PIL import Image as PI 9 | 10 | from src.feature_sets import Image, Website 11 | 12 | 13 | @online 14 | def get_html(url: Website.url) -> Website.html: 15 | """Get the HTML of a website.""" 16 | res = requests.get(url) 17 | return res.content 18 | 19 | 20 | def process_url(image_src, host): 21 | if image_src.startswith("https://") or image_src.startswith("http://"): 22 | return image_src 23 | elif image_src.startswith("//"): 24 | return f"https:{image_src}" 25 | return f"https://{host}/{image_src.strip('/')}" 26 | 27 | 28 | @online 29 | def get_images( 30 | html: Website.html, website_url: Website.url, website_host: Website.host 31 | ) -> Website.images[Image.url, Image.source_url]: 32 | """Extract all images from the HTML of a website.""" 33 | soup = BeautifulSoup(html, "html.parser") 34 | 35 | return DataFrame( 36 | [ 37 | Image( 38 | url=process_url(it["src"], website_host), 39 | source_url=website_url, 40 | ) 41 | for it in soup.find_all("img") 42 | ] 43 | ) 44 | 45 | 46 | @online 47 | def get_image_bytes( 48 | image_url: Image.url, 49 | ) -> Image.image_bytes: 50 | """Get the image as bytes from the image's URL.""" 51 | res = requests.get(image_url) 52 | return res.content 53 | 54 | 55 | @online 56 | def get_image_shape( 57 | image_bytes: Image.image_bytes, image_type: Image.type 58 | ) -> Features[Image.x, Image.y]: 59 | """Read the image using pillow and get its dimensions""" 60 | pil_bytes = io.BytesIO() 61 | if image_type == "svg": 62 | svg2png(bytestring=image_bytes, write_to=pil_bytes) 63 | else: 64 | pil_bytes.write(image_bytes) 65 | x, y = PI.open(pil_bytes).size 66 | return Image(x=x, y=y) 67 | 
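68 | 69 | 70 | # Usage sketch (added for illustration; not part of the original pipeline). 71 | # It assumes a deployed Chalk environment, and the URL below is a placeholder. 72 | # Requesting `Website.images` runs `get_html`, then `get_images`, and then the 73 | # per-image resolvers above for each extracted image. 74 | if __name__ == "__main__": 75 | from chalk.client import ChalkClient 76 | 77 | result = ChalkClient().query( 78 | input={Website.url: "https://example.com"}, 79 | output=[Website.host, Website.images], 80 | ) 81 | print(result.get_feature_value(Website.images))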
-------------------------------------------------------------------------------- /ecommerce/2_interactions.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from chalk import online 4 | from chalk.features import DataFrame, FeatureTime, features, _, has_many 5 | 6 | 7 | @features 8 | class Seller: 9 | id: str 10 | categories: set[str] 11 | 12 | 13 | @features 14 | class User: 15 | id: str 16 | age: int 17 | favorite_categories: set[str] 18 | 19 | 20 | @features 21 | class UserSeller: 22 | id: str 23 | user_id: User.id 24 | user: User 25 | seller_id: Seller.id 26 | seller: Seller 27 | favorites_match: bool 28 | user_seller_score: int 29 | interactions: "DataFrame[Interaction]" = has_many( 30 | lambda: (UserSeller.user_id == Interaction.user_id) & (UserSeller.seller_id == Interaction.seller_id) 31 | ) 32 | number_of_interactions: int = _.interactions.count() 33 | 34 | 35 | class InteractionKind(Enum): 36 | LIKE = "LIKE" 37 | VIEW = "VIEW" 38 | PURCHASE = "PURCHASE" 39 | OTHER = "OTHER" 40 | 41 | @classmethod 42 | def _missing_(cls, _): 43 | return cls.OTHER 44 | 45 | 46 | @features 47 | class Interaction: 48 | id: str 49 | user_id: User.id 50 | user: User 51 | seller_id: Seller.id 52 | seller: Seller 53 | interaction_kind: InteractionKind 54 | on: FeatureTime 55 | 56 | 57 | @online 58 | def get_similarity( 59 | fc: UserSeller.user.favorite_categories, fc2: UserSeller.seller.categories 60 | ) -> UserSeller.favorites_match: 61 | return len(fc & fc2) > 0 62 | 63 | 64 | if __name__ == "__main__": 65 | from chalk.client import ChalkClient 66 | 67 | client = ChalkClient() 68 | user_stores = client.query( 69 | input=[ 70 | UserSeller(user_id="1", seller_id="456"), 71 | UserSeller(user_id="2", seller_id="457"), 72 | UserSeller(user_id="3", seller_id="460"), 73 | ], 74 | output=[ 75 | UserSeller.user.id, 76 | UserSeller.seller.id, 77 | UserSeller.favorites_match, 78 | UserSeller.number_of_interactions, 79 | ], 80 | ) 81 | print(user_stores) 82 | -------------------------------------------------------------------------------- /credit/README.md: -------------------------------------------------------------------------------- 1 | # Credit 2 | 3 | Chalk can help you build insight into the financial transactions 4 | of your users. 5 | 6 | ## 1. Income 7 | 8 | Compute income from Plaid transactions. 9 | 10 | **[1_income.py](1_income.py)** 11 | 12 | ```python 13 | @realtime 14 | def get_plaid_income( 15 | txns: User.transactions[ 16 | PlaidTransaction.is_payroll is True, 17 | after(days_ago=30), 18 | ], 19 | ) -> User.computed_income_30: 20 | return txns[PlaidTransaction.amount].sum() 21 | ``` 22 | 23 | https://docs.chalk.ai/docs/features 24 | 25 | ## 2. Multiple Accounts 26 | 27 | Identify users with multiple accounts. 28 | 29 | **[2_accounts.py](2_accounts.py)** 30 | 31 | ```python 32 | @features 33 | class Account: 34 | id: int 35 | user_id: "User.id" 36 | user: "User" 37 | 38 | @features 39 | class User: 40 | id: int 41 | accounts: DataFrame[Account] 42 | ``` 43 | 44 | https://docs.chalk.ai/docs/has-many 45 | 46 | ## 3. Credit Bureau API 47 | 48 | Integrate data from credit bureaus like Transunion.
49 | 50 | **[3_bureau_api.py](3_bureau_api.py)** 51 | 52 | ```python 53 | @realtime 54 | def get_credit_report( 55 | first_name: User.first_name, 56 | last_name: User.last_name, 57 | city: User.city, 58 | state: User.state, 59 | id: User.id, 60 | ) -> CreditReport: 61 | res = requests.post( 62 | f"{url}/transunion/credit-report", 63 | json={ 64 | "firstName": first_name, 65 | "lastName": last_name, 66 | "city": city, 67 | "state": state, 68 | }, 69 | ).json() 70 | return CreditReport(user_id=id, report_id=res["pdfReportId"], report=res["data"]) 71 | ``` 72 | 73 | https://docs.chalk.ai/docs/resolver-overview 74 | 75 | ## 4. Aggregate Tradelines 76 | 77 | Aggregate user statistics across tradelines. 78 | 79 | **[4_aggregate_tradelines.py](4_aggregate_tradelines.py)** 80 | 81 | ```python 82 | @realtime 83 | def tradeline_rollup( 84 | accounts: User.tradelines[ 85 | Tradeline.is_delinquent is True 86 | ] 87 | ) -> User.delinquent_amount: 88 | return accounts[Tradeline.outstanding].sum() 89 | ``` 90 | 91 | https://docs.chalk.ai/docs/resolver-overview 92 | -------------------------------------------------------------------------------- /unstructured_data/src/resolvers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | import google.generativeai as genai 5 | from chalk import online 6 | from chalk.features import Features, before_all 7 | from src.denylist import Denylist 8 | 9 | from src.models import Transaction, User 10 | 11 | model = genai.GenerativeModel(model_name="models/gemini-1.5-flash-latest") 12 | 13 | 14 | @online 15 | async def get_transaction_classification(memo: Transaction.memo) -> Transaction.completion: 16 | """Here, we pull out the raw response from calling out to Gemini. 17 | The feature `Transaction.completion` is a string, but we can use the 18 | `get_structured_outputs` function to convert it to a structured output. 19 | 20 | Transaction.completion has max-staleness of infinity, so we won't need 21 | to recompute the completion, but can still iterate on how we parse it. 22 | """ 23 | 24 | return model.generate_content( 25 | textwrap.dedent( 26 | f"""\ 27 | Please return JSON for classifying a financial transaction 28 | using the following schema. 29 | 30 | {{"category": str, "is_nsf": bool, "clean_memo": str, "is_ach": bool}} 31 | 32 | All fields are required. Return EXACTLY one JSON object with NO other text. 
33 | Memo: {memo}""" 34 | ), 35 | generation_config={"response_mime_type": "application/json"}, 36 | ).candidates[0].content.parts[0].text 37 | 38 | 39 | @online 40 | def get_structured_outputs(completion: Transaction.completion) -> Features[ 41 | Transaction.category, 42 | Transaction.is_nsf, 43 | Transaction.is_ach, 44 | Transaction.clean_memo, 45 | ]: 46 | """Given the completion, we parse it into a structured output.""" 47 | body = json.loads(completion) 48 | return Transaction( 49 | category=body["category"], 50 | is_nsf=body["is_nsf"], 51 | is_ach=body["is_ach"], 52 | clean_memo=body["clean_memo"], 53 | ) 54 | 55 | 56 | denylist = Denylist(source="gs://socure-data/denylist.csv") 57 | 58 | 59 | @before_all 60 | def init_denylist(): 61 | denylist.load() 62 | 63 | 64 | @online 65 | def email_in_denylist(email: User.email) -> User.denylisted: 66 | return email in denylist 67 | -------------------------------------------------------------------------------- /full_examples/sagemaker/steps/dataset.py: -------------------------------------------------------------------------------- 1 | from sagemaker.workflow.function_step import step 2 | 3 | 4 | TRAINING_FEATURES = [ 5 | "transaction.amt", 6 | "transaction.customer.age", 7 | "transaction.customer.income", 8 | "transaction.customer.fico", 9 | "transaction.customer.transaction_sum_30m", 10 | "transaction.customer.transaction_sum_1h", 11 | "transaction.confirmed_fraud" 12 | ] 13 | 14 | TARGET_FEATURE = "transaction.confirmed_fraud" 15 | 16 | 17 | @step( 18 | name="create_dataset", 19 | instance_type='ml.t3.medium', 20 | keep_alive_period_in_seconds=300, 21 | ) 22 | def create_dataset(test_size, run_bucket): 23 | from chalk.client import ChalkClient 24 | from sklearn.model_selection import train_test_split 25 | 26 | # a Chalk client id & client secret for a token with permission to create datasets 27 | # should be added to the sagemaker environment—these are passed automatically to the 28 | # ChalkClient but can also be explicitly passed as arguments. 29 | 30 | chalk_dataset = ChalkClient( 31 | # client_id=os.environ['CHALK_CLIENT_ID'], # automatically loaded by the Chalk Client if in the environment 32 | # client_secret=os.environ['CHALK_CLIENT_SECRET'] # automatically loaded by the Chalk Client if in the environment 33 | ).offline_query( 34 | max_samples=100_000, # reads 100,000 samples from the Chalk dataset 35 | output=TRAINING_FEATURES, 36 | dataset_name="transactions_fraud_model", 37 | ) 38 | dataset = chalk_dataset.to_pandas() 39 | 40 | X_train, X_test, y_train, y_test = train_test_split( 41 | dataset.drop(columns=[TARGET_FEATURE]), # X 42 | dataset[TARGET_FEATURE], # y 43 | test_size=test_size 44 | ) 45 | 46 | xtrain_path = f"{run_bucket}/input/X_train.parquet" 47 | xtest_path = f"{run_bucket}/input/X_test.parquet" 48 | ytrain_path = f"{run_bucket}/input/y_train.parquet" 49 | ytest_path = f"{run_bucket}/input/y_test.parquet" 50 | 51 | dataset.to_parquet(f"{run_bucket}/raw_data/data.parquet") 52 | X_train.to_parquet(xtrain_path) 53 | y_train.to_parquet(ytrain_path) 54 | X_test.to_parquet(xtest_path) 55 | y_test.to_parquet(ytest_path) 56 | return xtrain_path, xtest_path, ytrain_path, ytest_path 57 | -------------------------------------------------------------------------------- /predictive_maintenance/README.md: -------------------------------------------------------------------------------- 1 | # Predictive Maintenance 2 | 3 | Predicting device failure requires complex analysis executed 4 | against a variety of data sources. 
Chalk's platform allows 5 | data scientists to bring all the different data together, 6 | including streaming data from devices. 7 | 8 | ## 1. Device Data 9 | Easily listen to streaming data and parse messages with 10 | custom logic. 11 | 12 | **[1_device_data.py](1_device_data.py)** 13 | 14 | ```python 15 | @stream(source=source) 16 | def read_message(message: Message) -> Measurement: 17 | return Measurement( 18 | device_id=message.device_id, 19 | timestamp=message.timestamp, 20 | lat=message.data.latitude, 21 | long=message.data.longitude, 22 | voltage=message.data.voltage, 23 | temp=message.data.temperature, 24 | ) 25 | ``` 26 | https://docs.chalk.ai/docs/streams 27 | 28 | ## 2. Historical Data 29 | Access historical sensor data as-of any time in the past. 30 | 31 | **[2_time_query.py](2_time_query.py)** 32 | 33 | ```python 34 | ChalkClient.get_training_dataframe( 35 | input=labels[[Measurement.device_id]], 36 | input_times = [(datetime.now() - timedelta(days = 30)).isoformat()], 37 | output=[ 38 | Measurement.lat, 39 | Measurement.long, 40 | Measurement.temp 41 | ] 42 | ) 43 | ``` 44 | 45 | https://docs.chalk.ai/docs/temporal-consistency 46 | 47 | ## 3. Sensor Streams 48 | 49 | Compute streaming window aggregate functions 50 | on sensor data. 51 | 52 | **[3_keep_data_fresh.py](3_keep_data_fresh.py)** 53 | 54 | ```python 55 | @stream(source=source, mode="continuous") 56 | def process_measurements(df: DataFrame[Message]) -> DataFrame[Sensor]: 57 | return f""" 58 | select 59 | count(*) as count_failed, 60 | id as device_id 61 | from {df} 62 | where is_failing = TRUE 63 | group by id 64 | """ 65 | ``` 66 | 67 | https://docs.chalk.ai/docs/aggregations 68 | 69 | ## 4. Failing Sensors 70 | 71 | Combine batch, caching, and DataFrames to create a powerful 72 | predictive maintenance pipeline. 73 | 74 | **[4_customer_sensors.py](4_customer_sensors.py)** 75 | 76 | ```python 77 | @batch(cron="1h") 78 | def get_customers_needing_service( 79 | bad_sensors: Customer.sensors[ 80 | Sensor.is_failing is True, 81 | Sensor.id 82 | ] 83 | ) -> Customer.customer_needs_service: 84 | return len(bad_sensors) > 0 85 | ``` 86 | 87 | https://docs.chalk.ai/docs/feature-caching 88 | -------------------------------------------------------------------------------- /07_streaming/README.md: -------------------------------------------------------------------------------- 1 | # Streaming 2 | 3 | Chalk ships with a powerful streams module for computing 4 | features from a stream and computing window functions 5 | on streams. 6 | 7 | https://docs.chalk.ai/docs/streams 8 | 9 | https://docs.chalk.ai/docs/aggregations 10 | 11 | ## 1. Mapping Stream 12 | Create features directly from messages on a stream. 13 | 14 | **[1_mapping_stream.py](1_mapping_stream.py)** 15 | 16 | ```python 17 | @stream(source=src) 18 | def fn(message: UserUpdateBody) -> Features[User.uid, User.favorite_color]: 19 | return User( 20 | uid=message.value.user_id, 21 | favorite_color=message.value.favorite_color 22 | ) 23 | ``` 24 | 25 | https://docs.chalk.ai/docs/streams 26 | 27 | ## 2. Stream DataFrame 28 | 29 | Compute a streaming window aggregate function using [DataFrames](https://docs.chalk.ai/docs/dataframe).
30 | 31 | **[2_window_dataframe.py](2_window_dataframe.py)** 32 | 33 | ```python 34 | @stream(source=src) 35 | def failed_logins(events: DataFrame[LoginMessage]) -> Features[ 36 | User.id, 37 | User.num_failed_logins 38 | ]: 39 | return User( 40 | id=events["id"].max(), 41 | num_failed_logins=events["failed"].sum(), 42 | ) 43 | ``` 44 | 45 | https://docs.chalk.ai/docs/aggregations#using-dataframes 46 | 47 | ## 3. Stream SQL 48 | 49 | Compute a streaming window aggregate function using [SQL](https://docs.chalk.ai/docs/aggregations#using-sql). 50 | 51 | **[3_window_sql.py](3_window_sql.py)** 52 | 53 | ```python 54 | @stream(source=src) 55 | def failed_logins(events: DataFrame[LoginMessage]) -> DataFrame[ 56 | User.id, 57 | User.num_failed_logins 58 | ]: 59 | return f""" 60 | select 61 | user_id as id, 62 | count(*) as num_failed_logins 63 | from {events} 64 | where failed = 1 65 | group by 1 66 | """ 67 | ``` 68 | 69 | https://docs.chalk.ai/docs/aggregations#using-sql 70 | 71 | ## 4. Stream SQL Aggregation 72 | 73 | Compute a continuous aggregation over stream windows using [SQL](https://docs.chalk.ai/docs/aggregations#using-sql). 74 | 75 | **[4_continuous_aggregation.py](4_continuous_aggregation.py)** 76 | 77 | ```python 78 | @stream(source=src, mode='continuous', keys={"user_id": User.id}) 79 | def failed_logins(events: DataFrame[LoginMessage]) -> DataFrame[ 80 | User.id, 81 | User.distinct_ips 82 | ]: 83 | return f""" 84 | select 85 | user_id as id, 86 | approximate_count_distinct(ip_address) as distinct_ips 87 | from {events} 88 | """ 89 | ``` 90 | -------------------------------------------------------------------------------- /01_features/2_custom_feature_types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | import attrs 5 | from pydantic import BaseModel, constr 6 | 7 | from chalk.features import feature, features 8 | 9 | 10 | # A dataclass can be used as a feature (Book.jacket_info below) 11 | @dataclass 12 | class JacketInfo: 13 | title: str 14 | subtitle: str 15 | body: str 16 | 17 | 18 | # Pydantic classes can also be used as features (Book.title below) 19 | class TitleInfo(BaseModel): 20 | heading: constr(min_length=2) 21 | subheading: Optional[str] 22 | 23 | 24 | # attrs classes are also valid for feature types (Book.table_of_contents below) 25 | @attrs.define 26 | class TableOfContentsItem: 27 | foo: str 28 | bar: int 29 | 30 | 31 | @features 32 | class Book: 33 | id: int 34 | # You can use any `dataclass` as a struct feature. 35 | # Struct types should be used for objects that don't have ids. 36 | # If an object has an id, consider using has_one. 37 | jacket_info: JacketInfo 38 | 39 | # If you prefer `pydantic` to `dataclass`, you can use that instead. 40 | title: TitleInfo 41 | 42 | # Alternatively, you can use `attrs`. Any of these struct types 43 | # (`dataclass`, `pydantic`, and `attrs`) can be used with 44 | # `set[...]` or `list[...]`. 45 | table_of_contents: list[TableOfContentsItem] 46 | 47 | 48 | # Finally, if you have an object that you want to serialize that isn't 49 | # from `dataclass`, `attrs`, or `pydantic`, you can write a custom codec.
50 | 
51 | 
52 | # Consider the custom class below:
53 | class CustomStruct:
54 | def __init__(self, foo: str, bar: int) -> None:
55 | self.foo = foo
56 | self.bar = bar
57 | 
58 | def __eq__(self, other: object) -> bool:
59 | return (
60 | isinstance(other, CustomStruct)
61 | and self.foo == other.foo
62 | and self.bar == other.bar
63 | )
64 | 
65 | def __hash__(self) -> int:
66 | return hash((self.foo, self.bar))
67 | 
68 | 
69 | @dataclass
70 | class CustomStructDC:
71 | foo: str
72 | bar: int
73 | 
74 | 
75 | @features
76 | class Book:
77 | id: int
78 | jacket_info: JacketInfo
79 | title: TitleInfo
80 | table_of_contents: list[TableOfContentsItem]
81 | custom_field: CustomStruct = feature(
82 | # The encoder takes an instance of the custom type and outputs a serializable Python object
83 | encoder=lambda x: CustomStructDC(foo=x.foo, bar=x.bar),
84 | # The decoder takes the output of the encoder and recreates an instance of the custom type
85 | decoder=lambda x: CustomStruct(foo=x.foo, bar=x.bar),
86 | )
87 | 
-------------------------------------------------------------------------------- /marketplace/tests.py: --------------------------------------------------------------------------------
1 | import datetime as dt
2 | 
3 | import pytest
4 | from chalk.client import ChalkClient
5 | from chalk.features import DataFrame
6 | 
7 | from src.marketplace import (
8 | Review,
9 | User,
10 | )
11 | 
12 | 
13 | @pytest.fixture(scope="session")
14 | def client():
15 | return ChalkClient(branch=True)
16 | 
17 | 
18 | def test_user_aggregations(client: ChalkClient) -> None:
19 | now = dt.datetime.now()
20 | client.check(
21 | input={
22 | User.id: 1,
23 | User.reviews: DataFrame(
24 | [
25 | Review(
26 | id=1,
27 | star_rating=3,
28 | created_at=now - dt.timedelta(days=1),
29 | ),
30 | Review(
31 | id=2,
32 | star_rating=4,
33 | created_at=now - dt.timedelta(days=2),
34 | ),
35 | Review(
36 | id=3,
37 | star_rating=2,
38 | created_at=now - dt.timedelta(days=3),
39 | ),
40 | Review(
41 | id=4,
42 | star_rating=3,
43 | created_at=now - dt.timedelta(days=4),
44 | ),
45 | Review(
46 | id=5,
47 | star_rating=3,
48 | created_at=now - dt.timedelta(days=5),
49 | ),
50 | Review(
51 | id=6,
52 | star_rating=1,
53 | created_at=now - dt.timedelta(days=6),
54 | ),
55 | Review(
56 | id=7,
57 | star_rating=5,
58 | created_at=now - dt.timedelta(days=7),
59 | ),
60 | Review(
61 | id=8,
62 | star_rating=2,
63 | created_at=now - dt.timedelta(days=8),
64 | ),
65 | Review(
66 | id=9,
67 | star_rating=3,
68 | created_at=now - dt.timedelta(days=9),
69 | ),
70 | ],
71 | ),
72 | },
73 | assertions={
74 | User.review_count["3d"]: 3,
75 | User.review_count["7d"]: 7,
76 | User.average_rating_given["3d"]: 3,
77 | User.average_rating_given["7d"]: 3,
78 | },
79 | )
80 | 
-------------------------------------------------------------------------------- /call_recordings/features/fathom/fathom_message_webhook.py: --------------------------------------------------------------------------------
1 | import requests
2 | from chalk.features import online
3 | 
4 | from src.fathom.features.fathom_feature_set import FathomMessage
5 | 
6 | 
7 | @online
8 | def fathom_message_webhook(
9 | id: FathomMessage.id,
10 | recording_id: FathomMessage.recording_id,
11 | message_id: FathomMessage.message_id,
12 | url: FathomMessage.url,
13 | title: FathomMessage.title,
14 | date: FathomMessage.date,
15 | timestamp: FathomMessage.timestamp,
16 | speaker: FathomMessage.speaker,
17 | organization: FathomMessage.organization,
18 | message: FathomMessage.message,
19 | action_item:
FathomMessage.action_item,
20 | watch_link: FathomMessage.watch_link,
21 | speaker_changes: FathomMessage.call.speaker_changes,
22 | meeting_duration_ratio: FathomMessage.call.meeting_duration_ratio,
23 | attendee_count: FathomMessage.call.attendee_count,
24 | chalk_attendee_count: FathomMessage.call.chalk_email_count,
25 | customer_attendee_count: FathomMessage.call.customer_email_count,
26 | meeting_scheduled_duration: FathomMessage.call.meeting_scheduled_duration,
27 | recording_duration_in_minutes: FathomMessage.call.recording_duration_in_minutes,
28 | ai_meeting_type: FathomMessage.call.ai_call_type,
29 | ai_reasons_for_meeting: FathomMessage.call.ai_reasons_for_meeting,
30 | ai_risk_flags: FathomMessage.call.ai_risk_flag,
31 | ) -> FathomMessage.webhook_status_code:
32 | data = {
33 | "id": id,
34 | "url": url,
35 | "date": date.isoformat(),
36 | "title": title,
37 | "message": message,
38 | "speaker": speaker,
39 | "timestamp": timestamp,
40 | "message_id": message_id,
41 | "watch_link": watch_link,
42 | "action_item": action_item,
43 | "organization": organization,
44 | "recording_id": recording_id,
45 | "attendee_count": attendee_count,
46 | "chalk_attendee_count": chalk_attendee_count,
47 | "customer_attendee_count": customer_attendee_count,
48 | "meeting_scheduled_duration": meeting_scheduled_duration,
49 | "recording_duration_in_minutes": recording_duration_in_minutes,
50 | "speaker_changes": speaker_changes,
51 | "meeting_duration_ratio": meeting_duration_ratio,
52 | "ai_meeting_type": ai_meeting_type,
53 | "ai_reasons_for_meeting": ai_reasons_for_meeting,
54 | "ai_risk_flags": ai_risk_flags,
55 | }
56 | status_code = requests.post(
57 | headers={"Content-Type": "application/json"},
58 | url="url",  # placeholder for the destination webhook URL
59 | json=data,
60 | ).status_code
61 | if status_code == 200:
62 | return 200
63 | 
64 | return None
65 | 
-------------------------------------------------------------------------------- /fraud/1_return.py: --------------------------------------------------------------------------------
1 | """An example of calculating non-sufficient funds (NSF) amounts from
2 | a user's transactions.
3 | """
4 | 
5 | from chalk import online
6 | from chalk.features import features, DataFrame, FeatureTime
7 | 
8 | from datetime import datetime
9 | 
10 | 
11 | 
12 | @features
13 | class Transaction:
14 | id: int
15 | amount: float
16 | memo: str
17 | on: FeatureTime
18 | user_id: "User.id"
19 | user: "User"
20 | 
21 | # Computed properties
22 | clean_memo: str
23 | is_nsf: bool
24 | 
25 | 
26 | @features
27 | class User:
28 | id: int
29 | transactions: DataFrame[Transaction]
30 | 
31 | # Computed properties
32 | nsf_amount: float
33 | 
34 | 
35 | @online
36 | def get_clean_memo(memo: Transaction.memo) -> Transaction.clean_memo:
37 | computed = memo.lower()
38 | for prefix in ("sale", "pos", "tst", "sq"):
39 | computed = computed.removeprefix(prefix).strip()
40 | return computed
41 | 
42 | 
43 | @online
44 | def get_transaction_is_nsf(
45 | memo_clean: Transaction.clean_memo,
46 | ) -> Transaction.is_nsf:
47 | return "nsf" in memo_clean.lower()
48 | 
49 | 
50 | @online
51 | def get_nsf_amount(amounts: User.transactions[Transaction.is_nsf is True, Transaction.amount]) -> User.nsf_amount:
52 | """
53 | In this resolver, we calculate the total NSF amount for our users.
54 | """
55 | return amounts.sum()
56 | 
57 | 
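# A rough sketch of how the feature above might be queried online, assuming a
# configured Chalk client:
#
#     from chalk.client import ChalkClient
#
#     ChalkClient().query(
#         input={User.id: 1},
#         output=[User.nsf_amount],
#     )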
58 | # Below we generate a couple of dummy resolvers to make the example fully runnable without connected
59 | # datasources.
60 | 
61 | 
62 | @online
63 | def get_test_users() -> DataFrame[User.id]:
64 | return DataFrame([
65 | User(id=1),
66 | User(id=2),
67 | ])
68 | 
69 | 
70 | @online
71 | def get_test_transactions() -> (
72 | DataFrame[Transaction.id, Transaction.user_id, Transaction.amount, Transaction.memo, Transaction.on]
73 | ):
74 | return DataFrame([
75 | Transaction(id=1, user_id=1, amount=-277.0, memo="directdep", on=datetime(2014, 8, 12)),
76 | Transaction(id=2, user_id=1, amount=-10_001.0, memo="other", on=datetime(2014, 8, 12)),
77 | Transaction(id=3, user_id=1, amount=42.1, memo="test nsf", on=datetime(2014, 8, 12)),
78 | Transaction(id=4, user_id=1, amount=-1303.0, memo="paycheck", on=datetime(2014, 8, 12)),
79 | Transaction(id=5, user_id=1, amount=124.0, memo="test", on=datetime(2014, 8, 12)),
80 | Transaction(id=7, user_id=2, amount=2132.04, memo="undefined", on=datetime(2014, 8, 12)),
81 | Transaction(id=6, user_id=2, amount=-1.0, memo="sale nsf", on=datetime(2014, 8, 12)),
82 | Transaction(id=8, user_id=2, amount=-30.0, memo="tst nsf", on=datetime(2014, 8, 12)),
83 | Transaction(id=9, user_id=2, amount=-999.99, memo="payroll", on=datetime(2014, 8, 12)),
84 | ])
85 | 
-------------------------------------------------------------------------------- /06_dataframe/README.md: --------------------------------------------------------------------------------
1 | # DataFrames
2 | A Chalk DataFrame is a 2-dimensional data structure similar
3 | to `pandas.DataFrame`, but with richer types and
4 | underlying optimizations.
5 | 
6 | https://docs.chalk.ai/docs/dataframe
7 | 
8 | ## 1. Creating DataFrames
9 | Construct DataFrames from feature values.
10 | 
11 | **[1_creating_dataframes.py](1_creating_dataframes.py)**
12 | 
13 | ```python
14 | df = DataFrame()
15 | DataFrame.from_dict({
16 | User.id: [1, 2],
17 | User.email: ["elliot@chalk.ai", "samantha@chalk.ai"],
18 | })
19 | ```
20 | https://docs.chalk.ai/docs/dataframe
21 | 
22 | ## 2. Filters
23 | Filter the rows of a `DataFrame` by supplying conditions
24 | to the `__getitem__()` method.
25 | 
26 | **[2_filters.py](2_filters.py)**
27 | 
28 | ```python
29 | User.txns[
30 | Transaction.amount < 0,
31 | Transaction.merchant in {"uber", "lyft"} or Transaction.memo == "uberpmts",
32 | Transaction.canceled_at is None
33 | ]
34 | ```
35 | https://docs.chalk.ai/docs/dataframe
36 | 
37 | ## 3. Projections
38 | Scope down the set of columns available in a `DataFrame`.
39 | 
40 | **[3_projections.py](3_projections.py)**
41 | 
42 | ```python
43 | User.txns[
44 | Transaction.amount,
45 | Transaction.memo
46 | ]
47 | ```
48 | https://docs.chalk.ai/docs/dataframe
49 | 
50 | ## 4. Projections with Filters
51 | Compose projections and filters to create a new `DataFrame`.
52 | 
53 | **[4_filters_and_projections.py](4_filters_and_projections.py)**
54 | 
55 | ```python
56 | User.transactions[Transaction.amount > 100, Transaction.memo]
57 | ```
58 | 
59 | https://docs.chalk.ai/docs/dataframe#composing-projections-and-filters
60 | 
61 | ## 5. Aggregations
62 | 
63 | Compute aggregates over a `DataFrame`.
64 | 
65 | **[5_aggregations.py](5_aggregations.py)**
66 | 
67 | ```python
68 | User.transactions[Transaction.amount].sum()
69 | User.transactions[Transaction.amount].mean()
70 | User.transactions[Transaction.amount].count()
71 | User.transactions[Transaction.amount].max()
72 | ```
73 | https://docs.chalk.ai/docs/dataframe#aggregations
74 | 
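An aggregation like these can back a computed feature directly in a resolver; a minimal sketch (`User.total_spend` is a hypothetical feature):

```python
@online
def get_total_spend(
    amounts: User.transactions[Transaction.amount],
) -> User.total_spend:
    return amounts.sum()
```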
75 | ## 6. Self Joins
76 | 
77 | Join a feature set back to itself.
78 | 
79 | **[6_self_joins.py](6_self_joins.py)**
80 | 
81 | ```python
82 | @features
83 | class PrequelLink:
84 | id: int
85 | prequel_id: int
86 | book: "Book" = has_one(lambda: Book.id == PrequelLink.prequel_id)
87 | 
88 | 
89 | @features
90 | class Book:
91 | id: int
92 | title: str
93 | author_id: Author.id
94 | prequel_id: PrequelLink.id | None
95 | prequel: PrequelLink | None = has_one(lambda: Book.id == PrequelLink.prequel_id)
96 | series_id: SeriesLink.id | None
97 | series: SeriesLink = has_one(lambda: SeriesLink.id == Book.series_id)
98 | ```
99 | 
100 | 
-------------------------------------------------------------------------------- /marketplace/lancedb.py: --------------------------------------------------------------------------------
1 | # trunk-ignore-all(pyright/reportInvalidTypeForm,pyright/reportCallIssue,ruff/PLW0603,pyright/reportOptionalMemberAccess)
2 | import os
3 | from typing import TYPE_CHECKING
4 | 
5 | import lancedb
6 | from chalk.features import (
7 | DataFrame,
8 | before_all,
9 | online,
10 | )
11 | from chalk.logging import chalk_logger
12 | from lancedb.db import DBConnection
13 | 
14 | from src.marketplace.models import ItemDocument, ItemSearch
15 | 
16 | if TYPE_CHECKING:
17 | from lancedb.table import Table
18 | 
19 | db: DBConnection | None = None
20 | 
21 | DB_URI: str = "db://marketplace-x205j4"
22 | TABLE_NAME: str = "marketplace_product_descriptions"
23 | VECTOR_COLUMN_NAME: str = "embedding"
24 | REGION: str = "us-east-1"
25 | 
26 | 
27 | @before_all
28 | def init_client() -> None:
29 | global db
30 | lance_api_key: str | None = os.getenv("LANCEDB_API_KEY_MARKETPLACE")
31 | if lance_api_key is None:
32 | error_msg: str = "LANCEDB_API_KEY_MARKETPLACE is not set."
33 | chalk_logger.error(msg=error_msg)
34 | raise ValueError(error_msg)
35 | 
36 | db = lancedb.connect(
37 | uri=DB_URI,
38 | api_key=lance_api_key,
39 | region=REGION,
40 | )
41 | chalk_logger.info(
42 | msg="Initializing client: LanceDB",
43 | )
44 | 
45 | 
46 | @online
47 | def get_vector_search_results(
48 | vector: ItemSearch.vector,
49 | q: ItemSearch.q,
50 | query_type: ItemSearch.query_type,
51 | ) -> DataFrame[ItemDocument]:
52 | def execute_vector_search(vector, q: str) -> DataFrame[ItemDocument]:
53 | tbl: Table = db.open_table(
54 | name=TABLE_NAME,
55 | )
56 | results: list = (
57 | tbl.search(
58 | query=vector.to_pylist(),
59 | query_type="vector",
60 | vector_column_name=VECTOR_COLUMN_NAME,
61 | )
62 | .select(
63 | columns=[
64 | "hid",
65 | "title",
66 | # "description",
67 | ],
68 | )
69 | .limit(
70 | limit=30,
71 | )
72 | .to_list()
73 | )
74 | documents: list[ItemDocument] = [
75 | ItemDocument(
76 | query=q,
77 | id=result["hid"],
78 | distance=result["_distance"],
79 | query_type=query_type,
80 | title=result["title"],
81 | # description=result["description"],
82 | )
83 | for result in results
84 | ]
85 | return DataFrame(documents)
86 | 
87 | match query_type:
88 | case "vector":
89 | return execute_vector_search(vector=vector, q=q)
90 | 
91 | case _:
92 | raise ValueError(f"Unsupported query_type: {query_type}")
93 | 
-------------------------------------------------------------------------------- /marketplace/named_queries.py: --------------------------------------------------------------------------------
1 | from chalk.queries.named_query import NamedQuery
2 | 
3 | from . 
import Review 4 | 5 | NamedQuery( 6 | name="review", 7 | input=[Review.id], 8 | output=[ 9 | # Review features 10 | Review.review_body, 11 | Review.review_headline, 12 | Review.star_rating, 13 | Review.is_positive_review_inline, 14 | Review.is_positive_review_python_resolver, 15 | Review.is_positive_review_from_llm, 16 | Review.normalized_rating, 17 | Review.sentiment_from_llm, 18 | Review.reviewer_name, 19 | Review.created_at, 20 | # Product features 21 | Review.item.title, 22 | Review.item.genre_with_llm_from_title, 23 | Review.item.average_rating, 24 | Review.item.total_reviews, 25 | # User features 26 | Review.user.first_name, 27 | Review.user.last_name, 28 | Review.user.created_at, 29 | Review.user.username, 30 | Review.user.name_match_score, 31 | Review.user.top_genres, 32 | # Seller features 33 | Review.seller.name, 34 | Review.seller.zipcode, 35 | ], 36 | ) 37 | 38 | NamedQuery( 39 | name="review_dag", 40 | input=[Review.id], 41 | output=[ 42 | # Base features pulled from SQL 43 | Review.id, 44 | Review.created_at, 45 | Review.review_headline, 46 | Review.review_body, 47 | Review.star_rating, 48 | # Computed features 49 | Review.is_positive_review_inline, 50 | Review.is_positive_review_python_resolver, 51 | Review.is_positive_review_from_llm, 52 | Review.normalized_rating, 53 | # Product information 54 | Review.item_id, 55 | Review.item.title, 56 | Review.item.genre_with_llm_from_title, 57 | Review.item.genre_with_llm_from_title_confidence, 58 | Review.item.genre_with_llm_from_title_reasoning, 59 | Review.item.average_rating, 60 | Review.item.total_reviews, 61 | Review.item.review_count, 62 | Review.interaction.id, 63 | Review.interaction.created_at, 64 | Review.interaction.interaction_type, 65 | # User information 66 | Review.reviewer_name, 67 | Review.user.id, 68 | Review.user.first_name, 69 | Review.user.last_name, 70 | Review.user.username, 71 | Review.user.email, 72 | Review.user.birthday, 73 | Review.user.review_count, 74 | Review.user.average_rating_given, 75 | # Seller information 76 | Review.seller.id, 77 | Review.seller.created_at, 78 | Review.seller.name, 79 | Review.seller.zipcode, 80 | Review.seller.email, 81 | Review.seller.phone_number, 82 | # Sentiment analysis features 83 | Review.llm, 84 | Review.sentiment_from_llm, 85 | ], 86 | ) 87 | -------------------------------------------------------------------------------- /ecommerce/README.md: -------------------------------------------------------------------------------- 1 | # E-commerce 2 | 3 | Chalk can help you build realtime recommendation systems. 4 | 5 | This guide shows you how to: 6 | 1). Implement User and Seller features in Chalk, 7 | 2). Add an Interaction feature and connect it to users, 8 | 3). Stream Interaction data from a Kafka queue. 9 | 10 | In each section, you can find an `example_query.py` file. The file shows how the Chalk python client API can be used to 11 | get information on the affinity between a User and a Seller. 12 | 13 | ## 1. Query Users & Sellers 14 | 15 | Create Chalk features for Users and Sellers and evaluate whether a user and seller have matching categories. 
16 | 
17 | **[1_users_sellers.py](1_users_sellers.py)**
18 | 
19 | ```python
20 | from chalk.features import features
21 | 
22 | 
23 | @features
24 | class Seller:
25 | id: str
26 | categories: set[str]
27 | 
28 | 
29 | @features
30 | class User:
31 | id: str
32 | age: int
33 | favorite_categories: set[str]
34 | 
35 | @features
36 | class UserSeller:
37 | id: str
38 | user_id: User.id
39 | user: User
40 | seller_id: Seller.id
41 | seller: Seller
42 | favorites_match: bool
43 | ```
44 | 
45 | ## 2. Track User Seller Interactions
46 | 
47 | Identify the number of interactions that have occurred between users and sellers.
48 | 
49 | **[2_interactions.py](2_interactions.py)**
50 | 
51 | ```python
52 | from chalk.features import features, DataFrame, FeatureTime
53 | 
54 | class InteractionKind(Enum):
55 | LIKE = "LIKE"
56 | VIEW = "VIEW"
57 | PURCHASE = "PURCHASE"
58 | OTHER = "OTHER"
59 | 
60 | @classmethod
61 | def _missing_(cls, _):
62 | return cls.OTHER
63 | 
64 | @features
65 | class Interaction:
66 | id: str
67 | user_id: User.id
68 | user: User
69 | seller_id: Seller.id
70 | seller: Seller
71 | interaction_kind: InteractionKind
72 | on: FeatureTime
73 | 
74 | @online
75 | def get_number_of_interactions(
76 | user_interactions: UserSeller.user.interactions,
77 | seller_id: UserSeller.seller.id,
78 | ) -> UserSeller.number_of_interactions:
79 | return len(user_interactions[Interaction.seller_id == seller_id])
80 | ```
81 | 
82 | ## 3. Stream User Seller Interaction Data
83 | 
84 | Enrich User Interaction data with streaming data.
85 | 
86 | **[3_streams.py](3_streams.py)**
87 | 
88 | ```python
89 | from chalk.streams import KafkaSource
90 | from chalk.features import Features
91 | from chalk import stream, online
92 | import uuid
93 | 
94 | interaction_stream = KafkaSource(name="interactions")
95 | 
96 | @stream(source=interaction_stream)
97 | def interactions_handler(
98 | message: InteractionMessage,
99 | ) -> Features[Interaction]:
100 | return Interaction(
101 | id=str(uuid.uuid4()),
102 | interaction_kind=message.interaction_kind,
103 | user_id=message.user_id,
104 | seller_id=message.seller_id,
105 | )
106 | ```
107 | 
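Putting these together, user-seller affinity features can be fetched with the Python client, in the spirit of the `example_query.py` files (a minimal sketch; input values are illustrative):

```python
from chalk.client import ChalkClient

client = ChalkClient()
result = client.query(
    input={UserSeller.user_id: "123", UserSeller.seller_id: "456"},
    output=[
        UserSeller.favorites_match,
        UserSeller.number_of_interactions,
    ],
)
```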
-------------------------------------------------------------------------------- /github/features/fraud/prompts.py: --------------------------------------------------------------------------------
1 | SYSTEM_PROMPT: str = """ You are an intelligent fraud detection assistant specialized in analyzing GitHub profiles. Your task is to evaluate the likelihood of suspicious or fraudulent activity associated with a user's GitHub profile and provide a **fraud score** between 0.0 and 1.0.
2 | 
3 | You will assess the following GitHub profile attributes if they are provided:
4 | - GitHub Username
5 | - Full Name
6 | - Profile Bio
7 | - Company Name
8 | - Email Address
9 | - Geographical Location
10 | - Twitter Username linked to the GitHub profile
11 | - Personal Blog/Website HTTP Response Status Code. If the code is 0, the user does not have a personal blog or website linked to their GitHub profile. If the code is -1, they listed a blog/website but it is not accessible (a strong signal of fraudulent activity).
12 | 
13 | ### Key Guidelines:
14 | 1. Evaluate the presence, content, and consistency of the profile attributes; having an email is a good signal that the account is not fraudulent.
15 | 2. Analyze whether the profile details (e.g., bio, company, blog) appear authentic and aligned with typical GitHub user behavior. Bios that are professional and detailed are more likely to be authentic. Bios with choppy, fragmented text and short phrases are less likely to be authentic.
16 | 3. Handle missing or null attributes objectively, treating their absence as potentially suspicious, without making too strong an assumption.
17 | 4. Weigh details like generic or unverifiable information (e.g., placeholder text in the bio or invalid links) as more likely to indicate suspicious activity.
18 | 5. Calculate a fraud score ranging between **0.0 (very trustworthy)** and **1.0 (high fraud likelihood)**. Provide clear reasoning for how the score was computed.
19 | 
20 | Finally, return a compact JSON object with the fraud score and a brief explanation of how the GitHub profile was evaluated, listing the main contributing factors.
21 | """
22 | 
23 | USER_PROMPT: str = """Analyze a GitHub profile for potential fraudulent behavior based on the attributes provided. Below are the details of the profile:
24 | 
25 | - GitHub Username: {{GithubFraud.username}}
26 | - Full Name: {{GithubFraud.user.full_name}}
27 | - Profile Bio: {{GithubFraud.user.bio}}
28 | - Company: {{GithubFraud.user.company}}
29 | - Email: {{GithubFraud.user.email}}
30 | - Location: {{GithubFraud.user.location}}
31 | - Twitter Username: {{GithubFraud.user.twitter_username}}
32 | - Personal Blog/Website HTTP Response Status Code: {{GithubFraud.user_website_status_code}}
33 | 
34 | ### Tasks:
35 | 1. Identify whether the profile details (if present) align with typical behavior and authentic information on GitHub.
36 | 2. Assess the impact of the content (e.g., detailed bio, real company, valid blog link) on the likelihood of suspicious activity.
37 | 3. Treat missing or generic information (e.g., no bio or placeholder text) as a signal of potential fraud risk.
38 | 4. Assign a **fraud score** between 0.0 and 1.0 and explain your reasoning.
39 | """
40 | 
-------------------------------------------------------------------------------- /credit/2_accounts.py: --------------------------------------------------------------------------------
1 | """An example of connecting Users to Bank Accounts through Chalk.
2 | 
3 | In this example, we connect Users to Bank Accounts. On top of
4 | this, we show how to use a connected postgres datasource to
5 | resolve features.
6 | """
7 | 
8 | from datetime import datetime
9 | 
10 | from chalk import online
11 | from chalk.features import features, DataFrame, FeatureTime
12 | from chalk.sql import PostgreSQLSource
13 | 
14 | # This example assumes a postgres database has been added through the Chalk
15 | # dashboard where it was assigned a name of "CLOUD_DB". The database should
16 | # contain an accounts table with 'id', 'bank_account_number', 'decision',
17 | # 'user_id', 'created_at', and 'updated_at' fields.
18 | 
19 | pg = PostgreSQLSource(name="CLOUD_DB")
20 | 
21 | 
22 | @features
23 | class Account:
24 | id: int
25 | bank_account_number: int
26 | decision: str
27 | user_id: int
28 | created_at: datetime
29 | # https://docs.chalk.ai/docs/features#feature-time
30 | updated_at: FeatureTime
31 | 
32 | 
33 | # This call connects the Account feature class to the "accounts" table of the `PostgreSQLSource`
34 | # that we instantiated above.
35 | pg.with_table( 36 | name="accounts", 37 | features=Account, 38 | column_to_feature={ 39 | "id": Account.id, 40 | "bank_account_number": Account.bank_account_number, 41 | "user_id": Account.user_id, 42 | "created_at": Account.created_at, 43 | "updated_at": Account.updated_at, 44 | }, 45 | ) 46 | 47 | 48 | @features 49 | class User: 50 | id: Account.user_id 51 | name: str 52 | accounts: DataFrame[Account] 53 | 54 | # computed 55 | number_of_accounts: int 56 | 57 | 58 | @online 59 | def count_accounts(accounts: User.accounts) -> User.number_of_accounts: 60 | return len(accounts) 61 | 62 | # --------------------------------------------------------------------------------- 63 | # Let us assume that our postgres database also contains a `users` table with 64 | # the fields: ['id', 'first_name', 'last_name', 'birthday', 'age']. To get data 65 | # into this feature, we could update our base User feature class to include all 66 | # the fields specified in the database table and use the same `with_table` syntax 67 | # that we used to populate our `Account` feature. However, suppose we want to keep 68 | # the User feature lean or that we want to apply some simple transformation on the 69 | # raw data without using a python resolver. 70 | # 71 | # To accomplish this we can use what Chalk refers to as a `sql_file_resolver` 72 | # (https://docs.chalk.ai/docs/sql#sql-file-resolvers). Essentially, we can 73 | # resolve the User feature by placing a file called `get_user.chalk.sql` in our 74 | # Chalk Directory and adding some metadata specifying the name of the resolved 75 | # feature and the upstream raw data source. It would wind up looking like the 76 | # following: 77 | # --------------------------------------------------------------------------------- 78 | # 79 | # -- type: online 80 | # -- resolvers: user 81 | # -- source: CLOUD_DB 82 | # select id, first_name||last_name as name FROM users; 83 | -------------------------------------------------------------------------------- /full_examples/sagemaker/README.md: -------------------------------------------------------------------------------- 1 | # Integrating Chalk with AWS Sagemaker 2 | 3 | Chalk integrates nicely with machine learning frameworks like AWS Sagemaker. 4 | 5 | You can use Chalk to define your transformed features and pull datasets directly into your 6 | model training pipeline. Using Chalk for dataset generation ensures that feature transformation 7 | code is consistent between training and serving. 8 | 9 | ## Setup 10 | 11 | To pull a dataset from Chalk into Sagemaker, run an offline query with Chalk's Python API client 12 | in a Sagemaker step. Chalk offline queries return datasets, which can be uploaded to a 13 | bucket and used in the subsequent steps of your machine learning pipeline. 
14 | 
15 | **[steps/dataset.py](./steps/dataset.py)**
16 | 
17 | ```python
18 | from chalk.client import ChalkClient
19 | from sagemaker.workflow.function_step import step
20 | 
21 | 
22 | TRAINING_FEATURES = [
23 | "transaction.amt",
24 | "transaction.customer.age",
25 | "transaction.customer.income",
26 | "transaction.customer.fico",
27 | "transaction.customer.transaction_sum_30m",
28 | "transaction.customer.transaction_sum_1h",
29 | "transaction.confirmed_fraud"
30 | ]
31 | 
32 | TARGET_FEATURE = "transaction.confirmed_fraud"
33 | 
34 | @step(
35 | name="create_dataset",
36 | instance_type='ml.t3.medium',
37 | keep_alive_period_in_seconds=300,
38 | )
39 | def create_dataset(test_size, run_bucket):
40 | from sklearn.model_selection import train_test_split
41 | 
42 | # A Chalk client ID and client secret for a token with permission to create datasets
43 | # should be added to the Sagemaker environment—these are passed automatically to the
44 | # ChalkClient but can also be explicitly passed as arguments.
45 | 
46 | chalk_dataset = ChalkClient(
47 | # client_id=os.environ['CHALK_CLIENT_ID'], # automatically loaded by the Chalk Client if in the environment
48 | # client_secret=os.environ['CHALK_CLIENT_SECRET'] # automatically loaded by the Chalk Client if in the environment
49 | ).offline_query(
50 | max_samples=100_000, # reads 100,000 samples from the Chalk dataset
51 | output=TRAINING_FEATURES,
52 | dataset_name="transactions_fraud_model",
53 | )
54 | dataset = chalk_dataset.to_pandas()
55 | 
56 | X_train, X_test, y_train, y_test = train_test_split(
57 | dataset.drop(columns=[TARGET_FEATURE]), # X
58 | dataset[TARGET_FEATURE], # y
59 | test_size=test_size,
60 | )
61 | 
62 | xtrain_path = f"{run_bucket}/input/X_train.parquet"
63 | xtest_path = f"{run_bucket}/input/X_test.parquet"
64 | ytrain_path = f"{run_bucket}/input/y_train.parquet"
65 | ytest_path = f"{run_bucket}/input/y_test.parquet"
66 | 
67 | dataset.to_parquet(f"{run_bucket}/raw_data/data.parquet")
68 | X_train.to_parquet(xtrain_path)
69 | X_test.to_parquet(xtest_path)
70 | y_train.to_frame().to_parquet(ytrain_path)  # the targets are Series, so convert to frames first
71 | y_test.to_frame().to_parquet(ytest_path)
72 | 
73 | return xtrain_path, xtest_path, ytrain_path, ytest_path
74 | ```
75 | 
76 | Subsequent Sagemaker steps can then pull the dataset from the paths returned by the `create_dataset` step.
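For example, a follow-on training step might read those splits back and fit a model (a rough sketch; the estimator and artifact handling are illustrative):

```python
from sagemaker.workflow.function_step import step


@step(name="train_model", instance_type="ml.m5.xlarge")
def train_model(xtrain_path: str, ytrain_path: str) -> str:
    import joblib
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    X_train = pd.read_parquet(xtrain_path)
    y_train = pd.read_parquet(ytrain_path)

    model = LogisticRegression(max_iter=1000).fit(X_train, y_train.values.ravel())

    # Persist the fitted model locally; it can be uploaded to the run bucket
    # in the same way the dataset step writes its parquet files.
    model_path = "model.joblib"
    joblib.dump(model, model_path)
    return model_path
```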
-------------------------------------------------------------------------------- /13_airflow/chalk_airflow.py: --------------------------------------------------------------------------------
1 | 
2 | 
3 | import pendulum
4 | from airflow.decorators import dag, task
5 | from airflow.exceptions import AirflowFailException
6 | from airflow.sensors.base import PokeReturnValue
7 | 
8 | from chalk.client import ChalkClient
9 | 
10 | 
11 | @dag(
12 | schedule=None,
13 | start_date=pendulum.datetime(2024, 5, 7, tz="UTC"),
14 | catchup=False,
15 | tags=["chalk"],
16 | )
17 | def taskflow_with_chalk():
18 | """
19 | A simple example of an Airflow DAG that triggers Chalk resolvers
20 | """
21 | 
22 | @task()
23 | def extract(): ...
24 | 
25 | @task(multiple_outputs=True)
26 | def transform(): ...
27 | 
28 | @task()
29 | def load(): ...
30 | 
31 | @task.virtualenv(
32 | task_id="virtualenv_python",
33 | requirements=["chalkpy"],
34 | system_site_packages=False,
35 | )
36 | def run_chalk_resolver_virtual_env():
37 | """
38 | Trigger the get_users resolver in a virtual environment
39 | """
40 | from chalk.client import ChalkClient
41 | 
42 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
43 | # are passed to Airflow.
44 | client = ChalkClient()
45 | 
46 | result = client.trigger_resolver_run(
47 | "get_users" # this is the name of our sql file resolver {name}.chalk.sql
48 | )
49 | if result.status == "failed":
50 | raise AirflowFailException(f"Resolver run failed: {result}")
51 | return result.id
52 | 
53 | @task()
54 | def run_chalk_resolver() -> str:
55 | """
56 | Trigger the get_users resolver
57 | """
58 | 
59 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
60 | # are passed to Airflow.
61 | client = ChalkClient()
62 | 
63 | result = client.trigger_resolver_run(
64 | "get_users" # this is the name of our sql file resolver {name}.chalk.sql
65 | )
66 | if result.status == "failed":
67 | raise AirflowFailException(f"Resolver run failed: {result}")
68 | return result.id
69 | 
70 | @task.sensor(poke_interval=30, timeout=60 * 5)
71 | def poll_resolver_run(run_id) -> PokeReturnValue:
72 | """
73 | Poll the running Chalk resolver
74 | """
75 | 
76 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
77 | # are passed to Airflow.
78 | client = ChalkClient()
79 | 
80 | status = client.get_run_status(run_id).status
81 | if status == "succeeded":
82 | return PokeReturnValue(True, run_id)
83 | elif status == "failed":
84 | raise AirflowFailException(f"Chalk resolver run failed: {run_id}")
85 | return PokeReturnValue(False)
86 | 
87 | extract()
88 | transform()
89 | load()
90 | rid = run_chalk_resolver()
91 | poll_resolver_run(rid)
92 | # run_chalk_resolver_virtual_env()
93 | 
94 | 
95 | taskflow_with_chalk()
96 | 
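# With a running Airflow instance, this DAG can be exercised from the CLI by
# its id (a hypothetical invocation):
#
#     airflow dags trigger taskflow_with_chalk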
-------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/fraud_model.py: --------------------------------------------------------------------------------
1 | import os
2 | from chalk import DataFrame, offline
3 | from src.models import Transaction
4 | import onnxruntime as rt
5 | from functools import cached_property
6 | import numpy as np
7 | 
8 | 
9 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10 | 
11 | 
12 | class PredictionModel:
13 | """
14 | # Previously, we trained a model on our user data. This
15 | # model has been saved to our local chalk directory next to
16 | # our feature and resolver code. When we run chalk apply
17 | # it will be incorporated into deployments.
18 | 
19 | from sklearn.linear_model import LogisticRegression
20 | 
21 | X_train, y_train = ...
22 | 
23 | model = LogisticRegression()
24 | model.fit(X_train, y_train)
25 | model
26 | """
27 | 
28 | def __init__(self, filename: str):
29 | self.filename = filename
30 | self.input_name = None
31 | self.output_name = None
32 | 
33 | @cached_property
34 | def _model(self) -> rt.InferenceSession:
35 | # The "TARGET_ROOT" environment variable is set by Chalk for both branch and
36 | # standard deployments. You can read more about it on our docs:
37 | # https://docs.chalk.ai/docs/env-vars#chalk-environment-variable
38 | filepath = os.path.join(
39 | os.environ.get("TARGET_ROOT", ROOT_DIR), "models", self.filename
40 | )
41 | 
42 | if not os.path.exists(filepath):
43 | raise FileNotFoundError(f"Model file not found: {filepath}")
44 | 
45 | try:
46 | session = rt.InferenceSession(filepath)
47 | except Exception as e:
48 | raise RuntimeError(f"Failed to load ONNX model from {filepath}: {e}") from e
49 | 
50 | self.input_name = session.get_inputs()[0].name
51 | self.output_name = session.get_outputs()[0].name
52 | 
53 | return session
54 | 
55 | def predict(self, data: np.ndarray):
56 | return self._model.run([self.output_name], {self.input_name: data})[0]
57 | 
58 | 
59 | # the model has been trained and saved in our local Chalk directory
60 | # models/fraud_model.onnx
61 | fraud_model = PredictionModel("fraud_model.onnx")
62 | 
63 | 
64 | @offline
65 | def run_fraud_model(
66 | features: DataFrame[
67 | Transaction.id,
68 | Transaction.amount,
69 | Transaction.user.time_since_last_transaction,
70 | Transaction.user.num_transactions["1d"],
71 | Transaction.user.num_transactions["10d"],
72 | Transaction.user.num_transactions["30d"],
73 | Transaction.user.num_distinct_merchants_transacted["1d"],
74 | Transaction.user.num_distinct_merchants_transacted["10d"],
75 | Transaction.user.num_distinct_merchants_transacted["30d"],
76 | ],
77 | ) -> DataFrame[Transaction.id, Transaction.is_fraud]:
78 | """Predict whether new transactions are fraudulent based on transaction and user data."""
79 | 
80 | predictions = fraud_model.predict(
81 | features.to_pandas().drop(columns=[Transaction.id]).astype(np.float32).values,
82 | )
83 | 
84 | return features[Transaction.id].with_columns({Transaction.is_fraud: predictions})
85 | 
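# As a quick illustration (hypothetical shapes and values), the wrapper can be
# exercised directly; here the model is assumed to take 8 float32 inputs:
#
#     import numpy as np
#
#     preds = fraud_model.predict(np.zeros((4, 8), dtype=np.float32))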
-------------------------------------------------------------------------------- /github/features/github/github_user.py: --------------------------------------------------------------------------------
1 | from datetime import datetime
2 | 
3 | import chalk.functions as F
4 | from chalk.features import DataFrame, Primary, _, features, has_many
5 | 
6 | 
7 | @features
8 | class GithubUserStarredRepo:
9 | path: Primary[str]
10 | description: str | None
11 | homepage: str | None
12 | stargazers_count: int | None
13 | language: str | None
14 | license: str | None
15 | open_issues_count: int | None
16 | forks_count: int | None
17 | url_html: str | None
18 | url_api: str | None
19 | 
20 | username: str
21 | 
22 | 
23 | def parse_starred_github_repos_for_github_user(
24 | username: str,
25 | repo_data: dict | None,
26 | ) -> GithubUserStarredRepo | None:
27 | if not repo_data:
28 | return None
29 | 
30 | repo_license: str = "MISSING"
31 | if temp_license_data := repo_data.get("license"):
32 | if temp_license := temp_license_data.get("spdx_id"):
33 | repo_license = str(temp_license)
34 | 
35 | path: str | None = repo_data.get("full_name")
36 | return GithubUserStarredRepo(
37 | username=username,
38 | path=path,
39 | description=repo_data.get("description"),
40 | homepage=repo_data.get("homepage"),
41 | stargazers_count=repo_data.get("stargazers_count"),
42 | language=repo_data.get("language"),
43 | license=repo_license,
44 | open_issues_count=repo_data.get("open_issues_count"),
45 | forks_count=repo_data.get("forks_count"),
46 | url_api=repo_data.get("url"),
47 | url_html=repo_data.get("html_url"),
48 | )
49 | 
50 | 
51 | @features(max_staleness="infinity")
52 | class GithubUser:
53 | login: str
54 | name: Primary[str]
55 | id: int
56 | node_id: str
57 | bio: str | None
58 | blog: str
59 | company: str | None
60 | created_at: datetime | None
61 | email: str | None
62 | events_url: str | None
63 | followers: int
64 | following: int
65 | full_name: str | None
66 | gists_url: str | None
67 | gravatar_id: str | None
68 | hireable: bool | None
69 | location: str | None
70 | public_gists: int
71 | public_repos: int
72 | received_events_url: str | None
73 | site_admin: bool | None
74 | subscriptions_url: str | None
75 | twitter_username: str | None
76 | updated_at: datetime | None
77 | user_view_type: str | None
78 | url_api: str
79 | url_avatar: str
80 | url_html: str
81 | url_followers: str
82 | url_following: str
83 | url_starred: str
84 | url_organizations: str
85 | url_repos: str
86 | type: str
87 | 
88 | updated_at_chalk: datetime
89 | 
90 | starred_repos: DataFrame[GithubUserStarredRepo] = has_many(
91 | lambda: GithubUser.name == GithubUserStarredRepo.username,
92 | )
93 | starred_most_recent_path: str = F.array_join(
94 | F.head(
95 | _.starred_repos[_.path],
96 | n=1,
97 | ),
98 | delimiter="",
99 | )
100 | starred_most_recent_url: str | None = (
101 | "https://github.com/" + _.starred_most_recent_path
102 | )
103 | 
-------------------------------------------------------------------------------- /03_caching/README.md: --------------------------------------------------------------------------------
1 | # Caching
2 | When a feature is expensive or slow to compute,
3 | you may wish to cache its value.
4 | Chalk uses the terminology "maximum staleness"
5 | to describe how recently a feature value needs
6 | to have been computed to be returned without
7 | re-running a resolver.
8 | 
9 | https://docs.chalk.ai/docs/feature-caching
10 | 
11 | ## 1. Basic Caching
12 | Cache feature values rather than computing them in real time.
13 | 
14 | **[1_basic_caching.py](1_basic_caching.py)**
15 | 
16 | ```python
17 | @features
18 | class User:
19 | fico_score: int = feature(max_staleness="30d")
20 | ```
21 | https://docs.chalk.ai/docs/feature-caching
22 | 
23 | ## 2. Latest Computed Value
24 | Cache the last computed value of the feature.
25 | 
26 | **[2_latest_value.py](2_latest_value.py)**
27 | 
28 | ```python
29 | @features
30 | class User:
31 | fico_score: int = feature(max_staleness="infinity")
32 | ```
33 | https://docs.chalk.ai/docs/feature-caching
34 | 
35 | ## 3. Intermediate Feature Values
36 | Cache intermediate feature values.
37 | 
38 | **[3_intermediates.py](3_intermediates.py)**
39 | 
40 | ```python
41 | ChalkClient().query(
42 | input={ ... },
43 | # User.fico_score is not requested in the output...
44 | output=[User.risk_score],
45 | # ...but you can specify the staleness anyhow!
46 | staleness={User.fico_score: "10m"},
47 | )
48 | ```
49 | https://docs.chalk.ai/docs/query-caching
50 | 
51 | ## 4. Override Max-Staleness
52 | Set max-staleness per-request.
53 | 
54 | **[4_override_max_staleness.py](4_override_max_staleness.py)**
55 | 
56 | ```python
57 | @features
58 | class User:
59 | fico_score: int = feature(max_staleness="30d")
60 | 
61 | ChalkClient().query(
62 | input={...},
63 | output=[User.fico_score],
64 | staleness={User.fico_score: "10m"},
65 | )
66 | ```
67 | https://docs.chalk.ai/docs/query-caching
68 | 
69 | ## 5. Override Cache Values
70 | Supply a feature value in the input to skip the cache and any resolver entirely.
71 | 
72 | **[5_override_cache_values.py](5_override_cache_values.py)**
73 | 
74 | ```python
75 | @features
76 | class User:
77 | fico_score: int = feature(max_staleness="30d")
78 | 
79 | ChalkClient().query(
80 | input={User.fico_score: 1, ...},
81 | output=[...],
82 | )
83 | ```
84 | https://docs.chalk.ai/docs/query-caching
85 | 
86 | ## 6. Cache Busting
87 | Bypass the cache with a max-staleness of 0.
88 | 
89 | **[6_cache_busting.py](6_cache_busting.py)**
90 | 
91 | ```python
92 | ChalkClient().query(
93 | input={...},
94 | output=[User.fico_score],
95 | staleness={User.fico_score: "0s"},
96 | )
97 | ```
98 | https://docs.chalk.ai/docs/query-caching#cache-busting
99 | 
100 | 
101 | ## 7. Pre-Fetching
102 | Keep the cache warm by scheduling a resolver to run
103 | more frequently than the max-staleness.
104 | 
105 | **[7_prefetching.py](7_prefetching.py)**
106 | 
107 | ```python
108 | @features
109 | class User:
110 | fico_score: int = feature(max_staleness="30d")
111 | 
112 | @realtime(cron="29d 11h")
113 | def get_fico_score(name: User.name) -> User.fico_score:
114 | return requests.get("https://experian.com").json()["score"]
115 | ```
116 | https://docs.chalk.ai/docs/resolver-cron
117 | 
118 | 
-------------------------------------------------------------------------------- /ecommerce/3_streams.py: --------------------------------------------------------------------------------
1 | from enum import Enum
2 | from datetime import datetime
3 | 
4 | from chalk import online
5 | from chalk.features.resolver import make_stream_resolver
6 | from chalk.features import DataFrame, FeatureTime, features, _, has_many
7 | from chalk.streams import KafkaSource
8 | from pydantic import BaseModel
9 | 
10 | 
11 | @features
12 | class Seller:
13 | id: str
14 | categories: set[str]
15 | 
16 | 
17 | @features
18 | class User:
19 | id: str
20 | age: int
21 | favorite_categories: set[str]
22 | 
23 | 
24 | @features
25 | class UserSeller:
26 | id: str
27 | user_id: User.id
28 | user: User
29 | seller_id: Seller.id
30 | seller: Seller
31 | favorites_match: bool
32 | user_seller_score: int
33 | 
34 | interactions: "DataFrame[Interaction]" = has_many(
35 | lambda: (UserSeller.user_id == Interaction.user_id) & (UserSeller.seller_id == Interaction.seller_id)
36 | )
37 | 
38 | number_of_interactions: int = _.interactions.count()
39 | 
40 | 
41 | class InteractionKind(Enum):
42 | LIKE = "LIKE"
43 | VIEW = "VIEW"
44 | PURCHASE = "PURCHASE"
45 | OTHER = "OTHER"
46 | 
47 | @classmethod
48 | def _missing_(cls, _):
49 | return cls.OTHER
50 | 
51 | 
52 | @features
53 | class Interaction:
54 | id: str
55 | user_id: User.id
56 | user: User
57 | seller_id: Seller.id
58 | seller: Seller
59 | interaction_kind: InteractionKind
60 | on: FeatureTime
61 | 
62 | 
63 | interaction_stream = KafkaSource(name="interactions")
64 | 
65 | 
66 | class InteractionMessage(BaseModel):
67 | id: str
68 | user_id: str
69 | seller_id: str
70 | interaction_kind: str
71 | ingestion_time: datetime
72 | 
73 | 
74 | process_interactions = make_stream_resolver(
75 | name="process_interactions",
76 | source=interaction_stream,
77 | message_type=InteractionMessage,
78 | output_features={
79 | Interaction.id: _.message.id,
80 | Interaction.user_id: _.message.user_id,
81 | Interaction.seller_id: _.message.seller_id,
82 | Interaction.interaction_kind: _.message.interaction_kind,
83 | Interaction.on: _.ingestion_time,
84 | },
85 | )
86 | 
87 | 
88 | @online
89 | def get_similarity(
90 | fc: UserSeller.user.favorite_categories, fc2: UserSeller.seller.categories
91 | ) -> UserSeller.favorites_match:
92 | return
len(fc & fc2) > 0 93 | 94 | 95 | if __name__ == "__main__": 96 | from chalk.client import ChalkClient 97 | 98 | client = ChalkClient() 99 | user_stores = client.query( 100 | input=[ 101 | UserSeller(user_id="123", seller_id="456"), 102 | UserSeller(user_id="123", seller_id="457"), 103 | UserSeller(user_id="123", seller_id="458"), 104 | UserSeller(user_id="123", seller_id="458"), 105 | UserSeller(user_id="123", seller_id="456"), 106 | UserSeller(user_id="123", seller_id="461"), 107 | UserSeller(user_id="123", seller_id="460"), 108 | ], 109 | output=[ 110 | UserSeller.user.id, 111 | UserSeller.seller.id, 112 | UserSeller.favorites_match, 113 | UserSeller.number_of_interactions, 114 | ], 115 | ) 116 | print(user_stores) 117 | -------------------------------------------------------------------------------- /github/features/search/github_search.py: -------------------------------------------------------------------------------- 1 | # trunk-ignore-all(ruff/N812) 2 | import chalk.functions as F 3 | import chalk.prompts as P 4 | from chalk.features import ( 5 | DataFrame, 6 | Primary, 7 | Vector, 8 | _, 9 | embed, 10 | features, 11 | ) 12 | from pydantic import BaseModel, Field 13 | 14 | from src.github.features import GithubRepoDocVDB 15 | from src.github.features.cerebras.cerebras import ( 16 | CEREBRAS_API_KEY, 17 | CEREBRAS_BASE_URL, 18 | CEREBRAS_MODEL, 19 | CEREBRAS_MODEL_PROVIDER, 20 | ) 21 | 22 | from .prompts import ( 23 | SYSTEM_PROMPT, 24 | USER_PROMPT, 25 | ) 26 | 27 | CHAT_MAX_TOKENS: int = 8192 28 | CHAT_TEMPERATURE: float = 0.1 29 | CHAT_TOP_P: float = 0.1 30 | 31 | 32 | class StructuredOutput(BaseModel): 33 | repo_url: str = Field( 34 | description="The URL of the best matching GitHub repository", 35 | ) 36 | confidence: float = Field( 37 | description="The confidence threshold for the generated summary, between 0 and 1", 38 | ) 39 | summary: str = Field( 40 | description="What this repo does and why it was selected", 41 | ) 42 | 43 | 44 | @features 45 | class GithubSearch: 46 | query: Primary[str] 47 | limit: int = 10 48 | vector: Vector[768] = embed( 49 | input=lambda: GithubSearch.query, 50 | provider="vertexai", # openai 51 | model="text-embedding-005", # text-embedding-3-small 52 | ) 53 | 54 | results: DataFrame[GithubRepoDocVDB] 55 | urls_in_list: list[str] = F.array_agg( 56 | expr=_.results[_.url], 57 | ) 58 | urls_in: str = F.array_join( 59 | arr=_.urls_in_list, 60 | delimiter="\n\n====\n\n", 61 | ) 62 | 63 | individual_descriptions: list[str] = F.array_agg( 64 | expr=_.results[_.ai_summary], 65 | ) 66 | descriptions: str = F.array_join( 67 | arr=_.individual_descriptions, 68 | delimiter="\n\n====\n\n", 69 | ) 70 | 71 | distances_in_list: list[float] = F.array_agg( 72 | expr=_.results[_.distance,], 73 | ) 74 | 75 | # https://chalk.ai/projects/dmo5dhaj3yqu/environments/dvxenv/prompts 76 | # Can also edit prompts from the dashboard 77 | # completion_gui: P.PromptResponse = P.run_prompt("repo_summary") 78 | completion: P.PromptResponse = P.completion( 79 | api_key=CEREBRAS_API_KEY, 80 | model_provider=CEREBRAS_MODEL_PROVIDER, 81 | model=CEREBRAS_MODEL, 82 | base_url=CEREBRAS_BASE_URL, 83 | max_tokens=CHAT_MAX_TOKENS, 84 | temperature=CHAT_TEMPERATURE, 85 | top_p=CHAT_TOP_P, 86 | messages=[ 87 | P.message( 88 | role="system", 89 | content=SYSTEM_PROMPT, 90 | ), 91 | P.message( 92 | role="user", 93 | content=F.jinja(USER_PROMPT), 94 | ), 95 | ], 96 | output_structure=StructuredOutput, 97 | ) 98 | 99 | c_url: str = F.json_value( 100 | _.completion.response, 101 | "$.repo_url", 
102 | )
103 | c_confidence: float = F.json_value(
104 | _.completion.response,
105 | "$.confidence",
106 | )
107 | c_summary: str = F.json_value(
108 | _.completion.response,
109 | "$.summary",
110 | )
111 | 
-------------------------------------------------------------------------------- /fraud/README.md: --------------------------------------------------------------------------------
1 | # Fraud Detection
2 | 
3 | Finding a balance between user experience and
4 | risk management is a complex task for banking
5 | products. Chalk helps you express complex business
6 | logic with features and resolvers, and lets data
7 | scientists and machine learning engineers collaborate
8 | on solutions.
9 | 
10 | ## 1. Returns
11 | 
12 | Identify transactions returned for non-sufficient funds.
13 | 
14 | **[1_return.py](1_return.py)**
15 | 
16 | ```python
17 | @online
18 | def get_transaction_is_nsf(
19 | memo_clean: Transaction.clean_memo,
20 | ) -> Transaction.is_nsf:
21 | return "nsf" in memo_clean.lower()
22 | 
23 | @online
24 | def get_nsf_amount(
25 | amounts: User.transactions[
26 | Transaction.is_nsf is True,
27 | Transaction.amount
28 | ]
29 | ) -> User.nsf_amount:
30 | return amounts.sum()
31 | ```
32 | 
33 | https://docs.chalk.ai/docs/python-resolvers
34 | 
35 | ## 2. Changes in Behavior
36 | 
37 | Detect changes in user behavior over time.
38 | 
39 | **[2_patterns.py](2_patterns.py)**
40 | 
41 | ```python
42 | @online
43 | def get_transaction_trend(
44 | this_year_txns: User.transactions[after(days_ago=365)],
45 | last_year_txns: User.transactions[
46 | before(days_ago=365),
47 | after(days_ago=365 * 2)
48 | ]
49 | ) -> User.change_from_last_year:
50 | sum_last = last_year_txns[Transaction.amount].sum()
51 | sum_this = this_year_txns[Transaction.amount].sum()
52 | return (sum_last - sum_this) / sum_last
53 | ```
54 | 
55 | https://docs.chalk.ai/docs/window-functions
56 | 
57 | ## 3. Identity Verification
58 | 
59 | Make use of vendor APIs to verify identities, and control costs with Chalk's platform.
60 | 
61 | **[3_identity.py](3_identity.py)**
62 | 
63 | ```python
64 | @features
65 | class User:
66 | id: str
67 | socure_score: float = feature(max_staleness="30d")
68 | 
69 | @online
70 | def get_socure_score(uid: User.id) -> Features[User.socure_score]:
71 | return (
72 | requests.get("https://api.socure.com", json={
73 | "id": uid
74 | }).json()['socure_score']
75 | )
76 | ```
77 | 
78 | https://docs.chalk.ai/docs/feature-caching
79 | 
80 | ## 4. Withdrawal Model
81 | 
82 | Decide and enforce withdrawal limits with custom hold times.
83 | 
84 | **[4_withdrawal_model.py](4_withdrawal_model.py)**
85 | 
86 | ```python
87 | @realtime(when=TransferLimit.to_account.is_internal is False)
88 | def withdrawal_limit(
89 | internal_accounts: TransferLimit.user.accounts[Account.is_internal is True],
90 | deposits_last_90: TransferLimit.user.transfers[Transfer.from_account.is_internal is False, before(days_ago=90)],
91 | user_settlement: TransferLimit.user.holdback,
92 | ) -> TransferLimit.amount:
93 | ...
94 | ```
95 | 
96 | https://docs.chalk.ai/docs/resolver-overview
97 | 
98 | ## 5. Account Takeover
99 | 
100 | Aggregate failed logins over a Kafka stream.
101 | 
102 | **[5_account_takeover.py](5_account_takeover.py)**
103 | 
104 | ```python
105 | @stream(...)
106 | def agg_logins(df: DataFrame[LoginMessage]) -> DataFrame[User]:
107 | return f"""
108 | select
109 | count(*) as failed_logins,
110 | user_id as id
111 | from {df}
112 | where status = 'failed'
113 | group by id
114 | """
115 | ```
116 | 
117 | https://docs.chalk.ai/docs/aggregations
118 | 
-------------------------------------------------------------------------------- /02_resolvers/6_sharing_resolvers.py: --------------------------------------------------------------------------------
1 | from chalk import online
2 | from chalk.client import ChalkClient
3 | from chalk.features import DataFrame, FeatureTime, after, features, has_many, has_one
4 | from chalk.sql import PostgreSQLSource
5 | 
6 | 
7 | # Imagine that we have two models:
8 | # 1. send_reminder_email: decides when we should send our next reminder email
9 | # 2. expected_loan_repayment: predicts the amount of money we expect to collect
10 | #
11 | # First, we'll lay out some feature classes for this problem:
12 | @features
13 | class EmailRecord:
14 | id: str
15 | user_id: str
16 | user: "User"
17 | sent_at: FeatureTime
18 | 
19 | 
20 | @features
21 | class User:
22 | id: str
23 | name: str
24 | emails_sent_last_10_days: int
25 | email_history: DataFrame[EmailRecord] = has_many(
26 | lambda: EmailRecord.user_id == User.id
27 | )
28 | 
29 | 
30 | # The business logic for a feature is written only once,
31 | # though it can be referenced many times.
32 | @online
33 | def get_emails_sent_last_10_days(
34 | emails: User.email_history[after(days_ago=10)],
35 | ) -> User.emails_sent_last_10_days:
36 | return emails.count()
37 | 
38 | 
39 | # For our second model on expected loan repayment, we'll first model
40 | # another feature class around loans:
41 | @features
42 | class Loan:
43 | id: int
44 | user_id: str
45 | amount: float
46 | user: User = has_one(lambda: User.id == Loan.user_id)
47 | 
48 | 
49 | # Here, we leverage the work we did to build the
50 | # feature `User.emails_sent_last_10_days` from the
51 | # first model by requesting
52 | # `Loan.user.emails_sent_last_10_days`.
53 | # We configure this postgres source in the Chalk dashboard.
54 | db = PostgreSQLSource()
55 | 
56 | 
57 | # Work for sql queries is shared in the same way.
58 | # For example, we need to be able to resolve the
59 | # fields of `EmailRecord`.
60 | @online
61 | def get_email_record(user: User.id) -> DataFrame[EmailRecord]:
62 | return db.query_string(
63 | "select id, sent_at, user_id from email_record where user_id=:uid",
64 | fields=dict(
65 | id=EmailRecord.id,
66 | sent_at=EmailRecord.sent_at,
67 | user_id=EmailRecord.user_id,
68 | ),
69 | args=dict(uid=user),
70 | ).all()
71 | 
72 | 
73 | if __name__ == "__main__":
74 | 
75 | # For this first model, we request the `User.name`
76 | # and `User.emails_sent_last_10_days` features under
77 | # the query name `send_reminder_email`.
78 | ChalkClient().query(
79 | input={User.id: "1"},
80 | output=[
81 | User.emails_sent_last_10_days,
82 | User.name,
83 | ],
84 | # This optional `query_name` associates the data
85 | # that we requested with a given model for monitoring
86 | # and migrations.
87 | query_name="send_reminder_email",
88 | )
89 | 
90 | ChalkClient().query(
91 | input={Loan.id: 1},
92 | output=[
93 | Loan.user.emails_sent_last_10_days,
94 | Loan.amount,
95 | ],
96 | query_name="expected_loan_repayment",
97 | )
98 | 
99 | # Here, we're running a really basic query that just maps columns to features.
100 | # For these simple queries, there's a shortcut to automatically ingest these 101 | # tables: 102 | db.with_table(name="email_record", features=EmailRecord) 103 | --------------------------------------------------------------------------------