├── github ├── __init__.py ├── features │ ├── fraud │ │ ├── __init__.py │ │ ├── github_event.chalk.sql │ │ └── prompts.py │ ├── groq │ │ ├── __init__.py │ │ └── groq.py │ ├── cerebras │ │ ├── __init__.py │ │ └── cerebras.py │ ├── github │ │ ├── __init__.py │ │ ├── github_repo_document_vector_database.py │ │ ├── github_archive.py │ │ ├── github_repo.py │ │ └── github_user.py │ ├── search │ │ ├── __init__.py │ │ ├── prompts.py │ │ └── github_search.py │ ├── __init__.py │ ├── named_queries.py │ └── github_feature_set.py └── sql │ ├── github_archive_stars.chalk.sql │ ├── github_archive_stars.sql │ ├── github_owner.sql │ └── github_repo.sql ├── 14_codegen ├── __init__.py ├── score_resolvers.py ├── models.py ├── README.md └── custom_model.py ├── call_recordings ├── __init__.py ├── features │ ├── fathom │ │ ├── __init__.py │ │ ├── fathom_meeting_insights_sales.py │ │ └── fathom_message_webhook.py │ └── __init__.py └── sql │ ├── fathom_call.chalk.sql │ ├── fathom_message.chalk.sql │ └── fathom_call_data.chalk.sql ├── full_examples ├── batch_ml │ ├── tests │ │ ├── __init__.py │ │ └── test_batch_prediction.py │ ├── models │ │ └── fraud_model.onnx │ ├── chalk.yaml │ ├── src │ │ ├── datasources.py │ │ ├── resolvers │ │ │ ├── sql │ │ │ │ ├── get_users.chalk.sql │ │ │ │ ├── get_users_offline.chalk.sql │ │ │ │ ├── get_transactions.chalk.sql │ │ │ │ └── get_transactions_offline.chalk.sql │ │ │ └── fraud_model.py │ │ ├── queries.py │ │ └── models.py │ └── pyproject.toml ├── image_processing │ ├── src │ │ ├── __init__.py │ │ ├── feature_sets.py │ │ └── resolvers.py │ ├── requirements.txt │ ├── chalk.yaml │ ├── pyproject.toml │ └── README.md ├── fraud_transactions_with_llm │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_denylisted.py │ ├── src │ │ ├── emailage │ │ │ ├── __init__.py │ │ │ └── client.py │ │ ├── datasources.py │ │ ├── users.chalk.sql │ │ ├── groq.py │ │ ├── __init__.py │ │ ├── transactions.chalk.sql │ │ ├── transactions_offline.chalk.sql │ │ ├── denylist.py │ │ ├── streaming.py │ │ └── experian │ │ │ └── __init__.py │ ├── requirements.txt │ ├── chalk.yaml │ ├── .gitignore │ ├── .chalkignore │ └── README.md ├── dynamic_pricing │ ├── requirements.txt │ ├── chalk.yaml │ ├── src │ │ ├── sql │ │ │ └── hotels.chalk.sql │ │ └── datasources.py │ └── README.md └── sagemaker │ ├── requirements.txt │ ├── src │ ├── datasources.py │ ├── resolvers │ │ ├── customers.chalk.sql │ │ └── transactions.chalk.sql │ └── models.py │ ├── steps │ ├── evaluate.py │ ├── training.py │ └── dataset.py │ ├── chalk_sagemaker_pipeline.py │ └── README.md ├── mypy.ini ├── 10_migrations └── README.md ├── requirements.txt ├── unstructured_data ├── requirements.txt ├── src │ ├── datasources.py │ ├── users.chalk.sql │ ├── __init__.py │ ├── transactions.chalk.sql │ ├── denylist.py │ ├── models.py │ └── resolvers.py ├── chalk.yaml ├── .gitignore └── .chalkignore ├── 11_sql ├── user_views.sql ├── 2_dataframes.py ├── README.md └── 1_scalars.py ├── 13_airflow ├── airflow.jpg ├── get_users.chalk.sql ├── features.py ├── shared_environment.py ├── isolated_environment.py ├── polling.py └── chalk_airflow.py ├── 12_model └── churn_model.skops ├── marketplace ├── item.chalk.sql ├── seller.chalk.sql ├── user.chalk.sql ├── interaction │ └── interaction_type.py ├── item_price.chalk.sql ├── interaction.chalk.sql ├── review.chalk.sql ├── __init__.py ├── resolvers.py ├── item_category │ └── item_category_value_enum.py ├── tests.py ├── lancedb.py └── named_queries.py ├── marketing ├── event_type.chalk.sql ├── product_area.chalk.sql ├── 
session.chalk.sql ├── user.chalk.sql ├── event.chalk.sql ├── requirements.txt ├── customer_interaction.chalk.sql └── __init__.py ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── .gitignore ├── 08_testing ├── 2_integration_tests.py ├── README.md └── 1_unit_tests.py ├── 06_dataframe ├── 3_projections.py ├── 5_aggregations.py ├── 4_filters_and_projections.py ├── 1_creating_dataframes.py ├── 2_filters.py ├── 6_self_joins.py └── README.md ├── SECURITY.md ├── 01_features ├── 6_has_one_has_many.py ├── 4_has_one.py ├── 3_primary_keys.py ├── 5_has_many.py ├── 1_feature_types.py ├── 7_feature_time.py ├── 8_constructing_features.py └── 2_custom_feature_types.py ├── 09_github_actions ├── 1_install_chalk_cli.yaml ├── 2_deploy_with_chalk.yaml ├── 3_deploy_preview.yaml └── README.md ├── 07_streaming ├── 1_mapping_stream.py ├── 4_continuous_aggregation.py ├── 3_window_sql.py ├── 2_window_dataframe.py └── README.md ├── 05_feature_discovery ├── 1_descriptions.py ├── 4_unified.py ├── 3_tags.py ├── 2_owners.py └── README.md ├── predictive_maintenance ├── 2_time_query.py ├── 3_keep_data_fresh.py ├── 4_customer_sensors.py ├── 1_device_data.py └── README.md ├── 03_caching ├── 7_prefetching.py ├── 2_lastest_value.py ├── 6_cache_busting.py ├── 4_override_max_staleness.py ├── 5_override_cache_values.py ├── 3_intermediates.py ├── 1_basic_caching.py └── README.md ├── credit ├── 4_aggregate_tradelines.py ├── 3_bureau_api.py ├── README.md └── 2_accounts.py ├── 04_scheduling ├── 3_sample_arguments.py ├── 1_cron.py ├── 2_filtered_cron.py └── README.md ├── fraud ├── 2_patterns.py ├── 3_identity.py ├── 5_account_takeover.py ├── 1_return.py └── README.md ├── 02_resolvers ├── 2_multiple_features_resolver.py ├── 3_downstream_scalars.py ├── 4_downstream_dataframes.py ├── 1_scalar_resolver.py ├── 5_tagged_resolvers.py └── 6_sharing_resolvers.py ├── ecommerce ├── 1_users_sellers.py ├── 2_interactions.py ├── README.md └── 3_streams.py └── mocks └── __init__.py /github/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /14_codegen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /call_recordings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/fraud/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/groq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/cerebras/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /github/features/github/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /call_recordings/features/fathom/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /full_examples/batch_ml/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /full_examples/image_processing/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = chalk.mypy_plugin 3 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /10_migrations/README.md: -------------------------------------------------------------------------------- 1 | # Migrations 2 | 3 | Examples to come! 4 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/emailage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy>=1.11.8 2 | pydantic 3 | cattrs 4 | requests -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[runtime,postgresql] 2 | -------------------------------------------------------------------------------- /full_examples/sagemaker/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy 2 | sagemaker 3 | scikit-learn 4 | -------------------------------------------------------------------------------- /unstructured_data/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[runtime,postgresql] 2 | google-generativeai 3 | -------------------------------------------------------------------------------- /11_sql/user_views.sql: -------------------------------------------------------------------------------- 1 | select sum(mins) as viewed_minutes from view_counts where uid = :uid 2 | -------------------------------------------------------------------------------- /13_airflow/airflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalk-ai/examples/HEAD/13_airflow/airflow.jpg -------------------------------------------------------------------------------- /call_recordings/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .fathom_feature_set import FathomCall, FathomMessage 2 | -------------------------------------------------------------------------------- /12_model/churn_model.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalk-ai/examples/HEAD/12_model/churn_model.skops -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/requirements.txt: 
-------------------------------------------------------------------------------- 1 | chalkpy[runtime,postgresql] 2 | google-generativeai 3 | -------------------------------------------------------------------------------- /unstructured_data/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | 3 | 4 | PostgreSQLSource(name="postgres") -------------------------------------------------------------------------------- /full_examples/image_processing/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[runtime] 2 | beautifulsoup4 3 | requests 4 | pillow 5 | CairoSVG==2.5.2 6 | -------------------------------------------------------------------------------- /full_examples/sagemaker/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | 3 | 4 | PostgreSQLSource(name="postgres") 5 | -------------------------------------------------------------------------------- /github/features/search/__init__.py: -------------------------------------------------------------------------------- 1 | from .github_search import GithubSearch 2 | 3 | __all__ = [ 4 | "GithubSearch", 5 | ] 6 | -------------------------------------------------------------------------------- /full_examples/batch_ml/models/fraud_model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chalk-ai/examples/HEAD/full_examples/batch_ml/models/fraud_model.onnx -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | 3 | 4 | PostgreSQLSource(name="postgres") -------------------------------------------------------------------------------- /full_examples/batch_ml/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Sandbox 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: pyproject.toml 6 | -------------------------------------------------------------------------------- /unstructured_data/src/users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: User 2 | -- source: postgres 3 | select 4 | id, 5 | email, 6 | dob, 7 | name 8 | from usrs -------------------------------------------------------------------------------- /unstructured_data/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo Project 2 | environments: 3 | default: 4 | runtime: python312 5 | requirements: requirements.txt 6 | -------------------------------------------------------------------------------- /marketplace/item.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Item 2 | -- source: postgres 3 | select 4 | hid as id, 5 | title, 6 | description 7 | from marketplace_products 8 | -------------------------------------------------------------------------------- /13_airflow/get_users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- The features given to us by the user. 
2 | -- resolves: user 3 | -- source: postgres 4 | select id, full_name as name, email from users; 5 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: requirements.txt 6 | 7 | -------------------------------------------------------------------------------- /full_examples/image_processing/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: requirements.txt 6 | 7 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/src/sql/hotels.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Hotel 2 | -- source: postgres 3 | select 4 | id, 5 | num_rooms, 6 | location 7 | from 8 | hotels; 9 | -------------------------------------------------------------------------------- /marketing/event_type.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: EventType 2 | -- source: postgres 3 | select 4 | name, 5 | product_area_type, 6 | event_weight 7 | from event_types 8 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource, SnowflakeSource 2 | 3 | pg = PostgreSQLSource(name="pg") 4 | sf = SnowflakeSource(name="sf") 5 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: User 2 | -- source: postgres 3 | select 4 | id, 5 | email, 6 | dob, 7 | name 8 | from usrs -------------------------------------------------------------------------------- /marketing/product_area.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: ProductArea 2 | -- source: postgres 3 | select 4 | name as type, 5 | created_at, 6 | description 7 | from product_areas 8 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/chalk.yaml: -------------------------------------------------------------------------------- 1 | project: Demo Project 2 | environments: 3 | default: 4 | runtime: python311 5 | requirements: requirements.txt 6 | -------------------------------------------------------------------------------- /call_recordings/sql/fathom_call.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: FathomCall 3 | -- source: clickhouse 4 | select 5 | recording_id as id 6 | from "fathom-calls-etl-01" 7 | ; 8 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_users.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get users from postgres 2 | -- source: pg 3 | -- resolves: User 4 | SELECT 5 | id, 6 | name 7 | FROM 8 | users 9 | -------------------------------------------------------------------------------- 
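A note on how these `.chalk.sql` files work: the `-- resolves:` header names the feature class the query populates, `-- source:` names the datasource, and the selected columns map onto the class's fields. The resolver above resolves `User`, whose real definition lives in full_examples/batch_ml/src/models.py (not reproduced at this point in the listing); a minimal sketch, with assumed field types, would look like:

from chalk.features import features


@features
class User:
    # Mirrors the columns selected in get_users.chalk.sql;
    # the types here are assumptions, since models.py is not shown.
    id: int
    name: str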
/full_examples/image_processing/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "image-chalk" 3 | version = "0.1.0" 4 | description = "Image Chalk Demo" 5 | readme = "README.md" 6 | dependencies = [] 7 | -------------------------------------------------------------------------------- /github/features/groq/groq.py: -------------------------------------------------------------------------------- 1 | GROQ_API_KEY: str = "" 2 | GROQ_MODEL_PROVIDER: str = "openai" 3 | GROQ_MODEL: str = "llama3-8b-8192" 4 | GROQ_BASE_URL: str = "https://api.groq.com/openai/v1" 5 | -------------------------------------------------------------------------------- /github/sql/github_archive_stars.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubArchive 3 | -- source: postgres 4 | select id, name as path, url as api_url, stars from github_archive_stars 5 | -------------------------------------------------------------------------------- /marketing/session.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: Session 2 | -- source: postgres 3 | select 4 | id, 5 | created_at, 6 | end_at, 7 | duration, 8 | user_id 9 | from sessions 10 | -------------------------------------------------------------------------------- /unstructured_data/src/__init__.py: -------------------------------------------------------------------------------- 1 | import google.generativeai as genai 2 | 3 | 4 | # @before_all 5 | # def init_model(): 6 | # genai.configure(api_key="AIzaSyCEgFSw5mRj-POYuvhJJKhIfw76NJxaUo0") 7 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_users_offline.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get users from snowflake 2 | -- source: sf 3 | -- resolves: User 4 | SELECT 5 | id, 6 | name 7 | FROM 8 | "ML.USERS" 9 | -------------------------------------------------------------------------------- /github/features/cerebras/cerebras.py: -------------------------------------------------------------------------------- 1 | CEREBRAS_API_KEY: str = "" 2 | CEREBRAS_MODEL_PROVIDER: str = "openai" 3 | CEREBRAS_MODEL: str = "llama3.1-8b" 4 | CEREBRAS_BASE_URL: str = "https://api.cerebras.ai/v1" 5 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/groq.py: -------------------------------------------------------------------------------- 1 | GROQ_API_KEY: str ="" 2 | GROQ_MODEL_PROVIDER: str = "openai" 3 | GROQ_MODEL: str = "llama3-8b-8192" 4 | GROQ_BASE_URL: str = "https://api.groq.com/openai/v1" 5 | -------------------------------------------------------------------------------- /marketing/user.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: User 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | first_name, 7 | last_name, 8 | email, 9 | birthday 10 | from users 11 | -------------------------------------------------------------------------------- /unstructured_data/src/transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: postgres 3 | select 4 | id, 5 | amount, 6 | user_id, 7 | at, 8 | description as memo 9 | from txns 10 | 
-------------------------------------------------------------------------------- /full_examples/sagemaker/src/resolvers/customers.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Customer 2 | -- source: postgres 3 | select 4 | id, 5 | name, 6 | email, 7 | dob, 8 | age, 9 | income 10 | from 11 | users; 12 | -------------------------------------------------------------------------------- /marketing/event.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: Event 2 | -- source: postgres 3 | select 4 | id, 5 | created_at, 6 | name, 7 | product_area_type, 8 | user_id, 9 | session_id 10 | from events 11 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/src/datasources.py: -------------------------------------------------------------------------------- 1 | from chalk.sql import PostgreSQLSource 2 | from chalk.streams import KafkaSource 3 | 4 | postgres = PostgreSQLSource(name="pg") 5 | kafka_stream = KafkaSource(name="stream") 6 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/__init__.py: -------------------------------------------------------------------------------- 1 | import google.generativeai as genai 2 | 3 | 4 | # @before_all 5 | # def init_model(): 6 | # genai.configure(api_key="AIzaSyCEgFSw5mRj-POYuvhJJKhIfw76NJxaUo0") 7 | -------------------------------------------------------------------------------- /marketplace/seller.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Seller 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | name, 7 | zipcode, 8 | email, 9 | phone_number 10 | from marketplace_sellers 11 | -------------------------------------------------------------------------------- /marketplace/user.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: User 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | first_name, 7 | last_name, 8 | email, 9 | birthday 10 | from marketplace_users 11 | -------------------------------------------------------------------------------- /github/sql/github_archive_stars.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubArchive 3 | -- source: postgres 4 | select id, name as path, url as api_url, stars 5 | from github_archive_stars 6 | order by stars desc 7 | limit 100 8 | ; 9 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: postgres 3 | select 4 | id, 5 | amount, 6 | user_id, 7 | at, 8 | description as memo 9 | from txns 10 | -------------------------------------------------------------------------------- /full_examples/sagemaker/src/resolvers/transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: postgres 3 | select 4 | id, 5 | amt, 6 | customer_id, 7 | confirmed_fraud, 8 | created_at as at 9 | from 10 | transactions; 11 | -------------------------------------------------------------------------------- /marketplace/interaction/interaction_type.py: 
-------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class InteractionType(str, Enum): 5 | PRODUCT_INQUIRY = "productInquiry" 6 | ORDER_PLACEMENT = "orderPlacement" 7 | FEEDBACK_AND_REVIEWS = "feedbackAndReviews" 8 | -------------------------------------------------------------------------------- /marketplace/item_price.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: ItemPrice 2 | -- source: postgres 3 | select 4 | hid as id, 5 | price as value, 6 | created_at, 7 | product_hid as item_id, 8 | seller_hid as seller_id 9 | from marketplace_product_prices 10 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/transactions_offline.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Transaction 2 | -- source: bigquery 3 | -- type: offline 4 | select 5 | id, 6 | amount, 7 | user_id, 8 | updated_at as at, 9 | description as memo 10 | from transactions_log 11 | -------------------------------------------------------------------------------- /unstructured_data/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Python artifacts 3 | venv 4 | *venv* 5 | virtualenv 6 | __pycache__ 7 | *.pyc 8 | *.py~ 9 | .eggs 10 | *.egg-info 11 | dist 12 | 13 | # VSCode 14 | .vscode 15 | 16 | # Intellij 17 | *.iml 18 | idea 19 | 20 | # Git artifacts 21 | .git 22 | .github 23 | -------------------------------------------------------------------------------- /full_examples/image_processing/README.md: -------------------------------------------------------------------------------- 1 | # Image Processing Example 2 | 3 | In this example, we show how to scrape images from websites and process them. 4 | 5 | The images are processed with the Python Pillow library and flagged by an image model 6 | hosted on a SageMaker endpoint.
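The example's scraping and processing resolvers live in src/ and are not reproduced in this excerpt, so here is a minimal sketch of the Pillow step the README describes; the function name, sizes, and JPEG output are illustrative assumptions, not the example's actual code:

from io import BytesIO

import requests
from PIL import Image


def to_model_input(url: str, size: tuple[int, int] = (224, 224)) -> bytes:
    # Download the image, normalize it to RGB, and shrink it to the
    # thumbnail size an image-classification endpoint typically expects.
    img = Image.open(BytesIO(requests.get(url, timeout=10).content)).convert("RGB")
    img.thumbnail(size)
    buf = BytesIO()
    img.save(buf, format="JPEG")
    return buf.getvalue()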
7 | -------------------------------------------------------------------------------- /marketplace/interaction.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Interaction 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | interaction_type, 7 | seller_hid as seller_id, 8 | user_hid as user_id, 9 | product_hid as item_id, 10 | price 11 | from marketplace_interactions 12 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_transactions.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get transactions from postgres 2 | -- source: pg 3 | -- resolves: Transaction 4 | SELECT 5 | transaction_id as id, 6 | user_id, 7 | merchant_id, 8 | amount, 9 | ts, 10 | category 11 | FROM 12 | transactions 13 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Python artifacts 3 | venv 4 | *venv* 5 | virtualenv 6 | __pycache__ 7 | *.pyc 8 | *.py~ 9 | .eggs 10 | *.egg-info 11 | dist 12 | 13 | # VSCode 14 | .vscode 15 | 16 | # Intellij 17 | *.iml 18 | idea 19 | 20 | # Git artifacts 21 | .git 22 | .github 23 | -------------------------------------------------------------------------------- /marketplace/review.chalk.sql: -------------------------------------------------------------------------------- 1 | -- resolves: Review 2 | -- source: postgres 3 | select 4 | hid as id, 5 | created_at, 6 | star_rating, 7 | review_headline, 8 | review_body, 9 | product_hid as item_id, 10 | user_hid as user_id, 11 | seller_hid as seller_id 12 | from marketplace_reviews 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for a new example 4 | title: "[FEATURE]" 5 | labels: enhancement 6 | --- 7 | 8 | **Desired Example** 9 | A description of the example you'd like to see! 
10 | 11 | **Is this request related to a problem?** 12 | 13 | **Additional Context** 14 | -------------------------------------------------------------------------------- /13_airflow/features.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import features 3 | 4 | 5 | @features 6 | class User: 7 | id: int 8 | name: str 9 | email: str 10 | email_domain: str 11 | 12 | 13 | @online 14 | def get_email_domain(email: User.email) -> User.email_domain: 15 | return email.split("@")[1].lower() 16 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from chalk.client import ChalkClient 3 | 4 | 5 | @pytest.fixture(scope="session") 6 | def client(): 7 | # OPTION 2 8 | # chalk apply --branch 9 | # CHALK_CLIENT_ID 10 | # CHALK_CLIENT_SECRET 11 | return ChalkClient(branch=True) 12 | -------------------------------------------------------------------------------- /github/features/fraud/github_event.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubEvent 3 | -- source: postgres 4 | select 5 | event_id as id, 6 | event_type as type, 7 | created_at, 8 | public, 9 | payload_action, 10 | repo_id, 11 | repo_name, 12 | -- actor_id as user_id, 13 | actor_login as username 14 | from github_events 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dependencies 2 | /node_modules 3 | node_modules 4 | /.pnp 5 | .pnp.js 6 | 7 | # testing 8 | /coverage 9 | 10 | # misc 11 | .DS_Store 12 | *.pem 13 | 14 | # python 15 | __pycache__ 16 | *.pyc 17 | .eggs 18 | *.egg-info 19 | venv/ 20 | dist 21 | build 22 | engine.iml 23 | 24 | # VSCode 25 | .vscode/ 26 | 27 | # intellij 28 | *.iml 29 | .idea 30 | 31 | -------------------------------------------------------------------------------- /github/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .github.github_archive import GithubArchive 2 | from .github.github_repo import GithubRepo 3 | from .github.github_repo_document_vector_database import GithubRepoDocVDB 4 | from .github.github_user import GithubUser 5 | 6 | __all__ = [ 7 | "GithubArchive", 8 | "GithubRepo", 9 | "GithubRepoDocVDB", 10 | "GithubUser", 11 | ] 12 | -------------------------------------------------------------------------------- /marketing/requirements.txt: -------------------------------------------------------------------------------- 1 | chalkpy[bigquery,openai,postgresql,runtime,vertexai,clickhouse] 2 | google-generativeai 3 | httpx~=0.27.2 4 | lancedb~=0.24.1 5 | openai>=1.52.2 6 | orjson~=3.11.0 7 | pydantic~=1.10.22 8 | pygithub~=2.6.1 9 | requests~=2.32.3 10 | numpy~=1.26.4 11 | protobuf~=5.29.5 12 | pyarrow~=18.1.0 13 | marimo~=0.14.13 14 | pandas~=2.2.3 15 | pytest~=8.4.2 16 | -------------------------------------------------------------------------------- /marketing/customer_interaction.chalk.sql: -------------------------------------------------------------------------------- 1 | -- Resolves: CustomerInteraction 2 | -- source: postgres 3 | select 4 | id, 5 | created_at, 6 | sentiment_rating, 7 | correspondence_subject, 8 | correspondence_body, 9 
| communication_channel, 10 | communication_direction, 11 | user_event_id, 12 | user_id, 13 | product_area_id 14 | from correspondences 15 | -------------------------------------------------------------------------------- /call_recordings/sql/fathom_message.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: FathomMessage 3 | -- source: clickhouse 4 | select 5 | id, 6 | recording_id, 7 | message_id, 8 | url, 9 | title, 10 | date, 11 | timestamp, 12 | speaker, 13 | organization as organization_raw, 14 | message, 15 | action_item, 16 | watch_link 17 | from "fathom-messages-etl-01" 18 | ; 19 | -------------------------------------------------------------------------------- /08_testing/2_integration_tests.py: -------------------------------------------------------------------------------- 1 | # You can apply changes with the `--no-promote` 2 | # flag to create a preview environment: 3 | # 4 | # > chalk apply --no-promote 5 | 6 | # Once your code has been deployed, you can query 7 | # against the resulting deployment id: 8 | # 9 | # > chalk query --deployment $DEPLOYMENT_ID \ 10 | # --in user.id=1 \ 11 | # --out user.id \ 12 | # --out user.email 13 | -------------------------------------------------------------------------------- /github/sql/github_owner.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubOwner 3 | -- source: postgres 4 | -- ELVIS: TODO: DISABLED USING THE ACTUAL PYTHON 5 | select 6 | id, 7 | hid, 8 | login, 9 | node_id, 10 | avatar_url, 11 | url, 12 | html_url, 13 | followers_url, 14 | following_url, 15 | starred_url, 16 | organizations_url, 17 | repos_url, 18 | type 19 | from github_owner 20 | -------------------------------------------------------------------------------- /unstructured_data/.chalkignore: -------------------------------------------------------------------------------- 1 | # .gitignore compatible file for ignoring files with chalk apply 2 | # Chalk also respects .gitignore 3 | 4 | # Ignore test files 5 | tests 6 | 7 | # Python artifacts 8 | venv 9 | *venv* 10 | virtualenv 11 | __pycache__ 12 | *.pyc 13 | *.py~ 14 | .eggs 15 | *.egg-info 16 | dist 17 | 18 | # VSCode 19 | .vscode 20 | 21 | # Intellij 22 | *.iml 23 | idea 24 | 25 | # Git artifacts 26 | .git 27 | .github 28 | -------------------------------------------------------------------------------- /06_dataframe/3_projections.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, features 2 | 3 | 4 | @features 5 | class Transaction: 6 | id: int 7 | user_id: "User.id" 8 | memo: str 9 | merchant: str 10 | amount: float 11 | 12 | 13 | @features 14 | class User: 15 | id: int 16 | txns: DataFrame[Transaction] 17 | 18 | 19 | # You can project the transactions down to any of the 20 | # columns on the transaction 21 | amounts = User.txns[Transaction.amount] -------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/sql/get_transactions_offline.chalk.sql: -------------------------------------------------------------------------------- 1 | -- get transactions from snowflake 2 | -- source: sf 3 | -- resolves: Transaction 4 | -- tag: ['model_sample'] 5 | -- incremental: 6 | -- mode: row 7 | -- lookback_period: 60m 8 | -- incremental_column: ts 9 | SELECT 10 | transaction_id as id, 11 | user_id, 12 | merchant_id, 13 | amount, 14 | ts, 15 | category 16 | FROM 17 | "ML.TRANSACTIONS" 18 | WHERE category <> 'pending' 19 | -------------------------------------------------------------------------------- /full_examples/dynamic_pricing/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Price Prediction with Chalk 2 | 3 | In this example, we show how to write dynamic pricing features in Chalk. The goal 4 | is to show how a company that dynamically prices hotels might define its features. This example assumes that data is defined in two places: 5 | - A Postgres database with a `hotel` table which contains basic features like `num_rooms` and `location`, 6 | - A Kafka stream which updates in real time with customer-hotel interaction information (a hedged sketch of this half follows below). 7 |
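The dynamic_pricing example ships only the Postgres half (hotels.chalk.sql and datasources.py); the Kafka half described in the README above is not included. A hedged sketch of what that streaming resolver could look like, modeled on 07_streaming/1_mapping_stream.py; the message schema and feature names are illustrative assumptions:

from datetime import datetime

from pydantic import BaseModel

from chalk import stream
from chalk.features import Features, features
from chalk.streams import KafkaSource


@features
class HotelInteraction:
    id: str
    hotel_id: str
    kind: str
    at: datetime


class InteractionMessage(BaseModel):
    id: str
    hotel_id: str
    kind: str
    at: datetime


# Matches the `KafkaSource(name="stream")` defined in src/datasources.py.
kafka_stream = KafkaSource(name="stream")


@stream(source=kafka_stream)
def map_interaction(
    msg: InteractionMessage,
) -> Features[
    HotelInteraction.id,
    HotelInteraction.hotel_id,
    HotelInteraction.kind,
    HotelInteraction.at,
]:
    # Map each Kafka message onto the HotelInteraction feature set.
    return HotelInteraction(id=msg.id, hotel_id=msg.hotel_id, kind=msg.kind, at=msg.at)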
-------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/.chalkignore: -------------------------------------------------------------------------------- 1 | # .gitignore compatible file for ignoring files with chalk apply 2 | # Chalk also respects .gitignore 3 | 4 | # Ignore test files 5 | tests 6 | 7 | # Python artifacts 8 | venv 9 | *venv* 10 | virtualenv 11 | __pycache__ 12 | *.pyc 13 | *.py~ 14 | .eggs 15 | *.egg-info 16 | dist 17 | 18 | # VSCode 19 | .vscode 20 | 21 | # Intellij 22 | *.iml 23 | idea 24 | 25 | # Git artifacts 26 | .git 27 | .github 28 | 29 | 30 | streaming.py 31 | transactions_offline.chalk.sql 32 | -------------------------------------------------------------------------------- /marketplace/__init__.py: -------------------------------------------------------------------------------- 1 | # Import all feature classes from models.py to make them available at package level 2 | from .models import ( 3 | Interaction, 4 | Item, 5 | ItemPrice, 6 | ItemSearch, 7 | Review, 8 | Seller, 9 | StructuredOutput, 10 | User, 11 | UserItem, 12 | ) 13 | 14 | __all__ = [ 15 | "Interaction", 16 | "Item", 17 | "ItemPrice", 18 | "ItemSearch", 19 | "Review", 20 | "Seller", 21 | "StructuredOutput", 22 | "User", 23 | "UserItem", 24 | ] 25 | -------------------------------------------------------------------------------- /github/features/github/github_repo_document_vector_database.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from chalk.features import ( 4 | Primary, 5 | features, 6 | ) 7 | 8 | if TYPE_CHECKING: 9 | from src.github.features.search import GithubSearch 10 | 11 | 12 | @features 13 | class GithubRepoDocVDB: 14 | # from vector database 15 | path: Primary[str] 16 | query: "GithubSearch.query" = "" 17 | url: str 18 | distance: float | None 19 | ai_summary: str 20 | query_type: str = "VECTOR" 21 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | If you believe you have found a security vulnerability in Chalk, please report it to us! 4 | 5 | ### Reporting Security Issues 6 | 7 | **Please do not report security vulnerabilities through public GitHub issues.** 8 | 9 | Please email security concerns to [security@chalk.ai](mailto:security@chalk.ai). 10 | 11 | ### Security Overview 12 | 13 | https://docs.chalk.ai/docs/security 14 | 15 | ### SOC-2 Report 16 | 17 | To request access to Chalk's SOC-2 report, please email [security@chalk.ai](mailto:security@chalk.ai).
18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report an issue with an example 4 | title: "[BUG]" 5 | labels: bug 6 | --- 7 | 8 | **Bug Description** 9 | 10 | **Reproduction** 11 | Steps to reproduce the behavior. 12 | 13 | **Expected Behavior** 14 | A description of what you expected to happen. 15 | 16 | **Screenshots** 17 | If applicable, add screenshots to help explain your problem. 18 | 19 | **Version** 20 | Version of the Chalk Python package. 21 | 22 | **Additional Context** 23 | Add any other context about the problem here. 24 | -------------------------------------------------------------------------------- /01_features/6_has_one_has_many.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, features, has_many 2 | 3 | 4 | @features 5 | class Book: 6 | id: str 7 | name: str 8 | page_count: int 9 | author_id: str 10 | # Here, we do not define the has_one relationship. 11 | # The relationship is assumed to be symmetric, and the join 12 | # condition is taken from the `has_many(...)` defined on `Author`. 13 | author: "Author" 14 | 15 | 16 | @features 17 | class Author: 18 | id: str 19 | books: DataFrame[Book] = has_many(lambda: Book.author_id == Author.id) -------------------------------------------------------------------------------- /github/features/github/github_archive.py: -------------------------------------------------------------------------------- 1 | import chalk.functions as F 2 | from chalk.features import Primary, _, features 3 | 4 | 5 | @features 6 | class GithubArchive: 7 | id: int 8 | path: Primary[str] 9 | api_url: str 10 | stars: int = -1 11 | is_valid_repo_path: bool = F.regexp_like( 12 | expr=_.path, 13 | pattern=r"^[a-zA-Z0-9_-]+\/[a-zA-Z0-9._-]+$", 14 | ) 15 | url: str | None = F.if_then_else( 16 | condition=_.is_valid_repo_path, 17 | if_true="https://github.com/" + _.path, 18 | if_false=None, 19 | ) 20 | -------------------------------------------------------------------------------- /06_dataframe/5_aggregations.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import DataFrame, features 3 | 4 | 5 | @features 6 | class Transaction: 7 | id: int 8 | user_id: "User.id" 9 | memo: str 10 | merchant: str 11 | amount: float 12 | 13 | 14 | @features 15 | class User: 16 | id: int 17 | txns: DataFrame[Transaction] 18 | num_credits: int 19 | 20 | 21 | # You can filter the transactions by any of the properties 22 | # on the transaction, then aggregate the filtered rows 23 | @online 24 | def get_num_credits(credits: User.txns[Transaction.amount < 0]) -> User.num_credits: 25 | return len(credits) 26 | -------------------------------------------------------------------------------- /unstructured_data/src/denylist.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from chalk import chalk_logger 3 | 4 | 5 | class Denylist: 6 | def __init__( 7 | self, 8 | source: str, 9 | ): 10 | self.source = source 11 | self.s = set() 12 | 13 | def load(self): 14 | try: 15 | self.s = set(pl.read_csv(self.source).to_series().to_list()) 16 | except Exception as e: 17 | chalk_logger.warning(f"Failed to load denylist: {e}", exc_info=True) 18 | 19 | def __contains__(self, email: str) -> bool: 20 | return email in self.s 21 |
-------------------------------------------------------------------------------- /06_dataframe/4_filters_and_projections.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, _, features 2 | 3 | 4 | @features 5 | class Transaction: 6 | id: int 7 | user_id: "User.id" 8 | memo: str 9 | merchant: str 10 | amount: float 11 | 12 | 13 | @features 14 | class User: 15 | id: int 16 | txns: DataFrame[Transaction] 17 | 18 | 19 | # You can filter down the transactions by any of the 20 | # properties on the transaction 21 | credits = User.txns[Transaction.amount < 0] 22 | 23 | # You can also use the '_' as an alias for the current namespace 24 | credits = User.txns[_.amount < 0] 25 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/denylist.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from chalk import chalk_logger 3 | 4 | 5 | class Denylist: 6 | def __init__( 7 | self, 8 | source: str, 9 | ): 10 | self.source = source 11 | self.s = set() 12 | 13 | def load(self): 14 | try: 15 | self.s = set(pl.read_csv(self.source).to_series().to_list()) 16 | except Exception as e: 17 | chalk_logger.warning(f"Failed to load denylist: {e}", exc_info=True) 18 | 19 | def __contains__(self, email: str) -> bool: 20 | return email in self.s 21 | -------------------------------------------------------------------------------- /09_github_actions/1_install_chalk_cli.yaml: -------------------------------------------------------------------------------- 1 | name: Install the Chalk CLI 2 | on: push 3 | 4 | jobs: 5 | test-with-chalk: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | 10 | - uses: chalk-ai/cli-action@v2 11 | with: 12 | client-id: ${{secrets.CHALK_CLIENT_ID}} 13 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 14 | 15 | - name: Use the Chalk CLI 16 | run: | 17 | # Print out the version 18 | chalk version 19 | # All commands are now authenticated with your client-id and client-secret 20 | chalk whoami 21 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/queries.py: -------------------------------------------------------------------------------- 1 | from chalk import ScheduledQuery 2 | from src.models import Transaction 3 | 4 | 5 | # Scheduled Queries allow you to compute a specified 6 | # set of features on a schedule, useful for persisting 7 | # values to the online and offline stores.
8 | # https://docs.chalk.ai/docs/scheduled-query 9 | 10 | sq = ScheduledQuery( 11 | name="run_fraud_model", 12 | schedule="0 0 * * *", # Every day at midnight 13 | output=[ 14 | Transaction.is_fraud, 15 | ], 16 | store_online=True, 17 | store_offline=True, 18 | tags=["model_sample"], 19 | incremental_resolvers=["get_transactions_offline"], 20 | ) 21 | -------------------------------------------------------------------------------- /marketing/__init__.py: -------------------------------------------------------------------------------- 1 | # Import all feature classes from models.py to make them available at package level 2 | from .models import ( 3 | CustomerInteraction, 4 | CustomerInteractionDocument, 5 | CustomerInteractionSearch, 6 | ProductArea, 7 | UserEventType, 8 | StructuredOutput, 9 | User, 10 | Event, 11 | UserEventAnalysis, 12 | EventType, 13 | ) 14 | 15 | __all__ = [ 16 | "CustomerInteraction", 17 | "CustomerInteractionDocument", 18 | "CustomerInteractionSearch", 19 | "ProductArea", 20 | "UserEventType", 21 | "StructuredOutput", 22 | "User", 23 | "Event", 24 | "UserEventAnalysis", 25 | "EventType", 26 | ] 27 | -------------------------------------------------------------------------------- /13_airflow/shared_environment.py: -------------------------------------------------------------------------------- 1 | from airflow.decorators import task 2 | from chalk.client import ChalkClient 3 | from airflow.exceptions import AirflowFailException 4 | 5 | 6 | @task 7 | def run_chalk_resolver() -> str: 8 | """ 9 | Trigger the get_users resolver 10 | """ 11 | 12 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables 13 | # are passed to airflow. 14 | client = ChalkClient() 15 | 16 | result = client.trigger_resolver_run( 17 | "get_users" 18 | ) 19 | if result.status == "failed": 20 | raise AirflowFailException(f"Resolver run failed: {result}") 21 | return result.id -------------------------------------------------------------------------------- /01_features/4_has_one.py: -------------------------------------------------------------------------------- 1 | from chalk.features import features, has_one 2 | 3 | 4 | @features 5 | class Author: 6 | id: str 7 | author_name: str 8 | 9 | 10 | @features 11 | class Book: 12 | id: str 13 | name: str 14 | author_id: str 15 | # The `has_one(...)` function takes a lambda function 16 | # that specifies the join condition between the classes. 17 | # We need to use a lambda function, not simply the join condition, 18 | # to allow for forward references to the `Author` class.
19 | author: Author = has_one(lambda: Book.author_id == Author.id) 20 | 21 | 22 | # You can reference features through this has-one relationship 23 | author_name_type = Book.author.author_name 24 | -------------------------------------------------------------------------------- /github/sql/github_repo.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: GithubRepo 3 | -- source: postgres 4 | -- ELVIS: TODO: DISABLED USING THE ACTUAL PYTHON 5 | select 6 | id, 7 | hid, 8 | node_id, 9 | name, 10 | full_name, 11 | html_url, 12 | description, 13 | url, 14 | created_at, 15 | updated_at, 16 | pushed_at, 17 | homepage, 18 | size, 19 | stargazers_count, 20 | watchers_count, 21 | language, 22 | has_issues, 23 | forks_count, 24 | archived, 25 | open_issues_count, 26 | license, 27 | visibility, 28 | forks, 29 | open_issues, 30 | watchers, 31 | default_branch, 32 | owner as owner_id 33 | from github_repos_elvis 34 | ; 35 | -------------------------------------------------------------------------------- /07_streaming/1_mapping_stream.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from chalk import stream 4 | from chalk.features import Features, features 5 | from chalk.streams import KafkaSource 6 | 7 | 8 | @features 9 | class User: 10 | id: str 11 | favorite_color: str 12 | 13 | 14 | class UserUpdateBody(BaseModel): 15 | user_id: str 16 | favorite_color: str 17 | 18 | 19 | src = KafkaSource( 20 | bootstrap_server="kafka.website.com:9092", topic="user_favorite_color_updates" 21 | ) 22 | 23 | 24 | @stream(source=src) 25 | def fn(message: UserUpdateBody) -> Features[User.id, User.favorite_color]: 26 | return User( 27 | id=message.user_id, 28 | favorite_color=message.favorite_color, 29 | ) 30 | -------------------------------------------------------------------------------- /01_features/3_primary_keys.py: -------------------------------------------------------------------------------- 1 | from chalk import is_primary 2 | from chalk.features import Primary, feature, features 3 | 4 | 5 | # Feature classes have exactly one primary key, 6 | # which, by default, is taken to be the field with 7 | # the name `id`. 8 | @features 9 | class Book1: 10 | id: str 11 | 12 | 13 | # If you want to name your primary key something other than `id`, 14 | # you can explicitly assign it a primary key 15 | @features 16 | class Book2: 17 | book_id: Primary[str] 18 | 19 | 20 | # Alternatively, you can use the `feature(...)` function 21 | # to set a feature to primary 22 | @features 23 | class Book3: 24 | book_id: str = feature(primary=True) 25 | 26 | 27 | assert is_primary(Book2.book_id) 28 | assert is_primary(Book3.book_id) 29 | assert is_primary(Book1.id) 30 | -------------------------------------------------------------------------------- /05_feature_discovery/1_descriptions.py: -------------------------------------------------------------------------------- 1 | from chalk import description 2 | from chalk.features import feature, features 3 | 4 | 5 | @features 6 | class RocketShip1: 7 | id: int 8 | # Comments above a feature are assigned 9 | # to the feature they sit above. 10 | software_version: str 11 | 12 | 13 | @features 14 | class RocketShip2: 15 | software_version: str = feature( 16 | description=""" 17 | You can use explicit descriptions too!
Explicit descriptions 18 | take precedence over descriptions parsed from comments in 19 | the code (as above) 20 | """ 21 | ) 22 | 23 | 24 | # The function `chalk.features.description(...)` returns the description text 25 | print(description(RocketShip1.software_version)) 26 | -------------------------------------------------------------------------------- /11_sql/2_dataframes.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import DataFrame, features 3 | from chalk.sql import SQLiteInMemorySource 4 | 5 | 6 | @features 7 | class User: 8 | id: str 9 | viewed_minutes: float 10 | 11 | 12 | db = SQLiteInMemorySource() 13 | 14 | 15 | @online 16 | def get_views() -> DataFrame[User]: 17 | """ 18 | Chalk is able to perform push down filters on the returned type here, 19 | so even though we're returning the viewed minutes for every user, 20 | Chalk will only read the rows that it needs to serve queries. 21 | """ 22 | return db.query_string( 23 | """ 24 | select id, sum(mins) as viewed_minutes 25 | from view_counts 26 | group by id 27 | """, 28 | ).all() 29 | -------------------------------------------------------------------------------- /06_dataframe/1_creating_dataframes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from chalk.features import DataFrame, features 4 | 5 | 6 | @features 7 | class User: 8 | id: int 9 | email: str 10 | 11 | 12 | # Constructing an empty DataFrame 13 | df = DataFrame() 14 | 15 | # Constructing from a Python dictionary 16 | DataFrame.from_dict( 17 | { 18 | User.id: [1, 2], 19 | User.email: ["elliot@chalk.ai", "samantha@chalk.ai"], 20 | } 21 | ) 22 | 23 | # Constructing from a Pandas DataFrame 24 | pandas_df = pd.DataFrame( 25 | { 26 | User.id: [1, 2], 27 | User.email: ["elliot@chalk.ai", "samantha@chalk.ai"], 28 | } 29 | ) 30 | DataFrame(pandas_df) 31 | 32 | # Loading from a file (CSV or Parquet) 33 | DataFrame.read_csv("s3://...") 34 | DataFrame.read_parquet("s3://...") 35 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/emailage/client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | 5 | class EmailAgeClient: 6 | def get_email_score(self, email: str) -> str: 7 | domainname = email.split("@")[1] 8 | if "fraud" in email: 9 | return json.dumps( 10 | { 11 | "domainAge": 120, 12 | "domainname": domainname, 13 | "emailAge": random.randint(0, 30), 14 | } 15 | ) 16 | return json.dumps( 17 | { 18 | "domainAge": 10200, 19 | "domainname": domainname, 20 | "emailAge": random.randint(365, 5_000), 21 | } 22 | ) 23 | 24 | 25 | emailage_client = EmailAgeClient() 26 | -------------------------------------------------------------------------------- /01_features/5_has_many.py: -------------------------------------------------------------------------------- 1 | from chalk.features import DataFrame, features, has_many 2 | 3 | 4 | @features 5 | class Book: 6 | id: str 7 | name: str 8 | page_count: int 9 | author_id: str 10 | 11 | 12 | @features 13 | class Author: 14 | id: str 15 | # The `has_many(...)` function takes a lambda function 16 | # that specifies the join condition between the classes. 17 | # We need to use a lambda function, not simply the join condition, 18 | # to allow for forward references to the `Author` class.
19 | books: DataFrame[Book] = has_many(lambda: Book.author_id == Author.id) 20 | 21 | 22 | # You can reference the has-many relationship, and interact with the 23 | # dataframe type 24 | book_pages_df: DataFrame[Book.page_count] = Author.books[Book.page_count] 25 | -------------------------------------------------------------------------------- /predictive_maintenance/2_time_query.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from chalk.client import ChalkClient 4 | from chalk.features import DataFrame, has_many, features, FeatureTime 5 | 6 | 7 | @features 8 | class Measurement: 9 | device_id: str 10 | lat: float 11 | long: float 12 | voltage: float 13 | temp: float 14 | timestamp: FeatureTime 15 | 16 | 17 | @features 18 | class Sensor: 19 | id: str 20 | measurements: DataFrame[Measurement] = has_many(lambda: Measurement.device_id == Sensor.id) 21 | 22 | 23 | # `labels` is assumed to be a DataFrame of labeled device ids, defined elsewhere. 24 | ChalkClient().offline_query( 25 | input=labels[[Measurement.device_id]], 26 | input_times=[(datetime.now() - timedelta(days=30)).isoformat()], 27 | output=[Measurement.lat, Measurement.long, Measurement.temp], 28 | ) 29 | -------------------------------------------------------------------------------- /03_caching/7_prefetching.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import requests 4 | 5 | from chalk import online 6 | from chalk.features import feature, features 7 | 8 | 9 | @features 10 | class User: 11 | id: int 12 | name: str 13 | email: str 14 | last_login: datetime 15 | fico_score: int = feature(max_staleness="30d") 16 | 17 | 18 | # You can warm the cache by scheduling a resolver to run 19 | # more frequently than the max-staleness. 20 | # Here, the maximum-staleness for the FICO score is 30 days, 21 | # and the cron schedule means that this function will run 22 | # every 29 days and 11 hours. So, the cache will always be warm.
23 | @online(cron="29d 11h")
24 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score:
25 |     return requests.get("https://experian.com").json()["score"]
26 | 
--------------------------------------------------------------------------------
/13_airflow/isolated_environment.py:
--------------------------------------------------------------------------------
 1 | from airflow.decorators import task
 2 | from airflow.exceptions import AirflowFailException
 3 | 
 4 | 
 5 | @task.virtualenv(
 6 |     task_id="virtualenv_python", requirements=["chalkpy"], system_site_packages=False
 7 | )
 8 | def run_chalk_resolver() -> str:
 9 |     """
10 |     Trigger the `get_users` resolver in a virtual environment
11 |     """
12 |     from chalk.client import ChalkClient
13 | 
14 |     # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
15 |     # are passed to airflow.
16 |     client = ChalkClient()
17 | 
18 |     result = client.trigger_resolver_run(
19 |         "get_users"
20 |     )
21 |     if result.status == "failed":
22 |         raise AirflowFailException(f"Resolver run failed: {result}")
23 |     return result.id
--------------------------------------------------------------------------------
/full_examples/fraud_transactions_with_llm/src/streaming.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk import Features, stream
 4 | from chalk.streams import KafkaSource
 5 | from pydantic import BaseModel
 6 | 
 7 | from src.models import Transaction
 8 | 
 9 | transactions_topic = KafkaSource(name="transactions")
10 | 
11 | 
12 | class TransactionMessage(BaseModel):
13 |     id: str
14 |     memo: str
15 |     amount: float
16 |     at: datetime
17 | 
18 | 
19 | @stream(source=transactions_topic)
20 | def process_stream_message(
21 |     msg: TransactionMessage,
22 | ) -> Features[
23 |     Transaction.id,
24 |     Transaction.amount,
25 |     Transaction.at,
26 |     Transaction.memo,
27 | ]:
28 |     return Transaction(
29 |         id=msg.id,
30 |         amount=msg.amount,
31 |         at=msg.at,
32 |         memo=msg.memo,
33 |     )
34 | 
--------------------------------------------------------------------------------
/13_airflow/polling.py:
--------------------------------------------------------------------------------
 1 | from airflow.decorators import task
 2 | from airflow.sensors.base import PokeReturnValue
 3 | from chalk.client import ChalkClient
 4 | from airflow.exceptions import AirflowFailException
 5 | 
 6 | 
 7 | @task.sensor(poke_interval=30, timeout=60 * 5)
 8 | def poll_resolver_run(run_id: str) -> PokeReturnValue:
 9 |     """
10 |     Poll a running Chalk resolver run
11 |     """
12 |     # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
13 |     # are passed to airflow.
14 |     client = ChalkClient()
15 |     status = client.get_run_status(run_id).status
16 | 
17 |     if status == "succeeded":
18 |         return PokeReturnValue(True, run_id)
19 |     elif status == "failed":
20 |         raise AirflowFailException(f"Chalk resolver run failed: {run_id}")
21 |     return PokeReturnValue(False)
--------------------------------------------------------------------------------
/full_examples/batch_ml/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "batch_ml_example"
 3 | version = "1.0.0"
 4 | description = "Batch Machine Learning Example using Chalk"
 5 | readme = "README.md"
 6 | requires-python = ">=3.10"
 7 | dependencies = [
 8 |     "chalkpy[runtime,postgresql,snowflake]",
 9 |     "onnxruntime>=1.22.1",
10 | ]
11 | 
12 | [tool.uv]
13 | dev-dependencies = [
14 |     "pytest>=7.0",
15 |     "pytest-cov>=4.0",
16 |     "black>=23.0",
17 |     "isort>=5.0",
18 |     "flake8>=6.0",
19 |     "ipython>=8.37.0",
20 |     "jupyter>=1.1.1",
21 |     "ipykernel>=6.29.5",
22 | ]
23 | 
24 | [tool.pytest.ini_options]
25 | pythonpath = ["."]
26 | 
27 | [tool.ruff.lint.per-file-ignores]
28 | "*.ipynb" = ["F821","F401"]
29 | 
30 | [tool.pyright]
31 | reportUninitializedInstanceVariable = false
32 | reportAssignmentType = false
33 | reportInvalidTypeForm = false
34 | 
--------------------------------------------------------------------------------
/03_caching/2_lastest_value.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | from chalk import online
 4 | from chalk.client import ChalkClient
 5 | from chalk.features import feature, features
 6 | 
 7 | 
 8 | @features
 9 | class User:
10 |     id: int
11 |     name: str
12 | 
13 |     # Setting the maximum staleness to `infinity` means that this
14 |     # value is calculated once and then read from the online store
15 |     # for subsequent requests.
16 |     fico_score: int = feature(max_staleness="infinity")
17 | 
18 | 
19 | # Slow and expensive `User.fico_score` resolver from `1_basic_caching.py`
20 | @online
21 | def get_fico_score(name: User.name) -> User.fico_score:
22 |     return requests.get("...").json()["fico"]
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     ChalkClient().query(
27 |         input={User.name: "Katherine Johnson"},
28 |         output=[User.fico_score],
29 |     )
30 | 
--------------------------------------------------------------------------------
/predictive_maintenance/3_keep_data_fresh.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from pydantic import BaseModel
 3 | 
 4 | from chalk.features import DataFrame, features
 5 | from chalk.streams import stream, KafkaSource, Windowed, windowed
 6 | 
 7 | 
 8 | @features
 9 | class Sensor:
10 |     id: str
11 |     count_failed: Windowed[int] = windowed("10m", "20m")
12 | 
13 | 
14 | source = KafkaSource(name="sensor_stream")
15 | 
16 | 
17 | class Message(BaseModel):
18 |     device_id: str
19 |     timestamp: datetime
20 |     is_failing: bool
21 | 
22 | 
23 | @stream(source=source, mode="continuous")
24 | def process_measurements(df: DataFrame[Message]) -> DataFrame[Sensor]:
25 |     return f"""
26 |     select
27 |         count(*) as count_failed,
28 |         device_id as id
29 |     from {df}
30 |     where is_failing = TRUE
31 |     group by device_id
32 |     """
33 | 
--------------------------------------------------------------------------------
/07_streaming/4_continuous_aggregation.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from chalk import stream
 4 | from chalk.features import DataFrame, Features, features
 5 | from chalk.streams import KafkaSource, Windowed, windowed
 6 | 
 7 | src = KafkaSource(
 8 |     bootstrap_server='kafka.website.com:9092',
 9 |     topic='user_favorite_color_updates'
10 | )
11 | 
12 | 
13 | @features
14 | class User:
15 |     id: str
16 |     num_failed_logins: Windowed[int] = windowed("10m", "30m", "1d")
17 | 
18 | 
19 | class LoginMessage(BaseModel):
20 |     user_id: int
21 |     success: bool
22 | 
23 | 
24 | @stream(source=src, mode='continuous', keys={"user_id": User.id})
25 | def failed_logins(events: DataFrame[LoginMessage]) -> Features[
26 |     User.id,
27 |     User.num_failed_logins
28 | ]:
29 |     return User(id=events[0].user_id, num_failed_logins=sum(1 for e in events if not e.success))
30 | 
--------------------------------------------------------------------------------
/07_streaming/3_window_sql.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from chalk import stream
 4 | from chalk.features import DataFrame, features
 5 | from chalk.streams import KafkaSource, Windowed, windowed
 6 | 
 7 | src = KafkaSource(
 8 |     bootstrap_server="kafka.website.com:9092", topic="user_favorite_color_updates"
 9 | )
10 | 
11 | 
12 | @features
13 | class User:
14 |     id: str
15 |     num_failed_logins: Windowed[int] = windowed("10m", "30m", "1d")
16 | 
17 | 
18 | class LoginMessage(BaseModel):
19 |     user_id: int
20 |     failed: bool
21 | 
22 | 
23 | @stream(source=src)
24 | def failed_logins(
25 |     events: DataFrame[LoginMessage],
26 | ) -> DataFrame[User.id, User.num_failed_logins]:
27 |     return f"""
28 |     select
29 |         user_id as id,
30 |         count(*) as num_failed_logins
31 |     from {events}
32 |     where failed = 1
33 |     group by 1
34 |     """
35 | 
--------------------------------------------------------------------------------
/07_streaming/2_window_dataframe.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from chalk import stream
 4 | from chalk.features import Features, DataFrame
 5 | from chalk.features import features
 6 | from chalk.streams import KafkaSource
 7 | from chalk.streams import Windowed, windowed
 8 | 
 9 | 
10 | src = KafkaSource(
11 |     bootstrap_server='kafka.website.com:9092',
12 |     topic='user_favorite_color_updates'
13 | )
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: str
19 |     num_failed_logins: Windowed[int] = windowed("10m", "30m", "1d")
20 | 
21 | 
22 | class LoginMessage(BaseModel):
23 |     user_id: int
24 |     failed: bool
25 | 
26 | 
27 | @stream(source=src)
28 | def failed_logins(events: DataFrame[LoginMessage]) -> Features[
29 |     User.id,
30 |     User.num_failed_logins
31 | ]:
32 |     return User(
33 |         id=events["user_id"].max(),
34 |         num_failed_logins=events["failed"].sum(),
35 |     )
36 | 
--------------------------------------------------------------------------------
/09_github_actions/2_deploy_with_chalk.yaml:
--------------------------------------------------------------------------------
 1 | name: Deploy with Chalk
 2 | on: push
 3 | 
 4 | jobs:
 5 |   test-with-chalk:
 6 |     runs-on: ubuntu-latest
 7 |     steps:
 8 |       - uses: actions/checkout@v4
 9 | 
10 |       - name: Setup Python
11 |         uses: actions/setup-python@v4
12 |         with:
13 |           python-version: '3.10'
14 |           cache: 'pip'
15 | 
16 |       # The chalk-ai/deploy-action expects chalkpy to be installed
17 |       - name: Install dependencies
18 |         run: pip install -r requirements.txt
19 | 
20 |       - uses: chalk-ai/deploy-action@v2
21 |         with:
22 |           client-id: ${{secrets.CHALK_CLIENT_ID}}
23 |           client-secret: ${{secrets.CHALK_CLIENT_SECRET}}
24 |           # Waits for the deployment to succeed (Optional, default false)
25 |           await: true
26 | 
27 |       - name: Use the Chalk CLI
28 |         run: chalk query --in transaction.transaction_id=1 --out transaction.clean_memo
29 | 
--------------------------------------------------------------------------------
/03_caching/6_cache_busting.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | from chalk import realtime
 4 | from chalk.client import ChalkClient
 5 | from chalk.features import feature, features
 6 | 
 7 | 
 8 | @features
 9 | class User:
10 |     id: int
11 |     name: str
12 |     fico_score: int = feature(max_staleness="30d")
13 | 
14 | 
15 | @realtime
16 | def get_fico_score(name: User.name) -> User.fico_score:
17 |     return requests.get("https://experian.com").json()["score"]
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     # You can force cache invalidation by specifying a
22 |     # maximum staleness of 0 seconds at the time of making the query:
23 |     ChalkClient().query(
24 |         input={User.id: 1, User.name: "Katherine Johnson"},
25 |         output=[User.fico_score],
26 |         # Cache busting is a special case of providing an override
27 |         # max-staleness. See `4_override_max_staleness.py` for more information.
28 | staleness={User.fico_score: "0s"}, 29 | ) 30 | -------------------------------------------------------------------------------- /09_github_actions/3_deploy_preview.yaml: -------------------------------------------------------------------------------- 1 | name: Create a preview deployment 2 | # You might want to set up preview deployments for every pull request 3 | on: pull_request 4 | 5 | jobs: 6 | test-with-chalk: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | 11 | - name: Setup Python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: '3.10' 15 | cache: 'pip' 16 | 17 | # The chalk-ai/deploy-action expects chalkpy to be installed 18 | - name: Install dependencies 19 | run: pip install -r requirements.txt 20 | 21 | - uses: chalk-ai/deploy-action@v2 22 | with: 23 | client-id: ${{secrets.CHALK_CLIENT_ID}} 24 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 25 | # Creates a preview deployment with a unique deployment ID, 26 | # output by this step 27 | no-promote: true 28 | # Waits for the deployment to succeed 29 | await: true 30 | -------------------------------------------------------------------------------- /14_codegen/models.py: -------------------------------------------------------------------------------- 1 | import chalk.functions as F 2 | from chalk import _ 3 | from chalk.features import features 4 | from custom_model import CustomModel 5 | 6 | 7 | @features 8 | class User: 9 | id: int 10 | name: str 11 | email: str 12 | name_match_score: float = F.jaccard_similarity(_.email, _.name) 13 | score1: float 14 | score2: float 15 | 16 | 17 | model1 = CustomModel( 18 | url="https://internal.example.com/model1", 19 | dependencies={ 20 | "nms": User.name_match_score, 21 | "email": User.email, 22 | }, 23 | computes=User.score1, 24 | ) 25 | 26 | model2 = CustomModel( 27 | url="https://internal.example.com/model2", 28 | dependencies={ 29 | "nms": User.name_match_score, 30 | "email": User.email, 31 | }, 32 | computes=User.score2, 33 | ) 34 | 35 | 36 | if __name__ == "__main__": 37 | CustomModel.render_all( 38 | header="from models import *", 39 | path="./score_resolvers.py", 40 | models=[model1, model2], 41 | ) 42 | -------------------------------------------------------------------------------- /credit/4_aggregate_tradelines.py: -------------------------------------------------------------------------------- 1 | """An example of connecting Users to Tradelines. 2 | 3 | In particular, this example shows how to pass a 4 | filtered DataFrame of features to a resolver. 5 | """ 6 | from chalk import online 7 | from chalk.features import features, DataFrame, has_many 8 | 9 | 10 | @features 11 | class Tradeline: 12 | id: int 13 | user_id: "User.id" 14 | outstanding: float 15 | is_delinquent: bool 16 | 17 | 18 | @features 19 | class User: 20 | id: int 21 | delinquent_amount: float 22 | tradelines: DataFrame[Tradeline] 23 | 24 | 25 | @online 26 | def tradeline_rollup( 27 | accounts: User.tradelines[ 28 | # resolvers can request a subset of a DataFrame's rows as input 29 | # (https://docs.chalk.ai/docs/dataframe#filters). 30 | Tradeline.is_delinquent is True 31 | ] 32 | ) -> User.delinquent_amount: 33 | """ 34 | Sum the outstanding balances on tradelines that 35 | are marked as delinquent. 
36 | """ 37 | return accounts[Tradeline.outstanding].sum() 38 | -------------------------------------------------------------------------------- /04_scheduling/3_sample_arguments.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import requests 4 | 5 | from chalk import Cron, online 6 | from chalk.features import DataFrame, feature, features 7 | from chalk.sql import PostgreSQLSource 8 | 9 | 10 | @features 11 | class User: 12 | id: int 13 | name: str 14 | email: str 15 | last_login: datetime 16 | fico_score: int = feature(max_staleness="30d") 17 | 18 | 19 | session = PostgreSQLSource() 20 | 21 | 22 | def get_active_users() -> DataFrame[User.id]: 23 | return session.query_string( 24 | "select users.id from users where users.active = true", 25 | fields={"id": User.id}, 26 | ).all() 27 | 28 | 29 | # The sample function can pull the primary keys or any subset of 30 | # the arguments that you'd like to sample, and Chalk will sample 31 | # the other arguments. 32 | @online(cron=Cron(schedule="29d 11h", sample=get_active_users)) 33 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 34 | return requests.get("https://experian.com").json()["score"] 35 | -------------------------------------------------------------------------------- /call_recordings/sql/fathom_call_data.chalk.sql: -------------------------------------------------------------------------------- 1 | -- type: online 2 | -- resolves: FathomCallData 3 | -- source: clickhouse 4 | -- incremental: 5 | -- mode: row 6 | -- incremental_column: meeting_scheduled_start_time 7 | select 8 | id, 9 | recording_id as call_id, 10 | 11 | CAST(meeting_scheduled_start_time AS DATETIME) as meeting_scheduled_start_time, 12 | CASE 13 | WHEN meeting_scheduled_end_time IS NOT NULL 14 | THEN CAST(meeting_scheduled_end_time AS DATETIME) 15 | ELSE NULL 16 | END as meeting_scheduled_end_time, 17 | 18 | meeting_has_external_invitees as has_external_attandees, 19 | 20 | meeting_invitees_name as attendee_name, 21 | meeting_invitees_email as attendee_email, 22 | meeting_invitees_is_external as attendee_is_external, 23 | meeting_external_domains_domain_name as company_domain, 24 | 25 | meeting_join_url, 26 | meeting_scheduled_duration_in_minutes, 27 | meeting_title, 28 | recording_duration_in_minutes, 29 | recording_url, 30 | transcript_plaintext 31 | from "fathom-calls-etl" 32 | ; 33 | -------------------------------------------------------------------------------- /github/features/named_queries.py: -------------------------------------------------------------------------------- 1 | from chalk.queries.named_query import NamedQuery 2 | 3 | from .github_feature_set import GithubProject 4 | 5 | NamedQuery( 6 | name="github_project", 7 | input=[GithubProject.path], 8 | output=[ 9 | GithubProject.project_is_valid_repo_path, 10 | GithubProject.project_url, 11 | GithubProject.username, 12 | GithubProject.repo.description, # project_description 13 | GithubProject.archive.stars, # project_stars_last_year_from_gh_archive 14 | GithubProject.repo.stargazers_count, # project_stars_from_api 15 | GithubProject.vdb.ai_summary, # project_summary_from_vdb 16 | GithubProject.repo.created_at, # repo_created_at 17 | GithubProject.repo.forks_count, # repo_forks 18 | GithubProject.repo.homepage, # repo_homepage_url 19 | GithubProject.repo.open_issues_count, # repo_issues 20 | GithubProject.repo.size, # repo_size_in_kb 21 | GithubProject.user.bio, # user_bio 22 | GithubProject.user.email, 
# user_email
23 |         GithubProject.user.location,  # user_location
24 |     ],
25 | )
26 | 
--------------------------------------------------------------------------------
/github/features/github/github_repo.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk.features import Primary, features
 4 | 
 5 | from .github_user import GithubUser
 6 | 
 7 | 
 8 | @features(max_staleness="28d")
 9 | class GithubRepo:
10 |     path: Primary[str]
11 |     full_name: str
12 |     id: int | None
13 |     name: str | None
14 |     html_url: str | None
15 |     description: str | None
16 |     url: str | None
17 |     created_at: datetime | None
18 |     updated_at: datetime | None
19 |     pushed_at: datetime | None
20 |     homepage: str | None
21 |     size: int | None
22 |     stargazers_count: int | None
23 |     watchers_count: int | None
24 |     language: str | None
25 |     has_issues: bool | None
26 |     forks_count: int | None
27 |     archived: bool | None
28 |     open_issues_count: int | None
29 |     license: str | None
30 |     visibility: str | None
31 |     forks: int | None
32 |     open_issues: int | None
33 |     watchers: int | None
34 |     default_branch: str | None
35 | 
36 |     owner_id: str | None
37 |     owner_login: GithubUser.login
38 |     user: GithubUser
39 | 
40 |     updated_at_chalk: datetime | None
41 | 
--------------------------------------------------------------------------------
/05_feature_discovery/4_unified.py:
--------------------------------------------------------------------------------
 1 | from chalk import tags, is_primary, owner, description
 2 | from chalk.features import features
 3 | 
 4 | 
 5 | @features(owner="shuttle@nasa.gov", tags="group:rocketry")
 6 | class SpaceShuttle:
 7 |     id: str
 8 | 
 9 |     # The SHA1 of the software deployed to the shuttle.
10 |     # Should align with a git commit on main.
11 |     #
12 |     # :owner: katherine.johnson@nasa.gov
13 |     software_version: str
14 | 
15 |     # The volume of this shuttle in cubic meters.
16 |     # :owner: architecture@nasa.gov
17 |     # :tags: zillow-fact, size
18 |     volume: str
19 | 
20 | 
21 | # Pulling the description programmatically
22 | assert len(description(SpaceShuttle.software_version)) > 0
23 | 
24 | # Pulling the tags for the feature class and features
25 | assert tags(SpaceShuttle) == ["group:rocketry"]
26 | assert tags(SpaceShuttle.volume) == ["zillow-fact", "size", "group:rocketry"]
27 | 
28 | # Pulling the owner for the feature class and features
29 | assert owner(SpaceShuttle) == "shuttle@nasa.gov"
30 | assert owner(SpaceShuttle.software_version) == "katherine.johnson@nasa.gov"
31 | 
32 | assert is_primary(SpaceShuttle.id)
33 | 
--------------------------------------------------------------------------------
/fraud/2_patterns.py:
--------------------------------------------------------------------------------
 1 | """An example of computing a transaction trend: the percentage change
 2 | in total transaction amount between 30-day windows.
 3 | """
 4 | 
 5 | from chalk import online
 6 | from chalk.features import features, DataFrame, before, after, FeatureTime
 7 | 
 8 | 
 9 | @features
10 | class Transaction:
11 |     id: int
12 |     amount: float
13 |     memo: str
14 |     on: FeatureTime
15 |     user_id: "User.id"
16 |     user: "User"
17 | 
18 | 
19 | @features
20 | class User:
21 |     id: int
22 |     transactions: DataFrame[Transaction]
23 | 
24 |     # percentage change: the last 30 days vs the prior 30 days
25 |     change_from_prior_period: float
26 | 
27 | 
28 | @online
29 | def get_transaction_trend(
30 |     this_period_txns: User.transactions[after(days_ago=30)],
31 |     last_period_txns: User.transactions[before(days_ago=30), after(days_ago=2*30)],
32 | ) -> User.change_from_prior_period:
33 |     """
34 |     Calculates the percentage change in total transaction amount between
35 |     30 day windows.
36 |     """
37 |     sum_last = last_period_txns[Transaction.amount].sum()
38 |     sum_this = this_period_txns[Transaction.amount].sum()
39 |     return (sum_this - sum_last) * 100 / sum_last
40 | 
--------------------------------------------------------------------------------
/02_resolvers/2_multiple_features_resolver.py:
--------------------------------------------------------------------------------
 1 | from mocks import user_service
 2 | 
 3 | from chalk import online
 4 | from chalk.client import ChalkClient
 5 | from chalk.features import Features, features
 6 | 
 7 | 
 8 | @features
 9 | class User:
10 |     id: int
11 |     name: str
12 |     email: str
13 | 
14 | 
15 | # Unlike with our scalar resolvers, here we need to wrap our output in
16 | # the class `Features[...]`.
17 | @online
18 | def get_user_details(uid: User.id) -> Features[User.name, User.email]:
19 |     details = user_service.get_identity(uid)
20 |     # Note that we don't need to supply all arguments to `User`.
21 |     # The field `id` on `User` is non-optional, and doesn't have a
22 |     # default value, but these classes accept partial application.
23 |     # See `01_features/8_constructing_features.py` for more info.
24 |     return User(
25 |         name=details.name,
26 |         email=details.email,
27 |     )
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     # We can then query features as we did in the previous example.
32 |     result = ChalkClient().query(
33 |         input={User.id: 4},
34 |         output=[User.name, User.email],
35 |     )
36 | 
--------------------------------------------------------------------------------
/fraud/3_identity.py:
--------------------------------------------------------------------------------
 1 | """An example of connecting Users to identity scores from a
 2 | third-party API (in this case Socure).
 3 | 
 4 | In this example, we use the requests library to get
 5 | a client's Socure score from the Socure REST API. This
 6 | example shows how you can run arbitrary Python code (and connect
 7 | to third-party APIs) in a Python resolver.
 8 | """
 9 | 
10 | import requests
11 | 
12 | from chalk import online
13 | from chalk.features import features, feature
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: str
19 | 
20 |     # the max staleness assignment on the feature means
21 |     # that a new socure score is only computed if one
22 |     # hasn't been computed in the last 30 days.
23 |     socure_score: float = feature(max_staleness="30d")
24 | 
25 | 
26 | @online
27 | def get_socure_score(uid: User.id) -> User.socure_score:
28 |     """This resolver approximates how one might make a REST
29 |     API call to Socure in a Python resolver for a specific
30 |     user.
31 |     """
32 |     return requests.get(
33 |         "https://api.socure.com",
34 |         json={
35 |             "id": uid,
36 |         },
37 |     ).json()["socure_score"]
38 | 
--------------------------------------------------------------------------------
/11_sql/README.md:
--------------------------------------------------------------------------------
 1 | # SQL
 2 | 
 3 | Chalk can ingest your data using a SQL interface from any
 4 | of the integrations that support it. You can describe your
 5 | queries using SQL strings or SQLAlchemy. Offline, event
 6 | tables can be ingested incrementally (see the sketch in section 2).
 7 | 
 8 | https://docs.chalk.ai/docs/sql
 9 | 
10 | ## 1. Query Scalars
11 | Query scalars with SQL files or strings.
12 | 
13 | **[1_scalars.py](1_scalars.py)**
14 | 
15 | ```python
16 | @online
17 | def get_views(user: User.id) -> User.viewed_minutes:
18 |     return db.query_string(
19 |         "select sum(mins) as viewed_minutes from view_counts where uid = :uid",
20 |         args=dict(uid=user),
21 |     ).one()
22 | ```
23 | https://docs.chalk.ai/docs/sql
24 | 
25 | ## 2. Query DataFrames
26 | Query many rows and take advantage of push-down filters.
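27 | 
28 | Offline, these queries can also ingest event tables incrementally. A minimal
29 | sketch, following the `.incremental(...)` usage in
30 | `predictive_maintenance/4_customer_sensors.py`; the `events` table, the
31 | `Event` feature class, and the `updated_at` column are illustrative, not
32 | defined in this folder's examples:
33 | 
34 | ```python
35 | @batch(cron="1h")
36 | def get_events() -> DataFrame[Event]:
37 |     # Only rows past the last ingested `updated_at` watermark are read
38 |     return db.query_string(
39 |         "select id, kind, updated_at from events",
40 |     ).incremental(incremental_column="updated_at", mode="row")
41 | ```
42 | 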
43 | 
44 | **[2_dataframes.py](2_dataframes.py)**
45 | 
46 | ```python
47 | @online
48 | def get_views() -> DataFrame[User]:
49 |     return db.query_string(
50 |         """
51 |         select id, sum(mins) as viewed_minutes
52 |         from view_counts
53 |         group by id
54 |         """,
55 |     ).all()
56 | ```
57 | 
58 | https://docs.chalk.ai/docs/sql
59 | 
--------------------------------------------------------------------------------
/ecommerce/1_users_sellers.py:
--------------------------------------------------------------------------------
 1 | from chalk import online
 2 | from chalk.features import features
 3 | 
 4 | 
 5 | @features
 6 | class Seller:
 7 |     id: str
 8 |     categories: set[str]
 9 | 
10 | 
11 | @features
12 | class User:
13 |     id: str
14 |     age: int
15 |     favorite_categories: set[str]
16 | 
17 | 
18 | @features
19 | class UserSeller:
20 |     id: str
21 |     user_id: User.id
22 |     user: User
23 |     seller_id: Seller.id
24 |     seller: Seller
25 |     favorites_match: bool
26 | 
27 | 
28 | @online
29 | def get_similarity(
30 |     fc: UserSeller.user.favorite_categories, fc2: UserSeller.seller.categories
31 | ) -> UserSeller.favorites_match:
32 |     return len(fc & fc2) > 0
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     from chalk.client import ChalkClient
37 | 
38 |     client = ChalkClient()
39 |     user_stores = client.query(
40 |         input=[
41 |             UserSeller(user_id="1", seller_id="456"),
42 |             UserSeller(user_id="1", seller_id="457"),
43 |             UserSeller(user_id="1", seller_id="458"),
44 |         ],
45 |         output=[UserSeller.user.id, UserSeller.seller.id, UserSeller.favorites_match],
46 |     )
47 |     print(user_stores)
48 | 
--------------------------------------------------------------------------------
/11_sql/1_scalars.py:
--------------------------------------------------------------------------------
 1 | from chalk import online
 2 | from chalk.features import features
 3 | from chalk.sql import SQLiteInMemorySource
 4 | 
 5 | 
 6 | @features
 7 | class User:
 8 |     id: str
 9 |     viewed_minutes: float
10 | 
11 | 
12 | db = SQLiteInMemorySource()
13 | 
14 | 
15 | @online
16 | def get_views(user: User.id) -> User.viewed_minutes:
17 |     return db.query_string(
18 |         "select sum(mins) as viewed_minutes from view_counts where uid = :uid",
19 |         args=dict(uid=user),
20 |         # Chalk lines up the name of your returned SQL columns
21 |         # with the features that your resolver says it returns.
22 |         # If they don't line up, you can explicitly map any
23 |         # of the columns with the line below:
24 |         # fields=dict(viewed_minutes=User.viewed_minutes),
25 |     ).one()
26 | 
27 | 
28 | @online
29 | def get_views_from_file(user: User.id) -> User.viewed_minutes:
30 |     """
31 |     This resolver executes the same query as above,
32 |     but moves the SQL string into the file `user_views.sql`.
33 | """ 34 | return db.query_sql_file( 35 | "user_views.sql", 36 | args=dict(uid=user), 37 | fields=dict(viewed_minutes=User.viewed_minutes), 38 | ).one() 39 | -------------------------------------------------------------------------------- /full_examples/sagemaker/steps/evaluate.py: -------------------------------------------------------------------------------- 1 | from sagemaker.workflow.function_step import step 2 | 3 | @step( 4 | name="model-evaluation", 5 | instance_type='ml.t3.medium', 6 | keep_alive_period_in_seconds=300, 7 | ) 8 | def evaluate(model, xtest_path: str, ytest_path: str, run_bucket: str) -> str: 9 | import pandas as pd 10 | from sklearn.metrics import ( 11 | accuracy_score, 12 | f1_score, 13 | precision_score, 14 | recall_score, 15 | ) 16 | import s3fs 17 | import json 18 | 19 | X_test = pd.read_parquet(xtest_path) 20 | y_test = pd.read_parquet(ytest_path) 21 | 22 | predictions = model.predict(X_test) 23 | 24 | results = { 25 | "accuracy": accuracy_score(y_test, predictions), 26 | "f1": f1_score(y_test, predictions), 27 | "precision": precision_score(y_test, predictions), 28 | "recall": recall_score(y_test, predictions), 29 | } 30 | 31 | # Upload evaluation report to s3 32 | s3_fs = s3fs.S3FileSystem() 33 | eval_src_s3 = f"{run_bucket}/evaluation/evaluation.json" 34 | 35 | with s3_fs.open(eval_src_s3, "wb") as file: 36 | file.write(json.dumps(results)) 37 | 38 | return eval_src_s3 39 | 40 | -------------------------------------------------------------------------------- /mocks/__init__.py: -------------------------------------------------------------------------------- 1 | import random 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | 5 | # A set of mocks for the examples. 6 | 7 | class AccountKind(Enum): 8 | plaid = "plaid" 9 | checking = "checking" 10 | savings = "savings" 11 | 12 | 13 | @dataclass 14 | class UserIdentity: 15 | name: str 16 | email: str 17 | 18 | 19 | class UserService: 20 | domains = ["gmail.com", "chalk.ai", "nasa.gov"] 21 | names = ["Monica", "Justine", "Sam", "Nikhil"] 22 | 23 | def get_identity(self, id: int) -> UserIdentity: 24 | random.seed(id) 25 | name = random.choice(self.names) 26 | return UserIdentity( 27 | name=name, 28 | email=f"{name.lower()}@{random.choice(self.domains)}", 29 | ) 30 | 31 | 32 | user_service = UserService() 33 | 34 | 35 | @dataclass 36 | class EmailRisk: 37 | age_years: float 38 | risk_score: float 39 | 40 | 41 | class LexusNexus: 42 | def get_email_risk(self, email: str) -> EmailRisk: 43 | random.seed(email) 44 | return EmailRisk( 45 | age_years=random.uniform(0, 10), 46 | risk_score=random.uniform(0, 1), 47 | ) 48 | 49 | 50 | lexus_nexus = LexusNexus() 51 | -------------------------------------------------------------------------------- /marketplace/resolvers.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | 3 | from src.marketplace import Review, User 4 | 5 | 6 | @online 7 | def get_normalized_rating( 8 | review_rating: Review.star_rating, 9 | review_count_across_all_books: Review.item.total_reviews, 10 | average_rating_across_all_books: Review.item.average_rating, 11 | ) -> Review.normalized_rating: 12 | minimum_reviews: float = review_count_across_all_books / 10 13 | return ( 14 | review_count_across_all_books 15 | / (review_count_across_all_books + minimum_reviews) 16 | ) * review_rating + ( 17 | minimum_reviews / (review_count_across_all_books + minimum_reviews) 18 | ) * average_rating_across_all_books 19 | 20 | 21 | @online 
22 | def is_positive_review_from_python_resolver( 23 | bayesian_normalized_rating: Review.normalized_rating, 24 | ) -> Review.is_positive_review_python_resolver: 25 | return bayesian_normalized_rating >= 3.5 26 | 27 | 28 | @online 29 | def get_username(email: User.email) -> User.username: 30 | # def get_username(email: str) -> str: 31 | username = email.split("@")[0] 32 | if "gmail.com" in email: 33 | username = username.split("+")[0].replace(".", "") 34 | 35 | return username.lower() 36 | -------------------------------------------------------------------------------- /02_resolvers/3_downstream_scalars.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.client import ChalkClient 3 | from chalk.features import features 4 | 5 | 6 | @features 7 | class User: 8 | id: int 9 | email: str 10 | email_domain: str 11 | banned_email: bool 12 | 13 | 14 | @online 15 | def get_email_domain(email: User.email) -> User.email_domain: 16 | return email.split("@")[1].lower() 17 | 18 | 19 | @online 20 | def is_banned_email(domain: User.email_domain) -> User.banned_email: 21 | return domain in { 22 | "pergi.id", 23 | "convoitucpa.com", 24 | "vshgl.com", 25 | "nieise.com", 26 | "bookue.site", 27 | "umaasa.com", 28 | } 29 | 30 | 31 | if __name__ == "__main__": 32 | client = ChalkClient() 33 | assert not client.query( 34 | input={User.email: "katherine.johnson@nasa.gov"}, 35 | # Requesting User.banned_email requires running 36 | # `get_email_domain` and then `is_banned_email` 37 | output=[User.banned_email], 38 | ).get_feature_value(User.banned_email) 39 | 40 | assert client.query( 41 | input={User.email: "attacker@vshgl.com"}, 42 | output=[User.banned_email], 43 | ).get_feature_value(User.banned_email) 44 | -------------------------------------------------------------------------------- /08_testing/README.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | Test your Chalk features and resolvers. 3 | 4 | ## 1. Unit tests 5 | Resolvers are just Python functions, so they are easy to unit test. 6 | 7 | Chalk lets you specify your feature pipelines using 8 | idiomatic Python. This means that you can unit test 9 | individual resolvers and combinations of resolvers. 10 | 11 | **[1_unit_tests.py](1_unit_tests.py)** 12 | 13 | ```python 14 | @realtime 15 | def get_home_data( 16 | hid: HomeFeatures.id, 17 | ) -> Features[HomeFeatures.price, HomeFeatures.sq_ft]: 18 | return HomeFeatures(price=200_000, sq_ft=2_000) 19 | 20 | 21 | def test_multiple_output(): 22 | assert get_home_data(2) == HomeFeatures( 23 | price=200_000, 24 | sq_ft=2_000, 25 | ) 26 | ``` 27 | https://docs.chalk.ai/docs/unit-tests 28 | 29 | ## 2. Integration tests 30 | Test interactions between resolvers with preview deployments. 
31 | 32 | **[2_integration_tests.py](2_integration_tests.py)** 33 | 34 | ```bash 35 | > chalk apply --no-promote 36 | ``` 37 | 38 | ```bash 39 | > chalk query --deployment $DEPLOYMENT_ID \ 40 | --in user.id=1 \ 41 | --out user.id \ 42 | --out user.email 43 | ``` 44 | https://docs.chalk.ai/docs/integration-tests 45 | -------------------------------------------------------------------------------- /predictive_maintenance/4_customer_sensors.py: -------------------------------------------------------------------------------- 1 | from chalk import batch 2 | from chalk.features import DataFrame, features, has_many, feature 3 | from chalk.sql import SnowflakeSource 4 | 5 | 6 | @features 7 | class Sensor: 8 | id: str 9 | customer_id: str 10 | is_failing: bool 11 | 12 | 13 | 14 | @features 15 | class Customer: 16 | id: str 17 | customer_needs_service: bool = feature(max_staleness="2h") 18 | sensors: DataFrame[Sensor] = has_many(lambda: Customer.id == Sensor.customer_id) 19 | 20 | 21 | snowflake = SnowflakeSource() 22 | 23 | 24 | @batch(cron="1h") 25 | def get_sensors() -> DataFrame[Sensor.id, Sensor.customer_id, Sensor.is_failing]: 26 | """ 27 | Incrementally ingest new sensors from our Snowflake warehouse 28 | as they become available. 29 | """ 30 | return snowflake.query_string( 31 | """ 32 | select id, customer_id, is_failing from sensors 33 | """ 34 | ).incremental(incremental_column="updated_at", mode='row') 35 | 36 | 37 | @batch(cron="1h") 38 | def get_customers_needing_service( 39 | bad_sensors: Customer.sensors[ 40 | Sensor.is_failing is True, 41 | Sensor.id 42 | ] 43 | ) -> Customer.customer_needs_service: 44 | return len(bad_sensors) > 0 45 | -------------------------------------------------------------------------------- /03_caching/4_override_max_staleness.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from chalk import online 4 | from chalk.client import ChalkClient 5 | from chalk.features import feature, features 6 | 7 | 8 | @features 9 | class User: 10 | id: int 11 | name: str 12 | fico_score: int = feature(max_staleness="30d") 13 | 14 | 15 | @online 16 | def get_fico_score(name: User.name) -> User.fico_score: 17 | return requests.get(...).json()["score"] 18 | 19 | 20 | if __name__ == "__main__": 21 | # By default, the staleness will be taken to be the value given 22 | # on the feature class. In this case, user.fico_score is cached 23 | # for 30 days. But if you have a model that needs fresher data, 24 | # you can specify the desired staleness at the time of making 25 | # the query. For example, here we request a staleness of only 26 | # 10 minutes. 
27 | ChalkClient().query( 28 | input={User.name: "Katherine Johnson"}, 29 | output=[User.fico_score], 30 | staleness={User.fico_score: "10m"}, 31 | ) 32 | 33 | # If you didn't specify the staleness, the default staleness 34 | # of 30 days would apply 35 | ChalkClient().query( 36 | input={User.name: "Katherine Johnson"}, 37 | output=[User.fico_score], 38 | ) 39 | -------------------------------------------------------------------------------- /02_resolvers/4_downstream_dataframes.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.client import ChalkClient 3 | from chalk.features import DataFrame, features, has_many 4 | 5 | 6 | @features 7 | class Email: 8 | id: str 9 | uid: str 10 | domain: str 11 | is_banned: bool 12 | value: str 13 | 14 | 15 | @features 16 | class User: 17 | id: str 18 | banned: bool 19 | emails: DataFrame[Email] = has_many(lambda: Email.uid == User.id) 20 | 21 | 22 | @online 23 | def is_banned_email(domain: Email.domain) -> Email.is_banned: 24 | return domain in { 25 | "pergi.id", 26 | "convoitucpa.com", 27 | "vshgl.com", 28 | "nieise.com", 29 | "bookue.site", 30 | "umaasa.com", 31 | } 32 | 33 | 34 | # Here, we say a user is banned if the user has any emails that are banned. 35 | # Note that all of this can be computed real-time, and Chalk will run the 36 | # `is_banned_email` resolver for each of the emails that the user has. 37 | @online 38 | def banned_user(domains: User.emails[Email.is_banned == True]) -> User.banned: 39 | return len(domains) > 0 40 | 41 | 42 | if __name__ == "__main__": 43 | result = ChalkClient().query( 44 | input={User.id: "1"}, 45 | output=[User.banned], 46 | ) 47 | -------------------------------------------------------------------------------- /predictive_maintenance/1_device_data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from pydantic import BaseModel 3 | 4 | from chalk.features import DataFrame, has_many, features, FeatureTime 5 | from chalk.streams import stream, KafkaSource 6 | 7 | 8 | @features 9 | class Measurement: 10 | device_id: str 11 | lat: float 12 | long: float 13 | voltage: float 14 | temp: float 15 | 16 | timestamp: FeatureTime 17 | 18 | 19 | @features 20 | class Sensor: 21 | id: str 22 | measurements: DataFrame[Measurement] = has_many(lambda: Measurement.device_id == Sensor.id) 23 | 24 | 25 | source = KafkaSource(name="sensor_stream") 26 | 27 | 28 | class DeviceDataJson(BaseModel): 29 | latitude: float 30 | longitude: float 31 | voltage: float 32 | temperature: float 33 | 34 | 35 | class Message(BaseModel): 36 | device_id: str 37 | timestamp: datetime 38 | data: DeviceDataJson 39 | 40 | 41 | @stream(source=source) 42 | def read_message(message: Message) -> Measurement: 43 | return Measurement( 44 | device_id=message.device_id, 45 | timestamp=message.timestamp, 46 | lat=message.data.latitude, 47 | long=message.data.longitude, 48 | voltage=message.data.voltage, 49 | temp=message.data.temperature, 50 | ) 51 | -------------------------------------------------------------------------------- /05_feature_discovery/3_tags.py: -------------------------------------------------------------------------------- 1 | from chalk import tags 2 | from chalk.features import feature, features 3 | 4 | 5 | # Tags are assigned as code comments 6 | @features 7 | class RiskProfile1: 8 | id: int 9 | # :tags: group:risk 10 | email: str 11 | kyc_score: str 12 | 13 | 14 | # Or explicitly set via 
`feature(tags=...)` 15 | @features 16 | class RiskProfile2: 17 | id: int 18 | email: str = feature(tags="group:risk") 19 | kyc_score: str 20 | 21 | 22 | # A feature can have many tags 23 | @features 24 | class RiskProfile3: 25 | id: int 26 | # :tags: group:risk, pii 27 | email: str 28 | kyc_score: str 29 | 30 | 31 | # Tags assigned on the class will apply to each of its features 32 | @features(tags="group:risk") 33 | class RiskProfile4: 34 | id: str 35 | kyc_score: float 36 | email_age_days: int 37 | 38 | 39 | # Tags on the class add to tags on the feature 40 | @features(tags="group:risk") 41 | class RiskProfile5: 42 | id: str 43 | kyc_score: float 44 | email_age_days: int 45 | # :tags: pii 46 | email: str 47 | 48 | 49 | # The function `chalk.features.tags(...)` returns the tags for a feature 50 | assert tags(RiskProfile5) == ["group:risk"] 51 | assert tags(RiskProfile5.id) == ["group:risk"] 52 | assert tags(RiskProfile5.email) == ["pii", "group:risk"] 53 | -------------------------------------------------------------------------------- /01_features/1_feature_types.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | from enum import Enum 3 | 4 | from chalk.features import features 5 | 6 | 7 | class Genre(Enum): 8 | FICTION = "FICTION" 9 | NONFICTION = "NONFICTION" 10 | DRAMA = "DRAMA" 11 | POETRY = "POETRY" 12 | 13 | 14 | # The @features decorator creates a feature for each attribute 15 | # of the class. These feature classes work a lot like Python's 16 | # dataclasses, except that you can construct them with only 17 | # partial arguments. 18 | @features 19 | class Book: 20 | # Features can be any primitive Python type 21 | id: int 22 | name: str 23 | pages: int 24 | publish_date: date 25 | copyright_ended_at: datetime | None 26 | genre: Genre 27 | 28 | # Features can also be lists and sets of any primitive 29 | authors: list[str] 30 | categories: set[str] 31 | 32 | # Descriptions live as comments above features. 33 | # See 05_feature_discovery/4_descriptions.py for more information. 
34 |     jacket_copy: str
35 | 
36 | 
37 | # Note that we don't supply all the arguments to book here
38 | anna_karenina = Book(name="Anna Karenina", pages=864)
39 | 
40 | # Feature classes can be easily converted to dictionaries
41 | assert dict(anna_karenina) == {
42 |     "book.name": "Anna Karenina",
43 |     "book.pages": 864,
44 | }
45 | 
--------------------------------------------------------------------------------
/full_examples/sagemaker/src/models.py:
--------------------------------------------------------------------------------
 1 | from datetime import date
 2 | from dateutil.relativedelta import relativedelta
 3 | from chalk import online, DataFrame, FeatureTime, Windowed, _, windowed
 4 | from chalk.features import features
 5 | 
 6 | 
 7 | @features
 8 | class Transaction:
 9 |     id: int
10 |     amt: float
11 |     confirmed_fraud: bool
12 |     customer_id: "Customer.id"
13 |     customer: "Customer"
14 | 
15 |     # The time at which the transaction was created, for temporal consistency
16 |     at: FeatureTime
17 | 
18 | 
19 | @features
20 | class Customer:
21 |     id: int
22 |     name: str
23 |     email: str
24 |     dob: date
25 |     age: int
26 |     income: int
27 |     fico: int
28 | 
29 |     # The transactions, linked by the Customer.id type on the Transaction.customer_id field
30 |     transactions: DataFrame[Transaction]
31 | 
32 |     transaction_sum: Windowed[float] = windowed(
33 |         "30m",
34 |         "1h",
35 |         default=0,
36 |         expression=_.transactions[_.amt, _.at > _.chalk_window].sum(),
37 |     )
38 | 
39 | 
40 | @online
41 | async def get_fico(email: Customer.email) -> Customer.fico:
42 |     # Use your preferred FICO score API here
43 |     ...
44 | 
45 | 
46 | @online
47 | async def get_age(dob: Customer.dob) -> Customer.age:
48 |     return relativedelta(date.today(), dob).years
49 | 
--------------------------------------------------------------------------------
/full_examples/fraud_transactions_with_llm/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Chalk Quickstart
 3 | 
 4 | 1. Install Chalk
 5 | 
 6 |    Install the [Chalk command line tool](https://docs.chalk.ai/cli).
 7 |    The Chalk CLI allows you to create, update, and manage your feature
 8 |    pipelines directly from your terminal.
 9 | 
10 |    > curl -s -L https://api.chalk.ai/install.sh | sh
11 | 
12 | 2. Create and activate a virtual environment in your project directory
13 | 
14 |    Creating a virtual environment is a good practice to keep your project
15 |    dependencies isolated from your system dependencies.
16 | 
17 |    > python -m venv .venv
18 |    > source .venv/bin/activate
19 | 
20 | 3. Log in or sign up
21 | 
22 |    Log in or sign up with Chalk directly from the command line. The
23 |    [`chalk login`](https://docs.chalk.ai/cli/login) command will
24 |    open your browser and create an API token for your local development.
25 | 
26 | 4. Deploy your features
27 | 
28 |    Deploy your feature pipeline to production. After you've written some
29 |    features and resolvers, use the [`chalk apply`](https://docs.chalk.ai/cli/apply)
30 |    command to deploy your feature pipelines.
31 | 
32 | 5. Query your features
33 | 
34 |    Query your features directly from the command line with
35 |    [`chalk query`](https://docs.chalk.ai/cli/query) to see that they're
36 |    live and available.
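37 | 
38 |    For example, using the transaction features defined in this project
39 |    (the id value here is illustrative):
40 | 
41 |    > chalk query --in transaction.id=1 --out transaction.memo
42 | 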
43 | 
--------------------------------------------------------------------------------
/14_codegen/README.md:
--------------------------------------------------------------------------------
 1 | ## Codegen with Chalk
 2 | 
 3 | If you're working with external ML models or microservices, it can be helpful to generate
 4 | boilerplate code for calling those services—especially when those models operate on Chalk
 5 | features.
 6 | 
 7 | In this folder, the file `custom_model.py` defines a class `CustomModel` that, when
 8 | created, stores the information it needs to render the resolver definition for an HTTP
 9 | call to a service hosting a model score.
10 | 
11 | ```python
12 | CustomModel(
13 |     url="https://internal.example.com/model1",
14 |     dependencies={
15 |         "nms": User.name_match_score,
16 |         "email": User.email,
17 |     },
18 |     computes=User.score1,
19 | )
20 | ```
21 | 
22 | When executed (`python models.py`), the file `models.py` overwrites `score_resolvers.py`
23 | with the auto-generated resolver definitions for each custom model:
24 | 
25 | ```python
26 | @online
27 | def get_score1(nms: User.name_match_score, email: User.email) -> User.score1:
28 |     response = requests.post(
29 |         "https://internal.example.com/model1",
30 |         headers={"accept": "application/json"},
31 |         json={"nms": nms, "email": email},
32 |     )
33 |     return response.json().get("prediction")
34 | ```
35 | 
36 | If you find yourself repeating the same pattern across many of these resolvers, codegen
37 | can help keep your definitions DRY.
--------------------------------------------------------------------------------
/01_features/7_feature_time.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk import offline
 4 | from chalk.features import Features, features
 5 | 
 6 | 
 7 | @features
 8 | class Book:
 9 |     id: str
10 |     name: str
11 | 
12 |     # By default, Chalk marks the time a feature was
13 |     # created as the time that its resolver was run.
14 |     # However, you may want to provide a custom value
15 |     # for this time for data sources like events tables.
16 |     # You can inspect the time a feature was created
17 |     # and set the time for when a feature was created
18 |     # by creating a feature time feature.
19 |     # By default, a feature is a feature time feature if
20 |     # it has the name `ts` and a type of `datetime.datetime`:
21 |     ts: datetime
22 | 
23 |     # However, you may also explicitly set the feature time
24 |     # via the `chalk.features.FeatureTime` type:
25 |     #
26 |     #   timestamp: FeatureTime
27 |     #
28 | 
29 | 
30 | # To set the time a feature was created, assign the feature
31 | # when you resolve it:
32 | @offline
33 | def fn(book_id: Book.id) -> Features[Book.name, Book.ts]:
34 |     return Book(
35 |         name="Anna Karenina",
36 |         ts=datetime(month=9, day=12, year=1877),
37 |     )
38 | 
39 | 
40 | # Then, when you sample offline data, the name feature will
41 | # be treated as having been created at the provided date.
42 | 
--------------------------------------------------------------------------------
/02_resolvers/1_scalar_resolver.py:
--------------------------------------------------------------------------------
 1 | from chalk import online
 2 | from chalk.client import ChalkClient
 3 | from chalk.features import features
 4 | 
 5 | 
 6 | @features
 7 | class User:
 8 |     id: int
 9 |     email: str
10 |     email_domain: str
11 | 
12 | 
13 | # This resolver computes one feature, `User.email_domain`.
14 | # To compute that feature, it takes a data dependency on `User.email`.
15 | @online
16 | def get_email_domain(email: User.email) -> User.email_domain:
17 |     return email.split("@")[1].lower()
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     # Once you've deployed your features, you can query them by providing
22 |     # the data you know that's already in scope, and asking for any feature
23 |     # value that can be computed downstream from that data
24 |     result = ChalkClient().query(
25 |         # Here, we say that we know the email is `jessie@chalk.ai`.
26 |         # In practice, typically this is just the id of the entity
27 |         # that you care about
28 |         input={User.email: "jessie@chalk.ai"},
29 |         # Here we ask Chalk to compute `User.email_domain`, a feature
30 |         # downstream of that input
31 |         output=[User.email_domain],
32 |     )
33 | 
34 |     # From the resulting object, we can pull the `User.email_domain`
35 |     # feature, and see that it is in fact `chalk.ai`.
36 |     assert result.get_feature_value(User.email_domain) == "chalk.ai"
37 | 
--------------------------------------------------------------------------------
/06_dataframe/2_filters.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from chalk.features import DataFrame, features
 4 | 
 5 | 
 6 | @features
 7 | class Transaction:
 8 |     id: int
 9 |     user_id: "User.id"
10 |     memo: str
11 |     merchant: str
12 |     amount: float
13 |     canceled_at: None | datetime
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: int
19 |     txns: DataFrame[Transaction]
20 | 
21 | 
22 | # You can filter down the transactions by any of the
23 | # properties on the transaction
24 | credits = User.txns[Transaction.amount < 0]
25 | 
26 | # The keyword `or` works much like `and`:
27 | rideshare_income = User.txns[
28 |     Transaction.amount < 0
29 |     and (Transaction.merchant in ("uber", "lyft") or "uberpmts" == Transaction.memo)
30 | ]
31 | 
32 | # You can also check for set or list membership with `in`:
33 | rideshare_txns = User.txns[Transaction.merchant in ("uber", "lyft")]
34 | 
35 | # Filters separated by commas function as `and` filters:
36 | rideshare_credits = User.txns[
37 |     Transaction.amount < 0, Transaction.merchant in ("uber", "lyft")
38 | ]
39 | 
40 | # Equivalently, you can use the keyword `and` instead of separating by commas
41 | rideshare_credits = User.txns[
42 |     Transaction.amount < 0 and Transaction.merchant in ("uber", "lyft")
43 | ]
44 | 
45 | 
46 | # Filters can also check for None the same way you check for None in Python
47 | canceled_txns = User.txns[Transaction.canceled_at is not None]
48 | 
--------------------------------------------------------------------------------
/08_testing/1_unit_tests.py:
--------------------------------------------------------------------------------
 1 | from chalk import realtime
 2 | from chalk.features import Features, features
 3 | 
 4 | 
 5 | # First, we'll define a set of features and resolvers:
 6 | @features
 7 | class HomeFeatures:
 8 |     id: int
 9 |     address: str
10 |     price: int
11 |     sq_ft: int
12 | 
13 | 
14 | @realtime
15 | def get_address(hid: HomeFeatures.id) -> HomeFeatures.address:
16 |     return "Bridge Street" if hid == 1 else "Filbert Street"
17 | 
18 | 
19 | @realtime
20 | def get_home_data(
21 |     hid: HomeFeatures.id,
22 | ) -> Features[HomeFeatures.price, HomeFeatures.sq_ft]:
23 |     return HomeFeatures(price=200_000, sq_ft=2_000)
24 | 
25 | 
26 | # Chalk lets you specify your feature pipelines using
27 | # idiomatic Python. This means that you can unit test
28 | # individual resolvers and combinations of resolvers,
29 | # since they're just Python functions.
30 | def test_single_output(): 31 | assert get_address(2) == "Filbert Street" 32 | 33 | 34 | # Dataclasses support equality, which can be used 35 | # to test resolvers which return multiple features. 36 | def test_multiple_output(): 37 | result = get_home_data(2) 38 | assert result.price == 200_000 39 | assert result.sq_ft == 2_000 40 | assert result != HomeFeatures( 41 | address="hello", 42 | price=200_000, 43 | sq_ft=2_000, 44 | ) 45 | assert result == HomeFeatures( 46 | price=200_000, 47 | sq_ft=2_000, 48 | ) 49 | -------------------------------------------------------------------------------- /full_examples/sagemaker/steps/training.py: -------------------------------------------------------------------------------- 1 | from sagemaker.workflow.function_step import step 2 | 3 | PARAM_GRID = { 4 | 'xgb__n_estimators': [20, 50, 100, 200], 5 | 'xgb__learning_rate': [0.01, 0.1, 0.2], 6 | 'xgb__max_depth': [3, 5, 7, 9], 7 | } 8 | 9 | @step( 10 | name="model-training", 11 | instance_type="ml.m5.xlarge", 12 | keep_alive_period_in_seconds=300, 13 | ) 14 | def train( 15 | xtrain_path: str, 16 | ytrain_path: str, 17 | num_rounds: int 18 | ): 19 | from sklearn.pipeline import Pipeline 20 | import pandas as pd 21 | from sklearn.preprocessing import StandardScaler 22 | from sklearn.impute import SimpleImputer 23 | from sklearn.ensemble import GradientBoostingClassifier 24 | from sklearn.model_selection import RandomizedSearchCV 25 | 26 | # read data files from S3 27 | X_train = pd.read_parquet(xtrain_path) 28 | y_train = pd.read_parquet(ytrain_path) 29 | 30 | pipeline = Pipeline( 31 | steps=[ 32 | ("impute", (SimpleImputer())), 33 | ("scaler", StandardScaler()), 34 | ("xgb", GradientBoostingClassifier()), 35 | ] 36 | ) 37 | rsc = RandomizedSearchCV( 38 | pipeline, 39 | param_distributions=PARAM_GRID, 40 | n_iter=num_rounds, 41 | cv=3, 42 | scoring="f1", 43 | n_jobs=-1, 44 | ) 45 | rsc.fit(X_train, y_train) 46 | 47 | return rsc.best_estimator_ 48 | -------------------------------------------------------------------------------- /github/features/github_feature_set.py: -------------------------------------------------------------------------------- 1 | import chalk.functions as F 2 | from chalk.features import ( 3 | Primary, 4 | _, 5 | features, 6 | has_one, 7 | ) 8 | 9 | from src.github.features import ( 10 | GithubArchive, 11 | GithubRepo, 12 | GithubRepoDocVDB, 13 | GithubUser, 14 | ) 15 | 16 | 17 | @features 18 | class GithubProject: 19 | path: Primary[str] 20 | project_is_valid_repo_path: bool = F.regexp_like( 21 | expr=_.path, 22 | pattern=r"^[a-zA-Z0-9_-]+\/[a-zA-Z0-9._-]+$", 23 | ) 24 | project_url: str | None = F.if_then_else( 25 | condition=_.project_is_valid_repo_path, 26 | if_true="https://github.com/" + _.path, 27 | if_false=None, 28 | ) 29 | 30 | username: GithubUser.name = F.split_part( 31 | expr=_.path, 32 | delimiter="/", 33 | index=0, 34 | ) 35 | 36 | user: GithubUser | None = has_one( 37 | lambda: GithubProject.username == GithubUser.name, 38 | ) 39 | 40 | repo: GithubRepo = has_one( 41 | lambda: GithubProject.path == GithubRepo.path, 42 | ) 43 | repo_language: str | None = F.coalesce( 44 | _.repo.language, 45 | "MISSING", 46 | ) 47 | 48 | archive: GithubArchive | None = has_one( 49 | lambda: GithubProject.path == GithubArchive.path, 50 | ) 51 | vdb: GithubRepoDocVDB | None = has_one( 52 | lambda: GithubProject.path == GithubRepoDocVDB.path, 53 | ) 54 | -------------------------------------------------------------------------------- /03_caching/5_override_cache_values.py: 
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | import requests
 4 | 
 5 | from chalk import realtime
 6 | from chalk.client import ChalkClient
 7 | from chalk.features import feature, features
 8 | 
 9 | 
10 | class FICOBucket(str, Enum):
11 |     HIGH = "HIGH"
12 |     MEDIUM = "MEDIUM"
13 |     LOW = "LOW"
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: int
19 |     name: str
20 |     fico_score: int = feature(max_staleness="30d")
21 |     fico_bucket: FICOBucket
22 | 
23 | 
24 | @realtime
25 | def get_fico_score(name: User.name) -> User.fico_score:
26 |     return requests.get(...).json()["score"]
27 | 
28 | 
29 | @realtime
30 | def discretize_fico_score(score: User.fico_score) -> User.fico_bucket:
31 |     if score > 700:
32 |         return FICOBucket.HIGH
33 |     if score > 600:
34 |         return FICOBucket.MEDIUM
35 |     return FICOBucket.LOW
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     # You can also override the cached _value_ (in addition to the cache
40 |     # duration, as described in 4_override_max_staleness) by providing
41 |     # the value as an input to the query.
42 |     #
43 |     # Here, we specify that the FICO score is 700. That value is passed
44 |     # to the resolver `discretize_fico_score` to compute `User.fico_bucket`,
45 |     # instead of running `get_fico_score`.
46 |     ChalkClient().query(
47 |         input={User.name: "Katherine Johnson", User.fico_score: 700},
48 |         output=[User.fico_bucket],
49 |     )
50 | 
--------------------------------------------------------------------------------
/03_caching/3_intermediates.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | import requests
 4 | 
 5 | from chalk import realtime
 6 | from chalk.client import ChalkClient
 7 | from chalk.features import feature, features
 8 | 
 9 | 
10 | class FICOBucket(str, Enum):
11 |     HIGH = "HIGH"
12 |     MEDIUM = "MEDIUM"
13 |     LOW = "LOW"
14 | 
15 | 
16 | @features
17 | class User:
18 |     id: int
19 |     name: str
20 |     fico_score: int = feature(max_staleness="30d")
21 |     fico_bucket: FICOBucket
22 | 
23 | 
24 | @realtime
25 | def get_fico_score(name: User.name) -> User.fico_score:
26 |     return requests.get(...).json()["score"]
27 | 
28 | 
29 | @realtime
30 | def discretize_fico_score(score: User.fico_score) -> User.fico_bucket:
31 |     if score > 700:
32 |         return FICOBucket.HIGH
33 |     if score > 600:
34 |         return FICOBucket.MEDIUM
35 |     return FICOBucket.LOW
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     # Specifying the max-staleness value also holds when
40 |     # the cached feature is an intermediate result for your
41 |     # query, but not a desired output.
42 |     ChalkClient().query(
43 |         input={User.name: "Katherine Johnson"},
44 |         # User.fico_score is not requested in the output
45 |         output=[User.fico_bucket],
46 |         # ...but is necessary to compute User.fico_bucket.
47 |         # The requested feature `User.fico_bucket` is computed
48 |         # by running `discretize_fico_score`, which in turn
49 |         # depends on `User.fico_score`.
50 |     )
34 | flagged: bool = feature( 35 | max_staleness="infinity", 36 | underscore=F.sagemaker_predict( 37 | _.image_bytes, endpoint="image-model_1.0.1_2024-09-16" 38 | ), 39 | ) 40 | 41 | 42 | @features 43 | class Website: 44 | url: Primary[str] 45 | 46 | host: str = F.url_extract_host(_.url) 47 | 48 | # The html of the website 49 | html: str 50 | 51 | # The images associated with a given website 52 | images: DataFrame[Image] 53 | -------------------------------------------------------------------------------- /14_codegen/custom_model.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | from chalk.features import unwrap_feature 5 | 6 | 7 | class CustomModel: 8 | def __init__(self, url: str, dependencies: dict[str, Any], computes: Any): 9 | self.url = url 10 | self.dependencies = dependencies 11 | self.computes = unwrap_feature(computes) 12 | 13 | @classmethod 14 | def render_all(cls, *, header: str, path: Path | str, models: "list[CustomModel]"): 15 | children = "\n".join(model.render() for model in models) 16 | content = f"""{header} 17 | from chalk import online 18 | import requests 19 | 20 | 21 | {children} 22 | """ 23 | with open(path, "w") as f: 24 | f.write(content) 25 | 26 | def render(self): 27 | args: dict[str, str] = {} 28 | for k, v in self.dependencies.items(): 29 | f = unwrap_feature(v) 30 | args[k] = f"{f.features_cls.__name__}.{f.attribute_name}" 31 | json_body = ", ".join(f'"{k}": {k}' for k in args.keys()) 32 | returns = f"{self.computes.features_cls.__name__}.{self.computes.attribute_name}" 33 | 34 | return f""" 35 | @online 36 | def get_{self.computes.name}( 37 | {', '.join(f'{k}: {v}' for k, v in args.items())} 38 | ) -> {returns}: 39 | response = requests.post( 40 | "{self.url}", 41 | headers={{"accept": "application/json"}}, 42 | json={{{json_body}}}, 43 | ) 44 | return response.json().get("prediction") 45 | """ 46 | -------------------------------------------------------------------------------- /github/features/search/prompts.py: -------------------------------------------------------------------------------- 1 | SYSTEM_PROMPT: str = """ 2 | You are an AI assistant that processes GitHub repository descriptions and returns back the repository that best matches a given search query. 3 | Your goal is to evaluate, rank, and then return the repository that is most relevant based on the search query. 4 | Assess each repository's features, functionality, and use cases in relation to the search intent. 5 | Clearly explain why the repository is relevant and differentiate between highly relevant, partially relevant, and less relevant results. 6 | Maintain clarity and brevity while optimizing for accuracy and usefulness. 7 | """ 8 | 9 | USER_PROMPT: str = ( 10 | """ 11 | Given the following GitHub repositories: {{GithubSearch.urls_in}} 12 | 13 | Analyze them in relation to the search query: '{{GithubSearch.query}}' 14 | 15 | Here are the repository descriptions: {{GithubSearch.descriptions}} 16 | 17 | Give me back the one repository URL that is most relevant to the query. 18 | Clearly explain why each repository is relevant to the query, highlighting key features, functionality, and use cases. 19 | If a repository is only partially relevant, mention the relevant aspects while keeping the summary concise. 
20 | 21 | Generate a structured JSON response following this schema: 22 | 23 | ```json 24 | { 25 | "repo_url": "<str>", 26 | "confidence": <float>, 27 | "summary": "<str>" 28 | } 29 | ``` 30 | """ 31 | ) -------------------------------------------------------------------------------- /01_features/8_constructing_features.py: -------------------------------------------------------------------------------- 1 | from chalk.features import Features, features 2 | 3 | 4 | @features 5 | class Book: 6 | id: str 7 | name: str 8 | pages: int 9 | author: str 10 | 11 | 12 | # Feature classes function like data classes, except that they 13 | # are allowed to take only part of their arguments. 14 | # Here, we're not providing `author` or `id`, even though 15 | # they don't have default values or allow optional values. 16 | assert Book(name="Anna Karenina") == Book(name="Anna Karenina") 17 | assert Book(name="Anna Karenina") != Book(name="Anna Karenina", author="Leo Tolstoy") 18 | 19 | # Feature classes are a bag of `Features`. 20 | # If you use Chalk's mypy plugin, the types below will behave as you expect. 21 | x: Features[Book.author, Book.name] = Book(name="Anna Karenina", author="Leo Tolstoy") 22 | 23 | # `Features` is commutative, so `Features[A, B] == Features[B, A]` 24 | y: Features[Book.name, Book.author] = x 25 | 26 | # Features are iterable, and iterate as tuples of 27 | # (feature_name, feature_value) 28 | for feature_name, value in Book(name="Anna Karenina", pages=864): 29 | print(f"{feature_name=}, {value=}") 30 | 31 | # This iterable property means that features convert nicely into dicts 32 | assert dict(Book(name="Anna Karenina", pages=864)) == { 33 | "book.name": "Anna Karenina", 34 | "book.pages": 864, 35 | } 36 | 37 | # And also into lists 38 | assert [ 39 | ("book.name", "Anna Karenina"), 40 | ("book.pages", 864), 41 | ] == list(Book(name="Anna Karenina", pages=864)) 42 | -------------------------------------------------------------------------------- /03_caching/3_intermediates.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import requests 4 | 5 | from chalk import realtime 6 | from chalk.client import ChalkClient 7 | from chalk.features import feature, features 8 | 9 | 10 | class FICOBucket(str, Enum): 11 | HIGH = "HIGH" 12 | MEDIUM = "MEDIUM" 13 | LOW = "LOW" 14 | 15 | 16 | @features 17 | class User: 18 | id: int 19 | name: str 20 | fico_score: int = feature(max_staleness="30d") 21 | fico_bucket: FICOBucket 22 | 23 | 24 | @realtime 25 | def get_fico_score(name: User.name) -> User.fico_score: 26 | return requests.get(...).json()["score"] 27 | 28 | 29 | @realtime 30 | def discretize_fico_score(score: User.fico_score) -> User.fico_bucket: 31 | if score > 700: 32 | return FICOBucket.HIGH 33 | if score > 600: 34 | return FICOBucket.MEDIUM 35 | return FICOBucket.LOW 36 | 37 | 38 | if __name__ == "__main__": 39 | # Specifying the max-staleness value also holds when 40 | # the cached feature is an intermediate result for your 41 | # query, but not a desired output. 42 | ChalkClient().query( 43 | input={User.name: "Katherine Johnson"}, 44 | # User.fico_score is not requested in the output 45 | output=[User.fico_bucket], 46 | # ...but is necessary to compute User.fico_bucket. 47 | # The requested feature `User.fico_bucket` is computed 48 | # by running `discretize_fico_score`, which in turn 49 | # depends on `User.fico_score`.
50 | staleness={User.fico_score: "10m"}, 51 | ) 52 | -------------------------------------------------------------------------------- /04_scheduling/1_cron.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from chalk import online 4 | from chalk.features import features 5 | 6 | 7 | @features 8 | class User: 9 | id: str 10 | name: str 11 | email: str 12 | credit_score: str 13 | 14 | 15 | # By default, resolvers with a `cron` parameter will sample the latest 16 | # versions of the data. Imagine you had historically resolved the following 17 | # features: 18 | # 19 | # | Time | ID | Email | Name | 20 | # | :--: | :-: | -------------------- | ----------- | 21 | # | 0 | 1 | elliot@chalk.ai | Elliot | 22 | # | 1 | 2 | andy@chalk.ai | Andy | 23 | # | 2 | 1 | | Elliot Marx | 24 | # | 3 | 2 | elliot.marx@chalk.ai | | 25 | # 26 | # Then, we would sample the following pairs and invoke the resolver 27 | # with these arguments: 28 | # 29 | # | Email | Name | 30 | # | -------------------- | ----------- | 31 | # | elliot.marx@chalk.ai | Elliot Marx | 32 | # | andy@chalk.ai | Andy | 33 | # 34 | # Note that we don't sample (elliot.marx@chalk.ai, Elliot), 35 | # for example, as those features are not the latest values 36 | # for a given id. 37 | # 38 | # The argument to cron can use the Chalk duration type, 39 | # or take a crontab-formatted string: 40 | # e.g.: @online(cron="*/5 * * * *") 41 | 42 | 43 | @online(cron="30d") 44 | def get_credit_score(name: User.name, email: User.email) -> User.credit_score: 45 | return requests.get("https://experian.com").json()["score"] 46 | -------------------------------------------------------------------------------- /05_feature_discovery/2_owners.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | from chalk import owner 4 | from chalk.features import feature, features 5 | 6 | 7 | @features 8 | class User: 9 | id: str 10 | # Owners are specified via code comments: 11 | # :owner: katherine.johnson@nasa.gov 12 | name: str 13 | 14 | 15 | @features 16 | class User1: 17 | id: str 18 | # or explicitly with `feature(owner=...)`: 19 | name: str = feature(owner="katherine.johnson@nasa.gov") 20 | 21 | 22 | # Setting an owner through the `@features` decorator 23 | # determines the owner of all the features on the class 24 | @features(owner="katherine.johnson@nasa.gov") 25 | class User2: 26 | id: str # assigned the owner katherine.johnson@nasa.gov 27 | name: str # assigned the owner katherine.johnson@nasa.gov 28 | 29 | 30 | # Owners on features are more specific than owners 31 | # set via the `@features` decorator.
32 | @features(owner="katherine.johnson@nasa.gov") 33 | class User3: 34 | # Katherine is the owner of the id and dob feature, 35 | # because she is the owner set in the `@features` decorator 36 | id: str 37 | dob: date 38 | 39 | # Annie is the owner of this feature because she is set 40 | # as the owner at the feature level, which is more specific 41 | # than the owner from the feature class 42 | # :owner: annie.easley@nasa.gov 43 | name: str 44 | 45 | 46 | # The function `chalk.features.owner(...)` returns the owner of a feature 47 | assert owner(User3.name) == "annie.easley@nasa.gov" 48 | assert owner(User3.id) == "katherine.johnson@nasa.gov" 49 | -------------------------------------------------------------------------------- /marketplace/item_category/item_category_value_enum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ItemCategoryValueEnum(str, Enum): 5 | ARTS_AND_PHOTOGRAPHY = "Arts & Photography" 6 | BIOGRAPHIES_AND_MEMOIRS = "Biographies & Memoirs" 7 | BUSINESS_AND_MONEY = "Business & Money" 8 | CHILDRENS_BOOKS = "Children's Books" 9 | COMICS_AND_GRAPHIC_NOVELS = "Comics & Graphic Novels" 10 | COMPUTERS_AND_TECHNOLOGY = "Computers & Technology" 11 | COOKBOOKS_FOOD_AND_WINE = "Cookbooks, Food & Wine" 12 | CRAFTS_HOBBIES_AND_HOME = "Crafts, Hobbies & Home" 13 | EDUCATION_AND_TEACHING = "Education & Teaching" 14 | ENGINEERING_AND_TRANSPORTATION = "Engineering & Transportation" 15 | HEALTH_FITNESS_AND_DIETING = "Health, Fitness & Dieting" 16 | HISTORY = "History" 17 | HUMOR_AND_ENTERTAINMENT = "Humor & Entertainment" 18 | LAW = "Law" 19 | LITERATURE_AND_FICTION = "Literature & Fiction" 20 | MEDICAL_BOOKS = "Medical Books" 21 | MYSTERY_THRILLER_AND_SUSPENSE = "Mystery, Thriller & Suspense" 22 | PARENTING_AND_RELATIONSHIPS = "Parenting & Relationships" 23 | POLITICS_AND_SOCIAL_SCIENCES = "Politics & Social Sciences" 24 | REFERENCE = "Reference" 25 | RELIGION_AND_SPIRITUALITY = "Religion & Spirituality" 26 | ROMANCE = "Romance" 27 | SCIENCE_AND_MATH = "Science & Math" 28 | SCIENCE_FICTION_AND_FANTASY = "Science Fiction & Fantasy" 29 | SELF_HELP = "Self-Help" 30 | SPORTS_AND_OUTDOORS = "Sports & Outdoors" 31 | TEEN_AND_YOUNG_ADULT = "Teen & Young Adult" 32 | TEST_PREPARATION = "Test Preparation" 33 | TRAVEL = "Travel" 34 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/tests/test_denylisted.py: -------------------------------------------------------------------------------- 1 | from chalk.client import ChalkClient 2 | from src.models import Transaction, User 3 | 4 | 5 | def test_email_features(client: ChalkClient): 6 | client.check( 7 | input={ 8 | User.id: 1, 9 | User.email: "monica.1984+123@gmail.com", 10 | User.name: "Monica Geller", 11 | }, 12 | assertions={ 13 | User.email_username: "monica1984", 14 | User.domain_name: "gmail.com", 15 | User.name_email_match_score: 39.89, 16 | }, 17 | ) 18 | """ 19 | Chalk Feature Value Check Table 20 | ┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓ 21 | ┃ Kind ┃ Name ┃ Value ┃ 22 | ┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩ 23 | │ Match │ user.domain_name │ gmail.com │ 24 | │ Match │ user.email_username │ monica1984 │ 25 | │ Expect │ user.name_email_match_score │ 39.89 │ 26 | │ Actual │ user.name_email_match_score │ 62.5 │ 27 | └────────┴─────────────────────────────┴────────────┘ 28 | """ 29 | 30 | 31 | def test_transactions(client: ChalkClient): 32 | client.check( 33 | 
input={ 34 | User.id: 1, 35 | User.transactions: [ 36 | Transaction(id=1, amount=110.0), 37 | Transaction(id=2, amount=900.0), 38 | Transaction(id=3, amount=300.0), 39 | ], 40 | }, 41 | assertions={ 42 | User.total_spend: 1310.0, 43 | }, 44 | ) 45 | -------------------------------------------------------------------------------- /04_scheduling/2_filtered_cron.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import requests 4 | 5 | from chalk import Cron, Now, online 6 | from chalk.features import feature, features 7 | 8 | 9 | @features 10 | class User: 11 | id: int 12 | name: str 13 | email: str 14 | status: str 15 | last_login: datetime 16 | fico_score: int = feature(max_staleness="30d") 17 | 18 | 19 | # Filter functions can take in any features as arguments, and must 20 | # output True or False to indicate whether to consider a given entity 21 | # in a scheduled run 22 | def only_active_filter( 23 | last_login: User.last_login, status: User.status, now: Now 24 | ) -> bool: 25 | return status == "active" and last_login > (now - timedelta(days=30)) 26 | 27 | 28 | # You may want to run your cron jobs only on a subset of your userbase. 29 | # Imagine, for example, that you wanted to regularly pull credit scores 30 | # for only users who had logged in within the last 30 days. 31 | # 32 | # To do that, pass the keyword argument `cron` an instance of `Cron`, 33 | # and provide a filter function. The filter function should take as arguments 34 | # any feature values that it needs to output a boolean answer for whether 35 | # an entity should be considered for scheduled runs. 36 | # 37 | # Note that in this example, our active filter depends on two features 38 | # that are not part of our resolver's arguments. 39 | @online(cron=Cron(schedule="29d 11h", filter=only_active_filter)) 40 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 41 | return requests.get("https://experian.com").json()["score"] 42 | -------------------------------------------------------------------------------- /fraud/5_account_takeover.py: -------------------------------------------------------------------------------- 1 | """An example of using streams and windowed features to calculate 2 | the number of failed logins for a user, at different time slices. 3 | """ 4 | 5 | from enum import Enum 6 | from pydantic import BaseModel 7 | 8 | from chalk.features import ( 9 | DataFrame, 10 | features, 11 | ) 12 | from chalk.streams import KafkaSource, stream, windowed, Windowed 13 | 14 | 15 | @features 16 | class User: 17 | id: str 18 | 19 | # Chalk makes it easy to calculate time-windowed features; 20 | # below we calculate the number of failed logins in the 21 | # past 10 minutes, 30 minutes, and 1 hour. 22 | failed_logins: Windowed[int] = windowed("10m", "30m", "1h") 23 | 24 | 25 | # Set up a stream source; this can be configured from your Chalk 26 | # dashboard, in the Data Sources tab.
27 | source = KafkaSource(name="sensor_stream") 28 | 29 | 30 | class LoginStatus(Enum): 31 | success = "success" 32 | failed = "failed" 33 | 34 | 35 | class LoginMessage(BaseModel): 36 | user_id: str 37 | status: LoginStatus 38 | 39 | 40 | @stream(source=source, mode="continuous") 41 | def agg_logins(df: DataFrame[LoginMessage]) -> DataFrame[User]: 42 | # If a resolver's annotations say it takes and returns a DataFrame, 43 | # but the function body actually returns a string, Chalk treats that 44 | # string as a SQL query and executes it against the passed-in DataFrame. 45 | return f""" 46 | select 47 | count(*) as failed_logins, 48 | user_id as id 49 | from {df} 50 | where status = 'failed' 51 | group by id 52 | """ 53 | -------------------------------------------------------------------------------- /03_caching/1_basic_caching.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from chalk import realtime 4 | from chalk.client import ChalkClient 5 | from chalk.features import feature, features 6 | 7 | 8 | @features 9 | class User: 10 | id: int 11 | name: str 12 | 13 | # Here, we can specify the default maximum staleness 14 | # that we'll tolerate for a feature. 15 | # You can also override this setting when you go to fetch 16 | # the feature! See 4_override_max_staleness.py for more info 17 | fico_score: int = feature(max_staleness="30d") 18 | 19 | 20 | # This function is both slow and expensive to run, 21 | # but because we're caching the `User.fico_score` 22 | # feature, it won't run every time we need the feature! 23 | @realtime 24 | def get_fico_score(name: User.name) -> User.fico_score: 25 | response = requests.get( 26 | "https://experian.com/api/score", 27 | json={"name": name}, 28 | ).json() 29 | return response["fico"] 30 | 31 | 32 | if __name__ == "__main__": 33 | # The first time that we run this query, 34 | # `get_fico_score` will call out to Experian, 35 | # because the FICO score is not available. 36 | ChalkClient().query( 37 | input={User.name: "Katherine Johnson"}, 38 | output=[User.fico_score], 39 | ) 40 | 41 | # The second time that we run this query with 42 | # the same name, however, `get_fico_score` will 43 | # NOT call out to Experian, because we have computed 44 | # the FICO score for this user in the last 30 days. 45 | ChalkClient().query( 46 | input={User.name: "Katherine Johnson"}, 47 | output=[User.fico_score], 48 | ) 49 | -------------------------------------------------------------------------------- /05_feature_discovery/README.md: -------------------------------------------------------------------------------- 1 | # Feature Discovery 2 | Capture metadata to inform alerting, monitoring, and discovery. 3 | 4 | https://docs.chalk.ai/docs/feature-discovery 5 | 6 | ## 1. Descriptions 7 | Describe features at a feature class or feature level. 8 | 9 | **[1_descriptions.py](1_descriptions.py)** 10 | 11 | ```python 12 | @features 13 | class RocketLaunch: 14 | # Feature descriptions are parsed from your code! 15 | launched_at: datetime 16 | ``` 17 | https://docs.chalk.ai/docs/feature-discovery#description 18 | 19 | ## 2. Owners 20 | Assign owners to features for monitoring and alerting. 21 | 22 | **[2_owners.py](2_owners.py)** 23 | 24 | ```python 25 | @features(owner="default-owner@gmail.com") 26 | class RocketLaunch: 27 | # :owner: specific-owner@gmail.com 28 | launched_at: datetime 29 | ``` 30 | https://docs.chalk.ai/docs/feature-discovery#owner 31 | 32 | ## 3. Tags 33 | Tag related features.
34 | 35 | **[3_tags.py](3_tags.py)** 36 | 37 | ```python 38 | @features(tags="group:risk") 39 | class RiskReport: 40 | id: str 41 | risk_score: str 42 | # :tags: pii 43 | first_name: str 44 | ``` 45 | https://docs.chalk.ai/docs/feature-discovery#tags 46 | 47 | ## 4. Tags & Owners 48 | Assigning tags & owners to features. 49 | 50 | **[4_unified.py](4_unified.py)** 51 | 52 | ```python 53 | @features(owner="shuttle@nasa.gov", tags="group:rocketry") 54 | class SpaceShuttle: 55 | # The volume of this shuttle in square meters. 56 | # :owner: architecture@nasa.gov 57 | # :tags: zillow-fact, size 58 | volume: str 59 | 60 | assert tags(SpaceShuttle.volume) == ["zillow-fact", "size", "group:rocketry"] 61 | ``` 62 | -------------------------------------------------------------------------------- /09_github_actions/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions 2 | Deploy feature pipelines in GitHub Actions. 3 | 4 | Docs: https://docs.chalk.ai/docs/github-actions 5 | 6 | CLI Step: https://github.com/chalk-ai/cli-action 7 | 8 | Deploy Step: https://github.com/chalk-ai/deploy-action 9 | 10 | ## 1. Install Chalk CLI 11 | Install the Chalk CLI in a GitHub Action. 12 | 13 | **[1_install_chalk_cli.yaml](1_install_chalk_cli.yaml)** 14 | 15 | ```yaml 16 | - uses: chalk-ai/cli-action@v2 17 | with: 18 | client-id: ${{secrets.CHALK_CLIENT_ID}} 19 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 20 | ``` 21 | Docs: https://docs.chalk.ai/docs/github-actions 22 | 23 | Step: https://github.com/chalk-ai/cli-action 24 | 25 | ## 2. Deploy with Chalk 26 | Deploy to Chalk (either as a preview deployment or to production). 27 | 28 | **[2_deploy_with_chalk.yaml](2_deploy_with_chalk.yaml)** 29 | 30 | ```yaml 31 | - uses: chalk-ai/deploy-action@v2 32 | with: 33 | client-id: ${{secrets.CHALK_CLIENT_ID}} 34 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 35 | await: true 36 | ``` 37 | Docs: https://docs.chalk.ai/docs/github-actions 38 | 39 | Step: https://github.com/chalk-ai/deploy-action 40 | 41 | ## 3. Preview deployments 42 | Set up preview deployments for all PRs. 43 | 44 | **[3_deploy_preview.yaml](3_deploy_preview.yaml)** 45 | 46 | ```yaml 47 | - uses: chalk-ai/deploy-action@v2 48 | with: 49 | client-id: ${{secrets.CHALK_CLIENT_ID}} 50 | client-secret: ${{secrets.CHALK_CLIENT_SECRET}} 51 | await: true 52 | no-promote: true 53 | ``` 54 | Docs: https://docs.chalk.ai/docs/github-actions 55 | 56 | Step: https://github.com/chalk-ai/deploy-action 57 | -------------------------------------------------------------------------------- /full_examples/batch_ml/tests/test_batch_prediction.py: -------------------------------------------------------------------------------- 1 | from chalk import DataFrame 2 | from datetime import datetime, timedelta 3 | from src.resolvers.fraud_model import run_fraud_model 4 | from src.models import User, Transaction 5 | 6 | # Chalk provides a simple interface for unit tests that works with 7 | # pytest or any other python testing framework: https://docs.chalk.ai/docs/unit-tests 8 | # since chalk resolvers are just python functions, you can test them 9 | # just like you'd unit test any other python function. 
10 | 11 | 12 | def test_fraud_model(): 13 | # call the python resolver and assert the result 14 | input = DataFrame( 15 | { 16 | Transaction.id: [1, 2, 3, 4], 17 | Transaction.amount: [10, 100, 50, 200], 18 | Transaction.user.time_since_last_transaction: [ 19 | timedelta(days=30).total_seconds(), 20 | timedelta(days=10).total_seconds(), 21 | timedelta(days=5).total_seconds(), 22 | timedelta(days=60).total_seconds(), 23 | ], 24 | Transaction.user.num_transactions["1d"]: [1, 0, 2, 0], 25 | Transaction.user.num_transactions["10d"]: [5, 2, 6, 0], 26 | Transaction.user.num_transactions["30d"]: [10, 4, 12, 1], 27 | Transaction.user.num_distinct_merchants_transacted["1d"]: [1, 0, 2, 0], 28 | Transaction.user.num_distinct_merchants_transacted["10d"]: [2, 1, 3, 0], 29 | Transaction.user.num_distinct_merchants_transacted["30d"]: [3, 2, 4, 1], 30 | } 31 | ) 32 | result = run_fraud_model(input) 33 | assert isinstance(result, DataFrame) 34 | assert len(result) == 4 35 | -------------------------------------------------------------------------------- /full_examples/fraud_transactions_with_llm/src/experian/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import date 3 | 4 | from chalk import DataFrame 5 | from src.models import User 6 | 7 | 8 | class ExperianClient: 9 | def __init__(self, api_key: str): 10 | self.api_key = api_key 11 | 12 | def fetch_credit_report( 13 | self, 14 | name: str, 15 | dob: date, 16 | ): 17 | return DataFrame( 18 | { 19 | User.credit_report.id: [123], 20 | User.credit_report.raw: [ 21 | json.dumps( 22 | { 23 | "Tradelines": [ 24 | { 25 | "Id": 1, 26 | "OpenDate": "2021-01-01", 27 | "Balance": 7203.40, 28 | "Amount": 10000.0, 29 | "AmountPastDue": 0.0, 30 | "PaymentAmount": 200.0, 31 | }, 32 | { 33 | "Id": 2, 34 | "OpenDate": "2021-01-01", 35 | "Balance": 7203.40, 36 | "Amount": 10000.0, 37 | "AmountPastDue": 0.0, 38 | "PaymentAmount": 200.0, 39 | }, 40 | ], 41 | } 42 | ) 43 | ], 44 | } 45 | ) 46 | 47 | -------------------------------------------------------------------------------- /call_recordings/features/fathom/fathom_meeting_insights_sales.py: -------------------------------------------------------------------------------- 1 | # trunk-ignore-all(ruff/W291) 2 | from __future__ import annotations 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | 7 | class StructuredOutputCallInsights(BaseModel): 8 | """Minimal schema matching the simple JSON format in the prompt. 9 | Extend it if you need richer analytics and UI fields. 10 | """ 11 | 12 | reasons_for_meeting: str = Field( 13 | ..., 14 | description="""Plain-text reasons. Return "None" if there wasn't anything mentioned.""", 15 | ) 16 | risk_flag: str = Field( 17 | ..., 18 | description="""One-sentence risk summaries. Return "None" if there wasn't anything mentioned.""", 19 | ) 20 | 21 | 22 | def prompt_meeting_insights_sales_user() -> str: 23 | return """ 24 | You are analyzing a transcript of a B2B sales call between a representative from Chalk, a data infrastructure platform, and a prospective customer. 25 | Read the entire transcript carefully and extract the following insights in JSON format: 26 | 27 | 1. **Reasons for Meeting**: The stated reasons the prospect gave for agreeing to take the call or meet with Chalk. If none are stated explicitly, write `"None"`. 28 | 2. **Risk Flag**: Any risks raised by the prospect related to security, legal, or project timeline. If mentioned, summarize the risk in one sentence per item.
If none, write `"None"`. 29 | 30 | **Output format**: 31 | ```json 32 | { 33 | "reasons_for_meeting": "string" or "None", 34 | "risk_flag": "string" or "None" 35 | } 36 | ``` 37 | THE TRANSCRIPT TO ANALYZE: 38 | 39 | {{ FathomCall.transcript }} 40 | """ 41 | -------------------------------------------------------------------------------- /unstructured_data/src/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import date 3 | 4 | from chalk import DataFrame, FeatureTime, Windowed, _, feature, windowed 5 | from chalk.features import features 6 | 7 | default_completion = json.dumps( 8 | dict( 9 | category="unknown", 10 | is_nsf=False, 11 | is_ach=False, 12 | clean_memo="", 13 | ) 14 | ) 15 | 16 | 17 | @features 18 | class Transaction: 19 | id: int 20 | amount: float 21 | memo: str 22 | 23 | # :tags: genai 24 | clean_memo: str 25 | 26 | # The User.id type defines our join key implicitly 27 | user_id: "User.id" 28 | user: "User" 29 | 30 | # The time at which the transaction was created for temporal consistency 31 | at: FeatureTime 32 | 33 | completion: str = feature(max_staleness="infinity", default=default_completion) 34 | 35 | category: str = "unknown" 36 | is_nsf: bool = False 37 | is_ach: bool = False 38 | 39 | 40 | @features 41 | class User: 42 | # Features pulled from Postgres for the user 43 | id: int 44 | email: str 45 | name: str 46 | dob: date 47 | 48 | # Whether the user appears in a denylist in s3 49 | denylisted: bool 50 | 51 | # The transactions, linked by the User.id type on the Transaction.user_id field 52 | transactions: DataFrame[Transaction] 53 | 54 | # The number of payments made by the user in the last 1, 7, and 30 days 55 | # Uses the category pulled from Gemini to count payments 56 | count_payments: Windowed[int] = windowed( 57 | "1d", "7d", "30d", 58 | expression=_.transactions[ 59 | _.amount, 60 | _.at >= _.chalk_window, 61 | _.category == "payment" 62 | ].count(), 63 | ) 64 | -------------------------------------------------------------------------------- /04_scheduling/README.md: -------------------------------------------------------------------------------- 1 | # Scheduling 2 | Run resolvers on a schedule, sampling values 3 | for the inputs. 4 | 5 | https://docs.chalk.ai/docs/resolver-cron 6 | 7 | ## 1. Cron 8 | Run resolvers on a schedule with all possible arguments. 9 | 10 | **[1_cron.py](1_cron.py)** 11 | 12 | ```python 13 | @realtime(cron="30d") 14 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 15 | return ... 16 | ``` 17 | https://docs.chalk.ai/docs/resolver-cron 18 | 19 | ## 2. Filtered Cron 20 | Run resolvers on a schedule and filter down which examples to consider. 21 | 22 | **[2_filtered_cron.py](2_filtered_cron.py)** 23 | 24 | ```python 25 | def only_active_filter(last_login: User.last_login, status: User.status) -> bool: 26 | return status == "active" and last_login > datetime.now() - timedelta(days=30) 27 | 28 | @realtime(cron=Cron(schedule="29d 11h", filter=only_active_filter)) 29 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 30 | return requests.get("https://experian.com").json()["score"] 31 | ``` 32 | https://docs.chalk.ai/docs/resolver-cron#filtering-examples 33 | 34 | ## 3. Sampling Cron 35 | Pick exactly the examples that you’d like to run.
36 | 37 | **[3_sample_arguments.py](3_sample_arguments.py)** 38 | 39 | ```python 40 | def get_active_users() -> DataFrame[User.id]: 41 | return session.query_string( 42 | "select users.id from users where users.active = true", 43 | fields={"id": User.id}, 44 | ).all() 45 | 46 | @realtime(cron=Cron(schedule="29d 11h", sample=get_active_users)) 47 | def get_fico_score(name: User.name, email: User.email) -> User.fico_score: 48 | return requests.get("https://experian.com").json()["score"] 49 | ``` 50 | https://docs.chalk.ai/docs/resolver-cron#custom-examples 51 | -------------------------------------------------------------------------------- /02_resolvers/5_tagged_resolvers.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | 3 | from mocks import lexus_nexus 4 | 5 | from chalk import online 6 | from chalk.client import ChalkClient, OnlineQueryContext 7 | from chalk.features import features 8 | 9 | 10 | @features 11 | class User: 12 | id: int 13 | email: str 14 | email_domain: str 15 | email_risk_score: float 16 | banned_email: bool 17 | 18 | 19 | # If a request for features is made under the tag 20 | # `mock`, then this resolver will run. 21 | @online(tags="mock") 22 | def mock_check_banned_email(domain: User.email_domain) -> User.banned_email: 23 | if domain == "chalk.ai": 24 | return False 25 | if domain == "fraudster.com": 26 | return True 27 | return random() < 0.1 28 | 29 | 30 | @online 31 | def get_email_risk_score(email: User.email) -> User.email_risk_score: 32 | return lexus_nexus.get_email_risk(email).risk_score 33 | 34 | 35 | # If a request for features is made _without_ the tag 36 | # `mock`, then this resolver will run. 37 | # 38 | # Note that the two resolvers that resolve the feature 39 | # User.banned_email require different features as input!
40 | @online 41 | def check_banned_email(score: User.email_risk_score) -> User.banned_email: 42 | return score >= 0.8 43 | 44 | 45 | if __name__ == "__main__": 46 | result = ChalkClient().query( 47 | input={User.email: "katherine.johnson@nasa.gov"}, 48 | output=[User.banned_email], 49 | ) 50 | assert result.get_feature_value(User.banned_email) == False 51 | 52 | result = ChalkClient().query( 53 | input={User.email: "attacker@fraudster.com"}, 54 | output=[User.banned_email], 55 | context=OnlineQueryContext(tags=["mock"]), 56 | ) 57 | assert result.get_feature_value(User.banned_email) == True 58 | -------------------------------------------------------------------------------- /credit/3_bureau_api.py: -------------------------------------------------------------------------------- 1 | """An example of connecting Users to Credit Reports from a 2 | third-party API. 3 | 4 | In this example, we are getting Credit Reports for our 5 | users through a third-party API. This example shows how 6 | you can run arbitrary python code (and connect to third-party 7 | APIs) in a python resolver. 8 | """ 9 | 10 | import os 11 | import requests 12 | 13 | from chalk import online 14 | 15 | from chalk.features import features, has_many, DataFrame, Primary 16 | 17 | 18 | @features 19 | class CreditReport: 20 | # if a feature doesn't have an id field, the Primary key must be specified 21 | report_id: Primary[str] 22 | user_id: "User.id" 23 | # The raw report, which we'll save as a plain string 24 | # to parse and extract later.
25 | report: str 26 | 27 | 28 | @features 29 | class User: 30 | id: int 31 | first_name: str 32 | last_name: str 33 | # Adds the pii tag to the ssn feature (https://docs.chalk.ai/docs/feature-discovery#tags) 34 | # :tags: pii 35 | ssn: str 36 | city: str 37 | state: str 38 | credit_report: DataFrame[CreditReport] 39 | 40 | 41 | # Inject a secret through the Chalk dashboard (https://docs.chalk.ai/docs/env-vars) 42 | url = os.getenv("MY_VENDOR_URL") 43 | 44 | 45 | @online 46 | def get_credit_report( 47 | first_name: User.first_name, 48 | last_name: User.last_name, 49 | city: User.city, 50 | state: User.state, 51 | id: User.id, 52 | ) -> CreditReport: 53 | """ 54 | This resolver populates the credit report feature for a user by making a request to 55 | a third party API. 56 | """ 57 | res = requests.get( 58 | f"{url}/transunion/credit-report", 59 | json={ 60 | "firstName": first_name, 61 | "lastName": last_name, 62 | "city": city, 63 | "state": state, 64 | }, 65 | ).json() 66 | return CreditReport(user_id=id, report_id=res["pdfReportId"], report=res["data"]) 67 | -------------------------------------------------------------------------------- /06_dataframe/6_self_joins.py: -------------------------------------------------------------------------------- 1 | from chalk import online 2 | from chalk.features import DataFrame, features, has_many, has_one 3 | 4 | 5 | @features 6 | class SeriesLink: 7 | id: int 8 | books: "DataFrame[Book]" 9 | 10 | 11 | @features 12 | class Author: 13 | id: int 14 | name: str 15 | books: "DataFrame[Book]" 16 | 17 | 18 | @features 19 | class PrequelLink: 20 | id: int 21 | prequel_id: int 22 | book: "Book" = has_one(lambda: Book.id == PrequelLink.prequel_id) 23 | 24 | 25 | @features 26 | class Book: 27 | id: int 28 | title: str 29 | author_id: Author.id 30 | prequel_id: PrequelLink.id | None 31 | prequel: PrequelLink | None = has_one(lambda: Book.id == PrequelLink.prequel_id) 32 | series_id: SeriesLink.id | None 33 | series: SeriesLink = has_one(lambda: SeriesLink.id == Book.series_id) 34 | 35 | 36 | @online 37 | def get_books() -> DataFrame[Book]: 38 | return DataFrame( 39 | [ 40 | Book( 41 | id=1, 42 | title="To the Lighthouse", 43 | author_id=1, 44 | series_id=None, 45 | prequel_id=None, 46 | ), 47 | Book( 48 | id=2, 49 | title="The Fellowship of the Ring", 50 | series_id=1, 51 | author_id=2, 52 | prequel_id=None, 53 | ), 54 | Book(id=3, title="The Two Towers", series_id=1, author_id=2, prequel_id=2), 55 | Book( 56 | id=4, 57 | title="The Return of the King", 58 | series_id=1, 59 | author_id=2, 60 | prequel_id=3, 61 | ), 62 | ] 63 | ) 64 | 65 | 66 | @online 67 | def get_prequel_links() -> DataFrame[PrequelLink]: 68 | return DataFrame([PrequelLink(id=3, prequel_id=2), PrequelLink(id=4, prequel_id=3)]) 69 | 70 | 71 | @online 72 | def get_series_links() -> DataFrame[SeriesLink]: 73 | return DataFrame([SeriesLink(id=1)]) 74 | -------------------------------------------------------------------------------- /full_examples/batch_ml/src/models.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from chalk.features import features, DataFrame, _ 3 | from chalk import windowed, Windowed 4 | import chalk.functions as F 5 | 6 | 7 | @features 8 | class User: 9 | id: int 10 | name: str 11 | created_at: datetime 12 | transactions: "DataFrame[Transaction]" 13 | 14 | # The amount of time since the user was created in seconds. 
15 | time_since_creation: int = F.total_seconds(_.chalk_now - _.created_at) 16 | 17 | # The number of transactions the user has made in the last 1, 10, and 30 days. 18 | num_transactions: Windowed[int] = windowed( 19 | "1d", 20 | "10d", 21 | "30d", 22 | expression=_.transactions[_.ts < _.chalk_now, _.ts >= _.chalk_window].count(), 23 | ) 24 | 25 | # The latest transaction timestamp for the user, considering only transactions before now. 26 | latest_transaction_timestamp: datetime = _.transactions[ 27 | _.ts, _.ts < _.chalk_now 28 | ].max() 29 | 30 | # The time since the last transaction in seconds. 31 | time_since_last_transaction: int = F.total_seconds( 32 | _.chalk_now - _.latest_transaction_timestamp 33 | ) 34 | 35 | # The number of distinct merchants the user has transacted with in the last 36 | # 1, 10, and 30 days. 37 | num_distinct_merchants_transacted: Windowed[int] = windowed( 38 | "1d", 39 | "10d", 40 | "30d", 41 | expression=_.transactions[ 42 | _.merchant_id, _.ts < _.chalk_now, _.ts >= _.chalk_window 43 | ].approx_count_distinct(), 44 | ) 45 | 46 | # The churn prediction is a float between 0 and 1, where 1 means the user is 47 | # predicted to churn. 48 | churn_prediction: float 49 | 50 | 51 | @features 52 | class Transaction: 53 | id: int 54 | amount: float 55 | merchant_id: int 56 | ts: datetime 57 | user_id: User.id 58 | user: User 59 | category: str 60 | 61 | # model score predicted by a scheduled job 62 | is_fraud: bool 63 | -------------------------------------------------------------------------------- /full_examples/image_processing/src/resolvers.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from cairosvg import svg2png 6 | from chalk import online 7 | from chalk.features import DataFrame, Features 8 | from PIL import Image as PI 9 | 10 | from src.feature_sets import Image, Website 11 | 12 | 13 | @online 14 | def get_html(url: Website.url) -> Website.html: 15 | """Get the HTML of a website.""" 16 | res = requests.get(url) 17 | return res.content 18 | 19 | 20 | def process_url(image_src, host): 21 | if image_src.startswith("https://") or image_src.startswith("http://"): 22 | return image_src 23 | elif image_src.startswith("//"): 24 | return f"https:{image_src}" 25 | return f"https://{host}/{image_src.strip('/')}" 26 | 27 | 28 | @online 29 | def get_images( 30 | html: Website.html, website_url: Website.url, website_host: Website.host 31 | ) -> Website.images[Image.url, Image.source_url]: 32 | """Extract all images from the HTML of a website.""" 33 | soup = BeautifulSoup(html, "html.parser") 34 | 35 | return DataFrame( 36 | [ 37 | Image( 38 | url=process_url(it["src"], website_host), 39 | source_url=website_url, 40 | ) 41 | for it in soup.find_all("img") 42 | ] 43 | ) 44 | 45 | 46 | @online 47 | def get_image_bytes( 48 | image_url: Image.url, 49 | ) -> Image.image_bytes: 50 | """Get the image as bytes from the image's URL.""" 51 | res = requests.get(image_url) 52 | return res.content 53 | 54 | 55 | @online 56 | def get_image_shape( 57 | image_bytes: Image.image_bytes, image_type: Image.type 58 | ) -> Features[Image.x, Image.y]: 59 | """Read the image using pillow and get its dimensions""" 60 | pil_bytes = io.BytesIO() 61 | if image_type == "svg": 62 | svg2png(bytestring=image_bytes, write_to=pil_bytes) 63 | else: 64 | pil_bytes.write(image_bytes) 65 | x, y = PI.open(pil_bytes).size 66 | return Image(x=x, y=y) 67 | 
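68 | 69 | 70 | # Usage sketch (added for illustration; not part of the original pipeline). 71 | # It assumes a deployed Chalk environment, and the URL below is a placeholder. 72 | # Requesting `Website.images` runs `get_html`, then `get_images`, and then the 73 | # per-image resolvers above for each extracted image. 74 | if __name__ == "__main__": 75 | from chalk.client import ChalkClient 76 | 77 | result = ChalkClient().query( 78 | input={Website.url: "https://example.com"}, 79 | output=[Website.host, Website.images], 80 | ) 81 | print(result.get_feature_value(Website.images))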
-------------------------------------------------------------------------------- /ecommerce/2_interactions.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from chalk import online 4 | from chalk.features import DataFrame, FeatureTime, features, _, has_many 5 | 6 | 7 | @features 8 | class Seller: 9 | id: str 10 | categories: set[str] 11 | 12 | 13 | @features 14 | class User: 15 | id: str 16 | age: int 17 | favorite_categories: set[str] 18 | 19 | 20 | @features 21 | class UserSeller: 22 | id: str 23 | user_id: User.id 24 | user: User 25 | seller_id: Seller.id 26 | seller: Seller 27 | favorites_match: bool 28 | user_seller_score: int 29 | interactions: "DataFrame[Interaction]" = has_many( 30 | lambda: (UserSeller.user_id == Interaction.user_id) & (UserSeller.seller_id == Interaction.seller_id) 31 | ) 32 | number_of_interactions: int = _.interactions.count() 33 | 34 | 35 | class InteractionKind(Enum): 36 | LIKE = "LIKE" 37 | VIEW = "VIEW" 38 | PURCHASE = "PURCHASE" 39 | OTHER = "OTHER" 40 | 41 | @classmethod 42 | def _missing_(cls, _): 43 | return cls.OTHER 44 | 45 | 46 | @features 47 | class Interaction: 48 | id: str 49 | user_id: User.id 50 | user: User 51 | seller_id: Seller.id 52 | seller: Seller 53 | interaction_kind: InteractionKind 54 | on: FeatureTime 55 | 56 | 57 | @online 58 | def get_similarity( 59 | fc: UserSeller.user.favorite_categories, fc2: UserSeller.seller.categories 60 | ) -> UserSeller.favorites_match: 61 | return len(fc & fc2) > 0 62 | 63 | 64 | if __name__ == "__main__": 65 | from chalk.client import ChalkClient 66 | 67 | client = ChalkClient() 68 | user_stores = client.query( 69 | input=[ 70 | UserSeller(user_id="1", seller_id="456"), 71 | UserSeller(user_id="2", seller_id="457"), 72 | UserSeller(user_id="3", seller_id="460"), 73 | ], 74 | output=[ 75 | UserSeller.user.id, 76 | UserSeller.seller.id, 77 | UserSeller.favorites_match, 78 | UserSeller.number_of_interactions, 79 | ], 80 | ) 81 | print(user_stores) 82 | -------------------------------------------------------------------------------- /credit/README.md: -------------------------------------------------------------------------------- 1 | # Credit 2 | 3 | Chalk can help you build insight into the financial transactions 4 | of your users. 5 | 6 | ## 1. Income 7 | 8 | Compute income from Plaid transactions. 9 | 10 | **[1_income.py](1_income.py)** 11 | 12 | ```python 13 | @realtime 14 | def get_plaid_income( 15 | txns: User.transactions[ 16 | PlaidTransaction.is_payroll is True, 17 | after(days_ago=30), 18 | ], 19 | ) -> User.computed_income_30: 20 | return txns[PlaidTransaction.amount].sum() 21 | ``` 22 | 23 | https://docs.chalk.ai/docs/features 24 | 25 | ## 2. Multiple Accounts 26 | 27 | Identify users with multiple accounts. 28 | 29 | **[2_accounts.py](2_accounts.py)** 30 | 31 | ```python 32 | @features 33 | class Account: 34 | id: int 35 | user_id: "User.id" 36 | user: "User" 37 | 38 | @features 39 | class User: 40 | id: int 41 | accounts: DataFrame[Account] 42 | ``` 43 | 44 | https://docs.chalk.ai/docs/has-many 45 | 46 | ## 3. Credit Bureau API 47 | 48 | Integrate data from credit bureaus like Transunion.
49 | 50 | **[3_bureau_api.py](3_bureau_api.py)** 51 | 52 | ```python 53 | @realtime 54 | def get_credit_report( 55 | first_name: User.first_name, 56 | last_name: User.last_name, 57 | city: User.city, 58 | state: User.state, 59 | id: User.id, 60 | ) -> CreditReport: 61 | res = requests.post( 62 | f"{url}/transunion/credit-report", 63 | json={ 64 | "firstName": first_name, 65 | "lastName": last_name, 66 | "city": city, 67 | "state": state, 68 | }, 69 | ).json() 70 | return CreditReport(user_id=id, report_id=res["pdfReportId"], report=res["data"]) 71 | ``` 72 | 73 | https://docs.chalk.ai/docs/resolver-overview 74 | 75 | ## 4. Aggregate Tradelines 76 | 77 | Aggregate user statistics across tradelines. 78 | 79 | **[4_aggregate_tradelines.py](4_aggregate_tradelines.py)** 80 | 81 | ```python 82 | @realtime 83 | def tradeline_rollup( 84 | accounts: User.tradelines[ 85 | Tradeline.is_delinquent is True 86 | ] 87 | ) -> User.delinquent_amount: 88 | return accounts[Tradeline.outstanding].sum() 89 | ``` 90 | 91 | https://docs.chalk.ai/docs/resolver-overview 92 | -------------------------------------------------------------------------------- /unstructured_data/src/resolvers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import textwrap 3 | 4 | import google.generativeai as genai 5 | from chalk import online 6 | from chalk.features import Features, before_all 7 | from src.denylist import Denylist 8 | 9 | from src.models import Transaction, User 10 | 11 | model = genai.GenerativeModel(model_name="models/gemini-1.5-flash-latest") 12 | 13 | 14 | @online 15 | async def get_transaction_classification(memo: Transaction.memo) -> Transaction.completion: 16 | """Here, we pull out the raw response from calling out to Gemini. 17 | The feature `Transaction.completion` is a string, but we can use the 18 | `get_structured_outputs` function to convert it to a structured output. 19 | 20 | Transaction.completion has max-staleness of infinity, so we won't need 21 | to recompute the completion, but can still iterate on how we parse it. 22 | """ 23 | 24 | return model.generate_content( 25 | textwrap.dedent( 26 | f"""\ 27 | Please return JSON for classifying a financial transaction 28 | using the following schema. 29 | 30 | {{"category": str, "is_nsf": bool, "clean_memo": str, "is_ach": bool}} 31 | 32 | All fields are required. Return EXACTLY one JSON object with NO other text. 
33 | Memo: {memo}""" 34 | ), 35 | generation_config={"response_mime_type": "application/json"}, 36 | ).candidates[0].content.parts[0].text 37 | 38 | 39 | @online 40 | def get_structured_outputs(completion: Transaction.completion) -> Features[ 41 | Transaction.category, 42 | Transaction.is_nsf, 43 | Transaction.is_ach, 44 | Transaction.clean_memo, 45 | ]: 46 | """Given the completion, we parse it into a structured output.""" 47 | body = json.loads(completion) 48 | return Transaction( 49 | category=body["category"], 50 | is_nsf=body["is_nsf"], 51 | is_ach=body["is_ach"], 52 | clean_memo=body["clean_memo"], 53 | ) 54 | 55 | 56 | denylist = Denylist(source="gs://socure-data/denylist.csv") 57 | 58 | 59 | @before_all 60 | def init_denylist(): 61 | denylist.load() 62 | 63 | 64 | @online 65 | def email_in_denylist(email: User.email) -> User.denylisted: 66 | return email in denylist 67 | -------------------------------------------------------------------------------- /full_examples/sagemaker/steps/dataset.py: -------------------------------------------------------------------------------- 1 | from sagemaker.workflow.function_step import step 2 | 3 | 4 | TRAINING_FEATURES = [ 5 | "transaction.amt", 6 | "transaction.customer.age", 7 | "transaction.customer.income", 8 | "transaction.customer.fico", 9 | "transaction.customer.transaction_sum_30m", 10 | "transaction.customer.transaction_sum_1h", 11 | "transaction.confirmed_fraud" 12 | ] 13 | 14 | TARGET_FEATURE = "transaction.confirmed_fraud" 15 | 16 | 17 | @step( 18 | name="create_dataset", 19 | instance_type='ml.t3.medium', 20 | keep_alive_period_in_seconds=300, 21 | ) 22 | def create_dataset(test_size, run_bucket): 23 | from chalk.client import ChalkClient 24 | from sklearn.model_selection import train_test_split 25 | 26 | # a Chalk client id & client secret for a token with permission to create datasets 27 | # should be added to the sagemaker environment—these are passed automatically to the 28 | # ChalkClient but can also be explicitly passed as arguments. 29 | 30 | chalk_dataset = ChalkClient( 31 | # client_id=os.environ['CHALK_CLIENT_ID'], # automatically loaded by the Chalk Client if in the environment 32 | # client_secret=os.environ['CHALK_CLIENT_SECRET'] # automatically loaded by the Chalk Client if in the environment 33 | ).offline_query( 34 | max_samples=100_000, # reads 100,000 samples from the Chalk dataset 35 | output=TRAINING_FEATURES, 36 | dataset_name="transactions_fraud_model", 37 | ) 38 | dataset = chalk_dataset.to_pandas() 39 | 40 | X_train, X_test, y_train, y_test = train_test_split( 41 | dataset.drop(columns=[TARGET_FEATURE]), # X 42 | dataset[TARGET_FEATURE], # y 43 | test_size=test_size 44 | ) 45 | 46 | xtrain_path = f"{run_bucket}/input/X_train.parquet" 47 | xtest_path = f"{run_bucket}/input/X_test.parquet" 48 | ytrain_path = f"{run_bucket}/input/y_train.parquet" 49 | ytest_path = f"{run_bucket}/input/y_test.parquet" 50 | 51 | dataset.to_parquet(f"{run_bucket}/raw_data/data.parquet") 52 | X_train.to_parquet(xtrain_path) 53 | y_train.to_parquet(ytrain_path) 54 | X_test.to_parquet(xtest_path) 55 | y_test.to_parquet(ytest_path) 56 | return xtrain_path, xtest_path, ytrain_path, ytest_path 57 | -------------------------------------------------------------------------------- /predictive_maintenance/README.md: -------------------------------------------------------------------------------- 1 | # Predictive Maintenance 2 | 3 | Predicting device failure requires complex analysis executed 4 | against a variety of data sources. 
Chalk's platform allows 5 | data scientists to bring all the different data together, 6 | including streaming data from devices. 7 | 8 | ## 1. Device Data 9 | Easily listen to streaming data and parse messages with 10 | custom logic. 11 | 12 | **[1_device_data.py](1_device_data.py)** 13 | 14 | ```python 15 | @stream(source=source) 16 | def read_message(message: Message) -> Measurement: 17 | return Measurement( 18 | device_id=message.device_id, 19 | timestamp=message.timestamp, 20 | lat=message.data.latitude, 21 | long=message.data.longitude, 22 | voltage=message.data.voltage, 23 | temp=message.data.temperature, 24 | ) 25 | ``` 26 | https://docs.chalk.ai/docs/streams 27 | 28 | ## 2. Historical Data 29 | Access historical sensor data as-of any time in the past. 30 | 31 | **[2_time_query.py](2_time_query.py)** 32 | 33 | ```python 34 | ChalkClient.get_training_dataframe( 35 | input=labels[[Measurement.device_id]], 36 | input_times = [(datetime.now() - timedelta(days = 30)).isoformat()], 37 | output=[ 38 | Measurement.lat, 39 | Measurement.long, 40 | Measurement.temp 41 | ] 42 | ) 43 | ``` 44 | 45 | https://docs.chalk.ai/docs/temporal-consistency 46 | 47 | ## 3. Sensor Streams 48 | 49 | Compute streaming window aggregate functions 50 | on sensor data. 51 | 52 | **[3_keep_data_fresh.py](3_keep_data_fresh.py)** 53 | 54 | ```python 55 | @stream(source=source, mode="continuous") 56 | def process_measurements(df: DataFrame[Message]) -> DataFrame[Sensor]: 57 | return f""" 58 | select 59 | count(*) as count_failed, 60 | id as device_id 61 | from {df} 62 | where is_failing = TRUE 63 | group by id 64 | """ 65 | ``` 66 | 67 | https://docs.chalk.ai/docs/aggregations 68 | 69 | ## 4. Failing Sensors 70 | 71 | Combine batch, caching, and DataFrames to create a powerful 72 | predictive maintenance pipeline. 73 | 74 | **[4_customer_sensors.py](4_customer_sensors.py)** 75 | 76 | ```python 77 | @batch(cron="1h") 78 | def get_customers_needing_service( 79 | bad_sensors: Customer.sensors[ 80 | Sensor.is_failing is True, 81 | Sensor.id 82 | ] 83 | ) -> Customer.customer_needs_service: 84 | return len(bad_sensors) > 0 85 | ``` 86 | 87 | https://docs.chalk.ai/docs/feature-caching 88 | -------------------------------------------------------------------------------- /07_streaming/README.md: -------------------------------------------------------------------------------- 1 | # Streaming 2 | 3 | Chalk ships with a powerful streams module for computing 4 | features from a stream and computing window functions 5 | on streams. 6 | 7 | https://docs.chalk.ai/docs/streams 8 | 9 | https://docs.chalk.ai/docs/aggregations 10 | 11 | ## 1. Mapping Stream 12 | Create features directly from messages on a stream. 13 | 14 | **[1_mapping_stream.py](1_mapping_stream.py)** 15 | 16 | ```python 17 | @stream(source=src) 18 | def fn(message: UserUpdateBody) -> Features[User.uid, User.favorite_color]: 19 | return User( 20 | uid=message.value.user_id, 21 | favorite_color=message.value.favorite_color 22 | ) 23 | ``` 24 | 25 | https://docs.chalk.ai/docs/streams 26 | 27 | ## 2. Stream DataFrame 28 | 29 | Compute a streaming window aggregate function using [DataFrames](https://docs.chalk.ai/docs/dataframe).
30 | 31 | **[2_window_dataframe.py](2_window_dataframe.py)** 32 | 33 | ```python 34 | @stream(source=src) 35 | def failed_logins(events: DataFrame[LoginMessage]) -> Features[ 36 | User.id, 37 | User.num_failed_logins 38 | ]: 39 | return User( 40 | id=events["id"].max(), 41 | num_failed_logins=events["failed"].sum(), 42 | ) 43 | ``` 44 | 45 | https://docs.chalk.ai/docs/aggregations#using-dataframes 46 | 47 | ## 3. Stream SQL 48 | 49 | Compute a streaming window aggregate function using [SQL](https://docs.chalk.ai/docs/aggregations#using-sql). 50 | 51 | **[3_window_sql.py](3_window_sql.py)** 52 | 53 | ```python 54 | @stream(source=src) 55 | def failed_logins(events: DataFrame[LoginMessage]) -> DataFrame[ 56 | User.id, 57 | User.num_failed_logins 58 | ]: 59 | return f""" 60 | select 61 | user_id as id, 62 | count(*) as num_failed_logins 63 | from {events} 64 | where failed = 1 65 | group by 1 66 | """ 67 | ``` 68 | 69 | https://docs.chalk.ai/docs/aggregations#using-sql 70 | 71 | ## 4. Stream SQL Aggregation 72 | 73 | Compute a continuous aggregation over stream windows using [SQL](https://docs.chalk.ai/docs/aggregations#using-sql). 74 | 75 | **[4_continuous_aggregation.py](4_continuous_aggregation.py)** 76 | 77 | ```python 78 | @stream(source=src, mode='continuous', keys={"user_id": User.id}) 79 | def failed_logins(events: DataFrame[LoginMessage]) -> DataFrame[ 80 | User.id, 81 | User.distinct_ips 82 | ]: 83 | return f""" 84 | select 85 | user_id as id, 86 | approximate_count_distinct(ip_address) as distinct_ips 87 | from {events} 88 | """ 89 | ``` 90 | -------------------------------------------------------------------------------- /01_features/2_custom_feature_types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional 3 | 4 | import attrs 5 | from pydantic import BaseModel, constr 6 | 7 | from chalk.features import feature, features 8 | 9 | 10 | # A dataclass can be used as a feature (Book.jacket_info below) 11 | @dataclass 12 | class JacketInfo: 13 | title: str 14 | subtitle: str 15 | body: str 16 | 17 | 18 | # Pydantic classes can also be used as features (Book.title below) 19 | class TitleInfo(BaseModel): 20 | heading: constr(min_length=2) 21 | subheading: Optional[str] 22 | 23 | 24 | # attrs classes are also valid for feature types (Book.table_of_contents below) 25 | @attrs.define 26 | class TableOfContentsItem: 27 | foo: str 28 | bar: int 29 | 30 | 31 | @features 32 | class Book: 33 | id: int 34 | # You can use any `dataclass` as a struct feature. 35 | # Struct types should be used for objects that don't have ids. 36 | # If an object has an id, consider using has_one. 37 | jacket_info: JacketInfo 38 | 39 | # If you prefer `pydantic` to `dataclass`, you can use that instead. 40 | title: TitleInfo 41 | 42 | # Alternatively, you can use `attrs`. Any of these struct types 43 | # (`dataclass`, `pydantic`, and `attrs`) can be used with 44 | # `set[...]` or `list[...]`. 45 | table_of_contents: list[TableOfContentsItem] 46 | 47 | 48 | # Finally, if you have an object that you want to serialize that isn't 49 | # from `dataclass`, `attrs`, or `pydantic`, you can write a custom codec.
50 | 
51 | 
52 | # Consider the custom class below:
53 | class CustomStruct:
54 | def __init__(self, foo: str, bar: int) -> None:
55 | self.foo = foo
56 | self.bar = bar
57 | 
58 | def __eq__(self, other: object) -> bool:
59 | return (
60 | isinstance(other, CustomStruct)
61 | and self.foo == other.foo
62 | and self.bar == other.bar
63 | )
64 | 
65 | def __hash__(self) -> int:
66 | return hash((self.foo, self.bar))
67 | 
68 | 
69 | @dataclass
70 | class CustomStructDC:
71 | foo: str
72 | bar: int
73 | 
74 | 
75 | @features
76 | class Book:
77 | id: int
78 | jacket_info: JacketInfo
79 | title: TitleInfo
80 | table_of_contents: list[TableOfContentsItem]
81 | custom_field: CustomStruct = feature(
82 | # The encoder takes an instance of the custom type and outputs a serializable Python object
83 | encoder=lambda x: CustomStructDC(foo=x.foo, bar=x.bar),
84 | # The decoder takes the output of the encoder and recreates an instance of the custom type
85 | decoder=lambda x: CustomStruct(foo=x.foo, bar=x.bar),
86 | )
87 | 
-------------------------------------------------------------------------------- /marketplace/tests.py: --------------------------------------------------------------------------------
1 | import datetime as dt
2 | 
3 | import pytest
4 | from chalk.client import ChalkClient
5 | from chalk.features import DataFrame
6 | 
7 | from src.marketplace import (
8 | Review,
9 | User,
10 | )
11 | 
12 | 
13 | @pytest.fixture(scope="session")
14 | def client():
15 | return ChalkClient(branch=True)
16 | 
17 | 
18 | def test_user_aggregations(client: ChalkClient) -> None:
19 | now = dt.datetime.now()
20 | client.check(
21 | input={
22 | User.id: 1,
23 | User.reviews: DataFrame(
24 | [
25 | Review(
26 | id=1,
27 | star_rating=3,
28 | created_at=now - dt.timedelta(days=1),
29 | ),
30 | Review(
31 | id=2,
32 | star_rating=4,
33 | created_at=now - dt.timedelta(days=2),
34 | ),
35 | Review(
36 | id=3,
37 | star_rating=2,
38 | created_at=now - dt.timedelta(days=3),
39 | ),
40 | Review(
41 | id=4,
42 | star_rating=3,
43 | created_at=now - dt.timedelta(days=4),
44 | ),
45 | Review(
46 | id=5,
47 | star_rating=3,
48 | created_at=now - dt.timedelta(days=5),
49 | ),
50 | Review(
51 | id=6,
52 | star_rating=1,
53 | created_at=now - dt.timedelta(days=6),
54 | ),
55 | Review(
56 | id=7,
57 | star_rating=5,
58 | created_at=now - dt.timedelta(days=7),
59 | ),
60 | Review(
61 | id=8,
62 | star_rating=2,
63 | created_at=now - dt.timedelta(days=8),
64 | ),
65 | Review(
66 | id=9,
67 | star_rating=3,
68 | created_at=now - dt.timedelta(days=9),
69 | ),
70 | ],
71 | ),
72 | },
73 | assertions={
74 | User.review_count["3d"]: 3,
75 | User.review_count["7d"]: 7,
76 | User.average_rating_given["3d"]: 3,
77 | User.average_rating_given["7d"]: 3,
78 | },
79 | )
80 | 
-------------------------------------------------------------------------------- /call_recordings/features/fathom/fathom_message_webhook.py: --------------------------------------------------------------------------------
1 | import requests
2 | from chalk.features import online
3 | 
4 | from src.fathom.features.fathom_feature_set import FathomMessage
5 | 
6 | 
7 | @online
8 | def fathom_message_webhook(
9 | id: FathomMessage.id,
10 | recording_id: FathomMessage.recording_id,
11 | message_id: FathomMessage.message_id,
12 | url: FathomMessage.url,
13 | title: FathomMessage.title,
14 | date: FathomMessage.date,
15 | timestamp: FathomMessage.timestamp,
16 | speaker: FathomMessage.speaker,
17 | organization: FathomMessage.organization,
18 | message: FathomMessage.message,
19 | action_item:
FathomMessage.action_item,
20 | watch_link: FathomMessage.watch_link,
21 | speaker_changes: FathomMessage.call.speaker_changes,
22 | meeting_duration_ratio: FathomMessage.call.meeting_duration_ratio,
23 | attendee_count: FathomMessage.call.attendee_count,
24 | chalk_attendee_count: FathomMessage.call.chalk_email_count,
25 | customer_attendee_count: FathomMessage.call.customer_email_count,
26 | meeting_scheduled_duration: FathomMessage.call.meeting_scheduled_duration,
27 | recording_duration_in_minutes: FathomMessage.call.recording_duration_in_minutes,
28 | ai_meeting_type: FathomMessage.call.ai_call_type,
29 | ai_reasons_for_meeting: FathomMessage.call.ai_reasons_for_meeting,
30 | ai_risk_flags: FathomMessage.call.ai_risk_flag,
31 | ) -> FathomMessage.webhook_status_code:
32 | data = {
33 | "id": id,
34 | "url": url,
35 | "date": date.isoformat(),
36 | "title": title,
37 | "message": message,
38 | "speaker": speaker,
39 | "timestamp": timestamp,
40 | "message_id": message_id,
41 | "watch_link": watch_link,
42 | "action_item": action_item,
43 | "organization": organization,
44 | "recording_id": recording_id,
45 | "attendee_count": attendee_count,
46 | "chalk_attendee_count": chalk_attendee_count,
47 | "customer_attendee_count": customer_attendee_count,
48 | "meeting_scheduled_duration": meeting_scheduled_duration,
49 | "recording_duration_in_minutes": recording_duration_in_minutes,
50 | "speaker_changes": speaker_changes,
51 | "meeting_duration_ratio": meeting_duration_ratio,
52 | "ai_meeting_type": ai_meeting_type,
53 | "ai_reasons_for_meeting": ai_reasons_for_meeting,
54 | "ai_risk_flags": ai_risk_flags,
55 | }
56 | status_code = requests.post(
57 | headers={"Content-Type": "application/json"},
58 | url="url",  # placeholder for the destination webhook URL
59 | json=data,
60 | ).status_code
61 | if status_code == 200:
62 | return 200
63 | 
64 | return None
65 | 
-------------------------------------------------------------------------------- /fraud/1_return.py: --------------------------------------------------------------------------------
1 | """An example of calculating non-sufficient funds (NSF) amounts from
2 | a user's transactions.
3 | """
4 | 
5 | from chalk import online
6 | from chalk.features import features, DataFrame, FeatureTime
7 | 
8 | from datetime import datetime
9 | 
10 | 
11 | 
12 | @features
13 | class Transaction:
14 | id: int
15 | amount: float
16 | memo: str
17 | on: FeatureTime
18 | user_id: "User.id"
19 | user: "User"
20 | 
21 | # Computed properties
22 | clean_memo: str
23 | is_nsf: bool
24 | 
25 | 
26 | @features
27 | class User:
28 | id: int
29 | transactions: DataFrame[Transaction]
30 | 
31 | # Computed properties
32 | nsf_amount: float
33 | 
34 | 
35 | @online
36 | def get_clean_memo(memo: Transaction.memo) -> Transaction.clean_memo:
37 | computed = memo.lower()
38 | for prefix in ("sale", "pos", "tst", "sq"):
39 | computed = computed.removeprefix(prefix).strip()
40 | return computed
41 | 
42 | 
43 | @online
44 | def get_transaction_is_nsf(
45 | memo_clean: Transaction.clean_memo,
46 | ) -> Transaction.is_nsf:
47 | return "nsf" in memo_clean.lower()
48 | 
49 | 
50 | @online
51 | def get_nsf_amount(amounts: User.transactions[Transaction.is_nsf is True, Transaction.amount]) -> User.nsf_amount:
52 | """
53 | In this resolver, we calculate the total NSF amount for our users.
54 | """
55 | return amounts.sum()
56 | 
57 | 
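# A rough sketch of how the feature above might be queried online, assuming a
# configured Chalk client:
#
#     from chalk.client import ChalkClient
#
#     ChalkClient().query(
#         input={User.id: 1},
#         output=[User.nsf_amount],
#     )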
58 | # Below we generate a couple of dummy resolvers to make the example fully runnable without connected
59 | # datasources.
60 | 
61 | 
62 | @online
63 | def get_test_users() -> DataFrame[User.id]:
64 | return DataFrame([
65 | User(id=1),
66 | User(id=2),
67 | ])
68 | 
69 | 
70 | @online
71 | def get_test_transactions() -> (
72 | DataFrame[Transaction.id, Transaction.user_id, Transaction.amount, Transaction.memo, Transaction.on]
73 | ):
74 | return DataFrame([
75 | Transaction(id=1, user_id=1, amount=-277.0, memo="directdep", on=datetime(2014, 8, 12)),
76 | Transaction(id=2, user_id=1, amount=-10_001.0, memo="other", on=datetime(2014, 8, 12)),
77 | Transaction(id=3, user_id=1, amount=42.1, memo="test nsf", on=datetime(2014, 8, 12)),
78 | Transaction(id=4, user_id=1, amount=-1303.0, memo="paycheck", on=datetime(2014, 8, 12)),
79 | Transaction(id=5, user_id=1, amount=124.0, memo="test", on=datetime(2014, 8, 12)),
80 | Transaction(id=7, user_id=2, amount=2132.04, memo="undefined", on=datetime(2014, 8, 12)),
81 | Transaction(id=6, user_id=2, amount=-1.0, memo="sale nsf", on=datetime(2014, 8, 12)),
82 | Transaction(id=8, user_id=2, amount=-30.0, memo="tst nsf", on=datetime(2014, 8, 12)),
83 | Transaction(id=9, user_id=2, amount=-999.99, memo="payroll", on=datetime(2014, 8, 12)),
84 | ])
85 | 
-------------------------------------------------------------------------------- /06_dataframe/README.md: --------------------------------------------------------------------------------
1 | # DataFrames
2 | A Chalk DataFrame is a 2-dimensional data structure similar
3 | to `pandas.DataFrame`, but with richer types and
4 | underlying optimizations.
5 | 
6 | https://docs.chalk.ai/docs/dataframe
7 | 
8 | ## 1. Creating DataFrames
9 | Construct DataFrames from feature values.
10 | 
11 | **[1_creating_dataframes.py](1_creating_dataframes.py)**
12 | 
13 | ```python
14 | df = DataFrame()
15 | DataFrame.from_dict({
16 | User.id: [1, 2],
17 | User.email: ["elliot@chalk.ai", "samantha@chalk.ai"],
18 | })
19 | ```
20 | https://docs.chalk.ai/docs/dataframe
21 | 
22 | ## 2. Filters
23 | Filter the rows of a `DataFrame` by supplying conditions
24 | to the `__getitem__()` method.
25 | 
26 | **[2_filters.py](2_filters.py)**
27 | 
28 | ```python
29 | User.txns[
30 | Transaction.amount < 0,
31 | Transaction.merchant in {"uber", "lyft"} or Transaction.memo == "uberpmts",
32 | Transaction.canceled_at is None
33 | ]
34 | ```
35 | https://docs.chalk.ai/docs/dataframe
36 | 
37 | ## 3. Projections
38 | Scope down the set of columns available in a `DataFrame`.
39 | 
40 | **[3_projections.py](3_projections.py)**
41 | 
42 | ```python
43 | User.txns[
44 | Transaction.amount,
45 | Transaction.memo
46 | ]
47 | ```
48 | https://docs.chalk.ai/docs/dataframe
49 | 
50 | ## 4. Projections with Filters
51 | Compose projections and filters to create a new `DataFrame`.
52 | 
53 | **[4_filters_and_projections.py](4_filters_and_projections.py)**
54 | 
55 | ```python
56 | User.transactions[Transaction.amount > 100, Transaction.memo]
57 | ```
58 | 
59 | https://docs.chalk.ai/docs/dataframe#composing-projections-and-filters
60 | 
61 | ## 5. Aggregations
62 | 
63 | Compute aggregates over a `DataFrame`.
64 | 
65 | **[5_aggregations.py](5_aggregations.py)**
66 | 
67 | ```python
68 | User.transactions[Transaction.amount].sum()
69 | User.transactions[Transaction.amount].mean()
70 | User.transactions[Transaction.amount].count()
71 | User.transactions[Transaction.amount].max()
72 | ```
73 | https://docs.chalk.ai/docs/dataframe#aggregations
74 | 
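An aggregation like these can back a computed feature directly in a resolver; a minimal sketch (`User.total_spend` is a hypothetical feature):

```python
@online
def get_total_spend(
    amounts: User.transactions[Transaction.amount],
) -> User.total_spend:
    return amounts.sum()
```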
75 | ## 6. Self Joins
76 | 
77 | Join a feature set back to itself.
78 | 
79 | **[6_self_joins.py](6_self_joins.py)**
80 | 
81 | ```python
82 | @features
83 | class PrequelLink:
84 | id: int
85 | prequel_id: int
86 | book: "Book" = has_one(lambda: Book.id == PrequelLink.prequel_id)
87 | 
88 | 
89 | @features
90 | class Book:
91 | id: int
92 | title: str
93 | author_id: Author.id
94 | prequel_id: PrequelLink.id | None
95 | prequel: PrequelLink | None = has_one(lambda: Book.id == PrequelLink.prequel_id)
96 | series_id: SeriesLink.id | None
97 | series: SeriesLink = has_one(lambda: SeriesLink.id == Book.series_id)
98 | ```
99 | 
100 | 
-------------------------------------------------------------------------------- /marketplace/lancedb.py: --------------------------------------------------------------------------------
1 | # trunk-ignore-all(pyright/reportInvalidTypeForm,pyright/reportCallIssue,ruff/PLW0603,pyright/reportOptionalMemberAccess)
2 | import os
3 | from typing import TYPE_CHECKING
4 | 
5 | import lancedb
6 | from chalk.features import (
7 | DataFrame,
8 | before_all,
9 | online,
10 | )
11 | from chalk.logging import chalk_logger
12 | from lancedb.db import DBConnection
13 | 
14 | from src.marketplace.models import ItemDocument, ItemSearch
15 | 
16 | if TYPE_CHECKING:
17 | from lancedb.table import Table
18 | 
19 | db: DBConnection | None = None
20 | 
21 | DB_URI: str = "db://marketplace-x205j4"
22 | TABLE_NAME: str = "marketplace_product_descriptions"
23 | VECTOR_COLUMN_NAME: str = "embedding"
24 | REGION: str = "us-east-1"
25 | 
26 | 
27 | @before_all
28 | def init_client() -> None:
29 | global db
30 | lance_api_key: str | None = os.getenv("LANCEDB_API_KEY_MARKETPLACE")
31 | if lance_api_key is None:
32 | error_msg: str = "LANCEDB_API_KEY_MARKETPLACE is not set."
33 | chalk_logger.error(msg=error_msg)
34 | raise ValueError(error_msg)
35 | 
36 | db = lancedb.connect(
37 | uri=DB_URI,
38 | api_key=lance_api_key,
39 | region=REGION,
40 | )
41 | chalk_logger.info(
42 | msg="Initializing client: LanceDB",
43 | )
44 | 
45 | 
46 | @online
47 | def get_vector_search_results(
48 | vector: ItemSearch.vector,
49 | q: ItemSearch.q,
50 | query_type: ItemSearch.query_type,
51 | ) -> DataFrame[ItemDocument]:
52 | def execute_vector_search(vector, q: str) -> DataFrame[ItemDocument]:
53 | tbl: Table = db.open_table(
54 | name=TABLE_NAME,
55 | )
56 | results: list = (
57 | tbl.search(
58 | query=vector.to_pylist(),
59 | query_type="vector",
60 | vector_column_name=VECTOR_COLUMN_NAME,
61 | )
62 | .select(
63 | columns=[
64 | "hid",
65 | "title",
66 | # "description",
67 | ],
68 | )
69 | .limit(
70 | limit=30,
71 | )
72 | .to_list()
73 | )
74 | documents: list[ItemDocument] = [
75 | ItemDocument(
76 | query=q,
77 | id=result["hid"],
78 | distance=result["_distance"],
79 | query_type=query_type,
80 | title=result["title"],
81 | # description=result["description"],
82 | )
83 | for result in results
84 | ]
85 | return DataFrame(documents)
86 | 
87 | match query_type:
88 | case "vector":
89 | return execute_vector_search(vector=vector, q=q)
90 | 
91 | case _:
92 | raise ValueError(f"Unsupported query_type: {query_type}")
93 | 
-------------------------------------------------------------------------------- /marketplace/named_queries.py: --------------------------------------------------------------------------------
1 | from chalk.queries.named_query import NamedQuery
2 | 
3 | from . 
import Review 4 | 5 | NamedQuery( 6 | name="review", 7 | input=[Review.id], 8 | output=[ 9 | # Review features 10 | Review.review_body, 11 | Review.review_headline, 12 | Review.star_rating, 13 | Review.is_positive_review_inline, 14 | Review.is_positive_review_python_resolver, 15 | Review.is_positive_review_from_llm, 16 | Review.normalized_rating, 17 | Review.sentiment_from_llm, 18 | Review.reviewer_name, 19 | Review.created_at, 20 | # Product features 21 | Review.item.title, 22 | Review.item.genre_with_llm_from_title, 23 | Review.item.average_rating, 24 | Review.item.total_reviews, 25 | # User features 26 | Review.user.first_name, 27 | Review.user.last_name, 28 | Review.user.created_at, 29 | Review.user.username, 30 | Review.user.name_match_score, 31 | Review.user.top_genres, 32 | # Seller features 33 | Review.seller.name, 34 | Review.seller.zipcode, 35 | ], 36 | ) 37 | 38 | NamedQuery( 39 | name="review_dag", 40 | input=[Review.id], 41 | output=[ 42 | # Base features pulled from SQL 43 | Review.id, 44 | Review.created_at, 45 | Review.review_headline, 46 | Review.review_body, 47 | Review.star_rating, 48 | # Computed features 49 | Review.is_positive_review_inline, 50 | Review.is_positive_review_python_resolver, 51 | Review.is_positive_review_from_llm, 52 | Review.normalized_rating, 53 | # Product information 54 | Review.item_id, 55 | Review.item.title, 56 | Review.item.genre_with_llm_from_title, 57 | Review.item.genre_with_llm_from_title_confidence, 58 | Review.item.genre_with_llm_from_title_reasoning, 59 | Review.item.average_rating, 60 | Review.item.total_reviews, 61 | Review.item.review_count, 62 | Review.interaction.id, 63 | Review.interaction.created_at, 64 | Review.interaction.interaction_type, 65 | # User information 66 | Review.reviewer_name, 67 | Review.user.id, 68 | Review.user.first_name, 69 | Review.user.last_name, 70 | Review.user.username, 71 | Review.user.email, 72 | Review.user.birthday, 73 | Review.user.review_count, 74 | Review.user.average_rating_given, 75 | # Seller information 76 | Review.seller.id, 77 | Review.seller.created_at, 78 | Review.seller.name, 79 | Review.seller.zipcode, 80 | Review.seller.email, 81 | Review.seller.phone_number, 82 | # Sentiment analysis features 83 | Review.llm, 84 | Review.sentiment_from_llm, 85 | ], 86 | ) 87 | -------------------------------------------------------------------------------- /ecommerce/README.md: -------------------------------------------------------------------------------- 1 | # E-commerce 2 | 3 | Chalk can help you build realtime recommendation systems. 4 | 5 | This guide shows you how to: 6 | 1). Implement User and Seller features in Chalk, 7 | 2). Add an Interaction feature and connect it to users, 8 | 3). Stream Interaction data from a Kafka queue. 9 | 10 | In each section, you can find an `example_query.py` file. The file shows how the Chalk python client API can be used to 11 | get information on the affinity between a User and a Seller. 12 | 13 | ## 1. Query Users & Sellers 14 | 15 | Create Chalk features for Users and Sellers and evaluate whether a user and seller have matching categories. 
16 | 
17 | **[1_users_sellers.py](1_users_sellers.py)**
18 | 
19 | ```python
20 | from chalk.features import features
21 | 
22 | 
23 | @features
24 | class Seller:
25 | id: str
26 | categories: set[str]
27 | 
28 | 
29 | @features
30 | class User:
31 | id: str
32 | age: int
33 | favorite_categories: set[str]
34 | 
35 | @features
36 | class UserSeller:
37 | id: str
38 | user_id: User.id
39 | user: User
40 | seller_id: Seller.id
41 | seller: Seller
42 | favorites_match: bool
43 | ```
44 | 
45 | ## 2. Track User Seller Interactions
46 | 
47 | Identify the number of interactions that have occurred between users and sellers.
48 | 
49 | **[2_interactions.py](2_interactions.py)**
50 | 
51 | ```python
52 | from chalk.features import features, DataFrame, FeatureTime
53 | 
54 | class InteractionKind(Enum):
55 | LIKE = "LIKE"
56 | VIEW = "VIEW"
57 | PURCHASE = "PURCHASE"
58 | OTHER = "OTHER"
59 | 
60 | @classmethod
61 | def _missing_(cls, _):
62 | return cls.OTHER
63 | 
64 | @features
65 | class Interaction:
66 | id: str
67 | user_id: User.id
68 | user: User
69 | seller_id: Seller.id
70 | seller: Seller
71 | interaction_kind: InteractionKind
72 | on: FeatureTime
73 | 
74 | @online
75 | def get_number_of_interactions(
76 | user_interactions: UserSeller.user.interactions,
77 | seller_id: UserSeller.seller.id,
78 | ) -> UserSeller.number_of_interactions:
79 | return len(user_interactions[Interaction.seller_id == seller_id])
80 | ```
81 | 
82 | ## 3. Stream User Seller Interaction Data
83 | 
84 | Enrich User Interaction data with streaming data.
85 | 
86 | **[3_streams.py](3_streams.py)**
87 | 
88 | ```python
89 | from chalk.streams import KafkaSource
90 | from chalk.features import Features
91 | from chalk import stream, online
92 | import uuid
93 | 
94 | interaction_stream = KafkaSource(name="interactions")
95 | 
96 | @stream(source=interaction_stream)
97 | def interactions_handler(
98 | message: InteractionMessage,
99 | ) -> Features[Interaction]:
100 | return Interaction(
101 | id=str(uuid.uuid4()),
102 | interaction_kind=message.interaction_kind,
103 | user_id=message.user_id,
104 | seller_id=message.seller_id,
105 | )
106 | ```
107 | 
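Putting these together, user-seller affinity features can be fetched with the Python client, in the spirit of the `example_query.py` files (a minimal sketch; input values are illustrative):

```python
from chalk.client import ChalkClient

client = ChalkClient()
result = client.query(
    input={UserSeller.user_id: "123", UserSeller.seller_id: "456"},
    output=[
        UserSeller.favorites_match,
        UserSeller.number_of_interactions,
    ],
)
```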
-------------------------------------------------------------------------------- /github/features/fraud/prompts.py: --------------------------------------------------------------------------------
1 | SYSTEM_PROMPT: str = """ You are an intelligent fraud detection assistant specialized in analyzing GitHub profiles. Your task is to evaluate the likelihood of suspicious or fraudulent activity associated with a user's GitHub profile and provide a **fraud score** between 0.0 and 1.0.
2 | 
3 | You will assess the following GitHub profile attributes if they are provided:
4 | - GitHub Username
5 | - Full Name
6 | - Profile Bio
7 | - Company Name
8 | - Email Address
9 | - Geographical Location
10 | - Twitter Username linked to the GitHub profile
11 | - Personal Blog/Website HTTP Response Status Code. If the code is 0, the user does not have a personal blog or website linked to their GitHub profile. If the code is -1, they listed a blog/website but it is not accessible (a strong signal of fraudulent activity).
12 | 
13 | ### Key Guidelines:
14 | 1. Evaluate the presence, content, and consistency of the profile attributes; having an email is a good signal that the account is not fraudulent.
15 | 2. Analyze whether the profile details (e.g., bio, company, blog) appear authentic and aligned with typical GitHub user behavior. Bios that are professional and detailed are more likely to be authentic. Bios with choppy, fragmented text and short phrases are less likely to be authentic.
16 | 3. Handle missing or null attributes objectively, treating their absence as potentially suspicious, without making too strong an assumption.
17 | 4. Weigh details like generic or unverifiable information (e.g., placeholder text in the bio or invalid links) as more likely to indicate suspicious activity.
18 | 5. Calculate a fraud score ranging between **0.0 (very trustworthy)** and **1.0 (high fraud likelihood)**. Provide clear reasoning for how the score was computed.
19 | 
20 | Finally, return a compact JSON object with the fraud score and a brief explanation of how the GitHub profile was evaluated, listing the main contributing factors.
21 | """
22 | 
23 | USER_PROMPT: str = """Analyze a GitHub profile for potential fraudulent behavior based on the attributes provided. Below are the details of the profile:
24 | 
25 | - GitHub Username: {{GithubFraud.username}}
26 | - Full Name: {{GithubFraud.user.full_name}}
27 | - Profile Bio: {{GithubFraud.user.bio}}
28 | - Company: {{GithubFraud.user.company}}
29 | - Email: {{GithubFraud.user.email}}
30 | - Location: {{GithubFraud.user.location}}
31 | - Twitter Username: {{GithubFraud.user.twitter_username}}
32 | - Personal Blog/Website HTTP Response Status Code: {{GithubFraud.user_website_status_code}}
33 | 
34 | ### Tasks:
35 | 1. Identify whether the profile details (if present) align with typical behavior and authentic information on GitHub.
36 | 2. Assess the impact of the content (e.g., detailed bio, real company, valid blog link) on the likelihood of suspicious activity.
37 | 3. Treat missing or generic information (e.g., no bio or placeholder text) as a signal of potential fraud risk.
38 | 4. Assign a **fraud score** between 0.0 and 1.0 and explain your reasoning.
39 | """
40 | 
-------------------------------------------------------------------------------- /credit/2_accounts.py: --------------------------------------------------------------------------------
1 | """An example of connecting Users to Bank Accounts through Chalk.
2 | 
3 | In this example, we connect Users to Bank Accounts. On top of
4 | this, we show how to use a connected postgres datasource to
5 | resolve features.
6 | """
7 | 
8 | from datetime import datetime
9 | 
10 | from chalk import online
11 | from chalk.features import features, DataFrame, FeatureTime
12 | from chalk.sql import PostgreSQLSource
13 | 
14 | # This example assumes a postgres database has been added through the Chalk
15 | # dashboard where it was assigned a name of "CLOUD_DB". The database should
16 | # contain an accounts table with 'id', 'bank_account_number', 'decision',
17 | # 'user_id', 'created_at', and 'updated_at' fields.
18 | 
19 | pg = PostgreSQLSource(name="CLOUD_DB")
20 | 
21 | 
22 | @features
23 | class Account:
24 | id: int
25 | bank_account_number: int
26 | decision: str
27 | user_id: int
28 | created_at: datetime
29 | # https://docs.chalk.ai/docs/features#feature-time
30 | updated_at: FeatureTime
31 | 
32 | 
33 | # This call connects the Account feature class to the "accounts" table of the `PostgreSQLSource`
34 | # that we instantiated above.
35 | pg.with_table( 36 | name="accounts", 37 | features=Account, 38 | column_to_feature={ 39 | "id": Account.id, 40 | "bank_account_number": Account.bank_account_number, 41 | "user_id": Account.user_id, 42 | "created_at": Account.created_at, 43 | "updated_at": Account.updated_at, 44 | }, 45 | ) 46 | 47 | 48 | @features 49 | class User: 50 | id: Account.user_id 51 | name: str 52 | accounts: DataFrame[Account] 53 | 54 | # computed 55 | number_of_accounts: int 56 | 57 | 58 | @online 59 | def count_accounts(accounts: User.accounts) -> User.number_of_accounts: 60 | return len(accounts) 61 | 62 | # --------------------------------------------------------------------------------- 63 | # Let us assume that our postgres database also contains a `users` table with 64 | # the fields: ['id', 'first_name', 'last_name', 'birthday', 'age']. To get data 65 | # into this feature, we could update our base User feature class to include all 66 | # the fields specified in the database table and use the same `with_table` syntax 67 | # that we used to populate our `Account` feature. However, suppose we want to keep 68 | # the User feature lean or that we want to apply some simple transformation on the 69 | # raw data without using a python resolver. 70 | # 71 | # To accomplish this we can use what Chalk refers to as a `sql_file_resolver` 72 | # (https://docs.chalk.ai/docs/sql#sql-file-resolvers). Essentially, we can 73 | # resolve the User feature by placing a file called `get_user.chalk.sql` in our 74 | # Chalk Directory and adding some metadata specifying the name of the resolved 75 | # feature and the upstream raw data source. It would wind up looking like the 76 | # following: 77 | # --------------------------------------------------------------------------------- 78 | # 79 | # -- type: online 80 | # -- resolvers: user 81 | # -- source: CLOUD_DB 82 | # select id, first_name||last_name as name FROM users; 83 | -------------------------------------------------------------------------------- /full_examples/sagemaker/README.md: -------------------------------------------------------------------------------- 1 | # Integrating Chalk with AWS Sagemaker 2 | 3 | Chalk integrates nicely with machine learning frameworks like AWS Sagemaker. 4 | 5 | You can use Chalk to define your transformed features and pull datasets directly into your 6 | model training pipeline. Using Chalk for dataset generation ensures that feature transformation 7 | code is consistent between training and serving. 8 | 9 | ## Setup 10 | 11 | To pull a dataset from Chalk into Sagemaker, run an offline query with Chalk's Python API client 12 | in a Sagemaker step. Chalk offline queries return datasets, which can be uploaded to a 13 | bucket and used in the subsequent steps of your machine learning pipeline. 
14 | 
15 | **[steps/dataset.py](./steps/dataset.py)**
16 | 
17 | ```python
18 | from chalk.client import ChalkClient
19 | from sagemaker.workflow.function_step import step
20 | 
21 | 
22 | TRAINING_FEATURES = [
23 | "transaction.amt",
24 | "transaction.customer.age",
25 | "transaction.customer.income",
26 | "transaction.customer.fico",
27 | "transaction.customer.transaction_sum_30m",
28 | "transaction.customer.transaction_sum_1h",
29 | "transaction.confirmed_fraud"
30 | ]
31 | 
32 | TARGET_FEATURE = "transaction.confirmed_fraud"
33 | 
34 | @step(
35 | name="create_dataset",
36 | instance_type='ml.t3.medium',
37 | keep_alive_period_in_seconds=300,
38 | )
39 | def create_dataset(test_size, run_bucket):
40 | from sklearn.model_selection import train_test_split
41 | 
42 | # A Chalk client ID and client secret for a token with permission to create datasets
43 | # should be added to the Sagemaker environment—these are passed automatically to the
44 | # ChalkClient but can also be explicitly passed as arguments.
45 | 
46 | chalk_dataset = ChalkClient(
47 | # client_id=os.environ['CHALK_CLIENT_ID'], # automatically loaded by the Chalk Client if in the environment
48 | # client_secret=os.environ['CHALK_CLIENT_SECRET'] # automatically loaded by the Chalk Client if in the environment
49 | ).offline_query(
50 | max_samples=100_000, # reads 100,000 samples from the Chalk dataset
51 | output=TRAINING_FEATURES,
52 | dataset_name="transactions_fraud_model",
53 | )
54 | dataset = chalk_dataset.to_pandas()
55 | 
56 | X_train, X_test, y_train, y_test = train_test_split(
57 | dataset.drop(columns=[TARGET_FEATURE]), # X
58 | dataset[TARGET_FEATURE], # y
59 | test_size=test_size,
60 | )
61 | 
62 | xtrain_path = f"{run_bucket}/input/X_train.parquet"
63 | xtest_path = f"{run_bucket}/input/X_test.parquet"
64 | ytrain_path = f"{run_bucket}/input/y_train.parquet"
65 | ytest_path = f"{run_bucket}/input/y_test.parquet"
66 | 
67 | dataset.to_parquet(f"{run_bucket}/raw_data/data.parquet")
68 | X_train.to_parquet(xtrain_path)
69 | X_test.to_parquet(xtest_path)
70 | y_train.to_frame().to_parquet(ytrain_path)  # the targets are Series, so convert to frames first
71 | y_test.to_frame().to_parquet(ytest_path)
72 | 
73 | return xtrain_path, xtest_path, ytrain_path, ytest_path
74 | ```
75 | 
76 | Subsequent Sagemaker steps can then pull the dataset from the paths returned by the `create_dataset` step.
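For example, a follow-on training step might read those splits back and fit a model (a rough sketch; the estimator and artifact handling are illustrative):

```python
from sagemaker.workflow.function_step import step


@step(name="train_model", instance_type="ml.m5.xlarge")
def train_model(xtrain_path: str, ytrain_path: str) -> str:
    import joblib
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    X_train = pd.read_parquet(xtrain_path)
    y_train = pd.read_parquet(ytrain_path)

    model = LogisticRegression(max_iter=1000).fit(X_train, y_train.values.ravel())

    # Persist the fitted model locally; it can be uploaded to the run bucket
    # in the same way the dataset step writes its parquet files.
    model_path = "model.joblib"
    joblib.dump(model, model_path)
    return model_path
```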
-------------------------------------------------------------------------------- /13_airflow/chalk_airflow.py: --------------------------------------------------------------------------------
1 | 
2 | 
3 | import pendulum
4 | from airflow.decorators import dag, task
5 | from airflow.exceptions import AirflowFailException
6 | from airflow.sensors.base import PokeReturnValue
7 | 
8 | from chalk.client import ChalkClient
9 | 
10 | 
11 | @dag(
12 | schedule=None,
13 | start_date=pendulum.datetime(2024, 5, 7, tz="UTC"),
14 | catchup=False,
15 | tags=["chalk"],
16 | )
17 | def taskflow_with_chalk():
18 | """
19 | A simple example of an Airflow DAG that triggers Chalk resolvers
20 | """
21 | 
22 | @task()
23 | def extract(): ...
24 | 
25 | @task(multiple_outputs=True)
26 | def transform(): ...
27 | 
28 | @task()
29 | def load(): ...
30 | 
31 | @task.virtualenv(
32 | task_id="virtualenv_python",
33 | requirements=["chalkpy"],
34 | system_site_packages=False,
35 | )
36 | def run_chalk_resolver_virtual_env():
37 | """
38 | Trigger the get_users resolver in a virtual environment
39 | """
40 | from chalk.client import ChalkClient
41 | 
42 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
43 | # are passed to Airflow.
44 | client = ChalkClient()
45 | 
46 | result = client.trigger_resolver_run(
47 | "get_users" # this is the name of our sql file resolver {name}.chalk.sql
48 | )
49 | if result.status == "failed":
50 | raise AirflowFailException(f"Resolver run failed: {result}")
51 | return result.id
52 | 
53 | @task()
54 | def run_chalk_resolver() -> str:
55 | """
56 | Trigger the get_users resolver
57 | """
58 | 
59 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
60 | # are passed to Airflow.
61 | client = ChalkClient()
62 | 
63 | result = client.trigger_resolver_run(
64 | "get_users" # this is the name of our sql file resolver {name}.chalk.sql
65 | )
66 | if result.status == "failed":
67 | raise AirflowFailException(f"Resolver run failed: {result}")
68 | return result.id
69 | 
70 | @task.sensor(poke_interval=30, timeout=60 * 5)
71 | def poll_resolver_run(run_id) -> PokeReturnValue:
72 | """
73 | Poll the running Chalk resolver
74 | """
75 | 
76 | # This assumes that CHALK_CLIENT_SECRET, CHALK_CLIENT_ID, & CHALK_ENVIRONMENT environment variables
77 | # are passed to Airflow.
78 | client = ChalkClient()
79 | 
80 | status = client.get_run_status(run_id).status
81 | if status == "succeeded":
82 | return PokeReturnValue(True, run_id)
83 | elif status == "failed":
84 | raise AirflowFailException(f"Chalk resolver run failed: {run_id}")
85 | return PokeReturnValue(False)
86 | 
87 | extract()
88 | transform()
89 | load()
90 | rid = run_chalk_resolver()
91 | poll_resolver_run(rid)
92 | # run_chalk_resolver_virtual_env()
93 | 
94 | 
95 | taskflow_with_chalk()
96 | 
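# With a running Airflow instance, this DAG can be exercised from the CLI by
# its id (a hypothetical invocation):
#
#     airflow dags trigger taskflow_with_chalk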
-------------------------------------------------------------------------------- /full_examples/batch_ml/src/resolvers/fraud_model.py: --------------------------------------------------------------------------------
1 | import os
2 | from chalk import DataFrame, offline
3 | from src.models import Transaction
4 | import onnxruntime as rt
5 | from functools import cached_property
6 | import numpy as np
7 | 
8 | 
9 | ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10 | 
11 | 
12 | class PredictionModel:
13 | """
14 | # Previously, we trained a model on our user data. This
15 | # model has been saved to our local chalk directory next to
16 | # our feature and resolver code. When we run chalk apply
17 | # it will be incorporated into deployments.
18 | 
19 | from sklearn.linear_model import LogisticRegression
20 | 
21 | X_train, y_train = ...
22 | 
23 | model = LogisticRegression()
24 | model.fit(X_train, y_train)
25 | model
26 | """
27 | 
28 | def __init__(self, filename: str):
29 | self.filename = filename
30 | self.input_name = None
31 | self.output_name = None
32 | 
33 | @cached_property
34 | def _model(self) -> rt.InferenceSession:
35 | # The "TARGET_ROOT" environment variable is set by Chalk for both branch and
36 | # standard deployments. You can read more about it on our docs:
37 | # https://docs.chalk.ai/docs/env-vars#chalk-environment-variable
38 | filepath = os.path.join(
39 | os.environ.get("TARGET_ROOT", ROOT_DIR), "models", self.filename
40 | )
41 | 
42 | if not os.path.exists(filepath):
43 | raise FileNotFoundError(f"Model file not found: {filepath}")
44 | 
45 | try:
46 | session = rt.InferenceSession(filepath)
47 | except Exception as e:
48 | raise RuntimeError(f"Failed to load ONNX model from {filepath}: {e}") from e
49 | 
50 | self.input_name = session.get_inputs()[0].name
51 | self.output_name = session.get_outputs()[0].name
52 | 
53 | return session
54 | 
55 | def predict(self, data: np.ndarray):
56 | return self._model.run([self.output_name], {self.input_name: data})[0]
57 | 
58 | 
59 | # the model has been trained and saved in our local Chalk directory
60 | # models/fraud_model.onnx
61 | fraud_model = PredictionModel("fraud_model.onnx")
62 | 
63 | 
64 | @offline
65 | def run_fraud_model(
66 | features: DataFrame[
67 | Transaction.id,
68 | Transaction.amount,
69 | Transaction.user.time_since_last_transaction,
70 | Transaction.user.num_transactions["1d"],
71 | Transaction.user.num_transactions["10d"],
72 | Transaction.user.num_transactions["30d"],
73 | Transaction.user.num_distinct_merchants_transacted["1d"],
74 | Transaction.user.num_distinct_merchants_transacted["10d"],
75 | Transaction.user.num_distinct_merchants_transacted["30d"],
76 | ],
77 | ) -> DataFrame[Transaction.id, Transaction.is_fraud]:
78 | """Predict whether new transactions are fraudulent based on transaction and user data."""
79 | 
80 | predictions = fraud_model.predict(
81 | features.to_pandas().drop(columns=[Transaction.id]).astype(np.float32).values,
82 | )
83 | 
84 | return features[Transaction.id].with_columns({Transaction.is_fraud: predictions})
85 | 
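# As a quick illustration (hypothetical shapes and values), the wrapper can be
# exercised directly; here the model is assumed to take 8 float32 inputs:
#
#     import numpy as np
#
#     preds = fraud_model.predict(np.zeros((4, 8), dtype=np.float32))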
-------------------------------------------------------------------------------- /github/features/github/github_user.py: --------------------------------------------------------------------------------
1 | from datetime import datetime
2 | 
3 | import chalk.functions as F
4 | from chalk.features import DataFrame, Primary, _, features, has_many
5 | 
6 | 
7 | @features
8 | class GithubUserStarredRepo:
9 | path: Primary[str]
10 | description: str | None
11 | homepage: str | None
12 | stargazers_count: int | None
13 | language: str | None
14 | license: str | None
15 | open_issues_count: int | None
16 | forks_count: int | None
17 | url_html: str | None
18 | url_api: str | None
19 | 
20 | username: str
21 | 
22 | 
23 | def parse_starred_github_repos_for_github_user(
24 | username: str,
25 | repo_data: dict | None,
26 | ) -> GithubUserStarredRepo | None:
27 | if not repo_data:
28 | return None
29 | 
30 | repo_license: str = "MISSING"
31 | if temp_license_data := repo_data.get("license"):
32 | if temp_license := temp_license_data.get("spdx_id"):
33 | repo_license = str(temp_license)
34 | 
35 | path: str | None = repo_data.get("full_name")
36 | return GithubUserStarredRepo(
37 | username=username,
38 | path=path,
39 | description=repo_data.get("description"),
40 | homepage=repo_data.get("homepage"),
41 | stargazers_count=repo_data.get("stargazers_count"),
42 | language=repo_data.get("language"),
43 | license=repo_license,
44 | open_issues_count=repo_data.get("open_issues_count"),
45 | forks_count=repo_data.get("forks_count"),
46 | url_api=repo_data.get("url"),
47 | url_html=repo_data.get("html_url"),
48 | )
49 | 
50 | 
51 | @features(max_staleness="infinity")
52 | class GithubUser:
53 | login: str
54 | name: Primary[str]
55 | id: int
56 | node_id: str
57 | bio: str | None
58 | blog: str
59 | company: str | None
60 | created_at: datetime | None
61 | email: str | None
62 | events_url: str | None
63 | followers: int
64 | following: int
65 | full_name: str | None
66 | gists_url: str | None
67 | gravatar_id: str | None
68 | hireable: bool | None
69 | location: str | None
70 | public_gists: int
71 | public_repos: int
72 | received_events_url: str | None
73 | site_admin: bool | None
74 | subscriptions_url: str | None
75 | twitter_username: str | None
76 | updated_at: datetime | None
77 | user_view_type: str | None
78 | url_api: str
79 | url_avatar: str
80 | url_html: str
81 | url_followers: str
82 | url_following: str
83 | url_starred: str
84 | url_organizations: str
85 | url_repos: str
86 | type: str
87 | 
88 | updated_at_chalk: datetime
89 | 
90 | starred_repos: DataFrame[GithubUserStarredRepo] = has_many(
91 | lambda: GithubUser.name == GithubUserStarredRepo.username,
92 | )
93 | starred_most_recent_path: str = F.array_join(
94 | F.head(
95 | _.starred_repos[_.path],
96 | n=1,
97 | ),
98 | delimiter="",
99 | )
100 | starred_most_recent_url: str | None = (
101 | "https://github.com/" + _.starred_most_recent_path
102 | )
103 | 
-------------------------------------------------------------------------------- /03_caching/README.md: --------------------------------------------------------------------------------
1 | # Caching
2 | When a feature is expensive or slow to compute,
3 | you may wish to cache its value.
4 | Chalk uses the terminology "maximum staleness"
5 | to describe how recently a feature value needs
6 | to have been computed to be returned without
7 | re-running a resolver.
8 | 
9 | https://docs.chalk.ai/docs/feature-caching
10 | 
11 | ## 1. Basic Caching
12 | Cache feature values rather than computing them in real time.
13 | 
14 | **[1_basic_caching.py](1_basic_caching.py)**
15 | 
16 | ```python
17 | @features
18 | class User:
19 | fico_score: int = feature(max_staleness="30d")
20 | ```
21 | https://docs.chalk.ai/docs/feature-caching
22 | 
23 | ## 2. Latest Computed Value
24 | Cache the last computed value of the feature.
25 | 
26 | **[2_latest_value.py](2_latest_value.py)**
27 | 
28 | ```python
29 | @features
30 | class User:
31 | fico_score: int = feature(max_staleness="infinity")
32 | ```
33 | https://docs.chalk.ai/docs/feature-caching
34 | 
35 | ## 3. Intermediate Feature Values
36 | Cache intermediate feature values.
37 | 
38 | **[3_intermediates.py](3_intermediates.py)**
39 | 
40 | ```python
41 | ChalkClient().query(
42 | input={ ... },
43 | # User.fico_score is not requested in the output...
44 | output=[User.risk_score],
45 | # ...but you can specify the staleness anyhow!
46 | staleness={User.fico_score: "10m"},
47 | )
48 | ```
49 | https://docs.chalk.ai/docs/query-caching
50 | 
51 | ## 4. Override Max-Staleness
52 | Set max-staleness per-request.
53 | 
54 | **[4_override_max_staleness.py](4_override_max_staleness.py)**
55 | 
56 | ```python
57 | @features
58 | class User:
59 | fico_score: int = feature(max_staleness="30d")
60 | 
61 | ChalkClient().query(
62 | input={...},
63 | output=[User.fico_score],
64 | staleness={User.fico_score: "10m"},
65 | )
66 | ```
67 | https://docs.chalk.ai/docs/query-caching
68 | 
69 | ## 5. Override Cache Values
70 | Supply a feature value in the input to skip the cache and any resolver entirely.
71 | 
72 | **[5_override_cache_values.py](5_override_cache_values.py)**
73 | 
74 | ```python
75 | @features
76 | class User:
77 | fico_score: int = feature(max_staleness="30d")
78 | 
79 | ChalkClient().query(
80 | input={User.fico_score: 1, ...},
81 | output=[...],
82 | )
83 | ```
84 | https://docs.chalk.ai/docs/query-caching
85 | 
86 | ## 6. Cache Busting
87 | Bypass the cache with a max-staleness of 0.
88 | 
89 | **[6_cache_busting.py](6_cache_busting.py)**
90 | 
91 | ```python
92 | ChalkClient().query(
93 | input={...},
94 | output=[User.fico_score],
95 | staleness={User.fico_score: "0s"},
96 | )
97 | ```
98 | https://docs.chalk.ai/docs/query-caching#cache-busting
99 | 
100 | 
101 | ## 7. Pre-Fetching
102 | Keep the cache warm by scheduling a resolver to run
103 | more frequently than the max-staleness.
104 | 
105 | **[7_prefetching.py](7_prefetching.py)**
106 | 
107 | ```python
108 | @features
109 | class User:
110 | fico_score: int = feature(max_staleness="30d")
111 | 
112 | @realtime(cron="29d 11h")
113 | def get_fico_score(name: User.name) -> User.fico_score:
114 | return requests.get("https://experian.com").json()["score"]
115 | ```
116 | https://docs.chalk.ai/docs/resolver-cron
117 | 
118 | 
-------------------------------------------------------------------------------- /ecommerce/3_streams.py: --------------------------------------------------------------------------------
1 | from enum import Enum
2 | from datetime import datetime
3 | 
4 | from chalk import online
5 | from chalk.features.resolver import make_stream_resolver
6 | from chalk.features import DataFrame, FeatureTime, features, _, has_many
7 | from chalk.streams import KafkaSource
8 | from pydantic import BaseModel
9 | 
10 | 
11 | @features
12 | class Seller:
13 | id: str
14 | categories: set[str]
15 | 
16 | 
17 | @features
18 | class User:
19 | id: str
20 | age: int
21 | favorite_categories: set[str]
22 | 
23 | 
24 | @features
25 | class UserSeller:
26 | id: str
27 | user_id: User.id
28 | user: User
29 | seller_id: Seller.id
30 | seller: Seller
31 | favorites_match: bool
32 | user_seller_score: int
33 | 
34 | interactions: "DataFrame[Interaction]" = has_many(
35 | lambda: (UserSeller.user_id == Interaction.user_id) & (UserSeller.seller_id == Interaction.seller_id)
36 | )
37 | 
38 | number_of_interactions: int = _.interactions.count()
39 | 
40 | 
41 | class InteractionKind(Enum):
42 | LIKE = "LIKE"
43 | VIEW = "VIEW"
44 | PURCHASE = "PURCHASE"
45 | OTHER = "OTHER"
46 | 
47 | @classmethod
48 | def _missing_(cls, _):
49 | return cls.OTHER
50 | 
51 | 
52 | @features
53 | class Interaction:
54 | id: str
55 | user_id: User.id
56 | user: User
57 | seller_id: Seller.id
58 | seller: Seller
59 | interaction_kind: InteractionKind
60 | on: FeatureTime
61 | 
62 | 
63 | interaction_stream = KafkaSource(name="interactions")
64 | 
65 | 
66 | class InteractionMessage(BaseModel):
67 | id: str
68 | user_id: str
69 | seller_id: str
70 | interaction_kind: str
71 | ingestion_time: datetime
72 | 
73 | 
74 | process_interactions = make_stream_resolver(
75 | name="process_interactions",
76 | source=interaction_stream,
77 | message_type=InteractionMessage,
78 | output_features={
79 | Interaction.id: _.message.id,
80 | Interaction.user_id: _.message.user_id,
81 | Interaction.seller_id: _.message.seller_id,
82 | Interaction.interaction_kind: _.message.interaction_kind,
83 | Interaction.on: _.ingestion_time,
84 | },
85 | )
86 | 
87 | 
88 | @online
89 | def get_similarity(
90 | fc: UserSeller.user.favorite_categories, fc2: UserSeller.seller.categories
91 | ) -> UserSeller.favorites_match:
92 | return
len(fc & fc2) > 0 93 | 94 | 95 | if __name__ == "__main__": 96 | from chalk.client import ChalkClient 97 | 98 | client = ChalkClient() 99 | user_stores = client.query( 100 | input=[ 101 | UserSeller(user_id="123", seller_id="456"), 102 | UserSeller(user_id="123", seller_id="457"), 103 | UserSeller(user_id="123", seller_id="458"), 104 | UserSeller(user_id="123", seller_id="458"), 105 | UserSeller(user_id="123", seller_id="456"), 106 | UserSeller(user_id="123", seller_id="461"), 107 | UserSeller(user_id="123", seller_id="460"), 108 | ], 109 | output=[ 110 | UserSeller.user.id, 111 | UserSeller.seller.id, 112 | UserSeller.favorites_match, 113 | UserSeller.number_of_interactions, 114 | ], 115 | ) 116 | print(user_stores) 117 | -------------------------------------------------------------------------------- /github/features/search/github_search.py: -------------------------------------------------------------------------------- 1 | # trunk-ignore-all(ruff/N812) 2 | import chalk.functions as F 3 | import chalk.prompts as P 4 | from chalk.features import ( 5 | DataFrame, 6 | Primary, 7 | Vector, 8 | _, 9 | embed, 10 | features, 11 | ) 12 | from pydantic import BaseModel, Field 13 | 14 | from src.github.features import GithubRepoDocVDB 15 | from src.github.features.cerebras.cerebras import ( 16 | CEREBRAS_API_KEY, 17 | CEREBRAS_BASE_URL, 18 | CEREBRAS_MODEL, 19 | CEREBRAS_MODEL_PROVIDER, 20 | ) 21 | 22 | from .prompts import ( 23 | SYSTEM_PROMPT, 24 | USER_PROMPT, 25 | ) 26 | 27 | CHAT_MAX_TOKENS: int = 8192 28 | CHAT_TEMPERATURE: float = 0.1 29 | CHAT_TOP_P: float = 0.1 30 | 31 | 32 | class StructuredOutput(BaseModel): 33 | repo_url: str = Field( 34 | description="The URL of the best matching GitHub repository", 35 | ) 36 | confidence: float = Field( 37 | description="The confidence threshold for the generated summary, between 0 and 1", 38 | ) 39 | summary: str = Field( 40 | description="What this repo does and why it was selected", 41 | ) 42 | 43 | 44 | @features 45 | class GithubSearch: 46 | query: Primary[str] 47 | limit: int = 10 48 | vector: Vector[768] = embed( 49 | input=lambda: GithubSearch.query, 50 | provider="vertexai", # openai 51 | model="text-embedding-005", # text-embedding-3-small 52 | ) 53 | 54 | results: DataFrame[GithubRepoDocVDB] 55 | urls_in_list: list[str] = F.array_agg( 56 | expr=_.results[_.url], 57 | ) 58 | urls_in: str = F.array_join( 59 | arr=_.urls_in_list, 60 | delimiter="\n\n====\n\n", 61 | ) 62 | 63 | individual_descriptions: list[str] = F.array_agg( 64 | expr=_.results[_.ai_summary], 65 | ) 66 | descriptions: str = F.array_join( 67 | arr=_.individual_descriptions, 68 | delimiter="\n\n====\n\n", 69 | ) 70 | 71 | distances_in_list: list[float] = F.array_agg( 72 | expr=_.results[_.distance,], 73 | ) 74 | 75 | # https://chalk.ai/projects/dmo5dhaj3yqu/environments/dvxenv/prompts 76 | # Can also edit prompts from the dashboard 77 | # completion_gui: P.PromptResponse = P.run_prompt("repo_summary") 78 | completion: P.PromptResponse = P.completion( 79 | api_key=CEREBRAS_API_KEY, 80 | model_provider=CEREBRAS_MODEL_PROVIDER, 81 | model=CEREBRAS_MODEL, 82 | base_url=CEREBRAS_BASE_URL, 83 | max_tokens=CHAT_MAX_TOKENS, 84 | temperature=CHAT_TEMPERATURE, 85 | top_p=CHAT_TOP_P, 86 | messages=[ 87 | P.message( 88 | role="system", 89 | content=SYSTEM_PROMPT, 90 | ), 91 | P.message( 92 | role="user", 93 | content=F.jinja(USER_PROMPT), 94 | ), 95 | ], 96 | output_structure=StructuredOutput, 97 | ) 98 | 99 | c_url: str = F.json_value( 100 | _.completion.response, 101 | "$.repo_url", 
102 | )
103 | c_confidence: float = F.json_value(
104 | _.completion.response,
105 | "$.confidence",
106 | )
107 | c_summary: str = F.json_value(
108 | _.completion.response,
109 | "$.summary",
110 | )
111 | 
-------------------------------------------------------------------------------- /fraud/README.md: --------------------------------------------------------------------------------
1 | # Fraud Detection
2 | 
3 | Finding a balance between user experience and
4 | risk management is a complex task for banking
5 | products. Chalk helps you express complex business
6 | logic with features and resolvers, and lets data
7 | scientists and machine learning engineers collaborate
8 | on solutions.
9 | 
10 | ## 1. Returns
11 | 
12 | Identify transactions returned for non-sufficient funds.
13 | 
14 | **[1_return.py](1_return.py)**
15 | 
16 | ```python
17 | @online
18 | def get_transaction_is_nsf(
19 | memo_clean: Transaction.clean_memo,
20 | ) -> Transaction.is_nsf:
21 | return "nsf" in memo_clean.lower()
22 | 
23 | @online
24 | def get_nsf_amount(
25 | amounts: User.transactions[
26 | Transaction.is_nsf is True,
27 | Transaction.amount
28 | ]
29 | ) -> User.nsf_amount:
30 | return amounts.sum()
31 | ```
32 | 
33 | https://docs.chalk.ai/docs/python-resolvers
34 | 
35 | ## 2. Changes in Behavior
36 | 
37 | Detect changes in user behavior over time.
38 | 
39 | **[2_patterns.py](2_patterns.py)**
40 | 
41 | ```python
42 | @online
43 | def get_transaction_trend(
44 | this_year_txns: User.transactions[after(days_ago=365)],
45 | last_year_txns: User.transactions[
46 | before(days_ago=365),
47 | after(days_ago=365 * 2)
48 | ]
49 | ) -> User.change_from_last_year:
50 | sum_last = last_year_txns[Transaction.amount].sum()
51 | sum_this = this_year_txns[Transaction.amount].sum()
52 | return (sum_last - sum_this) / sum_last
53 | ```
54 | 
55 | https://docs.chalk.ai/docs/window-functions
56 | 
57 | ## 3. Identity Verification
58 | 
59 | Make use of vendor APIs to verify identities, and control costs with Chalk's platform.
60 | 
61 | **[3_identity.py](3_identity.py)**
62 | 
63 | ```python
64 | @features
65 | class User:
66 | id: str
67 | socure_score: float = feature(max_staleness="30d")
68 | 
69 | @online
70 | def get_socure_score(uid: User.id) -> Features[User.socure_score]:
71 | return (
72 | requests.get("https://api.socure.com", json={
73 | "id": uid
74 | }).json()['socure_score']
75 | )
76 | ```
77 | 
78 | https://docs.chalk.ai/docs/feature-caching
79 | 
80 | ## 4. Withdrawal Model
81 | 
82 | Decide and enforce withdrawal limits with custom hold times.
83 | 
84 | **[4_withdrawal_model.py](4_withdrawal_model.py)**
85 | 
86 | ```python
87 | @realtime(when=TransferLimit.to_account.is_internal is False)
88 | def withdrawal_limit(
89 | internal_accounts: TransferLimit.user.accounts[Account.is_internal is True],
90 | deposits_last_90: TransferLimit.user.transfers[Transfer.from_account.is_internal is False, before(days_ago=90)],
91 | user_settlement: TransferLimit.user.holdback,
92 | ) -> TransferLimit.amount:
93 | ...
94 | ```
95 | 
96 | https://docs.chalk.ai/docs/resolver-overview
97 | 
98 | ## 5. Account Takeover
99 | 
100 | Aggregate failed logins over a Kafka stream.
101 | 
102 | **[5_account_takeover.py](5_account_takeover.py)**
103 | 
104 | ```python
105 | @stream(...)
106 | def agg_logins(df: DataFrame[LoginMessage]) -> DataFrame[User]:
107 | return f"""
108 | select
109 | count(*) as failed_logins,
110 | user_id as id
111 | from {df}
112 | where status = 'failed'
113 | group by id
114 | """
115 | ```
116 | 
117 | https://docs.chalk.ai/docs/aggregations
118 | 
-------------------------------------------------------------------------------- /02_resolvers/6_sharing_resolvers.py: --------------------------------------------------------------------------------
1 | from chalk import online
2 | from chalk.client import ChalkClient
3 | from chalk.features import DataFrame, FeatureTime, after, features, has_many, has_one
4 | from chalk.sql import PostgreSQLSource
5 | 
6 | 
7 | # Imagine that we have two models:
8 | # 1. send_reminder_email: decides when we should send our next reminder email
9 | # 2. expected_loan_repayment: predicts the amount of money we expect to collect
10 | #
11 | # First, we'll lay out some feature classes for this problem:
12 | @features
13 | class EmailRecord:
14 | id: str
15 | user_id: str
16 | user: "User"
17 | sent_at: FeatureTime
18 | 
19 | 
20 | @features
21 | class User:
22 | id: str
23 | name: str
24 | emails_sent_last_10_days: int
25 | email_history: DataFrame[EmailRecord] = has_many(
26 | lambda: EmailRecord.user_id == User.id
27 | )
28 | 
29 | 
30 | # The business logic for a feature is written only once,
31 | # though it can be referenced many times.
32 | @online
33 | def get_emails_sent_last_10_days(
34 | emails: User.email_history[after(days_ago=10)],
35 | ) -> User.emails_sent_last_10_days:
36 | return emails.count()
37 | 
38 | 
39 | # For our second model on expected loan repayment, we'll first model
40 | # another feature class around loans:
41 | @features
42 | class Loan:
43 | id: int
44 | user_id: str
45 | amount: float
46 | user: User = has_one(lambda: User.id == Loan.user_id)
47 | 
48 | 
49 | # Here, we leverage the work we did to build the
50 | # feature `User.emails_sent_last_10_days` from the
51 | # first model by requesting
52 | # `Loan.user.emails_sent_last_10_days`.
53 | # We configure this postgres source in the Chalk dashboard.
54 | db = PostgreSQLSource()
55 | 
56 | 
57 | # Work for sql queries is shared in the same way.
58 | # For example, we need to be able to resolve the
59 | # fields of `EmailRecord`.
60 | @online
61 | def get_email_record(user: User.id) -> DataFrame[EmailRecord]:
62 | return db.query_string(
63 | "select id, sent_at, user_id from email_record where user_id=:uid",
64 | fields=dict(
65 | id=EmailRecord.id,
66 | sent_at=EmailRecord.sent_at,
67 | user_id=EmailRecord.user_id,
68 | ),
69 | args=dict(uid=user),
70 | ).all()
71 | 
72 | 
73 | if __name__ == "__main__":
74 | 
75 | # For this first model, we request the `User.name`
76 | # and `User.emails_sent_last_10_days` features under
77 | # the query name `send_reminder_email`.
78 | ChalkClient().query(
79 | input={User.id: "1"},
80 | output=[
81 | User.emails_sent_last_10_days,
82 | User.name,
83 | ],
84 | # This optional `query_name` associates the data
85 | # that we requested with a given model for monitoring
86 | # and migrations.
87 | query_name="send_reminder_email",
88 | )
89 | 
90 | ChalkClient().query(
91 | input={Loan.id: 1},
92 | output=[
93 | Loan.user.emails_sent_last_10_days,
94 | Loan.amount,
95 | ],
96 | query_name="expected_loan_repayment",
97 | )
98 | 
99 | # Here, we're running a really basic query that just maps columns to features.
100 | # For these simple queries, there's a shortcut to automatically ingest these 101 | # tables: 102 | db.with_table(name="email_record", features=EmailRecord) 103 | --------------------------------------------------------------------------------