├── python ├── __init__.py ├── requirements.txt └── constants.py ├── .dockerignore ├── rasgotransforms ├── rasgotransforms │ ├── tests │ │ ├── __init__.py │ │ └── transforms │ │ │ ├── __init__.py │ │ │ └── plot │ │ │ └── __init__.py │ ├── snippets │ │ ├── int.sql │ │ ├── column.sql │ │ ├── date.sql │ │ ├── table.sql │ │ ├── alias.sql │ │ ├── boolean.sql │ │ ├── number.sql │ │ ├── string.sql │ │ ├── agg.sql │ │ ├── date_part.sql │ │ ├── join_type.sql │ │ ├── timestamp.sql │ │ ├── column_value.sql │ │ ├── custom_option.sql │ │ ├── sort_direction.sql │ │ ├── column_list.sql │ │ ├── number_list.sql │ │ ├── table_list.sql │ │ ├── column_or_expression.sql │ │ ├── column_or_expression_list.sql │ │ ├── string_list.sql │ │ ├── column_value_list.sql │ │ ├── custom_option_list.sql │ │ ├── filter_list.sql │ │ ├── sort_dict.sql │ │ ├── calculated_column_list.sql │ │ ├── column_agg_list.sql │ │ └── agg_dict.sql │ ├── exceptions.py │ ├── transforms │ │ ├── select │ │ │ ├── select.sql │ │ │ └── select.yaml │ │ ├── dropna │ │ │ ├── dropna.py │ │ │ ├── dropna.yaml │ │ │ └── dropna.sql │ │ ├── filter │ │ │ ├── filter.py │ │ │ ├── filter.sql │ │ │ └── filter.yaml │ │ ├── order │ │ │ ├── order.py │ │ │ ├── order.sql │ │ │ └── order.yaml │ │ ├── sample │ │ │ ├── sample.py │ │ │ ├── sample.yaml │ │ │ └── sample.sql │ │ ├── sample_class │ │ │ ├── sample_class.py │ │ │ ├── sample_class.sql │ │ │ └── sample_class.yaml │ │ ├── remove_duplicates │ │ │ ├── remove_duplicates.py │ │ │ ├── remove_duplicates.sql │ │ │ └── remove_duplicates.yaml │ │ ├── unpivot │ │ │ ├── unpivot.sql │ │ │ └── unpivot.yaml │ │ ├── text_to_sql │ │ │ ├── text_to_sql.sql │ │ │ └── text_to_sql.yaml │ │ ├── train_test_split │ │ │ ├── train_test_split.py │ │ │ ├── train_test_split.sql │ │ │ └── train_test_split.yaml │ │ ├── target_encode │ │ │ ├── target_encode.py │ │ │ ├── target_encode.sql │ │ │ └── target_encode.yaml │ │ ├── remove_outliers │ │ │ └── remove_outliers.py │ │ ├── datepart │ │ │ ├── datepart.py │ │ │ ├── 
datepart.sql │ │ │ ├── postgresql │ │ │ │ └── datepart.sql │ │ │ ├── redshift │ │ │ │ └── datepart.sql │ │ │ ├── snowflake │ │ │ │ └── datepart.sql │ │ │ ├── datepart.yaml │ │ │ └── bigquery │ │ │ │ └── datepart.sql │ │ ├── label_encode │ │ │ ├── label_encode.py │ │ │ ├── snowflake │ │ │ │ └── label_encode.sql │ │ │ ├── bigquery │ │ │ │ └── label_encode.sql │ │ │ └── label_encode.yaml │ │ ├── funnel │ │ │ ├── funnel.sql │ │ │ └── funnel.yaml │ │ ├── market_basket │ │ │ ├── market_basket.py │ │ │ ├── market_basket.sql │ │ │ └── market_basket.yaml │ │ ├── datespine │ │ │ └── datespine.py │ │ ├── if_then │ │ │ ├── if_then.sql │ │ │ ├── if_then.py │ │ │ └── if_then.yaml │ │ ├── datetrunc │ │ │ ├── bigquery │ │ │ │ └── datetrunc.sql │ │ │ ├── postgresql │ │ │ │ └── datetrunc.sql │ │ │ ├── redshift │ │ │ │ └── datetrunc.sql │ │ │ ├── snowflake │ │ │ │ └── datetrunc.sql │ │ │ └── datetrunc.py │ │ ├── apply │ │ │ ├── apply.sql │ │ │ └── apply.yaml │ │ ├── moving_avg │ │ │ ├── moving_avg.py │ │ │ ├── moving_avg.sql │ │ │ └── moving_avg.yaml │ │ ├── datediff │ │ │ ├── datediff.py │ │ │ ├── bigquery │ │ │ │ └── datediff.sql │ │ │ ├── datediff.sql │ │ │ ├── snowflake │ │ │ │ └── datediff.sql │ │ │ └── datediff.yaml │ │ ├── lag │ │ │ ├── lag.py │ │ │ ├── lag.sql │ │ │ ├── bigquery │ │ │ │ └── lag.sql │ │ │ └── lag.yaml │ │ ├── lead │ │ │ ├── lead.py │ │ │ ├── lead.sql │ │ │ ├── bigquery │ │ │ │ └── lead.sql │ │ │ └── lead.yaml │ │ ├── prefix │ │ │ ├── prefix.sql │ │ │ └── prefix.yaml │ │ ├── suffix │ │ │ ├── suffix.sql │ │ │ └── suffix.yaml │ │ ├── uppercase_columns │ │ │ ├── bigquery │ │ │ │ └── uppercase_columns.sql │ │ │ ├── snowflake │ │ │ │ └── uppercase_columns.sql │ │ │ └── uppercase_columns.yaml │ │ ├── replace_string │ │ │ ├── replace_string.py │ │ │ └── replace_string.sql │ │ ├── drop_columns │ │ │ ├── drop_columns.py │ │ │ ├── drop_columns.sql │ │ │ └── drop_columns.yaml │ │ ├── replace_missing │ │ │ ├── replace_missing.py │ │ │ └── replace_missing.yaml │ │ ├── 
new_columns │ │ │ ├── new_columns.sql │ │ │ └── new_columns.yaml │ │ ├── aggregate_string │ │ │ ├── aggregate_string.sql │ │ │ └── aggregate_string.yaml │ │ ├── rename │ │ │ ├── rename.py │ │ │ ├── rename.sql │ │ │ ├── snowflake │ │ │ │ └── rename.sql │ │ │ └── rename.yaml │ │ ├── linear_regression │ │ │ ├── linear_regression.py │ │ │ ├── linear_regression.sql │ │ │ └── linear_regression.yaml │ │ ├── cast │ │ │ ├── cast.py │ │ │ ├── cast.sql │ │ │ ├── cast.yaml │ │ │ └── bigquery │ │ │ │ └── cast.sql │ │ ├── min_max_scaler │ │ │ ├── min_max_scaler.py │ │ │ └── min_max_scaler.sql │ │ ├── standard_scaler │ │ │ ├── standard_scaler.py │ │ │ └── standard_scaler.sql │ │ ├── latest │ │ │ ├── latest.py │ │ │ ├── latest.sql │ │ │ └── latest.yaml │ │ ├── to_date │ │ │ ├── to_date.sql │ │ │ ├── to_date.py │ │ │ └── to_date.yaml │ │ ├── join │ │ │ ├── join.py │ │ │ └── join.yaml │ │ ├── profile_column │ │ │ ├── profile_column.sql │ │ │ └── profile_column.yaml │ │ ├── sankey │ │ │ ├── sankey.sql │ │ │ └── sankey.yaml │ │ ├── dateadd │ │ │ ├── dateadd.sql │ │ │ ├── snowflake │ │ │ │ └── dateadd.sql │ │ │ ├── postgresql │ │ │ │ └── dateadd.sql │ │ │ ├── redshift │ │ │ │ └── dateadd.sql │ │ │ ├── bigquery │ │ │ │ └── dateadd.sql │ │ │ ├── dateadd.yaml │ │ │ └── dateadd.py │ │ ├── rank │ │ │ ├── rank.py │ │ │ └── rank.sql │ │ ├── correlation │ │ │ ├── correlation.yaml │ │ │ └── correlation.sql │ │ ├── bin │ │ │ ├── bin.sql │ │ │ ├── bigquery │ │ │ │ └── bin.sql │ │ │ └── bin.yaml │ │ ├── describe │ │ │ └── describe.yaml │ │ ├── math │ │ │ ├── math.sql │ │ │ ├── bigquery │ │ │ │ └── math.sql │ │ │ └── snowflake │ │ │ │ └── math.sql │ │ ├── union │ │ │ ├── union.sql │ │ │ └── union.yaml │ │ ├── cumulative_agg │ │ │ └── cumulative_agg.sql │ │ ├── entropy │ │ │ └── entropy.yaml │ │ ├── one_hot_encode │ │ │ ├── one_hot_encode.yaml │ │ │ └── one_hot_encode.sql │ │ ├── unions │ │ │ ├── unions.sql │ │ │ └── unions.yaml │ │ ├── aggregate │ │ │ ├── aggregate.yaml │ │ │ └── aggregate.py │ │ 
├── conditional_agg │ │ │ ├── conditional_agg.sql │ │ │ └── conditional_agg.yaml │ │ ├── rolling_agg │ │ │ ├── rolling_agg.sql │ │ │ └── rolling_agg.yaml │ │ ├── datarobot_score │ │ │ └── datarobot_score.sql │ │ ├── histogram │ │ │ └── histogram.yaml │ │ ├── timeseries_agg │ │ │ ├── timeseries_agg.sql │ │ │ └── bigquery │ │ │ │ └── timeseries_agg.sql │ │ ├── extract_sequences │ │ │ └── snowflake │ │ │ │ └── extract_sequences.sql │ │ ├── summarize_flatlines │ │ │ ├── summarize_flatlines.sql │ │ │ └── summarize_flatlines.yaml │ │ ├── heatmap │ │ │ └── heatmap.yaml │ │ ├── summarize │ │ │ ├── summarize.sql │ │ │ └── summarize.yaml │ │ ├── vlookup │ │ │ ├── vlookup.sql │ │ │ └── vlookup.yaml │ │ ├── ratio_with_shrinkage │ │ │ ├── ratio_with_shrinkage.sql │ │ │ ├── snowflake │ │ │ │ └── ratio_with_shrinkage.sql │ │ │ └── ratio_with_shrinkage.yaml │ │ ├── clean │ │ │ └── clean.yaml │ │ ├── reshape │ │ │ ├── bigquery │ │ │ │ └── reshape.sql │ │ │ └── snowflake │ │ │ │ └── reshape.sql │ │ ├── sliding_slope │ │ │ ├── sliding_slope.sql │ │ │ ├── snowflake │ │ │ │ └── sliding_slope.sql │ │ │ └── sliding_slope.yaml │ │ ├── pivot_table │ │ │ └── pivot_table.yaml │ │ ├── rsi │ │ │ └── rsi.sql │ │ └── encode_values │ │ │ └── encode_values.yaml │ ├── version.py │ ├── __init__.py │ ├── render │ │ └── __init__.py │ ├── macros │ │ ├── prior.sql │ │ ├── rolling.sql │ │ └── period_to_date.sql │ └── dtypes.py ├── requirements.txt ├── scripts │ ├── install-local.sh │ └── publish-pypi.sh ├── requirements-tests.txt └── DESCRIPTION.md ├── bin ├── backup-source.sh └── publish-pypi.sh ├── .github └── workflows │ ├── backup.yaml │ ├── run_tests.yaml │ ├── publish.yaml │ ├── docs_generation.yaml │ └── publish_accelerators.yaml └── docs ├── select.md ├── prefix.md ├── suffix.md ├── accelerators ├── baby_name_analysis.md ├── web_traffic_channels.md ├── website_page_performance.md ├── plg.md ├── sales_growth_funnel.md └── omni_channel_performance.md ├── uppercase_columns.md ├── correlation.md ├── 
profile_column.md ├── label_encode.md ├── order.md ├── describe.md ├── sample_class.md ├── rename.md ├── sankey.md ├── entropy.md ├── target_encode.md ├── apply.md ├── filter.md ├── one_hot_encode.md ├── new_columns.md ├── datepart.md ├── cast.md ├── latest.md ├── text_to_sql.md ├── lead.md ├── sample.md ├── moving_avg.md ├── aggregate.md ├── histogram.md ├── to_date.md ├── funnel.md ├── remove_duplicates.md ├── drop_columns.md ├── union.md ├── market_basket.md ├── unions.md ├── dropna.md ├── heatmap.md ├── train_test_split.md └── conditional_agg.md /python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | ./tmp/* 2 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/int.sql: -------------------------------------------------------------------------------- 1 | {{ int }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column.sql: -------------------------------------------------------------------------------- 1 | {{ column }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/date.sql: -------------------------------------------------------------------------------- 1 | '{{ date }}' -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/table.sql: -------------------------------------------------------------------------------- 1 | {{ table }} 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/tests/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/alias.sql: -------------------------------------------------------------------------------- 1 | AS {{ alias }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/boolean.sql: -------------------------------------------------------------------------------- 1 | {{ boolean }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/number.sql: -------------------------------------------------------------------------------- 1 | {{ decimal }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/string.sql: -------------------------------------------------------------------------------- 1 | '{{ string }}' -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/tests/transforms/plot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rasgotransforms/requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2>=2.0 2 | pyyaml>=5.0 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/agg.sql: -------------------------------------------------------------------------------- 1 | {{ agg }}( column_name ) -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/snippets/date_part.sql: -------------------------------------------------------------------------------- 1 | {{ date_part }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/join_type.sql: -------------------------------------------------------------------------------- 1 | {{ join_type }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/timestamp.sql: -------------------------------------------------------------------------------- 1 | '{{ timestamp }}' -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_value.sql: -------------------------------------------------------------------------------- 1 | {{ column_value }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/custom_option.sql: -------------------------------------------------------------------------------- 1 | {{ custom_option }} -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | pyrasgo>=2.5.1 3 | pytablewriter 4 | pyyaml 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/sort_direction.sql: -------------------------------------------------------------------------------- 1 | {{ sort_direction }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_list.sql: -------------------------------------------------------------------------------- 1 | {{ column_list | join(', ') }} -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/snippets/number_list.sql: -------------------------------------------------------------------------------- 1 | {{ number_list | join(', ') }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/table_list.sql: -------------------------------------------------------------------------------- 1 | {{ table_list | join(', ') }} -------------------------------------------------------------------------------- /rasgotransforms/scripts/install-local.sh: -------------------------------------------------------------------------------- 1 | python -m pip install -e rasgotransforms 2 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_or_expression.sql: -------------------------------------------------------------------------------- 1 | {{ column_or_expression }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/exceptions.py: -------------------------------------------------------------------------------- 1 | class RenderException(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/select/select.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM {{source_table}} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_or_expression_list.sql: -------------------------------------------------------------------------------- 1 | {{ column_or_expression_list | join(',') }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package version for pypi 3 | 
""" 4 | __version__ = "2.7.8" 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/string_list.sql: -------------------------------------------------------------------------------- 1 | {% for string in string_list %}'{{ string }}'{{ ',' if not loop.last }}{% endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dropna/dropna.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/filter/filter.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/order/order.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample/sample.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import serve_rasgo_transform_templates, serve_rasgo_transform_snippets, DataWarehouse 2 | -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_value_list.sql: -------------------------------------------------------------------------------- 1 | ( 2 | {% for val in column_value_list %} 3 | '{{ val }}'{{ ',' if not loop.last }} 4 | {% endfor %} 5 | ) -------------------------------------------------------------------------------- /rasgotransforms/requirements-tests.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | pandas 3 | pytest 4 | python-dotenv 5 | snowflake-connector-python 6 | snowflake-connector-python[pandas] 7 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/render/__init__.py: -------------------------------------------------------------------------------- 1 | from .environment import RasgoEnvironment 2 | from .infer_columns import infer_columns 3 | from .transforms import Transforms 4 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/custom_option_list.sql: -------------------------------------------------------------------------------- 1 | ( 2 | {% for option in custom_option_list %} 3 | '{{ option }}'{{ ',' if not loop.last }} 4 | {% endfor %} 5 | ) 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/filter_list.sql: -------------------------------------------------------------------------------- 1 | {% set filters = filter_list %} 2 | {% from 'filter.sql' import get_filter_statement %} 3 | {{ get_filter_statement(filters) }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/sort_dict.sql: -------------------------------------------------------------------------------- 1 | {% for column, direction in sort_dict.items() -%} 2 | {{ column }} {{ direction }}{%- if not loop.last %}, {% endif -%} 3 | {%- endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM {{ source_table }} 2 | UNPIVOT( {{ value_column }} for {{ name_column }} in ( {{ column_list | join(', ')}} )) 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/text_to_sql/text_to_sql.sql: -------------------------------------------------------------------------------- 1 | {# Placeholder code. Will be replaced by user supplied text #} 2 | Write a query against {{source_table}} that returns {{ text }}... 
3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns['TT_SPLIT'] = 'varchar' 3 | return source_columns 4 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns[f"{args['column']}_target_encoded"] = 'float' 3 | return source_columns 4 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/order/order.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM {{source_table}} 3 | ORDER BY 4 | {%- for col, order_method in order_by.items() %} 5 | {{ col }} {{ order_method }}{{ ',' if not loop.last else ' ' }} 6 | {%- endfor -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_outliers/remove_outliers.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | if not ('drop' in args and args['drop']): 3 | source_columns['OUTLIER'] = 'boolean' 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/datepart.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for target_col, date_part in args['dates'].items(): 3 | source_columns[f"{target_col}_{date_part}".upper()] = 'int' 4 | return 
source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/label_encode/label_encode.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns[f"{args['column']}_encoded".upper()] = "int" 3 | source_columns["all_values_array"] = "array" 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql: -------------------------------------------------------------------------------- 1 | {%- for col_name in stage_columns -%} 2 | SELECT 3 | '{{ col_name }}' AS LABEL 4 | ,SUM({{ col_name }}) AS LABEL_COUNT 5 | FROM {{ source_table }} 6 | {{ "UNION ALL" if not loop.last else "" }} 7 | {% endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | out_columns = {} 3 | out_columns[f"{args['agg_column']}_listagg"] = 'text' 4 | out_columns["NumTransactions"] = 'int' 5 | return out_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | EXTRACT({{date_part}} FROM {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datespine/datespine.py: 
-------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns[f"{args['date_col']}_spine_start"] = 'timestamp_ntz' 3 | source_columns[f"{args['date_col']}_spine_end"] = 'timestamp_ntz' 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/if_then/if_then.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | *, 3 | CASE 4 | {%- for condition in conditions %} 5 | {{"WHEN " + condition[0] }} THEN {{ condition[1] }} {% endfor %} 6 | ELSE {{ default }} 7 | END AS {{ cleanse_name(alias) }} 8 | FROM {{ source_table }} 9 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/postgresql/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/redshift/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() 
%} 3 | DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/bigquery/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{target_col}}, {{date_part}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/postgresql/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/redshift/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/snowflake/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | 
{%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/apply/apply.sql: -------------------------------------------------------------------------------- 1 | {# Placeholder code. Will be replaced by user supplied template #} 2 | {% if sql %} 3 | {{ sql }} 4 | {% else %} 5 | SELECT * FROM {{ source_table }} 6 | {% endif %} 7 | {{ raise_exception('Placeholder code must be replaced by user supplied template') }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/datetrunc.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for target_col, date_part in args['dates'].items(): 3 | source_columns[f"{target_col}_{date_part}".upper()] = source_columns[target_col.upper()] 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for column in args['input_columns']: 3 | for window in args['window_sizes']: 4 | source_columns[f"mean_{column}_{window}"] = 'float' 5 | return source_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/datediff.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | if 'alias' in args: 3 | source_columns[args['alias']] = 'int' 4 | else: 5 | source_columns[f"DIFF_{args['date_1']}_{args['date_2']}"] = 'int' 6 | return source_columns 7 | -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/transforms/lag/lag.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for column in args['columns']: 3 | for amount in args['amounts']: 4 | source_columns[f"lag_{column}_{amount}".upper()] = source_columns[column.upper()] 5 | return source_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/lead.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for column in args['columns']: 3 | for amount in args['amounts']: 4 | source_columns[f"lead_{column}_{amount}".upper()] = source_columns[column.upper()] 5 | return source_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql: -------------------------------------------------------------------------------- 1 | {%- for class, n in sample.items() %} 2 | SELECT * FROM 3 | (SELECT * FROM {{ source_table }} WHERE {{ sample_col }} = '{{ class }}') SAMPLE ({{ n }}{{' rows' if n > 1 else ''}}) 4 | {{ '' if loop.last else ' UNION ALL ' }} 5 | {%- endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql: -------------------------------------------------------------------------------- 1 | {%- set source_col_names = get_columns(source_table) -%} 2 | {%- set alias = cleanse_name(prefix) -%} 3 | SELECT 4 | {%- for column in source_col_names %} 5 | {{column}} AS {{ alias~'_'~column }}{{',' if not loop.last else ''}} 6 | {%- endfor %} 7 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql: 
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the replace_string transform.

    When a truthy alias is supplied, the replaced column is exposed under
    that alias as 'varchar'; otherwise a 'REPLACE_<source_col>' column of
    type 'text' is added. Mutates and returns *source_columns*.
    """
    alias = args.get('alias')
    if alias:
        source_columns[alias] = 'varchar'
    else:
        source_columns[f"REPLACE_{args['source_col']}"] = 'text'
    return source_columns
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the drop_columns transform.

    With 'include_cols', only those columns survive; otherwise every
    column not listed in 'exclude_cols' survives. Returns a new dict;
    *source_columns* is left untouched.
    """
    if 'include_cols' in args:
        keep = set(args['include_cols'])
        return {name: dtype for name, dtype in source_columns.items() if name in keep}
    drop = set(args['exclude_cols'])
    return {name: dtype for name, dtype in source_columns.items() if name not in drop}
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/bigquery/datediff.sql: -------------------------------------------------------------------------------- 1 | {%- if alias is defined -%} 2 | {%- set alias = cleanse_name(alias) -%} 3 | {%- else -%} 4 | {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} 5 | {%- endif -%} 6 | 7 | SELECT *, 8 | DATE_DIFF({{ date_1 }}, {{ date_2 }}, {{ date_part }}) as {{ alias }} 9 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/datediff.sql: -------------------------------------------------------------------------------- 1 | {%- if alias is defined -%} 2 | {%- set alias = cleanse_name(alias) -%} 3 | {%- else -%} 4 | {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} 5 | {%- endif -%} 6 | 7 | SELECT *, 8 | EXTRACT({{ date_part }} FROM DATE {{ date_1 }} - DATE {{ date_2 }}) AS {{ alias }} 9 | FROM {{ source_table }} -------------------------------------------------------------------------------- /bin/backup-source.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e; 4 | 5 | # zip up the repo 6 | zip -r ./rasgo-transforms.zip * 7 | 8 | # upload it to s3 9 | export AWS_DEFAULT_REGION=us-east-1 10 | aws s3 cp rasgo-transforms.zip s3://rasgo-source-backups/rasgo-transforms.zip --storage-class GLACIER_IR 11 | 12 | # clean up clean up everybody do your share 13 | rm ./rasgo-transforms.zip 14 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql: -------------------------------------------------------------------------------- 1 | with distinct_values as ( 2 | select array_agg(distinct {{ column }}) within group (order by {{ column }} asc) as all_values_array 
from {{ source_table }} 3 | ) 4 | select *, 5 | array_position({{ column }}::variant,all_values_array) as {{ column }}_encoded 6 | from distinct_values,{{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/new_columns/new_columns.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {%- for col_dict in calculated_columns %} 3 | {%- if 'alias' in col_dict -%} 4 | , {{ col_dict['calculated_column'] }} as {{ cleanse_name(col_dict['alias']) }} 5 | {%- else -%} 6 | , {{ col_dict['calculated_column'] }} 7 | {%- endif %} 8 | {%- endfor %} 9 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/snowflake/datediff.sql: -------------------------------------------------------------------------------- 1 | {%- if alias is defined -%} 2 | {%- set alias = cleanse_name(alias) -%} 3 | {%- else -%} 4 | {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} 5 | {%- endif -%} 6 | 7 | SELECT *, 8 | DATEDIFF({{ date_part }}, {{ date_1 }}, {{ date_2 }}) as {{ alias }} 9 | FROM {{ source_table }} 10 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | * 3 | FROM {{ source_table }} 4 | QUALIFY ROW_NUMBER() OVER ( 5 | PARTITION BY {%- for col in natural_key %} {{col}}{{"," if not loop.last else ""}} {%- endfor %} 6 | ORDER BY {%- for col in order_col %} {{col}}{{"," if not loop.last else ""}} {%- endfor %} {{order_method}} 7 | ) = 1 8 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql: 
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the rename transform.

    Every column present in args['renames'] is emitted under its new
    name; all other columns keep their original name. Column order and
    types are preserved. Returns a new dict.
    """
    renames = args['renames']
    return {renames.get(name, name): dtype for name, dtype in source_columns.items()}
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the linear_regression transform.

    Keeps the grouping columns (with their original types) and appends
    the regression outputs: Slope, Intercept, R2 (double) and the
    human-readable Formula string (varchar). Returns a new dict.
    """
    group_cols = set(args['group_by'])
    out_columns = {name: dtype for name, dtype in source_columns.items() if name in group_cols}
    out_columns.update(Slope='double', Intercept='double', R2='double', Formula='varchar')
    return out_columns
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the min_max_scaler transform.

    With overwrite_columns the scaled value replaces the original column
    (upper-cased name); otherwise a new '<column>_MIN_MAX_SCALED' float
    column is added. Mutates and returns *source_columns*.
    """
    overwrite = bool(args.get('overwrite_columns'))
    for col in args['columns_to_scale']:
        if overwrite:
            source_columns[col.upper()] = 'float'
        else:
            source_columns[f"{col}_MIN_MAX_SCALED"] = 'float'
    return source_columns
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the latest transform.

    Columns used for grouping or ordering pass through unchanged; every
    other column is renamed 'LATEST_<name>' (same type). Returns a new
    dict, preserving column order.
    """
    passthrough = set(args['group_by']) | set(args['order_by'])
    renamed = {}
    for name, dtype in source_columns.items():
        key = name if name in passthrough else f"LATEST_{name}"
        renamed[key] = dtype
    return renamed
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the join transform.

    The join table's columns (looked up in args['source_columns'] by
    args['join_table']) are merged in. With a truthy 'join_prefix' each
    joined column is added as '<PREFIX>_<name>' (prefix upper-cased) by
    mutating *source_columns*; otherwise a new merged dict is returned,
    with join-table columns overriding same-named source columns.
    """
    joined = args['source_columns'][args['join_table']]
    prefix = args.get('join_prefix')
    if not prefix:
        return {**source_columns, **joined}
    tag = prefix.upper()
    source_columns.update({f"{tag}_{name}": dtype for name, dtype in joined.items()})
    return source_columns
8 | arguments: 9 | none: 10 | type: none 11 | description: this transform does not take any arguments 12 | example_code: | 13 | ds = rasgo.get.dataset(id) 14 | 15 | ds2 = ds.uppercase_columns() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/macros/prior.sql: -------------------------------------------------------------------------------- 1 | {% macro prior(metric_name, dimensions, calc_config) %} 2 | {% set alias = metric_name + '_' + calc_config.alias if calc_config.alias is defined else metric_name + '_lag_' + calc_config.interval|string %} 3 | lag( 4 | {{ metric_name }}, {{ calc_config.interval }} 5 | ) over ( 6 | {% if dimensions %} 7 | partition by {{ dimensions | join(", ") }} 8 | {% endif %} 9 | order by period_min 10 | ) as {{ alias }} 11 | {% endmacro %} 12 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/filter/filter.sql: -------------------------------------------------------------------------------- 1 | {% from 'filter.sql' import get_filter_statement %} 2 | {% if items is not defined %} 3 | {% if filter_statements is not defined %} 4 | {{ raise_exception('items is empty: there are no filters to apply') }} 5 | {% else %} 6 | {% set items = filter_statements %} 7 | {% endif %} 8 | {% endif %} 9 | 10 | select * 11 | from {{ source_table }} 12 | where true and 13 | {{ get_filter_statement(items) | indent }} 14 | -------------------------------------------------------------------------------- /.github/workflows/backup.yaml: -------------------------------------------------------------------------------- 1 | name: Backup Source to S3 2 | 3 | on: 4 | schedule: 5 | # Every day at 1am EST 6 | - cron: '0 6 * * *' 7 | 8 | jobs: 9 | backup-source: 10 | runs-on: ubuntu-latest 11 | env: 12 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_BACKUPS_ACCESS_KEY }} 13 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_BACKUPS_SECRET_ACCESS_KEY }} 14 | 
15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Push to S3 19 | run: ./bin/backup-source.sh 20 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql: -------------------------------------------------------------------------------- 1 | {%- set source_col_names = get_columns(source_table) -%} 2 | 3 | SELECT 4 | {%- for target_col, new_name in renames.items() %} 5 | {{target_col}} AS {{new_name}}{{ ", " if not loop.last else "" }} 6 | {%- endfor -%} 7 | {%- set renames = (renames|join(',')|upper).split(',') -%} 8 | {%- for col in source_col_names %} 9 | {%- if col|upper not in renames %}, {{col|upper}}{%- endif -%} 10 | {% endfor %} 11 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql: -------------------------------------------------------------------------------- 1 | {%- for i in range((stage|length) - 1) -%} 2 | SELECT 3 | '{{ stage[i] }}_' || CAST({{ stage[i] }} AS STRING) AS SOURCE_NODE, 4 | '{{ stage[i+1] }}_' || CAST({{ stage[i+1] }} AS STRING) AS DEST_NODE, 5 | COUNT(*) AS WIDTH 6 | FROM {{ source_table }} 7 | GROUP BY 8 | SOURCE_NODE, 9 | DEST_NODE 10 | HAVING 11 | SOURCE_NODE IS NOT NULL AND DEST_NODE IS NOT NULL 12 | {{ "UNION ALL" if not loop.last else "" }} 13 | {% endfor %} 14 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part 
-%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | {{ date }} + INTERVAL {{ offset }} {{ date_part }} AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/snowflake/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | DATEADD({{ date_part }}, {{ offset }}, {{ date }}) AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/order/order.yaml: -------------------------------------------------------------------------------- 1 | name: order 2 | tags: 3 | - row 4 | - conditional 5 | - data_cleaning 6 | description: Order a dataset by specified columns, in a specified order 7 | arguments: 8 | order_by: 9 | type: column_value_dict 10 | description: dict where the keys are column names and the values are the order_method (ASC or DESC) 11 | example_code: | 12 | ds = rasgo.get.dataset(id) 13 | 14 | ds2 = ds.order(order_by={'DS_WEATHER_ICON':'ASC', 'DS_DAILY_HIGH_TEMP':'DESC'}) 15 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/postgresql/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set 
untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | {{ date }} + INTERVAL '{{ offset }} {{ date_part }}' AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/redshift/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | {{ date }} + INTERVAL '{{ offset }} {{ date_part }}' AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql: -------------------------------------------------------------------------------- 1 | WITH order_detail as 2 | (SELECT {{transaction_id}}, 3 | listagg({{agg_column}}, '{{sep}}') 4 | WITHIN group (order by {{agg_column}}) as {{agg_column}}_listagg, 5 | COUNT({{agg_column}}) as num_products 6 | FROM {{ source_table }} 7 | GROUP BY {{transaction_id}} ) 8 | 9 | SELECT {{agg_column}}_listagg, count({{transaction_id}}) as NumTransactions 10 | FROM order_detail 11 | where num_products > 1 12 | GROUP BY {{agg_column}}_listagg 13 | order by count({{transaction_id}}) desc -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/bigquery/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if 
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the rank transform.

    With overwrite_columns the ranked source columns are dropped;
    otherwise all source columns are kept. A single integer rank column
    is then appended, named by 'alias' when supplied (truthy) or
    'RANK_<col1>_<col2>_...' by default. Returns a new dict.
    """
    if args.get('overwrite_columns'):
        ranked = set(args['rank_columns'])
        out_columns = {name: dtype for name, dtype in source_columns.items() if name not in ranked}
    else:
        out_columns = dict(source_columns)
    alias = args.get('alias') or f"RANK_{'_'.join(args['rank_columns'])}"
    out_columns[alias] = 'integer'
    return out_columns
/rasgotransforms/rasgotransforms/transforms/correlation/correlation.yaml: -------------------------------------------------------------------------------- 1 | name: correlation 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: heatmap_discrete 6 | tags: 7 | description: Run pairwise correlation on all numeric columns in the source_table 8 | arguments: 9 | rows_to_sample: 10 | type: value 11 | is_optional: true 12 | description: number of rows to sample from the table before calculating correlation 13 | example_code: | 14 | ds = rasgo.get.dataset(id) 15 | 16 | ds2 = ds.correlation(rows_to_sample=1000) 17 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/bin/bin.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {% if type == 'ntile' %} 3 | ntile({{bin_count}}) OVER (ORDER BY {{column}}) AS {{column}}_{{bin_count}}_NTB 4 | {% elif type == 'equalwidth' %} 5 | width_bucket({{column}}, 6 | (SELECT MIN({{column}}) FROM {{source_table}}), 7 | (SELECT MAX({{column}}) FROM {{source_table}}), 8 | {{bin_count}}) AS {{column}}_{{bin_count}}_EWB 9 | {% else %} 10 | {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} 11 | {% endif %} 12 | FROM {{ source_table }} 13 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/agg_dict.sql: -------------------------------------------------------------------------------- 1 | {%- for col, aggs in agg_dict.items() %} 2 | {%- set outer_loop = loop -%} 3 | {%- for agg in aggs %} 4 | {%- if ' DISTINCT' in agg|upper %} 5 | {{ agg|upper|replace(" DISTINCT", "") }}(DISTINCT {{ col }}) as {{ col ~ '_' ~ agg|upper|replace(" DISTINCT", "") ~ 'DISTINCT'}}{{ '' if loop.last and outer_loop.last else ',' }} 6 | {%- else %} 7 | {{ agg }}({{ col }}) as {{ col + '_' + agg }}{{ '' if 
loop.last and outer_loop.last else ',' }} 8 | {%- endif %} 9 | {%- endfor -%} 10 | {%- endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/macros/rolling.sql: -------------------------------------------------------------------------------- 1 | {% macro rolling(metric_name, dimensions, calc_config) %} 2 | {% set alias = metric_name + '_' + calc_config.alias if calc_config.alias is defined else metric_name + calc_config.aggregate + '_last_' + calc_config.interval|string + '_periods' %} 3 | {{ calc_config.aggregate }}({{ metric_name }}) 4 | over ( 5 | {% if dimensions -%} 6 | partition by {{ dimensions | join(", ") }} 7 | {% endif -%} 8 | order by period_min 9 | rows between {{ calc_config.interval - 1 }} preceding and current row 10 | ) as {{ alias }} 11 | {% endmacro %} 12 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.yaml: -------------------------------------------------------------------------------- 1 | name: sample_class 2 | tags: 3 | - row 4 | - data_cleaning 5 | - conditional 6 | description: Sample n rows for each value of a column 7 | arguments: 8 | sample_col: 9 | type: column 10 | description: The column for which you want to sample 11 | sample: 12 | type: dict 13 | description: Value of column as a key, n rows to be sampled as values 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.sample_class(sample_col='BINARY_TARGET_COLUMNNAME', sample={'1':15000, '0':60000}) 18 | ds2.preview() -------------------------------------------------------------------------------- /.github/workflows/run_tests.yaml: -------------------------------------------------------------------------------- 1 | name: Run RasgoTransforms Tests 2 | 3 | on: push 4 | 5 | jobs: 6 | run-tests: 7 | runs-on: ubuntu-latest 8 | 9 | container: 10 | image: "python:3.7" 11 | 12 | steps: 13 | - uses: 
def infer_columns(args, source_columns) -> dict:
    """Return the output column->type mapping for the to_date transform.

    Every column listed in args['dates'] that exists in source_columns
    becomes type 'date': in place when args['overwrite_columns'] is
    truthy, otherwise as an additional '<COLUMN>_DATE' column alongside
    the original.

    :param args: transform arguments; reads 'dates' (list of column
        names) and optional 'overwrite_columns' (bool-ish flag)
    :param source_columns: mapping of input column name -> type
    :return: mapping of output column name -> type
    """
    # Equivalent to the original "'key' in args and args['key']" check.
    overwrite_columns = bool(args.get('overwrite_columns'))
    out_columns = source_columns.copy()
    # Iterate keys only; the original also unpacked the (unused) type.
    for column_name in source_columns:
        if column_name in args['dates']:
            if overwrite_columns:
                out_columns[column_name] = 'date'
            else:
                out_columns[f"{column_name}_DATE"] = 'date'
    # Bug fix: the original returned the unmodified source_columns,
    # throwing away every change accumulated in out_columns.
    return out_columns
-------------------------------------------------------------------------------- 1 | name: describe 2 | tags: 3 | - table 4 | - math 5 | description: | 6 | Describes the dataset using a consistent set of metrics, based on data type. 7 | Numeric: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MEAN, STD_DEV, MIN, _25_PERCENTILE, _50_PERCENTILE, _75_PERCENTILE, MAX 8 | Other: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MIN, MAX 9 | arguments: 10 | none: 11 | type: none 12 | description: this transform does not take any arguments 13 | example_code: | 14 | ds = rasgo.get.dataset(id) 15 | 16 | ds.describe().to_df() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rename/rename.yaml: -------------------------------------------------------------------------------- 1 | name: rename 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | description: | 7 | Rename columns by passing a renames dict. 8 | arguments: 9 | renames: 10 | type: column_value_dict 11 | description: A dict representing each existing column to be renamed and its corresponding new name. 
12 | example_code: | 13 | ds = rasgo.get.dataset(dataset_id) 14 | ds2 = ds.rename(renames={ 15 | 'DS_WEATHER_ICON': 'Weather', 16 | 'DS_DAILY_HIGH_TEMP': 'High_Temp', 17 | 'DS_DAILY_LOW_TEMP': 'Low_Temp' 18 | }) 19 | ds2.preview() -------------------------------------------------------------------------------- /docs/select.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # select 4 | 5 | select * from a table 6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ---- | ---- | ------------------------------------------ | ----------- | 12 | | none | none | this transform does not take any arguments | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.select() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/select/select.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/math/math.sql: -------------------------------------------------------------------------------- 1 | {%- if names -%} 2 | {%- if names|length != math_ops|length -%} 3 | 4 | {{ raise_exception('Provide a new column alias for each math operation') }} 5 | 6 | {%- elif names|length == math_ops|length -%} 7 | 8 | SELECT * 9 | {%- for math_op in math_ops %} 10 | , {{math_op}} as {{cleanse_name(names[loop.index-1])}} 11 | {%- endfor %} 12 | FROM {{source_table}} 13 | 14 | {%- endif -%} 15 | {%- else -%} 16 | 17 | SELECT * 18 | {%- for math_op in math_ops %} 19 | , {{math_op}} as {{cleanse_name(math_op)}} 20 | {%- endfor %} 21 | FROM {{source_table}} 22 | 23 | {%- endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/if_then/if_then.py: 
def infer_columns(args, source_columns) -> dict:
    """Infer the output column/type mapping for the if_then transform.

    Adds the new conditional column (args['alias'], uppercased) to
    source_columns. Its type is taken from the default value when that
    names an existing column, else from the condition's result column
    (args['conditions'][1]), else from the Python type of the default
    literal.

    :param args: transform arguments; reads 'default', 'conditions',
        and 'alias'
    :param source_columns: mapping of column name -> type; mutated in
        place (as in the original) and returned
    """
    default = args['default']
    # Guard with isinstance: numeric defaults have no .upper(), so the
    # original raised AttributeError here and never reached the
    # int/float branches below.
    if isinstance(default, str) and default.upper() in source_columns:
        # Bug fix: index with the uppercased name that was tested above;
        # the original indexed the raw name and raised KeyError whenever
        # the default was not already uppercase.
        output_type = source_columns[default.upper()]
    elif isinstance(args['conditions'][1], str) and args['conditions'][1].upper() in source_columns:
        output_type = source_columns[args['conditions'][1].upper()]
    elif type(default) is int:
        output_type = 'integer'
    elif type(default) is float:
        output_type = 'float'
    else:
        output_type = 'text'
    source_columns[args['alias'].upper()] = output_type
    return source_columns
7 | 8 | arguments: 9 | column: 10 | type: column 11 | description: Column name to label encode 12 | example_code: | 13 | ds = rasgo.get.dataset(id) 14 | 15 | ds2 = ds.label_encode(column='WEATHER_DESCRIPTION') 16 | ds2.preview() 17 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/math/snowflake/math.sql: -------------------------------------------------------------------------------- 1 | {%- if names -%} 2 | {%- if names|length != math_ops|length -%} 3 | 4 | {{ raise_exception('Provide a new column alias for each math operation') }} 5 | 6 | {%- elif names|length == math_ops|length -%} 7 | 8 | SELECT * 9 | {%- for math_op in math_ops %} 10 | , {{math_op}} as {{cleanse_name(names[loop.index-1])}} 11 | {%- endfor %} 12 | FROM {{source_table}} 13 | 14 | {%- endif -%} 15 | {%- else -%} 16 | 17 | SELECT * 18 | {%- for math_op in math_ops %} 19 | , {{math_op}} as {{cleanse_name(math_op)}} 20 | {%- endfor %} 21 | FROM {{source_table}} 22 | 23 | {%- endif -%} -------------------------------------------------------------------------------- /docs/prefix.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # prefix 4 | 5 | Add a prefix to each column name 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------ | ----- | ------------------------------------ | ----------- | 11 | | prefix | value | text to prefix each column name with | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(74) 18 | 19 | ds2 = ds.prefix(prefix='PRODUCT') 20 | 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /docs/suffix.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | # suffix 4 | 5 | Add a suffix to each column name 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------ | ----- | ------------------------------------ | ----------- | 11 | | suffix | value | text to suffix each column name with | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(74) 18 | 19 | ds2 = ds.suffix(suffix='PRODUCT') 20 | 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql: -------------------------------------------------------------------------------- 1 | {%- for amount in window_sizes -%} 2 | {%- if amount < 0 -%} 3 | {{ raise_exception('Cannot use negative values for a moving average. 
Please only pass positive values in `window_sizes`.') }} 4 | {%- endif -%} 5 | {%- endfor -%} 6 | SELECT * 7 | {%- for column in input_columns -%} 8 | {%- for window in window_sizes -%} 9 | , avg({{column}}) OVER(PARTITION BY {{partition | join(", ")}} ORDER BY {{order_by | join(", ")}} ROWS BETWEEN {{window - 1}} PRECEDING AND CURRENT ROW) AS mean_{{column}}_{{window}} 10 | {%- endfor %} 11 | {%- endfor %} 12 | FROM {{ source_table }} 13 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/union/union.sql: -------------------------------------------------------------------------------- 1 | {# Get all Columns in Source Table #} 2 | {%- set source_col_names = get_columns(source_table) -%} 3 | 4 | {# Get all columns in Inputted Source #} 5 | {%- set other_source_col_names = get_columns(dataset2) -%} 6 | 7 | {# Get Unique Columns Across Both Datasets #} 8 | {%- set union_cols = source_col_names.keys()|list + other_source_col_names.keys()|list -%} 9 | {%- set union_cols = union_cols | unique | list -%} 10 | 11 | {# Generate Union Query #} 12 | SELECT {{ union_cols | join(', ') }} FROM {{ dataset2 }} 13 | UNION {{ 'ALL' if keep_dupes else '' }} 14 | SELECT {{ union_cols | join(', ') }} FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/cumulative_agg/cumulative_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for col, aggs in aggregations.items() -%} 3 | {%- for agg in aggs %} 4 | , {{ agg }}({{ col }}) OVER( 5 | {%- if group_by %} 6 | PARTITION BY {{ group_by | join(", ") }} 7 | {% endif -%} 8 | ORDER BY {{ order_by | join(", ") }} 9 | {% if direction and direction.lower() == 'forward' -%} 10 | ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING 11 | {% else -%} 12 | ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW 13 | {%- endif -%} 14 | ) as 
{{ cleanse_name(agg + '_' + col) }} 15 | {%- endfor -%} 16 | {%- endfor %} 17 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lag/lag.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | SELECT *, 8 | {%- for col in columns -%} 9 | {%- for amount in amounts %} 10 | lag({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as Lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 11 | {%- endfor -%} 12 | {{ ", " if not loop.last else "" }} 13 | {%- endfor %} 14 | from {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/lead.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | SELECT *, 8 | {%- for col in columns -%} 9 | {%- for amount in amounts %} 10 | lead({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 11 | {%- endfor -%} 12 | {{ ", " if not loop.last else "" }} 13 | {%- endfor %} 14 | from {{ source_table }} 15 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/bin/bigquery/bin.sql: 
-------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- if type == 'ntile' %} 3 | ntile({{bin_count}}) OVER (ORDER BY {{column}}) AS {{column}}_{{bin_count}}_NTB 4 | {%- elif type == 'equalwidth' %} 5 | RANGE_BUCKET( 6 | {{ column }}, 7 | GENERATE_ARRAY( 8 | (SELECT MIN({{ column }}) FROM {{ source_table }}) 9 | ,(SELECT MAX({{ column }}) FROM {{ source_table }}) 10 | ,(SELECT (MAX({{ column }}) - MIN({{ column }}))/20 FROM {{ source_table }}) 11 | ) 12 | ) AS {{column}}_{{bin_count}}_EWB 13 | {%- else %} 14 | {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} 15 | {%- endif %} 16 | FROM {{ source_table }} 17 | -------------------------------------------------------------------------------- /docs/accelerators/baby_name_analysis.md: -------------------------------------------------------------------------------- 1 | # Baby Name Analysis Accelerator 2 | 3 | Build an introductory dataset using Rasgo accelerators! 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ----------------------- | ---- | -------------------------------------- | ----------- | 9 | | annual_baby_names_table | | The Rasgo Community baby names dataset | | 10 | | baby_name | | Input your name here! 
| | 11 | 12 | 13 | ## Source Code 14 | 15 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/baby_name_analysis.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/latest/latest.sql: -------------------------------------------------------------------------------- 1 | {%- set source_col_names = get_columns(source_table) -%} 2 | 3 | SELECT 4 | {%- for group_item in group_by %} 5 | {{ group_item }}, 6 | {%- endfor -%} 7 | 8 | {%- for order_item in order_by %} 9 | {{ order_item }}, 10 | {%- endfor -%} 11 | 12 | {%- for source_col in source_col_names %} 13 | {%- if source_col not in group_by and source_col not in order_by -%} 14 | LAST_VALUE({{ source_col }} {{ nulls }} NULLS) OVER (PARTITION BY {{ group_by | join(', ') }} ORDER BY {{ order_by | join(', ') }} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS LATEST_{{ source_col }}{{ ', ' if not loop.last else ' ' }} 15 | {%- endif -%} 16 | {%- endfor -%} 17 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/cast/cast.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns == true -%} 2 | 3 | {%- set source_columns = get_columns(source_table) -%} 4 | {%- set untouched_cols = source_columns | reject('in', casts) -%} 5 | 6 | SELECT {% for col in untouched_cols %}{{ col }},{% endfor %} 7 | {%- for target_col, type in casts.items() %} 8 | CAST({{target_col}} AS {{type}}) AS {{target_col}}{{", " if not loop.last else ""}} 9 | {%- endfor %} 10 | FROM {{ source_table }} 11 | 12 | {%- else -%} 13 | 14 | SELECT * 15 | {%- for target_col, type in casts.items() %} 16 | , CAST({{target_col}} AS {{type}}) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}} 17 | {%- endfor %} 18 | FROM {{ source_table }} 19 | 20 | {%- 
endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/entropy/entropy.yaml: -------------------------------------------------------------------------------- 1 | name: entropy 2 | tags: 3 | - aggregate 4 | - reshape 5 | description: | 6 | Entropy is a way to calculate the amount of "disorder" in a non-numeric column. Lower entropy indicates less disorder, while higher entropy indicates more. 7 | 8 | The calculation for Shannon's entropy is: H = -Sum[ P(xi) * log2( P(xi)) ] 9 | arguments: 10 | group_by: 11 | type: column_list 12 | description: Columns to group by 13 | columns: 14 | type: column_list 15 | description: Columns to calculate entropy on. Must be non-numeric. 16 | example_code: | 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.entropy(group_by=['FIPS'], columns=['NAME', 'ADDRESS']) 20 | ds2.preview() -------------------------------------------------------------------------------- /docs/uppercase_columns.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # rename 4 | 5 | Rename columns by converting all names to uppercase and removing non-SQL safe characters. 
6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ---- | ---- | ------------------------------------------ | ----------- | 12 | | none | none | this transform does not take any arguments | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.uppercase_columns() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/uppercase_columns/snowflake/uppercase_columns.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.yaml: -------------------------------------------------------------------------------- 1 | name: one_hot_encode 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: One hot encode a column. Create a null value flag for the column if any of the values are NULL. 6 | arguments: 7 | column: 8 | type: column_or_expression 9 | description: Column name to one-hot encode. Supports a calculated field via a valid SQL function. 
class PyRasgoEnvironment(Enum):
    """
    Different Environment for connecting PyRasgo to
    """

    # Member values are the API host names (no scheme) for each
    # deployment target a PyRasgo client can point at.
    PRODUCTION = "api.rasgoml.com"
    STAGING = "staging-rasgo-proxy.herokuapp.com"
    LOCAL = "localhost"  # local development server
multiple DWs in the API 22 | # we'll need to refactor the functions that consume this 23 | RASGO_DATAWAREHOUSE = 'snowflake' 24 | 25 | COMMUNITY_ORGANIZATION_ID: int = 1 26 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unions/unions.sql: -------------------------------------------------------------------------------- 1 | {# Get all Columns in Source Table #} 2 | {%- set source_col_names = get_columns(source_table) -%} 3 | {% set ns = namespace(union_columns=source_col_names.keys()) %} 4 | 5 | {%- for utable in union_tables -%} 6 | {%- set utable_cols = get_columns(utable) -%} 7 | {%- set ns.union_columns = ns.union_columns|list|select("in", utable_cols.keys()|list) -%} 8 | {%- endfor -%} 9 | 10 | {%- set columns_to_select = ns.union_columns|join(', ') -%} 11 | 12 | {# Generate Union Query #} 13 | SELECT {{ columns_to_select }} 14 | FROM {{ source_table }} 15 | {%- for u_table in union_tables %} 16 | UNION {{ 'ALL' if not remove_duplicates else '' }} 17 | SELECT {{ columns_to_select }} 18 | FROM {{ u_table }} 19 | {%- endfor -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/macros/period_to_date.sql: -------------------------------------------------------------------------------- 1 | {% macro period_to_date(metric_name, dimensions, calc_config) %} 2 | {% set alias = metric_name + '_' + calc_config.alias if calc_config.alias is defined else metric_name + '_' + calc_config.aggregate + '_' + calc_config.period %} 3 | {{ calc_config.aggregate }}({{ metric_name }}) 4 | over ( 5 | partition by 6 | {% if dw_type() == 'bigquery' %} 7 | date_trunc(period_min, {{ calc_config.period }}) 8 | {% else %} 9 | date_trunc('{{ calc_config.period }}', period_min) 10 | {% endif %} 11 | {% if dimensions %} 12 | , {{ dimensions | join(", ") }} 13 | {% endif %} 14 | order by period_min 15 | rows between unbounded preceding and current row 16 
| ) as {{ alias }} 17 | {% endmacro %} 18 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.yaml: -------------------------------------------------------------------------------- 1 | name: aggregate 2 | tags: 3 | - table 4 | - reshape 5 | - aggregate 6 | description: Groups rows by the group_by items applying aggregations functions for the resulting group and selected columns 7 | arguments: 8 | group_by: 9 | type: column_list 10 | description: Columns to group by 11 | aggregations: 12 | type: agg_dict 13 | description: Aggregations to apply for other columns. Dict keys are column names, and values are a list of aggegations to apply for that column. 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.aggregate(group_by=['FIPS'], aggregations={ 18 | 'COL_1': ['SUM', 'AVG'], 19 | 'COL_2': ['SUM', 'AVG'] 20 | }) 21 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample/sample.yaml: -------------------------------------------------------------------------------- 1 | name: sample 2 | tags: 3 | - row 4 | - math 5 | - conditional 6 | description: Take a sample of a dataset using a specific number of rows or a probability that each row will be selected 7 | arguments: 8 | num_rows: 9 | type: value 10 | description: To sample using a probability of selecting each row, your num_rows should be a decimal less than 1. Otherwise, pass an integer value for number of rows to keep. 11 | filters: 12 | type: filter_list 13 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
14 | is_optional: true 15 | example_code: | 16 | ds = rasgo.get.dataset(id) 17 | 18 | ds2 = ds.sample(num_rows=1000) 19 | ds2.preview() -------------------------------------------------------------------------------- /docs/accelerators/web_traffic_channels.md: -------------------------------------------------------------------------------- 1 | # Google Analytics Web Traffic Channels 2 | 3 | The Web Traffic Channels analysis uses Google Analytics Web Traffic data, including bounce rate, conversion rate, new users, and session duration to create visualizations comparing page performance by channel. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ---------------------------------- | ------- | ---------------------------------- | ----------- | 9 | | google_analytics_web_traffic_table | dataset | Google Analytics Web Traffic Table | | 10 | 11 | 12 | ## Source Code 13 | 14 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/web_traffic_channels.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/datepart.yaml: -------------------------------------------------------------------------------- 1 | name: datepart 2 | tags: 3 | - column 4 | - date_time 5 | description: | 6 | Extracts a specific part of a date column. For example, if the input is '2021-01-01', you can ask for the year and get back 2021. 7 | 8 | An exhaustive list of valid date parts can be [found here](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts). 
9 | arguments: 10 | dates: 11 | type: datepart_dict 12 | description: dict where keys are names of columns you want to date part and values are the desired date part grain 13 | example_code: | 14 | ds = rasgo.get.dataset(id) 15 | 16 | ds2 = ds.datepart(dates={ 17 | 'DATE_STRING':'year', 18 | 'DATE2_STR':'month' 19 | }) 20 | ds2.preview() 21 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.yaml: -------------------------------------------------------------------------------- 1 | name: target_encode 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: | 6 | Encode a categorical column with the average value of a target column for the corresponding value of the categorical column. 7 | 8 | See scikit-learn's [TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html) for full documentation. 9 | 10 | arguments: 11 | column: 12 | type: column 13 | description: Column name to target encode 14 | target: 15 | type: column 16 | description: Numeric target column to use to create averages 17 | example_code: | 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.target_encode(column='WEATHER_DESCRIPTION', target='DAILY_HIGH_TEMP') 21 | ds2.preview() 22 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql: -------------------------------------------------------------------------------- 1 | {%- if distinct -%} 2 | {%- set agg_thing = 'DISTINCT '~agg_column -%} 3 | {%- else -%} 4 | {%- set agg_thing = agg_column -%} 5 | {%- endif -%} 6 | {%- set rule_combos = [] -%} 7 | {%- for r in rules -%} 8 | {%- if loop.first -%} 9 | {%- set rule_combos = rule_combos.append(r) -%} 10 | {%- else -%} 11 | {%- set new_rule = rule_combos[loop.index-2] ~ ' AND ' ~ r -%} 12 | {%- set rule_combos = rule_combos.append(new_rule) -%} 13 | {%- endif -%} 
14 | {%- endfor -%} 15 | {%- for rule in rule_combos -%} 16 | SELECT '{{ rule|replace("'","") }}' AS rule_desc, {{ agg }}({{ agg_thing }}) as QTY 17 | FROM {{ source_table }} 18 | WHERE {{ rule }} 19 | {% if not loop.last %} 20 | UNION ALL 21 | {% endif %} 22 | {%- endfor -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for col, aggs in aggregations.items() -%} 3 | {%- for agg in aggs -%} 4 | {%- for offset in offsets %} 5 | {% set normalized_offset = -offset %} 6 | , {{ agg }}({{ col }}) OVER( 7 | {%- if group_by %} 8 | PARTITION BY {{ group_by | join(", ") }} 9 | {% endif -%} 10 | ORDER BY {{ order_by | join(", ") }} 11 | {% if normalized_offset > 0 -%} 12 | ROWS BETWEEN CURRENT ROW AND {{ normalized_offset }} FOLLOWING 13 | {% else -%} 14 | ROWS BETWEEN {{ normalized_offset|abs }} PRECEDING AND CURRENT ROW 15 | {% endif -%} 16 | ) as {{ cleanse_name(agg + '_' + col + '_' + offset|string) }} 17 | {%- endfor -%} 18 | {%- endfor -%} 19 | {%- endfor %} 20 | FROM {{ source_table }} -------------------------------------------------------------------------------- /docs/correlation.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # correlation 4 | 5 | Run pairwise correlation on all numeric columns in the source_table 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------------- | ----- | ---------------------------------------------------------------------- | ----------- | 11 | | rows_to_sample | value | number of rows to sample from the table before calculating correlation | True | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.correlation(rows_to_sample=1000) 20 | ds2.preview() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed 
url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /docs/profile_column.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # profile_column 4 | 5 | ## Analyze the distinct values in a column 6 | 7 | ### Required Inputs 8 | - Column: the column you want to profile 9 | 10 | ### Notes 11 | - Only supports profiling one column at a time 12 | 13 | 14 | ## Parameters 15 | 16 | | Name | Type | Description | Is Optional | 17 | | ----------- | ------ | ------------------------------ | ----------- | 18 | | column_name | column | The column you want to profile | | 19 | 20 | 21 | ## Example 22 | 23 | ```python 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds.profile_column(column_name = 'IMPORTANTCOLUMN') 27 | ``` 28 | 29 | ## Source Code 30 | 31 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/profile_column/profile_column.sql" %} 32 | 33 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datarobot_score/datarobot_score.sql: -------------------------------------------------------------------------------- 1 | SELECT {{ include_cols|join(',') }}, 2 | 3 | {%- if num_explains is defined and threshold_low is defined and threshold_high is defined -%} 4 | S:score AS PREDICTION 5 | {%- set function_call = '(OBJECT_CONSTRUCT_KEEP_NULL(*),' ~ num_explains ~ ',' ~ threshold_low ~ ',' ~ threshold_high ~ ')' %} 6 | {% for i in range(num_explains) -%} 7 | ,CONCAT(S:explanations[{{ i }}].featureName, '=', S:explanations[{{ i }}].featureValue, ' (', S:explanations[{{ i }}].strength, ')') AS TOP{{ i+1 }}_INFLUENCING_FACTOR 8 | {% endfor -%} 9 | {%- else -%} 10 | S AS PREDICTION 11 | {% set function_call = 
'(OBJECT_CONSTRUCT_KEEP_NULL(*))' %} 12 | {%- endif %} 13 | FROM ( 14 | SELECT *, 15 | {{ function_name }}{{ function_call }} AS S 16 | FROM {{ source_table }} ) -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/new_columns/new_columns.yaml: -------------------------------------------------------------------------------- 1 | name: new_columns 2 | tags: 3 | - column 4 | - math 5 | description: | 6 | ## Build new columns, using SQL formulas. 7 | 8 | ### Required Inputs 9 | - Calculated Column: the formula for the new column you want to build 10 | 11 | ### Optional Inputs 12 | - Alias: name for your columns 13 | 14 | ### Notes 15 | - Supports any SQL column functions that are compatible with your data warehouse 16 | 17 | arguments: 18 | calculated_columns: 19 | type: calculated_column_list 20 | description: List of SQL formulas to generate new columns 21 | example_code: | 22 | ds2 = ds.new_columns( 23 | calculated_columns={ 24 | calculated_column: 'POWER(COLUMN_NAME, 3)', 25 | alias: 'COLUMN_NAME_Cubed' 26 | } 27 | ) 28 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/apply/apply.yaml: -------------------------------------------------------------------------------- 1 | name: apply 2 | tags: 3 | - table 4 | - custom 5 | - reshape 6 | - aggregate 7 | description: A transform that accepts a custom template to execute.
Must use the sql template argument `source_table` to reference the Rasgo dataset which will serve as the base of any SELECT 8 | arguments: 9 | sql: 10 | type: custom 11 | description: The custom SQL transform template to apply 12 | example_code: | 13 | ds = rasgo.get.dataset(id) 14 | 15 | ds2 = ds.apply( 16 | sql='SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = I17' 17 | ) 18 | ds2.preview() 19 | 20 | # passing in custom arguments 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.apply( 24 | sql="SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = '{{ my_value }}'", 25 | my_value="I17" 26 | ) 27 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/cast/cast.yaml: -------------------------------------------------------------------------------- 1 | name: cast 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | description: | 7 | Cast selected columns to a new type 8 | arguments: 9 | casts: 10 | type: cast_value_dict 11 | description: A dict where the keys are columns and the values are the new type to cast them to. 12 | overwrite_columns: 13 | type: boolean 14 | is_optional: true 15 | description: to overwrite column names with the new casted column, use 'true'. otherwise, use 'false'. defaults to 'false'. 16 | example_code: | 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds_casted = ds.cast( 20 | casts={ 21 | 'DS_WEATHER_ICON':'INT', 22 | 'DS_DAILY_HIGH_TEMP':'STRING', 23 | 'DS_DAILY_LOW_TEMP':'INT' 24 | }, 25 | overwrite_columns=True 26 | ) 27 | 28 | ds_casted.preview() 29 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/filter/filter.yaml: -------------------------------------------------------------------------------- 1 | name: filter 2 | tags: 3 | - row 4 | - data_cleaning 5 | - conditional 6 | description: | 7 | Filter the dataset. Supports two types of filters: 8 | 1. 
Comparison filters, which compare the values in a column with a value 9 | 2. Advanced filters, which support full SQL strings for custom filtering logic 10 | arguments: 11 | items: 12 | type: filter_list 13 | description: list of dictionaries representing filters 14 | example_code: | 15 | ds = rasgo.get.dataset(74) 16 | 17 | # comma separated list of 'WHERE' clauses 18 | ds2 = ds.filter(items=['PRODUCTKEY < 500']) 19 | ds2.preview() 20 | 21 | # full filtering with a column, operator, and comparison value 22 | ds3 = ds.filter(items=[{'column_name':'PRODUCTKEY', 'operator':'>', 'comparison_value':'101'}]) 23 | ds3.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/latest/latest.yaml: -------------------------------------------------------------------------------- 1 | name: latest 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | - time_series 7 | description: Impute missing values in ALL columns with the latest value seen in rows prior 8 | arguments: 9 | group_by: 10 | type: column_list 11 | description: List of columns to perform the imputation "within" 12 | order_by: 13 | type: column_list 14 | description: List of columns to sort ascending, in order to find the last known value for imputation 15 | nulls: 16 | type: string 17 | description: Pass either 'ignore' or 'respect' to determine whether nulls should be ignored or not during imputation. 18 | 19 | example_code: | 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.latest( 23 | group_by=['FIPS'], 24 | order_by=['DATE'], 25 | nulls='ignore') 26 | 27 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/text_to_sql/text_to_sql.yaml: -------------------------------------------------------------------------------- 1 | name: text_to_sql 2 | tags: 3 | - custom 4 | description: | 5 | ## Text to SQL, powered by OpenAI. 
6 | ### Required Inputs 7 | - Text: a prompt describing the SQL query that you want OpenAI to generate for you. Add as much context as possible to help OpenAI generate a useful query. Avoid using relative date terms like "last year" because OpenAI doesn't have any knowledge past 2021. 8 | arguments: 9 | text: 10 | type: string-long 11 | description: | 12 | Text description of the query you want to generate. 13 | Example: total revenue for the Southwest region in 2021 14 | example_code: | 15 | ds = rasgo.get.dataset(fqtn='DB.SCHEMA.IOWA_LIQUOR_SALES') 16 | 17 | ds2 = ds.text_to_sql( 18 | text='total bottles sold in Des Moines last year' 19 | ) 20 | ds2.sql() 21 | ds2.preview() 22 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/histogram/histogram.yaml: -------------------------------------------------------------------------------- 1 | name: histogram 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: series_continuous 6 | tags: 7 | description: Analyze the value distribution of a single continuous variable by binning it and calculating frequencies in each bin 8 | arguments: 9 | column: 10 | type: column 11 | description: numeric column to use to generate the histogram 12 | filters: 13 | type: filter_list 14 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
15 | is_optional: true 16 | num_buckets: 17 | type: value 18 | is_optional: true 19 | description: max number of buckets to create; defaults to 200 20 | example_code: | 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.histogram(column='SALESAMOUNT') 24 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/funnel/funnel.yaml: -------------------------------------------------------------------------------- 1 | name: funnel 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: funnel 6 | tags: 7 | - insight 8 | - visualization 9 | description: Creates a funnel visualization-ready dataset from numeric columns (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) representing a hierarchy with summed incidence rates 10 | arguments: 11 | stage_columns: 12 | type: column_list 13 | description: List of columns to include in the funnel dataset, in order of hierarchy from highest stage to lowest stage (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.funnel(stage_columns=["TOTAL_IMPRESSIONS", "TOTAL_EMAILS_SENT", "TOTAL_WEBTRAFFIC_USERS", "TOTAL_LEADS_CREATED", "TOTAL_DEALS_CLOSED"]) 18 | ds2.preview() 19 | -------------------------------------------------------------------------------- /docs/label_encode.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # label_encode 4 | 5 | Encode target labels with value between 0 and n_classes-1. See scikit-learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder) for full documentation. 
6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ------ | ------ | --------------------------- | ----------- | 12 | | column | column | Column name to label encode | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.label_encode(column='WEATHER_DESCRIPTION') 21 | ds2.preview() 22 | 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /docs/order.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # order 4 | 5 | Order a dataset by specified columns, in a specified order 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------- | ----------------- | -------------------------------------------------------------------------------------- | ----------- | 11 | | order_by | column_value_dict | dict where the keys are column names and the values are the order_method (ASC or DESC) | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.order(order_by={'DS_WEATHER_ICON':'ASC', 'DS_DAILY_HIGH_TEMP':'DESC'}) 20 | ds2.preview() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/order/order.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /docs/describe.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # describe 4 | 5 | Describes the dataset using a consistent set of metrics, based on data type. 
6 | Numeric: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MEAN, STD_DEV, MIN, _25_PERCENTILE, _50_PERCENTILE, _75_PERCENTILE, MAX 7 | Other: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MIN, MAX 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ---- | ---- | ------------------------------------------ | ----------- | 14 | | none | none | this transform does not take any arguments | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds.describe().to_df() 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/describe/snowflake/describe.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /docs/sample_class.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sample_class 4 | 5 | Sample n rows for each value of a column 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ---------- | ------ | -------------------------------------------------------- | ----------- | 11 | | sample_col | column | The column for which you want to sample | | 12 | | sample | dict | Value of column as a key, n rows to be sampled as values | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.sample_class(sample_col='BINARY_TARGET_COLUMNNAME', sample={'1':15000, '0':60000}) 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql: -------------------------------------------------------------------------------- 
1 | {%- set names_types_list = get_columns(source_table) -%} 2 | 3 | {%- set column_list = [] -%} 4 | 5 | {%- for key, value in names_types_list.items() -%} 6 | {% if (value|upper == 'NUMBER' or 'FLOAT' in value|upper or 'INT' in value|upper) %} 7 | {%- do column_list.append(key) -%} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | WITH source_sampled as ( 12 | SELECT * from {{ source_table }} 13 | {% if rows_to_sample is defined %} SAMPLE ({{ rows_to_sample }} ROWS) {% endif -%} 14 | ) 15 | 16 | SELECT * FROM ( 17 | {%- for combo in itertools.product(column_list, repeat=2) -%} 18 | SELECT '{{ combo[0] }}' as COLUMN_A, 19 | '{{ combo[1] }}' as COLUMN_B, 20 | CORR({{ combo[0] }}, {{ combo[1] }}) as Correlation 21 | FROM source_sampled 22 | {% if not loop.last %} UNION {% endif -%} 23 | {%- endfor -%} 24 | ) 25 | ORDER BY COLUMN_A, COLUMN_B -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/timeseries_agg/timeseries_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for offset in offsets -%} 3 | {% set normalized_offset = -offset %} 4 | {% for col, aggs in aggregations.items() -%} 5 | {% for agg in aggs %} 6 | ,( 7 | SELECT {{ agg }}({{ col }}) 8 | FROM {{ source_table }} i 9 | WHERE 10 | {% if normalized_offset > 0 -%} 11 | i.{{ date }} BETWEEN o.{{ date }} AND (o.{{ date }} + INTERVAL {{ normalized_offset }} {{ date_part }}) 12 | {% else -%} 13 | i.{{ date }} BETWEEN (o.{{ date }} - INTERVAL {{ normalized_offset|abs }} {{ date_part }}) AND o.{{ date }} 14 | {%- endif -%} 15 | {%- for g in group_by %} 16 | AND o.{{ g }} = i.{{ g }} 17 | {% endfor -%} 18 | ) AS {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} 19 | {%- endfor -%} 20 | {%- endfor %} 21 | {% endfor %} 22 | FROM {{ source_table }} o --------------------------------------------------------------------------------
/rasgotransforms/rasgotransforms/transforms/dropna/dropna.yaml: -------------------------------------------------------------------------------- 1 | name: dropna 2 | tags: 3 | - row 4 | - data_cleaning 5 | - conditional 6 | description: Remove missing values 7 | arguments: 8 | how: 9 | type: value 10 | description: Method to determine if record is removed, 'any' removes each record with at least one missing value, 'all' removes records only when all values are missing (default = 'any'). 11 | is_optional: true 12 | subset: 13 | type: column_list 14 | description: List of columns to check for missing values. All columns are checked if not defined. 15 | is_optional: true 16 | thresh: 17 | type: int 18 | description: (Optional) Acts like all, but only requires this number of values to be null to remove a record instead of all. 19 | is_optional: true 20 | example_code: | 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.dropna(how='all', subset=['ORDERS', 'SALES']) 24 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/extract_sequences/snowflake/extract_sequences.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_{{ column }} AS ( 2 | select * from {{ source_table }} 3 | match_recognize( 4 | partition by {{ group_by | join(', ') }} 5 | order by {{ order_by }} 6 | measures 7 | match_number() as SEQUENCE_NUMBER, 8 | first({{ order_by }}) as SEQUENCE_START_DATE, 9 | last({{ order_by }}) as SEQUENCE_END_DATE, 10 | count(*) as SEQUENCE_LEN, 11 | count(row_decrease.*) as SEQUENCE_DECREASE_CNT, 12 | count(row_increase.*) as SEQUENCE_INCREASE_CNT 13 | one row per match 14 | after match skip to last row_increase 15 | pattern(FOO row_decrease+ row_increase+) 16 | define 17 | row_decrease AS {{ column }} < lag({{ column }}), 18 | row_increase AS {{ column }} > lag({{ column }}) 19 | ) 20 | ) 21 | SELECT * FROM CTE_{{ column }} ORDER BY {{ 
group_by | join(', ') }}, SEQUENCE_NUMBER -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.yaml: -------------------------------------------------------------------------------- 1 | name: moving_avg 2 | tags: 3 | - column 4 | - date_time 5 | - feature_engineering 6 | description: generates moving averages per column and per window size 7 | arguments: 8 | input_columns: 9 | type: column_list 10 | description: names of column(s) you want to moving average 11 | window_sizes: 12 | type: int_list 13 | description: the integer values for window sizes you want to use in your moving average 14 | order_by: 15 | type: column_list 16 | description: columns to order by, typically the date index of the table 17 | partition: 18 | type: column_list 19 | description: columns to partition the moving average by 20 | example_code: | 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.moving_avg(input_columns=['OPEN','CLOSE','HIGH','LOW'], window_sizes=[1,2,3,7], order_by=['DATE', 'TICKER'], partition=['TICKER']) 24 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lag/bigquery/lag.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | {%- for amount in amounts -%} 8 | {%- if amount < 0 -%} 9 | {{ raise_exception('BigQuery cannot use negative values for a lag function.
Please utilize lead for forward looking windows.') }} 10 | {%- endif -%} 11 | {%- endfor -%} 12 | SELECT *, 13 | {%- for col in columns -%} 14 | {%- for amount in amounts %} 15 | lag({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as Lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 16 | {%- endfor -%} 17 | {{ ", " if not loop.last else "" }} 18 | {%- endfor %} 19 | from {{ source_table }} 20 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/bigquery/lead.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | {%- for amount in amounts -%} 8 | {%- if amount < 0 -%} 9 | {{ raise_exception('BigQuery cannot use negative values for a lead function. 
Please utilize lag for backwards looking windows.') }} 10 | {%- endif -%} 11 | {%- endfor -%} 12 | SELECT *, 13 | {%- for col in columns -%} 14 | {%- for amount in amounts %} 15 | lead({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 16 | {%- endfor -%} 17 | {{ ", " if not loop.last else "" }} 18 | {%- endfor %} 19 | from {{ source_table }} 20 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/timeseries_agg/bigquery/timeseries_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for offset in offsets -%} 3 | {% set normalized_offset = -offset %} 4 | {% for col, aggs in aggregations.items() -%} 5 | {% for agg in aggs %} 6 | ,( 7 | SELECT {{ agg }}({{ col }}) 8 | FROM {{ source_table }} i 9 | WHERE 10 | {% if normalized_offset > 0 -%} 11 | i.{{ date }} BETWEEN o.{{ date }} AND DATE_ADD(o.{{ date }}, INTERVAL {{ normalized_offset }} {{ date_part }}) 12 | {% else -%} 13 | i.{{ date }} BETWEEN DATE_SUB(o.{{ date }}, INTERVAL {{ normalized_offset|abs }} {{ date_part }}) AND o.{{ date }} 14 | {%- endif -%} 15 | {%- for g in group_by %} 16 | AND o.{{ g }} = i.{{ g }} 17 | {% endfor -%} 18 | ) AS {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} 19 | {%- endfor -%} 20 | {%- endfor %} 21 | {% endfor %} 22 | FROM {{ source_table }} o -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql: -------------------------------------------------------------------------------- 1 | {% if include_cols and exclude_cols is defined %} 2 | {{ raise_exception('You cannot pass both an include_cols list and an exclude_cols list') }} 3 | {% else %} 4 | 5 | {%- if exclude_cols is defined -%} 6 | {%- set 
source_col_names = get_columns(source_table) -%} 7 | 8 | {# Upper exclude cols to ensure case insensitive name matching #} 9 | {%- set exclude_cols = (exclude_cols|join(',')|upper).split(',') -%} 10 | {% set include_cols = [] -%} 11 | {% for column_name in source_col_names -%} 12 | {% if column_name.upper() not in exclude_cols -%} 13 | {% do include_cols.append(column_name) -%} 14 | {% endif -%} 15 | {% endfor -%} 16 | {%- endif -%} 17 | 18 | {%- if include_cols is defined -%} 19 | SELECT 20 | {%- for col in include_cols %} 21 | {{col}}{{ ", " if not loop.last else " " }} 22 | {%- endfor %} 23 | FROM {{source_table}} 24 | {%- endif -%} 25 | 26 | {%- endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/union/union.yaml: -------------------------------------------------------------------------------- 1 | name: union 2 | tags: 3 | - table 4 | - reshape 5 | - join 6 | description: Performs a SQL UNION or UNION ALL for the parent dataset, and another dataset. Operation will only merge columns with matching columns names in both datasets and drop all other columns. Column data type validation does not happen. 7 | arguments: 8 | dataset2: 9 | type: table 10 | description: Dataset to Union/Union All with main dataset 11 | keep_dupes: 12 | type: boolean 13 | description: | 14 | Set to True to perform a UNION ALL between the two tables, which keeps rows that are duplicated. 15 | Set to False to eliminate duplicate rows.
16 | is_optional: true 17 | example_code: | 18 | d1 = rasgo.get.dataset(dataset_id) 19 | d2 = rasgo.get.dataset(dataset_id_2) 20 | 21 | ds2 = d1.transform.union( 22 | dataset2=d2, 23 | keep_dupes=True 24 | ) 25 | 26 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_SEQUENCES AS ( 2 | SELECT 3 | T.*, 4 | ROW_NUMBER() OVER (PARTITION BY {%- for group_item in group_by %} {{ group_item }},{%- endfor -%} {{ value_col }} ORDER BY {{ order_col }}) AS RN_R97_B42_O, 5 | ROW_NUMBER() OVER (ORDER BY {%- for group_item in group_by %} {{ group_item }},{%- endfor -%} {{ order_col }}) AS RN_R97_B42_E 6 | FROM 7 | {{ source_table }} T 8 | ) 9 | SELECT 10 | {%- for group_item in group_by %} S.{{ group_item }},{%- endfor -%} 11 | S.{{ value_col }} as REPEATED_VALUE, 12 | MIN(S.{{ order_col }}) AS FLATLINE_START_DATE, 13 | MAX(S.{{ order_col }}) AS FLATLINE_END_DATE, 14 | COUNT(*) AS OCCURRENCE_COUNT 15 | FROM 16 | CTE_SEQUENCES S 17 | GROUP BY 18 | {%- for group_item in group_by %} S.{{ group_item }},{%- endfor -%} 19 | S.{{ value_col }}, 20 | S.RN_R97_B42_E - S.RN_R97_B42_O 21 | HAVING COUNT(*) > {{ min_repeat_count }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/to_date/to_date.yaml: -------------------------------------------------------------------------------- 1 | name: to_date 2 | tags: 3 | - column 4 | - data_cleaning 5 | - date_time 6 | description: | 7 | Creates a column of a date/timestamp type from a string or other non-date column. 8 | 9 | See [this Snowflake doc](https://docs.snowflake.com/en/user-guide/date-time-input-output.html#about-the-format-specifiers-in-this-section) for information about valid formats. 
10 | arguments: 11 | dates: 12 | type: column_value_dict 13 | description: dict where the values are the date columns and the keys are the date formats to use for the conversion 14 | overwrite_columns: 15 | type: boolean 16 | description: "Optional: if true, the output columns will overwrite the input columns" 17 | is_optional: true 18 | example_code: | 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.to_date(dates={ 22 | 'DATE_STRING':'YYYY-MM-DD', 23 | 'DATE2_STR':'YYYY-DD-MM' 24 | }) 25 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/replace_string/replace_string.sql: -------------------------------------------------------------------------------- 1 | {% if position is not defined %} 2 | {% set position = 1 %} 3 | {% else %} 4 | {% set use_regex = True %} 5 | {% endif %} 6 | 7 | {% if occurrence is not defined %} 8 | {% set occurrence = 0 %} 9 | {% else %} 10 | {% set use_regex = True %} 11 | {% endif %} 12 | 13 | {% if parameters is not defined %} 14 | {% set parameters = 'c' %} 15 | {% else %} 16 | {% set use_regex = True %} 17 | {% endif %} 18 | 19 | {% if use_regex %} 20 | SELECT *, 21 | REGEXP_REPLACE({{ source_col }}, '{{ pattern }}', '{{ replacement }}', {{ position }}, {{ occurrence }}, '{{ parameters }}') AS {{cleanse_name(alias) if alias is defined else "REPLACE_" + source_col}} 22 | FROM {{ source_table }} 23 | {% else %} 24 | SELECT *, 25 | REPLACE({{ source_col }}, '{{ pattern }}', '{{ replacement }}') AS {{cleanse_name(alias) if alias is defined else "REPLACE_" + source_col}} 26 | FROM {{ source_table }} 27 | {% endif %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/bigquery/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | {%- if date_part|lower == 
'weekiso' %} 4 | EXTRACT(ISOWEEK FROM {{ target_col }}) AS {{ target_col }}_ISOWEEK {{ ", " if not loop.last else "" }} 5 | {%- elif date_part|lower == 'dayofweekiso' %} 6 | MOD(EXTRACT(DAYOFWEEK FROM {{ target_col }}) + 5, 7) + 1 AS {{ target_col }}_ISODAYOFWEEK {{ ", " if not loop.last else "" }} 7 | {%- elif date_part|lower == 'yearofweekiso' %} 8 | EXTRACT(ISOYEAR FROM {{ target_col }}) AS {{ target_col }}_ISOYEAR {{ ", " if not loop.last else "" }} 9 | {%- elif date_part|lower == 'yearofweek' %} 10 | EXTRACT(YEAR FROM {{ target_col }}) AS {{ target_col }}_YEAR {{ ", " if not loop.last else "" }} 11 | {%- else %} 12 | EXTRACT({{ date_part }} FROM {{ target_col }}) AS {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} 13 | {%- endif %} 14 | {%- endfor %} 15 | FROM {{ source_table }} 16 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rank/rank.sql: -------------------------------------------------------------------------------- 1 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', rank_columns)|join(',') if overwrite_columns else "*" -%} 2 | 3 | {%- set alias = alias if alias is defined else cleanse_name('RANK_' + '_'.join(rank_columns)) -%} 4 | 5 | SELECT {{ untouched_cols }}, 6 | {%- if rank_type == 'dense' %} 7 | DENSE_RANK() OVER( 8 | {% elif rank_type == 'percent' %} 9 | PERCENT_RANK() OVER( 10 | {% elif rank_type == 'unique' %} 11 | ROW_NUMBER() OVER( 12 | {%- else -%} 13 | RANK() OVER( 14 | {% endif %} 15 | {% if partition_by -%} 16 | PARTITION BY {% for col in partition_by -%}{{col}}{{ ", " if not loop.last else " " }}{%- endfor %} 17 | {% endif -%} 18 | ORDER BY {% for col in rank_columns -%}{{col}}{% if order %} {{ order }}{% endif %}{{ ", " if not loop.last else " " }}{%- endfor %} 19 | ) AS {{ alias }} 20 | FROM {{ source_table }} 21 | {% if qualify_filter %}QUALIFY {{ alias }} {{ qualify_filter }}{% endif %} 22 | 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unions/unions.yaml: -------------------------------------------------------------------------------- 1 | name: union 2 | tags: 3 | - table 4 | - reshape 5 | - join 6 | description: | 7 | Union one or multiple tables with the base table. 8 | Looks at all columns in each table and finds columns in common across all of them to keep in the final table. 9 | arguments: 10 | union_tables: 11 | type: table_list 12 | description: tables to union with the base table 13 | remove_duplicates: 14 | type: boolean 15 | description: | 16 | Defaults to False. 17 | Set to True to use UNION, which removes duplicate rows. 18 | Set to False to use UNION ALL, which keeps rows that are duplicated. 19 | is_optional: true 20 | example_code: | 21 | d1 = rasgo.get.dataset(dataset_id) 22 | d2 = rasgo.get.dataset(dataset_id_2) 23 | d3 = rasgo.get.dataset(dataset_id_3) 24 | 25 | union_ds = d1.unions( 26 | union_tables=[d2.fqtn, d3.fqtn], 27 | remove_duplicates=True 28 | ) 29 | 30 | union_ds.preview() -------------------------------------------------------------------------------- /docs/rename.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # rename 4 | 5 | Rename columns by passing a renames dict. 6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ------- | ----------------- | -------------------------------------------------------------------------------------- | ----------- | 12 | | renames | column_value_dict | A dict representing each existing column to be renamed and its corresponding new name.
| | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(dataset_id) 19 | ds2 = ds.rename(renames={ 20 | 'DS_WEATHER_ICON': 'Weather', 21 | 'DS_DAILY_HIGH_TEMP': 'High_Temp', 22 | 'DS_DAILY_LOW_TEMP': 'Low_Temp' 23 | }) 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.yaml: -------------------------------------------------------------------------------- 1 | name: heatmap 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: heatmap_continuous 6 | tags: 7 | description: Generate an x / y heatmap, which uses the number of rows in each x/y bin as a density overlay to a 2-d histogram 8 | arguments: 9 | x_axis: 10 | type: column 11 | description: numeric column to use as the x axis 12 | y_axis: 13 | type: column 14 | description: numeric column to use as the y axis 15 | filters: 16 | type: filter_list 17 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
18 | is_optional: true 19 | num_buckets: 20 | type: value 21 | is_optional: true 22 | description: max number of buckets to create; defaults to 100 23 | example_code: | 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds2 = ds.heatmap(x_axis='TEMPERATURE', 27 | y_axis='PRECIPITATION') 28 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.yaml: -------------------------------------------------------------------------------- 1 | name: drop_columns 2 | tags: 3 | - column 4 | - data_cleaning 5 | description: | 6 | Drop columns by passing either an include_cols list of columns to include or an exclude_cols list of columns to exclude. 7 | 8 | Passing both include_cols and exclude_cols will result in an error. 9 | 10 | arguments: 11 | include_cols: 12 | type: column_list 13 | description: A list of the columns from the dataset you want to keep. 14 | is_optional: true 15 | exclude_cols: 16 | type: column_list 17 | description: A list of the columns from the dataset you want to drop. Any columns not in the exclude_cols list will be kept. 18 | is_optional: true 19 | example_code: | 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2a = ds.drop_columns(include_cols=["DS_WEATHER_ICON", "DS_DAILY_HIGH_TEMP"]) 23 | ds2a.preview() 24 | 25 | ds2b = ds.drop_columns(exclude_cols=["DS_CLOUD_COVER", "DS_TOTAL_RAINFALL"]) 26 | ds2b.preview() 27 | -------------------------------------------------------------------------------- /docs/sankey.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sankey 4 | 5 | Analyze the hierarchical record count of a series of columns by counting the number of records in each pair of values in hierarchically adjacent columns. The columns fed to this transformation should be categorical labels to be counted.
6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ----- | ----------- | ----------------------------------------------------------------------- | ----------- | 11 | | stage | column_list | Ordered list of categorial columns, from highest in hierarchy to lowest | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.sankey(stage=["ENGLISHCOUNTRYREGIONNAME", "STATEPROVINCENAME", "CITY"]) 20 | ds2.preview() 21 | 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.yaml: -------------------------------------------------------------------------------- 1 | name: remove_duplicates 2 | tags: 3 | - table 4 | - data_quality 5 | - data_cleaning 6 | description: Deduplicate a table based on a passed-in composite key. Once an order column and an order method are selected, only the top record from the resulting grouped and ordered dataset will be kept. 7 | arguments: 8 | natural_key: 9 | type: column_list 10 | description: Columns forming the grain at which to remove duplicates 11 | order_col: 12 | type: column_list 13 | description: Columns by which to order the result set, such that the first result is kept 14 | order_method: 15 | type: sort_direction 16 | description: Sets the order behavior for the chosen `order_col`. Can be ASC or DESC. 
17 | example_code: | 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.remove_duplicates( 21 | natural_key=["FIPS", "DS_WEATHER_ICON", "DATE"], 22 | order_col=["DATE", "FIPS"], 23 | order_method="asc" 24 | ) 25 | ds2.preview() 26 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize/summarize.sql: -------------------------------------------------------------------------------- 1 | {% from 'filter.sql' import get_filter_statement %} 2 | 3 | WITH filtered as ( 4 | SELECT * 5 | FROM {{ source_table }} 6 | {%- if filters is defined %} 7 | where true AND 8 | {{ get_filter_statement(filters) | indent }} 9 | {%- endif %} 10 | ) 11 | , 12 | summarized as ( 13 | SELECT 14 | {%- if group_by is defined %} 15 | {{ group_by | join(', ') }}, 16 | {%- endif %} 17 | {%- for column, aggs in summarize.items() %} 18 | {%- set oloop = loop %} 19 | {%- for aggregation_type in aggs %} 20 | {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} 21 | {%- endfor %} 22 | {%- endfor %} 23 | FROM filtered 24 | {%- if group_by is defined %} 25 | GROUP BY {{ group_by | join(', ') }} 26 | {%- endif %} 27 | ) 28 | SELECT * 29 | FROM summarized -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/vlookup/vlookup.sql: -------------------------------------------------------------------------------- 1 | {%- macro table_from_fqtn(fqtn) -%} 2 | {{ fqtn.split('.')[-1] }} 3 | {%- endmacro -%} 4 | 5 | {# Get all Columns in Source Table #} 6 | {%- set source_col_names = get_columns(source_table) -%} 7 | 8 | {# Get relevant Columns and Table Name in Lookup Table #} 9 | {%- if keep_columns is defined -%} 10 | {%- set lookup_table_cols = keep_columns -%} 11 | {%- else 
-%} 12 | {%- set lookup_table_cols = get_columns(lookup_table) -%} 13 | {%- endif -%} 14 | {%- set lookup_table_name = table_from_fqtn(lookup_table) -%} 15 | 16 | 17 | SELECT base.*, 18 | {%- for column in lookup_table_cols %} 19 | {%- if column in source_col_names -%} 20 | lookupt.{{ column }} as {{ lookup_table_name }}_{{ column }}{{ ', ' if not loop.last }} 21 | {%- else -%} 22 | {{ column }}{{ ', ' if not loop.last }} 23 | {%- endif -%} 24 | {%- endfor %} 25 | FROM {{ source_table }} base 26 | LEFT OUTER JOIN {{ lookup_table }} lookupt 27 | on base.{{ lookup_column }} = lookupt.{{ lookup_column }} 28 | -------------------------------------------------------------------------------- /docs/entropy.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # entropy 4 | 5 | Entropy is a way to calculate the amount of "disorder" in a non-numeric column. Lower entropy indicates less disorder, while higher entropy indicates more. 6 | 7 | The calculation for Shannon's entropy is: H = -Sum[ P(xi) * log2( P(xi)) ] 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | -------- | ----------- | ----------------------------------------------------- | ----------- | 14 | | group_by | column_list | Columns to group by | | 15 | | columns | column_list | Columns to calculate entropy on. Must be non-numeric. 
| | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.entropy(group_by=['FIPS'], columns=['NAME', 'ADDRESS']) 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/entropy/entropy.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.yaml: -------------------------------------------------------------------------------- 1 | name: unpivot 2 | tags: 3 | - table 4 | - reshape 5 | description: Performs a UNPIVOT operation, rotating a table by transforming columns into rows 6 | arguments: 7 | value_column: 8 | type: string 9 | description: The name to assign to the generated column that will be populated with the values from the columns in the column list 10 | name_column: 11 | type: string 12 | description: The name to assign to the generated column that will be populated with the names of the columns in the column list 13 | column_list: 14 | type: column_list 15 | description: List of columns in the source table that will be narrowed into a single pivot column. The column names will populate name_column, and the column values will populate value_column. 
16 | example_code: | 17 | internet_sales = rasgo.get.dataset(74) 18 | 19 | ds2 = internet_sales.unpivot( 20 | value_column="SALES_FEES", 21 | name_column="PRODUCT", 22 | column_list=["TAXAMT", "FREIGHT"] 23 | ) 24 | 25 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/lead.yaml: -------------------------------------------------------------------------------- 1 | name: lead 2 | tags: 3 | - column 4 | - date_time 5 | - feature_engineering 6 | description: Lead shifts your features on a partition index, creating a look-forward feature offset by an amount. Lead supports generating multiple leads in one transform by generating each unique combination of columns and amounts from your inputs. 7 | arguments: 8 | columns: 9 | type: column_list 10 | description: names of column(s) you want to lead 11 | amounts: 12 | type: int_list 13 | description: Magnitude of amounts you want to use for the lead. 14 | partition: 15 | type: column_list 16 | description: name of column(s) to partition by for the lead 17 | is_optional: true 18 | order_by: 19 | type: column_list 20 | description: name of column(s) to order by in the final data set 21 | is_optional: true 22 | example_code: | 23 | ds = rasgo.get.dataset(id) 24 | 25 | ds2 = ds.lead(columns=['OPEN', 'CLOSE'], amounts=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 26 | ds2.preview() 27 | -------------------------------------------------------------------------------- /docs/target_encode.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # target_encode 4 | 5 | Encode a categorical column with the average value of a target column for the corresponding value of the categorical column. 6 | 7 | See scikit-learn's [TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html) for full documentation. 
8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ------ | ------ | ----------------------------------------------- | ----------- | 14 | | column | column | Column name to target encode | | 15 | | target | column | Numeric target column to use to create averages | | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.target_encode(column='WEATHER_DESCRIPTION', target='DAILY_HIGH_TEMP') 24 | ds2.preview() 25 | 26 | ``` 27 | 28 | ## Source Code 29 | 30 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql" %} 31 | 32 | -------------------------------------------------------------------------------- /docs/apply.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # apply 4 | 5 | A transform that accepts a custom template to execute. Must use the sql template argument `source_table` to reference the Rasgo dataset which will serve as the base of any SELECT 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ---- | ------ | ------------------------------------------ | ----------- | 11 | | sql | custom | The custom SQL transform template to apply | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.apply( 20 | sql='SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = I17' 21 | ) 22 | ds2.preview() 23 | 24 | # passing in custom arguments 25 | ds = rasgo.get.dataset(id) 26 | 27 | ds2 = ds.apply( 28 | sql="SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = '{{ my_value }}'", 29 | my_value="I17" 30 | ) 31 | ds2.preview() 32 | ``` 33 | 34 | ## Source Code 35 | 36 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/apply/apply.sql" %} 37 | 38 | 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.yaml: -------------------------------------------------------------------------------- 1 | name: market_basket 2 | tags: 3 | - table 4 | - modeling 5 | - reshape 6 | description: | 7 | Analyze historical transaction contents to understand products that are frequently purchased together. 8 | 9 | This approach uses a transactional table to aggregate each product purchased in a transaction, and then aggregates transactions together to look for common patterns. 10 | arguments: 11 | transaction_id: 12 | type: column 13 | description: Column identifying a unique event ID (i.e., transaction) for which to aggregate line items 14 | sep: 15 | type: value 16 | description: Text separator to use when aggregating the strings, i.e. ', ' or '|'. 17 | agg_column: 18 | type: column 19 | description: Product ID or description to use when aggregating into transactions 20 | example_code: | 21 | sales = rasgo.get.dataset(id) 22 | 23 | ds2 = sales.market_basket(transaction_id='SALESORDERNUMBER', 24 | agg_column='ENGLISHPRODUCTNAME', 25 | sep='|') 26 | ds2.preview() -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy-generic: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v3 20 | with: 21 | python-version: '3.7' 22 | 23 | - name: Install 
dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install --upgrade setuptools build twine 27 | pip install -r rasgotransforms/requirements.txt 28 | 29 | 30 | - name: Build and publish 31 | env: 32 | TWINE_USERNAME: __token__ 33 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 34 | run: | 35 | cd rasgotransforms 36 | python -m build 37 | twine upload dist/* 38 | -------------------------------------------------------------------------------- /docs/filter.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # filter 4 | 5 | Filter the dataset. Supports two types of filters: 6 | 1. Comparison filters, which compare the values in a column with a value 7 | 2. Advanced filters, which support full SQL strings for custom filtering logic 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ----- | ----------- | ----------------------------------------- | ----------- | 14 | | items | filter_list | list of dictionaries representing filters | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(74) 21 | 22 | # comma separated list of 'WHERE' clauses 23 | ds2 = ds.filter(items=['PRODUCTKEY < 500']) 24 | ds2.preview() 25 | 26 | # full filtering with a column, operator, and comparison value 27 | ds3 = ds.filter(items=[{'column_name':'PRODUCTKEY', 'operator':'>', 'comparison_value':'101'}]) 28 | ds3.preview() 29 | ``` 30 | 31 | ## Source Code 32 | 33 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/filter/filter.sql" %} 34 | 35 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/datediff.yaml: -------------------------------------------------------------------------------- 1 | name: datediff 2 | tags: 3 | - column 4 | - date_time 5 | description: | 6 | Calculates the difference between two date, time, or 
timestamp expressions based on the date or time part requested. 7 | Difference is calculated as date_1 - date_2. 8 | arguments: 9 | date_part: 10 | type: date_part 11 | description: | 12 | Must be one of the values listed in [Supported Date and Time Parts](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts) 13 | date_1: 14 | type: mixed_value 15 | description: Starting date. Can be a date column, date, time, or timestamp. 16 | date_2: 17 | type: mixed_value 18 | description: Date to subtract from date_1. Can be a date column, date, time, or timestamp. 19 | alias: 20 | type: value 21 | is_optional: true 22 | description: Name for the new column created by the datediff. 23 | example_code: | 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds2 = ds.datediff(date_part='year', date_1='END_DATE', date_2="'2022-01-01'") 27 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.yaml: -------------------------------------------------------------------------------- 1 | name: train_test_split 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: | 6 | Label rows as part of the train or test set based off of percentage split you want to apply to the data. 7 | 8 | If you want a row-wise random sample applied, do not pass an order_by column. If you want an ordered split, then pass the order_by column. 9 | arguments: 10 | order_by: 11 | type: column_list 12 | description: Optional argument that affects the train/test split method applied. if needed, pass the names of column(s) you want to order by when applying the split. 13 | is_optional: true 14 | train_percent: 15 | type: int 16 | description: Percent of the data you want in the train set, expressed as a decimal (i.e. .8). The rest of the rows will be included in the test set. 
17 | example_code: | 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.train_test_split(order_by = ['DATE'], 21 | train_percent = 0.8) 22 | ds2.preview() 23 | 24 | ds2b = ds.train_test_split(train_percent = 0.8) 25 | ds2b.preview() -------------------------------------------------------------------------------- /docs/one_hot_encode.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # one_hot_encode 4 | 5 | One hot encode a column. Create a null value flag for the column if any of the values are NULL. 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------ | -------------------- | ------------------------------------------------------------------------------------------- | ----------- | 11 | | column | column_or_expression | Column name to one-hot encode. Supports a calculated field via a valid SQL function. | | 12 | | list_of_vals | string_list | optional argument to override the dynamic lookup of all values in the target one-hot column | True | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.one_hot_encode(column='WEATHER_DESCRIPTION') 21 | ds2.preview() 22 | 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/ratio_with_shrinkage.sql: -------------------------------------------------------------------------------- 1 | {# the strange __var__ names are meant to prevent collisions #} 2 | 3 | {%- set source_col_names = get_columns(source_table) -%} 4 | WITH CTE_AGG AS ( 5 | SELECT 6 | *, 7 | {{ numerator }} / {{ denom }} as RAW__PCT 8 | FROM 9 | {{ source_table }} 10 | ), 11 | CTE_FILTER AS ( 12 | SELECT 13 | * 14 | FROM 15 | CTE_AGG 
16 | WHERE 17 | {{ denom }} >= {{ min_cutoff }} 18 | ), 19 | CTE_STATS AS ( 20 | SELECT 21 | AVG(RAW__PCT) AS __U__, 22 | VARIANCE_SAMP(RAW__PCT) AS __V__ 23 | FROM 24 | CTE_FILTER 25 | ), 26 | CTE_JOINED AS ( 27 | SELECT 28 | * 29 | FROM CTE_AGG 30 | CROSS JOIN CTE_STATS 31 | ), 32 | CTE_COEF AS ( 33 | SELECT 34 | *, 35 | __U__ * ( 36 | __U__ * (1 - __U__)/ __V__ - 1 37 | ) AS __ALPHA__, 38 | __ALPHA__ * (1 - __U__)/ __U__ AS __BETA__ 39 | FROM 40 | CTE_JOINED 41 | ) 42 | SELECT 43 | {{ source_col_names | join(', ') }}, 44 | RAW__PCT, 45 | ({{ numerator }} + __ALPHA__) / ({{ denom }} + __ALPHA__ + __BETA__) AS ADJ__PCT 46 | FROM 47 | CTE_COEF -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/dtypes.py: -------------------------------------------------------------------------------- 1 | DTYPES = { 2 | "smallint": "smallint", 3 | "bigint": "bigint", 4 | "int": "int", 5 | "integer": "integer", 6 | "tinyint": "tinyint", 7 | "byteint": "byteint", 8 | "float": "float", 9 | "float4": "float4", 10 | "float8": "float8", 11 | "float64": "float64", 12 | "decimal": "decimal", 13 | "numeric": "numeric", 14 | "number": "number", 15 | "real": "real", 16 | "double": "double", 17 | "string": "string", 18 | "text": "text", 19 | "varchar": "varchar", 20 | "char": "char", 21 | "character": "character", 22 | "date": "date", 23 | "datetime": "datetime", 24 | "time": "time", 25 | "timestamp": "timestamp", 26 | "timestamp_ltz": "timestamp_ltz", 27 | "timestamp_ntz": "timestamp_ntz", 28 | "timestamp_tz": "timestamp_tz", 29 | "binary": "binary", 30 | "varbinary": "varbinary", 31 | "boolean": "boolean", 32 | "bool": "bool", 33 | "variant": "variant", 34 | "object": "object", 35 | "array": "array", 36 | # Aliases 37 | "double precision": "double", 38 | "doubleprecision": "double", 39 | } 40 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lag/lag.yaml: 
-------------------------------------------------------------------------------- 1 | name: lag 2 | tags: 3 | - column 4 | - date_time 5 | - feature_engineering 6 | description: Lag shifts your features on a partition index, creating a lookback feature offset by an amount. Lag supports generating multiple lags in one transform by generating each unique combination of columns and amounts from your inputs. 7 | arguments: 8 | columns: 9 | type: column_list 10 | description: names of column(s) you want to lag 11 | amounts: 12 | type: int_list 13 | description: Magnitude of amounts you want to use for the lag. Positive values result in a historical offset; negative amounts result in forward-looking offset. 14 | partition: 15 | type: column_list 16 | description: name of column(s) to partition by for the lag 17 | is_optional: true 18 | order_by: 19 | type: column_list 20 | description: name of column(s) to order by in the final data set 21 | is_optional: true 22 | example_code: | 23 | ds = rasgo.get.dataset(id) 24 | 25 | ds2 = ds.lag(columns=['OPEN', 'CLOSE'], amounts=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 26 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql: -------------------------------------------------------------------------------- 1 | {%- set run_query_error_message -%} 2 | This transform depends on dynamic values to work, but no Data Warehouse connection is available. 
3 | Instead, please use the `list_of_vals` argument to provide these values explicitly 4 | {%- endset -%} 5 | 6 | {%- if list_of_vals is not defined -%} 7 | {%- set results = run_query("SELECT DISTINCT " + column + " FROM " + source_table) -%} 8 | {%- if results is none -%} 9 | {{ raise_exception(run_query_error_message) }} 10 | {%- endif -%} 11 | {%- set distinct_col_vals = results[column].to_list() -%} 12 | {%- else -%} 13 | {%- set distinct_col_vals = list_of_vals -%} 14 | {%- endif -%} 15 | 16 | SELECT *, 17 | {% for val in distinct_col_vals %} 18 | {%- if val is not none %} 19 | CASE WHEN {{ column }} = {{ "'" ~ val ~ "'"}} THEN 1 ELSE 0 END as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} 20 | {%- else %} 21 | CASE WHEN {{ column }} IS NULL THEN 1 ELSE 0 END as {{ column }}_IS_NULL{{ ', ' if not loop.last else '' }} 22 | {%- endif -%} 23 | {% endfor %} 24 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/snowflake/ratio_with_shrinkage.sql: -------------------------------------------------------------------------------- 1 | {# the strange __var__ names are meant to prevent collisions #} 2 | 3 | {%- set source_col_names = get_columns(source_table) -%} 4 | WITH CTE_AGG AS ( 5 | SELECT 6 | *, 7 | {{ numerator }} / {{ denom }} as RAW__PCT 8 | FROM 9 | {{ source_table }} 10 | ), 11 | CTE_FILTER AS ( 12 | SELECT 13 | * 14 | FROM 15 | CTE_AGG 16 | WHERE 17 | {{ denom }} >= {{ min_cutoff }} 18 | ), 19 | CTE_STATS AS ( 20 | SELECT 21 | AVG(RAW__PCT) AS __U__, 22 | VARIANCE_SAMP(RAW__PCT) AS __V__ 23 | FROM 24 | CTE_FILTER 25 | ), 26 | CTE_JOINED AS ( 27 | SELECT 28 | * 29 | FROM CTE_AGG 30 | CROSS JOIN CTE_STATS 31 | ), 32 | CTE_COEF AS ( 33 | SELECT 34 | *, 35 | __U__ * ( 36 | __U__ * (1 - __U__)/ __V__ - 1 37 | ) AS __ALPHA__, 38 | __ALPHA__ * (1 - __U__)/ __U__ AS __BETA__ 39 | FROM 40 | CTE_JOINED 41 | ) 42 | SELECT 
43 | {{ source_col_names | join(', ') }}, 44 | RAW__PCT, 45 | ({{ numerator }} + __ALPHA__) / ({{ denom }} + __ALPHA__ + __BETA__) AS ADJ__PCT 46 | FROM 47 | CTE_COEF -------------------------------------------------------------------------------- /docs/new_columns.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # new_columns 4 | 5 | ## Build new columns, using SQL formulas. 6 | 7 | ### Required Inputs 8 | - Calculated Column: the formula for the new column you want to build 9 | 10 | ### Optional Inputs 11 | - Alias: name for your columns 12 | 13 | ### Notes 14 | - Supports any SQL column functions that are compatible with your data warehouse 15 | 16 | 17 | ## Parameters 18 | 19 | | Name | Type | Description | Is Optional | 20 | | ------------------ | ---------------------- | -------------------------------------------- | ----------- | 21 | | calculated_columns | calculated_column_list | List of SQL formulas to generate new columns | | 22 | 23 | 24 | ## Example 25 | 26 | ```python 27 | ds2 = ds.new_columns( 28 | calculated_columns={ 29 | calcuated_column: 'POWER(COLUMN_NAME, 3)', 30 | alias: 'COLUMN_NAME_Cubed' 31 | } 32 | ) 33 | ds2.preview() 34 | ``` 35 | 36 | ## Source Code 37 | 38 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/new_columns/new_columns.sql" %} 39 | 40 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.yaml: -------------------------------------------------------------------------------- 1 | name: dateadd 2 | description: Increments a date by the specified interval value. 3 | tags: 4 | - column 5 | - date_time 6 | arguments: 7 | date_part: 8 | type: date_part 9 | description: | 10 | A valid SQL date part. 
11 | Must be one of the values listed in [Supported Date and Time Parts](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts) 12 | date: 13 | type: mixed_value 14 | description: Date value to increment. Can be a column or literal of these types (date, datetime, time, or timestamp). 15 | offset: 16 | type: int 17 | description: Numeric value to increment the date by. 18 | alias: 19 | type: string 20 | description: Name of output column 21 | is_optional: true 22 | overwrite_columns: 23 | type: boolean 24 | description: "Optional: if true, the output column will replace the existing 'date' column" 25 | is_optional: true 26 | example_code: | 27 | ds = rasgo.get.dataset(id) 28 | 29 | ds2 = ds.dateadd(date_part='year', date='END_DATE', offset=3, alias='THREE_YEARS_FUTURE') 30 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.py: -------------------------------------------------------------------------------- 1 | DATE_PARTS = [ 2 | 'year', 3 | 'month', 4 | 'day', 5 | 'dayofweek', 6 | 'dayofweekiso', 7 | 'dayofyear', 8 | 'week', 9 | 'weekiso', 10 | 'quarter', 11 | 'yearofweek', 12 | 'yearofweekiso', 13 | ] 14 | TIME_PARTS = [ 15 | 'hour', 16 | 'minute', 17 | 'second', 18 | 'millisecond', 19 | 'nanosecond', 20 | 'epoch_second', 21 | 'epoch_millisecond', 22 | 'epoch_microsecond', 23 | 'epoch_nanosecond', 24 | 'timezone_hour', 25 | 'timezone_minute', 26 | ] 27 | 28 | 29 | def infer_columns(args, source_columns) -> dict: 30 | if args['date'] in source_columns: 31 | output_type = source_columns[args['date']] 32 | else: 33 | output_type = 'date' 34 | if 'overwrite_columns' in args and args['overwrite_columns']: 35 | source_columns[args['date'].upper()] = output_type 36 | elif 'alias' in args and args['alias']: 37 | source_columns[args['alias']] = output_type 38 | else: 39 | 
source_columns[f"{args['date']}_add{args['offset']}{args['date_part']}".upper()] = output_type 40 | return source_columns 41 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample/sample.sql: -------------------------------------------------------------------------------- 1 | {%- if num_rows|float < 1 -%} 2 | {%- set sample_amount = num_rows*100 |float -%} 3 | {% else %} 4 | {%- set sample_amount = num_rows~' ROWS' -%} 5 | {% endif %} 6 | 7 | {% if filters is defined %} 8 | WITH filtered AS ( 9 | SELECT * FROM {{source_table}} 10 | {% for filter_block in filters %} 11 | {%- set oloop = loop -%} 12 | {{ 'WHERE ' if oloop.first else ' AND ' }} 13 | {%- if filter_block is not mapping -%} 14 | {{ filter_block }} 15 | {%- else -%} 16 | {%- if filter_block['operator'] == 'CONTAINS' -%} 17 | {{ filter_block['operator'] }}({{ filter_block['column_name'] }}, {{ filter_block['comparison_value'] }}) 18 | {%- else -%} 19 | {{ filter_block['column_name'] }} {{ filter_block['operator'] }} {{ filter_block['comparison_value'] }} 20 | {%- endif -%} 21 | {%- endif -%} 22 | {%- endfor -%} 23 | 24 | ) 25 | SELECT * FROM filtered 26 | TABLESAMPLE BERNOULLI ( {{ sample_amount }} ) 27 | {% else %} 28 | SELECT * FROM {{source_table}} 29 | TABLESAMPLE BERNOULLI ( {{ sample_amount }} ) 30 | {% endif %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql: -------------------------------------------------------------------------------- 1 | {%- if subset is not defined -%} 2 | {%- set subset = get_columns(source_table) -%} 3 | {%- set source_col_names = subset -%} 4 | {%- endif -%} 5 | 6 | {%- if how is not defined -%} 7 | {%- set how = "any" -%} 8 | {%- endif -%} 9 | 10 | {%- if how == "any" and thresh is not defined -%} 11 | select * from {{ source_table }} 12 | {%- for col in subset %} 13 | {{ 'where' if loop.first else ' 
and' }} {{ col }} is not null 14 | {%- endfor -%} 15 | 16 | {%- else -%} 17 | {%- if thresh is not defined -%} 18 | {%- set thresh = subset|length -%} 19 | {%- endif -%} 20 | {%- if source_col_names is not defined -%} 21 | {%- set source_col_names = get_columns(source_table) -%} 22 | {%- endif -%} 23 | with not_null as ( 24 | select *, 25 | {%- for col in subset %} 26 | cast({{ col }} is null as int) {{ "+ " if not loop.last else " " }} 27 | {%- endfor %} 28 | as NUM_IS_NA 29 | from {{ source_table }} 30 | where NUM_IS_NA < {{ thresh }} 31 | ) select 32 | {% for col in source_col_names -%} 33 | {{ col }}{{ ", " if not loop.last else " " }} 34 | {%- endfor %} 35 | from not_null 36 | {%- endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/linear_regression/linear_regression.yaml: -------------------------------------------------------------------------------- 1 | name: linear_regression 2 | tags: 3 | - table 4 | - modeling 5 | description: | 6 | Fit a simple linear regression and return the formula. Optionally, use one or more group_by columns to create a regression per unique grouping. 7 | 8 | Currently, only supports a single independent variable. 9 | arguments: 10 | group_by: 11 | type: column_list 12 | is_optional: true 13 | description: Columns to group by before building the linear regression model. 
Use this field to create multiple models (one per unique grouping) 14 | y: 15 | type: column 16 | description: Dependent variable for the linear regression 17 | x: 18 | type: column 19 | description: Independent variable for the linear regression 20 | example_code: | 21 | internet_sales = rasgo.get.dataset(74) 22 | 23 | ds1 = internet_sales.aggregate( 24 | group_by=['PRODUCTKEY','CUSTOMERKEY'], 25 | aggregations={'SALESAMOUNT':['AVG'], 26 | 'TOTALPRODUCTCOST':['AVG']}) 27 | 28 | ds2 = ds1.linear_regression( 29 | x = 'SALESAMOUNT_AVG', 30 | y = 'TOTALPRODUCTCOST_AVG') 31 | 32 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/if_then/if_then.yaml: -------------------------------------------------------------------------------- 1 | name: if_then 2 | tags: 3 | - column 4 | - conditional 5 | - data_cleaning 6 | - natural_language_processing 7 | description: | 8 | This function creates a new column based on the conditions provided in the `conditions` argument. 9 | 10 | Output values should be of the same type, since they are constructing one new column. 11 | 12 | A default value for the new column should be set, as should the output column name. 13 | arguments: 14 | conditions: 15 | type: conditional_list 16 | description: A nested list. In each inner list the first element would be the condition to check, and the second the value with which to fill. 17 | default: 18 | type: mixed_value 19 | description: The default value with which to fill the new column. Please enclose fixed strings in quotes inside of the argument (e.g., below) 20 | alias: 21 | type: string 22 | description: The name of the output column in the new dataset. 
23 | example_code: | 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds2 = ds.if_then(conditions=[["DS_WEATHER_ICON like '%cloudy%'", 1]], 27 | default=2, 28 | alias="CLOUDY_WEATHER_FLAG") 29 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/min_max_scaler/min_max_scaler.sql: -------------------------------------------------------------------------------- 1 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', columns_to_scale)|join(',') if overwrite_columns else "*" -%} 2 | 3 | {%- if minimums is not defined -%} 4 | with min_max_vals as ( 5 | select 6 | {%- for column in columns_to_scale %} 7 | min({{column}}) as min_{{column}}, 8 | max({{column}}) as max_{{column}}{{ "," if not loop.last else "" }} 9 | {%- endfor %} 10 | from {{source_table}} 11 | ) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, 12 | {%- for column in columns_to_scale %} 13 | ({{column}} - min_{{column}}) / (max_{{column}} - min_{{column}}) as {{column if overwrite_columns else column + "_MIN_MAX_SCALED"}}{{ ", " if not loop.last else "" }} 14 | {%- endfor %} 15 | from min_max_vals, {{source_table}} 16 | 17 | {%- else -%} 18 | select {{ untouched_cols }}, 19 | {%- for column in columns_to_scale %} 20 | ({{column}} - {{minimums[loop.index0]}}) / ({{maximums[loop.index0]}} - {{minimums[loop.index0]}}) as {{column if overwrite_columns else column + "_MIN_MAX_SCALED"}}{{ ", " if not loop.last else "" }} 21 | {%- endfor %} 22 | from {{source_table}} 23 | {%- endif -%} -------------------------------------------------------------------------------- /docs/accelerators/website_page_performance.md: -------------------------------------------------------------------------------- 1 | # Google Analytics Web Page Performance 2 | 3 | The Web Page Performance analysis uses Google Analytics data, including bounce rate, time on page, number of visits, and total users to create a 
custom metric that ranks the performance of pages on your site. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------- | ----------- | 9 | | google_analytics_traffic_table | dataset | Google Analytics traffic table | | 10 | | lookback_window | string | This template will create metrics for a timewindow within "x" days of the current date. This is the lookback value. | | 11 | 12 | 13 | ## Source Code 14 | 15 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/website_page_performance.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/standard_scaler/standard_scaler.sql: -------------------------------------------------------------------------------- 1 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', columns_to_scale)|join(',') if overwrite_columns else "*" -%} 2 | 3 | {%- if averages is not defined or standarddevs is not defined -%} 4 | with avg_stddev_vals as ( 5 | select 6 | {%- for column in columns_to_scale %} 7 | avg({{column}}) as avg_{{column}}, 8 | stddev({{column}}) as stddev_{{column}}{{ ", " if not loop.last else "" }} 9 | {%- endfor %} 10 | from {{source_table}} 11 | ) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, 12 | {%- for column in columns_to_scale %} 13 | ({{column}} - avg_{{column}}) / (stddev_{{column}}) as {{column if overwrite_columns else column + "_STANDARD_SCALED"}}{{ ", " if not loop.last else "" }} 14 | {%- endfor %} 15 | from avg_stddev_vals, {{source_table}} 16 | 17 | {%- else -%} 18 | select {{ untouched_cols }}, 19 | {%- for column in columns_to_scale %} 20 | ({{column}} - {{averages[loop.index0]}}) / ({{standarddevs[loop.index0]}}) as {{column if 
overwrite_columns else column + "_STANDARD_SCALED"}}{{ ", " if not loop.last else "" }} 21 | {%- endfor %} 22 | from {{source_table}} 23 | {%- endif -%} -------------------------------------------------------------------------------- /docs/datepart.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # datepart 4 | 5 | Extracts a specific part of a date column. For example, if the input is '2021-01-01', you can ask for the year and get back 2021. 6 | 7 | An exhaustive list of valid date parts can be [found here](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts). 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ----- | ------------- | ----------------------------------------------------------------------------------------------------- | ----------- | 14 | | dates | datepart_dict | dict where keys are names of columns you want to date part and values are the desired date part grain | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.datepart(dates={ 23 | 'DATE_STRING':'year', 24 | 'DATE2_STR':'month' 25 | }) 26 | ds2.preview() 27 | 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/replace_missing/replace_missing.yaml: -------------------------------------------------------------------------------- 1 | name: replace_missing 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | description: Replace missing values in column/columns with the mean, median, mode, or a value 7 | arguments: 8 | replacements: 9 | type: imputation_dict 10 | description: Dictionary with keys as column names to replace 
missing values for, and dictionary values the replacement strategy ('mean', 'median', 'mode') or a literal replacement value
11 | arguments: 12 | rules: 13 | type: value_list 14 | description: List of filter rules to use 15 | agg_column: 16 | type: column 17 | description: Column to aggregate 18 | agg: 19 | type: agg 20 | description: Method to use when aggregating the agg_column 21 | distinct: 22 | type: boolean 23 | description: When aggregating the agg_column, use TRUE to qualify with a DISTINCT 24 | example_code: | 25 | customer = rasgo.get.dataset(55) 26 | 27 | rules = [ 28 | "FIRSTNAME LIKE 'J%'", 29 | "BIRTHDATE < '1970-01-01'", 30 | "ENGLISHEDUCATION = 'Bachelors'", 31 | "MARITALSTATUS = 'M'", 32 | "GENDER='F'"] 33 | 34 | ds2 = customer.conditional_agg(rules=rules, 35 | agg_column='CUSTOMERKEY', 36 | agg='COUNT', 37 | distinct=True) 38 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize/summarize.yaml: -------------------------------------------------------------------------------- 1 | name: summarize 2 | tags: 3 | - query 4 | - default 5 | description: | 6 | Filter and then aggregate columns in a table 7 | 8 | The filter is applied first to the table. If no filters are included, then the full table is selected. 9 | Next, the table is aggregated. 10 | 11 | arguments: 12 | filters: 13 | type: filter_list 14 | description: Remove rows using filter logic on one or more columns 15 | is_optional: true 16 | summarize: 17 | type: column_agg_list 18 | description: Columns to summarize 19 | is_optional: false 20 | group_by: 21 | type: column_list 22 | description: One or more columns to group by A categorical column by which to pivot the calculated metrics. Including this argument will generate a new metric calculation for each distinct value in the group by column. If this column has more than 20 distinct values, the plot will not generate. 
23 | is_optional: false 24 | 25 | example_code: | 26 | internet_sales = rasgo.get.dataset(74) 27 | 28 | ds1 = internet_sales.query( 29 | summarize={ 30 | 'SALESAMOUNT': ['COUNT', 'SUM'], 31 | 'CUSTOMERKEY': ['COUNT'] 32 | }, 33 | group_by = ['PRODUCTKEY']) 34 | 35 | ds1.preview() -------------------------------------------------------------------------------- /docs/cast.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # cast 4 | 5 | Cast selected columns to a new type 6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ----------------- | --------------- | -------------------------------------------------------------------------------------------------------------- | ----------- | 12 | | casts | cast_value_dict | A dict where the keys are columns and the values are the new type to cast them to. | | 13 | | overwrite_columns | boolean | to overwrite column names with the new casted column, use 'true'. otherwise, use 'false'. defaults to 'false'. 
| True | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds_casted = ds.cast( 22 | casts={ 23 | 'DS_WEATHER_ICON':'INT', 24 | 'DS_DAILY_HIGH_TEMP':'STRING', 25 | 'DS_DAILY_LOW_TEMP':'INT' 26 | }, 27 | overwrite_columns=True 28 | ) 29 | 30 | ds_casted.preview() 31 | 32 | ``` 33 | 34 | ## Source Code 35 | 36 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/cast/cast.sql" %} 37 | 38 | -------------------------------------------------------------------------------- /docs/latest.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # latest 4 | 5 | Impute missing values in ALL columns with the latest value seen in rows prior 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------- | ----------- | -------------------------------------------------------------------------------------------------------- | ----------- | 11 | | group_by | column_list | List of columns to perform the imputation "within" | | 12 | | order_by | column_list | List of columns to sort ascending, in order to find the last known value for imputation | | 13 | | nulls | string | Pass either 'ignore' or 'respect' to determine whether nulls should be ignored or not during imputation. 
| | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.latest( 22 | group_by=['FIPS'], 23 | order_by=['DATE'], 24 | nulls='ignore') 25 | 26 | ds2.preview() 27 | ``` 28 | 29 | ## Source Code 30 | 31 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/latest/latest.sql" %} 32 | 33 | -------------------------------------------------------------------------------- /docs/text_to_sql.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # text_to_sql 4 | 5 | ## Text to SQL, powered by OpenAI. 6 | ### Required Inputs 7 | - Text: a prompt describing the SQL query that you want OpenAI to generate for you. Add as much context as possible to help OpenAI generate a useful query. Avoid using relative date terms like "last year" because OpenAI doesn't have any knowledge past 2021. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ---- | ----------- | ------------------------------------------------------------------------------------------------------------ | ----------- | 14 | | text | string-long | Text description of the query you want to generate. 
Example: total revenue for the Southwest region in 2021 | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(fqtn='DB.SCHEMA.IOWA_LIQUOR_SALES') 21 | 22 | ds2 = ds.text_to_sql( 23 | text='total bottles sold in Des Moines last year' 24 | ) 25 | ds2.sql() 26 | ds2.preview() 27 | 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/text_to_sql/text_to_sql.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /.github/workflows/docs_generation.yaml: -------------------------------------------------------------------------------- 1 | # Auto generate Transform Docs from YAML File on each Push 2 | name: Transform Docs Generation 3 | 4 | on: push 5 | 6 | jobs: 7 | generate-docs: 8 | runs-on: ubuntu-latest 9 | defaults: 10 | run: 11 | shell: bash 12 | 13 | container: 14 | image: "python:3.7" 15 | 16 | env: 17 | PYTHONPATH: /__w/RasgoTransforms/RasgoTransforms 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | 22 | - name: Access Transform Directory 23 | run: | 24 | git config --global --add safe.directory /__w/RasgoTransforms/RasgoTransforms 25 | 26 | - name: Install Python Requirments 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r python/requirements.txt 30 | 31 | - name: Generate Transform Docs 32 | run: python python/doc_generator.py 33 | 34 | - name: Git Commit Generated Transform Docs 35 | run: | 36 | if [[ `git status --porcelain` ]]; then 37 | git add -A 38 | git config user.name GitHub 39 | git config user.email noreply@github.com 40 | echo commiting 41 | git commit -m 'Added Auto Generated Transform docs' 42 | git push 43 | fi 44 | -------------------------------------------------------------------------------- /docs/lead.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # lead 4 | 5 | Lead shifts your features 
on a partition index, creating a look-forward feature offset by an amount. Lead supports generating multiple leads in one transform by generating each unique combination of columns and amounts from your inputs. 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | --------- | ----------- | --------------------------------------------------- | ----------- | 11 | | columns | column_list | names of column(s) you want to lead | | 12 | | amounts | int_list | Magnitude of amounts you want to use for the lead. | | 13 | | partition | column_list | name of column(s) to partition by for the lead | True | 14 | | order_by | column_list | name of column(s) to order by in the final data set | True | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.lead(columns=['OPEN', 'CLOSE'], amounts=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 23 | ds2.preview() 24 | 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/lead/lead.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /docs/sample.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sample 4 | 5 | Take a sample of a dataset using a specific number of rows or a probability that each row will be selected 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | num_rows | value | To sample using a probability of selecting each row, your num_rows should be a decimal less than 1. Otherwise, pass an integer value for number of rows to keep. | | 12 | | filters | filter_list | Filter logic on one or more columns. 
Can choose between a simple comparison filter or advanced filter using free text. | True | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.sample(num_rows=1000) 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/sample/sample.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /docs/moving_avg.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # moving_avg 4 | 5 | generates moving averages per column and per window size 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------- | ----------- | -------------------------------------------------------------------------- | ----------- | 11 | | input_columns | column_list | names of column(s) you want to moving average | | 12 | | window_sizes | int_list | the integer values for window sizes you want to use in your moving average | | 13 | | order_by | column_list | columns to order by, typically the date index of the table | | 14 | | partition | column_list | columns to partition the moving average by | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.moving_avg(input_columns=['OPEN','CLOSE','HIGH','LOW'], window_sizes=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 23 | ds2.preview() 24 | ``` 25 | 26 | ## Source Code 27 | 28 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql" %} 29 | 30 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/clean/clean.yaml: -------------------------------------------------------------------------------- 1 | name: clean 2 | tags: 3 | - column 4 | - 
data_cleaning 5 | - data_quality 6 | description: Cast data types, rename or drop columns, impute missing values, and filter values in a dataset 7 | arguments: 8 | columns: 9 | type: clean_dict 10 | description: "Dictionary with keys as column names to clean, values are all optional: type - the 11 | dtype to cast the values to, name - the new name for a column, impute - an imputation strategy or value for replacing 12 | null values ('mean', 'median', 'mode', ), filter - a filter statement to filter the output table, drop - 13 | drops column from the output if true" 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.clean( 18 | columns={ 19 | 'GLD_ADJUSTED_CLOSE': { 20 | 'type': 'FLOAT', 21 | 'name': 'GLD', 22 | 'impute': 'mean', 23 | 'filter': "> 100", 24 | }, 25 | 'GLTR_ADJUSTED_CLOSE': { 26 | 'type': 'FLOAT', 27 | 'name': 'GLTR', 28 | 'impute': 'min', 29 | 'filter': "> 10", 30 | }, 31 | 'DATE': { 32 | 'type': 'string' 33 | } 34 | } 35 | ) 36 | 37 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.yaml: -------------------------------------------------------------------------------- 1 | name: summarize_flatlines 2 | tags: 3 | - table 4 | - reshape 5 | - data_quality 6 | description: | 7 | Given a dataset, searches finds "flatline" sequences of a repeated values that do not change. 8 | 9 | Choose a value column, a column to be used for ordering (such as a date), and a minimum cutoff for the number of repeated occurrences to consider. 10 | 11 | The result is a summarized table. 12 | arguments: 13 | group_by: 14 | type: column_list 15 | description: The column(s) used to partition you data into groups. 16 | Flatlines (repeated values) will be searched within each group 17 | value_col: 18 | type: column 19 | description: The column for which to search for flatlines. 
20 | order_col: 21 | type: column 22 | description: The column used to order the rows within groups. 23 | min_repeat_count: 24 | type: int 25 | description: The minimum length of a sequence of repeated values to consider 26 | 27 | example_code: | 28 | ds = rasgo.get.dataset() 29 | 30 | test = ds.apply(group_by=['TICKER','SYMBOL'], 31 | value_col='CLOSE', 32 | order_col='DATE', 33 | min_repeat_count=1 34 | ) 35 | 36 | test.preview() -------------------------------------------------------------------------------- /docs/accelerators/plg.md: -------------------------------------------------------------------------------- 1 | # Sales Growth Funnel 2 | 3 | The sales growth funnel tracks users through a common Software as a Service (SaaS) sales funnel: from awareness via marketing at top of funnel, to product user, to qualified lead for an enterprise sales motion, to closed won. The data sources necessary to generate this accelerator are Google Analytics, Heap, and Salesforce. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------- | ------- | --------------------------------------------- | ----------- | 9 | | contact_table | dataset | Salesforce contacts table | | 10 | | opportunity_table | dataset | Salesforce opportunities table | | 11 | | account_table | dataset | Salesforce accounts table | | 12 | | lead_table | dataset | Salesforce leads table | | 13 | | daily_traffic_table | dataset | Google Analytics daily traffic overview table | | 14 | | heap_users_table | dataset | Heap Users Table | | 15 | 16 | 17 | ## Source Code 18 | 19 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/plg.yml" %} -------------------------------------------------------------------------------- /docs/aggregate.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # aggregate 4 | 5 | Groups rows by the group_by items applying 
aggregations functions for the resulting group and selected columns 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------ | ----------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | group_by | column_list | Columns to group by | | 12 | | aggregations | agg_dict | Aggregations to apply for other columns. Dict keys are column names, and values are a list of aggegations to apply for that column. | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.aggregate(group_by=['FIPS'], aggregations={ 21 | 'COL_1': ['SUM', 'AVG'], 22 | 'COL_2': ['SUM', 'AVG'] 23 | }) 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/aggregate/snowflake/aggregate.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /docs/histogram.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # histogram 4 | 5 | Analyze the value distribution of a single continuous variable by binning it and calculating frequencies in each bin 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ----------- | ----------- | ---------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | column | column | numeric column to use to generate the histogram | | 12 | | filters | filter_list | Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
| True | 13 | | num_buckets | value | max number of buckets to create; defaults to 200 | True | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.histogram(column='SALESAMOUNT') 22 | ds2.preview() 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/histogram/histogram.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /docs/accelerators/sales_growth_funnel.md: -------------------------------------------------------------------------------- 1 | # Sales Growth Funnel 2 | 3 | The sales growth funnel tracks users through a common Software as a Service (SaaS) sales funnel: from awareness via marketing at top of funnel, to product user, to qualified lead for an enterprise sales motion, to closed won. The data sources necessary to generate this accelerator are Google Analytics, Heap, and Salesforce. 
4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------- | ------- | --------------------------------------------- | ----------- | 9 | | contact_table | dataset | Salesforce contacts table | | 10 | | opportunity_table | dataset | Salesforce opportunities table | | 11 | | account_table | dataset | Salesforce accounts table | | 12 | | lead_table | dataset | Salesforce leads table | | 13 | | daily_traffic_table | dataset | Google Analytics daily traffic overview table | | 14 | | heap_users_table | dataset | Heap Users Table | | 15 | 16 | 17 | ## Source Code 18 | 19 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/sales_growth_funnel.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/reshape/bigquery/reshape.sql: -------------------------------------------------------------------------------- 1 | {% if method|lower == 'pivot' -%} 2 | {%- set distinct_val_query -%} 3 | select distinct {{ columns }} 4 | from {{ source_table }} 5 | limit 1000 6 | {%- endset -%} 7 | 8 | {%- if list_of_vals is not defined -%} 9 | {%- set results = run_query(distinct_val_query) -%} 10 | {%- set distinct_vals = results[results.columns[0]].to_list() -%} 11 | {%- else -%} 12 | {%- set distinct_vals = list_of_vals -%} 13 | {%- endif -%} 14 | 15 | SELECT * FROM ( 16 | SELECT 17 | {%- for dimension in dimensions %} 18 | {{ dimension }}, 19 | {%- endfor %} 20 | {{ values }}, 21 | {{ columns }} 22 | FROM {{ source_table }} 23 | ) 24 | PIVOT ( 25 | {{ agg_method }} ( {{ values }} ) as _ 26 | FOR {{ columns }} IN ( 27 | {%- for val in distinct_vals %} 28 | {%- if val is string -%} 29 | '{{ val }}' 30 | {%- else -%} 31 | {{ val }} 32 | {%- endif -%} 33 | {{', ' if not loop.last else ''}} 34 | {%- endfor -%} 35 | ) 36 | ) 37 | {%- else -%} 38 | SELECT * FROM {{ source_table }} 39 | UNPIVOT( {{ value_column 
}} for {{ name_column }} in ( {{ columns | join(', ')}} )) 40 | {%- endif -%} -------------------------------------------------------------------------------- /docs/to_date.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # to_date 4 | 5 | Creates a column of a date/timestamp type from a string or other non-date column. 6 | 7 | See [this Snowflake doc](https://docs.snowflake.com/en/user-guide/date-time-input-output.html#about-the-format-specifiers-in-this-section) for information about valid formats. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ----------------- | ----------------- | ------------------------------------------------------------------------------------------------------ | ----------- | 14 | | dates | column_value_dict | dict where the values are the date columns and the keys are the date formats to use for the conversion | | 15 | | overwrite_columns | boolean | Optional: if true, the output columns will overwrite the input columns | True | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.to_date(dates={ 24 | 'DATE_STRING':'YYYY-MM-DD', 25 | 'DATE2_STR':'YYYY-DD-MM' 26 | }) 27 | ds2.preview() 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/to_date/to_date.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /docs/accelerators/omni_channel_performance.md: -------------------------------------------------------------------------------- 1 | # Omni-channel performance 2 | 3 | Omni-channel performance tracks leads through a traditional sales funnel: from awareness via marketing at top of funnel, to marketing qualified lead, to sales qualified lead, and finally to closed as a won opportunity. 
The data sources necessary to generate this accelerator are Google Analytics, Hubspot, and Salesforce. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------- | ------- | --------------------------------------------- | ----------- | 9 | | contact_table | dataset | Salesforce contacts table | | 10 | | opportunity_table | dataset | Salesforce opportunities table | | 11 | | account_table | dataset | Salesforce accounts table | | 12 | | lead_table | dataset | Salesforce leads table | | 13 | | daily_traffic_table | dataset | Google Analytics daily traffic overview table | | 14 | | email_event_table | dataset | Hubspot email event table | | 15 | 16 | 17 | ## Source Code 18 | 19 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/omni_channel_performance.yml" %} -------------------------------------------------------------------------------- /docs/funnel.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # funnel 4 | 5 | Creates a funnel visualization-ready dataset from numeric columns (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) representing a hierarchy with summed incidence rates 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | stage_columns | column_list | List of columns to include in the funnel dataset, in order of hierarchy from highest stage to lowest stage (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.funnel(stage_columns=["TOTAL_IMPRESSIONS", "TOTAL_EMAILS_SENT", 
"TOTAL_WEBTRAFFIC_USERS", "TOTAL_LEADS_CREATED", "TOTAL_DEALS_CLOSED"]) 20 | ds2.preview() 21 | 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /docs/remove_duplicates.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # remove_duplicates 4 | 5 | Deduplicate a table based on a passed-in composite key. Once an order column and an order method are selected, only the top record from the resulting grouped and ordered dataset will be kept. 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------ | -------------- | ---------------------------------------------------------------------------- | ----------- | 11 | | natural_key | column_list | Columns forming the grain at which to remove duplicates | | 12 | | order_col | column_list | Columns by which to order the result set, such that the first result is kept | | 13 | | order_method | sort_direction | Sets the order behavior for the chosen `order_col`. Can be ASC or DESC. 
| | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.remove_duplicates( 22 | natural_key=["FIPS", "DS_WEATHER_ICON", "DATE"], 23 | order_col=["DATE", "FIPS"], 24 | order_method="asc" 25 | ) 26 | ds2.preview() 27 | 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /docs/drop_columns.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # drop_columns 4 | 5 | Drop columns by passing either an include_cols list of columns to include or an exclude_cols list of columns to exclude. 6 | 7 | Passing both include_cols and exclude_cols will result in an error. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ------------ | ----------- | --------------------------------------------------------------------------------------------------------------- | ----------- | 14 | | include_cols | column_list | A list of the columns from the dataset you want to keep. | True | 15 | | exclude_cols | column_list | A list of the columns from the dataset you want to drop. Any columns not in the exclude_cols list will be kept. 
| True | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2a = ds.drop_columns(include_cols=["DS_WEATHER_ICON", "DS_DAILY_HIGH_TEMP"]) 24 | ds2a.preview() 25 | 26 | ds2b = ds.drop_columns(exclude_cols=["DS_CLOUD_COVER", "DS_TOTAL_RAINFALL"]) 27 | ds2b.preview() 28 | 29 | ``` 30 | 31 | ## Source Code 32 | 33 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql" %} 34 | 35 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_RANK AS ( 2 | SELECT *, ROW_NUMBER() OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ASC) AS RANK_{{ order_col }} 3 | FROM {{ source_table }} 4 | ) , 5 | CTE_WINDOW AS ( 6 | SELECT A.{{ partition_col }}, A.RANK_{{ order_col }}, 7 | ARRAY_AGG(ARRAY_CONSTRUCT(B.{{ value_col }}, B.RANK_{{ order_col }})) ARRAY_AGG_OBJ 8 | FROM CTE_RANK A 9 | JOIN CTE_RANK B 10 | ON A.{{ partition_col }}=B.{{ partition_col }} 11 | AND A.RANK_{{ order_col }} BETWEEN B.RANK_{{ order_col }} AND B.RANK_{{ order_col }}+{{ window }} 12 | GROUP BY A.{{ partition_col }}, A.RANK_{{ order_col }} 13 | ), 14 | CTE_SLOPE AS 15 | ( 16 | SELECT {{ partition_col }}, RANK_{{ order_col }} 17 | , regr_slope(X.VALUE[0], X.VALUE[1]) AS {{ value_col }}_SLOPE_{{ window }} 18 | FROM CTE_WINDOW, table(flatten(ARRAY_AGG_OBJ)) X 19 | GROUP BY {{ partition_col }}, RANK_{{ order_col }} 20 | ), 21 | CTE_RESULT AS 22 | ( 23 | SELECT A.{{ partition_col }}, A.{{ order_col }}, B.{{ value_col }}_SLOPE_{{ window }} 24 | FROM CTE_RANK A 25 | INNER JOIN CTE_SLOPE B 26 | ON A.{{ partition_col }} = B.{{ partition_col }} 27 | AND A.RANK_{{ order_col }} = B.RANK_{{ order_col }} 28 | ) 29 | SELECT A.*, B.{{ value_col }}_SLOPE_{{ window }} 30 | FROM {{ 
source_table }} A 31 | LEFT OUTER JOIN CTE_RESULT B 32 | ON A.{{ partition_col }} = B.{{ partition_col }} 33 | AND A.{{ order_col }} = B.{{ order_col }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sliding_slope/snowflake/sliding_slope.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_RANK AS ( 2 | SELECT *, ROW_NUMBER() OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ASC) AS RANK_{{ order_col }} 3 | FROM {{ source_table }} 4 | ) , 5 | CTE_WINDOW AS ( 6 | SELECT A.{{ partition_col }}, A.RANK_{{ order_col }}, 7 | ARRAY_AGG(ARRAY_CONSTRUCT(B.{{ value_col }}, B.RANK_{{ order_col }})) ARRAY_AGG_OBJ 8 | FROM CTE_RANK A 9 | JOIN CTE_RANK B 10 | ON A.{{ partition_col }}=B.{{ partition_col }} 11 | AND A.RANK_{{ order_col }} BETWEEN B.RANK_{{ order_col }} AND B.RANK_{{ order_col }}+{{ window }} 12 | GROUP BY A.{{ partition_col }}, A.RANK_{{ order_col }} 13 | ), 14 | CTE_SLOPE AS 15 | ( 16 | SELECT {{ partition_col }}, RANK_{{ order_col }} 17 | , regr_slope(X.VALUE[0], X.VALUE[1]) AS {{ value_col }}_SLOPE_{{ window }} 18 | FROM CTE_WINDOW, table(flatten(ARRAY_AGG_OBJ)) X 19 | GROUP BY {{ partition_col }}, RANK_{{ order_col }} 20 | ), 21 | CTE_RESULT AS 22 | ( 23 | SELECT A.{{ partition_col }}, A.{{ order_col }}, B.{{ value_col }}_SLOPE_{{ window }} 24 | FROM CTE_RANK A 25 | INNER JOIN CTE_SLOPE B 26 | ON A.{{ partition_col }} = B.{{ partition_col }} 27 | AND A.RANK_{{ order_col }} = B.RANK_{{ order_col }} 28 | ) 29 | SELECT A.*, B.{{ value_col }}_SLOPE_{{ window }} 30 | FROM {{ source_table }} A 31 | LEFT OUTER JOIN CTE_RESULT B 32 | ON A.{{ partition_col }} = B.{{ partition_col }} 33 | AND A.{{ order_col }} = B.{{ order_col }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/aggregate_string/aggregate_string.yaml: 
-------------------------------------------------------------------------------- 1 | name: aggregate_string 2 | tags: 3 | - table 4 | - reshape 5 | - aggregate 6 | - natural_language_processing 7 | description: | 8 | Aggregate strings across rows by concatenating them together, and grouping by other columns. 9 | 10 | Uses a text separator to aggregate the string values together, and returns a single column where the rows are the aggregated strings. 11 | arguments: 12 | agg_columns: 13 | type: column_list 14 | description: Columns with string values to aggregate 15 | sep: 16 | type: value 17 | description: Text separator to use when aggregating the strings, i.e. ', '. 18 | group_by: 19 | type: column_list 20 | description: Columns to group by when applying the aggregation. 21 | distinct: 22 | type: boolean 23 | description: If you want to collapse multiple rows of the same string value into a single distinct value, use TRUE. Otherwise, use FALSE. 24 | order: 25 | type: sort_direction 26 | description: ASC or DESC, to set the alphabetical order of the agg_column when aggregating it 27 | example_code: | 28 | product = rasgo.get.dataset(75) 29 | 30 | ds2 = product.aggregate_string(group_by=['PRODUCTLINE'], 31 | agg_columns=['PRODUCTKEY', 'ENGLISHPRODUCTNAME'], 32 | sep=', ', 33 | distinct='FALSE', 34 | order='ASC') 35 | ds2.preview() -------------------------------------------------------------------------------- /docs/union.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # union 4 | 5 | Performs a SQL UNION or UNION ALL for the parent dataset, and another dataset. Operation will only merge columns with matching columns names in both datasets and drop all other columns. Column data type validation does not happen. 
6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | dataset2 | table | Dataset to Union/Union All with main dataset | | 12 | | keep_dupes | boolean | Set to True to performn a UNION ALL between the two tables, which keeps rows that are duplicated. Set to False to eliminate duplicate rows. | True | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | d1 = rasgo.get.dataset(dataset_id) 19 | d2 = rasgo.get.dataset(dataset_id_2) 20 | 21 | ds2 = d1.transform.union( 22 | dataset2=d2, 23 | keep_dupes=True 24 | ) 25 | 26 | ds2.preview() 27 | ``` 28 | 29 | ## Source Code 30 | 31 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/union/union.sql" %} 32 | 33 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/reshape/snowflake/reshape.sql: -------------------------------------------------------------------------------- 1 | {% if method|lower == 'pivot' -%} 2 | {%- set distinct_val_query -%} 3 | select distinct {{ columns }} 4 | from {{ source_table }} 5 | limit 1000 6 | {%- endset -%} 7 | 8 | {%- if list_of_vals is not defined -%} 9 | {%- set results = run_query(distinct_val_query) -%} 10 | {%- set distinct_vals = results[results.columns[0]].to_list() -%} 11 | {%- else -%} 12 | {%- set distinct_vals = list_of_vals -%} 13 | {%- endif -%} 14 | 15 | {# Jinja Macro to get the comma separated cleansed name list #} 16 | {%- macro get_values(distinct_values) -%} 17 | {%- for val in distinct_vals -%} 18 | {{ cleanse_name(val) }}{{ ', ' if not loop.last else '' }} 19 | {%- endfor -%} 20 | {%- endmacro -%} 21 | 22 | 23 | SELECT {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ get_values(distinct_vals) }} 24 | FROM ( 
{#
  cast (BigQuery): cast one or more columns to new SQL types.

  overwrite_columns == true  -> replace each cast column in place, keeping all
                                other source columns untouched.
  overwrite_columns != true  -> keep every original column (SELECT *) and
                                append one new column per cast, named
                                <column>_<type> via cleanse_name().

  Type mapping: 'float' -> BigQuery FLOAT64, 'number' -> NUMERIC; any other
  type string is passed through to CAST() verbatim.

  Fix: removed a stray 'f' that trailed the FLOAT64 branch's loop separator
  ({{ ... }}f), which produced invalid SQL such as "AS MY_COL,f".
#}
{%- if overwrite_columns == true -%}

{%- set source_columns = get_columns(source_table) -%}
{%- set untouched_cols = source_columns | reject('in', casts) -%}

SELECT {% for col in untouched_cols %}{{ col }},{% endfor %}
{%- for target_col, type in casts.items() %}
{%- if type|lower == 'float' %}
CAST({{target_col}} AS FLOAT64) AS {{target_col}}{{", " if not loop.last else ""}}
{%- elif type|lower == 'number' %}
CAST({{target_col}} AS NUMERIC) AS {{target_col}}{{", " if not loop.last else ""}}
{%- else %}
CAST({{target_col}} AS {{type}}) AS {{target_col}}{{", " if not loop.last else ""}}
{%- endif %}
{%- endfor %}
FROM {{ source_table }}

{%- else -%}

SELECT *
{%- for target_col, type in casts.items() %}
{%- if type|lower == 'float' %}
,CAST({{target_col}} AS FLOAT64) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}}
{%- elif type|lower == 'number' %}
,CAST({{target_col}} AS NUMERIC) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}}
{%- else %}
,CAST({{target_col}} AS {{type}}) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}}
{%- endif %}
{%- endfor %}
FROM {{ source_table }}

{%- endif -%}
7 | Given a dataset with a numerator and a denominator, will calculate the raw ratio as numerator / denom, 8 | as well as provide an adjusted ratio that shrinks the ratio towards the observed beta prior. 9 | 10 | This is a simplified version that establishes the priors directly from the data given a min_cutoff count of observations. 11 | 12 | NOTE: your data should already be aggregated before performing this operation. 13 | arguments: 14 | numerator: 15 | type: column 16 | description: | 17 | A column that is pre-aggregated to contain the count of positive cases 18 | denom: 19 | type: column 20 | description: | 21 | A column that is pre-aggregated to contain the count of ALL cases 22 | min_cutoff: 23 | type: int 24 | description: | 25 | Enter a minimum value to limit the denominator when creating the prior estimates. Example: if estimating a batter's hitting percentage, 26 | entering 500 would limit the estimation of the priors to be only for batters with over 500 at-bats. 27 | 28 | example_code: | 29 | ds = rasgo.get.dataset(fqtn="BATTING_AVERAGES") 30 | 31 | ds2 = ds.ratio_with_shrinkage(numerator = 'HITS', 32 | denom = 'AT_BATS', 33 | min_cutoff = 500) 34 | -------------------------------------------------------------------------------- /docs/market_basket.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # market_basket 4 | 5 | Analyze historical transaction contents to understand products that are frequently purchased together. 6 | 7 | This approach uses a transactional table to aggregate each product purchased in a transaction, and then aggregates transactions together to look for common patterns. 
8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | -------------- | ------ | ------------------------------------------------------------------------------------------ | ----------- | 14 | | transaction_id | column | Column identifying a unique event ID (i.e., transaction) for which to aggregate line items | | 15 | | sep | value | Text separator to use when aggregating the strings, i.e. ', ' or '\|'. | | 16 | | agg_column | column | Product ID or description to use when aggregating into transactions | | 17 | 18 | 19 | ## Example 20 | 21 | ```python 22 | sales = rasgo.get.dataset(id) 23 | 24 | ds2 = sales.market_basket(transaction_id='SALESORDERNUMBER', 25 | agg_column='ENGLISHPRODUCTNAME', 26 | sep='|') 27 | ds2.preview() 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /docs/unions.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # union 4 | 5 | Union one or multiple tables with the base table. 6 | Looks at all columns in each table and finds columns in common across all of them to keep in the final table. 7 | 8 | 9 | ## Parameters 10 | 11 | | Name | Type | Description | Is Optional | 12 | | ----------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | ----------- | 13 | | union_tables | table_list | tables to union with the base table | | 14 | | remove_duplicates | boolean | Defaults to False. Set to True to use UNION, which removes duplicate rows. Set to False to use UNION ALL, which keeps rows that are duplicated. 
| True | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | d1 = rasgo.get.dataset(dataset_id) 21 | d2 = rasgo.get.dataset(dataset_id_2) 22 | d3 = rasgo.get.dataset(dataset_id_3) 23 | 24 | union_ds = d1.unions( 25 | union_tables=[d2.fqtn, d3.fqtn] 26 | remove_duplicates=True 27 | ) 28 | 29 | union_ds.preview() 30 | ``` 31 | 32 | ## Source Code 33 | 34 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/unions/unions.sql" %} 35 | 36 | -------------------------------------------------------------------------------- /rasgotransforms/DESCRIPTION.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | # Rasgo Transforms 6 | 7 | Rasgo Transforms provide jinja SQL templates that can be applied to your data using rasgoQL, a pandas-like python package. 8 | - Transforms are equivalent to SQL functions that accept a table or view from your eixsting DataWarehouse and return a SQL string to transform it 9 | - Rasgo has built a starter library of transforms for you to use or fork 10 | - *Coming Soon:* Users will be able to create their own Transforms and add them to a private namespace or contribute to the open-source library 11 | 12 | ## Running Transforms 13 | 14 | Rasgo Transforms can be applied via: 15 | - your Rasgo Feature Store account ([pyrasgo](https://pypi.org/project/pyrasgo/) - Account required) 16 | - the Rasgo open-source package ([rasgoql](https://pypi.org/project/rasgoql/) - totally free). 17 | 18 | ## Package Dependencies 19 | - pyyaml 20 | 21 | 22 | # About Us 23 | Rasgo Transforms are maintained by *[Rasgo](https://rasgoml.com)*. Rasgo's enterprise feature store integrates with your data warehouse to help users build features faster, collaborate with team members, and serve features to models in production. 24 | 25 | 26 | Built for Data Scientists, by Data Scientists 27 | -------------------------------------------------------------------------------- /docs/dropna.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # dropna 4 | 5 | Remove missing values 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------ | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------- | 11 | | how | value | Method to determine if record is removed, 'any' removes each record with at least one missing value, 'all' removes records only when all values are missing (default = 'any'). 
| True | 12 | | subset | column_list | List of columns to check for missing values. All columns are checked if not defined. | True | 13 | | thresh | int | (Optional) Acts like all, but only requires this number of values to be null to remove a record instead of all. | True | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.dropna(how='all', subset=['ORDERS', 'SALES']) 22 | ds2.preview() 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.py: -------------------------------------------------------------------------------- 1 | NUMERIC_TYPES = [ 2 | 'int', 3 | 'integer', 4 | 'bigint', 5 | 'smallint', 6 | 'number', 7 | 'numeric', 8 | 'float', 9 | 'float4', 10 | 'float8', 11 | 'decimal', 12 | 'double precision', 13 | 'real', 14 | ] 15 | 16 | 17 | def infer_columns(args, source_columns) -> dict: 18 | args = args.copy() 19 | out_cols = {} 20 | for col in args['group_by']: 21 | out_cols[col] = source_columns[col.upper()] 22 | if 'numeric columns' in args['aggregations'].keys(): 23 | for column, column_type in source_columns.items(): 24 | if column not in args['aggregations'].keys() and column_type.lower() in NUMERIC_TYPES: 25 | args['aggregations'].setdefault(column, []).extend(args['aggregations']['numeric columns']) 26 | args['aggregations'].pop('numeric columns') 27 | if 'nonnumeric columns' in args['aggregations'].keys(): 28 | for column, column_type in source_columns.items(): 29 | if column not in args['aggregations'].keys() and column_type.lower() not in NUMERIC_TYPES: 30 | args['aggregations'].setdefault(column, []).extend(args['aggregations']['nonnumeric columns']) 31 | args['aggregations'].pop('nonnumeric columns') 32 | for col in 
args['aggregations'].keys(): 33 | for agg in args['aggregations'][col]: 34 | agg = agg.replace(' ', '') 35 | out_cols[f'{col}_{agg}'] = 'NUMERIC' 36 | return out_cols 37 | -------------------------------------------------------------------------------- /docs/heatmap.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # heatmap 4 | 5 | Generate an x / y heatmap, which uses the number of rows in each x/y bin as a density overlay to a 2-d histogram 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ----------- | ----------- | ---------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | x_axis | column | numeric column to use as the x axis | | 12 | | y_axis | column | numeric column to use as the y axis | | 13 | | filters | filter_list | Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
| True | 14 | | num_buckets | value | max number of buckets to create; defaults to 100 | True | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.heatmap(x_axis='TEMPERATURE', 23 | y_axis='PRECIPITATION') 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rsi/rsi.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_LAG1 AS ( 2 | SELECT *, 3 | lag({{ value_col }}, 1) over (partition by {{ partition_col }} order by {{ order_col }}) as LAG_{{ value_col }} 4 | from {{ source_table }} 5 | ) , 6 | CTE_DELTA AS ( 7 | SELECT * 8 | , {{ value_col }} - LAG_{{ value_col }} as DELTA 9 | FROM CTE_LAG1 10 | ) , 11 | CTE_GAINLOSS_SPLIT AS ( 12 | SELECT * 13 | , CASE WHEN DELTA > 0 THEN DELTA WHEN DELTA = 0 THEN 0 ELSE 0 END as GAIN 14 | , CASE WHEN DELTA < 0 THEN abs(DELTA) WHEN DELTA = 0 THEN 0 ELSE 0 END as LOSS 15 | FROM CTE_DELTA 16 | ) , 17 | CTE_MOVINGAVG AS ( 18 | SELECT * 19 | , avg(GAIN) OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ROWS BETWEEN {{ window - 1 }} PRECEDING AND CURRENT ROW) AS AVG_GAIN_{{ window }} 20 | , avg(LOSS) OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ROWS BETWEEN {{ window - 1 }} PRECEDING AND CURRENT ROW) AS AVG_LOSS_{{ window }} 21 | FROM CTE_GAINLOSS_SPLIT 22 | ) , 23 | CTE_RSI AS ( 24 | SELECT * 25 | , CASE WHEN AVG_LOSS_{{ window }}=0 THEN 100 ELSE 100 - (100 / (1+(AVG_GAIN_{{ window }} / AVG_LOSS_{{ window }}))) END as {{ value_col }}_RSI_{{ window }} 26 | FROM CTE_MOVINGAVG 27 | ) , 28 | CTE_FINAL AS ( 29 | SELECT {{ order_col }}, {{ partition_col }}, {{ value_col }}_RSI_{{ window }} 30 | FROM CTE_RSI 31 | ) 32 | SELECT A.*, B.{{ 
value_col }}_RSI_{{ window }} 33 | FROM {{ source_table }} A 34 | INNER JOIN CTE_FINAL B 35 | ON A.{{ partition_col }} = B.{{ partition_col }} 36 | AND A.{{ order_col }} = B.{{ order_col }} 37 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.yaml: -------------------------------------------------------------------------------- 1 | name: sliding_slope 2 | tags: 3 | - table 4 | - time_series 5 | description: | 6 | Calculates the linear slope on a given row, looking backwards for a user-defined window of periods. 7 | 8 | Pass in a partition_col, an order_col, and a lookback window size. 9 | 10 | NOTE: Your data should be a properly formatted timeseries dataset before applying this transformation. In other words, each period should only appear once, and periods considered zero should be imputed with 0 already. 11 | NOTE: Slope calculations are notoriously sensitive to large outliers, especially with smaller windows. 12 | 13 | Example use case: On daily stock data, calculate SLOPE by TICKER, with a 14-period lookback window. 14 | arguments: 15 | partition_col: 16 | type: column 17 | description: | 18 | Grouping column to calculate the slope within. 19 | order_col: 20 | type: column 21 | description: Column to order rows by when calculating the agg window. Slope automatically sorts ascending. 22 | value_col: 23 | type: column 24 | description: Column to calulate slope for. 25 | window: 26 | type: int 27 | description: | 28 | Number of periods to use as a lookback period, to calculate slope. 
29 | example_code: | 30 | ds = rasgo.get.dataset(fqtn="RASGOCOMMUNITY.PUBLIC.ZEPL_DAILY_STOCK_FEATURES") 31 | 32 | ds2 = ds.sliding_slope(partition_col = 'TICKER', 33 | order_col = 'DATE', 34 | value_col = 'CLOSE', 35 | window = 14) 36 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/encode_values/encode_values.yaml: -------------------------------------------------------------------------------- 1 | name: label_encode 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: | 6 | Encodes values in a column through a variety of methods: 7 | 8 | Label Encoding: 9 | Encode target labels with value between 0 and n_classes-1. See scikit-learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder) for full documentation. 10 | 11 | Target Encoding: 12 | Encode a categorical column with the average value of a target column for the corresponding value of the categorical column. 13 | See scikit-learn's [TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html) for full documentation. 14 | 15 | One Hot Encoding: 16 | Encode a categorical column as a 0 or 1 for each possible category, each of which will be it's own row. 17 | 18 | 19 | arguments: 20 | method: 21 | type: string 22 | description: Encoding method which will be used ('label', 'target', or 'oh') 23 | column: 24 | type: column 25 | description: Column name to label encode 26 | target: 27 | type: column 28 | description: Required if method = 'target'. 
Numeric target column to use to create averages 29 | is_optional: true 30 | 31 | example_code: | 32 | ds = rasgo.get.dataset(id) 33 | 34 | ds2 = ds.label_encode(column='WEATHER_DESCRIPTION', method='oh') 35 | ds2.preview() 36 | 37 | ds3 = ds.target_encode(column='WEATHER_DESCRIPTION', target='DAILY_HIGH_TEMP') 38 | ds3.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/vlookup/vlookup.yaml: -------------------------------------------------------------------------------- 1 | name: vlookup 2 | tags: 3 | - table 4 | - join 5 | - reshape 6 | description: | 7 | ## Inspired by Excel... a VLookup experience that works in SQL 8 | 9 | ### Required Inputs 10 | - Lookup Column: The column to look up in the Lookup Table. Make sure the column is named the same in both tables. 11 | - Lookup Table: The table to look up the Lookup Column in. 12 | 13 | ### Optional Inputs 14 | - Keep Columns: The columns to keep from the Lookup Table. If not provided, all columns from the Lookup Table will be kept. 
15 | 16 | ### Notes 17 | - For values that don't find a match in the lookup_column, you will see Null 18 | - For columns that have the same name in both tables, the columns in the Lookup Table will be prefixed with the table name 19 | 20 | arguments: 21 | lookup_column: 22 | type: column 23 | description: | 24 | Column to look up in the lookup table 25 | lookup_table: 26 | type: table 27 | description: | 28 | Table to look up the lookup_column in 29 | keep_columns: 30 | type: column_list 31 | description: | 32 | Columns to keep from the lookup table 33 | is_optional: true 34 | context: 35 | tableArg: lookup_table 36 | example_code: | 37 | internet_sales = rasgo.get.dataset(74) 38 | customer = rasgo.get.dataset(55) 39 | product = rasgo.get.dataset(75) 40 | 41 | ds2 = internet_sales.vlookup( 42 | lookup_column='PRODUCTKEY', 43 | lookup_table=product.fqtn, 44 | keep_columns=['WEIGHT', 'ENGLISHDESCRIPTION'] 45 | ) 46 | ds2.preview() -------------------------------------------------------------------------------- /.github/workflows/publish_accelerators.yaml: -------------------------------------------------------------------------------- 1 | # Publish all Accelerators in this Repo Production and Staging Environments 2 | # Run job each time something is pushed/committed to remote 'main' branch 3 | name: Publish Accelerators 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | environment: 9 | description: 'Environment that accelerators will be published to' 10 | default: 'all' 11 | required: true 12 | 13 | jobs: 14 | publish-accelerators: 15 | runs-on: ubuntu-latest 16 | defaults: 17 | run: 18 | shell: bash 19 | 20 | container: 21 | image: "python:3.7" 22 | 23 | env: 24 | PYTHONPATH: /__w/RasgoTransforms/RasgoTransforms 25 | RASGO_COMMUNITY_API_KEY: ${{ secrets.RASGO_COMMUNITY_API_KEY }} 26 | RASGO_STAGING_COMMUNITY_API_KEY: ${{ secrets.RASGO_STAGING_COMMUNITY_API_KEY }} 27 | 28 | steps: 29 | - uses: actions/checkout@v2 30 | 31 | - name: Install Python Requirements 32 | 
run: | 33 | python -m pip install --upgrade pip 34 | pip install -r python/requirements.txt 35 | 36 | - name: Publish Accelerators on Prod 37 | if: github.event.inputs.environment == 'production' || github.event.inputs.environment == 'all' 38 | run: python python/publish_accelerators.py "$RASGO_COMMUNITY_API_KEY" -d production 39 | 40 | - name: Publish Accelerators on Staging 41 | if: github.event.inputs.environment == 'staging' || github.event.inputs.environment == 'all' 42 | run: python python/publish_accelerators.py "$RASGO_STAGING_COMMUNITY_API_KEY" -d staging 43 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/bin/bin.yaml: -------------------------------------------------------------------------------- 1 | name: bin 2 | tags: 3 | - column 4 | - aggregate 5 | - feature_engineering 6 | description: | 7 | This function will categorize or bin an input column such that for N bins, an output column is created with values `[1-N]` where each value represents some bin. 8 | 9 | This transformation supports two binning methods (called "binning_type" in the arguments): `ntile` and `equalwidth`. 10 | 11 | ## N-tile 12 | When using `ntile` binnint the boundaries for the bins are calculated such that each bin will receive an almost equal number of elements. It will create a new column called {{column}}_{{bin_count}}_NTB. This ensures that multiple equal-weight binning operations will produce column names that don't overlap. 13 | 14 | ## Equal Width 15 | The `equalwidth` method will calculate the boundaries of the bins such that they will be of equal width based on the min and max value within the source column. This transformation will create a new column called {{column}}_{{bin_count}}_EWB. This ensures that multiple equal-weight binning operations will produce column names that don't overlap. 
16 | arguments: 17 | type: 18 | type: string 19 | description: binning algorithm to use; must be `ntile` or `equalwidth` 20 | bin_count: 21 | type: int 22 | description: the number of bins to use 23 | column: 24 | type: column 25 | description: which column to bucket 26 | example_code: | 27 | ds = rasgo.get.dataset(id) 28 | 29 | ds2 = ds.bin(type='equalwidth', bin_count=6, column='DAILY_HIGH_TEMP') 30 | ds2.preview() 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.yaml: -------------------------------------------------------------------------------- 1 | name: rolling_agg 2 | tags: 3 | - table 4 | - aggregate 5 | - date_time 6 | description: | 7 | Row-based; Calculates a rolling aggregate based on a relative row window. 8 | 9 | Pass in order_by columns and offsets to create row-based look-back or look-forward windows. 10 | 11 | Example use case: Aggregate the last 10 sales for a customer regardless of when they occurred. 12 | arguments: 13 | aggregations: 14 | type: agg_dict 15 | description: | 16 | Dictionary of columns and aggregate functions to apply. 17 | A column can have a list of multiple aggregates applied. 18 | One column will be created for each column:aggregate pair. 19 | order_by: 20 | type: column_list 21 | description: Column(s) to order rows by when calculating the agg window 22 | offsets: 23 | type: int_list 24 | description: | 25 | List of numeric values to offset the rows in the window. 26 | Positive values apply a look-back window. 27 | Negative values apply a look-forward window. 28 | One column will be created for each offset value. 
29 | group_by: 30 | type: column_list 31 | description: Column(s) to group by when calculating the agg window 32 | is_optional: True 33 | example_code: | 34 | internet_sales = rasgo.get.dataset(74) 35 | 36 | ds = internet_sales.rolling_agg( 37 | aggregations={ 38 | 'SALESAMOUNT':['MAX', 'MIN', 'SUM'] 39 | }, 40 | order_by=['ORDERDATE'], 41 | offsets=[-7, 7, 14], 42 | group_by=['PRODUCTKEY'], 43 | ) 44 | -------------------------------------------------------------------------------- /docs/train_test_split.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # train_test_split 4 | 5 | Label rows as part of the train or test set based off of percentage split you want to apply to the data. 6 | 7 | If you want a row-wise random sample applied, do not pass an order_by column. If you want an ordered split, then pass the order_by column. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 14 | | order_by | column_list | Optional argument that affects the train/test split method applied. if needed, pass the names of column(s) you want to order by when applying the split. | True | 15 | | train_percent | int | Percent of the data you want in the train set, expressed as a decimal (i.e. .8). The rest of the rows will be included in the test set. 
| | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.train_test_split(order_by = ['DATE'], 24 | train_percent = 0.8) 25 | ds2.preview() 26 | 27 | ds2b = ds.train_test_split(train_percent = 0.8) 28 | ds2b.preview() 29 | ``` 30 | 31 | ## Source Code 32 | 33 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.sql" %} 34 | 35 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/join/join.yaml: -------------------------------------------------------------------------------- 1 | name: join 2 | tags: 3 | - table 4 | - join 5 | - reshape 6 | description: | 7 | Join a dataset with another dataset, by matching on one or more columns between the two tables. 8 | 9 | If you pass a join_prefix, all column names in the join table will be named "{join_prefix}_{columnname}". 10 | If you don't pass a join_prefix, columns that share the same name in both tables will only have the column from the base table included in the final output. 11 | arguments: 12 | join_table: 13 | type: table 14 | description: Dataset object to join with the source dataset. 15 | join_type: 16 | type: join_type 17 | description: LEFT, RIGHT, or INNER 18 | join_columns: 19 | type: join_dict 20 | description: Columns to use for the join. Keys are columns in the source_table and values are columns in the join_table. 21 | join_prefix: 22 | type: value 23 | is_optional: true 24 | description: Prefix all columns in the join_table with a string to differentiate them 25 | filters: 26 | type: filter_list 27 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
28 | is_optional: true 29 | example_code: | 30 | internet_sales = rasgo.get.dataset(74) 31 | product = rasgo.get.dataset(75) 32 | 33 | ds2 = internet_sales.join( 34 | join_table=product, 35 | join_columns={'PRODUCTKEY':'PRODUCTKEY'}, 36 | join_type='LEFT', 37 | join_prefix='product', 38 | filters=['CUSTOMERKEY IS NOT NULL', 'ORDERDATE < CURRENT_DATE()']) 39 | 40 | ds2.preview() -------------------------------------------------------------------------------- /docs/conditional_agg.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # conditional_agg 4 | 5 | Pass in a list of filter rules, and aggregate rows that match. 6 | 7 | If multiple rules are passed, they are combined and aggregated both together and separately. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ---------- | ---------- | -------------------------------------------------------------------- | ----------- | 14 | | rules | value_list | List of filter rules to use | | 15 | | agg_column | column | Column to aggregate | | 16 | | agg | agg | Method to use when aggregating the agg_column | | 17 | | distinct | boolean | When aggregating the agg_column, use TRUE to qualify with a DISTINCT | | 18 | 19 | 20 | ## Example 21 | 22 | ```python 23 | customer = rasgo.get.dataset(55) 24 | 25 | rules = [ 26 | "FIRSTNAME LIKE 'J%'", 27 | "BIRTHDATE < '1970-01-01'", 28 | "ENGLISHEDUCATION = 'Bachelors'", 29 | "MARITALSTATUS = 'M'", 30 | "GENDER='F'"] 31 | 32 | ds2 = customer.conditional_agg(rules=rules, 33 | agg_column='CUSTOMERKEY', 34 | agg='COUNT', 35 | distinct=True) 36 | ds2.preview() 37 | ``` 38 | 39 | ## Source Code 40 | 41 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql" %} 42 | 43 | --------------------------------------------------------------------------------