├── python ├── __init__.py ├── requirements.txt └── constants.py ├── .dockerignore ├── rasgotransforms ├── rasgotransforms │ ├── tests │ │ ├── __init__.py │ │ └── transforms │ │ │ ├── __init__.py │ │ │ └── plot │ │ │ └── __init__.py │ ├── snippets │ │ ├── int.sql │ │ ├── column.sql │ │ ├── date.sql │ │ ├── table.sql │ │ ├── alias.sql │ │ ├── boolean.sql │ │ ├── number.sql │ │ ├── string.sql │ │ ├── agg.sql │ │ ├── date_part.sql │ │ ├── join_type.sql │ │ ├── timestamp.sql │ │ ├── column_value.sql │ │ ├── custom_option.sql │ │ ├── sort_direction.sql │ │ ├── column_list.sql │ │ ├── number_list.sql │ │ ├── table_list.sql │ │ ├── column_or_expression.sql │ │ ├── column_or_expression_list.sql │ │ ├── string_list.sql │ │ ├── column_value_list.sql │ │ ├── custom_option_list.sql │ │ ├── filter_list.sql │ │ ├── sort_dict.sql │ │ ├── calculated_column_list.sql │ │ ├── column_agg_list.sql │ │ └── agg_dict.sql │ ├── exceptions.py │ ├── transforms │ │ ├── select │ │ │ ├── select.sql │ │ │ └── select.yaml │ │ ├── dropna │ │ │ ├── dropna.py │ │ │ ├── dropna.yaml │ │ │ └── dropna.sql │ │ ├── filter │ │ │ ├── filter.py │ │ │ ├── filter.sql │ │ │ └── filter.yaml │ │ ├── order │ │ │ ├── order.py │ │ │ ├── order.sql │ │ │ └── order.yaml │ │ ├── sample │ │ │ ├── sample.py │ │ │ ├── sample.yaml │ │ │ └── sample.sql │ │ ├── sample_class │ │ │ ├── sample_class.py │ │ │ ├── sample_class.sql │ │ │ └── sample_class.yaml │ │ ├── remove_duplicates │ │ │ ├── remove_duplicates.py │ │ │ ├── remove_duplicates.sql │ │ │ └── remove_duplicates.yaml │ │ ├── unpivot │ │ │ ├── unpivot.sql │ │ │ └── unpivot.yaml │ │ ├── text_to_sql │ │ │ ├── text_to_sql.sql │ │ │ └── text_to_sql.yaml │ │ ├── train_test_split │ │ │ ├── train_test_split.py │ │ │ ├── train_test_split.sql │ │ │ └── train_test_split.yaml │ │ ├── target_encode │ │ │ ├── target_encode.py │ │ │ ├── target_encode.sql │ │ │ └── target_encode.yaml │ │ ├── remove_outliers │ │ │ └── remove_outliers.py │ │ ├── datepart │ │ │ ├── datepart.py │ │ │ ├── 
datepart.sql │ │ │ ├── postgresql │ │ │ │ └── datepart.sql │ │ │ ├── redshift │ │ │ │ └── datepart.sql │ │ │ ├── snowflake │ │ │ │ └── datepart.sql │ │ │ ├── datepart.yaml │ │ │ └── bigquery │ │ │ │ └── datepart.sql │ │ ├── label_encode │ │ │ ├── label_encode.py │ │ │ ├── snowflake │ │ │ │ └── label_encode.sql │ │ │ ├── bigquery │ │ │ │ └── label_encode.sql │ │ │ └── label_encode.yaml │ │ ├── funnel │ │ │ ├── funnel.sql │ │ │ └── funnel.yaml │ │ ├── market_basket │ │ │ ├── market_basket.py │ │ │ ├── market_basket.sql │ │ │ └── market_basket.yaml │ │ ├── datespine │ │ │ └── datespine.py │ │ ├── if_then │ │ │ ├── if_then.sql │ │ │ ├── if_then.py │ │ │ └── if_then.yaml │ │ ├── datetrunc │ │ │ ├── bigquery │ │ │ │ └── datetrunc.sql │ │ │ ├── postgresql │ │ │ │ └── datetrunc.sql │ │ │ ├── redshift │ │ │ │ └── datetrunc.sql │ │ │ ├── snowflake │ │ │ │ └── datetrunc.sql │ │ │ └── datetrunc.py │ │ ├── apply │ │ │ ├── apply.sql │ │ │ └── apply.yaml │ │ ├── moving_avg │ │ │ ├── moving_avg.py │ │ │ ├── moving_avg.sql │ │ │ └── moving_avg.yaml │ │ ├── datediff │ │ │ ├── datediff.py │ │ │ ├── bigquery │ │ │ │ └── datediff.sql │ │ │ ├── datediff.sql │ │ │ ├── snowflake │ │ │ │ └── datediff.sql │ │ │ └── datediff.yaml │ │ ├── lag │ │ │ ├── lag.py │ │ │ ├── lag.sql │ │ │ ├── bigquery │ │ │ │ └── lag.sql │ │ │ └── lag.yaml │ │ ├── lead │ │ │ ├── lead.py │ │ │ ├── lead.sql │ │ │ ├── bigquery │ │ │ │ └── lead.sql │ │ │ └── lead.yaml │ │ ├── prefix │ │ │ ├── prefix.sql │ │ │ └── prefix.yaml │ │ ├── suffix │ │ │ ├── suffix.sql │ │ │ └── suffix.yaml │ │ ├── uppercase_columns │ │ │ ├── bigquery │ │ │ │ └── uppercase_columns.sql │ │ │ ├── snowflake │ │ │ │ └── uppercase_columns.sql │ │ │ └── uppercase_columns.yaml │ │ ├── replace_string │ │ │ ├── replace_string.py │ │ │ └── replace_string.sql │ │ ├── drop_columns │ │ │ ├── drop_columns.py │ │ │ ├── drop_columns.sql │ │ │ └── drop_columns.yaml │ │ ├── replace_missing │ │ │ ├── replace_missing.py │ │ │ └── replace_missing.yaml │ │ ├── 
new_columns │ │ │ ├── new_columns.sql │ │ │ └── new_columns.yaml │ │ ├── aggregate_string │ │ │ ├── aggregate_string.sql │ │ │ └── aggregate_string.yaml │ │ ├── rename │ │ │ ├── rename.py │ │ │ ├── rename.sql │ │ │ ├── snowflake │ │ │ │ └── rename.sql │ │ │ └── rename.yaml │ │ ├── linear_regression │ │ │ ├── linear_regression.py │ │ │ ├── linear_regression.sql │ │ │ └── linear_regression.yaml │ │ ├── cast │ │ │ ├── cast.py │ │ │ ├── cast.sql │ │ │ ├── cast.yaml │ │ │ └── bigquery │ │ │ │ └── cast.sql │ │ ├── min_max_scaler │ │ │ ├── min_max_scaler.py │ │ │ └── min_max_scaler.sql │ │ ├── standard_scaler │ │ │ ├── standard_scaler.py │ │ │ └── standard_scaler.sql │ │ ├── latest │ │ │ ├── latest.py │ │ │ ├── latest.sql │ │ │ └── latest.yaml │ │ ├── to_date │ │ │ ├── to_date.sql │ │ │ ├── to_date.py │ │ │ └── to_date.yaml │ │ ├── join │ │ │ ├── join.py │ │ │ └── join.yaml │ │ ├── profile_column │ │ │ ├── profile_column.sql │ │ │ └── profile_column.yaml │ │ ├── sankey │ │ │ ├── sankey.sql │ │ │ └── sankey.yaml │ │ ├── dateadd │ │ │ ├── dateadd.sql │ │ │ ├── snowflake │ │ │ │ └── dateadd.sql │ │ │ ├── postgresql │ │ │ │ └── dateadd.sql │ │ │ ├── redshift │ │ │ │ └── dateadd.sql │ │ │ ├── bigquery │ │ │ │ └── dateadd.sql │ │ │ ├── dateadd.yaml │ │ │ └── dateadd.py │ │ ├── rank │ │ │ ├── rank.py │ │ │ └── rank.sql │ │ ├── correlation │ │ │ ├── correlation.yaml │ │ │ └── correlation.sql │ │ ├── bin │ │ │ ├── bin.sql │ │ │ ├── bigquery │ │ │ │ └── bin.sql │ │ │ └── bin.yaml │ │ ├── describe │ │ │ └── describe.yaml │ │ ├── math │ │ │ ├── math.sql │ │ │ ├── bigquery │ │ │ │ └── math.sql │ │ │ └── snowflake │ │ │ │ └── math.sql │ │ ├── union │ │ │ ├── union.sql │ │ │ └── union.yaml │ │ ├── cumulative_agg │ │ │ └── cumulative_agg.sql │ │ ├── entropy │ │ │ └── entropy.yaml │ │ ├── one_hot_encode │ │ │ ├── one_hot_encode.yaml │ │ │ └── one_hot_encode.sql │ │ ├── unions │ │ │ ├── unions.sql │ │ │ └── unions.yaml │ │ ├── aggregate │ │ │ ├── aggregate.yaml │ │ │ └── aggregate.py │ │ 
├── conditional_agg │ │ │ ├── conditional_agg.sql │ │ │ └── conditional_agg.yaml │ │ ├── rolling_agg │ │ │ ├── rolling_agg.sql │ │ │ └── rolling_agg.yaml │ │ ├── datarobot_score │ │ │ └── datarobot_score.sql │ │ ├── histogram │ │ │ └── histogram.yaml │ │ ├── timeseries_agg │ │ │ ├── timeseries_agg.sql │ │ │ └── bigquery │ │ │ │ └── timeseries_agg.sql │ │ ├── extract_sequences │ │ │ └── snowflake │ │ │ │ └── extract_sequences.sql │ │ ├── summarize_flatlines │ │ │ ├── summarize_flatlines.sql │ │ │ └── summarize_flatlines.yaml │ │ ├── heatmap │ │ │ └── heatmap.yaml │ │ ├── summarize │ │ │ ├── summarize.sql │ │ │ └── summarize.yaml │ │ ├── vlookup │ │ │ ├── vlookup.sql │ │ │ └── vlookup.yaml │ │ ├── ratio_with_shrinkage │ │ │ ├── ratio_with_shrinkage.sql │ │ │ ├── snowflake │ │ │ │ └── ratio_with_shrinkage.sql │ │ │ └── ratio_with_shrinkage.yaml │ │ ├── clean │ │ │ └── clean.yaml │ │ ├── reshape │ │ │ ├── bigquery │ │ │ │ └── reshape.sql │ │ │ └── snowflake │ │ │ │ └── reshape.sql │ │ ├── sliding_slope │ │ │ ├── sliding_slope.sql │ │ │ ├── snowflake │ │ │ │ └── sliding_slope.sql │ │ │ └── sliding_slope.yaml │ │ ├── pivot_table │ │ │ └── pivot_table.yaml │ │ ├── rsi │ │ │ └── rsi.sql │ │ └── encode_values │ │ │ └── encode_values.yaml │ ├── version.py │ ├── __init__.py │ ├── render │ │ └── __init__.py │ ├── macros │ │ ├── prior.sql │ │ ├── rolling.sql │ │ └── period_to_date.sql │ └── dtypes.py ├── requirements.txt ├── scripts │ ├── install-local.sh │ └── publish-pypi.sh ├── requirements-tests.txt └── DESCRIPTION.md ├── bin ├── backup-source.sh └── publish-pypi.sh ├── .github └── workflows │ ├── backup.yaml │ ├── run_tests.yaml │ ├── publish.yaml │ ├── docs_generation.yaml │ └── publish_accelerators.yaml └── docs ├── select.md ├── prefix.md ├── suffix.md ├── accelerators ├── baby_name_analysis.md ├── web_traffic_channels.md ├── website_page_performance.md ├── plg.md ├── sales_growth_funnel.md └── omni_channel_performance.md ├── uppercase_columns.md ├── correlation.md ├── 
profile_column.md ├── label_encode.md ├── order.md ├── describe.md ├── sample_class.md ├── rename.md ├── sankey.md ├── entropy.md ├── target_encode.md ├── apply.md ├── filter.md ├── one_hot_encode.md ├── new_columns.md ├── datepart.md ├── cast.md ├── latest.md ├── text_to_sql.md ├── lead.md ├── sample.md ├── moving_avg.md ├── aggregate.md ├── histogram.md ├── to_date.md ├── funnel.md ├── remove_duplicates.md ├── drop_columns.md ├── union.md ├── market_basket.md ├── unions.md ├── dropna.md ├── heatmap.md ├── train_test_split.md └── conditional_agg.md /python/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | ./tmp/* 2 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/int.sql: -------------------------------------------------------------------------------- 1 | {{ int }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column.sql: -------------------------------------------------------------------------------- 1 | {{ column }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/date.sql: -------------------------------------------------------------------------------- 1 | '{{ date }}' -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/table.sql: -------------------------------------------------------------------------------- 1 | {{ table }} 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/tests/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/alias.sql: -------------------------------------------------------------------------------- 1 | AS {{ alias }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/boolean.sql: -------------------------------------------------------------------------------- 1 | {{ boolean }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/number.sql: -------------------------------------------------------------------------------- 1 | {{ decimal }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/string.sql: -------------------------------------------------------------------------------- 1 | '{{ string }}' -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/tests/transforms/plot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rasgotransforms/requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2>=2.0 2 | pyyaml>=5.0 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/agg.sql: -------------------------------------------------------------------------------- 1 | {{ agg }}( column_name ) -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/snippets/date_part.sql: -------------------------------------------------------------------------------- 1 | {{ date_part }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/join_type.sql: -------------------------------------------------------------------------------- 1 | {{ join_type }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/timestamp.sql: -------------------------------------------------------------------------------- 1 | '{{ timestamp }}' -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_value.sql: -------------------------------------------------------------------------------- 1 | {{ column_value }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/custom_option.sql: -------------------------------------------------------------------------------- 1 | {{ custom_option }} -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | pyrasgo>=2.5.1 3 | pytablewriter 4 | pyyaml 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/sort_direction.sql: -------------------------------------------------------------------------------- 1 | {{ sort_direction }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_list.sql: -------------------------------------------------------------------------------- 1 | {{ column_list | join(', ') }} -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/snippets/number_list.sql: -------------------------------------------------------------------------------- 1 | {{ number_list | join(', ') }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/table_list.sql: -------------------------------------------------------------------------------- 1 | {{ table_list | join(', ') }} -------------------------------------------------------------------------------- /rasgotransforms/scripts/install-local.sh: -------------------------------------------------------------------------------- 1 | python -m pip install -e rasgotransforms 2 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_or_expression.sql: -------------------------------------------------------------------------------- 1 | {{ column_or_expression }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/exceptions.py: -------------------------------------------------------------------------------- 1 | class RenderException(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/select/select.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM {{source_table}} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_or_expression_list.sql: -------------------------------------------------------------------------------- 1 | {{ column_or_expression_list | join(',') }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package version for pypi 3 | 
""" 4 | __version__ = "2.7.8" 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/string_list.sql: -------------------------------------------------------------------------------- 1 | {% for string in string_list %}'{{ string }}'{{ ',' if not loop.last }}{% endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dropna/dropna.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/filter/filter.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/order/order.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample/sample.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import serve_rasgo_transform_templates, serve_rasgo_transform_snippets, DataWarehouse 2 | -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | return source_columns 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/column_value_list.sql: -------------------------------------------------------------------------------- 1 | ( 2 | {% for val in column_value_list %} 3 | '{{ val }}'{{ ',' if not loop.last }} 4 | {% endfor %} 5 | ) -------------------------------------------------------------------------------- /rasgotransforms/requirements-tests.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery 2 | pandas 3 | pytest 4 | python-dotenv 5 | snowflake-connector-python 6 | snowflake-connector-python[pandas] 7 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/render/__init__.py: -------------------------------------------------------------------------------- 1 | from .environment import RasgoEnvironment 2 | from .infer_columns import infer_columns 3 | from .transforms import Transforms 4 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/custom_option_list.sql: -------------------------------------------------------------------------------- 1 | ( 2 | {% for option in custom_option_list %} 3 | '{{ option }}'{{ ',' if not loop.last }} 4 | {% endfor %} 5 | ) 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/filter_list.sql: -------------------------------------------------------------------------------- 1 | {% set filters = filter_list %} 2 | {% from 'filter.sql' import get_filter_statement %} 3 | {{ get_filter_statement(filters) }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/sort_dict.sql: -------------------------------------------------------------------------------- 1 | {% for column, direction in sort_dict.items() -%} 2 | {{ column }} {{ direction }}{%- if not loop.last %}, {% endif -%} 3 | {%- endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM {{ source_table }} 2 | UNPIVOT( {{ value_column }} for {{ name_column }} in ( {{ column_list | join(', ')}} )) 3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/text_to_sql/text_to_sql.sql: -------------------------------------------------------------------------------- 1 | {# Placeholder code. Will be replaced by user supplied text #} 2 | Write a query against {{source_table}} that returns {{ text }}... 
3 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns['TT_SPLIT'] = 'varchar' 3 | return source_columns 4 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns[f"{args['column']}_target_encoded"] = 'float' 3 | return source_columns 4 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/order/order.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM {{source_table}} 3 | ORDER BY 4 | {%- for col, order_method in order_by.items() %} 5 | {{ col }} {{ order_method }}{{ ',' if not loop.last else ' ' }} 6 | {%- endfor -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_outliers/remove_outliers.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | if not ('drop' in args and args['drop']): 3 | source_columns['OUTLIER'] = 'boolean' 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/datepart.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for target_col, date_part in args['dates'].items(): 3 | source_columns[f"{target_col}_{date_part}".upper()] = 'int' 4 | return 
source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/label_encode/label_encode.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns[f"{args['column']}_encoded".upper()] = "int" 3 | source_columns["all_values_array"] = "array" 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql: -------------------------------------------------------------------------------- 1 | {%- for col_name in stage_columns -%} 2 | SELECT 3 | '{{ col_name }}' AS LABEL 4 | ,SUM({{ col_name }}) AS LABEL_COUNT 5 | FROM {{ source_table }} 6 | {{ "UNION ALL" if not loop.last else "" }} 7 | {% endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | out_columns = {} 3 | out_columns[f"{args['agg_column']}_listagg"] = 'text' 4 | out_columns["NumTransactions"] = 'int' 5 | return out_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | EXTRACT({{date_part}} FROM {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datespine/datespine.py: 
-------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | source_columns[f"{args['date_col']}_spine_start"] = 'timestamp_ntz' 3 | source_columns[f"{args['date_col']}_spine_end"] = 'timestamp_ntz' 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/if_then/if_then.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | *, 3 | CASE 4 | {%- for condition in conditions %} 5 | {{"WHEN " + condition[0] }} THEN {{ condition[1] }} {% endfor %} 6 | ELSE {{ default }} 7 | END AS {{ cleanse_name(alias) }} 8 | FROM {{ source_table }} 9 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/postgresql/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/redshift/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() 
%} 3 | DATE_PART('{{date_part}}', {{target_col}}) AS {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/bigquery/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{target_col}}, {{date_part}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/postgresql/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/redshift/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | {%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/snowflake/datetrunc.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | DATE_TRUNC({{date_part}}, {{target_col}}) as {{target_col}}_{{date_part}} {{ ", " if not loop.last else "" }} 4 | 
{%- endfor %} 5 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/apply/apply.sql: -------------------------------------------------------------------------------- 1 | {# Placeholder code. Will be replaced by user supplied template #} 2 | {% if sql %} 3 | {{ sql }} 4 | {% else %} 5 | SELECT * FROM {{ source_table }} 6 | {% endif %} 7 | {{ raise_exception('Placeholder code must be replaced by user supplied template') }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datetrunc/datetrunc.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for target_col, date_part in args['dates'].items(): 3 | source_columns[f"{target_col}_{date_part}".upper()] = source_columns[target_col.upper()] 4 | return source_columns 5 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for column in args['input_columns']: 3 | for window in args['window_sizes']: 4 | source_columns[f"mean_{column}_{window}"] = 'float' 5 | return source_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/datediff.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | if 'alias' in args: 3 | source_columns[args['alias']] = 'int' 4 | else: 5 | source_columns[f"DIFF_{args['date_1']}_{args['date_2']}"] = 'int' 6 | return source_columns 7 | -------------------------------------------------------------------------------- 
/rasgotransforms/rasgotransforms/transforms/lag/lag.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for column in args['columns']: 3 | for amount in args['amounts']: 4 | source_columns[f"lag_{column}_{amount}".upper()] = source_columns[column.upper()] 5 | return source_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/lead.py: -------------------------------------------------------------------------------- 1 | def infer_columns(args, source_columns) -> dict: 2 | for column in args['columns']: 3 | for amount in args['amounts']: 4 | source_columns[f"lead_{column}_{amount}".upper()] = source_columns[column.upper()] 5 | return source_columns 6 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql: -------------------------------------------------------------------------------- 1 | {%- for class, n in sample.items() %} 2 | SELECT * FROM 3 | (SELECT * FROM {{ source_table }} WHERE {{ sample_col }} = '{{ class }}') SAMPLE ({{ n }}{{' rows' if n > 1 else ''}}) 4 | {{ '' if loop.last else ' UNION ALL ' }} 5 | {%- endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql: -------------------------------------------------------------------------------- 1 | {%- set source_col_names = get_columns(source_table) -%} 2 | {%- set alias = cleanse_name(prefix) -%} 3 | SELECT 4 | {%- for column in source_col_names %} 5 | {{column}} AS {{ alias~'_'~column }}{{',' if not loop.last else ''}} 6 | {%- endfor %} 7 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql: 
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the replace_string transform.

    When a truthy alias is supplied, the replaced column is exposed under
    that alias as 'varchar'; otherwise a 'REPLACE_<source_col>' column of
    type 'text' is added. Mutates and returns *source_columns*.
    """
    alias = args.get('alias')
    if alias:
        source_columns[alias] = 'varchar'
    else:
        source_columns[f"REPLACE_{args['source_col']}"] = 'text'
    return source_columns
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the drop_columns transform.

    With 'include_cols', only those columns survive; otherwise every
    column not listed in 'exclude_cols' survives. Returns a new dict;
    *source_columns* is left untouched.
    """
    if 'include_cols' in args:
        keep = set(args['include_cols'])
        return {name: dtype for name, dtype in source_columns.items() if name in keep}
    drop = set(args['exclude_cols'])
    return {name: dtype for name, dtype in source_columns.items() if name not in drop}
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/bigquery/datediff.sql: -------------------------------------------------------------------------------- 1 | {%- if alias is defined -%} 2 | {%- set alias = cleanse_name(alias) -%} 3 | {%- else -%} 4 | {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} 5 | {%- endif -%} 6 | 7 | SELECT *, 8 | DATE_DIFF({{ date_1 }}, {{ date_2 }}, {{ date_part }}) as {{ alias }} 9 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/datediff.sql: -------------------------------------------------------------------------------- 1 | {%- if alias is defined -%} 2 | {%- set alias = cleanse_name(alias) -%} 3 | {%- else -%} 4 | {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} 5 | {%- endif -%} 6 | 7 | SELECT *, 8 | EXTRACT({{ date_part }} FROM DATE {{ date_1 }} - DATE {{ date_2 }}) AS {{ alias }} 9 | FROM {{ source_table }} -------------------------------------------------------------------------------- /bin/backup-source.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e; 4 | 5 | # zip up the repo 6 | zip -r ./rasgo-transforms.zip * 7 | 8 | # upload it to s3 9 | export AWS_DEFAULT_REGION=us-east-1 10 | aws s3 cp rasgo-transforms.zip s3://rasgo-source-backups/rasgo-transforms.zip --storage-class GLACIER_IR 11 | 12 | # clean up clean up everybody do your share 13 | rm ./rasgo-transforms.zip 14 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql: -------------------------------------------------------------------------------- 1 | with distinct_values as ( 2 | select array_agg(distinct {{ column }}) within group (order by {{ column }} asc) as all_values_array 
from {{ source_table }} 3 | ) 4 | select *, 5 | array_position({{ column }}::variant,all_values_array) as {{ column }}_encoded 6 | from distinct_values,{{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/new_columns/new_columns.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {%- for col_dict in calculated_columns %} 3 | {%- if 'alias' in col_dict -%} 4 | , {{ col_dict['calculated_column'] }} as {{ cleanse_name(col_dict['alias']) }} 5 | {%- else -%} 6 | , {{ col_dict['calculated_column'] }} 7 | {%- endif %} 8 | {%- endfor %} 9 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/snowflake/datediff.sql: -------------------------------------------------------------------------------- 1 | {%- if alias is defined -%} 2 | {%- set alias = cleanse_name(alias) -%} 3 | {%- else -%} 4 | {%- set alias = 'DIFF_'~ cleanse_name(date_1~'_'~date_2) -%} 5 | {%- endif -%} 6 | 7 | SELECT *, 8 | DATEDIFF({{ date_part }}, {{ date_1 }}, {{ date_2 }}) as {{ alias }} 9 | FROM {{ source_table }} 10 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | * 3 | FROM {{ source_table }} 4 | QUALIFY ROW_NUMBER() OVER ( 5 | PARTITION BY {%- for col in natural_key %} {{col}}{{"," if not loop.last else ""}} {%- endfor %} 6 | ORDER BY {%- for col in order_col %} {{col}}{{"," if not loop.last else ""}} {%- endfor %} {{order_method}} 7 | ) = 1 8 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql: 
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the rename transform.

    Every column present in args['renames'] is emitted under its new
    name; all other columns keep their original name. Column order and
    types are preserved. Returns a new dict.
    """
    renames = args['renames']
    return {renames.get(name, name): dtype for name, dtype in source_columns.items()}
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the linear_regression transform.

    Keeps the grouping columns (with their original types) and appends
    the regression outputs: Slope, Intercept, R2 (double) and the
    human-readable Formula string (varchar). Returns a new dict.
    """
    group_cols = set(args['group_by'])
    out_columns = {name: dtype for name, dtype in source_columns.items() if name in group_cols}
    out_columns.update(Slope='double', Intercept='double', R2='double', Formula='varchar')
    return out_columns
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the min_max_scaler transform.

    With overwrite_columns the scaled value replaces the original column
    (upper-cased name); otherwise a new '<column>_MIN_MAX_SCALED' float
    column is added. Mutates and returns *source_columns*.
    """
    overwrite = bool(args.get('overwrite_columns'))
    for col in args['columns_to_scale']:
        if overwrite:
            source_columns[col.upper()] = 'float'
        else:
            source_columns[f"{col}_MIN_MAX_SCALED"] = 'float'
    return source_columns
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the latest transform.

    Columns used for grouping or ordering pass through unchanged; every
    other column is renamed 'LATEST_<name>' (same type). Returns a new
    dict, preserving column order.
    """
    passthrough = set(args['group_by']) | set(args['order_by'])
    renamed = {}
    for name, dtype in source_columns.items():
        key = name if name in passthrough else f"LATEST_{name}"
        renamed[key] = dtype
    return renamed
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the join transform.

    The join table's columns (looked up in args['source_columns'] by
    args['join_table']) are merged in. With a truthy 'join_prefix' each
    joined column is added as '<PREFIX>_<name>' (prefix upper-cased) by
    mutating *source_columns*; otherwise a new merged dict is returned,
    with join-table columns overriding same-named source columns.
    """
    joined = args['source_columns'][args['join_table']]
    prefix = args.get('join_prefix')
    if not prefix:
        return {**source_columns, **joined}
    tag = prefix.upper()
    source_columns.update({f"{tag}_{name}": dtype for name, dtype in joined.items()})
    return source_columns
8 | arguments: 9 | none: 10 | type: none 11 | description: this transform does not take any arguments 12 | example_code: | 13 | ds = rasgo.get.dataset(id) 14 | 15 | ds2 = ds.uppercase_columns() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/macros/prior.sql: -------------------------------------------------------------------------------- 1 | {% macro prior(metric_name, dimensions, calc_config) %} 2 | {% set alias = metric_name + '_' + calc_config.alias if calc_config.alias is defined else metric_name + '_lag_' + calc_config.interval|string %} 3 | lag( 4 | {{ metric_name }}, {{ calc_config.interval }} 5 | ) over ( 6 | {% if dimensions %} 7 | partition by {{ dimensions | join(", ") }} 8 | {% endif %} 9 | order by period_min 10 | ) as {{ alias }} 11 | {% endmacro %} 12 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/filter/filter.sql: -------------------------------------------------------------------------------- 1 | {% from 'filter.sql' import get_filter_statement %} 2 | {% if items is not defined %} 3 | {% if filter_statements is not defined %} 4 | {{ raise_exception('items is empty: there are no filters to apply') }} 5 | {% else %} 6 | {% set items = filter_statements %} 7 | {% endif %} 8 | {% endif %} 9 | 10 | select * 11 | from {{ source_table }} 12 | where true and 13 | {{ get_filter_statement(items) | indent }} 14 | -------------------------------------------------------------------------------- /.github/workflows/backup.yaml: -------------------------------------------------------------------------------- 1 | name: Backup Source to S3 2 | 3 | on: 4 | schedule: 5 | # Every day at 1am EST 6 | - cron: '0 6 * * *' 7 | 8 | jobs: 9 | backup-source: 10 | runs-on: ubuntu-latest 11 | env: 12 | AWS_ACCESS_KEY_ID: ${{ secrets.S3_BACKUPS_ACCESS_KEY }} 13 | AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_BACKUPS_SECRET_ACCESS_KEY }} 14 | 
15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Push to S3 19 | run: ./bin/backup-source.sh 20 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql: -------------------------------------------------------------------------------- 1 | {%- set source_col_names = get_columns(source_table) -%} 2 | 3 | SELECT 4 | {%- for target_col, new_name in renames.items() %} 5 | {{target_col}} AS {{new_name}}{{ ", " if not loop.last else "" }} 6 | {%- endfor -%} 7 | {%- set renames = (renames|join(',')|upper).split(',') -%} 8 | {%- for col in source_col_names %} 9 | {%- if col|upper not in renames %}, {{col|upper}}{%- endif -%} 10 | {% endfor %} 11 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql: -------------------------------------------------------------------------------- 1 | {%- for i in range((stage|length) - 1) -%} 2 | SELECT 3 | '{{ stage[i] }}_' || CAST({{ stage[i] }} AS STRING) AS SOURCE_NODE, 4 | '{{ stage[i+1] }}_' || CAST({{ stage[i+1] }} AS STRING) AS DEST_NODE, 5 | COUNT(*) AS WIDTH 6 | FROM {{ source_table }} 7 | GROUP BY 8 | SOURCE_NODE, 9 | DEST_NODE 10 | HAVING 11 | SOURCE_NODE IS NOT NULL AND DEST_NODE IS NOT NULL 12 | {{ "UNION ALL" if not loop.last else "" }} 13 | {% endfor %} 14 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part 
-%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | {{ date }} + INTERVAL {{ offset }} {{ date_part }} AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/snowflake/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | DATEADD({{ date_part }}, {{ offset }}, {{ date }}) AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/order/order.yaml: -------------------------------------------------------------------------------- 1 | name: order 2 | tags: 3 | - row 4 | - conditional 5 | - data_cleaning 6 | description: Order a dataset by specified columns, in a specified order 7 | arguments: 8 | order_by: 9 | type: column_value_dict 10 | description: dict where the keys are column names and the values are the order_method (ASC or DESC) 11 | example_code: | 12 | ds = rasgo.get.dataset(id) 13 | 14 | ds2 = ds.order(order_by={'DS_WEATHER_ICON':'ASC', 'DS_DAILY_HIGH_TEMP':'DESC'}) 15 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/postgresql/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set 
untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | {{ date }} + INTERVAL '{{ offset }} {{ date_part }}' AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/redshift/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns -%} 2 | {%- set alias = date -%} 3 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', [alias])|join(',') -%} 4 | {%- else -%} 5 | {%- set untouched_cols = "*" -%} 6 | {%- endif -%} 7 | 8 | {%- set alias = alias if alias is defined else date + '_add' + offset|string + date_part -%} 9 | 10 | SELECT {{ untouched_cols }}, 11 | {{ date }} + INTERVAL '{{ offset }} {{ date_part }}' AS {{ cleanse_name(alias) }} 12 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql: -------------------------------------------------------------------------------- 1 | WITH order_detail as 2 | (SELECT {{transaction_id}}, 3 | listagg({{agg_column}}, '{{sep}}') 4 | WITHIN group (order by {{agg_column}}) as {{agg_column}}_listagg, 5 | COUNT({{agg_column}}) as num_products 6 | FROM {{ source_table }} 7 | GROUP BY {{transaction_id}} ) 8 | 9 | SELECT {{agg_column}}_listagg, count({{transaction_id}}) as NumTransactions 10 | FROM order_detail 11 | where num_products > 1 12 | GROUP BY {{agg_column}}_listagg 13 | order by count({{transaction_id}}) desc -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/bigquery/dateadd.sql: -------------------------------------------------------------------------------- 1 | {%- if 
def infer_columns(args, source_columns) -> dict:
    """Infer output columns for the rank transform.

    With overwrite_columns the ranked source columns are dropped;
    otherwise all source columns are kept. A single integer rank column
    is then appended, named by 'alias' when supplied (truthy) or
    'RANK_<col1>_<col2>_...' by default. Returns a new dict.
    """
    if args.get('overwrite_columns'):
        ranked = set(args['rank_columns'])
        out_columns = {name: dtype for name, dtype in source_columns.items() if name not in ranked}
    else:
        out_columns = dict(source_columns)
    alias = args.get('alias') or f"RANK_{'_'.join(args['rank_columns'])}"
    out_columns[alias] = 'integer'
    return out_columns
/rasgotransforms/rasgotransforms/transforms/correlation/correlation.yaml: -------------------------------------------------------------------------------- 1 | name: correlation 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: heatmap_discrete 6 | tags: 7 | description: Run pairwise correlation on all numeric columns in the source_table 8 | arguments: 9 | rows_to_sample: 10 | type: value 11 | is_optional: true 12 | description: number of rows to sample from the table before calculating correlation 13 | example_code: | 14 | ds = rasgo.get.dataset(id) 15 | 16 | ds2 = ds.correlation(rows_to_sample=1000) 17 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/bin/bin.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {% if type == 'ntile' %} 3 | ntile({{bin_count}}) OVER (ORDER BY {{column}}) AS {{column}}_{{bin_count}}_NTB 4 | {% elif type == 'equalwidth' %} 5 | width_bucket({{column}}, 6 | (SELECT MIN({{column}}) FROM {{source_table}}), 7 | (SELECT MAX({{column}}) FROM {{source_table}}), 8 | {{bin_count}}) AS {{column}}_{{bin_count}}_EWB 9 | {% else %} 10 | {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} 11 | {% endif %} 12 | FROM {{ source_table }} 13 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/snippets/agg_dict.sql: -------------------------------------------------------------------------------- 1 | {%- for col, aggs in agg_dict.items() %} 2 | {%- set outer_loop = loop -%} 3 | {%- for agg in aggs %} 4 | {%- if ' DISTINCT' in agg|upper %} 5 | {{ agg|upper|replace(" DISTINCT", "") }}(DISTINCT {{ col }}) as {{ col ~ '_' ~ agg|upper|replace(" DISTINCT", "") ~ 'DISTINCT'}}{{ '' if loop.last and outer_loop.last else ',' }} 6 | {%- else %} 7 | {{ agg }}({{ col }}) as {{ col + '_' + agg }}{{ '' if 
loop.last and outer_loop.last else ',' }} 8 | {%- endif %} 9 | {%- endfor -%} 10 | {%- endfor %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/macros/rolling.sql: -------------------------------------------------------------------------------- 1 | {% macro rolling(metric_name, dimensions, calc_config) %} 2 | {% set alias = metric_name + '_' + calc_config.alias if calc_config.alias is defined else metric_name + calc_config.aggregate + '_last_' + calc_config.interval|string + '_periods' %} 3 | {{ calc_config.aggregate }}({{ metric_name }}) 4 | over ( 5 | {% if dimensions -%} 6 | partition by {{ dimensions | join(", ") }} 7 | {% endif -%} 8 | order by period_min 9 | rows between {{ calc_config.interval - 1 }} preceding and current row 10 | ) as {{ alias }} 11 | {% endmacro %} 12 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.yaml: -------------------------------------------------------------------------------- 1 | name: sample_class 2 | tags: 3 | - row 4 | - data_cleaning 5 | - conditional 6 | description: Sample n rows for each value of a column 7 | arguments: 8 | sample_col: 9 | type: column 10 | description: The column for which you want to sample 11 | sample: 12 | type: dict 13 | description: Value of column as a key, n rows to be sampled as values 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.sample_class(sample_col='BINARY_TARGET_COLUMNNAME', sample={'1':15000, '0':60000}) 18 | ds2.preview() -------------------------------------------------------------------------------- /.github/workflows/run_tests.yaml: -------------------------------------------------------------------------------- 1 | name: Run RasgoTransforms Tests 2 | 3 | on: push 4 | 5 | jobs: 6 | run-tests: 7 | runs-on: ubuntu-latest 8 | 9 | container: 10 | image: "python:3.7" 11 | 12 | steps: 13 | - uses: 
def infer_columns(args, source_columns) -> dict:
    """Return the output column->type mapping for the to_date transform.

    Every column listed in args['dates'] that exists in source_columns
    becomes type 'date': in place when args['overwrite_columns'] is
    truthy, otherwise as an additional '<COLUMN>_DATE' column alongside
    the original.

    :param args: transform arguments; reads 'dates' (list of column
        names) and optional 'overwrite_columns' (bool-ish flag)
    :param source_columns: mapping of input column name -> type
    :return: mapping of output column name -> type
    """
    # Equivalent to the original "'key' in args and args['key']" check.
    overwrite_columns = bool(args.get('overwrite_columns'))
    out_columns = source_columns.copy()
    # Iterate keys only; the original also unpacked the (unused) type.
    for column_name in source_columns:
        if column_name in args['dates']:
            if overwrite_columns:
                out_columns[column_name] = 'date'
            else:
                out_columns[f"{column_name}_DATE"] = 'date'
    # Bug fix: the original returned the unmodified source_columns,
    # throwing away every change accumulated in out_columns.
    return out_columns
-------------------------------------------------------------------------------- 1 | name: describe 2 | tags: 3 | - table 4 | - math 5 | description: | 6 | Describes the dataset using a consistent set of metrics, based on data type. 7 | Numeric: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MEAN, STD_DEV, MIN, _25_PERCENTILE, _50_PERCENTILE, _75_PERCENTILE, MAX 8 | Other: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MIN, MAX 9 | arguments: 10 | none: 11 | type: none 12 | description: this transform does not take any arguments 13 | example_code: | 14 | ds = rasgo.get.dataset(id) 15 | 16 | ds.describe().to_df() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rename/rename.yaml: -------------------------------------------------------------------------------- 1 | name: rename 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | description: | 7 | Rename columns by passing a renames dict. 8 | arguments: 9 | renames: 10 | type: column_value_dict 11 | description: A dict representing each existing column to be renamed and its corresponding new name. 
12 | example_code: | 13 | ds = rasgo.get.dataset(dataset_id) 14 | ds2 = ds.rename(renames={ 15 | 'DS_WEATHER_ICON': 'Weather', 16 | 'DS_DAILY_HIGH_TEMP': 'High_Temp', 17 | 'DS_DAILY_LOW_TEMP': 'Low_Temp' 18 | }) 19 | ds2.preview() -------------------------------------------------------------------------------- /docs/select.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # select 4 | 5 | select * from a table 6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ---- | ---- | ------------------------------------------ | ----------- | 12 | | none | none | this transform does not take any arguments | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.select() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/select/select.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/math/math.sql: -------------------------------------------------------------------------------- 1 | {%- if names -%} 2 | {%- if names|length != math_ops|length -%} 3 | 4 | {{ raise_exception('Provide a new column alias for each math operation') }} 5 | 6 | {%- elif names|length == math_ops|length -%} 7 | 8 | SELECT * 9 | {%- for math_op in math_ops %} 10 | , {{math_op}} as {{cleanse_name(names[loop.index-1])}} 11 | {%- endfor %} 12 | FROM {{source_table}} 13 | 14 | {%- endif -%} 15 | {%- else -%} 16 | 17 | SELECT * 18 | {%- for math_op in math_ops %} 19 | , {{math_op}} as {{cleanse_name(math_op)}} 20 | {%- endfor %} 21 | FROM {{source_table}} 22 | 23 | {%- endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/if_then/if_then.py: 
def infer_columns(args, source_columns) -> dict:
    """Infer the output column/type mapping for the if_then transform.

    Adds the new conditional column (args['alias'], uppercased) to
    source_columns. Its type is taken from the default value when that
    names an existing column, else from the condition's result column
    (args['conditions'][1]), else from the Python type of the default
    literal.

    :param args: transform arguments; reads 'default', 'conditions',
        and 'alias'
    :param source_columns: mapping of column name -> type; mutated in
        place (as in the original) and returned
    """
    default = args['default']
    # Guard with isinstance: numeric defaults have no .upper(), so the
    # original raised AttributeError here and never reached the
    # int/float branches below.
    if isinstance(default, str) and default.upper() in source_columns:
        # Bug fix: index with the uppercased name that was tested above;
        # the original indexed the raw name and raised KeyError whenever
        # the default was not already uppercase.
        output_type = source_columns[default.upper()]
    elif isinstance(args['conditions'][1], str) and args['conditions'][1].upper() in source_columns:
        output_type = source_columns[args['conditions'][1].upper()]
    elif type(default) is int:
        output_type = 'integer'
    elif type(default) is float:
        output_type = 'float'
    else:
        output_type = 'text'
    source_columns[args['alias'].upper()] = output_type
    return source_columns
7 | 8 | arguments: 9 | column: 10 | type: column 11 | description: Column name to label encode 12 | example_code: | 13 | ds = rasgo.get.dataset(id) 14 | 15 | ds2 = ds.label_encode(column='WEATHER_DESCRIPTION') 16 | ds2.preview() 17 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/math/snowflake/math.sql: -------------------------------------------------------------------------------- 1 | {%- if names -%} 2 | {%- if names|length != math_ops|length -%} 3 | 4 | {{ raise_exception('Provide a new column alias for each math operation') }} 5 | 6 | {%- elif names|length == math_ops|length -%} 7 | 8 | SELECT * 9 | {%- for math_op in math_ops %} 10 | , {{math_op}} as {{cleanse_name(names[loop.index-1])}} 11 | {%- endfor %} 12 | FROM {{source_table}} 13 | 14 | {%- endif -%} 15 | {%- else -%} 16 | 17 | SELECT * 18 | {%- for math_op in math_ops %} 19 | , {{math_op}} as {{cleanse_name(math_op)}} 20 | {%- endfor %} 21 | FROM {{source_table}} 22 | 23 | {%- endif -%} -------------------------------------------------------------------------------- /docs/prefix.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # prefix 4 | 5 | Add a prefix to each column name 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------ | ----- | ------------------------------------ | ----------- | 11 | | prefix | value | text to prefix each column name with | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(74) 18 | 19 | ds2 = ds.prefix(prefix='PRODUCT') 20 | 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/prefix/prefix.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /docs/suffix.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | # suffix 4 | 5 | Add a suffix to each column name 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------ | ----- | ------------------------------------ | ----------- | 11 | | suffix | value | text to suffix each column name with | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(74) 18 | 19 | ds2 = ds.suffix(suffix='PRODUCT') 20 | 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/suffix/suffix.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql: -------------------------------------------------------------------------------- 1 | {%- for amount in window_sizes -%} 2 | {%- if amount < 0 -%} 3 | {{ raise_exception('Cannot use negative values for a moving average. 
Please only pass positive values in `window_sizes`.') }} 4 | {%- endif -%} 5 | {%- endfor -%} 6 | SELECT * 7 | {%- for column in input_columns -%} 8 | {%- for window in window_sizes -%} 9 | , avg({{column}}) OVER(PARTITION BY {{partition | join(", ")}} ORDER BY {{order_by | join(", ")}} ROWS BETWEEN {{window - 1}} PRECEDING AND CURRENT ROW) AS mean_{{column}}_{{window}} 10 | {%- endfor %} 11 | {%- endfor %} 12 | FROM {{ source_table }} 13 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/union/union.sql: -------------------------------------------------------------------------------- 1 | {# Get all Columns in Source Table #} 2 | {%- set source_col_names = get_columns(source_table) -%} 3 | 4 | {# Get all columns in Inputted Source #} 5 | {%- set other_source_col_names = get_columns(dataset2) -%} 6 | 7 | {# Get Unique Columns Across Both Datasets #} 8 | {%- set union_cols = source_col_names.keys()|list + other_source_col_names.keys()|list -%} 9 | {%- set union_cols = union_cols | unique | list -%} 10 | 11 | {# Generate Union Query #} 12 | SELECT {{ union_cols | join(', ') }} FROM {{ dataset2 }} 13 | UNION {{ 'ALL' if keep_dupes else '' }} 14 | SELECT {{ union_cols | join(', ') }} FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/cumulative_agg/cumulative_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for col, aggs in aggregations.items() -%} 3 | {%- for agg in aggs %} 4 | , {{ agg }}({{ col }}) OVER( 5 | {%- if group_by %} 6 | PARTITION BY {{ group_by | join(", ") }} 7 | {% endif -%} 8 | ORDER BY {{ order_by | join(", ") }} 9 | {% if direction and direction.lower() == 'forward' -%} 10 | ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING 11 | {% else -%} 12 | ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW 13 | {%- endif -%} 14 | ) as 
{{ cleanse_name(agg + '_' + col) }} 15 | {%- endfor -%} 16 | {%- endfor %} 17 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lag/lag.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | SELECT *, 8 | {%- for col in columns -%} 9 | {%- for amount in amounts %} 10 | lag({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as Lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 11 | {%- endfor -%} 12 | {{ ", " if not loop.last else "" }} 13 | {%- endfor %} 14 | from {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/lead.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | SELECT *, 8 | {%- for col in columns -%} 9 | {%- for amount in amounts %} 10 | lead({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 11 | {%- endfor -%} 12 | {{ ", " if not loop.last else "" }} 13 | {%- endfor %} 14 | from {{ source_table }} 15 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/bin/bigquery/bin.sql: 
-------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- if type == 'ntile' %} 3 | ntile({{bin_count}}) OVER (ORDER BY {{column}}) AS {{column}}_{{bin_count}}_NTB 4 | {%- elif type == 'equalwidth' %} 5 | RANGE_BUCKET( 6 | {{ column }}, 7 | GENERATE_ARRAY( 8 | (SELECT MIN({{ column }}) FROM {{ source_table }}) 9 | ,(SELECT MAX({{ column }}) FROM {{ source_table }}) 10 | ,(SELECT (MAX({{ column }}) - MIN({{ column }}))/20 FROM {{ source_table }}) 11 | ) 12 | ) AS {{column}}_{{bin_count}}_EWB 13 | {%- else %} 14 | {{ raise_exception('You must select either "ntile" or "equalwidth" as your binning type') }} 15 | {%- endif %} 16 | FROM {{ source_table }} 17 | -------------------------------------------------------------------------------- /docs/accelerators/baby_name_analysis.md: -------------------------------------------------------------------------------- 1 | # Baby Name Analysis Accelerator 2 | 3 | Build an introductory dataset using Rasgo accelerators! 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ----------------------- | ---- | -------------------------------------- | ----------- | 9 | | annual_baby_names_table | | The Rasgo Community baby names dataset | | 10 | | baby_name | | Input your name here! 
| | 11 | 12 | 13 | ## Source Code 14 | 15 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/baby_name_analysis.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/latest/latest.sql: -------------------------------------------------------------------------------- 1 | {%- set source_col_names = get_columns(source_table) -%} 2 | 3 | SELECT 4 | {%- for group_item in group_by %} 5 | {{ group_item }}, 6 | {%- endfor -%} 7 | 8 | {%- for order_item in order_by %} 9 | {{ order_item }}, 10 | {%- endfor -%} 11 | 12 | {%- for source_col in source_col_names %} 13 | {%- if source_col not in group_by and source_col not in order_by -%} 14 | LAST_VALUE({{ source_col }} {{ nulls }} NULLS) OVER (PARTITION BY {{ group_by | join(', ') }} ORDER BY {{ order_by | join(', ') }} ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS LATEST_{{ source_col }}{{ ', ' if not loop.last else ' ' }} 15 | {%- endif -%} 16 | {%- endfor -%} 17 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/cast/cast.sql: -------------------------------------------------------------------------------- 1 | {%- if overwrite_columns == true -%} 2 | 3 | {%- set source_columns = get_columns(source_table) -%} 4 | {%- set untouched_cols = source_columns | reject('in', casts) -%} 5 | 6 | SELECT {% for col in untouched_cols %}{{ col }},{% endfor %} 7 | {%- for target_col, type in casts.items() %} 8 | CAST({{target_col}} AS {{type}}) AS {{target_col}}{{", " if not loop.last else ""}} 9 | {%- endfor %} 10 | FROM {{ source_table }} 11 | 12 | {%- else -%} 13 | 14 | SELECT * 15 | {%- for target_col, type in casts.items() %} 16 | , CAST({{target_col}} AS {{type}}) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}} 17 | {%- endfor %} 18 | FROM {{ source_table }} 19 | 20 | {%- 
endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/entropy/entropy.yaml: -------------------------------------------------------------------------------- 1 | name: entropy 2 | tags: 3 | - aggregate 4 | - reshape 5 | description: | 6 | Entropy is a way to calculate the amount of "disorder" in a non-numeric column. Lower entropy indicates less disorder, while higher entropy indicates more. 7 | 8 | The calculation for Shannon's entropy is: H = -Sum[ P(xi) * log2( P(xi)) ] 9 | arguments: 10 | group_by: 11 | type: column_list 12 | description: Columns to group by 13 | columns: 14 | type: column_list 15 | description: Columns to calculate entropy on. Must be non-numeric. 16 | example_code: | 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.entropy(group_by=['FIPS'], columns=['NAME', 'ADDRESS']) 20 | ds2.preview() -------------------------------------------------------------------------------- /docs/uppercase_columns.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # rename 4 | 5 | Rename columns by converting all names to uppercase and removing non-SQL safe characters. 
6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ---- | ---- | ------------------------------------------ | ----------- | 12 | | none | none | this transform does not take any arguments | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.uppercase_columns() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/uppercase_columns/snowflake/uppercase_columns.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.yaml: -------------------------------------------------------------------------------- 1 | name: one_hot_encode 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: One hot encode a column. Create a null value flag for the column if any of the values are NULL. 6 | arguments: 7 | column: 8 | type: column_or_expression 9 | description: Column name to one-hot encode. Supports a calculated field via a valid SQL function. 
class PyRasgoEnvironment(Enum):
    """
    Different Environment for connecting PyRasgo to
    """

    # Member values are the API host names (no scheme) for each
    # deployment target a PyRasgo client can point at.
    PRODUCTION = "api.rasgoml.com"
    STAGING = "staging-rasgo-proxy.herokuapp.com"
    LOCAL = "localhost"  # local development server
multiple DWs in the API 22 | # we'll need to refactor the functions that consume this 23 | RASGO_DATAWAREHOUSE = 'snowflake' 24 | 25 | COMMUNITY_ORGANIZATION_ID: int = 1 26 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unions/unions.sql: -------------------------------------------------------------------------------- 1 | {# Get all Columns in Source Table #} 2 | {%- set source_col_names = get_columns(source_table) -%} 3 | {% set ns = namespace(union_columns=source_col_names.keys()) %} 4 | 5 | {%- for utable in union_tables -%} 6 | {%- set utable_cols = get_columns(utable) -%} 7 | {%- set ns.union_columns = ns.union_columns|list|select("in", utable_cols.keys()|list) -%} 8 | {%- endfor -%} 9 | 10 | {%- set columns_to_select = ns.union_columns|join(', ') -%} 11 | 12 | {# Generate Union Query #} 13 | SELECT {{ columns_to_select }} 14 | FROM {{ source_table }} 15 | {%- for u_table in union_tables %} 16 | UNION {{ 'ALL' if not remove_duplicates else '' }} 17 | SELECT {{ columns_to_select }} 18 | FROM {{ u_table }} 19 | {%- endfor -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/macros/period_to_date.sql: -------------------------------------------------------------------------------- 1 | {% macro period_to_date(metric_name, dimensions, calc_config) %} 2 | {% set alias = metric_name + '_' + calc_config.alias if calc_config.alias is defined else metric_name + '_' + calc_config.aggregate + '_' + calc_config.period %} 3 | {{ calc_config.aggregate }}({{ metric_name }}) 4 | over ( 5 | partition by 6 | {% if dw_type() == 'bigquery' %} 7 | date_trunc(period_min, {{ calc_config.period }}) 8 | {% else %} 9 | date_trunc('{{ calc_config.period }}', period_min) 10 | {% endif %} 11 | {% if dimensions %} 12 | , {{ dimensions | join(", ") }} 13 | {% endif %} 14 | order by period_min 15 | rows between unbounded preceding and current row 16 
| ) as {{ alias }} 17 | {% endmacro %} 18 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.yaml: -------------------------------------------------------------------------------- 1 | name: aggregate 2 | tags: 3 | - table 4 | - reshape 5 | - aggregate 6 | description: Groups rows by the group_by items applying aggregations functions for the resulting group and selected columns 7 | arguments: 8 | group_by: 9 | type: column_list 10 | description: Columns to group by 11 | aggregations: 12 | type: agg_dict 13 | description: Aggregations to apply for other columns. Dict keys are column names, and values are a list of aggegations to apply for that column. 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.aggregate(group_by=['FIPS'], aggregations={ 18 | 'COL_1': ['SUM', 'AVG'], 19 | 'COL_2': ['SUM', 'AVG'] 20 | }) 21 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample/sample.yaml: -------------------------------------------------------------------------------- 1 | name: sample 2 | tags: 3 | - row 4 | - math 5 | - conditional 6 | description: Take a sample of a dataset using a specific number of rows or a probability that each row will be selected 7 | arguments: 8 | num_rows: 9 | type: value 10 | description: To sample using a probability of selecting each row, your num_rows should be a decimal less than 1. Otherwise, pass an integer value for number of rows to keep. 11 | filters: 12 | type: filter_list 13 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
14 | is_optional: true 15 | example_code: | 16 | ds = rasgo.get.dataset(id) 17 | 18 | ds2 = ds.sample(num_rows=1000) 19 | ds2.preview() -------------------------------------------------------------------------------- /docs/accelerators/web_traffic_channels.md: -------------------------------------------------------------------------------- 1 | # Google Analytics Web Traffic Channels 2 | 3 | The Web Traffic Channels analysis uses Google Analytics Web Traffic data, including bounce rate, conversion rate, new users, and session duration to create visualizations comparing page performance by channel. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ---------------------------------- | ------- | ---------------------------------- | ----------- | 9 | | google_analytics_web_traffic_table | dataset | Google Analytics Web Traffic Table | | 10 | 11 | 12 | ## Source Code 13 | 14 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/web_traffic_channels.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/datepart.yaml: -------------------------------------------------------------------------------- 1 | name: datepart 2 | tags: 3 | - column 4 | - date_time 5 | description: | 6 | Extracts a specific part of a date column. For example, if the input is '2021-01-01', you can ask for the year and get back 2021. 7 | 8 | An exhaustive list of valid date parts can be [found here](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts). 
9 | arguments: 10 | dates: 11 | type: datepart_dict 12 | description: dict where keys are names of columns you want to date part and values are the desired date part grain 13 | example_code: | 14 | ds = rasgo.get.dataset(id) 15 | 16 | ds2 = ds.datepart(dates={ 17 | 'DATE_STRING':'year', 18 | 'DATE2_STR':'month' 19 | }) 20 | ds2.preview() 21 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.yaml: -------------------------------------------------------------------------------- 1 | name: target_encode 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: | 6 | Encode a categorical column with the average value of a target column for the corresponding value of the categorical column. 7 | 8 | See scikit-learn's [TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html) for full documentation. 9 | 10 | arguments: 11 | column: 12 | type: column 13 | description: Column name to target encode 14 | target: 15 | type: column 16 | description: Numeric target column to use to create averages 17 | example_code: | 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.target_encode(column='WEATHER_DESCRIPTION', target='DAILY_HIGH_TEMP') 21 | ds2.preview() 22 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql: -------------------------------------------------------------------------------- 1 | {%- if distinct -%} 2 | {%- set agg_thing = 'DISTINCT '~agg_column -%} 3 | {%- else -%} 4 | {%- set agg_thing = agg_column -%} 5 | {%- endif -%} 6 | {%- set rule_combos = [] -%} 7 | {%- for r in rules -%} 8 | {%- if loop.first -%} 9 | {%- set rule_combos = rule_combos.append(r) -%} 10 | {%- else -%} 11 | {%- set new_rule = rule_combos[loop.index-2] ~ ' AND ' ~ r -%} 12 | {%- set rule_combos = rule_combos.append(new_rule) -%} 13 | {%- endif -%} 
14 | {%- endfor -%} 15 | {%- for rule in rule_combos -%} 16 | SELECT '{{ rule|replace("'","") }}' AS rule_desc, {{ agg }}({{ agg_thing }}) as QTY 17 | FROM {{ source_table }} 18 | WHERE {{ rule }} 19 | {% if not loop.last %} 20 | UNION ALL 21 | {% endif %} 22 | {%- endfor -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for col, aggs in aggregations.items() -%} 3 | {%- for agg in aggs -%} 4 | {%- for offset in offsets %} 5 | {% set normalized_offset = -offset %} 6 | , {{ agg }}({{ col }}) OVER( 7 | {%- if group_by %} 8 | PARTITION BY {{ group_by | join(", ") }} 9 | {% endif -%} 10 | ORDER BY {{ order_by | join(", ") }} 11 | {% if normalized_offset > 0 -%} 12 | ROWS BETWEEN CURRENT ROW AND {{ normalized_offset }} FOLLOWING 13 | {% else -%} 14 | ROWS BETWEEN {{ normalized_offset|abs }} PRECEDING AND CURRENT ROW 15 | {% endif -%} 16 | ) as {{ cleanse_name(agg + '_' + col + '_' + offset|string) }} 17 | {%- endfor -%} 18 | {%- endfor -%} 19 | {%- endfor %} 20 | FROM {{ source_table }} -------------------------------------------------------------------------------- /docs/correlation.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # correlation 4 | 5 | Run pairwise correlation on all numeric columns in the source_table 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------------- | ----- | ---------------------------------------------------------------------- | ----------- | 11 | | rows_to_sample | value | number of rows to sample from the table before calculating correlation | True | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.correlation(rows_to_sample=1000) 20 | ds2.preview() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed 
url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /docs/profile_column.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # profile_column 4 | 5 | ## Analyze the distinct values in a column 6 | 7 | ### Required Inputs 8 | - Column: the column you want to profile 9 | 10 | ### Notes 11 | - Only supports profiling one column at a time 12 | 13 | 14 | ## Parameters 15 | 16 | | Name | Type | Description | Is Optional | 17 | | ----------- | ------ | ------------------------------ | ----------- | 18 | | column_name | column | The column you want to profile | | 19 | 20 | 21 | ## Example 22 | 23 | ```python 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds.profile_column(column_name = 'IMPORTANTCOLUMN') 27 | ``` 28 | 29 | ## Source Code 30 | 31 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/profile_column/profile_column.sql" %} 32 | 33 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datarobot_score/datarobot_score.sql: -------------------------------------------------------------------------------- 1 | SELECT {{ include_cols|join(',') }}, 2 | 3 | {%- if num_explains is defined and threshold_low is defined and threshold_high is defined -%} 4 | S:score AS PREDICTION 5 | {%- set function_call = '(OBJECT_CONSTRUCT_KEEP_NULL(*),' ~ num_explains ~ ',' ~ threshold_low ~ ',' ~ threshold_high ~ ')' %} 6 | {% for i in range(num_explains) -%} 7 | ,CONCAT(S:explanations[{{ i }}].featureName, '=', S:explanations[{{ i }}].featureValue, ' (', S:explanations[{{ i }}].strength, ')') AS TOP{{ i+1 }}_INFLUENCING_FACTOR 8 | {% endfor -%} 9 | {%- else -%} 10 | S AS PREDICTION 11 | {% set function_call = 
'(OBJECT_CONSTRUCT_KEEP_NULL(*))' %} 12 | {%- endif %} 13 | FROM ( 14 | SELECT *, 15 | {{ function_name }}{{ function_call }} AS S 16 | FROM {{ source_table }} ) -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/new_columns/new_columns.yaml: -------------------------------------------------------------------------------- 1 | name: new_columns 2 | tags: 3 | - column 4 | - math 5 | description: | 6 | ## Build new columns, using SQL formulas. 7 | 8 | ### Required Inputs 9 | - Calculated Column: the formula for the new column you want to build 10 | 11 | ### Optional Inputs 12 | - Alias: name for your columns 13 | 14 | ### Notes 15 | - Supports any SQL column functions that are compatible with your data warehouse 16 | 17 | arguments: 18 | calculated_columns: 19 | type: calculated_column_list 20 | description: List of SQL formulas to generate new columns 21 | example_code: | 22 | ds2 = ds.new_columns( 23 | calculated_columns={ 24 | calculated_column: 'POWER(COLUMN_NAME, 3)', 25 | alias: 'COLUMN_NAME_Cubed' 26 | } 27 | ) 28 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/apply/apply.yaml: -------------------------------------------------------------------------------- 1 | name: apply 2 | tags: 3 | - table 4 | - custom 5 | - reshape 6 | - aggregate 7 | description: A transform that accepts a custom template to execute.
Must use the sql template argument `source_table` to reference the Rasgo dataset which will serve as the base of any SELECT 8 | arguments: 9 | sql: 10 | type: custom 11 | description: The custom SQL transform template to apply 12 | example_code: | 13 | ds = rasgo.get.dataset(id) 14 | 15 | ds2 = ds.apply( 16 | sql='SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = I17' 17 | ) 18 | ds2.preview() 19 | 20 | # passing in custom arguments 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.apply( 24 | sql="SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = '{{ my_value }}'", 25 | my_value="I17" 26 | ) 27 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/cast/cast.yaml: -------------------------------------------------------------------------------- 1 | name: cast 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | description: | 7 | Cast selected columns to a new type 8 | arguments: 9 | casts: 10 | type: cast_value_dict 11 | description: A dict where the keys are columns and the values are the new type to cast them to. 12 | overwrite_columns: 13 | type: boolean 14 | is_optional: true 15 | description: to overwrite column names with the new casted column, use 'true'. otherwise, use 'false'. defaults to 'false'. 16 | example_code: | 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds_casted = ds.cast( 20 | casts={ 21 | 'DS_WEATHER_ICON':'INT', 22 | 'DS_DAILY_HIGH_TEMP':'STRING', 23 | 'DS_DAILY_LOW_TEMP':'INT' 24 | }, 25 | overwrite_columns=True 26 | ) 27 | 28 | ds_casted.preview() 29 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/filter/filter.yaml: -------------------------------------------------------------------------------- 1 | name: filter 2 | tags: 3 | - row 4 | - data_cleaning 5 | - conditional 6 | description: | 7 | Filter the dataset. Supports two types of filters: 8 | 1. 
Comparison filters, which compare the values in a column with a value 9 | 2. Advanced filters, which support full SQL strings for custom filtering logic 10 | arguments: 11 | items: 12 | type: filter_list 13 | description: list of dictionaries representing filters 14 | example_code: | 15 | ds = rasgo.get.dataset(74) 16 | 17 | # comma separated list of 'WHERE' clauses 18 | ds2 = ds.filter(items=['PRODUCTKEY < 500']) 19 | ds2.preview() 20 | 21 | # full filtering with a column, operator, and comparison value 22 | ds3 = ds.filter(items=[{'column_name':'PRODUCTKEY', 'operator':'>', 'comparison_value':'101'}]) 23 | ds3.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/latest/latest.yaml: -------------------------------------------------------------------------------- 1 | name: latest 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | - time_series 7 | description: Impute missing values in ALL columns with the latest value seen in rows prior 8 | arguments: 9 | group_by: 10 | type: column_list 11 | description: List of columns to perform the imputation "within" 12 | order_by: 13 | type: column_list 14 | description: List of columns to sort ascending, in order to find the last known value for imputation 15 | nulls: 16 | type: string 17 | description: Pass either 'ignore' or 'respect' to determine whether nulls should be ignored or not during imputation. 18 | 19 | example_code: | 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.latest( 23 | group_by=['FIPS'], 24 | order_by=['DATE'], 25 | nulls='ignore') 26 | 27 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/text_to_sql/text_to_sql.yaml: -------------------------------------------------------------------------------- 1 | name: text_to_sql 2 | tags: 3 | - custom 4 | description: | 5 | ## Text to SQL, powered by OpenAI. 
6 | ### Required Inputs 7 | - Text: a prompt describing the SQL query that you want OpenAI to generate for you. Add as much context as possible to help OpenAI generate a useful query. Avoid using relative date terms like "last year" because OpenAI doesn't have any knowledge past 2021. 8 | arguments: 9 | text: 10 | type: string-long 11 | description: | 12 | Text description of the query you want to generate. 13 | Example: total revenue for the Southwest region in 2021 14 | example_code: | 15 | ds = rasgo.get.dataset(fqtn='DB.SCHEMA.IOWA_LIQUOR_SALES') 16 | 17 | ds2 = ds.text_to_sql( 18 | text='total bottles sold in Des Moines last year' 19 | ) 20 | ds2.sql() 21 | ds2.preview() 22 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/histogram/histogram.yaml: -------------------------------------------------------------------------------- 1 | name: histogram 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: series_continuous 6 | tags: 7 | description: Analyze the value distribution of a single continuous variable by binning it and calculating frequencies in each bin 8 | arguments: 9 | column: 10 | type: column 11 | description: numeric column to use to generate the histogram 12 | filters: 13 | type: filter_list 14 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
15 | is_optional: true 16 | num_buckets: 17 | type: value 18 | is_optional: true 19 | description: max number of buckets to create; defaults to 200 20 | example_code: | 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.histogram(column='SALESAMOUNT') 24 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/funnel/funnel.yaml: -------------------------------------------------------------------------------- 1 | name: funnel 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: funnel 6 | tags: 7 | - insight 8 | - visualization 9 | description: Creates a funnel visualization-ready dataset from numeric columns (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) representing a hierarchy with summed incidence rates 10 | arguments: 11 | stage_columns: 12 | type: column_list 13 | description: List of columns to include in the funnel dataset, in order of hierarchy from highest stage to lowest stage (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.funnel(stage_columns=["TOTAL_IMPRESSIONS", "TOTAL_EMAILS_SENT", "TOTAL_WEBTRAFFIC_USERS", "TOTAL_LEADS_CREATED", "TOTAL_DEALS_CLOSED"]) 18 | ds2.preview() 19 | -------------------------------------------------------------------------------- /docs/label_encode.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # label_encode 4 | 5 | Encode target labels with value between 0 and n_classes-1. See scikit-learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder) for full documentation. 
6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ------ | ------ | --------------------------- | ----------- | 12 | | column | column | Column name to label encode | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.label_encode(column='WEATHER_DESCRIPTION') 21 | ds2.preview() 22 | 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/label_encode/snowflake/label_encode.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /docs/order.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # order 4 | 5 | Order a dataset by specified columns, in a specified order 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------- | ----------------- | -------------------------------------------------------------------------------------- | ----------- | 11 | | order_by | column_value_dict | dict where the keys are column names and the values are the order_method (ASC or DESC) | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.order(order_by={'DS_WEATHER_ICON':'ASC', 'DS_DAILY_HIGH_TEMP':'DESC'}) 20 | ds2.preview() 21 | ``` 22 | 23 | ## Source Code 24 | 25 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/order/order.sql" %} 26 | 27 | -------------------------------------------------------------------------------- /docs/describe.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # describe 4 | 5 | Describes the dataset using a consistent set of metrics, based on data type. 
6 | Numeric: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MEAN, STD_DEV, MIN, _25_PERCENTILE, _50_PERCENTILE, _75_PERCENTILE, MAX 7 | Other: DTYPE, COUNT, NULL_COUNT, UNIQUE_COUNT, MOST_FREQUENT, MIN, MAX 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ---- | ---- | ------------------------------------------ | ----------- | 14 | | none | none | this transform does not take any arguments | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds.describe().to_df() 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/describe/snowflake/describe.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /docs/sample_class.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sample_class 4 | 5 | Sample n rows for each value of a column 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ---------- | ------ | -------------------------------------------------------- | ----------- | 11 | | sample_col | column | The column for which you want to sample | | 12 | | sample | dict | Value of column as a key, n rows to be sampled as values | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.sample_class(sample_col='BINARY_TARGET_COLUMNNAME', sample={'1':15000, '0':60000}) 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/sample_class/sample_class.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/correlation/correlation.sql: -------------------------------------------------------------------------------- 
1 | {%- set names_types_list = get_columns(source_table) -%} 2 | 3 | {%- set column_list = [] -%} 4 | 5 | {%- for key, value in names_types_list.items() -%} 6 | {% if (value|upper == 'NUMBER' or 'FLOAT' in value|upper or 'INT' in value|upper) %} 7 | {%- do column_list.append(key) -%} 8 | {%- endif -%} 9 | {%- endfor -%} 10 | 11 | WITH source_sampled as ( 12 | SELECT * from {{ source_table }} 13 | {% if rows_to_sample is defined %} SAMPLE ({{ rows_to_sample }} ROWS) {% endif -%} 14 | ) 15 | 16 | SELECT * FROM ( 17 | {%- for combo in itertools.product(column_list, repeat=2) -%} 18 | SELECT '{{ combo[0] }}' as COLUMN_A, 19 | '{{ combo[1] }}' as COLUMN_B, 20 | CORR({{ combo[0] }}, {{ combo[1] }}) as Correlation 21 | FROM source_sampled 22 | {% if not loop.last %} UNION {% endif -%} 23 | {%- endfor -%} 24 | ) 25 | ORDER BY COLUMN_A, COLUMN_B -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/timeseries_agg/timeseries_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for offset in offsets -%} 3 | {% set normalized_offset = -offset %} 4 | {% for col, aggs in aggregations.items() -%} 5 | {% for agg in aggs %} 6 | ,( 7 | SELECT {{ agg }}({{ col }}) 8 | FROM {{ source_table }} i 9 | WHERE 10 | {% if normalized_offset > 0 -%} 11 | i.{{ date }} BETWEEN o.{{ date }} AND (o.{{ date }} + INTERVAL {{ normalized_offset }} {{ date_part }}) 12 | {% else -%} 13 | i.{{ date }} BETWEEN (o.{{ date }} - INTERVAL {{ normalized_offset|abs }} {{ date_part }}) AND o.{{ date }} 14 | {%- endif -%} 15 | {%- for g in group_by %} 16 | AND o.{{ g }} = i.{{ g }} 17 | {% endfor -%} 18 | ) AS {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} 19 | {%- endfor -%} 20 | {%- endfor %} 21 | {% endfor %} 22 | FROM {{ source_table }} o --------------------------------------------------------------------------------
/rasgotransforms/rasgotransforms/transforms/dropna/dropna.yaml: -------------------------------------------------------------------------------- 1 | name: dropna 2 | tags: 3 | - row 4 | - data_cleaning 5 | - conditional 6 | description: Remove missing values 7 | arguments: 8 | how: 9 | type: value 10 | description: Method to determine if record is removed, 'any' removes each record with at least one missing value, 'all' removes records only when all values are missing (default = 'any'). 11 | is_optional: true 12 | subset: 13 | type: column_list 14 | description: List of columns to check for missing values. All columns are checked if not defined. 15 | is_optional: true 16 | thresh: 17 | type: int 18 | description: (Optional) Acts like all, but only requires this number of values to be null to remove a record instead of all. 19 | is_optional: true 20 | example_code: | 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.dropna(how='all', subset=['ORDERS', 'SALES']) 24 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/extract_sequences/snowflake/extract_sequences.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_{{ column }} AS ( 2 | select * from {{ source_table }} 3 | match_recognize( 4 | partition by {{ group_by | join(', ') }} 5 | order by {{ order_by }} 6 | measures 7 | match_number() as SEQUENCE_NUMBER, 8 | first({{ order_by }}) as SEQUENCE_START_DATE, 9 | last({{ order_by }}) as SEQUENCE_END_DATE, 10 | count(*) as SEQUENCE_LEN, 11 | count(row_decrease.*) as SEQUENCE_DECREASE_CNT, 12 | count(row_increase.*) as SEQUENCE_INCREASE_CNT 13 | one row per match 14 | after match skip to last row_increase 15 | pattern(FOO row_decrease+ row_increase+) 16 | define 17 | row_decrease AS {{ column }} < lag({{ column }}), 18 | row_increase AS {{ column }} > lag({{ column }}) 19 | ) 20 | ) 21 | SELECT * FROM CTE_{{ column }} ORDER BY {{ 
group_by | join(', ') }}, SEQUENCE_NUMBER -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.yaml: -------------------------------------------------------------------------------- 1 | name: moving_avg 2 | tags: 3 | - column 4 | - date_time 5 | - feature_engineering 6 | description: generates moving averages per column and per window size 7 | arguments: 8 | input_columns: 9 | type: column_list 10 | description: names of column(s) you want to moving average 11 | window_sizes: 12 | type: int_list 13 | description: the integer values for window sizes you want to use in your moving average 14 | order_by: 15 | type: column_list 16 | description: columns to order by, typically the date index of the table 17 | partition: 18 | type: column_list 19 | description: columns to partition the moving average by 20 | example_code: | 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.moving_avg(input_columns=['OPEN','CLOSE','HIGH','LOW'], window_sizes=[1,2,3,7], order_by=['DATE', 'TICKER'], partition=['TICKER']) 24 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lag/bigquery/lag.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | {%- for amount in amounts -%} 8 | {%- if amount < 0 -%} 9 | {{ raise_exception('BigQuery cannot use negative values for a lag function.
Please utilize lead for forward looking windows.') }} 10 | {%- endif -%} 11 | {%- endfor -%} 12 | SELECT *, 13 | {%- for col in columns -%} 14 | {%- for amount in amounts %} 15 | lag({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as Lag_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 16 | {%- endfor -%} 17 | {{ ", " if not loop.last else "" }} 18 | {%- endfor %} 19 | from {{ source_table }} 20 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/bigquery/lead.sql: -------------------------------------------------------------------------------- 1 | {%- if partition is not defined or partition|length == 0 -%} 2 | {%- set partition = ["NULL"]-%} 3 | {%- endif -%} 4 | {%- if order_by is not defined or order_by|length == 0 -%} 5 | {%- set order_by = ["NULL"]-%} 6 | {%- endif -%} 7 | {%- for amount in amounts -%} 8 | {%- if amount < 0 -%} 9 | {{ raise_exception('BigQuery cannot use negative values for a lead function. 
Please utilize lag for backwards looking windows.') }} 10 | {%- endif -%} 11 | {%- endfor -%} 12 | SELECT *, 13 | {%- for col in columns -%} 14 | {%- for amount in amounts %} 15 | lead({{col}}, {{amount}}) over (partition by {{partition | join(", ")}} order by {{order_by | join(", ")}}) as lead_{{ cleanse_name(col ~ '_' ~ amount) }}{{ "," if not loop.last else "" }} 16 | {%- endfor -%} 17 | {{ ", " if not loop.last else "" }} 18 | {%- endfor %} 19 | from {{ source_table }} 20 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/timeseries_agg/bigquery/timeseries_agg.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | {% for offset in offsets -%} 3 | {% set normalized_offset = -offset %} 4 | {% for col, aggs in aggregations.items() -%} 5 | {% for agg in aggs %} 6 | ,( 7 | SELECT {{ agg }}({{ col }}) 8 | FROM {{ source_table }} i 9 | WHERE 10 | {% if normalized_offset > 0 -%} 11 | i.{{ date }} BETWEEN o.{{ date }} AND DATE_ADD(o.{{ date }}, INTERVAL {{ normalized_offset }} {{ date_part }}) 12 | {% else -%} 13 | i.{{ date }} BETWEEN DATE_SUB(o.{{ date }}, INTERVAL {{ normalized_offset|abs }} {{ date_part }}) AND o.{{ date }} 14 | {%- endif -%} 15 | {%- for g in group_by %} 16 | AND o.{{ g }} = i.{{ g }} 17 | {% endfor -%} 18 | ) AS {{ cleanse_name(agg + '_' + col + '_' + offset|string + date_part) }} 19 | {%- endfor -%} 20 | {%- endfor %} 21 | {% endfor %} 22 | FROM {{ source_table }} o -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql: -------------------------------------------------------------------------------- 1 | {% if include_cols and exclude_cols is defined %} 2 | {{ raise_exception('You cannot pass both an include_cols list and an exclude_cols list') }} 3 | {% else %} 4 | 5 | {%- if exclude_cols is defined -%} 6 | {%- set 
source_col_names = get_columns(source_table) -%} 7 | 8 | {# Upper exclude cols to ensure case insensitive name matching #} 9 | {%- set exclude_cols = (exclude_cols|join(',')|upper).split(',') -%} 10 | {% set include_cols = [] -%} 11 | {% for column_name in source_col_names -%} 12 | {% if column_name.upper() not in exclude_cols -%} 13 | {% do include_cols.append(column_name) -%} 14 | {% endif -%} 15 | {% endfor -%} 16 | {%- endif -%} 17 | 18 | {%- if include_cols is defined -%} 19 | SELECT 20 | {%- for col in include_cols %} 21 | {{col}}{{ ", " if not loop.last else " " }} 22 | {%- endfor %} 23 | FROM {{source_table}} 24 | {%- endif -%} 25 | 26 | {%- endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/union/union.yaml: -------------------------------------------------------------------------------- 1 | name: union 2 | tags: 3 | - table 4 | - reshape 5 | - join 6 | description: Performs a SQL UNION or UNION ALL for the parent dataset, and another dataset. Operation will only merge columns with matching columns names in both datasets and drop all other columns. Column data type validation does not happen. 7 | arguments: 8 | dataset2: 9 | type: table 10 | description: Dataset to Union/Union All with main dataset 11 | keep_dupes: 12 | type: boolean 13 | description: | 14 | Set to True to perform a UNION ALL between the two tables, which keeps rows that are duplicated. 15 | Set to False to eliminate duplicate rows.
16 | is_optional: true 17 | example_code: | 18 | d1 = rasgo.get.dataset(dataset_id) 19 | d2 = rasgo.get.dataset(dataset_id_2) 20 | 21 | ds2 = d1.transform.union( 22 | dataset2=d2, 23 | keep_dupes=True 24 | ) 25 | 26 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_SEQUENCES AS ( 2 | SELECT 3 | T.*, 4 | ROW_NUMBER() OVER (PARTITION BY {%- for group_item in group_by %} {{ group_item }},{%- endfor -%} {{ value_col }} ORDER BY {{ order_col }}) AS RN_R97_B42_O, 5 | ROW_NUMBER() OVER (ORDER BY {%- for group_item in group_by %} {{ group_item }},{%- endfor -%} {{ order_col }}) AS RN_R97_B42_E 6 | FROM 7 | {{ source_table }} T 8 | ) 9 | SELECT 10 | {%- for group_item in group_by %} S.{{ group_item }},{%- endfor -%} 11 | S.{{ value_col }} as REPEATED_VALUE, 12 | MIN(S.{{ order_col }}) AS FLATLINE_START_DATE, 13 | MAX(S.{{ order_col }}) AS FLATLINE_END_DATE, 14 | COUNT(*) AS OCCURRENCE_COUNT 15 | FROM 16 | CTE_SEQUENCES S 17 | GROUP BY 18 | {%- for group_item in group_by %} S.{{ group_item }},{%- endfor -%} 19 | S.{{ value_col }}, 20 | S.RN_R97_B42_E - S.RN_R97_B42_O 21 | HAVING COUNT(*) > {{ min_repeat_count }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/to_date/to_date.yaml: -------------------------------------------------------------------------------- 1 | name: to_date 2 | tags: 3 | - column 4 | - data_cleaning 5 | - date_time 6 | description: | 7 | Creates a column of a date/timestamp type from a string or other non-date column. 8 | 9 | See [this Snowflake doc](https://docs.snowflake.com/en/user-guide/date-time-input-output.html#about-the-format-specifiers-in-this-section) for information about valid formats. 
10 | arguments: 11 | dates: 12 | type: column_value_dict 13 | description: dict where the values are the date columns and the keys are the date formats to use for the conversion 14 | overwrite_columns: 15 | type: boolean 16 | description: "Optional: if true, the output columns will overwrite the input columns" 17 | is_optional: true 18 | example_code: | 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.to_date(dates={ 22 | 'DATE_STRING':'YYYY-MM-DD', 23 | 'DATE2_STR':'YYYY-DD-MM' 24 | }) 25 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/replace_string/replace_string.sql: -------------------------------------------------------------------------------- 1 | {% if position is not defined %} 2 | {% set position = 1 %} 3 | {% else %} 4 | {% set use_regex = True %} 5 | {% endif %} 6 | 7 | {% if occurrence is not defined %} 8 | {% set occurrence = 0 %} 9 | {% else %} 10 | {% set use_regex = True %} 11 | {% endif %} 12 | 13 | {% if parameters is not defined %} 14 | {% set parameters = 'c' %} 15 | {% else %} 16 | {% set use_regex = True %} 17 | {% endif %} 18 | 19 | {% if use_regex %} 20 | SELECT *, 21 | REGEXP_REPLACE({{ source_col }}, '{{ pattern }}', '{{ replacement }}', {{ position }}, {{ occurrence }}, '{{ parameters }}') AS {{cleanse_name(alias) if alias is defined else "REPLACE_" + source_col}} 22 | FROM {{ source_table }} 23 | {% else %} 24 | SELECT *, 25 | REPLACE({{ source_col }}, '{{ pattern }}', '{{ replacement }}') AS {{cleanse_name(alias) if alias is defined else "REPLACE_" + source_col}} 26 | FROM {{ source_table }} 27 | {% endif %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datepart/bigquery/datepart.sql: -------------------------------------------------------------------------------- 1 | SELECT *, 2 | {%- for target_col, date_part in dates.items() %} 3 | {%- if date_part|lower == 
'weekiso' %} 4 | EXTRACT(ISOWEEK FROM {{ target_col }}) AS {{ target_col }}_ISOWEEK {{ ", " if not loop.last else "" }} 5 | {%- elif date_part|lower == 'dayofweekiso' %} 6 | MOD(EXTRACT(DAYOFWEEK FROM {{ target_col }}) + 5, 7) + 1 AS {{ target_col }}_ISODAYOFWEEK {{ ", " if not loop.last else "" }} 7 | {%- elif date_part|lower == 'yearofweekiso' %} 8 | EXTRACT(ISOYEAR FROM {{ target_col }}) AS {{ target_col }}_ISOYEAR {{ ", " if not loop.last else "" }} 9 | {%- elif date_part|lower == 'yearofweek' %} 10 | EXTRACT(YEAR FROM {{ target_col }}) AS {{ target_col }}_YEAR {{ ", " if not loop.last else "" }} 11 | {%- else %} 12 | EXTRACT({{ date_part }} FROM {{ target_col }}) AS {{ target_col }}_{{ date_part }} {{ ", " if not loop.last else "" }} 13 | {%- endif %} 14 | {%- endfor %} 15 | FROM {{ source_table }} 16 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rank/rank.sql: -------------------------------------------------------------------------------- 1 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', rank_columns)|join(',') if overwrite_columns else "*" -%} 2 | 3 | {%- set alias = alias if alias is defined else cleanse_name('RANK_' + '_'.join(rank_columns)) -%} 4 | 5 | SELECT {{ untouched_cols }}, 6 | {%- if rank_type == 'dense' %} 7 | DENSE_RANK() OVER( 8 | {% elif rank_type == 'percent' %} 9 | PERCENT_RANK() OVER( 10 | {% elif rank_type == 'unique' %} 11 | ROW_NUMBER() OVER( 12 | {%- else -%} 13 | RANK() OVER( 14 | {% endif %} 15 | {% if partition_by -%} 16 | PARTITION BY {% for col in partition_by -%}{{col}}{{ ", " if not loop.last else " " }}{%- endfor %} 17 | {% endif -%} 18 | ORDER BY {% for col in rank_columns -%}{{col}}{% if order %} {{ order }}{% endif %}{{ ", " if not loop.last else " " }}{%- endfor %} 19 | ) AS {{ alias }} 20 | FROM {{ source_table }} 21 | {% if qualify_filter %}QUALIFY {{ alias }} {{ qualify_filter }}{% endif %} 22 | 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unions/unions.yaml: -------------------------------------------------------------------------------- 1 | name: union 2 | tags: 3 | - table 4 | - reshape 5 | - join 6 | description: | 7 | Union one or multiple tables with the base table. 8 | Looks at all columns in each table and finds columns in common across all of them to keep in the final table. 9 | arguments: 10 | union_tables: 11 | type: table_list 12 | description: tables to union with the base table 13 | remove_duplicates: 14 | type: boolean 15 | description: | 16 | Defaults to False. 17 | Set to True to use UNION, which removes duplicate rows. 18 | Set to False to use UNION ALL, which keeps rows that are duplicated. 19 | is_optional: true 20 | example_code: | 21 | d1 = rasgo.get.dataset(dataset_id) 22 | d2 = rasgo.get.dataset(dataset_id_2) 23 | d3 = rasgo.get.dataset(dataset_id_3) 24 | 25 | union_ds = d1.unions( 26 | union_tables=[d2.fqtn, d3.fqtn], 27 | remove_duplicates=True 28 | ) 29 | 30 | union_ds.preview() -------------------------------------------------------------------------------- /docs/rename.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # rename 4 | 5 | Rename columns by passing a renames dict. 6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ------- | ----------------- | -------------------------------------------------------------------------------------- | ----------- | 12 | | renames | column_value_dict | A dict representing each existing column to be renamed and its corresponding new name.
| | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(dataset_id) 19 | ds2 = ds.rename(renames={ 20 | 'DS_WEATHER_ICON': 'Weather', 21 | 'DS_DAILY_HIGH_TEMP': 'High_Temp', 22 | 'DS_DAILY_LOW_TEMP': 'Low_Temp' 23 | }) 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/rename/snowflake/rename.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.yaml: -------------------------------------------------------------------------------- 1 | name: heatmap 2 | type: insight 3 | operation_type: VIZ 4 | context: 5 | chart_type: heatmap_continuous 6 | tags: 7 | description: Generate an x / y heatmap, which uses the number of rows in each x/y bin as a density overlay to a 2-d histogram 8 | arguments: 9 | x_axis: 10 | type: column 11 | description: numeric column to use as the x axis 12 | y_axis: 13 | type: column 14 | description: numeric column to use as the y axis 15 | filters: 16 | type: filter_list 17 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
18 | is_optional: true 19 | num_buckets: 20 | type: value 21 | is_optional: true 22 | description: max number of buckets to create; defaults to 100 23 | example_code: | 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds2 = ds.heatmap(x_axis='TEMPERATURE', 27 | y_axis='PRECIPITATION') 28 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.yaml: -------------------------------------------------------------------------------- 1 | name: drop_columns 2 | tags: 3 | - column 4 | - data_cleaning 5 | description: | 6 | Drop columns by passing either an include_cols list of columns to include or an exclude_cols list of columns to exclude. 7 | 8 | Passing both include_cols and exclude_cols will result in an error. 9 | 10 | arguments: 11 | include_cols: 12 | type: column_list 13 | description: A list of the columns from the dataset you want to keep. 14 | is_optional: true 15 | exclude_cols: 16 | type: column_list 17 | description: A list of the columns from the dataset you want to drop. Any columns not in the exclude_cols list will be kept. 18 | is_optional: true 19 | example_code: | 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2a = ds.drop_columns(include_cols=["DS_WEATHER_ICON", "DS_DAILY_HIGH_TEMP"]) 23 | ds2a.preview() 24 | 25 | ds2b = ds.drop_columns(exclude_cols=["DS_CLOUD_COVER", "DS_TOTAL_RAINFALL"]) 26 | ds2b.preview() 27 | -------------------------------------------------------------------------------- /docs/sankey.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sankey 4 | 5 | Analyze the hierarchical record count of a series of columns by counting the number of records in each pair of values in hierarchically adjacent columns. The columns fed to this transformation should be categorical labels to be counted.
6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ----- | ----------- | ----------------------------------------------------------------------- | ----------- | 11 | | stage | column_list | Ordered list of categorial columns, from highest in hierarchy to lowest | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.sankey(stage=["ENGLISHCOUNTRYREGIONNAME", "STATEPROVINCENAME", "CITY"]) 20 | ds2.preview() 21 | 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/sankey/sankey.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.yaml: -------------------------------------------------------------------------------- 1 | name: remove_duplicates 2 | tags: 3 | - table 4 | - data_quality 5 | - data_cleaning 6 | description: Deduplicate a table based on a passed-in composite key. Once an order column and an order method are selected, only the top record from the resulting grouped and ordered dataset will be kept. 7 | arguments: 8 | natural_key: 9 | type: column_list 10 | description: Columns forming the grain at which to remove duplicates 11 | order_col: 12 | type: column_list 13 | description: Columns by which to order the result set, such that the first result is kept 14 | order_method: 15 | type: sort_direction 16 | description: Sets the order behavior for the chosen `order_col`. Can be ASC or DESC. 
17 | example_code: | 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.remove_duplicates( 21 | natural_key=["FIPS", "DS_WEATHER_ICON", "DATE"], 22 | order_col=["DATE", "FIPS"], 23 | order_method="asc" 24 | ) 25 | ds2.preview() 26 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize/summarize.sql: -------------------------------------------------------------------------------- 1 | {% from 'filter.sql' import get_filter_statement %} 2 | 3 | WITH filtered as ( 4 | SELECT * 5 | FROM {{ source_table }} 6 | {%- if filters is defined %} 7 | where true AND 8 | {{ get_filter_statement(filters) | indent }} 9 | {%- endif %} 10 | ) 11 | , 12 | summarized as ( 13 | SELECT 14 | {%- if group_by is defined %} 15 | {{ group_by | join(', ') }}, 16 | {%- endif %} 17 | {%- for column, aggs in summarize.items() %} 18 | {%- set oloop = loop %} 19 | {%- for aggregation_type in aggs %} 20 | {{ aggregation_type|lower|replace('_', '')|replace('distinct', '') }}({{ 'distinct ' if 'distinct' in aggregation_type|lower else ''}}{{ column }}) as {{ cleanse_name(aggregation_type + '_' + column)}}{{ ',' if not (loop.last and oloop.last) }} 21 | {%- endfor %} 22 | {%- endfor %} 23 | FROM filtered 24 | {%- if group_by is defined %} 25 | GROUP BY {{ group_by | join(', ') }} 26 | {%- endif %} 27 | ) 28 | SELECT * 29 | FROM summarized -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/vlookup/vlookup.sql: -------------------------------------------------------------------------------- 1 | {%- macro table_from_fqtn(fqtn) -%} 2 | {{ fqtn.split('.')[-1] }} 3 | {%- endmacro -%} 4 | 5 | {# Get all Columns in Source Table #} 6 | {%- set source_col_names = get_columns(source_table) -%} 7 | 8 | {# Get relevant Columns and Table Name in Lookup Table #} 9 | {%- if keep_columns is defined -%} 10 | {%- set lookup_table_cols = keep_columns -%} 11 | {%- else 
-%} 12 | {%- set lookup_table_cols = get_columns(lookup_table) -%} 13 | {%- endif -%} 14 | {%- set lookup_table_name = table_from_fqtn(lookup_table) -%} 15 | 16 | 17 | SELECT base.*, 18 | {%- for column in lookup_table_cols %} 19 | {%- if column in source_col_names -%} 20 | lookupt.{{ column }} as {{ lookup_table_name }}_{{ column }}{{ ', ' if not loop.last }} 21 | {%- else -%} 22 | {{ column }}{{ ', ' if not loop.last }} 23 | {%- endif -%} 24 | {%- endfor %} 25 | FROM {{ source_table }} base 26 | LEFT OUTER JOIN {{ lookup_table }} lookupt 27 | on base.{{ lookup_column }} = lookupt.{{ lookup_column }} 28 | -------------------------------------------------------------------------------- /docs/entropy.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # entropy 4 | 5 | Entropy is a way to calculate the amount of "disorder" in a non-numeric column. Lower entropy indicates less disorder, while higher entropy indicates more. 6 | 7 | The calculation for Shannon's entropy is: H = -Sum[ P(xi) * log2( P(xi)) ] 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | -------- | ----------- | ----------------------------------------------------- | ----------- | 14 | | group_by | column_list | Columns to group by | | 15 | | columns | column_list | Columns to calculate entropy on. Must be non-numeric. 
| | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.entropy(group_by=['FIPS'], columns=['NAME', 'ADDRESS']) 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/entropy/entropy.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/unpivot/unpivot.yaml: -------------------------------------------------------------------------------- 1 | name: unpivot 2 | tags: 3 | - table 4 | - reshape 5 | description: Performs a UNPIVOT operation, rotating a table by transforming columns into rows 6 | arguments: 7 | value_column: 8 | type: string 9 | description: The name to assign to the generated column that will be populated with the values from the columns in the column list 10 | name_column: 11 | type: string 12 | description: The name to assign to the generated column that will be populated with the names of the columns in the column list 13 | column_list: 14 | type: column_list 15 | description: List of columns in the source table that will be narrowed into a single pivot column. The column names will populate name_column, and the column values will populate value_column. 
16 | example_code: | 17 | internet_sales = rasgo.get.dataset(74) 18 | 19 | ds2 = internet_sales.unpivot( 20 | value_column="SALES_FEES", 21 | name_column="PRODUCT", 22 | column_list=["TAXAMT", "FREIGHT"] 23 | ) 24 | 25 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lead/lead.yaml: -------------------------------------------------------------------------------- 1 | name: lead 2 | tags: 3 | - column 4 | - date_time 5 | - feature_engineering 6 | description: Lead shifts your features on a partition index, creating a look-forward feature offset by an amount. Lead supports generating multiple leads in one transform by generating each unique combination of columns and amounts from your inputs. 7 | arguments: 8 | columns: 9 | type: column_list 10 | description: names of column(s) you want to lead 11 | amounts: 12 | type: int_list 13 | description: Magnitude of amounts you want to use for the lead. 14 | partition: 15 | type: column_list 16 | description: name of column(s) to partition by for the lead 17 | is_optional: true 18 | order_by: 19 | type: column_list 20 | description: name of column(s) to order by in the final data set 21 | is_optional: true 22 | example_code: | 23 | ds = rasgo.get.dataset(id) 24 | 25 | ds2 = ds.lead(columns=['OPEN', 'CLOSE'], amounts=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 26 | ds2.preview() 27 | -------------------------------------------------------------------------------- /docs/target_encode.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # target_encode 4 | 5 | Encode a categorical column with the average value of a target column for the corresponding value of the categorical column. 6 | 7 | See scikit-learn's [TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html) for full documentation. 
8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ------ | ------ | ----------------------------------------------- | ----------- | 14 | | column | column | Column name to target encode | | 15 | | target | column | Numeric target column to use to create averages | | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.target_encode(column='WEATHER_DESCRIPTION', target='DAILY_HIGH_TEMP') 24 | ds2.preview() 25 | 26 | ``` 27 | 28 | ## Source Code 29 | 30 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/target_encode/target_encode.sql" %} 31 | 32 | -------------------------------------------------------------------------------- /docs/apply.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # apply 4 | 5 | A transform that accepts a custom template to execute. Must use the sql template argument `source_table` to reference the Rasgo dataset which will serve as the base of any SELECT 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ---- | ------ | ------------------------------------------ | ----------- | 11 | | sql | custom | The custom SQL transform template to apply | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.apply( 20 | sql='SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = I17' 21 | ) 22 | ds2.preview() 23 | 24 | # passing in custom arguments 25 | ds = rasgo.get.dataset(id) 26 | 27 | ds2 = ds.apply( 28 | sql="SELECT * FROM {{ source_table }} WHERE COLUMNVALUE = '{{ my_value }}'", 29 | my_value="I17" 30 | ) 31 | ds2.preview() 32 | ``` 33 | 34 | ## Source Code 35 | 36 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/apply/apply.sql" %} 37 | 38 | 
-------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.yaml: -------------------------------------------------------------------------------- 1 | name: market_basket 2 | tags: 3 | - table 4 | - modeling 5 | - reshape 6 | description: | 7 | Analyze historical transaction contents to understand products that are frequently purchased together. 8 | 9 | This approach uses a transactional table to aggregate each product purchased in a transaction, and then aggregates transactions together to look for common patterns. 10 | arguments: 11 | transaction_id: 12 | type: column 13 | description: Column identifying a unique event ID (i.e., transaction) for which to aggregate line items 14 | sep: 15 | type: value 16 | description: Text separator to use when aggregating the strings, i.e. ', ' or '|'. 17 | agg_column: 18 | type: column 19 | description: Product ID or description to use when aggregating into transactions 20 | example_code: | 21 | sales = rasgo.get.dataset(id) 22 | 23 | ds2 = sales.market_basket(transaction_id='SALESORDERNUMBER', 24 | agg_column='ENGLISHPRODUCTNAME', 25 | sep='|') 26 | ds2.preview() -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy-generic: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v3 20 | with: 21 | python-version: '3.7' 22 | 23 | - name: Install 
dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install --upgrade setuptools build twine 27 | pip install -r rasgotransforms/requirements.txt 28 | 29 | 30 | - name: Build and publish 31 | env: 32 | TWINE_USERNAME: __token__ 33 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 34 | run: | 35 | cd rasgotransforms 36 | python -m build 37 | twine upload dist/* 38 | -------------------------------------------------------------------------------- /docs/filter.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # filter 4 | 5 | Filter the dataset. Supports two types of filters: 6 | 1. Comparison filters, which compare the values in a column with a value 7 | 2. Advanced filters, which support full SQL strings for custom filtering logic 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ----- | ----------- | ----------------------------------------- | ----------- | 14 | | items | filter_list | list of dictionaries representing filters | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(74) 21 | 22 | # comma separated list of 'WHERE' clauses 23 | ds2 = ds.filter(items=['PRODUCTKEY < 500']) 24 | ds2.preview() 25 | 26 | # full filtering with a column, operator, and comparison value 27 | ds3 = ds.filter(items=[{'column_name':'PRODUCTKEY', 'operator':'>', 'comparison_value':'101'}]) 28 | ds3.preview() 29 | ``` 30 | 31 | ## Source Code 32 | 33 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/filter/filter.sql" %} 34 | 35 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/datediff/datediff.yaml: -------------------------------------------------------------------------------- 1 | name: datediff 2 | tags: 3 | - column 4 | - date_time 5 | description: | 6 | Calculates the difference between two date, time, or 
timestamp expressions based on the date or time part requested. 7 | Difference is calculated as date_1 - date_2. 8 | arguments: 9 | date_part: 10 | type: date_part 11 | description: | 12 | Must be one of the values listed in [Supported Date and Time Parts](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts) 13 | date_1: 14 | type: mixed_value 15 | description: Starting date. Can be a date column, date, time, or timestamp. 16 | date_2: 17 | type: mixed_value 18 | description: Date to subtract from date_1. Can be a date column, date, time, or timestamp. 19 | alias: 20 | type: value 21 | is_optional: true 22 | description: Name for the new column created by the datediff. 23 | example_code: | 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds2 = ds.datediff(date_part='year', date_1='END_DATE', date_2="'2022-01-01'") 27 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.yaml: -------------------------------------------------------------------------------- 1 | name: train_test_split 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: | 6 | Label rows as part of the train or test set based off of percentage split you want to apply to the data. 7 | 8 | If you want a row-wise random sample applied, do not pass an order_by column. If you want an ordered split, then pass the order_by column. 9 | arguments: 10 | order_by: 11 | type: column_list 12 | description: Optional argument that affects the train/test split method applied. if needed, pass the names of column(s) you want to order by when applying the split. 13 | is_optional: true 14 | train_percent: 15 | type: int 16 | description: Percent of the data you want in the train set, expressed as a decimal (i.e. .8). The rest of the rows will be included in the test set. 
17 | example_code: | 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.train_test_split(order_by = ['DATE'], 21 | train_percent = 0.8) 22 | ds2.preview() 23 | 24 | ds2b = ds.train_test_split(train_percent = 0.8) 25 | ds2b.preview() -------------------------------------------------------------------------------- /docs/one_hot_encode.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # one_hot_encode 4 | 5 | One hot encode a column. Create a null value flag for the column if any of the values are NULL. 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------ | -------------------- | ------------------------------------------------------------------------------------------- | ----------- | 11 | | column | column_or_expression | Column name to one-hot encode. Supports a calculated field via a valid SQL function. | | 12 | | list_of_vals | string_list | optional argument to override the dynamic lookup of all values in the target one-hot column | True | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.one_hot_encode(column='WEATHER_DESCRIPTION') 21 | ds2.preview() 22 | 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/ratio_with_shrinkage.sql: -------------------------------------------------------------------------------- 1 | {# the strange __var__ names are meant to prevent collisions #} 2 | 3 | {%- set source_col_names = get_columns(source_table) -%} 4 | WITH CTE_AGG AS ( 5 | SELECT 6 | *, 7 | {{ numerator }} / {{ denom }} as RAW__PCT 8 | FROM 9 | {{ source_table }} 10 | ), 11 | CTE_FILTER AS ( 12 | SELECT 13 | * 14 | FROM 15 | CTE_AGG 
16 | WHERE 17 | {{ denom }} >= {{ min_cutoff }} 18 | ), 19 | CTE_STATS AS ( 20 | SELECT 21 | AVG(RAW__PCT) AS __U__, 22 | VARIANCE_SAMP(RAW__PCT) AS __V__ 23 | FROM 24 | CTE_FILTER 25 | ), 26 | CTE_JOINED AS ( 27 | SELECT 28 | * 29 | FROM CTE_AGG 30 | CROSS JOIN CTE_STATS 31 | ), 32 | CTE_COEF AS ( 33 | SELECT 34 | *, 35 | __U__ * ( 36 | __U__ * (1 - __U__)/ __V__ - 1 37 | ) AS __ALPHA__, 38 | __ALPHA__ * (1 - __U__)/ __U__ AS __BETA__ 39 | FROM 40 | CTE_JOINED 41 | ) 42 | SELECT 43 | {{ source_col_names | join(', ') }}, 44 | RAW__PCT, 45 | ({{ numerator }} + __ALPHA__) / ({{ denom }} + __ALPHA__ + __BETA__) AS ADJ__PCT 46 | FROM 47 | CTE_COEF -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/dtypes.py: -------------------------------------------------------------------------------- 1 | DTYPES = { 2 | "smallint": "smallint", 3 | "bigint": "bigint", 4 | "int": "int", 5 | "integer": "integer", 6 | "tinyint": "tinyint", 7 | "byteint": "byteint", 8 | "float": "float", 9 | "float4": "float4", 10 | "float8": "float8", 11 | "float64": "float64", 12 | "decimal": "decimal", 13 | "numeric": "numeric", 14 | "number": "number", 15 | "real": "real", 16 | "double": "double", 17 | "string": "string", 18 | "text": "text", 19 | "varchar": "varchar", 20 | "char": "char", 21 | "character": "character", 22 | "date": "date", 23 | "datetime": "datetime", 24 | "time": "time", 25 | "timestamp": "timestamp", 26 | "timestamp_ltz": "timestamp_ltz", 27 | "timestamp_ntz": "timestamp_ntz", 28 | "timestamp_tz": "timestamp_tz", 29 | "binary": "binary", 30 | "varbinary": "varbinary", 31 | "boolean": "boolean", 32 | "bool": "bool", 33 | "variant": "variant", 34 | "object": "object", 35 | "array": "array", 36 | # Aliases 37 | "double precision": "double", 38 | "doubleprecision": "double", 39 | } 40 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/lag/lag.yaml: 
-------------------------------------------------------------------------------- 1 | name: lag 2 | tags: 3 | - column 4 | - date_time 5 | - feature_engineering 6 | description: Lag shifts your features on a partition index, creating a lookback feature offset by an amount. Lag supports generating multiple lags in one transform by generating each unique combination of columns and amounts from your inputs. 7 | arguments: 8 | columns: 9 | type: column_list 10 | description: names of column(s) you want to lag 11 | amounts: 12 | type: int_list 13 | description: Magnitude of amounts you want to use for the lag. Positive values result in a historical offset; negative amounts result in forward-looking offset. 14 | partition: 15 | type: column_list 16 | description: name of column(s) to partition by for the lag 17 | is_optional: true 18 | order_by: 19 | type: column_list 20 | description: name of column(s) to order by in the final data set 21 | is_optional: true 22 | example_code: | 23 | ds = rasgo.get.dataset(id) 24 | 25 | ds2 = ds.lag(columns=['OPEN', 'CLOSE'], amounts=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 26 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/one_hot_encode/one_hot_encode.sql: -------------------------------------------------------------------------------- 1 | {%- set run_query_error_message -%} 2 | This transform depends on dynamic values to work, but no Data Warehouse connection is available. 
3 | Instead, please use the `list_of_vals` argument to provide these values explicitly 4 | {%- endset -%} 5 | 6 | {%- if list_of_vals is not defined -%} 7 | {%- set results = run_query("SELECT DISTINCT " + column + " FROM " + source_table) -%} 8 | {%- if results is none -%} 9 | {{ raise_exception(run_query_error_message) }} 10 | {%- endif -%} 11 | {%- set distinct_col_vals = results[column].to_list() -%} 12 | {%- else -%} 13 | {%- set distinct_col_vals = list_of_vals -%} 14 | {%- endif -%} 15 | 16 | SELECT *, 17 | {% for val in distinct_col_vals %} 18 | {%- if val is not none %} 19 | CASE WHEN {{ column }} = {{ "'" ~ val ~ "'"}} THEN 1 ELSE 0 END as {{ cleanse_name(column ~ '_' ~ val) }}{{ ', ' if not loop.last else '' }} 20 | {%- else %} 21 | CASE WHEN {{ column }} IS NULL THEN 1 ELSE 0 END as {{ column }}_IS_NULL{{ ', ' if not loop.last else '' }} 22 | {%- endif -%} 23 | {% endfor %} 24 | FROM {{ source_table }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/ratio_with_shrinkage/snowflake/ratio_with_shrinkage.sql: -------------------------------------------------------------------------------- 1 | {# the strange __var__ names are meant to prevent collisions #} 2 | 3 | {%- set source_col_names = get_columns(source_table) -%} 4 | WITH CTE_AGG AS ( 5 | SELECT 6 | *, 7 | {{ numerator }} / {{ denom }} as RAW__PCT 8 | FROM 9 | {{ source_table }} 10 | ), 11 | CTE_FILTER AS ( 12 | SELECT 13 | * 14 | FROM 15 | CTE_AGG 16 | WHERE 17 | {{ denom }} >= {{ min_cutoff }} 18 | ), 19 | CTE_STATS AS ( 20 | SELECT 21 | AVG(RAW__PCT) AS __U__, 22 | VARIANCE_SAMP(RAW__PCT) AS __V__ 23 | FROM 24 | CTE_FILTER 25 | ), 26 | CTE_JOINED AS ( 27 | SELECT 28 | * 29 | FROM CTE_AGG 30 | CROSS JOIN CTE_STATS 31 | ), 32 | CTE_COEF AS ( 33 | SELECT 34 | *, 35 | __U__ * ( 36 | __U__ * (1 - __U__)/ __V__ - 1 37 | ) AS __ALPHA__, 38 | __ALPHA__ * (1 - __U__)/ __U__ AS __BETA__ 39 | FROM 40 | CTE_JOINED 41 | ) 42 | SELECT 
43 | {{ source_col_names | join(', ') }}, 44 | RAW__PCT, 45 | ({{ numerator }} + __ALPHA__) / ({{ denom }} + __ALPHA__ + __BETA__) AS ADJ__PCT 46 | FROM 47 | CTE_COEF -------------------------------------------------------------------------------- /docs/new_columns.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # new_columns 4 | 5 | ## Build new columns, using SQL formulas. 6 | 7 | ### Required Inputs 8 | - Calculated Column: the formula for the new column you want to build 9 | 10 | ### Optional Inputs 11 | - Alias: name for your columns 12 | 13 | ### Notes 14 | - Supports any SQL column functions that are compatible with your data warehouse 15 | 16 | 17 | ## Parameters 18 | 19 | | Name | Type | Description | Is Optional | 20 | | ------------------ | ---------------------- | -------------------------------------------- | ----------- | 21 | | calculated_columns | calculated_column_list | List of SQL formulas to generate new columns | | 22 | 23 | 24 | ## Example 25 | 26 | ```python 27 | ds2 = ds.new_columns( 28 | calculated_columns={ 29 | calcuated_column: 'POWER(COLUMN_NAME, 3)', 30 | alias: 'COLUMN_NAME_Cubed' 31 | } 32 | ) 33 | ds2.preview() 34 | ``` 35 | 36 | ## Source Code 37 | 38 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/new_columns/new_columns.sql" %} 39 | 40 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.yaml: -------------------------------------------------------------------------------- 1 | name: dateadd 2 | description: Increments a date by the specified interval value. 3 | tags: 4 | - column 5 | - date_time 6 | arguments: 7 | date_part: 8 | type: date_part 9 | description: | 10 | A valid SQL date part. 
11 | Must be one of the values listed in [Supported Date and Time Parts](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts) 12 | date: 13 | type: mixed_value 14 | description: Date value to increment. Can be a column or literal of these types (date, datetime, time, or timestamp). 15 | offset: 16 | type: int 17 | description: Numeric value to increment the date by. 18 | alias: 19 | type: string 20 | description: Name of output column 21 | is_optional: true 22 | overwrite_columns: 23 | type: boolean 24 | description: "Optional: if true, the output column will replace the existing 'date' column" 25 | is_optional: true 26 | example_code: | 27 | ds = rasgo.get.dataset(id) 28 | 29 | ds2 = ds.dateadd(date_part='year', date='END_DATE', offset=3, alias='THREE_YEARS_FUTURE') 30 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dateadd/dateadd.py: -------------------------------------------------------------------------------- 1 | DATE_PARTS = [ 2 | 'year', 3 | 'month', 4 | 'day', 5 | 'dayofweek', 6 | 'dayofweekiso', 7 | 'dayofyear', 8 | 'week', 9 | 'weekiso', 10 | 'quarter', 11 | 'yearofweek', 12 | 'yearofweekiso', 13 | ] 14 | TIME_PARTS = [ 15 | 'hour', 16 | 'minute', 17 | 'second', 18 | 'millisecond', 19 | 'nanosecond', 20 | 'epoch_second', 21 | 'epoch_millisecond', 22 | 'epoch_microsecond', 23 | 'epoch_nanosecond', 24 | 'timezone_hour', 25 | 'timezone_minute', 26 | ] 27 | 28 | 29 | def infer_columns(args, source_columns) -> dict: 30 | if args['date'] in source_columns: 31 | output_type = source_columns[args['date']] 32 | else: 33 | output_type = 'date' 34 | if 'overwrite_columns' in args and args['overwrite_columns']: 35 | source_columns[args['date'].upper()] = output_type 36 | elif 'alias' in args and args['alias']: 37 | source_columns[args['alias']] = output_type 38 | else: 39 | 
source_columns[f"{args['date']}_add{args['offset']}{args['date_part']}".upper()] = output_type 40 | return source_columns 41 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sample/sample.sql: -------------------------------------------------------------------------------- 1 | {%- if num_rows|float < 1 -%} 2 | {%- set sample_amount = num_rows*100 |float -%} 3 | {% else %} 4 | {%- set sample_amount = num_rows~' ROWS' -%} 5 | {% endif %} 6 | 7 | {% if filters is defined %} 8 | WITH filtered AS ( 9 | SELECT * FROM {{source_table}} 10 | {% for filter_block in filters %} 11 | {%- set oloop = loop -%} 12 | {{ 'WHERE ' if oloop.first else ' AND ' }} 13 | {%- if filter_block is not mapping -%} 14 | {{ filter_block }} 15 | {%- else -%} 16 | {%- if filter_block['operator'] == 'CONTAINS' -%} 17 | {{ filter_block['operator'] }}({{ filter_block['column_name'] }}, {{ filter_block['comparison_value'] }}) 18 | {%- else -%} 19 | {{ filter_block['column_name'] }} {{ filter_block['operator'] }} {{ filter_block['comparison_value'] }} 20 | {%- endif -%} 21 | {%- endif -%} 22 | {%- endfor -%} 23 | 24 | ) 25 | SELECT * FROM filtered 26 | TABLESAMPLE BERNOULLI ( {{ sample_amount }} ) 27 | {% else %} 28 | SELECT * FROM {{source_table}} 29 | TABLESAMPLE BERNOULLI ( {{ sample_amount }} ) 30 | {% endif %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql: -------------------------------------------------------------------------------- 1 | {%- if subset is not defined -%} 2 | {%- set subset = get_columns(source_table) -%} 3 | {%- set source_col_names = subset -%} 4 | {%- endif -%} 5 | 6 | {%- if how is not defined -%} 7 | {%- set how = "any" -%} 8 | {%- endif -%} 9 | 10 | {%- if how == "any" and thresh is not defined -%} 11 | select * from {{ source_table }} 12 | {%- for col in subset %} 13 | {{ 'where' if loop.first else ' 
and' }} {{ col }} is not null 14 | {%- endfor -%} 15 | 16 | {%- else -%} 17 | {%- if thresh is not defined -%} 18 | {%- set thresh = subset|length -%} 19 | {%- endif -%} 20 | {%- if source_col_names is not defined -%} 21 | {%- set source_col_names = get_columns(source_table) -%} 22 | {%- endif -%} 23 | with not_null as ( 24 | select *, 25 | {%- for col in subset %} 26 | cast({{ col }} is null as int) {{ "+ " if not loop.last else " " }} 27 | {%- endfor %} 28 | as NUM_IS_NA 29 | from {{ source_table }} 30 | where NUM_IS_NA < {{ thresh }} 31 | ) select 32 | {% for col in source_col_names -%} 33 | {{ col }}{{ ", " if not loop.last else " " }} 34 | {%- endfor %} 35 | from not_null 36 | {%- endif -%} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/linear_regression/linear_regression.yaml: -------------------------------------------------------------------------------- 1 | name: linear_regression 2 | tags: 3 | - table 4 | - modeling 5 | description: | 6 | Fit a simple linear regression and return the formula. Optionally, use one or more group_by columns to create a regression per unique grouping. 7 | 8 | Currently, only supports a single independent variable. 9 | arguments: 10 | group_by: 11 | type: column_list 12 | is_optional: true 13 | description: Columns to group by before building the linear regression model. 
Use this field to create multiple models (one per unique grouping) 14 | y: 15 | type: column 16 | description: Dependent variable for the linear regression 17 | x: 18 | type: column 19 | description: Independent variable for the linear regression 20 | example_code: | 21 | internet_sales = rasgo.get.dataset(74) 22 | 23 | ds1 = internet_sales.aggregate( 24 | group_by=['PRODUCTKEY','CUSTOMERKEY'], 25 | aggregations={'SALESAMOUNT':['AVG'], 26 | 'TOTALPRODUCTCOST':['AVG']}) 27 | 28 | ds2 = ds1.linear_regression( 29 | x = 'SALESAMOUNT_AVG', 30 | y = 'TOTALPRODUCTCOST_AVG') 31 | 32 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/if_then/if_then.yaml: -------------------------------------------------------------------------------- 1 | name: if_then 2 | tags: 3 | - column 4 | - conditional 5 | - data_cleaning 6 | - natural_language_processing 7 | description: | 8 | This function creates a new column based on the conditions provided in the `conditions` argument. 9 | 10 | Output values should be of the same type, since they are constructing one new column. 11 | 12 | A default value for the new column should be set, as should the output column name. 13 | arguments: 14 | conditions: 15 | type: conditional_list 16 | description: A nested list. In each inner list the first element would be the condition to check, and the second the value with which to fill. 17 | default: 18 | type: mixed_value 19 | description: The default value with which to fill the new column. Please enclose fixed strings in quotes inside of the argument (e.g., below) 20 | alias: 21 | type: string 22 | description: The name of the output column in the new dataset. 
23 | example_code: | 24 | ds = rasgo.get.dataset(id) 25 | 26 | ds2 = ds.if_then(conditions=[["DS_WEATHER_ICON like '%cloudy%'", 1]], 27 | default=2, 28 | alias="CLOUDY_WEATHER_FLAG") 29 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/min_max_scaler/min_max_scaler.sql: -------------------------------------------------------------------------------- 1 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', columns_to_scale)|join(',') if overwrite_columns else "*" -%} 2 | 3 | {%- if minimums is not defined -%} 4 | with min_max_vals as ( 5 | select 6 | {%- for column in columns_to_scale %} 7 | min({{column}}) as min_{{column}}, 8 | max({{column}}) as max_{{column}}{{ "," if not loop.last else "" }} 9 | {%- endfor %} 10 | from {{source_table}} 11 | ) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, 12 | {%- for column in columns_to_scale %} 13 | ({{column}} - min_{{column}}) / (max_{{column}} - min_{{column}}) as {{column if overwrite_columns else column + "_MIN_MAX_SCALED"}}{{ ", " if not loop.last else "" }} 14 | {%- endfor %} 15 | from min_max_vals, {{source_table}} 16 | 17 | {%- else -%} 18 | select {{ untouched_cols }}, 19 | {%- for column in columns_to_scale %} 20 | ({{column}} - {{minimums[loop.index0]}}) / ({{maximums[loop.index0]}} - {{minimums[loop.index0]}}) as {{column if overwrite_columns else column + "_MIN_MAX_SCALED"}}{{ ", " if not loop.last else "" }} 21 | {%- endfor %} 22 | from {{source_table}} 23 | {%- endif -%} -------------------------------------------------------------------------------- /docs/accelerators/website_page_performance.md: -------------------------------------------------------------------------------- 1 | # Google Analytics Web Page Performance 2 | 3 | The Web Page Performance analysis uses Google Analytics data, including bounce rate, time on page, number of visits, and total users to create a 
custom metric that ranks the performance of pages on your site. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------- | ----------- | 9 | | google_analytics_traffic_table | dataset | Google Analytics traffic table | | 10 | | lookback_window | string | This template will create metrics for a timewindow within "x" days of the current date. This is the lookback value. | | 11 | 12 | 13 | ## Source Code 14 | 15 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/website_page_performance.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/standard_scaler/standard_scaler.sql: -------------------------------------------------------------------------------- 1 | {%- set untouched_cols = get_columns(source_table)|list|reject('in', columns_to_scale)|join(',') if overwrite_columns else "*" -%} 2 | 3 | {%- if averages is not defined or standarddevs is not defined -%} 4 | with avg_stddev_vals as ( 5 | select 6 | {%- for column in columns_to_scale %} 7 | avg({{column}}) as avg_{{column}}, 8 | stddev({{column}}) as stddev_{{column}}{{ ", " if not loop.last else "" }} 9 | {%- endfor %} 10 | from {{source_table}} 11 | ) select {{ source_table + ".*" if not overwrite_columns else untouched_cols}}, 12 | {%- for column in columns_to_scale %} 13 | ({{column}} - avg_{{column}}) / (stddev_{{column}}) as {{column if overwrite_columns else column + "_STANDARD_SCALED"}}{{ ", " if not loop.last else "" }} 14 | {%- endfor %} 15 | from avg_stddev_vals, {{source_table}} 16 | 17 | {%- else -%} 18 | select {{ untouched_cols }}, 19 | {%- for column in columns_to_scale %} 20 | ({{column}} - {{averages[loop.index0]}}) / ({{standarddevs[loop.index0]}}) as {{column if 
overwrite_columns else column + "_STANDARD_SCALED"}}{{ ", " if not loop.last else "" }} 21 | {%- endfor %} 22 | from {{source_table}} 23 | {%- endif -%} -------------------------------------------------------------------------------- /docs/datepart.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # datepart 4 | 5 | Extracts a specific part of a date column. For example, if the input is '2021-01-01', you can ask for the year and get back 2021. 6 | 7 | An exhaustive list of valid date parts can be [found here](https://docs.snowflake.com/en/sql-reference/functions-date-time.html#label-supported-date-time-parts). 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ----- | ------------- | ----------------------------------------------------------------------------------------------------- | ----------- | 14 | | dates | datepart_dict | dict where keys are names of columns you want to date part and values are the desired date part grain | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.datepart(dates={ 23 | 'DATE_STRING':'year', 24 | 'DATE2_STR':'month' 25 | }) 26 | ds2.preview() 27 | 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/datepart/snowflake/datepart.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/replace_missing/replace_missing.yaml: -------------------------------------------------------------------------------- 1 | name: replace_missing 2 | tags: 3 | - column 4 | - data_cleaning 5 | - data_quality 6 | description: Replace missing values in column/columns with the mean, median, mode, or a value 7 | arguments: 8 | replacements: 9 | type: imputation_dict 10 | description: Dictionary with keys as column names to replace 
missing values for, and dictionary values the replacement strategy ('mean', 'median', 'mode') or a literal replacement value
11 | arguments: 12 | rules: 13 | type: value_list 14 | description: List of filter rules to use 15 | agg_column: 16 | type: column 17 | description: Column to aggregate 18 | agg: 19 | type: agg 20 | description: Method to use when aggregating the agg_column 21 | distinct: 22 | type: boolean 23 | description: When aggregating the agg_column, use TRUE to qualify with a DISTINCT 24 | example_code: | 25 | customer = rasgo.get.dataset(55) 26 | 27 | rules = [ 28 | "FIRSTNAME LIKE 'J%'", 29 | "BIRTHDATE < '1970-01-01'", 30 | "ENGLISHEDUCATION = 'Bachelors'", 31 | "MARITALSTATUS = 'M'", 32 | "GENDER='F'"] 33 | 34 | ds2 = customer.conditional_agg(rules=rules, 35 | agg_column='CUSTOMERKEY', 36 | agg='COUNT', 37 | distinct=True) 38 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize/summarize.yaml: -------------------------------------------------------------------------------- 1 | name: summarize 2 | tags: 3 | - query 4 | - default 5 | description: | 6 | Filter and then aggregate columns in a table 7 | 8 | The filter is applied first to the table. If no filters are included, then the full table is selected. 9 | Next, the table is aggregated. 10 | 11 | arguments: 12 | filters: 13 | type: filter_list 14 | description: Remove rows using filter logic on one or more columns 15 | is_optional: true 16 | summarize: 17 | type: column_agg_list 18 | description: Columns to summarize 19 | is_optional: false 20 | group_by: 21 | type: column_list 22 | description: One or more columns to group by A categorical column by which to pivot the calculated metrics. Including this argument will generate a new metric calculation for each distinct value in the group by column. If this column has more than 20 distinct values, the plot will not generate. 
23 | is_optional: false 24 | 25 | example_code: | 26 | internet_sales = rasgo.get.dataset(74) 27 | 28 | ds1 = internet_sales.query( 29 | summarize={ 30 | 'SALESAMOUNT': ['COUNT', 'SUM'], 31 | 'CUSTOMERKEY': ['COUNT'] 32 | }, 33 | group_by = ['PRODUCTKEY']) 34 | 35 | ds1.preview() -------------------------------------------------------------------------------- /docs/cast.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # cast 4 | 5 | Cast selected columns to a new type 6 | 7 | 8 | ## Parameters 9 | 10 | | Name | Type | Description | Is Optional | 11 | | ----------------- | --------------- | -------------------------------------------------------------------------------------------------------------- | ----------- | 12 | | casts | cast_value_dict | A dict where the keys are columns and the values are the new type to cast them to. | | 13 | | overwrite_columns | boolean | to overwrite column names with the new casted column, use 'true'. otherwise, use 'false'. defaults to 'false'. 
| True | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds_casted = ds.cast( 22 | casts={ 23 | 'DS_WEATHER_ICON':'INT', 24 | 'DS_DAILY_HIGH_TEMP':'STRING', 25 | 'DS_DAILY_LOW_TEMP':'INT' 26 | }, 27 | overwrite_columns=True 28 | ) 29 | 30 | ds_casted.preview() 31 | 32 | ``` 33 | 34 | ## Source Code 35 | 36 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/cast/cast.sql" %} 37 | 38 | -------------------------------------------------------------------------------- /docs/latest.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # latest 4 | 5 | Impute missing values in ALL columns with the latest value seen in rows prior 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------- | ----------- | -------------------------------------------------------------------------------------------------------- | ----------- | 11 | | group_by | column_list | List of columns to perform the imputation "within" | | 12 | | order_by | column_list | List of columns to sort ascending, in order to find the last known value for imputation | | 13 | | nulls | string | Pass either 'ignore' or 'respect' to determine whether nulls should be ignored or not during imputation. 
| | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.latest( 22 | group_by=['FIPS'], 23 | order_by=['DATE'], 24 | nulls='ignore') 25 | 26 | ds2.preview() 27 | ``` 28 | 29 | ## Source Code 30 | 31 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/latest/latest.sql" %} 32 | 33 | -------------------------------------------------------------------------------- /docs/text_to_sql.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # text_to_sql 4 | 5 | ## Text to SQL, powered by OpenAI. 6 | ### Required Inputs 7 | - Text: a prompt describing the SQL query that you want OpenAI to generate for you. Add as much context as possible to help OpenAI generate a useful query. Avoid using relative date terms like "last year" because OpenAI doesn't have any knowledge past 2021. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ---- | ----------- | ------------------------------------------------------------------------------------------------------------ | ----------- | 14 | | text | string-long | Text description of the query you want to generate. 
Example: total revenue for the Southwest region in 2021 | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(fqtn='DB.SCHEMA.IOWA_LIQUOR_SALES') 21 | 22 | ds2 = ds.text_to_sql( 23 | text='total bottles sold in Des Moines last year' 24 | ) 25 | ds2.sql() 26 | ds2.preview() 27 | 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/text_to_sql/text_to_sql.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /.github/workflows/docs_generation.yaml: -------------------------------------------------------------------------------- 1 | # Auto generate Transform Docs from YAML File on each Push 2 | name: Transform Docs Generation 3 | 4 | on: push 5 | 6 | jobs: 7 | generate-docs: 8 | runs-on: ubuntu-latest 9 | defaults: 10 | run: 11 | shell: bash 12 | 13 | container: 14 | image: "python:3.7" 15 | 16 | env: 17 | PYTHONPATH: /__w/RasgoTransforms/RasgoTransforms 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | 22 | - name: Access Transform Directory 23 | run: | 24 | git config --global --add safe.directory /__w/RasgoTransforms/RasgoTransforms 25 | 26 | - name: Install Python Requirments 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r python/requirements.txt 30 | 31 | - name: Generate Transform Docs 32 | run: python python/doc_generator.py 33 | 34 | - name: Git Commit Generated Transform Docs 35 | run: | 36 | if [[ `git status --porcelain` ]]; then 37 | git add -A 38 | git config user.name GitHub 39 | git config user.email noreply@github.com 40 | echo commiting 41 | git commit -m 'Added Auto Generated Transform docs' 42 | git push 43 | fi 44 | -------------------------------------------------------------------------------- /docs/lead.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # lead 4 | 5 | Lead shifts your features 
on a partition index, creating a look-forward feature offset by an amount. Lead supports generating multiple leads in one transform by generating each unique combination of columns and amounts from your inputs. 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | --------- | ----------- | --------------------------------------------------- | ----------- | 11 | | columns | column_list | names of column(s) you want to lead | | 12 | | amounts | int_list | Magnitude of amounts you want to use for the lead. | | 13 | | partition | column_list | name of column(s) to partition by for the lead | True | 14 | | order_by | column_list | name of column(s) to order by in the final data set | True | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.lead(columns=['OPEN', 'CLOSE'], amounts=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 23 | ds2.preview() 24 | 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/lead/lead.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /docs/sample.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sample 4 | 5 | Take a sample of a dataset using a specific number of rows or a probability that each row will be selected 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | -------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | num_rows | value | To sample using a probability of selecting each row, your num_rows should be a decimal less than 1. Otherwise, pass an integer value for number of rows to keep. | | 12 | | filters | filter_list | Filter logic on one or more columns. 
Can choose between a simple comparison filter or advanced filter using free text. | True | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.sample(num_rows=1000) 21 | ds2.preview() 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/sample/sample.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /docs/moving_avg.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # moving_avg 4 | 5 | generates moving averages per column and per window size 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------- | ----------- | -------------------------------------------------------------------------- | ----------- | 11 | | input_columns | column_list | names of column(s) you want to moving average | | 12 | | window_sizes | int_list | the integer values for window sizes you want to use in your moving average | | 13 | | order_by | column_list | columns to order by, typically the date index of the table | | 14 | | partition | column_list | columns to partition the moving average by | | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.moving_avg(input_columns=['OPEN','CLOSE','HIGH','LOW'], window_sizes=[1,2,3,7], order_by=['DATE, 'TICKER'], partition=['TICKER']) 23 | ds2.preview() 24 | ``` 25 | 26 | ## Source Code 27 | 28 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/moving_avg/moving_avg.sql" %} 29 | 30 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/clean/clean.yaml: -------------------------------------------------------------------------------- 1 | name: clean 2 | tags: 3 | - column 4 | - 
data_cleaning 5 | - data_quality 6 | description: Cast data types, rename or drop columns, impute missing values, and filter values in a dataset 7 | arguments: 8 | columns: 9 | type: clean_dict 10 | description: "Dictionary with keys as column names to clean, values are all optional: type - the 11 | dtype to cast the values to, name - the new name for a column, impute - an imputation strategy or value for replacing 12 | null values ('mean', 'median', 'mode', ), filter - a filter statement to filter the output table, drop - 13 | drops column from the output if true" 14 | example_code: | 15 | ds = rasgo.get.dataset(id) 16 | 17 | ds2 = ds.clean( 18 | columns={ 19 | 'GLD_ADJUSTED_CLOSE': { 20 | 'type': 'FLOAT', 21 | 'name': 'GLD', 22 | 'impute': 'mean', 23 | 'filter': "> 100", 24 | }, 25 | 'GLTR_ADJUSTED_CLOSE': { 26 | 'type': 'FLOAT', 27 | 'name': 'GLTR', 28 | 'impute': 'min', 29 | 'filter': "> 10", 30 | }, 31 | 'DATE': { 32 | 'type': 'string' 33 | } 34 | } 35 | ) 36 | 37 | ds2.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/summarize_flatlines/summarize_flatlines.yaml: -------------------------------------------------------------------------------- 1 | name: summarize_flatlines 2 | tags: 3 | - table 4 | - reshape 5 | - data_quality 6 | description: | 7 | Given a dataset, searches finds "flatline" sequences of a repeated values that do not change. 8 | 9 | Choose a value column, a column to be used for ordering (such as a date), and a minimum cutoff for the number of repeated occurrences to consider. 10 | 11 | The result is a summarized table. 12 | arguments: 13 | group_by: 14 | type: column_list 15 | description: The column(s) used to partition you data into groups. 16 | Flatlines (repeated values) will be searched within each group 17 | value_col: 18 | type: column 19 | description: The column for which to search for flatlines. 
20 | order_col: 21 | type: column 22 | description: The column used to order the rows within groups. 23 | min_repeat_count: 24 | type: int 25 | description: The minimum length of a sequence of repeated values to consider 26 | 27 | example_code: | 28 | ds = rasgo.get.dataset() 29 | 30 | test = ds.apply(group_by=['TICKER','SYMBOL'], 31 | value_col='CLOSE', 32 | order_col='DATE', 33 | min_repeat_count=1 34 | ) 35 | 36 | test.preview() -------------------------------------------------------------------------------- /docs/accelerators/plg.md: -------------------------------------------------------------------------------- 1 | # Sales Growth Funnel 2 | 3 | The sales growth funnel tracks users through a common Software as a Service (SaaS) sales funnel: from awareness via marketing at top of funnel, to product user, to qualified lead for an enterprise sales motion, to closed won. The data sources necessary to generate this accelerator are Google Analytics, Heap, and Salesforce. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------- | ------- | --------------------------------------------- | ----------- | 9 | | contact_table | dataset | Salesforce contacts table | | 10 | | opportunity_table | dataset | Salesforce opportunities table | | 11 | | account_table | dataset | Salesforce accounts table | | 12 | | lead_table | dataset | Salesforce leads table | | 13 | | daily_traffic_table | dataset | Google Analytics daily traffic overview table | | 14 | | heap_users_table | dataset | Heap Users Table | | 15 | 16 | 17 | ## Source Code 18 | 19 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/plg.yml" %} -------------------------------------------------------------------------------- /docs/aggregate.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # aggregate 4 | 5 | Groups rows by the group_by items applying 
aggregations functions for the resulting group and selected columns 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------ | ----------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | group_by | column_list | Columns to group by | | 12 | | aggregations | agg_dict | Aggregations to apply for other columns. Dict keys are column names, and values are a list of aggegations to apply for that column. | | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | ds = rasgo.get.dataset(id) 19 | 20 | ds2 = ds.aggregate(group_by=['FIPS'], aggregations={ 21 | 'COL_1': ['SUM', 'AVG'], 22 | 'COL_2': ['SUM', 'AVG'] 23 | }) 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/aggregate/snowflake/aggregate.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /docs/histogram.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # histogram 4 | 5 | Analyze the value distribution of a single continuous variable by binning it and calculating frequencies in each bin 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ----------- | ----------- | ---------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | column | column | numeric column to use to generate the histogram | | 12 | | filters | filter_list | Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
| True | 13 | | num_buckets | value | max number of buckets to create; defaults to 200 | True | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.histogram(column='SALESAMOUNT') 22 | ds2.preview() 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/histogram/histogram.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /docs/accelerators/sales_growth_funnel.md: -------------------------------------------------------------------------------- 1 | # Sales Growth Funnel 2 | 3 | The sales growth funnel tracks users through a common Software as a Service (SaaS) sales funnel: from awareness via marketing at top of funnel, to product user, to qualified lead for an enterprise sales motion, to closed won. The data sources necessary to generate this accelerator are Google Analytics, Heap, and Salesforce. 
4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------- | ------- | --------------------------------------------- | ----------- | 9 | | contact_table | dataset | Salesforce contacts table | | 10 | | opportunity_table | dataset | Salesforce opportunities table | | 11 | | account_table | dataset | Salesforce accounts table | | 12 | | lead_table | dataset | Salesforce leads table | | 13 | | daily_traffic_table | dataset | Google Analytics daily traffic overview table | | 14 | | heap_users_table | dataset | Heap Users Table | | 15 | 16 | 17 | ## Source Code 18 | 19 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/sales_growth_funnel.yml" %} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/reshape/bigquery/reshape.sql: -------------------------------------------------------------------------------- 1 | {% if method|lower == 'pivot' -%} 2 | {%- set distinct_val_query -%} 3 | select distinct {{ columns }} 4 | from {{ source_table }} 5 | limit 1000 6 | {%- endset -%} 7 | 8 | {%- if list_of_vals is not defined -%} 9 | {%- set results = run_query(distinct_val_query) -%} 10 | {%- set distinct_vals = results[results.columns[0]].to_list() -%} 11 | {%- else -%} 12 | {%- set distinct_vals = list_of_vals -%} 13 | {%- endif -%} 14 | 15 | SELECT * FROM ( 16 | SELECT 17 | {%- for dimension in dimensions %} 18 | {{ dimension }}, 19 | {%- endfor %} 20 | {{ values }}, 21 | {{ columns }} 22 | FROM {{ source_table }} 23 | ) 24 | PIVOT ( 25 | {{ agg_method }} ( {{ values }} ) as _ 26 | FOR {{ columns }} IN ( 27 | {%- for val in distinct_vals %} 28 | {%- if val is string -%} 29 | '{{ val }}' 30 | {%- else -%} 31 | {{ val }} 32 | {%- endif -%} 33 | {{', ' if not loop.last else ''}} 34 | {%- endfor -%} 35 | ) 36 | ) 37 | {%- else -%} 38 | SELECT * FROM {{ source_table }} 39 | UNPIVOT( {{ value_column 
}} for {{ name_column }} in ( {{ columns | join(', ')}} )) 40 | {%- endif -%} -------------------------------------------------------------------------------- /docs/to_date.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # to_date 4 | 5 | Creates a column of a date/timestamp type from a string or other non-date column. 6 | 7 | See [this Snowflake doc](https://docs.snowflake.com/en/user-guide/date-time-input-output.html#about-the-format-specifiers-in-this-section) for information about valid formats. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ----------------- | ----------------- | ------------------------------------------------------------------------------------------------------ | ----------- | 14 | | dates | column_value_dict | dict where the values are the date columns and the keys are the date formats to use for the conversion | | 15 | | overwrite_columns | boolean | Optional: if true, the output columns will overwrite the input columns | True | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.to_date(dates={ 24 | 'DATE_STRING':'YYYY-MM-DD', 25 | 'DATE2_STR':'YYYY-DD-MM' 26 | }) 27 | ds2.preview() 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/to_date/to_date.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /docs/accelerators/omni_channel_performance.md: -------------------------------------------------------------------------------- 1 | # Omni-channel performance 2 | 3 | Omni-channel performance tracks leads through a traditional sales funnel: from awareness via marketing at top of funnel, to marketing qualified lead, to sales qualified lead, and finally to closed as a won opportunity. 
The data sources necessary to generate this accelerator are Google Analytics, Hubspot, and Salesforce. 4 | 5 | ## Parameters 6 | 7 | | Name | Type | Description | Is Optional | 8 | | ------------------- | ------- | --------------------------------------------- | ----------- | 9 | | contact_table | dataset | Salesforce contacts table | | 10 | | opportunity_table | dataset | Salesforce opportunities table | | 11 | | account_table | dataset | Salesforce accounts table | | 12 | | lead_table | dataset | Salesforce leads table | | 13 | | daily_traffic_table | dataset | Google Analytics daily traffic overview table | | 14 | | email_event_table | dataset | Hubspot email event table | | 15 | 16 | 17 | ## Source Code 18 | 19 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/accelerators/omni_channel_performance.yml" %} -------------------------------------------------------------------------------- /docs/funnel.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # funnel 4 | 5 | Creates a funnel visualization-ready dataset from numeric columns (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) representing a hierarchy with summed incidence rates 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | stage_columns | column_list | List of columns to include in the funnel dataset, in order of hierarchy from highest stage to lowest stage (e.g., ["Number of leads", "Number of contacts", "Number of deals closed"]) | | 12 | 13 | 14 | ## Example 15 | 16 | ```python 17 | ds = rasgo.get.dataset(id) 18 | 19 | ds2 = ds.funnel(stage_columns=["TOTAL_IMPRESSIONS", "TOTAL_EMAILS_SENT", 
"TOTAL_WEBTRAFFIC_USERS", "TOTAL_LEADS_CREATED", "TOTAL_DEALS_CLOSED"]) 20 | ds2.preview() 21 | 22 | ``` 23 | 24 | ## Source Code 25 | 26 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/funnel/funnel.sql" %} 27 | 28 | -------------------------------------------------------------------------------- /docs/remove_duplicates.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # remove_duplicates 4 | 5 | Deduplicate a table based on a passed-in composite key. Once an order column and an order method are selected, only the top record from the resulting grouped and ordered dataset will be kept. 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------------ | -------------- | ---------------------------------------------------------------------------- | ----------- | 11 | | natural_key | column_list | Columns forming the grain at which to remove duplicates | | 12 | | order_col | column_list | Columns by which to order the result set, such that the first result is kept | | 13 | | order_method | sort_direction | Sets the order behavior for the chosen `order_col`. Can be ASC or DESC. 
| | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.remove_duplicates( 22 | natural_key=["FIPS", "DS_WEATHER_ICON", "DATE"], 23 | order_col=["DATE", "FIPS"], 24 | order_method="asc" 25 | ) 26 | ds2.preview() 27 | 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/remove_duplicates/remove_duplicates.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /docs/drop_columns.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # drop_columns 4 | 5 | Drop columns by passing either an include_cols list of columns to include or an exclude_cols list of columns to exclude. 6 | 7 | Passing both include_cols and exclude_cols will result in an error. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ------------ | ----------- | --------------------------------------------------------------------------------------------------------------- | ----------- | 14 | | include_cols | column_list | A list of the columns from the dataset you want to keep. | True | 15 | | exclude_cols | column_list | A list of the columns from the dataset you want to drop. Any columns not in the exclude_cols list will be kept. 
| True | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2a = ds.drop_columns(include_cols=["DS_WEATHER_ICON", "DS_DAILY_HIGH_TEMP"]) 24 | ds2a.preview() 25 | 26 | ds2b = ds.drop_columns(exclude_cols=["DS_CLOUD_COVER", "DS_TOTAL_RAINFALL"]) 27 | ds2b.preview() 28 | 29 | ``` 30 | 31 | ## Source Code 32 | 33 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/drop_columns/drop_columns.sql" %} 34 | 35 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_RANK AS ( 2 | SELECT *, ROW_NUMBER() OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ASC) AS RANK_{{ order_col }} 3 | FROM {{ source_table }} 4 | ) , 5 | CTE_WINDOW AS ( 6 | SELECT A.{{ partition_col }}, A.RANK_{{ order_col }}, 7 | ARRAY_AGG(ARRAY_CONSTRUCT(B.{{ value_col }}, B.RANK_{{ order_col }})) ARRAY_AGG_OBJ 8 | FROM CTE_RANK A 9 | JOIN CTE_RANK B 10 | ON A.{{ partition_col }}=B.{{ partition_col }} 11 | AND A.RANK_{{ order_col }} BETWEEN B.RANK_{{ order_col }} AND B.RANK_{{ order_col }}+{{ window }} 12 | GROUP BY A.{{ partition_col }}, A.RANK_{{ order_col }} 13 | ), 14 | CTE_SLOPE AS 15 | ( 16 | SELECT {{ partition_col }}, RANK_{{ order_col }} 17 | , regr_slope(X.VALUE[0], X.VALUE[1]) AS {{ value_col }}_SLOPE_{{ window }} 18 | FROM CTE_WINDOW, table(flatten(ARRAY_AGG_OBJ)) X 19 | GROUP BY {{ partition_col }}, RANK_{{ order_col }} 20 | ), 21 | CTE_RESULT AS 22 | ( 23 | SELECT A.{{ partition_col }}, A.{{ order_col }}, B.{{ value_col }}_SLOPE_{{ window }} 24 | FROM CTE_RANK A 25 | INNER JOIN CTE_SLOPE B 26 | ON A.{{ partition_col }} = B.{{ partition_col }} 27 | AND A.RANK_{{ order_col }} = B.RANK_{{ order_col }} 28 | ) 29 | SELECT A.*, B.{{ value_col }}_SLOPE_{{ window }} 30 | FROM {{ 
source_table }} A 31 | LEFT OUTER JOIN CTE_RESULT B 32 | ON A.{{ partition_col }} = B.{{ partition_col }} 33 | AND A.{{ order_col }} = B.{{ order_col }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sliding_slope/snowflake/sliding_slope.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_RANK AS ( 2 | SELECT *, ROW_NUMBER() OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ASC) AS RANK_{{ order_col }} 3 | FROM {{ source_table }} 4 | ) , 5 | CTE_WINDOW AS ( 6 | SELECT A.{{ partition_col }}, A.RANK_{{ order_col }}, 7 | ARRAY_AGG(ARRAY_CONSTRUCT(B.{{ value_col }}, B.RANK_{{ order_col }})) ARRAY_AGG_OBJ 8 | FROM CTE_RANK A 9 | JOIN CTE_RANK B 10 | ON A.{{ partition_col }}=B.{{ partition_col }} 11 | AND A.RANK_{{ order_col }} BETWEEN B.RANK_{{ order_col }} AND B.RANK_{{ order_col }}+{{ window }} 12 | GROUP BY A.{{ partition_col }}, A.RANK_{{ order_col }} 13 | ), 14 | CTE_SLOPE AS 15 | ( 16 | SELECT {{ partition_col }}, RANK_{{ order_col }} 17 | , regr_slope(X.VALUE[0], X.VALUE[1]) AS {{ value_col }}_SLOPE_{{ window }} 18 | FROM CTE_WINDOW, table(flatten(ARRAY_AGG_OBJ)) X 19 | GROUP BY {{ partition_col }}, RANK_{{ order_col }} 20 | ), 21 | CTE_RESULT AS 22 | ( 23 | SELECT A.{{ partition_col }}, A.{{ order_col }}, B.{{ value_col }}_SLOPE_{{ window }} 24 | FROM CTE_RANK A 25 | INNER JOIN CTE_SLOPE B 26 | ON A.{{ partition_col }} = B.{{ partition_col }} 27 | AND A.RANK_{{ order_col }} = B.RANK_{{ order_col }} 28 | ) 29 | SELECT A.*, B.{{ value_col }}_SLOPE_{{ window }} 30 | FROM {{ source_table }} A 31 | LEFT OUTER JOIN CTE_RESULT B 32 | ON A.{{ partition_col }} = B.{{ partition_col }} 33 | AND A.{{ order_col }} = B.{{ order_col }} -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/aggregate_string/aggregate_string.yaml: 
-------------------------------------------------------------------------------- 1 | name: aggregate_string 2 | tags: 3 | - table 4 | - reshape 5 | - aggregate 6 | - natural_language_processing 7 | description: | 8 | Aggregate strings across rows by concatenating them together, and grouping by other columns. 9 | 10 | Uses a text separator to aggregate the string values together, and returns a single column where the rows are the aggregated strings. 11 | arguments: 12 | agg_columns: 13 | type: column_list 14 | description: Columns with string values to aggregate 15 | sep: 16 | type: value 17 | description: Text separator to use when aggregating the strings, i.e. ', '. 18 | group_by: 19 | type: column_list 20 | description: Columns to group by when applying the aggregation. 21 | distinct: 22 | type: boolean 23 | description: If you want to collapse multiple rows of the same string value into a single distinct value, use TRUE. Otherwise, use FALSE. 24 | order: 25 | type: sort_direction 26 | description: ASC or DESC, to set the alphabetical order of the agg_column when aggregating it 27 | example_code: | 28 | product = rasgo.get.dataset(75) 29 | 30 | ds2 = product.aggregate_string(group_by=['PRODUCTLINE'], 31 | agg_columns=['PRODUCTKEY', 'ENGLISHPRODUCTNAME'], 32 | sep=', ', 33 | distinct='FALSE', 34 | order='ASC') 35 | ds2.preview() -------------------------------------------------------------------------------- /docs/union.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # union 4 | 5 | Performs a SQL UNION or UNION ALL for the parent dataset, and another dataset. Operation will only merge columns with matching columns names in both datasets and drop all other columns. Column data type validation does not happen. 
6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | dataset2 | table | Dataset to Union/Union All with main dataset | | 12 | | keep_dupes | boolean | Set to True to performn a UNION ALL between the two tables, which keeps rows that are duplicated. Set to False to eliminate duplicate rows. | True | 13 | 14 | 15 | ## Example 16 | 17 | ```python 18 | d1 = rasgo.get.dataset(dataset_id) 19 | d2 = rasgo.get.dataset(dataset_id_2) 20 | 21 | ds2 = d1.transform.union( 22 | dataset2=d2, 23 | keep_dupes=True 24 | ) 25 | 26 | ds2.preview() 27 | ``` 28 | 29 | ## Source Code 30 | 31 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/union/union.sql" %} 32 | 33 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/reshape/snowflake/reshape.sql: -------------------------------------------------------------------------------- 1 | {% if method|lower == 'pivot' -%} 2 | {%- set distinct_val_query -%} 3 | select distinct {{ columns }} 4 | from {{ source_table }} 5 | limit 1000 6 | {%- endset -%} 7 | 8 | {%- if list_of_vals is not defined -%} 9 | {%- set results = run_query(distinct_val_query) -%} 10 | {%- set distinct_vals = results[results.columns[0]].to_list() -%} 11 | {%- else -%} 12 | {%- set distinct_vals = list_of_vals -%} 13 | {%- endif -%} 14 | 15 | {# Jinja Macro to get the comma separated cleansed name list #} 16 | {%- macro get_values(distinct_values) -%} 17 | {%- for val in distinct_vals -%} 18 | {{ cleanse_name(val) }}{{ ', ' if not loop.last else '' }} 19 | {%- endfor -%} 20 | {%- endmacro -%} 21 | 22 | 23 | SELECT {{ dimensions | join(", ") }}{{ ',' if dimensions else ''}} {{ get_values(distinct_vals) }} 24 | FROM ( 
{#
  cast (BigQuery): cast one or more columns to new SQL types.

  overwrite_columns == true  -> replace each cast column in place, keeping all
                                other source columns untouched.
  overwrite_columns != true  -> keep every original column (SELECT *) and
                                append one new column per cast, named
                                <column>_<type> via cleanse_name().

  Type mapping: 'float' -> BigQuery FLOAT64, 'number' -> NUMERIC; any other
  type string is passed through to CAST() verbatim.

  Fix: removed a stray 'f' that trailed the FLOAT64 branch's loop separator
  ({{ ... }}f), which produced invalid SQL such as "AS MY_COL,f".
#}
{%- if overwrite_columns == true -%}

{%- set source_columns = get_columns(source_table) -%}
{%- set untouched_cols = source_columns | reject('in', casts) -%}

SELECT {% for col in untouched_cols %}{{ col }},{% endfor %}
{%- for target_col, type in casts.items() %}
{%- if type|lower == 'float' %}
CAST({{target_col}} AS FLOAT64) AS {{target_col}}{{", " if not loop.last else ""}}
{%- elif type|lower == 'number' %}
CAST({{target_col}} AS NUMERIC) AS {{target_col}}{{", " if not loop.last else ""}}
{%- else %}
CAST({{target_col}} AS {{type}}) AS {{target_col}}{{", " if not loop.last else ""}}
{%- endif %}
{%- endfor %}
FROM {{ source_table }}

{%- else -%}

SELECT *
{%- for target_col, type in casts.items() %}
{%- if type|lower == 'float' %}
,CAST({{target_col}} AS FLOAT64) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}}
{%- elif type|lower == 'number' %}
,CAST({{target_col}} AS NUMERIC) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}}
{%- else %}
,CAST({{target_col}} AS {{type}}) AS {{cleanse_name(target_col)+'_'+cleanse_name(type)}}
{%- endif %}
{%- endfor %}
FROM {{ source_table }}

{%- endif -%}
7 | Given a dataset with a numerator and a denominator, will calculate the raw ratio as numerator / denom, 8 | as well as provide an adjusted ratio that shrinks the ratio towards the observed beta prior. 9 | 10 | This is a simplified version that establishes the priors directly from the data given a min_cutoff count of observations. 11 | 12 | NOTE: your data should already be aggregated before performing this operation. 13 | arguments: 14 | numerator: 15 | type: column 16 | description: | 17 | A column that is pre-aggregated to contain the count of positive cases 18 | denom: 19 | type: column 20 | description: | 21 | A column that is pre-aggregated to contain the count of ALL cases 22 | min_cutoff: 23 | type: int 24 | description: | 25 | Enter a minimum value to limit the denominator when creating the prior estimates. Example: if estimating a batter's hitting percentage, 26 | entering 500 would limit the estimation of the priors to be only for batters with over 500 at-bats. 27 | 28 | example_code: | 29 | ds = rasgo.get.dataset(fqtn="BATTING_AVERAGES") 30 | 31 | ds2 = ds.ratio_with_shrinkage(numerator = 'HITS', 32 | denom = 'AT_BATS', 33 | min_cutoff = 500) 34 | -------------------------------------------------------------------------------- /docs/market_basket.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # market_basket 4 | 5 | Analyze historical transaction contents to understand products that are frequently purchased together. 6 | 7 | This approach uses a transactional table to aggregate each product purchased in a transaction, and then aggregates transactions together to look for common patterns. 
8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | -------------- | ------ | ------------------------------------------------------------------------------------------ | ----------- | 14 | | transaction_id | column | Column identifying a unique event ID (i.e., transaction) for which to aggregate line items | | 15 | | sep | value | Text separator to use when aggregating the strings, i.e. ', ' or '\|'. | | 16 | | agg_column | column | Product ID or description to use when aggregating into transactions | | 17 | 18 | 19 | ## Example 20 | 21 | ```python 22 | sales = rasgo.get.dataset(id) 23 | 24 | ds2 = sales.market_basket(transaction_id='SALESORDERNUMBER', 25 | agg_column='ENGLISHPRODUCTNAME', 26 | sep='|') 27 | ds2.preview() 28 | ``` 29 | 30 | ## Source Code 31 | 32 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/market_basket/market_basket.sql" %} 33 | 34 | -------------------------------------------------------------------------------- /docs/unions.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # union 4 | 5 | Union one or multiple tables with the base table. 6 | Looks at all columns in each table and finds columns in common across all of them to keep in the final table. 7 | 8 | 9 | ## Parameters 10 | 11 | | Name | Type | Description | Is Optional | 12 | | ----------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | ----------- | 13 | | union_tables | table_list | tables to union with the base table | | 14 | | remove_duplicates | boolean | Defaults to False. Set to True to use UNION, which removes duplicate rows. Set to False to use UNION ALL, which keeps rows that are duplicated. 
| True | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | d1 = rasgo.get.dataset(dataset_id) 21 | d2 = rasgo.get.dataset(dataset_id_2) 22 | d3 = rasgo.get.dataset(dataset_id_3) 23 | 24 | union_ds = d1.unions( 25 | union_tables=[d2.fqtn, d3.fqtn] 26 | remove_duplicates=True 27 | ) 28 | 29 | union_ds.preview() 30 | ``` 31 | 32 | ## Source Code 33 | 34 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/unions/unions.sql" %} 35 | 36 | -------------------------------------------------------------------------------- /rasgotransforms/DESCRIPTION.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | # Rasgo Transforms 6 | 7 | Rasgo Transforms provide jinja SQL templates that can be applied to your data using rasgoQL, a pandas-like python package. 8 | - Transforms are equivalent to SQL functions that accept a table or view from your eixsting DataWarehouse and return a SQL string to transform it 9 | - Rasgo has built a starter library of transforms for you to use or fork 10 | - *Coming Soon:* Users will be able to create their own Transforms and add them to a private namespace or contribute to the open-source library 11 | 12 | ## Running Transforms 13 | 14 | Rasgo Transforms can be applied via: 15 | - your Rasgo Feature Store account ([pyrasgo](https://pypi.org/project/pyrasgo/) - Account required) 16 | - the Rasgo open-source package ([rasgoql](https://pypi.org/project/rasgoql/) - totally free). 17 | 18 | ## Package Dependencies 19 | - pyyaml 20 | 21 | 22 | # About Us 23 | Rasgo Transforms are maintained by *[Rasgo](https://rasgoml.com)*. Rasgo's enterprise feature store integrates with your data warehouse to help users build features faster, collaborate with team members, and serve features to models in production. 24 | 25 | 26 | Built for Data Scientists, by Data Scientists 27 | -------------------------------------------------------------------------------- /docs/dropna.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # dropna 4 | 5 | Remove missing values 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ------ | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------- | 11 | | how | value | Method to determine if record is removed, 'any' removes each record with at least one missing value, 'all' removes records only when all values are missing (default = 'any'). 
| True | 12 | | subset | column_list | List of columns to check for missing values. All columns are checked if not defined. | True | 13 | | thresh | int | (Optional) Acts like all, but only requires this number of values to be null to remove a record instead of all. | True | 14 | 15 | 16 | ## Example 17 | 18 | ```python 19 | ds = rasgo.get.dataset(id) 20 | 21 | ds2 = ds.dropna(how='all', subset=['ORDERS', 'SALES']) 22 | ds2.preview() 23 | ``` 24 | 25 | ## Source Code 26 | 27 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/dropna/dropna.sql" %} 28 | 29 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/aggregate/aggregate.py: -------------------------------------------------------------------------------- 1 | NUMERIC_TYPES = [ 2 | 'int', 3 | 'integer', 4 | 'bigint', 5 | 'smallint', 6 | 'number', 7 | 'numeric', 8 | 'float', 9 | 'float4', 10 | 'float8', 11 | 'decimal', 12 | 'double precision', 13 | 'real', 14 | ] 15 | 16 | 17 | def infer_columns(args, source_columns) -> dict: 18 | args = args.copy() 19 | out_cols = {} 20 | for col in args['group_by']: 21 | out_cols[col] = source_columns[col.upper()] 22 | if 'numeric columns' in args['aggregations'].keys(): 23 | for column, column_type in source_columns.items(): 24 | if column not in args['aggregations'].keys() and column_type.lower() in NUMERIC_TYPES: 25 | args['aggregations'].setdefault(column, []).extend(args['aggregations']['numeric columns']) 26 | args['aggregations'].pop('numeric columns') 27 | if 'nonnumeric columns' in args['aggregations'].keys(): 28 | for column, column_type in source_columns.items(): 29 | if column not in args['aggregations'].keys() and column_type.lower() not in NUMERIC_TYPES: 30 | args['aggregations'].setdefault(column, []).extend(args['aggregations']['nonnumeric columns']) 31 | args['aggregations'].pop('nonnumeric columns') 32 | for col in 
args['aggregations'].keys(): 33 | for agg in args['aggregations'][col]: 34 | agg = agg.replace(' ', '') 35 | out_cols[f'{col}_{agg}'] = 'NUMERIC' 36 | return out_cols 37 | -------------------------------------------------------------------------------- /docs/heatmap.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # heatmap 4 | 5 | Generate an x / y heatmap, which uses the number of rows in each x/y bin as a density overlay to a 2-d histogram 6 | 7 | ## Parameters 8 | 9 | | Name | Type | Description | Is Optional | 10 | | ----------- | ----------- | ---------------------------------------------------------------------------------------------------------------------- | ----------- | 11 | | x_axis | column | numeric column to use as the x axis | | 12 | | y_axis | column | numeric column to use as the y axis | | 13 | | filters | filter_list | Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
| True | 14 | | num_buckets | value | max number of buckets to create; defaults to 100 | True | 15 | 16 | 17 | ## Example 18 | 19 | ```python 20 | ds = rasgo.get.dataset(id) 21 | 22 | ds2 = ds.heatmap(x_axis='TEMPERATURE', 23 | y_axis='PRECIPITATION') 24 | ds2.preview() 25 | ``` 26 | 27 | ## Source Code 28 | 29 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/heatmap/heatmap.sql" %} 30 | 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rsi/rsi.sql: -------------------------------------------------------------------------------- 1 | WITH CTE_LAG1 AS ( 2 | SELECT *, 3 | lag({{ value_col }}, 1) over (partition by {{ partition_col }} order by {{ order_col }}) as LAG_{{ value_col }} 4 | from {{ source_table }} 5 | ) , 6 | CTE_DELTA AS ( 7 | SELECT * 8 | , {{ value_col }} - LAG_{{ value_col }} as DELTA 9 | FROM CTE_LAG1 10 | ) , 11 | CTE_GAINLOSS_SPLIT AS ( 12 | SELECT * 13 | , CASE WHEN DELTA > 0 THEN DELTA WHEN DELTA = 0 THEN 0 ELSE 0 END as GAIN 14 | , CASE WHEN DELTA < 0 THEN abs(DELTA) WHEN DELTA = 0 THEN 0 ELSE 0 END as LOSS 15 | FROM CTE_DELTA 16 | ) , 17 | CTE_MOVINGAVG AS ( 18 | SELECT * 19 | , avg(GAIN) OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ROWS BETWEEN {{ window - 1 }} PRECEDING AND CURRENT ROW) AS AVG_GAIN_{{ window }} 20 | , avg(LOSS) OVER(PARTITION BY {{ partition_col }} ORDER BY {{ order_col }} ROWS BETWEEN {{ window - 1 }} PRECEDING AND CURRENT ROW) AS AVG_LOSS_{{ window }} 21 | FROM CTE_GAINLOSS_SPLIT 22 | ) , 23 | CTE_RSI AS ( 24 | SELECT * 25 | , CASE WHEN AVG_LOSS_{{ window }}=0 THEN 100 ELSE 100 - (100 / (1+(AVG_GAIN_{{ window }} / AVG_LOSS_{{ window }}))) END as {{ value_col }}_RSI_{{ window }} 26 | FROM CTE_MOVINGAVG 27 | ) , 28 | CTE_FINAL AS ( 29 | SELECT {{ order_col }}, {{ partition_col }}, {{ value_col }}_RSI_{{ window }} 30 | FROM CTE_RSI 31 | ) 32 | SELECT A.*, B.{{ 
value_col }}_RSI_{{ window }} 33 | FROM {{ source_table }} A 34 | INNER JOIN CTE_FINAL B 35 | ON A.{{ partition_col }} = B.{{ partition_col }} 36 | AND A.{{ order_col }} = B.{{ order_col }} 37 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/sliding_slope/sliding_slope.yaml: -------------------------------------------------------------------------------- 1 | name: sliding_slope 2 | tags: 3 | - table 4 | - time_series 5 | description: | 6 | Calculates the linear slope on a given row, looking backwards for a user-defined window of periods. 7 | 8 | Pass in a partition_col, an order_col, and a lookback window size. 9 | 10 | NOTE: Your data should be a properly formatted timeseries dataset before applying this transformation. In other words, each period should only appear once, and periods considered zero should be imputed with 0 already. 11 | NOTE: Slope calculations are notoriously sensitive to large outliers, especially with smaller windows. 12 | 13 | Example use case: On daily stock data, calculate SLOPE by TICKER, with a 14-period lookback window. 14 | arguments: 15 | partition_col: 16 | type: column 17 | description: | 18 | Grouping column to calculate the slope within. 19 | order_col: 20 | type: column 21 | description: Column to order rows by when calculating the agg window. Slope automatically sorts ascending. 22 | value_col: 23 | type: column 24 | description: Column to calulate slope for. 25 | window: 26 | type: int 27 | description: | 28 | Number of periods to use as a lookback period, to calculate slope. 
29 | example_code: | 30 | ds = rasgo.get.dataset(fqtn="RASGOCOMMUNITY.PUBLIC.ZEPL_DAILY_STOCK_FEATURES") 31 | 32 | ds2 = ds.sliding_slope(partition_col = 'TICKER', 33 | order_col = 'DATE', 34 | value_col = 'CLOSE', 35 | window = 14) 36 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/encode_values/encode_values.yaml: -------------------------------------------------------------------------------- 1 | name: label_encode 2 | tags: 3 | - column 4 | - feature_engineering 5 | description: | 6 | Encodes values in a column through a variety of methods: 7 | 8 | Label Encoding: 9 | Encode target labels with value between 0 and n_classes-1. See scikit-learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder) for full documentation. 10 | 11 | Target Encoding: 12 | Encode a categorical column with the average value of a target column for the corresponding value of the categorical column. 13 | See scikit-learn's [TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html) for full documentation. 14 | 15 | One Hot Encoding: 16 | Encode a categorical column as a 0 or 1 for each possible category, each of which will be it's own row. 17 | 18 | 19 | arguments: 20 | method: 21 | type: string 22 | description: Encoding method which will be used ('label', 'target', or 'oh') 23 | column: 24 | type: column 25 | description: Column name to label encode 26 | target: 27 | type: column 28 | description: Required if method = 'target'. 
Numeric target column to use to create averages 29 | is_optional: true 30 | 31 | example_code: | 32 | ds = rasgo.get.dataset(id) 33 | 34 | ds2 = ds.label_encode(column='WEATHER_DESCRIPTION', method='oh') 35 | ds2.preview() 36 | 37 | ds3 = ds.target_encode(column='WEATHER_DESCRIPTION', target='DAILY_HIGH_TEMP') 38 | ds3.preview() -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/vlookup/vlookup.yaml: -------------------------------------------------------------------------------- 1 | name: vlookup 2 | tags: 3 | - table 4 | - join 5 | - reshape 6 | description: | 7 | ## Inspired by Excel... a VLookup experience that works in SQL 8 | 9 | ### Required Inputs 10 | - Lookup Column: The column to look up in the Lookup Table. Make sure the column is named the same in both tables. 11 | - Lookup Table: The table to look up the Lookup Column in. 12 | 13 | ### Optional Inputs 14 | - Keep Columns: The columns to keep from the Lookup Table. If not provided, all columns from the Lookup Table will be kept. 
15 | 16 | ### Notes 17 | - For values that don't find a match in the lookup_column, you will see Null 18 | - For columns that have the same name in both tables, the columns in the Lookup Table will be prefixed with the table name 19 | 20 | arguments: 21 | lookup_column: 22 | type: column 23 | description: | 24 | Column to look up in the lookup table 25 | lookup_table: 26 | type: table 27 | description: | 28 | Table to look up the lookup_column in 29 | keep_columns: 30 | type: column_list 31 | description: | 32 | Columns to keep from the lookup table 33 | is_optional: true 34 | context: 35 | tableArg: lookup_table 36 | example_code: | 37 | internet_sales = rasgo.get.dataset(74) 38 | customer = rasgo.get.dataset(55) 39 | product = rasgo.get.dataset(75) 40 | 41 | ds2 = internet_sales.vlookup( 42 | lookup_column='PRODUCTKEY', 43 | lookup_table=product.fqtn, 44 | keep_columns=['WEIGHT', 'ENGLISHDESCRIPTION'] 45 | ) 46 | ds2.preview() -------------------------------------------------------------------------------- /.github/workflows/publish_accelerators.yaml: -------------------------------------------------------------------------------- 1 | # Publish all Accelerators in this Repo Production and Staging Environments 2 | # Run job each time something is pushed/committed to remote 'main' branch 3 | name: Publish Accelerators 4 | 5 | on: 6 | workflow_dispatch: 7 | inputs: 8 | environment: 9 | description: 'Environment that accelerators will be published to' 10 | default: 'all' 11 | required: true 12 | 13 | jobs: 14 | publish-accelerators: 15 | runs-on: ubuntu-latest 16 | defaults: 17 | run: 18 | shell: bash 19 | 20 | container: 21 | image: "python:3.7" 22 | 23 | env: 24 | PYTHONPATH: /__w/RasgoTransforms/RasgoTransforms 25 | RASGO_COMMUNITY_API_KEY: ${{ secrets.RASGO_COMMUNITY_API_KEY }} 26 | RASGO_STAGING_COMMUNITY_API_KEY: ${{ secrets.RASGO_STAGING_COMMUNITY_API_KEY }} 27 | 28 | steps: 29 | - uses: actions/checkout@v2 30 | 31 | - name: Install Python Requirements 32 | 
run: | 33 | python -m pip install --upgrade pip 34 | pip install -r python/requirements.txt 35 | 36 | - name: Publish Accelerators on Prod 37 | if: github.event.inputs.environment == 'production' || github.event.inputs.environment == 'all' 38 | run: python python/publish_accelerators.py "$RASGO_COMMUNITY_API_KEY" -d production 39 | 40 | - name: Publish Accelerators on Staging 41 | if: github.event.inputs.environment == 'staging' || github.event.inputs.environment == 'all' 42 | run: python python/publish_accelerators.py "$RASGO_STAGING_COMMUNITY_API_KEY" -d staging 43 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/bin/bin.yaml: -------------------------------------------------------------------------------- 1 | name: bin 2 | tags: 3 | - column 4 | - aggregate 5 | - feature_engineering 6 | description: | 7 | This function will categorize or bin an input column such that for N bins, an output column is created with values `[1-N]` where each value represents some bin. 8 | 9 | This transformation supports two binning methods (called "binning_type" in the arguments): `ntile` and `equalwidth`. 10 | 11 | ## N-tile 12 | When using `ntile` binnint the boundaries for the bins are calculated such that each bin will receive an almost equal number of elements. It will create a new column called {{column}}_{{bin_count}}_NTB. This ensures that multiple equal-weight binning operations will produce column names that don't overlap. 13 | 14 | ## Equal Width 15 | The `equalwidth` method will calculate the boundaries of the bins such that they will be of equal width based on the min and max value within the source column. This transformation will create a new column called {{column}}_{{bin_count}}_EWB. This ensures that multiple equal-weight binning operations will produce column names that don't overlap. 
16 | arguments: 17 | type: 18 | type: string 19 | description: binning algorithm to use; must be `ntile` or `equalwidth` 20 | bin_count: 21 | type: int 22 | description: the number of bins to use 23 | column: 24 | type: column 25 | description: which column to bucket 26 | example_code: | 27 | ds = rasgo.get.dataset(id) 28 | 29 | ds2 = ds.bin(type='equalwidth', bin_count=6, column='DAILY_HIGH_TEMP') 30 | ds2.preview() 31 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/rolling_agg/rolling_agg.yaml: -------------------------------------------------------------------------------- 1 | name: rolling_agg 2 | tags: 3 | - table 4 | - aggregate 5 | - date_time 6 | description: | 7 | Row-based; Calculates a rolling aggregate based on a relative row window. 8 | 9 | Pass in order_by columns and offsets to create row-based look-back or look-forward windows. 10 | 11 | Example use case: Aggregate the last 10 sales for a customer regardless of when they occurred. 12 | arguments: 13 | aggregations: 14 | type: agg_dict 15 | description: | 16 | Dictionary of columns and aggregate functions to apply. 17 | A column can have a list of multiple aggregates applied. 18 | One column will be created for each column:aggregate pair. 19 | order_by: 20 | type: column_list 21 | description: Column(s) to order rows by when calculating the agg window 22 | offsets: 23 | type: int_list 24 | description: | 25 | List of numeric values to offset the rows in the window. 26 | Positive values apply a look-back window. 27 | Negative values apply a look-forward window. 28 | One column will be created for each offset value. 
29 | group_by: 30 | type: column_list 31 | description: Column(s) to group by when calculating the agg window 32 | is_optional: True 33 | example_code: | 34 | internet_sales = rasgo.get.dataset(74) 35 | 36 | ds = internet_sales.rolling_agg( 37 | aggregations={ 38 | 'SALESAMOUNT':['MAX', 'MIN', 'SUM'] 39 | }, 40 | order_by=['ORDERDATE'], 41 | offsets=[-7, 7, 14], 42 | group_by=['PRODUCTKEY'], 43 | ) 44 | -------------------------------------------------------------------------------- /docs/train_test_split.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # train_test_split 4 | 5 | Label rows as part of the train or test set based off of percentage split you want to apply to the data. 6 | 7 | If you want a row-wise random sample applied, do not pass an order_by column. If you want an ordered split, then pass the order_by column. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | 14 | | order_by | column_list | Optional argument that affects the train/test split method applied. if needed, pass the names of column(s) you want to order by when applying the split. | True | 15 | | train_percent | int | Percent of the data you want in the train set, expressed as a decimal (i.e. .8). The rest of the rows will be included in the test set. 
| | 16 | 17 | 18 | ## Example 19 | 20 | ```python 21 | ds = rasgo.get.dataset(id) 22 | 23 | ds2 = ds.train_test_split(order_by = ['DATE'], 24 | train_percent = 0.8) 25 | ds2.preview() 26 | 27 | ds2b = ds.train_test_split(train_percent = 0.8) 28 | ds2b.preview() 29 | ``` 30 | 31 | ## Source Code 32 | 33 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/train_test_split/train_test_split.sql" %} 34 | 35 | -------------------------------------------------------------------------------- /rasgotransforms/rasgotransforms/transforms/join/join.yaml: -------------------------------------------------------------------------------- 1 | name: join 2 | tags: 3 | - table 4 | - join 5 | - reshape 6 | description: | 7 | Join a dataset with another dataset, by matching on one or more columns between the two tables. 8 | 9 | If you pass a join_prefix, all column names in the join table will be named "{join_prefix}_{columnname}". 10 | If you don't pass a join_prefix, columns that share the same name in both tables will only have the column from the base table included in the final output. 11 | arguments: 12 | join_table: 13 | type: table 14 | description: Dataset object to join with the source dataset. 15 | join_type: 16 | type: join_type 17 | description: LEFT, RIGHT, or INNER 18 | join_columns: 19 | type: join_dict 20 | description: Columns to use for the join. Keys are columns in the source_table and values are columns in the join_table. 21 | join_prefix: 22 | type: value 23 | is_optional: true 24 | description: Prefix all columns in the join_table with a string to differentiate them 25 | filters: 26 | type: filter_list 27 | description: Filter logic on one or more columns. Can choose between a simple comparison filter or advanced filter using free text. 
28 | is_optional: true 29 | example_code: | 30 | internet_sales = rasgo.get.dataset(74) 31 | product = rasgo.get.dataset(75) 32 | 33 | ds2 = internet_sales.join( 34 | join_table=product, 35 | join_columns={'PRODUCTKEY':'PRODUCTKEY'}, 36 | join_type='LEFT', 37 | join_prefix='product', 38 | filters=['CUSTOMERKEY IS NOT NULL', 'ORDERDATE < CURRENT_DATE()']) 39 | 40 | ds2.preview() -------------------------------------------------------------------------------- /docs/conditional_agg.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # conditional_agg 4 | 5 | Pass in a list of filter rules, and aggregate rows that match. 6 | 7 | If multiple rules are passed, they are combined and aggregated both together and separately. 8 | 9 | 10 | ## Parameters 11 | 12 | | Name | Type | Description | Is Optional | 13 | | ---------- | ---------- | -------------------------------------------------------------------- | ----------- | 14 | | rules | value_list | List of filter rules to use | | 15 | | agg_column | column | Column to aggregate | | 16 | | agg | agg | Method to use when aggregating the agg_column | | 17 | | distinct | boolean | When aggregating the agg_column, use TRUE to qualify with a DISTINCT | | 18 | 19 | 20 | ## Example 21 | 22 | ```python 23 | customer = rasgo.get.dataset(55) 24 | 25 | rules = [ 26 | "FIRSTNAME LIKE 'J%'", 27 | "BIRTHDATE < '1970-01-01'", 28 | "ENGLISHEDUCATION = 'Bachelors'", 29 | "MARITALSTATUS = 'M'", 30 | "GENDER='F'"] 31 | 32 | ds2 = customer.conditional_agg(rules=rules, 33 | agg_column='CUSTOMERKEY', 34 | agg='COUNT', 35 | distinct=True) 36 | ds2.preview() 37 | ``` 38 | 39 | ## Source Code 40 | 41 | {% embed url="https://github.com/rasgointelligence/RasgoTransforms/blob/main/rasgotransforms/rasgotransforms/transforms/conditional_agg/conditional_agg.sql" %} 42 | 43 | --------------------------------------------------------------------------------