├── sample-data
│   ├── Readme.md
│   └── Industrial production total index.csv
├── .gitignore
├── notebooks
│   ├── custom_calendar
│   │   ├── workflow.png
│   │   └── analyze_trade_and_quote_data.ipynb
│   ├── technical_indicators
│   │   ├── finspace_logo.png
│   │   └── technical-indicators-all.ipynb
│   ├── analyze_trade_and_quote_data
│   │   └── workflow.png
│   ├── collect_timebars_and_summarize
│   │   ├── workflow.png
│   │   ├── finspace_logo.png
│   │   └── collect-timebars-summarize.ipynb
│   ├── compute_and_plot_volatility_from_taq
│   │   ├── workflow.png
│   │   ├── finspace_logo.png
│   │   └── plot-volatility.ipynb
│   ├── Utilities
│   │   └── finspace_spark.py
│   ├── cluster_management
│   │   └── ClusterManagement.ipynb
│   ├── s3_import
│   │   └── s3_import.ipynb
│   └── third_party_apis
│       ├── yfinance_import.ipynb
│       └── polygon_import.ipynb
├── webinars
│   └── snowflake_2021-09
│       ├── workflow.png
│       ├── finspace_logo.png
│       ├── README.md
│       └── finspace_spark.py
├── blogs
│   └── finspace_redshift-2021-09
│       ├── finspace-redshift-import.png
│       ├── finspace-redshift-analysis.png
│       ├── finspace_redshift.sql
│       ├── README.md
│       └── finspace_spark.py
├── CODE_OF_CONDUCT.md
├── LICENSE
├── CONTRIBUTING.md
└── README.md
/sample-data/Readme.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | **/.DS_Store
3 | .ipynb_checkpoints
4 | **/.ipynb_checkpoints
5 | .idea
6 |
--------------------------------------------------------------------------------
/notebooks/custom_calendar/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/custom_calendar/workflow.png
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/webinars/snowflake_2021-09/workflow.png
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/webinars/snowflake_2021-09/finspace_logo.png
--------------------------------------------------------------------------------
/notebooks/technical_indicators/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/technical_indicators/finspace_logo.png
--------------------------------------------------------------------------------
/notebooks/analyze_trade_and_quote_data/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/analyze_trade_and_quote_data/workflow.png
--------------------------------------------------------------------------------
/notebooks/collect_timebars_and_summarize/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/collect_timebars_and_summarize/workflow.png
--------------------------------------------------------------------------------
/notebooks/collect_timebars_and_summarize/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/collect_timebars_and_summarize/finspace_logo.png
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace-redshift-import.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/blogs/finspace_redshift-2021-09/finspace-redshift-import.png
--------------------------------------------------------------------------------
/notebooks/compute_and_plot_volatility_from_taq/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/compute_and_plot_volatility_from_taq/workflow.png
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace-redshift-analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/blogs/finspace_redshift-2021-09/finspace-redshift-analysis.png
--------------------------------------------------------------------------------
/notebooks/compute_and_plot_volatility_from_taq/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/compute_and_plot_volatility_from_taq/finspace_logo.png
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
16 |
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace_redshift.sql:
--------------------------------------------------------------------------------
1 | -- This code assumes the name of the database is 'dev'. If your database has a different name, update the code below.
2 |
3 | CREATE SCHEMA trading_data;
4 |
5 | CREATE TABLE dev.trading_data.trade_history(
6 | trans_id varchar,
7 | ticker varchar,
8 | price decimal(8,2),
9 | quantity bigint,
10 | trans_type varchar,
11 | trans_date date);
12 |
13 |
14 | INSERT into dev.trading_data.trade_history values
15 | ('154644', 'AMZN', '1876.02', '190', 'P', '2020-01-02'),
16 | ('154699', 'AMZN', '1877.98', '268', 'P', '2020-01-02'),
17 | ('156655', 'AMZN', '1870.00', '100', 'P', '2020-01-02'),
18 | ('156656', 'AMZN', '1876.02', '100', 'P', '2020-01-02'),
19 | ('156849', 'AMZN', '1865.65', '187', 'P', '2020-01-02'),
20 | ('166894', 'AMZN', '1897.67', '100', 'P', '2020-01-02'),
21 | ('166905', 'AMZN', '1897.89', '200', 'S', '2020-01-02');
22 |
23 | COMMENT on table
24 | dev.trading_data.trade_history is 'Table contains a list of all buy and sell transactions across the organization';
25 | COMMENT on column
26 | dev.trading_data.trade_history.trans_id is 'Unique transaction ID';
27 | COMMENT on column
28 | dev.trading_data.trade_history.ticker is 'Stock ticker';
29 | COMMENT on column
30 | dev.trading_data.trade_history.price is 'Purchase or sale price';
31 | COMMENT on column
32 | dev.trading_data.trade_history.quantity is 'Purchase or sale quantity';
33 | COMMENT on column
34 | dev.trading_data.trade_history.trans_type is 'Transaction type (P=purchase, S=sale)';
35 | COMMENT on column
36 | dev.trading_data.trade_history.trans_date is 'Purchase or sale date';
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/README.md:
--------------------------------------------------------------------------------
1 | # Amazon FinSpace Examples
2 |
3 | ## Connecting to Amazon Redshift from FinSpace
4 |
5 | This repository contains example notebooks that show how to connect to an Amazon Redshift cluster from Amazon FinSpace using a JDBC driver.
6 |
7 | ### Import tables and metadata
8 |
9 | The [Import notebook](redshift_in_finspace_import.ipynb) shows how to create datasets in FinSpace based on table metadata in Amazon Redshift.
10 | It covers the following steps:
11 | 1. Create a category and an attribute set in FinSpace.
12 | 2. Connect to Amazon Redshift from the FinSpace Jupyter notebook using JDBC (a minimal connection sketch is shown below the workflow diagram).
13 | 3. Create a dataset in Amazon FinSpace for each table in Amazon Redshift. Add a description, owner, and attributes to each dataset to help with data discovery and access control.
14 |
15 | 
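As a rough illustration of step 2, a JDBC read from a FinSpace Spark notebook might look like the sketch below. The endpoint, credentials, and table name are placeholders rather than values taken from the notebook, and the Redshift JDBC driver must be available on the Spark cluster:

```python
from pyspark.sql import SparkSession

# In FinSpace notebooks a Spark session is already provided; getOrCreate() reuses it.
spark = SparkSession.builder.getOrCreate()

# Placeholder connection details -- replace with your cluster endpoint and credentials.
redshift_url = "jdbc:redshift://<cluster-endpoint>:5439/dev"

trades = (spark.read.format("jdbc")
          .option("url", redshift_url)
          .option("driver", "com.amazon.redshift.jdbc42.Driver")
          .option("dbtable", "trading_data.trade_history")
          .option("user", "<username>")
          .option("password", "<password>")
          .load())

trades.printSchema()  # the table metadata can then be used when creating the FinSpace dataset
```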
16 |
17 | ### Analyze data
18 |
19 | Once you have created the datasets in FinSpace, you can use the attached attribute set to connect to the Amazon Redshift cluster directly from FinSpace without specifying the database, schema, or table name.
20 | The [analysis notebook](redshift_in_finspace_analysis.ipynb) shows how to:
21 | 1. Connect to Amazon Redshift using the attribute set from FinSpace.
22 | 2. Use data from both Amazon FinSpace and Amazon Redshift to evaluate trade performance based on the daily price of AMZN stock (a toy version of this comparison is sketched below the diagram).
23 |
24 | 
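As a toy illustration of that comparison (not the notebook's actual code), the sketch below joins trades with daily prices in Spark; the `daily_prices` columns and the close-price values are assumptions:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Stand-ins for the two sources: `trades` would normally be read from Redshift over JDBC
# (see the import sketch above) and `daily_prices` from a FinSpace dataset view.
trades = spark.createDataFrame(
    [("154644", "AMZN", 1876.02, 190, "P", "2020-01-02"),
     ("166905", "AMZN", 1897.89, 200, "S", "2020-01-02")],
    ["trans_id", "ticker", "price", "quantity", "trans_type", "trans_date"])
daily_prices = spark.createDataFrame(
    [("AMZN", "2020-01-02", 1898.01)], ["ticker", "date", "close"])

# Compare each execution price with that day's close to gauge trade performance.
perf = (trades
        .join(daily_prices, (trades.ticker == daily_prices.ticker) &
                            (trades.trans_date == daily_prices.date))
        .withColumn("slippage_vs_close", trades.price - daily_prices.close))

perf.groupBy(trades.ticker, "trans_type") \
    .agg(F.avg("slippage_vs_close").alias("avg_slippage"),
         F.sum("quantity").alias("shares")).show()
```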
25 |
26 | ## FAQ
27 |
28 | **What do I need in order to get started?**
29 | - Set up a FinSpace environment by following the [“Create an Amazon FinSpace Environment”](https://docs.aws.amazon.com/finspace/latest/userguide/create-an-amazon-finspace-environment.html) guide
30 | - Install the Capital Markets sample data bundle, as explained in the [“Sample Data Bundle”](https://docs.aws.amazon.com/finspace/latest/userguide/sample-data-bundle.html) guide
31 | - Ensure you have permissions to ‘Manage Categories and Controlled Vocabularies’ and ‘Manage Attribute Sets’ in FinSpace
32 | - Create an Amazon Redshift cluster in the same AWS account as the FinSpace environment
33 | - Follow the [“Create Cluster Guide”](https://docs.aws.amazon.com/redshift/latest/dg/tutorial-loading-data-launch-cluster.html) to get started
34 | - Create a superuser and ensure the cluster is publicly accessible by following [this guide](https://aws.amazon.com/premiumsupport/knowledge-center/redshift-cluster-private-public/)
35 | - Create a table in Amazon Redshift and insert trading transaction data using [these SQL queries](finspace_redshift.sql)
36 |
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/README.md:
--------------------------------------------------------------------------------
1 | # Financial Services Data Summit
2 | ## Session
3 | [Making Financial Data More Accessible in the Cloud](https://www.snowflake.com/financial-services-data-summit/americas/agenda/?agendaPath=session/615483)
4 | **Date:** Sept 14, 2021
5 |
6 | ### Agenda
7 | Join this session to learn how AWS and Snowflake are innovating to make it easier for financial customers to share, manage, and analyze financial content in the cloud. Leading financial markets operator TP ICAP will showcase its use of Snowflake running on AWS within the Parameta Solutions business to share market data and analytics with clients who use proprietary models written in Python. Then AWS will demonstrate the benefits of incorporating data from Snowflake into Amazon FinSpace, a new service that provides customers with a scalable research environment offering integrated data management, analytics, and governance.
8 |
9 | ## FinSpace Prerequisites
10 | - Capital Markets Sample Data bundle has been installed in the environment
11 | - Category 'Source' contains a sub-category named 'Snowflake'
12 | - Attribute set named 'Snowflake Table Attributes' exists with fields
13 | - Name: Catalog, Type: String
14 | - Name: Schema, Type: String
15 | - Name: Table, Type: String
16 | - Name: Source, Type: Category: Source
17 |
18 | ### snowflake.ini
19 | The notebooks assume that a customer-provided snowflake.ini file exists in the same folder as the notebooks
20 | and contains Snowflake instance information and authentication credentials.
21 |
22 | #### Contents of snowflake.ini
23 | ```
24 | [snowflake]
25 | user: USERNAME
26 | password: PASSWORD
27 | account: ACCOUNT
28 | database: DATABASE
29 | warehouse: WAREHOUSE
30 | ```
31 | Please provide values from your Snowflake installation for USERNAME, PASSWORD, ACCOUNT, DATABASE, and WAREHOUSE.
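For example, the file can be read with Python's configparser; the connection sketch below uses snowflake-connector-python as a minimal connectivity check, whereas the demo notebooks may instead use the Spark-Snowflake connector:

```python
import configparser

import snowflake.connector  # assumes the snowflake-connector-python package is installed

# snowflake.ini uses ':' separators, which configparser accepts by default.
cfg = configparser.ConfigParser()
cfg.read("snowflake.ini")
sf = cfg["snowflake"]

conn = snowflake.connector.connect(
    user=sf["user"],
    password=sf["password"],
    account=sf["account"],
    database=sf["database"],
    warehouse=sf["warehouse"],
)
print(conn.cursor().execute("select current_version()").fetchone())
conn.close()
```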
32 |
33 | ## Code Artifacts
34 | Code artifacts from the demonstration given at the summit
35 |
36 | ### Notebooks
37 | [Delete Datasets](delete_datasets.ipynb) Deletes all datasets with a given classification (Source) and value (Snowflake)
38 | [Snowflake Datasets](snowflake_datasets.ipynb) Shows how to search for and display datasets in FinSpace that reside in Snowflake
39 | [Snowflake Import](snowflake_import.ipynb) Notebook that creates a FinSpace dataset for each table in the given Snowflake database
40 | [Plot Volatility](plot-volatility-snowflake.ipynb) Notebook that plots volatility, then adds and plots events over the volatility plot; presented in the session
41 |
42 | ### Python
43 | [finspace.py](finspace.py) Utility class for working with FinSpace boto3 service API
44 | [finspace_spark.py](finspace_spark.py) Utility class for working with Spark and the FinSpace boto3 service API
45 |
46 | ### Other
47 | [finspace_logo.png](finspace_logo.png) FinSpace logo in notebooks
48 | [workflow.png](workflow.png) FinSpace time-series library workflow image
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Amazon FinSpace Examples
2 | This repository contains example notebooks and Python scripts that show how to work with [Amazon FinSpace](https://aws.amazon.com/finspace/).
3 |
4 | ## Examples
5 |
6 | ### Notebooks: Inside Amazon FinSpace
7 | These notebooks are intended to be run from the FinSpace managed notebook environment.
8 | Each notebook references a dataset found in FinSpace (and some also reference a permission group);
9 | be sure to fill in the empty identifiers for dataset_id, view_id, and basicPermissionGroupId with the values from your
10 | environment installation. All example notebooks assume that the Capital Markets Sample Data bundle was installed
11 | with the FinSpace environment. Some example notebooks make use of utility classes found in the Utilities folder
12 | (e.g. finspace.py and finspace_spark.py); be sure to run the '%load' for those Python files twice, first to load
13 | the file contents into the notebook, and a second time to run the code and push it onto your Spark cluster.
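For example, in a notebook cell (the paths below are illustrative and depend on where you placed the files):

```python
# Run this cell once: %load replaces the cell contents with the file's code.
# Run the resulting cell a second time to execute that code on your Spark cluster.
# Repeat the same two-step process for finspace_spark.py.
%load Utilities/finspace.py
```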
14 |
15 | - [Analyzing petabytes of trade and quote data with Amazon FinSpace](notebooks/analyze_trade_and_quote_data) shows how to use the FinSpace Time Series Library.
16 | - [Cluster Management](notebooks/cluster_management) demonstrates using the cluster management APIs from within a notebook.
17 | - [Collect Timebars and Summarize](notebooks/collect_timebars_and_summarize) demonstrates how to create a summary time-bar dataset and add it to FinSpace.
18 | - [Compute and Plot Volatility from TAQ](notebooks/compute_and_plot_volatility_from_taq) demonstrates how to compute and plot volatility using the FinSpace Time Series Libraries.
19 | - [Technical Indicators](notebooks/technical_indicators) demonstrates the creation of a Spark DataFrame that uses all the FinSpace technical indicators.
20 | - [S3 Import](notebooks/s3_import) shows how to import data from an external (to FinSpace) S3 bucket into a FinSpace dataset.
21 | - [Using Third Party APIs](notebooks/third_party_apis) shows how you can install and use third-party APIs from FinSpace.
22 | - [Custom Calendars](notebooks/custom_calendar) shows how you can create a custom calendar for the time series fill and filter stage.
23 |
24 | ### Python: Helper Code
25 | - [Utility Classes](notebooks/Utilities) facilitates the use of the FinSpace APIs.
26 |
27 | ## Blogs
28 | [Analyze daily trading activity using transaction data from Amazon Redshift in Amazon FinSpace](blogs/finspace_redshift-2021-09)
29 | How to connect Amazon FinSpace to a Redshift cluster, import tables into FinSpace datasets, and pull data from Redshift
30 | tables directly into Spark DataFrames.
31 |
32 | ## Webinars
33 | [Making Financial Data More Accessible in the Cloud](webinars/snowflake_2021-09)
34 | Notebooks used to demonstrate the integration of Snowflake tables with Amazon FinSpace. Presented at the Snowflake Financial
35 | Services Summit, Sept 14, 2021: [Making Financial Data More Accessible in the Cloud](https://www.snowflake.com/financial-services-data-summit/americas/agenda/?agendaPath=session/615483)
36 |
37 | ## FAQ
38 | *How do I contribute my own example notebook?*
39 |
40 | - Although we're extremely excited to receive contributions from the community, we're still working on the best mechanism to take in examples from external sources. Please bear with us in the short term if pull requests take longer than expected or are closed.
41 |
42 | ## License
43 |
44 | This library is licensed under the MIT-0 License. See the LICENSE file.
45 |
46 |
--------------------------------------------------------------------------------
/notebooks/Utilities/finspace_spark.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 | import boto3
4 | from botocore.config import Config
5 |
6 | # FinSpace class with Spark bindings
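# NOTE: this file is intended to be %load-ed into a FinSpace notebook after finspace.py,
# which defines the FinSpace base class referenced below (see the repository README).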
7 |
8 | class SparkFinSpace(FinSpace):
9 | import pyspark
10 | def __init__(
11 | self,
12 | spark: pyspark.sql.session.SparkSession = None,
13 | config = Config(retries = {'max_attempts': 0, 'mode': 'standard'}),
14 | dev_overrides: dict = None
15 | ):
16 | FinSpace.__init__(self, config=config, dev_overrides=dev_overrides)
17 | self.spark = spark # used on Spark cluster for reading views, creating changesets from DataFrames
18 |
19 | def upload_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame):
20 | resp = self.client.get_user_ingestion_info()
21 | upload_location = resp['ingestionPath']
22 | # data_frame.write.option('header', 'true').csv(upload_location)
23 | data_frame.write.parquet(upload_location)
24 | return upload_location
25 |
26 | def ingest_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame, dataset_id: str, change_type: str, wait_for_completion=True):
27 | print("Uploading data...")
28 | upload_location = self.upload_dataframe(data_frame)
29 |
30 | print("Data upload finished. Ingesting data...")
31 |
32 | return self.ingest_from_s3(upload_location, dataset_id, change_type, wait_for_completion, format_type='parquet', format_params={})
33 |
34 | def read_view_as_spark(
35 | self,
36 | dataset_id: str,
37 | view_id: str
38 | ):
39 | # TODO: switch to DescribeMatz when available in HFS
40 | views = self.list_views(dataset_id=dataset_id, max_results=50)
41 | filtered = [v for v in views if v['id'] == view_id]
42 |
43 | if len(filtered) == 0:
44 | raise Exception('No such view found')
45 | if len(filtered) > 1:
46 | raise Exception('Internal Server error')
47 | view = filtered[0]
48 |
49 | # 0. Ensure view is ready to be read
50 | if (view['status'] != 'SUCCESS'):
51 | status = view['status']
52 | print(f'view run status is not ready: {status}. Returning empty.')
53 | return
54 |
55 | glue_db_name = view['destinationTypeProperties']['databaseName']
56 | glue_table_name = view['destinationTypeProperties']['tableName']
57 |
58 | # Query Glue table directly with catalog function of spark
59 | return self.spark.table(f"`{glue_db_name}`.`{glue_table_name}`")
60 |
61 | def get_schema_from_spark(self, data_frame: pyspark.sql.dataframe.DataFrame):
62 | from pyspark.sql.types import StructType
63 |
64 | # for translation to FinSpace's schema
65 | # 'STRING'|'CHAR'|'INTEGER'|'TINYINT'|'SMALLINT'|'BIGINT'|'FLOAT'|'DOUBLE'|'DATE'|'DATETIME'|'BOOLEAN'|'BINARY'
66 | DoubleType = "DOUBLE"
67 | FloatType = "FLOAT"
68 | DateType = "DATE"
69 | StringType = "STRING"
70 | IntegerType = "INTEGER"
71 | LongType = "BIGINT"
72 | BooleanType = "BOOLEAN"
73 | TimestampType = "DATETIME"
74 |
75 | hab_columns = []
76 |
77 | items = [i for i in data_frame.schema]
78 |
79 | switcher = {
80 | "BinaryType" : StringType,
81 | "BooleanType" : BooleanType,
82 | "ByteType" : IntegerType,
83 | "DateType" : DateType,
84 | "DoubleType" : FloatType,
85 | "IntegerType" : IntegerType,
86 | "LongType" : IntegerType,
87 | "NullType" : StringType,
88 | "ShortType" : IntegerType,
89 | "StringType" : StringType,
90 | "TimestampType" : TimestampType,
91 | }
92 |
93 |
94 | for i in items:
95 | # print( f"name: {i.name} type: {i.dataType}" )
96 |
97 | habType = switcher.get( str(i.dataType), StringType)
98 |
99 | hab_columns.append({
100 | "dataType" : habType,
101 | "name" : i.name,
102 | "description" : ""
103 | })
104 |
105 | return( hab_columns )
106 |
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/finspace_spark.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 | import boto3
4 | from botocore.config import Config
5 |
6 | # FinSpace class with Spark bindings
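# NOTE: this file is intended to be %load-ed into a FinSpace notebook after finspace.py,
# which defines the FinSpace base class referenced below (see the repository README).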
7 |
8 | class SparkFinSpace(FinSpace):
9 | import pyspark
10 | def __init__(
11 | self,
12 | spark: pyspark.sql.session.SparkSession = None,
13 | config = Config(retries = {'max_attempts': 0, 'mode': 'standard'}),
14 | dev_overrides: dict = None
15 | ):
16 | FinSpace.__init__(self, config=config, dev_overrides=dev_overrides)
17 | self.spark = spark # used on Spark cluster for reading views, creating changesets from DataFrames
18 |
19 | def upload_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame):
20 | resp = self.client.get_user_ingestion_info()
21 | upload_location = resp['ingestionPath']
22 | # data_frame.write.option('header', 'true').csv(upload_location)
23 | data_frame.write.parquet(upload_location)
24 | return upload_location
25 |
26 | def ingest_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame, dataset_id: str, change_type: str, wait_for_completion=True):
27 | print("Uploading data...")
28 | upload_location = self.upload_dataframe(data_frame)
29 |
30 | print("Data upload finished. Ingesting data...")
31 |
32 | return self.ingest_from_s3(upload_location, dataset_id, change_type, wait_for_completion, format_type='parquet', format_params={})
33 |
34 | def read_view_as_spark(
35 | self,
36 | dataset_id: str,
37 | view_id: str
38 | ):
39 | # TODO: switch to DescribeMatz when available in HFS
40 | views = self.list_views(dataset_id=dataset_id, max_results=50)
41 | filtered = [v for v in views if v['id'] == view_id]
42 |
43 | if len(filtered) == 0:
44 | raise Exception('No such view found')
45 | if len(filtered) > 1:
46 | raise Exception('Internal Server error')
47 | view = filtered[0]
48 |
49 | # 0. Ensure view is ready to be read
50 | if (view['status'] != 'SUCCESS'):
51 | status = view['status']
52 | print(f'view run status is not ready: {status}. Returning empty.')
53 | return
54 |
55 | glue_db_name = view['destinationTypeProperties']['databaseName']
56 | glue_table_name = view['destinationTypeProperties']['tableName']
57 |
58 | # Query Glue table directly with catalog function of spark
59 | return self.spark.table(f"`{glue_db_name}`.`{glue_table_name}`")
60 |
61 | def get_schema_from_spark(self, data_frame: pyspark.sql.dataframe.DataFrame):
62 | from pyspark.sql.types import StructType
63 |
64 | # for translation to FinSpace's schema
65 | # 'STRING'|'CHAR'|'INTEGER'|'TINYINT'|'SMALLINT'|'BIGINT'|'FLOAT'|'DOUBLE'|'DATE'|'DATETIME'|'BOOLEAN'|'BINARY'
66 | DoubleType = "DOUBLE"
67 | FloatType = "FLOAT"
68 | DateType = "DATE"
69 | StringType = "STRING"
70 | IntegerType = "INTEGER"
71 | LongType = "BIGINT"
72 | BooleanType = "BOOLEAN"
73 | TimestampType = "DATETIME"
74 |
75 | hab_columns = []
76 |
77 | items = [i for i in data_frame.schema]
78 |
79 | switcher = {
80 | "BinaryType" : StringType,
81 | "BooleanType" : BooleanType,
82 | "ByteType" : IntegerType,
83 | "DateType" : DateType,
84 | "DoubleType" : FloatType,
85 | "IntegerType" : IntegerType,
86 | "LongType" : IntegerType,
87 | "NullType" : StringType,
88 | "ShortType" : IntegerType,
89 | "StringType" : StringType,
90 | "TimestampType" : TimestampType,
91 | }
92 |
93 |
94 | for i in items:
95 | # print( f"name: {i.name} type: {i.dataType}" )
96 |
97 | habType = switcher.get( str(i.dataType), StringType)
98 |
99 | hab_columns.append({
100 | "dataType" : habType,
101 | "name" : i.name,
102 | "description" : ""
103 | })
104 |
105 | return( hab_columns )
106 |
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace_spark.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 | import boto3
4 | from botocore.config import Config
5 |
6 | # FinSpace class with Spark bindings
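# NOTE: this file is intended to be %load-ed into a FinSpace notebook after finspace.py,
# which defines the FinSpace base class referenced below (see the repository README).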
7 |
8 | class SparkFinSpace(FinSpace):
9 | import pyspark
10 | def __init__(
11 | self,
12 | spark: pyspark.sql.session.SparkSession = None,
13 | config = Config(retries = {'max_attempts': 0, 'mode': 'standard'}),
14 | dev_overrides: dict = None
15 | ):
16 | FinSpace.__init__(self, config=config, dev_overrides=dev_overrides)
17 | self.spark = spark # used on Spark cluster for reading views, creating changesets from DataFrames
18 |
19 | def upload_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame):
20 | resp = self.client.get_user_ingestion_info()
21 | upload_location = resp['ingestionPath']
22 | # data_frame.write.option('header', 'true').csv(upload_location)
23 | data_frame.write.parquet(upload_location)
24 | return upload_location
25 |
26 | def ingest_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame, dataset_id: str, change_type: str, wait_for_completion=True):
27 | print("Uploading data...")
28 | upload_location = self.upload_dataframe(data_frame)
29 |
30 | print("Data upload finished. Ingesting data...")
31 |
32 | return self.ingest_from_s3(upload_location, dataset_id, change_type, wait_for_completion, format_type='parquet', format_params={})
33 |
34 | def read_view_as_spark(
35 | self,
36 | dataset_id: str,
37 | view_id: str
38 | ):
39 | # TODO: switch to DescribeMatz when available in HFS
40 | views = self.list_views(dataset_id=dataset_id, max_results=50)
41 | filtered = [v for v in views if v['id'] == view_id]
42 |
43 | if len(filtered) == 0:
44 | raise Exception('No such view found')
45 | if len(filtered) > 1:
46 | raise Exception('Internal Server error')
47 | view = filtered[0]
48 |
49 | # 0. Ensure view is ready to be read
50 | if (view['status'] != 'SUCCESS'):
51 | status = view['status']
52 | print(f'view run status is not ready: {status}. Returning empty.')
53 | return
54 |
55 | glue_db_name = view['destinationTypeProperties']['databaseName']
56 | glue_table_name = view['destinationTypeProperties']['tableName']
57 |
58 | # Query Glue table directly with catalog function of spark
59 | return self.spark.table(f"`{glue_db_name}`.`{glue_table_name}`")
60 |
61 | def get_schema_from_spark(self, data_frame: pyspark.sql.dataframe.DataFrame):
62 | from pyspark.sql.types import StructType
63 |
64 | # for translation to FinSpace's schema
65 | # 'STRING'|'CHAR'|'INTEGER'|'TINYINT'|'SMALLINT'|'BIGINT'|'FLOAT'|'DOUBLE'|'DATE'|'DATETIME'|'BOOLEAN'|'BINARY'
66 | DoubleType = "DOUBLE"
67 | FloatType = "FLOAT"
68 | DateType = "DATE"
69 | StringType = "STRING"
70 | IntegerType = "INTEGER"
71 | LongType = "BIGINT"
72 | BooleanType = "BOOLEAN"
73 | TimestampType = "DATETIME"
74 |
75 | hab_columns = []
76 |
77 | items = [i for i in data_frame.schema]
78 |
79 | switcher = {
80 | "BinaryType" : StringType,
81 | "BooleanType" : BooleanType,
82 | "ByteType" : IntegerType,
83 | "DateType" : DateType,
84 | "DoubleType" : FloatType,
85 | "IntegerType" : IntegerType,
86 | "LongType" : IntegerType,
87 | "NullType" : StringType,
88 | "ShortType" : IntegerType,
89 | "StringType" : StringType,
90 | "TimestampType" : TimestampType,
91 | }
92 |
93 |
94 | for i in items:
95 | # print( f"name: {i.name} type: {i.dataType}" )
96 |
97 | habType = switcher.get( str(i.dataType), StringType)
98 |
99 | hab_columns.append({
100 | "dataType" : habType,
101 | "name" : i.name,
102 | "description" : ""
103 | })
104 |
105 | return( hab_columns )
106 |
--------------------------------------------------------------------------------
/notebooks/cluster_management/ClusterManagement.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Initialize the Cluster Manager\n",
8 | "The cluster manager works in a local session (not spark) so it can interact with the cluster manager service to manage clusters."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "%local\n",
18 | "from aws.finspace.cluster import FinSpaceClusterManager\n",
19 | "\n",
20 | "# if this was already run, no need to run again\n",
21 | "if 'finspace_clusters' not in globals():\n",
22 | " finspace_clusters = FinSpaceClusterManager()\n",
23 | " finspace_clusters.auto_connect()\n",
24 | "else:\n",
25 | " print(f'connected to cluster: {finspace_clusters.get_connected_cluster_id()}')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 6,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/html": [
36 | "Current session configs: {'files': [], 'jars': [], 'conf': {'spark.pyspark.python': 'python3', 'spark.pyspark.virtualenv.enabled': 'true', 'spark.pyspark.virtualenv.type': 'native', 'spark.pyspark.virtualenv.bin.path': '/usr/bin/virtualenv', 'spark.pyspark.virtualenv.packages': '', 'spark.jars.packages': '', 'spark.jars.repositories': ''}, 'kind': 'pyspark'} "
37 | ],
38 | "text/plain": [
39 | ""
40 | ]
41 | },
42 | "metadata": {},
43 | "output_type": "display_data"
44 | },
45 | {
46 | "data": {
47 | "text/html": [
48 | "