├── sample-data
│   ├── Readme.md
│   └── Industrial production total index.csv
├── .gitignore
├── notebooks
│   ├── custom_calendar
│   │   ├── workflow.png
│   │   └── analyze_trade_and_quote_data.ipynb
│   ├── technical_indicators
│   │   ├── finspace_logo.png
│   │   └── technical-indicators-all.ipynb
│   ├── analyze_trade_and_quote_data
│   │   └── workflow.png
│   ├── collect_timebars_and_summarize
│   │   ├── workflow.png
│   │   ├── finspace_logo.png
│   │   └── collect-timebars-summarize.ipynb
│   ├── compute_and_plot_volatility_from_taq
│   │   ├── workflow.png
│   │   ├── finspace_logo.png
│   │   └── plot-volatility.ipynb
│   ├── Utilities
│   │   └── finspace_spark.py
│   ├── cluster_management
│   │   └── ClusterManagement.ipynb
│   ├── s3_import
│   │   └── s3_import.ipynb
│   └── third_party_apis
│       ├── yfinance_import.ipynb
│       └── polygon_import.ipynb
├── webinars
│   └── snowflake_2021-09
│       ├── workflow.png
│       ├── finspace_logo.png
│       ├── README.md
│       └── finspace_spark.py
├── blogs
│   └── finspace_redshift-2021-09
│       ├── finspace-redshift-import.png
│       ├── finspace-redshift-analysis.png
│       ├── finspace_redshift.sql
│       ├── README.md
│       └── finspace_spark.py
├── CODE_OF_CONDUCT.md
├── LICENSE
├── CONTRIBUTING.md
└── README.md
/sample-data/Readme.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | **/.DS_Store
3 | .ipynb_checkpoints
4 | **/.ipynb_checkpoints
5 | .idea
6 |
--------------------------------------------------------------------------------
/notebooks/custom_calendar/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/custom_calendar/workflow.png
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/webinars/snowflake_2021-09/workflow.png
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/webinars/snowflake_2021-09/finspace_logo.png
--------------------------------------------------------------------------------
/notebooks/technical_indicators/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/technical_indicators/finspace_logo.png
--------------------------------------------------------------------------------
/notebooks/analyze_trade_and_quote_data/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/analyze_trade_and_quote_data/workflow.png
--------------------------------------------------------------------------------
/notebooks/collect_timebars_and_summarize/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/collect_timebars_and_summarize/workflow.png
--------------------------------------------------------------------------------
/notebooks/collect_timebars_and_summarize/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/collect_timebars_and_summarize/finspace_logo.png
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace-redshift-import.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/blogs/finspace_redshift-2021-09/finspace-redshift-import.png
--------------------------------------------------------------------------------
/notebooks/compute_and_plot_volatility_from_taq/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/compute_and_plot_volatility_from_taq/workflow.png
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace-redshift-analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/blogs/finspace_redshift-2021-09/finspace-redshift-analysis.png
--------------------------------------------------------------------------------
/notebooks/compute_and_plot_volatility_from_taq/finspace_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/amazon-finspace-examples/main/notebooks/compute_and_plot_volatility_from_taq/finspace_logo.png
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
16 |
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace_redshift.sql:
--------------------------------------------------------------------------------
1 | -- This code assumes the name of the database is 'dev'. If your database has a different name, update the code below.
2 |
3 | CREATE SCHEMA trading_data;
4 |
5 | CREATE TABLE dev.trading_data.trade_history(
6 | trans_id varchar,
7 | ticker varchar,
8 | price decimal(8,2),
9 | quantity bigint,
10 | trans_type varchar,
11 | trans_date date);
12 |
13 |
14 | INSERT into dev.trading_data.trade_history values
15 | ('154644', 'AMZN', '1876.02', '190', 'P', '2020-01-02'),
16 | ('154699', 'AMZN', '1877.98', '268', 'P', '2020-01-02'),
17 | ('156655', 'AMZN', '1870.00', '100', 'P', '2020-01-02'),
18 | ('156656', 'AMZN', '1876.02', '100', 'P', '2020-01-02'),
19 | ('156849', 'AMZN', '1865.65', '187', 'P', '2020-01-02'),
20 | ('166894', 'AMZN', '1897.67', '100', 'P', '2020-01-02'),
21 | ('166905', 'AMZN', '1897.89', '200', 'S', '2020-01-02');
22 |
23 | COMMENT on table
24 | dev.trading_data.trade_history is 'Table contains a list of all buy and sell transactions across the organization';
25 | COMMENT on column
26 | dev.trading_data.trade_history.trans_id is 'Unique transaction ID';
27 | COMMENT on column
28 | dev.trading_data.trade_history.ticker is 'Stock ticker';
29 | COMMENT on column
30 | dev.trading_data.trade_history.price is 'Purchase or sale price';
31 | COMMENT on column
32 | dev.trading_data.trade_history.quantity is 'Purchase or sale quantity';
33 | COMMENT on column
34 | dev.trading_data.trade_history.trans_type is 'Transaction type (P=purchase, S=sale)';
35 | COMMENT on column
36 | dev.trading_data.trade_history.trans_date is 'Purchase or sale date';
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/README.md:
--------------------------------------------------------------------------------
1 | # Amazon FinSpace Examples
2 |
3 | ## Connecting to Amazon Redshift from FinSpace
4 |
5 | This repository contains example notebooks that show how to connect to an Amazon Redshift cluster from Amazon FinSpace using a JDBC driver.
6 |
7 | ### Import tables and metadata
8 |
9 | The [Import notebook](redshift_in_finspace_import.ipynb) shows how to create datasets in FinSpace based on table metadata in Amazon Redshift.
10 | It covers the following steps:
11 | 1. Create a category and an attribute set in FinSpace.
12 | 2. Connect to Amazon Redshift from the FinSpace Jupyter notebook using JDBC (a minimal connection sketch is shown below the workflow diagram).
13 | 3. Create a dataset in Amazon FinSpace for each table in Amazon Redshift. Add a description, owner, and attributes to each dataset to help with data discovery and access control.
14 |
15 | 
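As a rough illustration of step 2, a JDBC read from a FinSpace Spark notebook might look like the sketch below. The endpoint, credentials, and table name are placeholders rather than values taken from the notebook, and the Redshift JDBC driver must be available on the Spark cluster:

```python
from pyspark.sql import SparkSession

# In FinSpace notebooks a Spark session is already provided; getOrCreate() reuses it.
spark = SparkSession.builder.getOrCreate()

# Placeholder connection details -- replace with your cluster endpoint and credentials.
redshift_url = "jdbc:redshift://<cluster-endpoint>:5439/dev"

trades = (spark.read.format("jdbc")
          .option("url", redshift_url)
          .option("driver", "com.amazon.redshift.jdbc42.Driver")
          .option("dbtable", "trading_data.trade_history")
          .option("user", "<username>")
          .option("password", "<password>")
          .load())

trades.printSchema()  # the table metadata can then be used when creating the FinSpace dataset
```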
16 |
17 | ### Analyze data
18 |
19 | Once you have created the datasets in FinSpace, you can use the attached attribute set to connect to the Amazon Redshift cluster directly from FinSpace without specifying the database, schema, or table name.
20 | The [analysis notebook](redshift_in_finspace_analysis.ipynb) shows how to:
21 | 1. Connect to Amazon Redshift using the attribute set from FinSpace.
22 | 2. Use data from both Amazon FinSpace and Amazon Redshift to evaluate trade performance based on the daily price of AMZN stock (a toy version of this comparison is sketched below the diagram).
23 |
24 | 
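As a toy illustration of that comparison (not the notebook's actual code), the sketch below joins trades with daily prices in Spark; the `daily_prices` columns and the close-price values are assumptions:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Stand-ins for the two sources: `trades` would normally be read from Redshift over JDBC
# (see the import sketch above) and `daily_prices` from a FinSpace dataset view.
trades = spark.createDataFrame(
    [("154644", "AMZN", 1876.02, 190, "P", "2020-01-02"),
     ("166905", "AMZN", 1897.89, 200, "S", "2020-01-02")],
    ["trans_id", "ticker", "price", "quantity", "trans_type", "trans_date"])
daily_prices = spark.createDataFrame(
    [("AMZN", "2020-01-02", 1898.01)], ["ticker", "date", "close"])

# Compare each execution price with that day's close to gauge trade performance.
perf = (trades
        .join(daily_prices, (trades.ticker == daily_prices.ticker) &
                            (trades.trans_date == daily_prices.date))
        .withColumn("slippage_vs_close", trades.price - daily_prices.close))

perf.groupBy(trades.ticker, "trans_type") \
    .agg(F.avg("slippage_vs_close").alias("avg_slippage"),
         F.sum("quantity").alias("shares")).show()
```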
25 |
26 | ## FAQ
27 |
28 | **What do I need in order to get started?**
29 | - Set up a FinSpace environment by following the [“Create an Amazon FinSpace Environment”](https://docs.aws.amazon.com/finspace/latest/userguide/create-an-amazon-finspace-environment.html) guide
30 | - Install the Capital Markets sample data bundle, as explained in the [“Sample Data Bundle”](https://docs.aws.amazon.com/finspace/latest/userguide/sample-data-bundle.html) guide
31 | - Ensure you have permissions to ‘Manage Categories and Controlled Vocabularies’ and ‘Manage Attribute Sets’ in FinSpace
32 | - Create an Amazon Redshift cluster in the same AWS account as the FinSpace environment
33 | - Follow the [“Create Cluster Guide”](https://docs.aws.amazon.com/redshift/latest/dg/tutorial-loading-data-launch-cluster.html) to get started
34 | - Create a superuser and ensure the cluster is publicly accessible by following [this guide](https://aws.amazon.com/premiumsupport/knowledge-center/redshift-cluster-private-public/)
35 | - Create a table in Amazon Redshift and insert trading transaction data using [these SQL queries](finspace_redshift.sql)
36 |
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/README.md:
--------------------------------------------------------------------------------
1 | # Financial Services Data Summit
2 | ## Session
3 | [Making Financial Data More Accessible in the Cloud](https://www.snowflake.com/financial-services-data-summit/americas/agenda/?agendaPath=session/615483)
4 | **Date:** Sept 14, 2021
5 |
6 | ### Agenda
7 | Join this session to learn how AWS and Snowflake are innovating to make it easier for financial customers to share, manage, and analyze financial content in the cloud. Leading financial markets operator TP ICAP will showcase its use of Snowflake running on AWS within the Parameta Solutions business to share market data and analytics with clients who use proprietary models written in Python. Then AWS will demonstrate the benefits of incorporating data from Snowflake into Amazon FinSpace, a new service that provides customers with a scalable research environment offering integrated data management, analytics, and governance.
8 |
9 | ## FinSpace Prerequisites
10 | - Capital Markets Sample Data bundle has been installed in the environment
11 | - Category 'Source' contains a sub-category named 'Snowflake'
12 | - Attribute set named 'Snowflake Table Attributes' exists with fields
13 | - Name: Catalog, Type: String
14 | - Name: Schema, Type: String
15 | - Name: Table, Type: String
16 | - Name: Source, Type: Category: Source
17 |
18 | ### snowflake.ini
19 | The notebooks assume that a customer-provided snowflake.ini file exists in the same folder as the notebooks
20 | and contains Snowflake instance information and authentication credentials.
21 |
22 | #### Contents of snowflake.ini
23 | ```
24 | [snowflake]
25 | user: USERNAME
26 | password: PASSWORD
27 | account: ACCOUNT
28 | database: DATABASE
29 | warehouse: WAREHOUSE
30 | ```
31 | Please provide values from your Snowflake installation for USERNAME, PASSWORD, ACCOUNT, DATABASE, and WAREHOUSE.
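For example, the file can be read with Python's configparser; the connection sketch below uses snowflake-connector-python as a minimal connectivity check, whereas the demo notebooks may instead use the Spark-Snowflake connector:

```python
import configparser

import snowflake.connector  # assumes the snowflake-connector-python package is installed

# snowflake.ini uses ':' separators, which configparser accepts by default.
cfg = configparser.ConfigParser()
cfg.read("snowflake.ini")
sf = cfg["snowflake"]

conn = snowflake.connector.connect(
    user=sf["user"],
    password=sf["password"],
    account=sf["account"],
    database=sf["database"],
    warehouse=sf["warehouse"],
)
print(conn.cursor().execute("select current_version()").fetchone())
conn.close()
```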
32 |
33 | ## Code Artifacts
34 | Code artifacts from the demonstration given at the summit
35 |
36 | ### Notebooks
37 | [Delete Datasets](delete_datasets.ipynb) Deletes all datasets with a given classification (Source) and value (Snowflake)
38 | [Snowflake Datasets](snowflake_datasets.ipynb) Shows how to search for and display datasets in FinSpace that reside in Snowflake
39 | [Snowflake Import](snowflake_import.ipynb) Notebook that creates a FinSpace dataset for each table in the given Snowflake database
40 | [Plot Volatility](plot-volatility-snowflake.ipynb) Notebook that plots volatility, then adds and plots events over the volatility plot; presented in the session
41 |
42 | ### Python
43 | [finspace.py](finspace.py) Utility class for working with FinSpace boto3 service API
44 | [finspace_spark.py](finspace_spark.py) Utility class for working with Spark and the FinSpace boto3 service API
45 |
46 | ### Other
47 | [finspace_logo.png](finspace_logo.png) FinSpace logo in notebooks
48 | [workflow.png](workflow.png) FinSpace time-series library workflow image
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Amazon FinSpace Examples
2 | This repository contains example notebooks and Python scripts that show how to work with [Amazon FinSpace](https://aws.amazon.com/finspace/).
3 |
4 | ## Examples
5 |
6 | ### Notebooks: Inside Amazon FinSpace
7 | These notebooks are intended to be run from the FinSpace managed notebook environment.
8 | Each notebook references a dataset found in FinSpace (and some also reference a permission group);
9 | be sure to fill in the empty identifiers for dataset_id, view_id, and basicPermissionGroupId with the values from your
10 | environment installation. All example notebooks assume that the Capital Markets Sample Data bundle was installed
11 | with the FinSpace environment. Some example notebooks make use of utility classes found in the Utilities folder
12 | (e.g. finspace.py and finspace_spark.py); be sure to run the '%load' for those Python files twice, first to load
13 | the file contents into the notebook, and a second time to run the code and push it onto your Spark cluster.
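For example, in a notebook cell (the paths below are illustrative and depend on where you placed the files):

```python
# Run this cell once: %load replaces the cell contents with the file's code.
# Run the resulting cell a second time to execute that code on your Spark cluster.
# Repeat the same two-step process for finspace_spark.py.
%load Utilities/finspace.py
```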
14 |
15 | - [Analyzing petabytes of trade and quote data with Amazon FinSpace](notebooks/analyze_trade_and_quote_data) shows how to use the FinSpace Time Series Library.
16 | - [Cluster Management](notebooks/cluster_management) demonstrates using the cluster management APIs from within a notebook.
17 | - [Collect Timebars and Summarize](notebooks/collect_timebars_and_summarize) demonstrates how to create a summary time-bar dataset and add it to FinSpace.
18 | - [Compute and Plot Volatility from TAQ](notebooks/compute_and_plot_volatility_from_taq) demonstrates how to compute and plot volatility using the FinSpace Time Series Libraries.
19 | - [Technical Indicators](notebooks/technical_indicators) demonstrates the creation of a Spark DataFrame that uses all the FinSpace technical indicators.
20 | - [S3 Import](notebooks/s3_import) shows how to import data from an external (to FinSpace) S3 bucket into a FinSpace dataset.
21 | - [Using Third Party APIs](notebooks/third_party_apis) shows how you can install and use third-party APIs from FinSpace.
22 | - [Custom Calendars](notebooks/custom_calendar) shows how you can create a custom calendar for the time series fill and filter stage.
23 |
24 | ### Python: Helper Code
25 | - [Utility Classes](notebooks/Utilities) facilitates the use of the FinSpace APIs.
26 |
27 | ## Blogs
28 | [Analyze daily trading activity using transaction data from Amazon Redshift in Amazon FinSpace](blogs/finspace_redshift-2021-09)
29 | How to connect Amazon FinSpace to a Redshift cluster, import tables into FinSpace datasets, and pull data from Redshift
30 | tables directly into Spark DataFrames.
31 |
32 | ## Webinars
33 | [Making Financial Data More Accessible in the Cloud](webinars/snowflake_2021-09)
34 | Notebooks used to demonstrate the integration of Snowflake tables with Amazon FinSpace. Presented at the Snowflake Financial
35 | Services Summit, Sept 14, 2021: [Making Financial Data More Accessible in the Cloud](https://www.snowflake.com/financial-services-data-summit/americas/agenda/?agendaPath=session/615483)
36 |
37 | ## FAQ
38 | *How do I contribute my own example notebook?*
39 |
40 | - Although we're extremely excited to receive contributions from the community, we're still working on the best mechanism to take in examples from external sources. Please bear with us in the short term if pull requests take longer than expected or are closed.
41 |
42 | ## License
43 |
44 | This library is licensed under the MIT-0 License. See the LICENSE file.
45 |
46 |
--------------------------------------------------------------------------------
/notebooks/Utilities/finspace_spark.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 | import boto3
4 | from botocore.config import Config
5 |
6 | # FinSpace class with Spark bindings
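# NOTE: this file is intended to be %load-ed into a FinSpace notebook after finspace.py,
# which defines the FinSpace base class referenced below (see the repository README).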
7 |
8 | class SparkFinSpace(FinSpace):
9 | import pyspark
10 | def __init__(
11 | self,
12 | spark: pyspark.sql.session.SparkSession = None,
13 | config = Config(retries = {'max_attempts': 0, 'mode': 'standard'}),
14 | dev_overrides: dict = None
15 | ):
16 | FinSpace.__init__(self, config=config, dev_overrides=dev_overrides)
17 | self.spark = spark # used on Spark cluster for reading views, creating changesets from DataFrames
18 |
19 | def upload_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame):
20 | resp = self.client.get_user_ingestion_info()
21 | upload_location = resp['ingestionPath']
22 | # data_frame.write.option('header', 'true').csv(upload_location)
23 | data_frame.write.parquet(upload_location)
24 | return upload_location
25 |
26 | def ingest_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame, dataset_id: str, change_type: str, wait_for_completion=True):
27 | print("Uploading data...")
28 | upload_location = self.upload_dataframe(data_frame)
29 |
30 | print("Data upload finished. Ingesting data...")
31 |
32 | return self.ingest_from_s3(upload_location, dataset_id, change_type, wait_for_completion, format_type='parquet', format_params={})
33 |
34 | def read_view_as_spark(
35 | self,
36 | dataset_id: str,
37 | view_id: str
38 | ):
39 | # TODO: switch to DescribeMatz when available in HFS
40 | views = self.list_views(dataset_id=dataset_id, max_results=50)
41 | filtered = [v for v in views if v['id'] == view_id]
42 |
43 | if len(filtered) == 0:
44 | raise Exception('No such view found')
45 | if len(filtered) > 1:
46 | raise Exception('Internal Server error')
47 | view = filtered[0]
48 |
49 | # 0. Ensure view is ready to be read
50 | if (view['status'] != 'SUCCESS'):
51 | status = view['status']
52 | print(f'view run status is not ready: {status}. Returning empty.')
53 | return
54 |
55 | glue_db_name = view['destinationTypeProperties']['databaseName']
56 | glue_table_name = view['destinationTypeProperties']['tableName']
57 |
58 | # Query Glue table directly with catalog function of spark
59 | return self.spark.table(f"`{glue_db_name}`.`{glue_table_name}`")
60 |
61 | def get_schema_from_spark(self, data_frame: pyspark.sql.dataframe.DataFrame):
62 | from pyspark.sql.types import StructType
63 |
64 | # for translation to FinSpace's schema
65 | # 'STRING'|'CHAR'|'INTEGER'|'TINYINT'|'SMALLINT'|'BIGINT'|'FLOAT'|'DOUBLE'|'DATE'|'DATETIME'|'BOOLEAN'|'BINARY'
66 | DoubleType = "DOUBLE"
67 | FloatType = "FLOAT"
68 | DateType = "DATE"
69 | StringType = "STRING"
70 | IntegerType = "INTEGER"
71 | LongType = "BIGINT"
72 | BooleanType = "BOOLEAN"
73 | TimestampType = "DATETIME"
74 |
75 | hab_columns = []
76 |
77 | items = [i for i in data_frame.schema]
78 |
79 | switcher = {
80 | "BinaryType" : StringType,
81 | "BooleanType" : BooleanType,
82 | "ByteType" : IntegerType,
83 | "DateType" : DateType,
84 | "DoubleType" : FloatType,
85 | "IntegerType" : IntegerType,
86 | "LongType" : IntegerType,
87 | "NullType" : StringType,
88 | "ShortType" : IntegerType,
89 | "StringType" : StringType,
90 | "TimestampType" : TimestampType,
91 | }
92 |
93 |
94 | for i in items:
95 | # print( f"name: {i.name} type: {i.dataType}" )
96 |
97 | habType = switcher.get( str(i.dataType), StringType)
98 |
99 | hab_columns.append({
100 | "dataType" : habType,
101 | "name" : i.name,
102 | "description" : ""
103 | })
104 |
105 | return( hab_columns )
106 |
--------------------------------------------------------------------------------
/webinars/snowflake_2021-09/finspace_spark.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 | import boto3
4 | from botocore.config import Config
5 |
6 | # FinSpace class with Spark bindings
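# NOTE: this file is intended to be %load-ed into a FinSpace notebook after finspace.py,
# which defines the FinSpace base class referenced below (see the repository README).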
7 |
8 | class SparkFinSpace(FinSpace):
9 | import pyspark
10 | def __init__(
11 | self,
12 | spark: pyspark.sql.session.SparkSession = None,
13 | config = Config(retries = {'max_attempts': 0, 'mode': 'standard'}),
14 | dev_overrides: dict = None
15 | ):
16 | FinSpace.__init__(self, config=config, dev_overrides=dev_overrides)
17 | self.spark = spark # used on Spark cluster for reading views, creating changesets from DataFrames
18 |
19 | def upload_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame):
20 | resp = self.client.get_user_ingestion_info()
21 | upload_location = resp['ingestionPath']
22 | # data_frame.write.option('header', 'true').csv(upload_location)
23 | data_frame.write.parquet(upload_location)
24 | return upload_location
25 |
26 | def ingest_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame, dataset_id: str, change_type: str, wait_for_completion=True):
27 | print("Uploading data...")
28 | upload_location = self.upload_dataframe(data_frame)
29 |
30 | print("Data upload finished. Ingesting data...")
31 |
32 | return self.ingest_from_s3(upload_location, dataset_id, change_type, wait_for_completion, format_type='parquet', format_params={})
33 |
34 | def read_view_as_spark(
35 | self,
36 | dataset_id: str,
37 | view_id: str
38 | ):
39 | # TODO: switch to DescribeMatz when available in HFS
40 | views = self.list_views(dataset_id=dataset_id, max_results=50)
41 | filtered = [v for v in views if v['id'] == view_id]
42 |
43 | if len(filtered) == 0:
44 | raise Exception('No such view found')
45 | if len(filtered) > 1:
46 | raise Exception('Internal Server error')
47 | view = filtered[0]
48 |
49 | # 0. Ensure view is ready to be read
50 | if (view['status'] != 'SUCCESS'):
51 | status = view['status']
52 | print(f'view run status is not ready: {status}. Returning empty.')
53 | return
54 |
55 | glue_db_name = view['destinationTypeProperties']['databaseName']
56 | glue_table_name = view['destinationTypeProperties']['tableName']
57 |
58 | # Query Glue table directly with catalog function of spark
59 | return self.spark.table(f"`{glue_db_name}`.`{glue_table_name}`")
60 |
61 | def get_schema_from_spark(self, data_frame: pyspark.sql.dataframe.DataFrame):
62 | from pyspark.sql.types import StructType
63 |
64 | # for translation to FinSpace's schema
65 | # 'STRING'|'CHAR'|'INTEGER'|'TINYINT'|'SMALLINT'|'BIGINT'|'FLOAT'|'DOUBLE'|'DATE'|'DATETIME'|'BOOLEAN'|'BINARY'
66 | DoubleType = "DOUBLE"
67 | FloatType = "FLOAT"
68 | DateType = "DATE"
69 | StringType = "STRING"
70 | IntegerType = "INTEGER"
71 | LongType = "BIGINT"
72 | BooleanType = "BOOLEAN"
73 | TimestampType = "DATETIME"
74 |
75 | hab_columns = []
76 |
77 | items = [i for i in data_frame.schema]
78 |
79 | switcher = {
80 | "BinaryType" : StringType,
81 | "BooleanType" : BooleanType,
82 | "ByteType" : IntegerType,
83 | "DateType" : DateType,
84 | "DoubleType" : FloatType,
85 | "IntegerType" : IntegerType,
86 | "LongType" : IntegerType,
87 | "NullType" : StringType,
88 | "ShortType" : IntegerType,
89 | "StringType" : StringType,
90 | "TimestampType" : TimestampType,
91 | }
92 |
93 |
94 | for i in items:
95 | # print( f"name: {i.name} type: {i.dataType}" )
96 |
97 | habType = switcher.get( str(i.dataType), StringType)
98 |
99 | hab_columns.append({
100 | "dataType" : habType,
101 | "name" : i.name,
102 | "description" : ""
103 | })
104 |
105 | return( hab_columns )
106 |
--------------------------------------------------------------------------------
/blogs/finspace_redshift-2021-09/finspace_spark.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 | import boto3
4 | from botocore.config import Config
5 |
6 | # FinSpace class with Spark bindings
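# NOTE: this file is intended to be %load-ed into a FinSpace notebook after finspace.py,
# which defines the FinSpace base class referenced below (see the repository README).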
7 |
8 | class SparkFinSpace(FinSpace):
9 | import pyspark
10 | def __init__(
11 | self,
12 | spark: pyspark.sql.session.SparkSession = None,
13 | config = Config(retries = {'max_attempts': 0, 'mode': 'standard'}),
14 | dev_overrides: dict = None
15 | ):
16 | FinSpace.__init__(self, config=config, dev_overrides=dev_overrides)
17 | self.spark = spark # used on Spark cluster for reading views, creating changesets from DataFrames
18 |
19 | def upload_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame):
20 | resp = self.client.get_user_ingestion_info()
21 | upload_location = resp['ingestionPath']
22 | # data_frame.write.option('header', 'true').csv(upload_location)
23 | data_frame.write.parquet(upload_location)
24 | return upload_location
25 |
26 | def ingest_dataframe(self, data_frame: pyspark.sql.dataframe.DataFrame, dataset_id: str, change_type: str, wait_for_completion=True):
27 | print("Uploading data...")
28 | upload_location = self.upload_dataframe(data_frame)
29 |
30 | print("Data upload finished. Ingesting data...")
31 |
32 | return self.ingest_from_s3(upload_location, dataset_id, change_type, wait_for_completion, format_type='parquet', format_params={})
33 |
34 | def read_view_as_spark(
35 | self,
36 | dataset_id: str,
37 | view_id: str
38 | ):
39 | # TODO: switch to DescribeMatz when available in HFS
40 | views = self.list_views(dataset_id=dataset_id, max_results=50)
41 | filtered = [v for v in views if v['id'] == view_id]
42 |
43 | if len(filtered) == 0:
44 | raise Exception('No such view found')
45 | if len(filtered) > 1:
46 | raise Exception('Internal Server error')
47 | view = filtered[0]
48 |
49 | # 0. Ensure view is ready to be read
50 | if (view['status'] != 'SUCCESS'):
51 | status = view['status']
52 | print(f'view run status is not ready: {status}. Returning empty.')
53 | return
54 |
55 | glue_db_name = view['destinationTypeProperties']['databaseName']
56 | glue_table_name = view['destinationTypeProperties']['tableName']
57 |
58 | # Query Glue table directly with catalog function of spark
59 | return self.spark.table(f"`{glue_db_name}`.`{glue_table_name}`")
60 |
61 | def get_schema_from_spark(self, data_frame: pyspark.sql.dataframe.DataFrame):
62 | from pyspark.sql.types import StructType
63 |
64 | # for translation to FinSpace's schema
65 | # 'STRING'|'CHAR'|'INTEGER'|'TINYINT'|'SMALLINT'|'BIGINT'|'FLOAT'|'DOUBLE'|'DATE'|'DATETIME'|'BOOLEAN'|'BINARY'
66 | DoubleType = "DOUBLE"
67 | FloatType = "FLOAT"
68 | DateType = "DATE"
69 | StringType = "STRING"
70 | IntegerType = "INTEGER"
71 | LongType = "BIGINT"
72 | BooleanType = "BOOLEAN"
73 | TimestampType = "DATETIME"
74 |
75 | hab_columns = []
76 |
77 | items = [i for i in data_frame.schema]
78 |
79 | switcher = {
80 | "BinaryType" : StringType,
81 | "BooleanType" : BooleanType,
82 | "ByteType" : IntegerType,
83 | "DateType" : DateType,
84 | "DoubleType" : FloatType,
85 | "IntegerType" : IntegerType,
86 | "LongType" : IntegerType,
87 | "NullType" : StringType,
88 | "ShortType" : IntegerType,
89 | "StringType" : StringType,
90 | "TimestampType" : TimestampType,
91 | }
92 |
93 |
94 | for i in items:
95 | # print( f"name: {i.name} type: {i.dataType}" )
96 |
97 | habType = switcher.get( str(i.dataType), StringType)
98 |
99 | hab_columns.append({
100 | "dataType" : habType,
101 | "name" : i.name,
102 | "description" : ""
103 | })
104 |
105 | return( hab_columns )
106 |
--------------------------------------------------------------------------------
/notebooks/cluster_management/ClusterManagement.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Initialize the Cluster Manager\n",
8 | "The cluster manager works in a local session (not spark) so it can interact with the cluster manager service to manage clusters."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "%local\n",
18 | "from aws.finspace.cluster import FinSpaceClusterManager\n",
19 | "\n",
20 | "# if this was already run, no need to run again\n",
21 | "if 'finspace_clusters' not in globals():\n",
22 | " finspace_clusters = FinSpaceClusterManager()\n",
23 | " finspace_clusters.auto_connect()\n",
24 | "else:\n",
25 | " print(f'connected to cluster: {finspace_clusters.get_connected_cluster_id()}')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 6,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/html": [
36 | "Current session configs: {'files': [], 'jars': [], 'conf': {'spark.pyspark.python': 'python3', 'spark.pyspark.virtualenv.enabled': 'true', 'spark.pyspark.virtualenv.type': 'native', 'spark.pyspark.virtualenv.bin.path': '/usr/bin/virtualenv', 'spark.pyspark.virtualenv.packages': '', 'spark.jars.packages': '', 'spark.jars.repositories': ''}, 'kind': 'pyspark'} "
37 | ],
38 | "text/plain": [
39 | ""
40 | ]
41 | },
42 | "metadata": {},
43 | "output_type": "display_data"
44 | },
45 | {
46 | "data": {
47 | "text/html": [
48 | "