├── .bumpversion.cfg ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── logo │ ├── hudson_and_thames_logo.png │ └── support.png └── pull_request_template.md ├── .gitignore ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ ├── .gitkeep │ ├── favicon_mlfinlab.png │ ├── ht_logo_black.png │ ├── ht_logo_white.png │ ├── logo_black.png │ └── logo_white.png │ ├── _templates │ └── breadcrumbs.html │ ├── additional_information │ ├── analytics.rst │ ├── contact.rst │ ├── contributing.rst │ ├── images │ │ └── slack.png │ ├── license.rst │ └── privacy_gdpr.rst │ ├── changelog.rst │ ├── conf.py │ ├── index.rst │ └── requirements.txt ├── mlfinlab ├── __init__.py ├── backtest_statistics │ ├── __init__.py │ ├── backtests.py │ └── statistics.py ├── bet_sizing │ ├── __init__.py │ ├── bet_sizing.py │ ├── ch10_snippets.py │ └── ef3m.py ├── clustering │ ├── __init__.py │ ├── feature_clusters.py │ ├── hierarchical_clustering.py │ └── onc.py ├── codependence │ ├── __init__.py │ ├── codependence_matrix.py │ ├── correlation.py │ ├── gnpr_distance.py │ ├── information.py │ └── optimal_transport.py ├── cross_validation │ ├── __init__.py │ ├── combinatorial.py │ └── cross_validation.py ├── data_generation │ ├── __init__.py │ ├── bootstrap.py │ ├── correlated_random_walks.py │ ├── corrgan.py │ ├── data_verification.py │ ├── hcbm.py │ └── vines.py ├── data_structures │ ├── __init__.py │ ├── base_bars.py │ ├── imbalance_data_structures.py │ ├── run_data_structures.py │ ├── standard_data_structures.py │ └── time_data_structures.py ├── datasets │ ├── __init__.py │ ├── data │ │ ├── dollar_bar_sample.csv │ │ ├── stock_prices.csv │ │ └── tick_data.csv │ └── load_datasets.py ├── ensemble │ ├── __init__.py │ └── sb_bagging.py ├── feature_importance │ ├── __init__.py │ ├── fingerpint.py │ ├── importance.py │ └── orthogonal.py ├── features │ ├── __init__.py │ └── fracdiff.py ├── filters │ ├── __init__.py │ └── filters.py ├── labeling │ ├── __init__.py │ ├── bull_bear.py │ ├── excess_over_mean.py │ ├── excess_over_median.py │ ├── fixed_time_horizon.py │ ├── labeling.py │ ├── matrix_flags.py │ ├── raw_return.py │ ├── return_vs_benchmark.py │ ├── tail_sets.py │ └── trend_scanning.py ├── microstructural_features │ ├── __init__.py │ ├── encoding.py │ ├── entropy.py │ ├── feature_generator.py │ ├── first_generation.py │ ├── misc.py │ ├── second_generation.py │ └── third_generation.py ├── multi_product │ ├── __init__.py │ └── etf_trick.py ├── networks │ ├── __init__.py │ ├── almst.py │ ├── dash_graph.py │ ├── dual_dash_graph.py │ ├── graph.py │ ├── mst.py │ ├── pmfg.py │ └── visualisations.py ├── regression │ ├── __init__.py │ └── history_weight_regression.py ├── sample_weights │ ├── __init__.py │ └── attribution.py ├── sampling │ ├── __init__.py │ ├── bootstrapping.py │ └── concurrent.py ├── structural_breaks │ ├── __init__.py │ ├── chow.py │ ├── cusum.py │ └── sadf.py └── util │ ├── __init__.py │ ├── fast_ewma.py │ ├── generate_dataset.py │ ├── misc.py │ ├── multiprocess.py │ ├── volatility.py │ └── volume_classifier.py ├── requirements.txt ├── setup.cfg └── setup.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.2.0 3 | commit = True 4 | tag = True 5 | tag_name = {new_version} 6 | 7 | [bumpversion:file:setup.cfg] 8 | 9 | [bumpversion:file:docs/source/conf.py] 10 | 
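# Usage sketch (illustrative comment; bump2version is pinned in docs/source/requirements.txt):
#   bump2version patch   ->  1.2.0 becomes 1.2.1, then commits and tags {new_version}
#   bump2version minor   ->  1.2.0 becomes 1.3.0
# The [bumpversion:file:...] sections above keep setup.cfg and docs/source/conf.py in sync.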
-------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: HudsonThames # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/logo/hudson_and_thames_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/.github/logo/hudson_and_thames_logo.png -------------------------------------------------------------------------------- /.github/logo/support.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/.github/logo/support.png -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. 4 | 5 | Fixes # (issue) 6 | 7 | ## Type of change 8 | 9 | Please delete options that are not relevant. 10 | 11 | - [ ] Bug fix (non-breaking change which fixes an issue) 12 | - [ ] New feature (non-breaking change which adds functionality) 13 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 14 | - [ ] This change requires a documentation update 15 | 16 | # How Has This Been Tested? 17 | 18 | Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration 19 | 20 | - [ ] Test A 21 | - [ ] Test B 22 | 23 | **Test Configuration**: 24 | * Operating system 25 | * IDE used 26 | 27 | 28 | # Checklist: 29 | 30 | - [ ] My code follows the style guidelines of this project 31 | - [ ] I have performed a self-review of my own code 32 | - [ ] I have commented my code, particularly in hard-to-understand areas 33 | - [ ] I have made corresponding changes to the documentation 34 | - [ ] My changes generate no new warnings 35 | - [ ] I have added tests that prove my fix is effective or that my feature works 36 | - [ ] New and existing unit tests pass locally with my changes 37 | - [ ] Any dependent changes have been merged and published in downstream modules 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | *.pyc 3 | __pycache__ 4 | test_reports 5 | .coverage 6 | .DS_Store 7 | docs/build/ 8 | .local/ 9 | cover/ 10 | *.pickle 11 | */.ipynb_checkpoints/* 12 | mlfinlab.egg-info/* 13 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: [] 14 | 15 | # Optionally set the version of Python and requirements required to build your docs 16 | python: 17 | version: 3.8 18 | install: 19 | - requirements: docs/source/requirements.txt 20 | 21 | 
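# A rough local equivalent of this Read the Docs build (a sketch, assuming a Python 3.8 environment):
#   pip install -r docs/source/requirements.txt
#   make -C docs html
# The rendered pages land in docs/build/, which .gitignore already excludes.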
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at hudsonthames19@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing to MlFinLab:
2 | 
3 | First off, we want to thank you for taking the time to contribute to the project.
4 | 
5 | We make use of an [Apprenticeship Program](https://hudsonthames.org/mentorship/), which caters to ambitious students looking
6 | to make an impact on open-source and develop a portfolio of work based on financial machine learning.
7 | 
8 | This allows us to establish organised collaboration and control the level of code quality.
9 | 
10 | ## External Contributions:
11 | 
12 | We do encourage external contributions sourced by members of our community ([Slack Channel](https://www.patreon.com/HudsonThames)).
13 | 
14 | We have quite a rigorous process of unit testing, code style checks, and documentation.
15 | 
16 | 
17 | ## Raise an Issue
18 | We have created [templates](https://github.com/hudson-and-thames/mlfinlab/issues/new/choose) to aid in creating issues and PRs:
19 | * Bug report
20 | * Feature request
21 | * Custom issue template
22 | * Pull Request Template
23 | 
24 | ---
25 | 
26 | ## Contact us
27 | We host a booming community of like-minded data scientists and quants, join the
28 | [Slack Channel](https://www.patreon.com/HudsonThames) now! Open to sponsors of our package.
29 | 
30 | The channel has the following benefits:
31 | 
32 | * Community of like-minded individuals.
33 | * Ask questions about the package implementations and get community feedback.
34 | * Occasional presentations on topics within financial machine learning.
35 | * A papers channel where we share the papers which are freely available.
36 | * Access to members of our research group.
37 | 
38 | You can also email us at research@hudsonthames.org.
39 | 
40 | Looking forward to hearing from you!
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | 3 | 5 | 6 | 7 |
8 |
9 | 10 | 11 | # Welcome to Machine Learning Financial Laboratory! 12 | 13 |
14 |
15 |
16 | 
17 | >This repo is public-facing and exists for the sole purpose of providing users with an easy way to raise bugs, feature requests, and other issues.
18 | 
19 | 
20 |
21 |
22 | 
23 | ## What is MlFinLab?
24 | The MlFinLab Python library is the perfect toolbox for every financial machine learning researcher.
25 | 
26 | It covers every step of ML strategy creation, starting from data structure generation and finishing with backtest statistics.
27 | We pride ourselves on the robustness of our codebase - every line of code in the modules is extensively tested and
28 | documented.
29 | 
30 | 
31 | ## Documentation, Example Notebooks and Lecture Videos
32 | For every technique in the library, we not only provide extensive documentation, with both theoretical explanations
33 | and detailed descriptions of the available functions, but also supplement the modules with an ever-growing array of lecture videos and slides
34 | on the implemented methods.
35 | 
36 | We want you to be able to use the tools right away. To achieve that, every module comes with a number of example notebooks
37 | with detailed examples of how to use the algorithms. Our goal is to show you the whole pipeline, starting from
38 | importing the libraries and ending with strategy performance metrics, so you can get the added value from the get-go.
39 | 
40 | 
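The sketch below is purely illustrative: the function and module names follow the library's documentation, but since this public repo only ships stubs, treat the exact signatures as indicative rather than authoritative.

```python
import pandas as pd

from mlfinlab.data_structures.standard_data_structures import get_dollar_bars
from mlfinlab.filters.filters import cusum_filter
from mlfinlab.labeling.labeling import add_vertical_barrier

# 1. Structure raw tick data into dollar bars
bars = get_dollar_bars('tick_data.csv', threshold=70_000_000)

# 2. Sample informative events with a CUSUM filter on closing prices
close = bars.set_index(pd.to_datetime(bars['date_time']))['close']
events = cusum_filter(close, threshold=0.01)

# 3. Attach vertical barriers as a first step of triple-barrier labeling
vertical_barriers = add_vertical_barrier(t_events=events, close=close, num_days=1)
```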
41 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
52 | 53 | 54 | ### Included modules: 55 | 56 | - Backtest Overfitting Tools 57 | - Data Structures 58 | - Labeling 59 | - Sampling 60 | - Feature Engineering 61 | - Models 62 | - Clustering 63 | - Cross-Validation 64 | - Hyper-Parameter Tuning 65 | - Feature Importance 66 | - Bet Sizing 67 | - Synthetic Data Generation 68 | - Networks 69 | - Measures of Codependence 70 | - Useful Financial Features 71 | 72 | 73 | ## Licensing options 74 | This project is licensed under an all rights reserved [licence](https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt). 75 | 76 | * Business 77 | * Enterprise 78 | 79 | 80 | ## Community 81 | With the purchase of the library, our clients get access to the Hudson & Thames Slack community, where our engineers and other quants 82 | are always ready to answer your questions. 83 | 84 | Alternatively, you can email us at: research@hudsonthames.org. 85 | 86 |
87 | 88 | 90 | 91 |
92 | 93 | 94 | ## Who is Hudson & Thames? 95 | Hudson and Thames Quantitative Research is a company with the goal of bridging the gap between the advanced research developed in 96 | quantitative finance and its practical application. We have created three premium python libraries so you can effortlessly access the 97 | latest techniques and focus on what matters most: **creating your own winning strategy**. 98 | 99 | 100 | ### What was only possible with the help of huge R&D teams is now at your disposal, anywhere, anytime. 101 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
-------------------------------------------------------------------------------- /docs/source/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/.gitkeep -------------------------------------------------------------------------------- /docs/source/_static/favicon_mlfinlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/favicon_mlfinlab.png -------------------------------------------------------------------------------- /docs/source/_static/ht_logo_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/ht_logo_black.png -------------------------------------------------------------------------------- /docs/source/_static/ht_logo_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/ht_logo_white.png -------------------------------------------------------------------------------- /docs/source/_static/logo_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/logo_black.png -------------------------------------------------------------------------------- /docs/source/_static/logo_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/logo_white.png -------------------------------------------------------------------------------- /docs/source/_templates/breadcrumbs.html: --------------------------------------------------------------------------------
1 | {%- extends "sphinx_rtd_theme/breadcrumbs.html" %}
2 | 
3 | {% block breadcrumbs_aside %}
4 | {% endblock %}
-------------------------------------------------------------------------------- /docs/source/additional_information/analytics.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-analytics:
2 | 
3 | =========
4 | Analytics
5 | =========
6 | 
7 | .. warning::
8 | 
9 |     * Please don't alter or change any of the code as this is a violation of our license agreement.
10 |     * We do provide a separate enterprise license for companies that want to white-label or alter code.
11 |     * All changes are flagged by the system.
12 | 
13 | Please note that we have added standard web analytics to MLFinLab, using `Segment <https://segment.com/>`__.
14 | 
15 | We track the following:
16 | 
17 | * City, Country, Region, City Geographic Coordinate
18 | * UserIDs (MAC address)
19 | * Function calls
20 | * Timestamps
21 | 
22 | This allows our team to see how the package is being used by you, our client, so that we may improve the functionality and
23 | build more tools that you will love. An additional purpose is that we need to start tracking growth KPIs such as cohort
24 | retention and MAU, and we will compile these into reports for investors, as we are aiming for VC funding in late 2021.
25 | 
26 | The impact of the analytics is negligible.
27 | 
28 | .. note::
29 | 
30 |     * We chose to use MAC Addresses as it is an anonymous token which allows us to track a machine and is not considered personal information under GDPR unless it is combined with other personal data which then identifies the natural person.
31 |     * Your data is also anonymized by filtering it through ipinfo, which returns high-level location (City, Country, Region) data without sharing your IP address.
32 |     * Segment is the tool we use to collect, clean, and control the data.
-------------------------------------------------------------------------------- /docs/source/additional_information/contact.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-contact:
2 | 
3 | =========================
4 | Join the Slack Channel 🔑
5 | =========================
6 | 
7 | We host a booming community of like-minded data scientists and quants, join the Slack channel now! Available via
8 | `H&T Client Portal `__.
9 | 
10 | The channel has the following benefits:
11 | 
12 | * Community of like-minded individuals.
13 | * Ask questions about the package implementations and get community feedback.
14 | * Occasional presentations on topics within financial machine learning.
15 | * A papers channel where we share the papers which are freely available.
16 | * Access to members of our research group.
17 | 
18 | Looking forward to hearing from you!
19 | 
20 | .. image:: ./images/slack.png
21 |    :scale: 65 %
22 |    :align: center
23 | 
-------------------------------------------------------------------------------- /docs/source/additional_information/contributing.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-contributing:
2 | 
3 | ============
4 | Contributing
5 | ============
6 | 
7 | Areas of Contribution
8 | #####################
9 | 
10 | Currently, we have a live project board that follows the principles of Agile Project Management.
11 | 
12 | At the time of writing, we are focusing our attention primarily on contributions by the current Researchers enrolled
13 | in our `Apprenticeship Program <https://hudsonthames.org/mentorship/>`_.
14 | 
15 | There is, of course, room for the public to make contributions. The most useful are those that help to improve user experience.
16 | Good examples of this are writing tutorial notebooks which answer questions
17 | from the back of a chapter, writing mlfinlab recipes, improving docstrings, and adding new Sphinx documentation.
18 | 
19 | Raising Issues
20 | ##############
21 | 
22 | We have created `templates`_ to aid in creating issues and PRs:
23 | 
24 | * Bug report
25 | * Feature request
26 | * Custom issue template
27 | * Pull Request Template
28 | 
29 | Please do create issues for new feature requests and bug fixes.
30 | 
31 | .. _templates: https://github.com/hudson-and-thames/mlfinlab/issues/new/choose
32 | 
-------------------------------------------------------------------------------- /docs/source/additional_information/images/slack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/additional_information/images/slack.png -------------------------------------------------------------------------------- /docs/source/additional_information/privacy_gdpr.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-privacy_gdpr:
2 | 
3 | =======================
4 | Privacy and GDPR Policy
5 | =======================
6 | 
7 | .. note::
8 |     Our Privacy and GDPR Policies can be downloaded directly from our website:
9 | 
10 |     * `Privacy Policy `_
11 |     * `GDPR Policy `_
12 | 
-------------------------------------------------------------------------------- /docs/source/changelog.rst: --------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 | ..
5 |     The following are valid options
6 |     * :release:`0.1.0 <2021-01-12>`
7 |     * :support:`119` Upgrade to pandas 1.0
8 |     * :feature:`50` Add a distutils command for marbles
9 |     * :bug:`58` Fixed test failure on OSX
10 | ..
11 |     For Help: https://releases.readthedocs.io/en/latest/index.html
12 | 
13 | * :release:`1.3.0 <2021-07-09>`
14 | * :feature:`69` Added support for Python 3.6 and Python 3.7.
15 | * :feature:`69` Requirements versions are now non-fixed.
16 | * :support:`69` Migrated Optimal Mean Reversion Module from MlFinLab to ArbitrageLab.
17 | * :support:`69` Reflected Optimal Mean Reversion Module migration in the documentation.
18 | 
19 | * :release:`1.2.0 <2021-06-23>`
20 | * :support:`64` Updated references in documentation.
21 | * :support:`63` Updated documentation theme to hudsonthames-sphinx-docs.
22 | * :bug:`66 major` Fixed issue with too many function calls in web analytics.
23 | 
24 | * :release:`1.1.0 <2021-04-15>`
25 | * :feature:`56` MAE/MSE added as possible metrics for the Trend Scanning Module.
26 | * :feature:`58` Low silhouette scores check made optional in Feature Clusters Module.
27 | * :bug:`57 major` Fix purging bug in Purged KFold/Combinatorial Purged KFold.
28 | * :feature:`61` History Weighted Regression added to the Regression Module.
29 | * :support:`61` History Weighted Regression documentation.
30 | * :feature:`59` Code and unit tests style unified.
31 | * :support:`59` Documentation style unified.
32 | * :feature:`45` Added Pagan et al. and Lunde et al. Bull Bear Methods to the Labeling Module.
33 | * :support:`45` Added Pagan et al. and Lunde et al. Bull Bear Methods documentation.
34 | * :bug:`60 major` Fix structural break bug in the Chu-Stinchcombe-White test.
35 | * :feature:`46` Stacked Module with Cross Validation, Feature Importance, and Sampling methods added.
36 | * :feature:`46` Lambda code in Microstructural Features Module speed-up.
37 | * :support:`46` Stacked Module documentation.
38 | 
39 | * :release:`1.0.1 <2021-02-19>`
40 | * :support:`55` Removed TensorFlow from requirements and adjusted installation guide.
41 | 
42 | * :release:`1.0.0 <2021-02-16>`
43 | * :feature:`35` Debugged ETF Trick code.
44 | * :feature:`44` Added n_repeat parameter to MDA feature importance.
45 | * :feature:`50` Added t-student option to BVC classifier.
46 | * :bug:`50` Fix bug in Bar-based Kyle lambdas calculation. 47 | * :feature:`52` Migrated Portfolio Optimisation Module code from MlFinLab to PortfolioLab. 48 | * :support:`52` Migrated Portfolio Optimisation Module documentation from MlFinLab to PortfolioLab. 49 | * :feature:`52` Migrated Online Portfolio Selection Module code from MlFinLab to PortfolioLab. 50 | * :support:`52` Migrated Online Portfolio Selection Module documentation from MlFinLab to PortfolioLab. 51 | * :support:`52` Updated requirements versions (numpy==1.20.1, matplotlib==3.2.2, 52 | pandas==1.1.5, scikit-learn==0.24.1, scipy==1.6.0, statsmodels==0.12.2). 53 | 54 | * :release:`0.15.3 <2021-01-12>` 55 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('./../..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'mlfinlab' 21 | copyright = '2019, Hudson & Thames Quantitative Research.' 22 | author = 'Hudson & Thames Quantitative Research' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '1.3.0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.intersphinx', 37 | 'sphinx.ext.viewcode', 38 | 'releases' 39 | ] 40 | 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | master_doc = 'index' 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = [] 51 | 52 | 53 | # -- Options for HTML output ------------------------------------------------- 54 | 55 | # The theme to use for HTML and HTML Help pages. See the documentation for 56 | # a list of builtin themes. 57 | # 58 | html_theme = 'hudsonthames_sphinx_theme' 59 | add_module_names = False 60 | 61 | # Theme options are theme-specific and customize the look and feel of a theme 62 | # further. For a list of options available for each theme, see the 63 | # documentation. 64 | # 65 | # html_theme_options = {} 66 | 67 | html_context = {'logo': 'logo_white.png', 'theme_logo_only': True} 68 | html_favicon = '_static/favicon_mlfinlab.png' 69 | 70 | # Add any paths that contain custom static files (such as style sheets) here, 71 | # relative to this directory. 
They are copied after the builtin static files,
72 | # so a file named "default.css" will overwrite the builtin "default.css".
73 | html_static_path = ['_static']
74 | html_copy_source = True
75 | 
76 | # 'releases' (changelog) settings
77 | releases_github_path = 'hudson-and-thames/mlfinlab_premium'
78 | releases_unstable_prehistory = True
79 | 
-------------------------------------------------------------------------------- /docs/source/index.rst: --------------------------------------------------------------------------------
1 | .. image:: _static/logo_black.png
2 |    :scale: 50 %
3 |    :align: center
4 |    :target: https://hudsonthames.org/
5 | 
6 | |
7 | 
8 | ================================================
9 | Machine Learning Financial Laboratory (mlfinlab)
10 | ================================================
11 | 
12 | MlFinlab is a Python package which helps portfolio managers and traders who want to leverage the power of machine learning
13 | by providing reproducible, interpretable, and easy-to-use tools.
14 | 
15 | Adding MlFinLab to your company's pipeline is like adding a department of PhD researchers to your team.
16 | 
17 | .. code-block::
18 | 
19 |    pip install mlfinlab
20 | 
21 | We source all of our implementations from the most elite and peer-reviewed journals, including publications from:
22 | 
23 | 1. `The Journal of Financial Data Science `_
24 | 2. `The Journal of Portfolio Management `_
25 | 3. `The Journal of Algorithmic Finance `_
26 | 4. `Cambridge University Press `_
27 | 
28 | 
29 | Documentation & Tutorials
30 | #########################
31 | 
32 | We lower barriers to entry for all users by providing extensive `documentation `_
33 | and `tutorial notebooks `_, with code examples.
34 | 
35 | Who is Hudson & Thames?
36 | #######################
37 | 
38 | Hudson and Thames Quantitative Research is a company with a focus on implementing the most cutting-edge algorithms in
39 | quantitative finance. We productionalize all our tools in the form of libraries and provide capability to our clients.
40 | 
41 | * `Website `_
42 | * `Github Group `_
43 | * `MlFinLab Documentation `_
44 | 
45 | Contact us
46 | ##########
47 | 
48 | The best place to contact the team is via the Slack channel. Alternatively, you can email us at: research@hudsonthames.org.
49 | 
50 | Looking forward to hearing from you!
51 | 
52 | License
53 | #######
54 | 
55 | This project is licensed under an all rights reserved licence and is NOT open-source, and may not be used for commercial purposes without a commercial license which may be purchased from Hudson and Thames Quantitative Research.
56 | 
57 | See the `LICENSE.txt `_ file for details.
58 | 
59 | .. toctree::
60 |     :maxdepth: 2
61 |     :caption: Legal
62 |     :hidden:
63 | 
64 |     additional_information/license
65 |     additional_information/analytics
66 |     additional_information/privacy_gdpr
67 | 
-------------------------------------------------------------------------------- /docs/source/requirements.txt: --------------------------------------------------------------------------------
1 | # Production
2 | numpy==1.18.5
3 | matplotlib==3.2.2
4 | pandas==1.1.5
5 | scikit-learn==0.24.1
6 | scipy==1.6.0
7 | statsmodels==0.12.2
8 | cython==0.29.17
9 | POT==0.7.0
10 | numba==0.52.0
11 | networkx==2.5
12 | dash==1.19.0
13 | dash-cytoscape==0.2.0
14 | dash-bootstrap-components==0.11.3
15 | jupyter-dash==0.4.0
16 | tensorflow==2.2.1
17 | joblib==1.0.1
18 | analytics-python==1.2.9
19 | getmac==0.8.2
20 | 
21 | 
22 | # Develop
23 | bump2version==1.0.1
24 | bumpversion==0.6.0
25 | codecov==2.1.11
26 | coverage==5.4
27 | pylint==2.6.0
28 | sphinx==3.4.3 # Docs
29 | hudsonthames-sphinx-theme==0.1.5 # Docs
30 | sphinx-rtd-theme==0.5.2 # Docs
31 | releases==1.6.3 # Docs
32 | 
-------------------------------------------------------------------------------- /mlfinlab/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | MlFinlab helps portfolio managers and traders who want to leverage the power of machine learning by providing
3 | reproducible, interpretable, and easy-to-use tools.
4 | 
5 | Adding MlFinLab to your company's pipeline is like adding a department of PhD researchers to your team.
6 | """
7 | 
8 | import mlfinlab.cross_validation as cross_validation
9 | import mlfinlab.data_structures as data_structures
10 | import mlfinlab.datasets as datasets
11 | import mlfinlab.multi_product as multi_product
12 | import mlfinlab.filters.filters as filters
13 | import mlfinlab.labeling as labeling
14 | import mlfinlab.features.fracdiff as fracdiff
15 | import mlfinlab.sample_weights as sample_weights
16 | import mlfinlab.sampling as sampling
17 | import mlfinlab.bet_sizing as bet_sizing
18 | import mlfinlab.util as util
19 | import mlfinlab.structural_breaks as structural_breaks
20 | import mlfinlab.feature_importance as feature_importance
21 | import mlfinlab.ensemble as ensemble
22 | import mlfinlab.clustering as clustering
23 | import mlfinlab.microstructural_features as microstructural_features
24 | import mlfinlab.backtest_statistics.backtests as backtests
25 | import mlfinlab.backtest_statistics.statistics as backtest_statistics
26 | import mlfinlab.networks as networks
27 | import mlfinlab.data_generation as data_generation
28 | import mlfinlab.regression as regression
29 | 
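# Example namespace usage (illustrative comment; the implementations behind
# these modules are stripped in this public repo):
#
#   import mlfinlab as ml
#   ml.labeling             # labeling tools (triple-barrier, trend scanning, ...)
#   ml.backtest_statistics  # Sharpe ratio, PSR, DSR and other statistics
#   ml.data_structures      # time, standard, imbalance and run bar generators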
3 | """ 4 | 5 | from mlfinlab.backtest_statistics.backtests import CampbellBacktesting 6 | from mlfinlab.backtest_statistics.statistics import (timing_of_flattening_and_flips, average_holding_period, 7 | bets_concentration, all_bets_concentration, 8 | drawdown_and_time_under_water, sharpe_ratio, 9 | information_ratio, probabilistic_sharpe_ratio, 10 | deflated_sharpe_ratio, minimum_track_record_length) 11 | -------------------------------------------------------------------------------- /mlfinlab/bet_sizing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 10: Bet Sizing 3 | Only the highest-level user functions are included in the __init__ file. 4 | 5 | This folder contains classes and functions for sizing bets based on a given investment strategy with given bet side 6 | confidence, e.g. the output from a machine learning model. The approaches implemented in this module are based on 7 | those described in Chapter 10 of "Advances in Financial Machine Learning" by Marcos López de Prado. 8 | """ 9 | 10 | from mlfinlab.bet_sizing.bet_sizing import (bet_size_probability, bet_size_dynamic, bet_size_budget, bet_size_reserve, 11 | confirm_and_cast_to_df, get_concurrent_sides, cdf_mixture, 12 | single_bet_size_mixed) 13 | from mlfinlab.bet_sizing.ef3m import (M2N, centered_moment, raw_moment, most_likely_parameters) 14 | -------------------------------------------------------------------------------- /mlfinlab/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements clustering module methods. 3 | """ 4 | 5 | from mlfinlab.clustering.onc import get_onc_clusters 6 | from mlfinlab.clustering.feature_clusters import get_feature_clusters 7 | from mlfinlab.clustering.hierarchical_clustering import optimal_hierarchical_cluster 8 | -------------------------------------------------------------------------------- /mlfinlab/clustering/feature_clusters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module creates clustered subsets of features described in the paper Clustered Feature Importance (Presentation 3 | Slides) by Dr. Marcos Lopez de Prado. https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3517595 and is also explained 4 | in the book Machine Learning for Asset Managers Snippet 6.5.2 page 84. 5 | """ 6 | 7 | #Imports 8 | import numpy as np 9 | import pandas as pd 10 | import statsmodels.api as sm 11 | from scipy.spatial.distance import squareform 12 | from scipy.cluster.hierarchy import linkage, fcluster 13 | from statsmodels.regression.linear_model import OLS 14 | 15 | from mlfinlab.clustering.onc import get_onc_clusters 16 | from mlfinlab.codependence.codependence_matrix import get_dependence_matrix, get_distance_matrix 17 | 18 | 19 | # pylint: disable=invalid-name 20 | def get_feature_clusters(X: pd.DataFrame, dependence_metric: str, distance_metric: str = None, 21 | linkage_method: str = None, n_clusters: int = None, critical_threshold: float = 0.0) -> list: 22 | """ 23 | Machine Learning for Asset Managers 24 | Snippet 6.5.2.1 , page 85. Step 1: Features Clustering 25 | 26 | Gets clustered features subsets from the given set of features. 27 | 28 | :param X: (pd.DataFrame) Dataframe of features. 29 | :param dependence_metric: (str) Method to be use for generating dependence_matrix, either 'linear' or 30 | 'information_variation' or 'mutual_information' or 'distance_correlation'. 
31 |     :param distance_metric: (str) The distance operator to be used for generating the distance matrix. The methods that
32 |                              can be applied are: 'angular', 'squared_angular', 'absolute_angular'. Set it to None if the
33 |                              features are to be generated as is by the ONC algorithm.
34 |     :param linkage_method: (str) Method of linkage to be used for clustering. Methods include: 'single', 'ward',
35 |                            'complete', 'average', 'weighted', and 'centroid'. Set it to None if the features are to
36 |                            be generated as is by the ONC algorithm.
37 |     :param n_clusters: (int) Number of clusters to form. Must be less than the total number of features. If None, then it
38 |                        returns the optimal number of clusters decided by the ONC Algorithm.
39 |     :param critical_threshold: (float) Threshold for determining a low silhouette score in the dataset. It can be any real number
40 |                                in [-1, +1], default is 0, which means any feature that has a silhouette score below 0 will be
41 |                                identified as having a low silhouette, and hence the required transformation will be
42 |                                applied to correct it.
43 |     :return: (list) Feature subsets.
44 |     """
45 | 
46 |     pass
47 | 
48 | 
49 | def _cluster_transformation(X: pd.DataFrame, clusters: dict, feats_to_transform: list) -> pd.DataFrame:
50 |     """
51 |     Machine Learning for Asset Managers
52 |     Snippet 6.5.2.1, page 85. Step 1: Features Clustering (last paragraph)
53 | 
54 |     Transforms a dataset to reduce the multicollinearity of the system by replacing the original feature with
55 |     the residual from regression.
56 | 
57 |     :param X: (pd.DataFrame) Dataframe of features.
58 |     :param clusters: (dict) Clusters generated by the ONC algorithm.
59 |     :param feats_to_transform: (list) Features that have a low silhouette score and are to be transformed.
60 |     :return: (pd.DataFrame) Transformed features.
61 |     """
62 | 
63 |     pass
64 | 
65 | 
66 | def _combine_features(X, clusters, exclude_key) -> np.array:
67 |     """
68 |     Combines features of each cluster linearly by following a minimum variance weighting scheme.
69 |     The Minimum Variance weights are calculated without constraints, other than that the weights sum to one.
70 | 
71 |     :param X: (pd.DataFrame) Dataframe of features.
72 |     :param clusters: (dict) Clusters generated by the ONC algorithm.
73 |     :param exclude_key: (int) Key of the cluster which is to be excluded.
74 |     :return: (np.array) Combined features for each cluster.
75 |     """
76 | 
77 |     pass
78 | 
79 | 
80 | def _check_for_low_silhouette_scores(X: pd.DataFrame, dep_matrix: pd.DataFrame,
81 |                                      critical_threshold: float = 0.0) -> pd.DataFrame:
82 |     """
83 |     Machine Learning for Asset Managers
84 |     Snippet 6.5.2.1, page 85. Step 1: Features Clustering (last paragraph)
85 | 
86 |     Checks whether the dataset contains features with a low silhouette score due to one feature being a combination of
87 |     multiple features across clusters. This is a problem, because ONC cannot assign one feature to multiple
88 |     clusters, and it needs a transformation.
89 | 
90 |     :param X: (pd.DataFrame) Dataframe of features.
91 |     :param dep_matrix: (pd.DataFrame) Dataframe with dependencies between features.
92 |     :param critical_threshold: (float) Threshold for determining a low silhouette score.
93 |     :return: (pd.DataFrame) Dataframe of features.
94 |     """
95 | 
96 |     pass
97 | 
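# --- Illustrative usage sketch (not part of the library; the functions above
# are stubs in this public repo). A toy version of the hierarchical route the
# get_feature_clusters docstring describes: 'linear' dependence matrix ->
# 'angular' distance -> linkage -> flat clusters.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.normal(size=(500, 6)), columns=list('ABCDEF'))
    dep = X_demo.corr()                            # 'linear' dependence matrix
    dist = ((1 - dep) / 2.) ** 0.5                 # 'angular' distance metric
    link = linkage(squareform(dist.values, checks=False), method='ward')
    demo_clusters = fcluster(link, t=3, criterion='maxclust')  # e.g. n_clusters=3
    print(dict(zip(X_demo.columns, demo_clusters)))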
3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.cluster import hierarchy 7 | 8 | 9 | def optimal_hierarchical_cluster(mat: np.array, method: str = "ward") -> np.array: 10 | """ 11 | Calculates the optimal clustering of a matrix. 12 | 13 | It calculates the hierarchy clusters from the distance of the matrix. Then it calculates 14 | the optimal leaf ordering of the hierarchy clusters, and returns the optimally clustered matrix. 15 | 16 | It is reproduced with modifications from the following blog post: 17 | `Marti, G. (2020) TF 2.0 DCGAN for 100x100 financial correlation matrices [Online]. 18 | Available at: https://marti.ai/ml/2019/10/13/tf-dcgan-financial-correlation-matrices.html. 19 | (Accessed: 17 Aug 2020) 20 | `_ 21 | 22 | This method relies and acts as a wrapper for the `scipy.cluster.hierarchy` module. 23 | ``_ 24 | 25 | :param mat: (np.array/pd.DataFrame) Correlation matrix. 26 | :param method: (str) Method to calculate the hierarchy clusters. Can take the values 27 | ["single", "complete", "average", "weighted", "centroid", "median", "ward"]. 28 | :return: (np.array) Optimal hierarchy cluster matrix. 29 | """ 30 | 31 | pass 32 | -------------------------------------------------------------------------------- /mlfinlab/clustering/onc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimal Number of Clusters (ONC Algorithm) 3 | Detection of False Investment Strategies using Unsupervised Learning Methods 4 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3167017 5 | """ 6 | 7 | from typing import Union 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_samples 14 | 15 | 16 | def _improve_clusters(corr_mat: pd.DataFrame, clusters: dict, top_clusters: dict) -> Union[ 17 | pd.DataFrame, dict, pd.Series]: 18 | """ 19 | Improve number clusters using silh scores 20 | 21 | :param corr_mat: (pd.DataFrame) Correlation matrix 22 | :param clusters: (dict) Clusters elements 23 | :param top_clusters: (dict) Improved clusters elements 24 | :return: (tuple) [ordered correlation matrix, clusters, silh scores] 25 | """ 26 | 27 | pass 28 | 29 | 30 | def _cluster_kmeans_base(corr_mat: pd.DataFrame, max_num_clusters: int = 10, repeat: int = 10) -> Union[ 31 | pd.DataFrame, dict, pd.Series]: 32 | """ 33 | Initial clustering step using KMeans. 34 | 35 | :param corr_mat: (pd.DataFrame) Correlation matrix 36 | :param max_num_clusters: (int) Maximum number of clusters to search for. 37 | :param repeat: (int) Number of clustering algorithm repetitions. 38 | :return: (tuple) [ordered correlation matrix, clusters, silh scores] 39 | """ 40 | 41 | pass 42 | 43 | 44 | def _check_improve_clusters(new_tstat_mean: float, mean_redo_tstat: float, old_cluster: tuple, 45 | new_cluster: tuple) -> tuple: 46 | """ 47 | Checks cluster improvement condition based on t-statistic. 
48 | 
49 |     :param new_tstat_mean: (float) T-statistic
50 |     :param mean_redo_tstat: (float) Average t-statistic for cluster improvement
51 |     :param old_cluster: (tuple) Old cluster correlation matrix, optimized clusters, silhouette scores
52 |     :param new_cluster: (tuple) New cluster correlation matrix, optimized clusters, silhouette scores
53 |     :return: (tuple) Cluster
54 |     """
55 | 
56 |     pass
57 | 
58 | 
59 | def cluster_kmeans_top(corr_mat: pd.DataFrame, repeat: int = 10) -> Union[pd.DataFrame, dict, pd.Series, bool]:
60 |     """
61 |     Improves the initial clustering by leaving clusters with high scores unchanged and modifying clusters with
62 |     below-average scores.
63 | 
64 |     :param corr_mat: (pd.DataFrame) Correlation matrix
65 |     :param repeat: (int) Number of clustering algorithm repetitions.
66 |     :return: (tuple) [correlation matrix, optimized clusters, silhouette scores, boolean to rerun ONC]
67 |     """
68 | 
69 |     pass
70 | 
71 | 
72 | def get_onc_clusters(corr_mat: pd.DataFrame, repeat: int = 10) -> Union[pd.DataFrame, dict, pd.Series]:
73 |     """
74 |     Optimal Number of Clusters (ONC) algorithm described in the following paper:
75 |     `Marcos Lopez de Prado, Michael J. Lewis, Detection of False Investment Strategies Using Unsupervised
76 |     Learning Methods, 2015 `_;
77 |     The code is based on the code provided by the authors of the paper.
78 | 
79 |     The algorithm searches for the optimal number of clusters using the correlation matrix of elements as an input.
80 | 
81 |     The correlation matrix is transformed into a matrix of distances, and the K-Means algorithm is applied multiple times
82 |     with a different number of clusters. The results are evaluated on the t-statistics of the silhouette scores.
83 | 
84 |     The output of the algorithm is the reordered correlation matrix (clustered elements are placed close to each other),
85 |     the optimal clustering, and silhouette scores.
86 | 
87 |     :param corr_mat: (pd.DataFrame) Correlation matrix of features
88 |     :param repeat: (int) Number of clustering algorithm repetitions
89 |     :return: (tuple) [correlation matrix, optimized clusters, silhouette scores]
90 |     """
91 | 
92 |     pass
93 | 
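# --- Illustrative usage sketch (not part of the library; the functions above
# are stubs in this public repo). A toy version of the silhouette-based search
# the docstrings describe: correlation -> distance, KMeans for each candidate
# number of clusters, t-statistic of the silhouette scores as the criterion.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    demo_corr = pd.DataFrame(np.corrcoef(rng.normal(size=(120, 8)), rowvar=False))
    demo_dist = ((1 - demo_corr) / 2.) ** 0.5  # correlation to distance transform

    best_tstat, best_labels = -np.inf, None
    for num_clusters in range(2, len(demo_dist)):
        kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(demo_dist)
        silh = silhouette_samples(demo_dist, kmeans.labels_)
        tstat = silh.mean() / silh.std()  # higher means a better-separated clustering
        if tstat > best_tstat:
            best_tstat, best_labels = tstat, kmeans.labels_
    print(best_tstat, best_labels)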
3 | """ 4 | 5 | from mlfinlab.codependence.correlation import (angular_distance, absolute_angular_distance, squared_angular_distance, 6 | distance_correlation, kullback_leibler_distance, norm_distance) 7 | from mlfinlab.codependence.information import (get_mutual_info, get_optimal_number_of_bins, variation_of_information_score) 8 | from mlfinlab.codependence.codependence_matrix import (get_dependence_matrix, get_distance_matrix) 9 | from mlfinlab.codependence.gnpr_distance import (spearmans_rho, gpr_distance, gnpr_distance) 10 | from mlfinlab.codependence.optimal_transport import (optimal_transport_dependence) 11 | -------------------------------------------------------------------------------- /mlfinlab/codependence/codependence_matrix.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implementation lets user generate dependence and distance matrix based on the various methods of Information 3 | Codependence described in Cornell lecture notes on Codependence: 4 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from mlfinlab.codependence.information import variation_of_information_score, get_mutual_info 11 | from mlfinlab.codependence.correlation import distance_correlation 12 | from mlfinlab.codependence.gnpr_distance import spearmans_rho, gpr_distance, gnpr_distance 13 | from mlfinlab.codependence.optimal_transport import optimal_transport_dependence 14 | 15 | 16 | # pylint: disable=invalid-name 17 | 18 | def get_dependence_matrix(df: pd.DataFrame, dependence_method: str, theta: float = 0.5, 19 | n_bins: int = None, normalize: bool = True, 20 | estimator: str = 'standard', target_dependence: str = 'comonotonicity', 21 | gaussian_corr: float = 0.7, var_threshold: float = 0.2) -> pd.DataFrame: 22 | """ 23 | This function returns a dependence matrix for elements given in the dataframe using the chosen dependence method. 24 | 25 | List of supported algorithms to use for generating the dependence matrix: ``information_variation``, 26 | ``mutual_information``, ``distance_correlation``, ``spearmans_rho``, ``gpr_distance``, ``gnpr_distance``, 27 | ``optimal_transport``. 28 | 29 | :param df: (pd.DataFrame) Features. 30 | :param dependence_method: (str) Algorithm to be use for generating dependence_matrix. 31 | :param theta: (float) Type of information being tested in the GPR and GNPR distances. Falls in range [0, 1]. 32 | (0.5 by default) 33 | :param n_bins: (int) Number of bins for discretization in ``information_variation`` and ``mutual_information``, 34 | if None the optimal number will be calculated. (None by default) 35 | :param normalize: (bool) Flag used to normalize the result to [0, 1] in ``information_variation`` and 36 | ``mutual_information``. (True by default) 37 | :param estimator: (str) Estimator to be used for calculation in ``mutual_information``. 38 | [``standard``, ``standard_copula``, ``copula_entropy``] (``standard`` by default) 39 | :param target_dependence: (str) Type of target dependence to use in ``optimal_transport``. 40 | [``comonotonicity``, ``countermonotonicity``, ``gaussian``, 41 | ``positive_negative``, ``different_variations``, ``small_variations``] 42 | (``comonotonicity`` by default) 43 | :param gaussian_corr: (float) Correlation coefficient to use when creating ``gaussian`` and 44 | ``small_variations`` copulas. 
[from 0 to 1] (0.7 by default)
45 |     :param var_threshold: (float) Variation threshold to use in ``small_variations``.
46 |                           Sets the relative area of correlation in a copula. [from 0 to 1] (0.2 by default)
47 |     :return: (pd.DataFrame) Dependence matrix.
48 |     """
49 | 
50 |     pass
51 | 
52 | 
53 | def get_distance_matrix(X: pd.DataFrame, distance_metric: str = 'angular') -> pd.DataFrame:
54 |     """
55 |     Applies a distance operator to a dependence matrix.
56 | 
57 |     This allows turning a correlation matrix into a distance matrix. The distances used are true metrics.
58 | 
59 |     List of supported distance metrics to use for generating the distance matrix: ``angular``, ``squared_angular``,
60 |     and ``absolute_angular``.
61 | 
62 |     :param X: (pd.DataFrame) Dataframe to which the distance operator is to be applied.
63 |     :param distance_metric: (str) The distance metric to be used for generating the distance matrix.
64 |     :return: (pd.DataFrame) Distance matrix.
65 |     """
66 | 
67 |     pass
68 | 
-------------------------------------------------------------------------------- /mlfinlab/codependence/correlation.py: --------------------------------------------------------------------------------
1 | """
2 | Correlation-based distances and various modifications (angular, absolute, squared) described in Cornell lecture notes:
3 | Codependence: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes
4 | """
5 | 
6 | import numpy as np
7 | import pandas as pd
8 | from scipy.spatial.distance import squareform, pdist
9 | 
10 | 
11 | # pylint: disable=invalid-name
12 | 
13 | 
14 | def angular_distance(x: np.array, y: np.array) -> float:
15 |     """
16 |     Returns the angular distance between two vectors. Angular distance is a slight modification of the Pearson correlation
17 |     which satisfies metric conditions.
18 | 
19 |     Formula used for calculation:
20 | 
21 |     Ang_Distance = (1/2 * (1 - Corr))^(1/2)
22 | 
23 |     Read Cornell lecture notes for more information about angular distance:
24 |     https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
25 | 
26 |     :param x: (np.array/pd.Series) X vector.
27 |     :param y: (np.array/pd.Series) Y vector.
28 |     :return: (float) Angular distance.
29 |     """
30 | 
31 |     pass
32 | 
33 | 
34 | def absolute_angular_distance(x: np.array, y: np.array) -> float:
35 |     """
36 |     Returns the absolute angular distance between two vectors. It is a modification of angular distance where the absolute
37 |     value of the Pearson correlation coefficient is used.
38 | 
39 |     Formula used for calculation:
40 | 
41 |     Abs_Ang_Distance = (1/2 * (1 - abs(Corr)))^(1/2)
42 | 
43 |     Read Cornell lecture notes for more information about absolute angular distance:
44 |     https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
45 | 
46 |     :param x: (np.array/pd.Series) X vector.
47 |     :param y: (np.array/pd.Series) Y vector.
48 |     :return: (float) Absolute angular distance.
49 |     """
50 | 
51 |     pass
52 | 
53 | 
54 | def squared_angular_distance(x: np.array, y: np.array) -> float:
55 |     """
56 |     Returns the squared angular distance between two vectors. It is a modification of angular distance where the square of
57 |     the Pearson correlation coefficient is used.
58 | 
59 |     Formula used for calculation:
60 | 
61 |     Squared_Ang_Distance = (1/2 * (1 - (Corr)^2))^(1/2)
62 | 
63 |     Read Cornell lecture notes for more information about squared angular distance:
64 |     https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
65 | 
66 |     :param x: (np.array/pd.Series) X vector.
67 | :param y: (np.array/pd.Series) Y vector. 68 | :return: (float) Squared angular distance. 69 | """ 70 | 71 | pass 72 | 73 | 74 | def distance_correlation(x: np.array, y: np.array) -> float: 75 | """ 76 | Returns distance correlation between two vectors. Distance correlation captures both linear and non-linear 77 | dependencies. 78 | 79 | Formula used for calculation: 80 | 81 | Distance_Corr[X, Y] = dCov[X, Y] / (dCov[X, X] * dCov[Y, Y])^(1/2) 82 | 83 | dCov[X, Y] is the average Hadamard product of the doubly-centered Euclidean distance matrices of X, Y. 84 | 85 | Read Cornell lecture notes for more information about distance correlation: 86 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes. 87 | 88 | :param x: (np.array/pd.Series) X vector. 89 | :param y: (np.array/pd.Series) Y vector. 90 | :return: (float) Distance correlation coefficient. 91 | """ 92 | 93 | pass 94 | 95 | def kullback_leibler_distance(corr_a, corr_b): 96 | """ 97 | Returns the Kullback-Leibler distance between two correlation matrices; all elements must be positive. 98 | Formula used for calculation: 99 | kullback_leibler_distance[X, Y] = 0.5 * ( Log( det(Y) / det(X) ) + tr((Y ^ -1).X) - n ) 100 | Where n is the dimension of the space spanned by X. 101 | Read Don H. Johnson's research paper for more information on the Kullback-Leibler distance. 102 | 103 | 104 | :param corr_a: (np.array/pd.Series/pd.DataFrame) Numpy array of the first correlation matrix. 105 | :param corr_b: (np.array/pd.Series/pd.DataFrame) Numpy array of the second correlation matrix. 106 | :return: (np.float64) The Kullback-Leibler distance between the two matrices. 107 | """ 108 | 109 | pass 110 | 111 | 112 | def norm_distance(matrix_a, matrix_b, r_val=2): 113 | """ 114 | Returns the normalized distance between two matrices. 115 | This function is a wrapper for numpy's linear algebra method (numpy.linalg.norm). 116 | 117 | Formula used to normalize the matrix: 118 | norm_distance[X, Y] = sum( abs(X - Y) ^ r ) ^ (1/r) 119 | Where r is a parameter. r=1 City block (L1 norm), r=2 Euclidean distance (L2 norm), 120 | r=inf Supremum (L_inf norm). For values of r < 1, the result is not really a mathematical ‘norm’. 121 | 122 | :param matrix_a: (np.array/pd.Series/pd.DataFrame) Array of the first matrix. 123 | :param matrix_b: (np.array/pd.Series/pd.DataFrame) Array of the second matrix. 124 | :param r_val: (int/str) The r value of the normalization formula. (``2`` by default, Any Integer) 125 | :return: (np.float64) The norm distance between the two matrices. 126 | """ 127 | 128 | pass -------------------------------------------------------------------------------- /mlfinlab/codependence/gnpr_distance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of distance using the Generic Non-Parametric Representation approach from "Some contributions to the 3 | clustering of financial time series and applications to credit default swaps" by Gautier Marti 4 | https://www.researchgate.net/publication/322714557 5 | """ 6 | import numpy as np 7 | import pandas as pd 8 | from scipy.stats import spearmanr 9 | import ot 10 | 11 | # pylint: disable=invalid-name 12 | 13 | 14 | def spearmans_rho(x: np.array, y: np.array) -> float: 15 | """ 16 | Calculates a statistical estimate of Spearman's rho - a copula-based dependence measure.
17 | 18 | Formula for calculation: 19 | rho = 1 - (6)/(T*(T^2-1)) * Sum((X_t-Y_t)^2) 20 | 21 | It is more robust to noise and can be defined if the variables have an infinite second moment. 22 | This statistic is described in more detail in the work by Gautier Marti 23 | https://www.researchgate.net/publication/322714557 (p.54) 24 | 25 | This method is a wrapper for the scipy spearmanr function. For more details about the function and its parameters, 26 | please visit scipy documentation 27 | https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.spearmanr.html 28 | 29 | :param x: (np.array/pd.Series) X vector 30 | :param y: (np.array/pd.Series) Y vector (same number of observations as X) 31 | :return: (float) Spearman's rho statistical estimate 32 | """ 33 | 34 | # Coefficient calculation 35 | 36 | pass 37 | 38 | 39 | def gpr_distance(x: np.array, y: np.array, theta: float) -> float: 40 | """ 41 | Calculates the distance between two Gaussians under the Generic Parametric Representation (GPR) approach. 42 | 43 | According to the original work https://www.researchgate.net/publication/322714557 (p.70): 44 | "This is a fast and good proxy for distance d_theta when the first two moments ... predominate". But it's not 45 | a good metric for heavy-tailed distributions. 46 | 47 | Parameter theta defines what type of information dependency is being tested: 48 | - for theta = 0 the distribution information is tested 49 | - for theta = 1 the dependence information is tested 50 | - for theta = 0.5 a mix of both information types is tested 51 | 52 | With theta in [0, 1] the distance lies in the range [0, 1] and is a metric. (See original work for proof, p.71) 53 | 54 | :param x: (np.array/pd.Series) X vector. 55 | :param y: (np.array/pd.Series) Y vector (same number of observations as X). 56 | :param theta: (float) Type of information being tested. Falls in range [0, 1]. 57 | :return: (float) Distance under GPR approach. 58 | """ 59 | 60 | pass 61 | 62 | 63 | def gnpr_distance(x: np.array, y: np.array, theta: float, n_bins: int = 50) -> float: 64 | """ 65 | Calculates the empirical distance between two random variables under the Generic Non-Parametric Representation 66 | (GNPR) approach. 67 | 68 | Formula for the distance is taken from https://www.researchgate.net/publication/322714557 (p.72). 69 | 70 | Parameter theta defines what type of information dependency is being tested: 71 | - for theta = 0 the distribution information is tested 72 | - for theta = 1 the dependence information is tested 73 | - for theta = 0.5 a mix of both information types is tested 74 | 75 | With theta in [0, 1] the distance lies in the range [0, 1] and is a metric. 76 | (See original work for proof, p.71) 77 | 78 | This method is modified as it uses the 1D Optimal Transport Distance to measure 79 | distribution distance. This solves the issue of defining support and choosing 80 | a number of bins. The number of bins can be given as an input to speed up calculations; 81 | a big number of bins can take a long time to calculate. 82 | 83 | :param x: (np.array/pd.Series) X vector. 84 | :param y: (np.array/pd.Series) Y vector (same number of observations as X). 85 | :param theta: (float) Type of information being tested. Falls in range [0, 1]. 86 | :param n_bins: (int) Number of bins to use to split the X and Y vector observations. 87 | (50 by default) 88 | :return: (float) Distance under GNPR approach.
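Example of the intended usage (an illustrative sketch only - the random vectors below are hypothetical and not part of the library)::

    import numpy as np
    from mlfinlab.codependence.gnpr_distance import spearmans_rho, gnpr_distance

    x = np.random.normal(size=1000)
    y = 0.5 * x + np.random.normal(size=1000)

    rho = spearmans_rho(x, y)              # Copula-based dependence estimate.
    dist = gnpr_distance(x, y, theta=0.5)  # Mix of distribution and dependence information.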
89 | """ 90 | 91 | pass 92 | -------------------------------------------------------------------------------- /mlfinlab/codependence/information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementations of mutual information (I) and variation of information (VI) codependence measures from Cornell 3 | lecture slides: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes 4 | """ 5 | import numpy as np 6 | import scipy.stats as ss 7 | from sklearn.metrics import mutual_info_score 8 | 9 | 10 | # pylint: disable=invalid-name 11 | 12 | def get_optimal_number_of_bins(num_obs: int, corr_coef: float = None) -> int: 13 | """ 14 | Calculates optimal number of bins for discretization based on number of observations 15 | and correlation coefficient (univariate case). 16 | 17 | Algorithms used in this function were originally proposed in the works of Hacine-Gharbi et al. (2012) 18 | and Hacine-Gharbi and Ravier (2018). They are described in the Cornell lecture notes: 19 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes (p.26) 20 | 21 | :param num_obs: (int) Number of observations. 22 | :param corr_coef: (float) Correlation coefficient, used to estimate the number of bins for univariate case. 23 | :return: (int) Optimal number of bins. 24 | """ 25 | 26 | pass 27 | 28 | 29 | def get_mutual_info(x: np.array, y: np.array, n_bins: int = None, normalize: bool = False, 30 | estimator: str = 'standard') -> float: 31 | """ 32 | Returns mutual information (MI) between two vectors. 33 | 34 | This function uses the discretization with the optimal bins algorithm proposed in the works of 35 | Hacine-Gharbi et al. (2012) and Hacine-Gharbi and Ravier (2018). 36 | 37 | Read Cornell lecture notes for more information about the mutual information: 38 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes. 39 | 40 | This function supports multiple ways the mutual information can be estimated: 41 | 42 | 1. ``standard`` - the standard way of estimation - binning observations according to a given 43 | number of bins and applying the MI formula. 44 | 2. ``standard_copula`` - estimating the copula (as a normalized ranking of the observations) and 45 | applying the standard mutual information estimator on it. 46 | 3. ``copula_entropy`` - estimating the copula (as a normalized ranking of the observations) and 47 | calculating its entropy. Then MI estimator = (-1) * copula entropy. 48 | 49 | The last two estimators' implementation is taken from the blog post by Dr. Gautier Marti. 50 | Read this blog post for more information about the differences in the estimators: 51 | https://gmarti.gitlab.io/qfin/2020/07/01/mutual-information-is-copula-entropy.html 52 | 53 | :param x: (np.array) X vector. 54 | :param y: (np.array) Y vector. 55 | :param n_bins: (int) Number of bins for discretization, if None the optimal number will be calculated. 56 | (None by default) 57 | :param normalize: (bool) Flag used to normalize the result to [0, 1]. (False by default) 58 | :param estimator: (str) Estimator to be used for calculation. [``standard``, ``standard_copula``, ``copula_entropy``] 59 | (``standard`` by default) 60 | :return: (float) Mutual information score. 61 | """ 62 | 63 | pass 64 | 65 | 66 | def variation_of_information_score(x: np.array, y: np.array, n_bins: int = None, normalize: bool = False) -> float: 67 | """ 68 | Returns variantion of information (VI) between two vectors. 
69 | 70 | This function uses discretization with the optimal-bins algorithm proposed in the works of 71 | Hacine-Gharbi et al. (2012) and Hacine-Gharbi and Ravier (2018). 72 | 73 | Read Cornell lecture notes for more information about the variation of information: 74 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes. 75 | 76 | :param x: (np.array) X vector. 77 | :param y: (np.array) Y vector. 78 | :param n_bins: (int) Number of bins for discretization, if None the optimal number will be calculated. 79 | (None by default) 80 | :param normalize: (bool) True to normalize the result to [0, 1]. (False by default) 81 | :return: (float) Variation of information score. 82 | """ 83 | 84 | pass 85 | -------------------------------------------------------------------------------- /mlfinlab/codependence/optimal_transport.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementations of the Optimal Copula Transport dependence measure proposed by Marti et al.: https://arxiv.org/abs/1610.09659 3 | and implemented in the blog post by Marti: https://gmarti.gitlab.io/qfin/2020/06/25/copula-optimal-transport-dependence.html 4 | """ 5 | import numpy as np 6 | import scipy.stats as ss 7 | import ot 8 | 9 | 10 | # pylint: disable=invalid-name 11 | 12 | def _get_empirical_copula(x: np.array, y: np.array) -> np.array: 13 | """ 14 | Calculate the empirical copula using ranked observations. 15 | 16 | :param x: (np.array) X vector. 17 | :param y: (np.array) Y vector. 18 | :return: (np.array) Empirical copula. 19 | """ 20 | 21 | pass 22 | 23 | 24 | def optimal_transport_dependence(x: np.array, y: np.array, target_dependence: str = 'comonotonicity', 25 | gaussian_corr: float = 0.7, var_threshold: float = 0.2) -> float: 26 | """ 27 | Calculates optimal copula transport dependence between the empirical copula of the two vectors and a target copula. 28 | 29 | This implementation is based on the blog post by Marti: 30 | https://gmarti.gitlab.io/qfin/2020/06/25/copula-optimal-transport-dependence.html 31 | 32 | The target and forget copulas are used as reference points: the measure reflects where the empirical 33 | copula stands between them in the space of copulas. The forget copula used is the copula associated with 34 | independent random variables. The target copula is defined by the target_dependence parameter. 35 | 36 | Currently, these target_dependence copulas are supported: 37 | 38 | - ``comonotonicity`` - a comonotone copula. 39 | - ``countermonotonicity`` - a countermonotone copula. 40 | - ``gaussian`` - a Gaussian copula with a custom correlation coefficient. 41 | - ``positive_negative`` - a copula of both positive and negative correlation. 42 | - ``different_variations`` - a copula with some elements having extreme variations, 43 | while those of others are relatively small, and conversely. 44 | - ``small_variations`` - a copula with elements being positively correlated for small variations 45 | but uncorrelated otherwise. 46 | - ``v-shape`` - a copula that is seen with a vol index vs. a returns index: when returns of the index 47 | are extreme, vol is usually high; when returns are small in absolute value, vol is usually low. 48 | 49 | :param x: (np.array) X vector. 50 | :param y: (np.array) Y vector. 51 | :param target_dependence: (str) Type of target dependence to use when measuring distance. 52 | (``comonotonicity`` by default) 53 | :param gaussian_corr: (float) Correlation coefficient to use when creating ``gaussian`` and 54 | ``small_variations`` copulas.
[from 0 to 1] (0.7 by default) 55 | :param var_threshold: (float) Variation threshold to use in ``small_variations``. 56 | Sets the relative area of correlation in a copula. [from 0 to 1] (0.2 by default) 57 | :return: (float) Optimal copula transport dependence. 58 | """ 59 | 60 | pass 61 | 62 | 63 | def _compute_copula_ot_dependence(empirical: np.array, target: np.array, forget: np.array, 64 | n_obs: int) -> float: 65 | """ 66 | Calculates the optimal copula transport dependence measure. 67 | 68 | :param empirical: (np.array) Empirical copula. 69 | :param target: (np.array) Target copula. 70 | :param forget: (np.array) Forget copula. 71 | :param n_obs: (int) Number of observations. 72 | :return: (float) Optimal copula transport dependence. 73 | """ 74 | 75 | pass 76 | 77 | 78 | def _create_target_copula(target_dependence: str, n_obs: int, gauss_corr: float, 79 | var_threshold: float) -> np.array: 80 | """ 81 | Creates a target copula with given dependence and number of observations. 82 | 83 | :param target_dependence: (str) Type of dependence to use for copula creation. [``comonotonicity``, 84 | ``countermonotonicity``, ``gaussian``, ``positive_negative``, 85 | ``different_variations``, ``small_variations``, ``v-shape``] 86 | :param n_obs: (int) Number of observations to use for copula creation. 87 | :param gauss_corr: (float) Correlation coefficient to use when creating ``gaussian`` and 88 | ``small_variations`` copulas. 89 | :param var_threshold: (float) Variation threshold to use in ``small_variations``. 90 | :return: (np.array) Resulting copula. 91 | """ 92 | 93 | pass 94 | -------------------------------------------------------------------------------- /mlfinlab/cross_validation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 7: Cross Validation 3 | and stacked (multi-asset datasets) cross-validation functions. 4 | """ 5 | 6 | from mlfinlab.cross_validation.cross_validation import (ml_get_train_times, ml_cross_val_score, stacked_ml_cross_val_score, 7 | PurgedKFold, StackedPurgedKFold) 8 | from mlfinlab.cross_validation.combinatorial import (CombinatorialPurgedKFold, StackedCombinatorialPurgedKFold) 9 | -------------------------------------------------------------------------------- /mlfinlab/cross_validation/combinatorial.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the following classes from Chapter 12 of AFML: 3 | 4 | - Combinatorial Purged Cross-Validation class. 5 | - Stacked Combinatorial Purged Cross-Validation class. 6 | """ 7 | # pylint: disable=too-many-locals, arguments-differ, invalid-name, unused-argument 8 | 9 | from itertools import combinations 10 | from typing import List 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from scipy.special import comb 15 | from sklearn.model_selection import KFold 16 | 17 | from mlfinlab.cross_validation.cross_validation import ml_get_train_times 18 | 19 | 20 | def _get_number_of_backtest_paths(n_train_splits: int, n_test_splits: int) -> int: 21 | """ 22 | Number of combinatorial paths for CPCV(N,K). 23 | 24 | :param n_train_splits: (int) Number of train splits. 25 | :param n_test_splits: (int) Number of test splits. 26 | :return: (int) Number of backtest paths for CPCV(N,K). 27 | """ 28 | 29 | pass 30 | 31 | 32 | class CombinatorialPurgedKFold(KFold): 33 | """ 34 | Advances in Financial Machine Learning, Chapter 12.
35 | 36 | Implements Combinatorial Purged Cross Validation (CPCV). 37 | 38 | The train is purged of observations overlapping test-label intervals. 39 | Test set is assumed contiguous (shuffle=False), w/o training samples in between. 40 | """ 41 | 42 | def __init__(self, 43 | n_splits: int = 3, 44 | n_test_splits: int = 2, 45 | samples_info_sets: pd.Series = None, 46 | pct_embargo: float = 0.): 47 | """ 48 | Initialize. 49 | 50 | :param n_splits: (int) The number of splits. Defaults to 3. :param n_test_splits: (int) The number of test splits. Defaults to 2. 51 | :param samples_info_sets: (pd.Series) The information range on which each record is constructed from: 52 | *samples_info_sets.index*: Time when the information extraction started. 53 | *samples_info_sets.value*: Time when the information extraction ended. 54 | :param pct_embargo: (float) Percent that determines the embargo size. 55 | """ 56 | 57 | pass 58 | 59 | def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List: 60 | """ 61 | Using start and end indices of test splits from KFolds and the number of test splits (self.n_test_splits), 62 | generates combinatorial test range splits. 63 | 64 | :param splits_indices: (dict) Test fold integer index: [start test index, end test index]. 65 | :return: (list) Combinatorial test splits ([start index, end index]). 66 | """ 67 | 68 | pass 69 | 70 | def _fill_backtest_paths(self, train_indices: list, test_splits: list): 71 | """ 72 | Using start and end indices of test splits and purged/embargoed train indices from CPCV, find the backtest path and 73 | place in the path where these indices should be used. 74 | 75 | :param train_indices: (list) Purged/embargoed train indices for this CPCV split. :param test_splits: (list) List of lists with first element corresponding to test start index and second - test end. 76 | """ 77 | 78 | pass 79 | 80 | def split(self, 81 | X: pd.DataFrame, 82 | y: pd.Series = None, 83 | groups=None) -> tuple: 84 | """ 85 | The main method to call for the CombinatorialPurgedKFold class. 86 | 87 | :param X: (pd.DataFrame) Samples dataset that is to be split. 88 | :param y: (pd.Series) Sample labels series. 89 | :param groups: (array-like), with shape (n_samples,), optional 90 | Group labels for the samples used while splitting the dataset into 91 | train/test set. 92 | :return: (tuple) [train list of sample indices, and test list of sample indices]. 93 | """ 94 | 95 | pass 96 | 97 | 98 | class StackedCombinatorialPurgedKFold(KFold): 99 | """ 100 | Advances in Financial Machine Learning, Chapter 12. 101 | 102 | Implements Stacked Combinatorial Purged Cross Validation (CPCV). It implements CPCV for multi-asset datasets. 103 | 104 | The train is purged of observations overlapping test-label intervals. 105 | Test set is assumed contiguous (shuffle=False), w/o training samples in between. 106 | """ 107 | 108 | def __init__(self, 109 | n_splits: int = 3, 110 | n_test_splits: int = 2, 111 | samples_info_sets_dict: dict = None, 112 | pct_embargo: float = 0.): 113 | """ 114 | Initialize. 115 | 116 | :param n_splits: (int) The number of splits. Defaults to 3. :param n_test_splits: (int) The number of test splits. Defaults to 2. 117 | :param samples_info_sets_dict: (dict) Dictionary of samples info sets. 118 | ASSET_1: SAMPLE_INFO_SETS, ASSET_2:... 119 | 120 | *samples_info_sets.index*: Time when the information extraction started. 121 | *samples_info_sets.value*: Time when the information extraction ended. 122 | :param pct_embargo: (float) Percent that determines the embargo size.
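Example of the intended usage (an illustrative sketch; the asset names and series variables are hypothetical, and the path count follows the AFML Chapter 12 formula K/N * C(N, N-K))::

    samples_info_sets_dict = {
        'ASSET_1': samples_info_sets_asset_1,  # pd.Series of label end times, indexed by start times.
        'ASSET_2': samples_info_sets_asset_2,
    }
    cv = StackedCombinatorialPurgedKFold(n_splits=6, n_test_splits=2,
                                         samples_info_sets_dict=samples_info_sets_dict,
                                         pct_embargo=0.01)
    # CPCV(6, 2) yields 2/6 * C(6, 4) = 5 backtest paths.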
123 | """ 124 | 125 | pass 126 | 127 | def _fill_backtest_paths(self, asset, train_indices: list, test_splits: list): 128 | """ 129 | Using start and end indices of test splits and purged/embargoed train indices from CPCV, find backtest path and 130 | place in the path where these indices should be used. 131 | 132 | :param asset: (str) Asset for which backtest paths are filled. 133 | :param train_indices: (list) List of lists with first element corresponding to train start index, second - test end. 134 | :param test_splits: (list) List of lists with first element corresponding to test start index and second - test end. 135 | """ 136 | 137 | pass 138 | 139 | def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List: 140 | """ 141 | Using start and end indices of test splits from KFolds and number of test_splits (self.n_test_splits), 142 | generates combinatorial test ranges splits. 143 | 144 | :param splits_indices: (dict) Test fold integer index: [start test index, end test index]. 145 | :return: (list) Combinatorial test splits ([start index, end index]). 146 | """ 147 | 148 | pass 149 | 150 | def split(self, 151 | X_dict: dict, 152 | y_dict: dict = None, 153 | groups=None) -> tuple: 154 | """ 155 | The main method to call for the PurgedKFold class. 156 | 157 | :param X_dict: (dict) Dictionary of asset : X_{asset}. 158 | :param y_dict: (dict) Dictionary of asset : y_{asset}. 159 | :param groups: (array-like), with shape (n_samples,), optional 160 | Group labels for the samples used while splitting the dataset into 161 | train/test set. 162 | :return: (tuple) [train list of sample indices, and test list of sample indices]. 163 | """ 164 | 165 | pass 166 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for synthetic data generation. 3 | """ 4 | 5 | from mlfinlab.data_generation.corrgan import sample_from_corrgan 6 | from mlfinlab.data_generation.data_verification import (plot_pairwise_dist, plot_eigenvalues, plot_eigenvectors, 7 | plot_hierarchical_structure, plot_mst_degree_count, plot_stylized_facts, 8 | plot_time_series_dependencies, plot_optimal_hierarchical_cluster) 9 | from mlfinlab.data_generation.vines import (sample_from_cvine, sample_from_dvine, sample_from_ext_onion) 10 | from mlfinlab.data_generation.correlated_random_walks import generate_cluster_time_series 11 | from mlfinlab.data_generation.hcbm import (time_series_from_dist, generate_hcmb_mat) 12 | from mlfinlab.data_generation.bootstrap import (row_bootstrap, pair_bootstrap, block_bootstrap) 13 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/bootstrap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of generating bootstrapped matrices from 3 | "Bootstrap validation of links of a minimum spanning tree" by F. Musciotto, 4 | L. Marotta, S. Miccichè, and R. N. Mantegna https://arxiv.org/pdf/1802.03395.pdf. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | def row_bootstrap(mat, n_samples=1, size=None): 12 | """ 13 | Uses the Row Bootstrap method to generate a new matrix of size equal or smaller than the given matrix. 14 | 15 | It samples with replacement a random row from the given matrix. 
If the required number of bootstrapped 16 | columns is less than the number of columns of the original matrix, it randomly samples contiguous 17 | columns of the required size. It cannot generate a matrix greater than the original. 18 | 19 | It is inspired by the following paper: 20 | Musciotto, F., Marotta, L., Miccichè, S. and Mantegna, R.N., 2018. Bootstrap validation of 21 | links of a minimum spanning tree. Physica A: Statistical Mechanics and its Applications, 22 | 512, pp.1032-1043. 23 | 24 | :param mat: (pd.DataFrame/np.array) Matrix to sample from. 25 | :param n_samples: (int) Number of matrices to generate. 26 | :param size: (tuple) Size of the bootstrapped matrix. 27 | :return: (np.array) The generated bootstrapped matrices. Has shape (n_samples, size[0], size[1]). 28 | """ 29 | 30 | pass 31 | 32 | 33 | def pair_bootstrap(mat, n_samples=1, size=None): 34 | """ 35 | Uses the Pair Bootstrap method to generate a new correlation matrix of returns. 36 | 37 | It generates a correlation matrix based on the number of columns of the returns matrix given. It 38 | samples with replacement a pair of columns from the original matrix; the rows of the pair generate 39 | a new row-bootstrapped matrix. The correlation value of the pair of assets is calculated and 40 | its value is used to fill the corresponding value in the generated correlation matrix. 41 | 42 | It is inspired by the following paper: 43 | Musciotto, F., Marotta, L., Miccichè, S. and Mantegna, R.N., 2018. Bootstrap validation of 44 | links of a minimum spanning tree. Physica A: Statistical Mechanics and its Applications, 45 | 512, pp.1032-1043. 46 | 47 | :param mat: (pd.DataFrame/np.array) Returns matrix to sample from. 48 | :param n_samples: (int) Number of matrices to generate. 49 | :param size: (int) Size of the bootstrapped correlation matrix. 50 | :return: (np.array) The generated bootstrapped correlation matrices. Has shape (n_samples, mat.shape[1], mat.shape[1]). 51 | """ 52 | 53 | pass 54 | 55 | 56 | def block_bootstrap(mat, n_samples=1, size=None, block_size=None): 57 | """ 58 | Uses the Block Bootstrap method to generate a new matrix of size equal to or smaller than the given matrix. 59 | 60 | It divides the original matrix into blocks of the given size. It samples with replacement random 61 | blocks to populate the bootstrapped matrix. It cannot generate a matrix greater than the original. 62 | 63 | It is inspired by the following paper: 64 | Künsch, H.R., 1989. The jackknife and the bootstrap for general stationary observations. 65 | Annals of Statistics, 17(3), pp.1217-1241. 66 | 67 | :param mat: (pd.DataFrame/np.array) Matrix to sample from. 68 | :param n_samples: (int) Number of matrices to generate. 69 | :param size: (tuple) Size of the bootstrapped matrix. 70 | :param block_size: (tuple) Size of the blocks. 71 | :return: (np.array) The generated bootstrapped matrices. Has shape (n_samples, size[0], size[1]). 72 | """ 73 | 74 | pass 75 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/correlated_random_walks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains methods for generating correlated random walks.
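Example of the intended usage (an illustrative sketch; the parameter values are arbitrary)::

    from mlfinlab.data_generation.correlated_random_walks import generate_cluster_time_series

    df = generate_cluster_time_series(n_series=10, t_samples=200,
                                      k_corr_clusters=2, d_dist_clusters=2)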
3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | def generate_cluster_time_series(n_series, t_samples=100, k_corr_clusters=1, 10 | d_dist_clusters=1, rho_main=0.1, rho_corr=0.3, price_start=100.0, 11 | dists_clusters=("normal", "normal", "student-t", "normal", "student-t")): 12 | """ 13 | Generates a synthetic time series of correlation and distribution clusters. 14 | 15 | It is reproduced with modifications from the following paper: 16 | `Donnat, P., Marti, G. and Very, P., 2016. Toward a generic representation of random 17 | variables for machine learning. Pattern Recognition Letters, 70, pp.24-31. 18 | `_ 19 | 20 | `www.datagrapple.com. (n.d.). DataGrapple - Tech: A GNPR tutorial: How to cluster random walks. 21 | [online] Available at: [Accessed 26 Aug. 2020]. 22 | `_ 23 | 24 | This method creates `n_series` time series of length `t_samples`. Each time series is divided 25 | into `k_corr_clusters` correlation clusters. Each correlation cluster is subdivided into 26 | `d_dist_clusters` distribution clusters. 27 | A main distribution is sampled from a normal distribution with mean = 0 and stdev = 1, adjusted 28 | by a `rho_main` factor. The correlation clusters are sampled from a given distribution, are generated 29 | once, and adjusted by a `rho_corr` factor. The distribution clusters are sampled from other 30 | given distributions, and adjusted by (1 - `rho_main` - `rho_corr`). They are sampled for each time series. 31 | These three series are added together to form a time series of returns. The final time series 32 | is the cumulative sum of the returns, with a start price given by `price_start`. 33 | 34 | :param n_series: (int) Number of time series to generate. 35 | :param t_samples: (int) Number of samples in each time series. 36 | :param k_corr_clusters: (int) Number of correlation clusters in each time series. 37 | :param d_dist_clusters: (int) Number of distribution clusters in each time series. 38 | :param rho_main: (float): Strength of main time series distribution. 39 | :param rho_corr: (float): Strength of correlation cluster distribution. 40 | :param price_start: (float) Starting price of the time series. 41 | :param dists_clusters: (list) List containing the names of the distributions to sample from. 42 | The following numpy distributions are available: "normal" = normal(0, 1), "normal_2" = normal(0, 2), 43 | "student-t" = standard_t(3)/sqrt(3), "laplace" = laplace(1/sqrt(2)). The first disitribution 44 | is used to sample for the correlation clusters (k_corr_clusters), the remaining ones are used 45 | to sample for the distribution clusters (d_dist_clusters). 46 | :return: (pd.DataFrame) Generated time series. Has size (t_samples, n_series). 47 | """ 48 | 49 | pass 50 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/corrgan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019, Hudson and Thames Quantitative Research 2 | # All rights reserved 3 | # Read more: https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt 4 | """ 5 | Implementation of sampling realistic financial correlation matrices from 6 | "CorrGAN: Sampling Realistic Financial Correlation Matrices using 7 | Generative Adversarial Networks" by Gautier Marti. 
8 | https://arxiv.org/pdf/1910.09504.pdf 9 | """ 10 | from os import listdir, path 11 | import numpy as np 12 | from scipy.cluster import hierarchy 13 | from statsmodels.stats.correlation_tools import corr_nearest 14 | 15 | 16 | def sample_from_corrgan(model_loc, dim=10, n_samples=1): 17 | """ 18 | Samples correlation matrices from the pre-trained CorrGAN network. 19 | 20 | It is reproduced with modifications from the following paper: 21 | Marti, G., 2020, May. CorrGAN: Sampling Realistic Financial Correlation Matrices Using 22 | Generative Adversarial Networks. In ICASSP 2020-2020 IEEE International Conference on 23 | Acoustics, Speech and Signal Processing (ICASSP) (pp. 8459-8463). IEEE. 24 | 25 | 26 | It loads the appropriate CorrGAN model for the required dimension, generates a matrix output 27 | from this network, symmetrizes this matrix, and finds the nearest correlation matrix 28 | that is positive semi-definite. Finally, it maximizes the sum of the similarities between 29 | adjacent leaves to arrange it with hierarchical clustering. 30 | 31 | The CorrGAN network was trained on the correlation profiles of the S&P 500 stocks. Therefore 32 | the output retains these properties. In addition, the final output retains the following 33 | 6 stylized facts: 34 | 35 | 1. Distribution of pairwise correlations is significantly shifted to the positive. 36 | 37 | 2. Eigenvalues follow the Marchenko-Pastur distribution, but for a very large first 38 | eigenvalue (the market). 39 | 40 | 3. Eigenvalues follow the Marchenko-Pastur distribution, but for a couple of other 41 | large eigenvalues (industries). 42 | 43 | 4. Perron-Frobenius property (first eigenvector has positive entries). 44 | 45 | 5. Hierarchical structure of correlations. 46 | 47 | 6. Scale-free property of the corresponding Minimum Spanning Tree (MST). 48 | 49 | :param model_loc: (str) Location of folder containing CorrGAN models. 50 | :param dim: (int) Dimension of correlation matrix to sample. 51 | In the range [2, 200]. 52 | :param n_samples: (int) Number of samples to generate. 53 | :return: (np.array) Sampled correlation matrices of shape (n_samples, dim, dim). 54 | """ 55 | 56 | pass 57 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/hcbm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of the Hierarchical Correlation Block Model (HCBM) matrix. 3 | "Clustering financial time series: How long is enough?" by Marti, G., Andler, S., Nielsen, F. and Donnat, P. 4 | https://www.ijcai.org/Proceedings/16/Papers/367.pdf 5 | """ 6 | import numpy as np 7 | import pandas as pd 8 | from statsmodels.sandbox.distributions.multivariate import multivariate_t_rvs 9 | 10 | 11 | def _hcbm_mat_helper(mat, n_low=0, n_high=214, rho_low=0.1, rho_high=0.9, blocks=4, depth=4): 12 | """ 13 | Helper function for `generate_hcmb_mat` that recursively places rho values into the HCBM matrix 14 | given as an input. 15 | 16 | By using a uniform distribution we select the start and end locations of the blocks in the 17 | matrix. For each block, we recurse depth times and repeat splitting up the sub-matrix into 18 | blocks. Each depth level has a unique correlation (rho) value generated from a uniform 19 | distribution, bounded by `rho_low` and `rho_high`. This function modifies the 20 | `mat` parameter in place (it works by side effect). 21 | 22 | It is reproduced with modifications from the following paper: 23 | Marti, G., Andler, S., Nielsen, F.
and Donnat, P., 2016. 24 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017. 25 | 26 | 27 | :param mat: (np.array) Parent HCBM matrix. 28 | :param n_low: (int) Start location of the HCBM matrix to work on. 29 | :param n_high: (int) End location of the HCBM matrix to work on. 30 | :param rho_low: (float) Lower correlation bound of the matrix. Must be greater than or equal 31 | to 0. 32 | :param rho_high: (float) Upper correlation bound of the matrix. Must be less than or equal to 1. 33 | :param blocks: (int) Maximum number of blocks to generate per level of depth. 34 | :param depth: (int) Depth of recursion for generating new blocks. 35 | """ 36 | 37 | pass 38 | 39 | 40 | def generate_hcmb_mat(t_samples, n_size, rho_low=0.1, rho_high=0.9, blocks=4, depth=4, permute=False): 41 | """ 42 | Generates a Hierarchical Correlation Block Model (HCBM) matrix of correlation values. 43 | 44 | By using a uniform distribution we select the start and end locations of the blocks in the 45 | matrix. For each block, we recurse depth times and repeat splitting up the sub-matrix into 46 | blocks. Each depth level has a unique correlation (rho) value generated from a uniform 47 | distribution, bounded by `rho_low` and `rho_high`. 48 | 49 | It is reproduced with modifications from the following paper: 50 | Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016. 51 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017. 52 | 53 | 54 | :param t_samples: (int) Number of HCBM matrices to generate. 55 | :param n_size: (int) Size of the HCBM matrix. 56 | :param rho_low: (float) Lower correlation bound of the matrix. Must be greater than or equal to 0. 57 | :param rho_high: (float) Upper correlation bound of the matrix. Must be less than or equal to 1. 58 | :param blocks: (int) Number of blocks to generate per level of depth. 59 | :param depth: (int) Depth of recursion for generating new blocks. 60 | :param permute: (bool) Whether to permute the final HCBM matrix. 61 | :return: (np.array) Generated HCBM matrix of shape (t_samples, n_size, n_size). 62 | """ 63 | 64 | pass 65 | 66 | 67 | def time_series_from_dist(corr, t_samples=1000, dist="normal", deg_free=3): 68 | """ 69 | Generates a time series from a given correlation matrix. 70 | 71 | It uses multivariate sampling from distributions to create the time series. It supports 72 | normal and student-t distributions. This method relies on and acts as a wrapper for the 73 | `np.random.multivariate_normal` and 74 | `statsmodels.sandbox.distributions.multivariate.multivariate_t_rvs` modules. 75 | 76 | 77 | 78 | It is reproduced with modifications from the following paper: 79 | Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016. 80 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017. 81 | 82 | 83 | :param corr: (np.array) Correlation matrix. 84 | :param t_samples: (int) Number of samples in the time series. 85 | :param dist: (str) Type of distributions to use. 86 | Can take the values ["normal", "student"]. 87 | :param deg_free: (int) Degrees of freedom. Only used for the student-t distribution. 88 | :return: (pd.DataFrame) The resulting time series of shape (len(corr), t_samples).
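Example (a minimal sketch of the underlying sampling idea, not the library implementation; the 2x2 correlation matrix is hypothetical)::

    import numpy as np

    corr = np.array([[1.0, 0.8],
                     [0.8, 1.0]])
    # Draw 1000 multivariate normal samples with the given correlation structure.
    series = np.random.multivariate_normal(mean=np.zeros(2), cov=corr, size=1000)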
89 | """ 90 | 91 | pass 92 | -------------------------------------------------------------------------------- /mlfinlab/data_structures/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logic regarding the various sampling techniques, in particular: 3 | 4 | * Time Bars 5 | * Tick Bars 6 | * Volume Bars 7 | * Dollar Bars 8 | * Tick Imbalance Bars (EMA and Const) 9 | * Volume Imbalance Bars (EMA and Const) 10 | * Dollar Imbalance Bars (EMA and Const) 11 | * Tick Run Bars (EMA and Const) 12 | * Volume Run Bars (EMA and Const) 13 | * Dollar Run Bars (EMA and Const) 14 | """ 15 | 16 | from mlfinlab.data_structures.imbalance_data_structures import (get_ema_dollar_imbalance_bars, get_ema_volume_imbalance_bars, 17 | get_ema_tick_imbalance_bars, get_const_dollar_imbalance_bars, 18 | get_const_volume_imbalance_bars, get_const_tick_imbalance_bars) 19 | from mlfinlab.data_structures.run_data_structures import (get_ema_volume_run_bars, get_ema_tick_run_bars, 20 | get_ema_dollar_run_bars, get_const_volume_run_bars, 21 | get_const_tick_run_bars, get_const_dollar_run_bars) 22 | from mlfinlab.data_structures.standard_data_structures import (get_tick_bars, get_dollar_bars, get_volume_bars) 23 | from mlfinlab.data_structures.time_data_structures import get_time_bars 24 | -------------------------------------------------------------------------------- /mlfinlab/data_structures/time_data_structures.py: -------------------------------------------------------------------------------- 1 | """ 2 | Advances in Financial Machine Learning, Marcos Lopez de Prado 3 | Chapter 2: Financial Data Structures 4 | 5 | Time bars generation logic 6 | """ 7 | 8 | # Imports 9 | from typing import Union, Iterable, Optional 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from mlfinlab.data_structures.base_bars import BaseBars 14 | 15 | 16 | # pylint: disable=too-many-instance-attributes 17 | class TimeBars(BaseBars): 18 | """ 19 | Contains all of the logic to construct the time bars. This class shouldn't be used directly. 20 | Use get_time_bars instead 21 | """ 22 | 23 | def __init__(self, resolution: str, num_units: int, batch_size: int = 20000000): 24 | """ 25 | Constructor 26 | 27 | :param resolution: (str) Type of bar resolution: ['D', 'H', 'MIN', 'S'] 28 | :param num_units: (int) Number of days, minutes, etc. 29 | :param batch_size: (int) Number of rows to read in from the csv, per batch 30 | """ 31 | 32 | pass 33 | 34 | def _reset_cache(self): 35 | """ 36 | Implementation of abstract method _reset_cache for time bars 37 | """ 38 | 39 | pass 40 | 41 | def _extract_bars(self, data: Union[list, tuple, np.ndarray]) -> list: 42 | """ 43 | For loop which compiles time bars. 44 | We did investigate the use of trying to solve this in a vectorised manner but found that a For loop worked well. 45 | 46 | :param data: (tuple) Contains 3 columns - date_time, price, and volume. 47 | :return: (list) Extracted bars 48 | """ 49 | 50 | pass 51 | 52 | 53 | def get_time_bars(file_path_or_df: Union[str, Iterable[str], pd.DataFrame], resolution: str = 'D', num_units: int = 1, batch_size: int = 20000000, 54 | verbose: bool = True, to_csv: bool = False, output_path: Optional[str] = None): 55 | """ 56 | Creates Time Bars: date_time, open, high, low, close, volume, cum_buy_volume, cum_ticks, cum_dollar_value. 
57 | 58 | :param file_path_or_df: (str, iterable of str, or pd.DataFrame) Path to the csv file(s) or Pandas DataFrame containing raw tick data 59 | in the format [date_time, price, volume] 60 | :param resolution: (str) Resolution type ('D', 'H', 'MIN', 'S') 61 | :param num_units: (int) Number of resolution units (3 days for example, 2 hours) 62 | :param batch_size: (int) The number of rows per batch. Less RAM = smaller batch size. 63 | :param verbose: (bool) Print out batch numbers (True or False) 64 | :param to_csv: (bool) Save bars to csv after every batch run (True or False) 65 | :param output_path: (str) Path to csv file, if to_csv is True 66 | :return: (pd.DataFrame) Dataframe of time bars, if to_csv=True return None 67 | """ 68 | 69 | pass 70 | -------------------------------------------------------------------------------- /mlfinlab/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module implementing loading of typical financial datasets (stock prices, dollar bars, ticks). 3 | """ 4 | 5 | from mlfinlab.datasets.load_datasets import (load_dollar_bar_sample, load_stock_prices, load_tick_sample, 6 | generate_multi_asset_data_set) 7 | -------------------------------------------------------------------------------- /mlfinlab/datasets/data/tick_data.csv: -------------------------------------------------------------------------------- 1 | Date and Time,Price,Volume 2 | 2011/07/31 22:38:45.108,1205.0,1 3 | 2011/07/31 22:38:45.934,1005.0,1 4 | 2011/07/31 22:38:47.008,1304.75,6 5 | 2011/07/31 22:38:48.944,1904.75,1 6 | 2011/07/31 22:38:52.951,1304.75,20 7 | 2011/07/31 22:38:52.951,1304.75,1 8 | 2011/07/31 22:38:52.951,1304.75,5 9 | 2011/07/31 22:38:56.589,1304.5,1 10 | 2011/07/31 22:38:57.858,1304.5,1 11 | 2011/07/31 22:39:08.695,1304.5,1 12 | 2011/07/31 22:39:09.396,1304.5,1 13 | 2011/07/31 22:39:20.495,1304.5,1 14 | 2011/07/31 22:39:23.937,1304.5,1 15 | 2011/07/31 22:39:23.937,1304.5,5 16 | 2011/07/31 22:39:23.937,1304.5,1 17 | 2011/07/31 22:39:26.084,1304.5,1 18 | 2011/07/31 22:39:26.084,1304.5,1 19 | 2011/07/31 22:39:26.095,1304.5,4 20 | 2011/07/31 22:39:26.743,1304.5,11 21 | 2011/07/31 22:39:26.801,1304.5,9 22 | 2011/07/31 22:39:27.050,1304.5,1 23 | 2011/07/31 22:39:27.274,1304.5,1 24 | 2011/07/31 22:39:28.914,1304.5,1 25 | 2011/07/31 22:39:28.965,1304.5,6 26 | 2011/07/31 22:39:28.965,1304.5,1 27 | 2011/07/31 22:39:28.965,1304.5,1 28 | 2011/07/31 22:39:33.568,1304.75,1 29 | 2011/07/31 22:39:37.360,1304.5,1 30 | 2011/07/31 22:39:37.360,1304.5,1 31 | 2011/07/31 22:39:38.991,1304.5,1 32 | 2011/07/31 22:39:40.423,1304.5,1 33 | 2011/07/31 22:39:51.519,1304.5,1 34 | 2011/07/31 22:39:51.519,1304.5,4 35 | 2011/07/31 22:39:53.030,1304.5,1 36 | 2011/07/31 22:39:55.765,1304.5,1 37 | 2011/07/31 22:39:56.614,1304.5,1 38 | 2011/07/31 22:39:56.614,1304.5,1 39 | 2011/07/31 22:39:56.614,1304.5,5 40 | 2011/07/31 22:39:56.614,1304.5,1 41 | 2011/07/31 22:39:56.614,1304.5,1 42 | 2011/07/31 22:39:59.606,1304.5,10 43 | 2011/07/31 22:39:59.606,1304.5,2 44 | 2011/07/31 22:39:59.606,1304.5,4 45 | 2011/07/31 22:40:01.914,1304.5,1 46 | 2011/07/31 22:40:01.914,1304.5,1 47 | 2011/07/31 22:40:10.794,1304.75,6 48 | 2011/07/31 22:40:11.161,1304.5,4 49 | 2011/07/31 22:40:11.168,1304.75,4 50 | 2011/07/31 22:40:11.168,1304.75,1 51 | 2011/07/31 22:40:11.168,1304.75,1 52 | 2011/07/31 22:40:11.168,1304.75,1 53 | 2011/07/31 22:40:12.014,1304.5,2 54 | 2011/07/31 22:40:12.014,1304.5,3 55 | 2011/07/31 22:40:12.014,1304.5,1 56 | 2011/07/31 22:40:13.964,1304.75,1 57 |
2011/07/31 22:40:14.306,1304.75,1 58 | 2011/07/31 22:40:14.514,1304.75,1 59 | 2011/07/31 22:40:14.617,1304.75,1 60 | 2011/07/31 22:40:14.730,1304.75,1 61 | 2011/07/31 22:40:14.822,1304.75,1 62 | 2011/07/31 22:40:16.182,1305.0,9 63 | 2011/07/31 22:40:16.182,1305.0,1 64 | 2011/07/31 22:40:20.267,1304.75,1 65 | 2011/07/31 22:40:22.083,1305.0,1 66 | 2011/07/31 22:40:28.918,1304.75,1 67 | 2011/07/31 22:40:28.918,1304.75,1 68 | 2011/07/31 22:40:29.030,1305.0,5 69 | 2011/07/31 22:40:29.478,1305.0,3 70 | 2011/07/31 22:40:29.478,3305.0,1 71 | 2011/07/31 22:40:29.478,205.0,2 72 | 2011/07/31 22:40:29.478,1405.0,1 73 | 2011/07/31 22:40:29.478,1305.0,1 74 | 2011/07/31 22:40:29.478,1305.0,1 75 | 2011/07/31 22:40:29.478,1305.0,1 76 | 2011/07/31 22:40:29.478,1305.0,1 77 | 2011/07/31 22:40:29.478,1305.0,1 78 | 2011/07/31 22:40:29.478,1305.0,2 79 | 2011/07/31 22:40:29.478,1305.0,1 80 | 2011/07/31 22:40:29.478,1305.0,1 81 | 2011/07/31 22:40:29.478,1305.0,1 82 | 2011/07/31 22:40:29.478,1305.0,1 83 | 2011/07/31 22:40:29.478,1305.0,2 84 | 2011/07/31 22:40:29.541,1305.0,5 85 | 2011/07/31 22:40:29.940,1305.0,1 86 | 2011/07/31 22:40:30.694,1305.25,10 87 | 2011/07/31 22:40:31.492,1305.25,10 88 | 2011/07/31 22:40:31.576,1305.25,5 89 | 2011/07/31 22:40:31.576,1305.25,1 90 | 2011/07/31 22:40:31.576,1305.25,1 91 | 2011/07/31 22:40:31.576,1305.25,2 92 | 2011/07/31 22:40:31.576,1305.25,1 93 | 2011/07/31 22:40:33.213,1305.25,1 94 | 2011/07/31 22:40:41.016,1305.25,1 95 | 2011/07/31 22:40:41.849,1305.25,1 96 | 2011/07/31 22:40:42.779,1305.0,1 97 | 2011/07/31 22:40:44.921,1305.25,5 98 | 2011/07/31 22:40:44.921,1305.25,1 99 | 2011/07/31 22:40:44.921,1305.25,1 100 | 2011/07/31 22:40:44.921,1305.25,2 101 | 2011/07/31 22:40:44.921,1305.25,1 102 | -------------------------------------------------------------------------------- /mlfinlab/datasets/load_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module implementing various functions loading tick, dollar, stock data sets which can be used as 3 | sandbox data. 4 | """ 5 | 6 | import os 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from mlfinlab.labeling.labeling import get_events, add_vertical_barrier, get_bins 12 | from mlfinlab.util.volatility import get_daily_vol 13 | from mlfinlab.filters.filters import cusum_filter 14 | 15 | 16 | def load_stock_prices() -> pd.DataFrame: 17 | """ 18 | Loads stock prices data sets consisting of 19 | EEM, EWG, TIP, EWJ, EFA, IEF, EWQ, EWU, XLB, XLE, XLF, LQD, XLK, XLU, EPP, FXI, VGK, VPL, SPY, TLT, BND, CSJ, 20 | DIA starting from 2008 till 2016. 21 | 22 | :return: (pd.DataFrame) The stock_prices data frame. 23 | """ 24 | 25 | pass 26 | 27 | 28 | def load_tick_sample() -> pd.DataFrame: 29 | """ 30 | Loads E-Mini S&P 500 futures tick data sample. 31 | 32 | :return: (pd.DataFrame) Frame with tick data sample. 33 | """ 34 | 35 | pass 36 | 37 | 38 | def load_dollar_bar_sample() -> pd.DataFrame: 39 | """ 40 | Loads E-Mini S&P 500 futures dollar bars data sample. 41 | 42 | :return: (pd.DataFrame) Frame with dollar bar data sample. 43 | """ 44 | 45 | pass 46 | 47 | 48 | def generate_multi_asset_data_set(start_date: pd.Timestamp = pd.Timestamp(2008, 1, 1), 49 | end_date: pd.Timestamp = pd.Timestamp(2020, 1, 1)) -> tuple: 50 | # pylint: disable=invalid-name 51 | """ 52 | Generates multi-asset dataset from stock prices labelled by triple-barrier method. 53 | 54 | :param start_date: (pd.Timestamp) Dataset start date. 55 | :param end_date: (pd.Timestamp) Dataset end date. 
56 | :return: (tuple) Tuple of dictionaries (asset: data) for X, y, cont contract used to label the dataset. 57 | """ 58 | 59 | pass 60 | -------------------------------------------------------------------------------- /mlfinlab/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Sequentially Bootstrapped Bagging Classifier using sklearn's library as base class. 3 | """ 4 | 5 | from mlfinlab.ensemble.sb_bagging import (SequentiallyBootstrappedBaggingClassifier, SequentiallyBootstrappedBaggingRegressor) 6 | -------------------------------------------------------------------------------- /mlfinlab/feature_importance/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module which implements feature importance algorithms described in Chapter 8 and other interpretability tools 3 | from the Journal of Financial Data Science. 4 | And Stacked feature importance functions (Stacked MDA/SFI). 5 | """ 6 | 7 | from mlfinlab.feature_importance.importance import (mean_decrease_impurity, mean_decrease_accuracy, 8 | single_feature_importance, plot_feature_importance, 9 | stacked_mean_decrease_accuracy) 10 | from mlfinlab.feature_importance.orthogonal import (feature_pca_analysis, get_pca_rank_weighted_kendall_tau, 11 | get_orthogonal_features) 12 | from mlfinlab.feature_importance.fingerpint import (RegressionModelFingerprint, ClassificationModelFingerprint) 13 | -------------------------------------------------------------------------------- /mlfinlab/feature_importance/fingerpint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of an algorithm described in Yimou Li, David Turkington, Alireza Yazdani 3 | 'Beyond the Black Box: An Intuitive Approach to Investment Prediction with Machine Learning' 4 | (https://jfds.pm-research.com/content/early/2019/12/11/jfds.2019.1.023) 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from typing import Tuple 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from sklearn.linear_model import LinearRegression 13 | 14 | 15 | # pylint: disable=invalid-name 16 | # pylint: disable=too-many-locals 17 | 18 | class AbstractModelFingerprint(ABC): 19 | """ 20 | Model fingerprint constructor. 21 | 22 | This is an abstract base class for the RegressionModelFingerprint and ClassificationModelFingerprint classes. 23 | """ 24 | 25 | def __init__(self): 26 | """ 27 | Model fingerprint constructor. 28 | """ 29 | pass 30 | 31 | def fit(self, model: object, X: pd.DataFrame, num_values: int = 50, pairwise_combinations: list = None) -> None: 32 | """ 33 | Get linear, non-linear and pairwise effects estimation. 34 | 35 | :param model: (object) Trained model. 36 | :param X: (pd.DataFrame) Dataframe of features. 37 | :param num_values: (int) Number of values used to estimate feature effect. 38 | :param pairwise_combinations: (list) Tuples (feature_i, feature_j) to test pairwise effect. 39 | """ 40 | 41 | pass 42 | 43 | def get_effects(self) -> Tuple: 44 | """ 45 | Return computed linear, non-linear and pairwise effects. The model should be fit() before using this method. 46 | 47 | :return: (tuple) Linear, non-linear and pairwise effects, of type dictionary (raw values and normalised). 48 | """ 49 | 50 | pass 51 | 52 | def plot_effects(self) -> plt.figure: 53 | """ 54 | Plot each effect (normalized) on a bar plot (linear, non-linear). 
Also plots pairwise effects if calculated. 55 | 56 | :return: (plt.figure) Plot figure. 57 | """ 58 | 59 | pass 60 | 61 | def _get_feature_values(self, X: pd.DataFrame, num_values: int) -> None: 62 | """ 63 | Step 1 of the algorithm, which generates the possible feature values used in the analysis. 64 | 65 | :param X: (pd.DataFrame) Dataframe of features. 66 | :param num_values: (int) Number of values used to estimate feature effect. 67 | """ 68 | 69 | pass 70 | 71 | def _get_individual_partial_dependence(self, model: object, X: pd.DataFrame) -> None: 72 | """ 73 | Get individual partial dependence function values for each column. 74 | 75 | :param model: (object) Trained model. 76 | :param X: (pd.DataFrame) Dataframe of features. 77 | """ 78 | 79 | pass 80 | 81 | def _get_linear_effect(self, X: pd.DataFrame) -> dict: 82 | """ 83 | Get linear effect estimates as the mean absolute deviation of the linear predictions around their average value. 84 | 85 | :param X: (pd.DataFrame) Dataframe of features. 86 | :return: (dict) Linear effect estimates for each feature column. 87 | """ 88 | 89 | pass 90 | 91 | def _get_non_linear_effect(self, X: pd.DataFrame) -> dict: 92 | """ 93 | Get non-linear effect estimates as the mean absolute deviation of the total marginal (single variable) 94 | effect around its corresponding linear effect. 95 | 96 | :param X: (pd.DataFrame) Dataframe of features. 97 | :return: (dict) Non-linear effect estimates for each feature column. 98 | """ 99 | 100 | pass 101 | 102 | def _get_pairwise_effect(self, pairwise_combinations: list, model: object, X: pd.DataFrame, num_values) -> dict: 103 | """ 104 | Get pairwise effect estimates as the de-meaned joint partial prediction of the two variables minus the de-meaned 105 | partial predictions of each variable independently. 106 | 107 | :param pairwise_combinations: (list) Tuples (feature_i, feature_j) to test pairwise effect. 108 | :param model: (object) Trained model. 109 | :param X: (pd.DataFrame) Dataframe of features. 110 | :param num_values: (int) Number of values used to estimate feature effect. 111 | :return: (dict) Raw and normalised pairwise effects. 112 | """ 113 | 114 | pass 115 | 116 | @abstractmethod 117 | def _get_model_predictions(self, model: object, X_: pd.DataFrame): 118 | """ 119 | Get model predictions based on problem type (predict for regression, predict_proba for classification). 120 | 121 | :param model: (object) Trained model. 122 | :param X_: (np.array) Feature set. 123 | :return: (np.array) Predictions. 124 | """ 125 | 126 | pass 127 | 128 | @staticmethod 129 | def _normalize(effect: dict) -> dict: 130 | """ 131 | Normalize effect values (sum equals 1). 132 | 133 | :param effect: (dict) Effect values. 134 | :return: (dict) Normalized effect values. 135 | """ 136 | 137 | pass 138 | 139 | 140 | class RegressionModelFingerprint(AbstractModelFingerprint): 141 | """ 142 | Regression Fingerprint class used for regression-type models. 143 | """ 144 | 145 | def __init__(self): 146 | """ 147 | Regression model fingerprint constructor. 148 | """ 149 | 150 | pass 151 | 152 | def _get_model_predictions(self, model, X_): 153 | """ 154 | Abstract method _get_model_predictions implementation. 155 | 156 | :param model: (object) Trained model. 157 | :param X_: (np.array) Feature set. 158 | :return: (np.array) Predictions. 159 | """ 160 | 161 | pass 162 | 163 | 164 | class ClassificationModelFingerprint(AbstractModelFingerprint): 165 | """ 166 | Classification Fingerprint class used for classification-type models.
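Example of the intended usage (an illustrative sketch; ``X_train``, ``y_train`` and the feature names are hypothetical)::

    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier().fit(X_train, y_train)
    fingerprint = ClassificationModelFingerprint()
    fingerprint.fit(clf, X_train, num_values=50,
                    pairwise_combinations=[('feature_1', 'feature_2')])
    linear_effect, non_linear_effect, pairwise_effect = fingerprint.get_effects()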
167 | """ 168 | 169 | def __init__(self): 170 | """ 171 | Classification model fingerprint constructor. 172 | """ 173 | 174 | pass 175 | 176 | def _get_model_predictions(self, model, X_): 177 | """ 178 | Abstract method _get_model_predictions implementation. 179 | 180 | :param model: (object) Trained model. 181 | :param X_: (np.array) Feature set. 182 | :return: (np.array) Predictions. 183 | """ 184 | 185 | pass 186 | -------------------------------------------------------------------------------- /mlfinlab/feature_importance/orthogonal.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module which implements feature PCA compression and PCA analysis of feature importance. 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from scipy.stats import weightedtau, kendalltau, spearmanr, pearsonr 8 | 9 | 10 | def _get_eigen_vector(dot_matrix, variance_thresh, num_features=None): 11 | """ 12 | Advances in Financial Machine Learning, Snippet 8.5, page 119. 13 | 14 | Computation of Orthogonal Features 15 | 16 | Gets eigen values and eigen vector from matrix which explain % variance_thresh of total variance. 17 | 18 | :param dot_matrix: (np.array): Matrix for which eigen values/vectors should be computed. 19 | :param variance_thresh: (float): Percentage % of overall variance which compressed vectors should explain. 20 | :param num_features: (int) Manually set number of features, overrides variance_thresh. (None by default) 21 | :return: (pd.Series, pd.DataFrame): Eigenvalues, Eigenvectors. 22 | """ 23 | 24 | pass 25 | 26 | 27 | def _standardize_df(data_frame): 28 | """ 29 | Helper function which divides df by std and extracts mean. 30 | 31 | :param data_frame: (pd.DataFrame): Dataframe to standardize 32 | :return: (pd.DataFrame): Standardized dataframe 33 | """ 34 | 35 | pass 36 | 37 | 38 | def get_orthogonal_features(feature_df, variance_thresh=.95, num_features=None): 39 | """ 40 | Advances in Financial Machine Learning, Snippet 8.5, page 119. 41 | 42 | Computation of Orthogonal Features. 43 | 44 | Gets PCA orthogonal features. 45 | 46 | :param feature_df: (pd.DataFrame): Dataframe of features. 47 | :param variance_thresh: (float): Percentage % of overall variance which compressed vectors should explain. 48 | :param num_features: (int) Manually set number of features, overrides variance_thresh. (None by default) 49 | :return: (pd.DataFrame): Compressed PCA features which explain %variance_thresh of variance. 50 | """ 51 | 52 | pass 53 | 54 | 55 | def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank): 56 | """ 57 | Advances in Financial Machine Learning, Snippet 8.6, page 121. 58 | 59 | Computes Weighted Kendall's Tau Between Feature Importance and Inverse PCA Ranking. 60 | 61 | :param feature_imp: (np.array): Feature mean importance. 62 | :param pca_rank: (np.array): PCA based feature importance rank. 63 | :return: (float): Weighted Kendall Tau of feature importance and inverse PCA rank with p_value. 64 | """ 65 | 66 | pass 67 | 68 | 69 | def feature_pca_analysis(feature_df, feature_importance, variance_thresh=0.95): 70 | """ 71 | Performs correlation analysis between feature importance (MDI for example, supervised) and PCA eigenvalues 72 | (unsupervised). 73 | 74 | High correlation means that probably the pattern identified by the ML algorithm is not entirely overfit. 75 | 76 | :param feature_df: (pd.DataFrame): Features dataframe. 77 | :param feature_importance: (pd.DataFrame): Individual MDI feature importance. 
78 | :param variance_thresh: (float): Percentage % of overall variance which compressed vectors should explain in PCA compression. 79 | :return: (dict): Dictionary with kendall, spearman, pearson and weighted_kendall correlations and p_values. 80 | """ 81 | 82 | pass 83 | -------------------------------------------------------------------------------- /mlfinlab/features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 5: Fractional Differentiation. 3 | """ 4 | 5 | from mlfinlab.features.fracdiff import (get_weights, frac_diff, get_weights_ffd, frac_diff_ffd, plot_min_ffd) 6 | -------------------------------------------------------------------------------- /mlfinlab/filters/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logic regarding the various types of filters: 3 | 4 | * CUSUM Filter 5 | * Z-score filter 6 | """ 7 | 8 | from mlfinlab.filters.filters import cusum_filter 9 | from mlfinlab.filters.filters import z_score_filter 10 | -------------------------------------------------------------------------------- /mlfinlab/filters/filters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Filters are used to filter events based on some kind of trigger. For example a structural break filter can be 3 | used to filter events where a structural break occurs. This event is then used to measure the return from the event 4 | to some event horizon, say a day. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | # Snippet 2.4, page 39, The Symmetric CUSUM Filter. 12 | def cusum_filter(raw_time_series, threshold, time_stamps=True): 13 | """ 14 | Advances in Financial Machine Learning, Snippet 2.4, page 39. 15 | 16 | The Symmetric Dynamic/Fixed CUSUM Filter. 17 | 18 | The CUSUM filter is a quality-control method, designed to detect a shift in the mean value of a measured quantity 19 | away from a target value. The filter is set up to identify a sequence of upside or downside divergences from any 20 | reset level zero. We sample a bar t if and only if S_t >= threshold, at which point S_t is reset to 0. 21 | 22 | One practical aspect that makes CUSUM filters appealing is that multiple events are not triggered by raw_time_series 23 | hovering around a threshold level, which is a flaw suffered by popular market signals such as Bollinger Bands. 24 | It will require a full run of length threshold for raw_time_series to trigger an event. 25 | 26 | Once we have obtained this subset of event-driven bars, we will let the ML algorithm determine whether the occurrence 27 | of such events constitutes actionable intelligence. Below is an implementation of the Symmetric CUSUM filter. 28 | 29 | Note: As per the book this filter is applied to closing prices but we extended it to also work on other 30 | time series such as volatility. 31 | 32 | :param raw_time_series: (pd.Series) Close prices (or other time series, e.g. volatility). 33 | :param threshold: (float or pd.Series) When the abs(change) is larger than the threshold, the function captures 34 | it as an event, can be dynamic if threshold is pd.Series 35 | :param time_stamps: (bool) Default is to return a DateTimeIndex, change to false to have it return a list. 36 | :return: (datetime index vector) Vector of datetimes when the events occurred. This is used later to sample. 
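A minimal sketch of the symmetric CUSUM recursion this function implements (illustrative only; assumes a fixed float threshold and ignores the dynamic pd.Series case)::

    s_pos, s_neg, events = 0, 0, []
    for stamp, diff in raw_time_series.diff().dropna().items():
        s_pos, s_neg = max(0, s_pos + diff), min(0, s_neg + diff)
        if s_pos >= threshold:
            s_pos = 0  # reset the upside run once it triggers an event
            events.append(stamp)
        elif s_neg <= -threshold:
            s_neg = 0  # reset the downside run
            events.append(stamp)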
37 | """ 38 | 39 | pass 40 | 41 | 42 | def z_score_filter(raw_time_series, mean_window, std_window, z_score=3, time_stamps=True): 43 | """ 44 | Filter which implements z_score filter 45 | (https://stackoverflow.com/questions/22583391/peak-signal-detection-in-realtime-timeseries-data) 46 | 47 | :param raw_time_series: (pd.Series) Close prices (or other time series, e.g. volatility). 48 | :param mean_window: (int): Rolling mean window 49 | :param std_window: (int): Rolling std window 50 | :param z_score: (float): Number of standard deviations to trigger the event 51 | :param time_stamps: (bool) Default is to return a DateTimeIndex, change to false to have it return a list. 52 | :return: (datetime index vector) Vector of datetimes when the events occurred. This is used later to sample. 53 | """ 54 | 55 | pass 56 | -------------------------------------------------------------------------------- /mlfinlab/labeling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Labeling techniques used in financial machine learning. 3 | """ 4 | 5 | from mlfinlab.labeling.labeling import (add_vertical_barrier, apply_pt_sl_on_t1, barrier_touched, drop_labels, 6 | get_bins, get_events) 7 | from mlfinlab.labeling.trend_scanning import trend_scanning_labels 8 | from mlfinlab.labeling.tail_sets import TailSetLabels 9 | from mlfinlab.labeling.fixed_time_horizon import fixed_time_horizon 10 | from mlfinlab.labeling.matrix_flags import MatrixFlagLabels 11 | from mlfinlab.labeling.excess_over_median import excess_over_median 12 | from mlfinlab.labeling.raw_return import raw_return 13 | from mlfinlab.labeling.return_vs_benchmark import return_over_benchmark 14 | from mlfinlab.labeling.excess_over_mean import excess_over_mean 15 | from mlfinlab.labeling.bull_bear import (pagan_sossounov, lunde_timmermann) 16 | -------------------------------------------------------------------------------- /mlfinlab/labeling/bull_bear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of bull and bear markets. 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def pagan_sossounov(prices, window=8, censor=6, cycle=16, phase=4, threshold=0.2): 9 | """ 10 | Pagan and Sossounov's labeling method. Sourced from `Pagan, Adrian R., and Kirill A. Sossounov. "A simple framework 11 | for analysing bull and bear markets." Journal of applied econometrics 18.1 (2003): 23-46. 12 | `__ 13 | 14 | Returns a DataFrame with labels of 1 for Bull and -1 for Bear. 15 | 16 | :param prices: (pd.DataFrame) Close prices of all tickers in the market. 17 | :param window: (int) Rolling window length to determine local extrema. Paper suggests 8 months for monthly obs. 18 | :param censor: (int) Number of months to eliminate for start and end. Paper suggests 6 months for monthly obs. 19 | :param cycle: (int) Minimum length for a complete cycle. Paper suggests 16 months for monthly obs. 20 | :param phase: (int) Minimum length for a phase. Paper suggests 4 months for monthly obs. 21 | :param threshold: (double) Minimum threshold for phase change. Paper suggests 0.2. 22 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 23 | """ 24 | 25 | pass 26 | 27 | 28 | def _alternation(price): 29 | """ 30 | Helper function to check peak and trough alternation. 31 | 32 | :param price: (pd.DataFrame) Close prices of all tickers in the market. 33 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 
34 | """ 35 | 36 | pass 37 | 38 | 39 | def _apply_pagan_sossounov(price, window, censor, cycle, phase, threshold): 40 | """ 41 | Helper function for Pagan and Sossounov labeling method. 42 | 43 | :param price: (pd.DataFrame) Close prices of all tickers in the market. 44 | :param window: (int) Rolling window length to determine local extrema. Paper suggests 8 months for monthly obs. 45 | :param censor: (int) Number of months to eliminate for start and end. Paper suggests 6 months for monthly obs. 46 | :param cycle: (int) Minimum length for a complete cycle. Paper suggests 16 months for monthly obs. 47 | :param phase: (int) Minimum length for a phase. Paper suggests 4 months for monthly obs. 48 | :param threshold: (double) Minimum threshold for phase change. Paper suggests 20%. 49 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 50 | """ 51 | 52 | pass 53 | 54 | 55 | def lunde_timmermann(prices, bull_threshold=0.15, bear_threshold=0.15): 56 | """ 57 | Lunde and Timmermann's labeling method. Sourced from `Lunde, Asger, and Allan Timmermann. "Duration dependence 58 | in stock prices: An analysis of bull and bear markets." Journal of Business & Economic Statistics 22.3 (2004): 253-273. 59 | `__ 60 | 61 | Returns a DataFrame with labels of 1 for Bull and -1 for Bear. 62 | 63 | :param prices: (pd.DataFrame) Close prices of all tickers in the market. 64 | :param bull_threshold: (double) Threshold to identify bull market. Paper suggests 0.15. 65 | :param bear_threshold: (double) Threshold to identify bear market. Paper suggests 0.15. 66 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 67 | """ 68 | 69 | pass 70 | 71 | 72 | def _apply_lunde_timmermann(price, bull_threshold, bear_threshold): 73 | """ 74 | Helper function for Lunde and Timmermann labeling method. 75 | 76 | :param price: (pd.DataFrame) Close prices of all tickers in the market. 77 | :param bull_threshold: (double) Threshold to identify bull market. Paper suggests 0.15. 78 | :param bear_threshold: (double) Threshold to identify bear market. Paper suggests 0.15. 79 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 80 | """ 81 | 82 | pass 83 | -------------------------------------------------------------------------------- /mlfinlab/labeling/excess_over_mean.py: -------------------------------------------------------------------------------- 1 | """ 2 | Return in excess of mean method. 3 | 4 | Chapter 5, Machine Learning for Factor Investing, by Coqueret and Guida, (2020). 5 | """ 6 | import numpy as np 7 | 8 | 9 | def excess_over_mean(prices, binary=False, resample_by=None, lag=True): 10 | """ 11 | Return in excess of mean labeling method. Sourced from Chapter 5.5.1 of Machine Learning for Factor Investing, 12 | by Coqueret, G. and Guida, T. (2020). 13 | 14 | Returns a DataFrame containing returns of stocks over the mean of all stocks in the portfolio. Returns a DataFrame 15 | of signs of the returns if binary is True. In this case, an observation may be labeled as 0 if it itself is the 16 | mean. 17 | 18 | :param prices: (pd.DataFrame) Close prices of all tickers in the market that are used to establish the mean. NaN 19 | values are ok. Returns on each ticker are then compared to the mean for the given timestamp. 20 | :param binary: (bool) If False, the numerical value of excess returns over mean will be given. If True, then only 21 | the sign of the excess return over mean will be given (-1 or 1). 
A label of 0 will be given if 22 | the observation itself is equal to the mean. 23 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 24 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 25 | For full details see `here. 26 | `_ 27 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 28 | :return: (pd.DataFrame) Numerical returns in excess of the market mean return, or sign of return depending on 29 | whether binary is False or True respectively. 30 | """ 31 | 32 | pass 33 | -------------------------------------------------------------------------------- /mlfinlab/labeling/excess_over_median.py: -------------------------------------------------------------------------------- 1 | """ 2 | Return in excess of median method. 3 | 4 | Described in "The benefits of tree-based models for stock selection", Zhu et al. (2012). Data labeled this way can be 5 | used in regression and classification models to predict stock returns over market. 6 | """ 7 | import numpy as np 8 | 9 | 10 | def excess_over_median(prices, binary=False, resample_by=None, lag=True): 11 | """ 12 | Return in excess of median labeling method. Sourced from "The benefits of tree-based models for stock selection" 13 | Zhu et al. (2012). 14 | 15 | Returns a DataFrame containing returns of stocks over the median of all stocks in the portfolio, or returns a 16 | DataFrame containing signs of those returns. In the latter case, an observation may be labeled as 0 if it itself is 17 | the median. 18 | 19 | :param prices: (pd.DataFrame) Close prices of all stocks in the market that are used to establish the median. 20 | Returns on each stock are then compared to the median for the given timestamp. 21 | :param binary: (bool) If False, the numerical value of excess returns over median will be given. If True, then only 22 | the sign of the excess return over median will be given (-1 or 1). A label of 0 will be given if 23 | the observation itself is the median. According to Zhu et al., categorical labels can alleviate 24 | issues with extreme outliers present with numerical labels. 25 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 26 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 27 | For full details see `here. 28 | `_ 29 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 30 | :return: (pd.DataFrame) Numerical returns in excess of the market median return, or sign of return depending on 31 | whether binary is False or True respectively. 32 | """ 33 | 34 | pass 35 | -------------------------------------------------------------------------------- /mlfinlab/labeling/fixed_time_horizon.py: -------------------------------------------------------------------------------- 1 | """ 2 | Chapter 3.2 Fixed-Time Horizon Method, in Advances in Financial Machine Learning, by M. L. de Prado. 3 | 4 | The work "Classification-based Financial Markets Prediction using Deep Neural Networks" by Dixon et al. (2016) describes how 5 | labeling data this way can be used in training deep neural networks to predict price movements. 6 | """ 7 | 8 | import warnings 9 | import pandas as pd 10 | 11 | 12 | def fixed_time_horizon(prices, threshold=0, resample_by=None, lag=True, standardized=False, window=None): 13 | """ 14 | Fixed-Time Horizon Labeling Method.
15 | 16 | Originally described in the book Advances in Financial Machine Learning, Chapter 3.2, p.43-44. 17 | 18 | Returns 1 if return is greater than the threshold, -1 if less, and 0 if in between. If no threshold is 19 | provided, then it will simply take the sign of the return. 20 | 21 | :param prices: (pd.Series or pd.DataFrame) Time-indexed stock prices used to calculate returns. 22 | :param threshold: (float or pd.Series) When the absolute value of return exceeds the threshold, the observation is 23 | labeled with 1 or -1, depending on the sign of the return. If return is less, it's labeled as 0. 24 | Can be dynamic if threshold is inputted as a pd.Series, and threshold.index must match prices.index. 25 | If resampling is used, the index of threshold must match the index of prices after resampling. 26 | If threshold is negative, then the directionality of the labels will be reversed. If no threshold 27 | is provided, it is assumed to be 0 and the sign of the return is returned. 28 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 29 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 30 | For full details see `here. 31 | `_ 32 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 33 | :param standardized: (bool) Whether returns are scaled by mean and standard deviation. 34 | :param window: (int) If standardized is True, the rolling window period for calculating the mean and standard 35 | deviation of returns. 36 | :return: (pd.Series or pd.DataFrame) -1, 0, or 1 denoting whether the return for each observation is 37 | less/between/greater than the threshold at each corresponding time index. First or last row will be 38 | NaN, depending on lag. 39 | """ 40 | 41 | pass 42 | -------------------------------------------------------------------------------- /mlfinlab/labeling/matrix_flags.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=no-self-use 2 | # pylint: disable=unnecessary-comprehension 3 | """ 4 | Matrix Flag labeling method. 5 | """ 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | class MatrixFlagLabels: 12 | """ 13 | The Matrix Flag labeling method is featured in the paper: Cervelló-Royo, R., Guijarro, F. and Michniuk, K., 2015. 14 | Stock market trading rule based on pattern recognition and technical analysis: Forecasting the DJIA index with 15 | intraday data. 16 | 17 | The method of applying a matrix template was first introduced, and explained in greater detail, in the paper: 18 | Leigh, W., Modani, N., Purvis, R. and Roberts, T., 2002. Stock market trading rule discovery using technical 19 | charting heuristics. 20 | 21 | Cervelló-Royo et al. expand on Leigh et al.'s work by proposing a new bull flag pattern which ameliorates some 22 | weaknesses in Leigh's original template. Additionally, they apply this bull flag labeling method to intraday 23 | candlestick data, rather than just closing prices. 24 | 25 | To find the total weight for a given day, the current price as well as the preceding window days of prices are 26 | used. The data window is split into 10 buckets, each containing a chronological tenth of the data window. Each point 27 | in a bucket is put into a decile corresponding to a position in a column based on percentile relative to the entire 28 | data window. Bottom 10% on the lowest row, next 10% on the second lowest row, etc.
29 | The proportion of points in each decile is reported to finalize the column. The first tenth of the data is 30 | transformed to the leftmost column, the next tenth to the next column on the right and so on until finally a 10 by 31 | 10 matrix is achieved. This matrix is then multiplied element-wise with the 10 by 10 template, and the sum of all 32 | columns is the total weight for the day. If desired, the user can specify a threshold to determine positive and 33 | negative classes. The value of the threshold depends on how strict of a classifier the user desires, and the 34 | allowable values based on the template matrix. 35 | """ 36 | 37 | def __init__(self, prices, window, template_name=None): 38 | """ 39 | :param prices: (pd.Series) Price data for one stock. 40 | :param window: (int) Length of preceding data window used when generating the fit matrix for one day. 41 | :param template_name: (str) Name of an available template in the template library. Allowable names: 42 | ``leigh_bear``, ``leigh_bull``, ``cervelloroyo_bear``, ``cervellororo_bull``. 43 | """ 44 | 45 | pass 46 | 47 | def _init_template(self, name): 48 | """ 49 | :param name: (str) Name of an available template in the template library. Allowable names: ``leigh_bear``, 50 | ``leigh_bull``, ``cervelloroyo_bear``, ``cervellororo_bull``. 51 | """ 52 | 53 | pass 54 | 55 | def set_template(self, template): 56 | """ 57 | :param template: (pd.DataFrame) Template to override the default template. Must be a 10 by 10 pd.DataFrame. 58 | NaN values not allowed, as they will not automatically be treated as zeros. 59 | """ 60 | 61 | pass 62 | 63 | def _transform_data(self, row_num, window=30): 64 | """ 65 | :param row_num: (int) Row number to use for the "current" data point to apply the window to. The data window 66 | contains the row corresponding to row_num, as well as the (self.window-1) preceding rows. 67 | :param window: (int) The number of rows preceding the current one to use for window. Override with 68 | self.window in most cases. 69 | :return: (pd.DataFrame) Transformed 10 by 10 matrix, in which each column corresponds to a chronological tenth 70 | of the data window, and each row corresponds to a price decile relative to the entire data window. 71 | The template matrix is then applied to this output matrix. 72 | """ 73 | 74 | pass 75 | 76 | def _apply_template_to_matrix(self, matrix, template): 77 | """ 78 | :param matrix: (pd.DataFrame) Processed 10 by 10 matrix, where each column represents a chronological tenth 79 | of the data, and each row represents a decile relative to the entire data window. 80 | :param template: (pd.DataFrame) Template to apply the processed matrix to. 81 | :return: (float) The total score for the day. Consists of the sum of the column sums of the result from 82 | multiplying the matrix element-wise with the template. 83 | """ 84 | 85 | pass 86 | 87 | def apply_labeling_matrix(self, threshold=None): 88 | """ 89 | :param threshold: (float) If None, labels will be returned numerically as the score for the day. If not None, 90 | then labels are returned categorically, with the positive category for labels that are equal to 91 | or exceed the threshold. 92 | :return: (pd.Series) Total scores for the data series on each eligible day (meaning for indices self.window and 93 | onwards).
94 | """ 95 | 96 | pass 97 | -------------------------------------------------------------------------------- /mlfinlab/labeling/raw_return.py: -------------------------------------------------------------------------------- 1 | """ 2 | Labeling Raw Returns. 3 | 4 | Most basic form of labeling based on raw return of each observation relative to its previous value. 5 | """ 6 | 7 | import numpy as np 8 | 9 | 10 | def raw_return(prices, binary=False, logarithmic=False, resample_by=None, lag=True): 11 | """ 12 | Raw returns labeling method. 13 | 14 | This is the most basic and ubiquitous labeling method used as a precursor to almost any kind of financial data 15 | analysis or machine learning. User can specify simple or logarithmic returns, numerical or binary labels, a 16 | resample period, and whether returns are lagged to be forward looking. 17 | 18 | :param prices: (pd.Series or pd.DataFrame) Time-indexed price data on stocks with which to calculate return. 19 | :param binary: (bool) If False, will return numerical returns. If True, will return the sign of the raw return. 20 | :param logarithmic: (bool) If False, will calculate simple returns. If True, will calculate logarithmic returns. 21 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 22 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 23 | For full details see `here. 24 | `_ 25 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 26 | :return: (pd.Series or pd.DataFrame) Raw returns on market data. User can specify whether returns will be based on 27 | simple or logarithmic return, and whether the output will be numerical or categorical. 28 | """ 29 | 30 | pass 31 | -------------------------------------------------------------------------------- /mlfinlab/labeling/return_vs_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Return in excess of a given benchmark. 3 | 4 | Chapter 5, Machine Learning for Factor Investing, by Coqueret and Guida, (2020). 5 | 6 | Work "Evaluating multiple classifiers for stock price direction prediction" by Ballings et al. (2015) uses this method 7 | to label yearly returns over a predetermined value to compare the performance of several machine learning algorithms. 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | 12 | 13 | def return_over_benchmark(prices, benchmark=0, binary=False, resample_by=None, lag=True): 14 | """ 15 | Return over benchmark labeling method. Sourced from Chapter 5.5.1 of Machine Learning for Factor Investing, 16 | by Coqueret, G. and Guida, T. (2020). 17 | 18 | Returns a Series or DataFrame of numerical or categorical returns over a given benchmark. The time index of the 19 | benchmark must match those of the price observations. 20 | 21 | :param prices: (pd.Series or pd.DataFrame) Time indexed prices to compare returns against a benchmark. 22 | :param benchmark: (pd.Series or float) Benchmark of returns to compare the returns from prices against for labeling. 23 | Can be a constant value, or a Series matching the index of prices. If no benchmark is given, then it 24 | is assumed to have a constant value of 0. 25 | :param binary: (bool) If False, labels are given by their numerical value of return over benchmark. If True, 26 | labels are given according to the sign of their excess return. 
27 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 28 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 29 | For full details see `here. 30 | `_ 31 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 32 | :return: (pd.Series or pd.DataFrame) Excess returns over benchmark. If binary, the labels are -1 if the 33 | return is below the benchmark, 1 if above, and 0 if it exactly matches the benchmark. 34 | """ 35 | 36 | pass 37 | -------------------------------------------------------------------------------- /mlfinlab/labeling/tail_sets.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-module-docstring 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | class TailSetLabels: 7 | """ 8 | Tail set labels are a classification labeling technique introduced in the following paper: Nonlinear support vector 9 | machines can systematically identify stocks with high and low future returns. Algorithmic Finance, 2(1), pp.45-58. 10 | 11 | A tail set is defined to be a group of stocks whose volatility-adjusted return is in the highest or lowest 12 | quantile, for example the highest or lowest 5%. 13 | 14 | A classification model is then fit using these labels to determine which stocks to buy and sell in a long / short 15 | portfolio. 16 | """ 17 | 18 | def __init__(self, prices, n_bins, vol_adj=None, window=None): 19 | """ 20 | :param prices: (pd.DataFrame) Asset prices. 21 | :param n_bins: (int) Number of bins to determine the quantiles for defining the tail sets. The top and 22 | bottom quantiles are considered to be the positive and negative tail sets, respectively. 23 | :param vol_adj: (str) Whether to take volatility adjusted returns. Allowable inputs are ``None``, 24 | ``mean_abs_dev``, and ``stdev``. 25 | :param window: (int) Window period used in the calculation of the volatility adjusted returns, if vol_adj is not 26 | None. Has no impact if vol_adj is None. 27 | """ 28 | 29 | pass 30 | 31 | def get_tail_sets(self): 32 | """ 33 | Computes the tail sets (positive and negative) and then returns a tuple with 3 elements, positive set, negative 34 | set, full matrix set. 35 | 36 | The positive and negative sets are each a series of lists with the names of the securities that fall within each 37 | set at a specific timestamp. 38 | 39 | For the full matrix a value of 1 indicates the volatility adjusted returns were in the top quantile, a value of 40 | -1 for the bottom quantile. 41 | :return: (tuple) positive set, negative set, full matrix set. 42 | """ 43 | 44 | pass 45 | 46 | def _vol_adjusted_rets(self): 47 | """ 48 | Computes the volatility adjusted returns. This is simply the log returns divided by a volatility estimate. We 49 | have provided 2 techniques for volatility estimation: an exponential moving average and the traditional standard 50 | deviation. 51 | """ 52 | 53 | pass 54 | 55 | def _extract_tail_sets(self, row): 56 | """ 57 | Method used in a .apply() setting to transform each row in a DataFrame to the positive and negative tail sets. 58 | 59 | This method splits the data into quantiles determined by the user, with n_bins. 60 | 61 | :param row: (pd.Series) Vol adjusted returns for a given date. 62 | :return: (pd.Series) Tail set with positive and negative labels. 
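A minimal sketch of the quantile split described above (illustrative only; pd.qcut-style binning stands in for the actual implementation)::

    bins = pd.qcut(row, q=self.n_bins, labels=False)  # bin indices 0 .. n_bins - 1
    labels = pd.Series(0, index=row.index)
    labels[bins == 0] = -1                 # bottom quantile -> negative tail set
    labels[bins == self.n_bins - 1] = 1    # top quantile -> positive tail set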
63 | """ 64 | 65 | pass 66 | 67 | @staticmethod 68 | def _positive_tail_set(row): 69 | """ 70 | Takes as input a row from the vol_adj_ret DataFrame and then returns a list of names of the securities in the 71 | positive tail set, for this specific row date. 72 | 73 | This method is used in an apply() setting. 74 | 75 | :param row: (pd.Series) Labeled row of several stocks where each is already labeled with +1 (positive tail set), 76 | -1 (negative tail set), or 0. 77 | :return: (list) Securities in the positive tail set. 78 | """ 79 | 80 | pass 81 | 82 | @staticmethod 83 | def _negative_tail_set(row): 84 | """ 85 | Takes as input a row from the vol_adj_ret DataFrame and then returns a list of names of the securities in the 86 | negative tail set, for this specific row date. 87 | 88 | This method is used in an apply() setting. 89 | 90 | :param row: (pd.Series) Labeled row of several stocks where each is already labeled with +1 (positive tail set), 91 | -1 (negative tail set), or 0. 92 | :return: (list) Securities in the negative tail set. 93 | """ 94 | 95 | pass 96 | -------------------------------------------------------------------------------- /mlfinlab/labeling/trend_scanning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Trend-Scanning labels described in `Advances in Financial Machine Learning: Lecture 3/10 3 | `_ 4 | """ 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from mlfinlab.structural_breaks.sadf import get_betas 10 | 11 | 12 | def trend_scanning_labels(price_series: pd.Series, t_events: list = None, observation_window: int = 20, 13 | look_forward: bool = True, min_sample_length: int = 5, step: int = 1) -> pd.DataFrame: 14 | """ 15 | `Trend scanning `_ is both a classification and 16 | regression labeling technique. 17 | 18 | That can be used in the following ways: 19 | 20 | 1. Classification: By taking the sign of t-value for a given observation we can set {-1, 1} labels to define the 21 | trends as either downward or upward. 22 | 2. Classification: By adding a minimum t-value threshold you can generate {-1, 0, 1} labels for downward, no-trend, 23 | upward. 24 | 3. The t-values can be used as sample weights in classification problems. 25 | 4. Regression: The t-values can be used in a regression setting to determine the magnitude of the trend. 26 | 27 | The output of this algorithm is a DataFrame with t1 (time stamp for the farthest observation), t-value, returns for 28 | the trend, and bin. 29 | 30 | This function allows using both forward-looking and backward-looking window (use the look_forward parameter). 31 | 32 | :param price_series: (pd.Series) Close prices used to label the data set 33 | :param t_events: (list) Filtered events, array of pd.Timestamps 34 | :param observation_window: (int) Maximum look forward window used to get the trend value 35 | :param look_forward: (bool) True if using a forward-looking window, False if using a backward-looking one 36 | :param min_sample_length: (int) Minimum sample length used to fit regression 37 | :param step: (int) Optimal t-value index is searched every 'step' indices 38 | :return: (pd.DataFrame) Consists of t1, t-value, ret, bin (label information). 
t1 - label end time, t-value, 39 | ret - price change %, bin - label value based on price change sign 40 | """ 41 | # pylint: disable=invalid-name 42 | 43 | pass 44 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 19: Market Microstructural features 3 | """ 4 | 5 | from mlfinlab.microstructural_features.encoding import quantile_mapping, encode_array, encode_tick_rule_array, \ 6 | sigma_mapping 7 | from mlfinlab.microstructural_features.entropy import get_lempel_ziv_entropy, get_shannon_entropy, get_plug_in_entropy, \ 8 | get_konto_entropy 9 | from mlfinlab.microstructural_features.feature_generator import MicrostructuralFeaturesGenerator 10 | from mlfinlab.microstructural_features.first_generation import get_corwin_schultz_estimator, get_roll_measure, \ 11 | get_roll_impact, get_bekker_parkinson_vol 12 | from mlfinlab.microstructural_features.misc import get_avg_tick_size, vwap 13 | from mlfinlab.microstructural_features.second_generation import get_bar_based_kyle_lambda, get_bar_based_amihud_lambda, \ 14 | get_bar_based_hasbrouck_lambda, get_trades_based_kyle_lambda, get_trades_based_amihud_lambda, \ 15 | get_trades_based_hasbrouck_lambda 16 | from mlfinlab.microstructural_features.third_generation import get_vpin 17 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various functions for message encoding (quantile) 3 | """ 4 | import numpy as np 5 | 6 | 7 | def encode_tick_rule_array(tick_rule_array: list) -> str: 8 | """ 9 | Encode array of tick signs (-1, 1, 0) 10 | 11 | :param tick_rule_array: (list) Tick rules 12 | :return: (str) Encoded message 13 | """ 14 | 15 | pass 16 | 17 | 18 | def _get_ascii_table() -> list: 19 | """ 20 | Get all ASCII symbols 21 | 22 | :return: (list) ASCII symbols 23 | """ 24 | 25 | pass 26 | 27 | 28 | def quantile_mapping(array: list, num_letters: int = 26) -> dict: 29 | """ 30 | Generate dictionary of quantile-letters based on values from array and dictionary length (num_letters). 31 | 32 | :param array: (list) Values to split on quantiles 33 | :param num_letters: (int) Number of letters (quantiles) to encode 34 | :return: (dict) Dict of quantile-symbol 35 | """ 36 | 37 | pass 38 | 39 | 40 | def sigma_mapping(array: list, step: float = 0.01) -> dict: 41 | """ 42 | Generate dictionary of sigma encoded letters based on values from array and discretization step. 43 | 44 | :param array: (list) Values to split on quantiles 45 | :param step: (float) Discretization step (sigma) 46 | :return: (dict) Dict of value-symbol 47 | """ 48 | 49 | pass 50 | 51 | 52 | def _find_nearest(array: list, value: float) -> float: 53 | """ 54 | Find the nearest element from array to value. 55 | 56 | :param array: (list) Values 57 | :param value: (float) Value for which the nearest element needs to be found 58 | :return: (float) The element in array nearest to the value 59 | """ 60 | 61 | pass 62 | 63 | 64 | def _get_letter_from_encoding(value: float, encoding_dict: dict) -> str: 65 | """ 66 | Get letter for float/int value from encoding dict.
67 | 68 | :param value: (float/int) Value to use 69 | :param encoding_dict: (dict) Used dictionary 70 | :return: (str) Letter from encoding dict 71 | """ 72 | 73 | pass 74 | 75 | 76 | def encode_array(array: list, encoding_dict: dict) -> str: 77 | """ 78 | Encode array with strings using encoding dict. In case of multiple occurrences of the minimum values, 79 | the indices corresponding to the first occurrence are returned 80 | 81 | :param array: (list) Values to encode 82 | :param encoding_dict: (dict) Dict of quantile-symbol 83 | :return: (str) Encoded message 84 | """ 85 | 86 | pass 87 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/entropy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entropy calculation module (Shannon, Lempel-Ziv, Plug-In, Konto) 3 | """ 4 | 5 | import math 6 | from typing import Union 7 | 8 | import numpy as np 9 | from numba import njit 10 | 11 | 12 | def get_shannon_entropy(message: str) -> float: 13 | """ 14 | Advances in Financial Machine Learning, page 263-264. 15 | 16 | Get Shannon entropy from message 17 | 18 | :param message: (str) Encoded message 19 | :return: (float) Shannon entropy 20 | """ 21 | 22 | pass 23 | 24 | 25 | def get_lempel_ziv_entropy(message: str) -> float: 26 | """ 27 | Advances in Financial Machine Learning, Snippet 18.2, page 266. 28 | 29 | Get Lempel-Ziv entropy estimate 30 | 31 | :param message: (str) Encoded message 32 | :return: (float) Lempel-Ziv entropy 33 | """ 34 | 35 | pass 36 | 37 | 38 | def _prob_mass_function(message: str, word_length: int) -> dict: 39 | """ 40 | Advances in Financial Machine Learning, Snippet 18.1, page 266. 41 | 42 | Compute probability mass function for a one-dim discrete rv 43 | 44 | :param message: (str or array) Encoded message 45 | :param word_length: (int) Approximate word length 46 | :return: (dict) Dict of pmf for each word from message 47 | """ 48 | 49 | pass 50 | 51 | 52 | def get_plug_in_entropy(message: str, word_length: int = None) -> float: 53 | """ 54 | Advances in Financial Machine Learning, Snippet 18.1, page 265. 55 | 56 | Get Plug-in entropy estimator 57 | 58 | :param message: (str or array) Encoded message 59 | :param word_length: (int) Approximate word length 60 | :return: (float) Plug-in entropy 61 | """ 62 | 63 | pass 64 | 65 | 66 | @njit() 67 | def _match_length(message: str, start_index: int, window: int) -> Union[int, str]: # pragma: no cover 68 | """ 69 | Advances in Financial Machine Learning, Snippet 18.3, page 267. 70 | 71 | Function That Computes the Length of the Longest Match 72 | 73 | :param message: (str or array) Encoded message 74 | :param start_index: (int) Start index for search 75 | :param window: (int) Window length 76 | :return: (int, str) Match length and matched string 77 | """ 78 | 79 | pass 80 | 81 | 82 | def get_konto_entropy(message: str, window: int = 0) -> float: 83 | """ 84 | Advances in Financial Machine Learning, Snippet 18.4, page 268.
85 | 86 | Implementations of Algorithms Discussed in Gao et al.[2008] 87 | 88 | Get Kontoyiannis entropy 89 | 90 | :param message: (str or array) Encoded message 91 | :param window: (int) Expanding window length, can be negative 92 | :return: (float) Kontoyiannis entropy 93 | """ 94 | 95 | pass 96 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/feature_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inter-bar feature generator which uses trades data and bars index to calculate inter-bar features 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from mlfinlab.microstructural_features.entropy import get_shannon_entropy, get_plug_in_entropy, get_lempel_ziv_entropy, \ 8 | get_konto_entropy 9 | from mlfinlab.microstructural_features.encoding import encode_array 10 | from mlfinlab.microstructural_features.second_generation import get_trades_based_kyle_lambda, \ 11 | get_trades_based_amihud_lambda, get_trades_based_hasbrouck_lambda 12 | from mlfinlab.microstructural_features.misc import get_avg_tick_size, vwap 13 | from mlfinlab.microstructural_features.encoding import encode_tick_rule_array 14 | from mlfinlab.util.misc import crop_data_frame_in_batches 15 | 16 | 17 | # pylint: disable=too-many-instance-attributes 18 | 19 | class MicrostructuralFeaturesGenerator: 20 | """ 21 | Class which is used to generate inter-bar features when bars are already compressed. 22 | 23 | :param trades_input: (str or pd.DataFrame) Path to the csv file or Pandas DataFrame containing raw tick data 24 | in the format[date_time, price, volume] 25 | :param tick_num_series: (pd.Series) Series of tick number where bar was formed. 26 | :param batch_size: (int) Number of rows to read in from the csv, per batch. 27 | :param volume_encoding: (dict) Dictionary of encoding scheme for trades size used to calculate entropy on encoded messages 28 | :param pct_encoding: (dict) Dictionary of encoding scheme for log returns used to calculate entropy on encoded messages 29 | 30 | """ 31 | 32 | def __init__(self, trades_input: (str, pd.DataFrame), tick_num_series: pd.Series, batch_size: int = 2e7, 33 | volume_encoding: dict = None, pct_encoding: dict = None): 34 | """ 35 | Constructor 36 | 37 | :param trades_input: (str or pd.DataFrame) Path to the csv file or Pandas DataFrame containing raw tick data 38 | in the format[date_time, price, volume] 39 | :param tick_num_series: (pd.Series) Series of tick number where bar was formed. 40 | :param batch_size: (int) Number of rows to read in from the csv, per batch. 41 | :param volume_encoding: (dict) Dictionary of encoding scheme for trades size used to calculate entropy on encoded messages 42 | :param pct_encoding: (dict) Dictionary of encoding scheme for log returns used to calculate entropy on encoded messages 43 | """ 44 | 45 | 46 | pass 47 | 48 | def get_features(self, verbose=True, to_csv=False, output_path=None): 49 | """ 50 | Reads a csv file of ticks or pd.DataFrame in batches and then constructs corresponding microstructural intra-bar features: 51 | average tick size, tick rule sum, VWAP, Kyle lambda, Amihud lambda, Hasbrouck lambda, tick/volume/pct Shannon, Lempel-Ziv, 52 | Plug-in entropies if corresponding mapping dictionaries are provided (self.volume_encoding, self.pct_encoding). 53 | The csv file must have only 3 columns: date_time, price, & volume. 
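A hedged usage sketch (the csv path, the bar index series and the number of letters are placeholders; quantile_mapping is the encoding helper from this module)::

    from mlfinlab.microstructural_features import quantile_mapping

    volume_encoding = quantile_mapping(trades['volume'].values, num_letters=10)
    generator = MicrostructuralFeaturesGenerator('trades.csv', bar_tick_numbers,
                                                 volume_encoding=volume_encoding)
    features = generator.get_features(verbose=False)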
54 | 55 | :param verbose: (bool) Flag whether to print message on each processed batch or not 56 | :param to_csv: (bool) Flag for writing the results of bars generation to local csv file, or to in-memory DataFrame 57 | :param output_path: (str) Path to results file, if to_csv = True 58 | :return: (DataFrame or None) Microstructural features for bar index 59 | """ 60 | 61 | pass 62 | 63 | def _reset_cache(self): 64 | """ 65 | Reset price_diff, trade_size, tick_rule, log_ret arrays to empty when bar is formed and features are 66 | calculated 67 | 68 | :return: None 69 | """ 70 | 71 | pass 72 | 73 | def _extract_bars(self, data): 74 | """ 75 | For loop which calculates features for formed bars using trades data 76 | 77 | :param data: (tuple) Contains 3 columns - date_time, price, and volume. 78 | """ 79 | 80 | pass 81 | 82 | def _get_bar_features(self, date_time: pd.Timestamp, list_bars: list) -> list: 83 | """ 84 | Calculate inter-bar features: lambdas, entropies, avg_tick_size, vwap 85 | 86 | :param date_time: (pd.Timestamp) When bar was formed 87 | :param list_bars: (list) Previously formed bars 88 | :return: (list) Inter-bar features 89 | """ 90 | 91 | pass 92 | 93 | def _apply_tick_rule(self, price: float) -> int: 94 | """ 95 | Advances in Financial Machine Learning, page 29. 96 | 97 | Applies the tick rule 98 | 99 | :param price: (float) Price at time t 100 | :return: (int) The signed tick 101 | """ 102 | 103 | pass 104 | 105 | def _get_price_diff(self, price: float) -> float: 106 | """ 107 | Get price difference between ticks 108 | 109 | :param price: (float) Price at time t 110 | :return: (float) Price difference 111 | """ 112 | 113 | pass 114 | 115 | def _get_log_ret(self, price: float) -> float: 116 | """ 117 | Get log return between ticks 118 | 119 | :param price: (float) Price at time t 120 | :return: (float) Log return 121 | """ 122 | 123 | pass 124 | 125 | @staticmethod 126 | def _assert_csv(test_batch): 127 | """ 128 | Tests that the csv file read has the format: date_time, price, and volume. 129 | If not, then the user needs to create such a file. This format is in place to remove any unwanted overhead. 130 | 131 | :param test_batch: (pd.DataFrame) The first row of the dataset. 132 | :return: (None) 133 | """ 134 | 135 | pass 136 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/first_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | First generation features (Roll Measure/Impact, Corwin-Schultz spread estimator) 3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | def get_roll_measure(close_prices: pd.Series, window: int = 20) -> pd.Series: 10 | """ 11 | Advances in Financial Machine Learning, page 282. 12 | 13 | Get Roll Measure 14 | 15 | Roll Measure gives the estimate of effective bid-ask spread 16 | without using quote-data. 17 | 18 | :param close_prices: (pd.Series) Close prices 19 | :param window: (int) Estimation window 20 | :return: (pd.Series) Roll measure 21 | """ 22 | 23 | pass 24 | 25 | 26 | def get_roll_impact(close_prices: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series: 27 | """ 28 | Get Roll Impact. 29 | 30 | Derived from the Roll Measure, it takes into account dollar volume traded.
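A minimal sketch under that reading (illustrative only, not necessarily the verbatim implementation)::

    roll_impact = get_roll_measure(close_prices, window) / dollar_volume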
31 | 32 | :param close_prices: (pd.Series) Close prices 33 | :param dollar_volume: (pd.Series) Dollar volume series 34 | :param window: (int) Estimation window 35 | :return: (pd.Series) Roll impact 36 | """ 37 | 38 | pass 39 | 40 | 41 | # Corwin-Schultz algorithm 42 | def _get_beta(high: pd.Series, low: pd.Series, window: int) -> pd.Series: 43 | """ 44 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 45 | 46 | Get beta estimate from Corwin-Schultz algorithm 47 | 48 | :param high: (pd.Series) High prices 49 | :param low: (pd.Series) Low prices 50 | :param window: (int) Estimation window 51 | :return: (pd.Series) Beta estimates 52 | """ 53 | 54 | pass 55 | 56 | 57 | def _get_gamma(high: pd.Series, low: pd.Series) -> pd.Series: 58 | """ 59 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 60 | 61 | Get gamma estimate from Corwin-Schultz algorithm. 62 | 63 | :param high: (pd.Series) High prices 64 | :param low: (pd.Series) Low prices 65 | :return: (pd.Series) Gamma estimates 66 | """ 67 | 68 | pass 69 | 70 | 71 | def _get_alpha(beta: pd.Series, gamma: pd.Series) -> pd.Series: 72 | """ 73 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 74 | 75 | Get alpha from Corwin-Schultz algorithm. 76 | 77 | :param beta: (pd.Series) Beta estimates 78 | :param gamma: (pd.Series) Gamma estimates 79 | :return: (pd.Series) Alphas 80 | """ 81 | 82 | pass 83 | 84 | 85 | def get_corwin_schultz_estimator(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series: 86 | """ 87 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 88 | 89 | Get Corwin-Schultz spread estimator using high-low prices 90 | 91 | :param high: (pd.Series) High prices 92 | :param low: (pd.Series) Low prices 93 | :param window: (int) Estimation window 94 | :return: (pd.Series) Corwin-Schultz spread estimators 95 | """ 96 | # Note: S<0 iff alpha<0 97 | 98 | pass 99 | 100 | 101 | def get_bekker_parkinson_vol(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series: 102 | """ 103 | Advances in Financial Machine Learning, Snippet 19.2, page 286. 104 | 105 | Get Bekker-Parkinson volatility from gamma and beta in Corwin-Schultz algorithm. 106 | 107 | :param high: (pd.Series) High prices 108 | :param low: (pd.Series) Low prices 109 | :param window: (int) Estimation window 110 | :return: (pd.Series) Bekker-Parkinson volatility estimates 111 | """ 112 | # pylint: disable=invalid-name 113 | 114 | pass 115 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various miscellaneous microstructural features (VWAP, average tick size) 3 | """ 4 | 5 | import numpy as np 6 | 7 | 8 | def vwap(dollar_volume: list, volume: list) -> float: 9 | """ 10 | Get Volume Weighted Average Price (VWAP). 11 | 12 | :param dollar_volume: (list) Dollar volumes 13 | :param volume: (list) Trades sizes 14 | :return: (float) VWAP value 15 | """ 16 | 17 | pass 18 | 19 | 20 | def get_avg_tick_size(tick_size_arr: list) -> float: 21 | """ 22 | Get average tick size in a bar.
23 | 24 | :param tick_size_arr: (list) Trade sizes 25 | :return: (float) Average trade size 26 | """ 27 | 28 | pass 29 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/second_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Second generation models features: Kyle lambda, Amihud Lambda, Hasbrouck lambda (bar and trade based) 3 | """ 4 | 5 | from typing import List 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from mlfinlab.structural_breaks.sadf import get_betas 10 | 11 | # pylint: disable=invalid-name 12 | def get_bar_based_kyle_lambda(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series: 13 | """ 14 | Advances in Financial Machine Learning, p. 286-288. 15 | 16 | Get Kyle lambda from bars data 17 | 18 | :param close: (pd.Series) Close prices 19 | :param volume: (pd.Series) Bar volume 20 | :param window: (int) Rolling window used for estimation 21 | :return: (pd.Series) Kyle lambdas 22 | """ 23 | 24 | pass 25 | 26 | 27 | def get_bar_based_amihud_lambda(close: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series: 28 | """ 29 | Advances in Financial Machine Learning, p.288-289. 30 | 31 | Get Amihud lambda from bars data 32 | 33 | :param close: (pd.Series) Close prices 34 | :param dollar_volume: (pd.Series) Dollar volumes 35 | :param window: (int) rolling window used for estimation 36 | :return: (pd.Series) of Amihud lambda 37 | """ 38 | 39 | pass 40 | 41 | def get_bar_based_hasbrouck_lambda(close: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series: 42 | """ 43 | Advances in Financial Machine Learning, p.289-290. 44 | 45 | Get Hasbrouck lambda from bars data 46 | 47 | :param close: (pd.Series) Close prices 48 | :param dollar_volume: (pd.Series) Dollar volumes 49 | :param window: (int) Rolling window used for estimation 50 | :return: (pd.Series) Hasbrouck lambda 51 | """ 52 | 53 | pass 54 | 55 | 56 | def get_trades_based_kyle_lambda(price_diff: list, volume: list, aggressor_flags: list) -> List[float]: 57 | """ 58 | Advances in Financial Machine Learning, p.286-288. 59 | 60 | Get Kyle lambda from trades data 61 | 62 | :param price_diff: (list) Price diffs 63 | :param volume: (list) Trades sizes 64 | :param aggressor_flags: (list) Trade directions [-1, 1] (tick rule or aggressor side can be used to define) 65 | :return: (list) Kyle lambda for a bar and t-value 66 | """ 67 | 68 | pass 69 | 70 | 71 | def get_trades_based_amihud_lambda(log_ret: list, dollar_volume: list) -> List[float]: 72 | """ 73 | Advances in Financial Machine Learning, p.288-289. 74 | 75 | Get Amihud lambda from trades data 76 | 77 | :param log_ret: (list) Log returns 78 | :param dollar_volume: (list) Dollar volumes (price * size) 79 | :return: (float) Amihud lambda for a bar 80 | """ 81 | 82 | pass 83 | 84 | 85 | def get_trades_based_hasbrouck_lambda(log_ret: list, dollar_volume: list, aggressor_flags: list) -> List[float]: 86 | """ 87 | Advances in Financial Machine Learning, p.289-290. 
88 | 89 | Get Hasbrouck lambda from trades data 90 | 91 | :param log_ret: (list) Log returns 92 | :param dollar_volume: (list) Dollar volumes (price * size) 93 | :param aggressor_flags: (list) Trade directions [-1, 1] (tick rule or aggressor side can be used to define) 94 | :return: (list) Hasbrouck lambda for a bar and t-value 95 | """ 96 | 97 | pass 98 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/third_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Third generation models implementation (VPIN) 3 | """ 4 | import pandas as pd 5 | 6 | 7 | def get_vpin(volume: pd.Series, buy_volume: pd.Series, window: int = 1) -> pd.Series: 8 | """ 9 | Advances in Financial Machine Learning, p. 292-293. 10 | 11 | Get Volume-Synchronized Probability of Informed Trading (VPIN) from bars 12 | 13 | :param volume: (pd.Series) Bar volume 14 | :param buy_volume: (pd.Series) Bar volume classified as buy (either tick rule, BVC or aggressor side methods applied) 15 | :param window: (int) Estimation window 16 | :return: (pd.Series) VPIN series 17 | """ 18 | 19 | pass 20 | -------------------------------------------------------------------------------- /mlfinlab/multi_product/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality relating to the ETF trick and stitching futures contracts together. 3 | """ 4 | 5 | from mlfinlab.multi_product.etf_trick import (ETFTrick, get_futures_roll_series) 6 | -------------------------------------------------------------------------------- /mlfinlab/networks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools to visualise and filter networks of complex systems. 3 | """ 4 | 5 | from mlfinlab.networks.dash_graph import DashGraph, PMFGDash 6 | from mlfinlab.networks.dual_dash_graph import DualDashGraph 7 | from mlfinlab.networks.graph import Graph 8 | from mlfinlab.networks.mst import MST 9 | from mlfinlab.networks.almst import ALMST 10 | from mlfinlab.networks.pmfg import PMFG 11 | from mlfinlab.networks.visualisations import (generate_mst_server, create_input_matrix, generate_almst_server, 12 | generate_mst_almst_comparison) 13 | -------------------------------------------------------------------------------- /mlfinlab/networks/almst.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices. 3 | """ 4 | 5 | import heapq 6 | import itertools 7 | from itertools import count 8 | 9 | import networkx as nx 10 | import numpy as np 11 | import pandas as pd 12 | from mlfinlab.networks.graph import Graph 13 | 14 | 15 | class ALMST(Graph): 16 | """ 17 | ALMST is a subclass of Graph which creates an ALMST Graph object. 18 | The ALMST class converts a distance matrix input into an ALMST matrix. This is then used to create a nx.Graph object. 19 | """ 20 | 21 | def __init__(self, matrix, matrix_type, mst_algorithm='kruskal'): 22 | """ 23 | Initialises the ALMST and sets the self.graph attribute as the ALMST graph. 24 | 25 | :param matrix: (pd.Dataframe) Input matrix, such as a distance or correlation matrix. 26 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation"). 27 | :param mst_algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim'.
28 | By default, the MST algorithm uses Kruskal's. 29 | """ 30 | 31 | pass 32 | 33 | @staticmethod 34 | def create_almst_kruskals(matrix): 35 | """ 36 | This method converts the input matrix into an ALMST matrix. 37 | 38 | ! Currently only works with distance input matrix 39 | 40 | :param matrix: (pd.Dataframe) Input matrix. 41 | :return: (pd.Dataframe) ALMST matrix with all other edges as 0 values. 42 | """ 43 | 44 | pass 45 | 46 | @staticmethod 47 | def _generate_ordered_heap(matrix, clusters): 48 | """ 49 | Given the matrix of edges, and the list of clusters, generate a heap ordered by the average distance between the clusters. 50 | 51 | :param matrix: (pd.Dataframe) Input matrix of the distance matrix. 52 | :param clusters: (List) A list of clusters, where each list contains a list of nodes within the cluster. 53 | :return: (Heap) Returns a heap ordered by the average distance between the clusters. 54 | """ 55 | 56 | pass 57 | 58 | @staticmethod 59 | def _calculate_average_distance(matrix, clusters, c_x, c_y): 60 | """ 61 | Given two clusters, calculates the average distance between the two. 62 | 63 | :param matrix: (pd.Dataframe) Input matrix with all edges. 64 | :param clusters: (List) List of clusters. 65 | :param c_x: (int) Cluster x, where x is the index of the cluster. 66 | :param c_y: (int) Cluster y, where y is the index of the cluster. 67 | """ 68 | 69 | pass 70 | 71 | @staticmethod 72 | def _get_min_edge(node, cluster, matrix): 73 | """ 74 | Returns the minimum edge tuple given a node and a cluster. 75 | 76 | :param node: (str) String of the node name. 77 | :param cluster: (list) List of node names. 78 | :param matrix: (pd.DataFrame) A matrix of all edges. 79 | :return: (tuple) A tuple of average distance from node to the cluster, and the minimum edge nodes, i and j. 80 | """ 81 | 82 | pass 83 | 84 | @staticmethod 85 | def _get_min_edge_clusters(cluster_one, cluster_two, matrix): 86 | """ 87 | Returns a tuple of the minimum edge and the average length for two clusters. 88 | 89 | :param cluster_one: (list) List of node names. 90 | :param cluster_two: (list) List of node names. 91 | :param matrix: (pd.DataFrame) A matrix of all edges. 92 | :return: (tuple) A tuple of average distance between the clusters, and the minimum edge nodes, i and j. 93 | """ 94 | 95 | pass 96 | 97 | @staticmethod 98 | def create_almst(matrix): 99 | """ 100 | Creates and returns an ALMST given an input matrix using Prim's algorithm. 101 | 102 | :param matrix: (pd.Dataframe) Input distance matrix of all edges. 103 | :return: (pd.Dataframe) Returns the ALMST in matrix format. 104 | """ 105 | 106 | pass 107 | 108 | @staticmethod 109 | def _add_next_edge(visited, children, matrix, almst_matrix): 110 | """ 111 | Adds the next edge with the minimum average distance. 112 | 113 | :param visited: (Set) A set of visited nodes. 114 | :param children: (Set) A set of children or frontier nodes, to be visited. 115 | :param matrix: (pd.Dataframe) Input distance matrix of all edges. 116 | :param almst_matrix: (pd.Dataframe) The ALMST matrix. 117 | 118 | :return: (Tuple) Returns the sets visited and children, and the matrix almst_matrix. 119 | """ 120 | 121 | pass 122 | -------------------------------------------------------------------------------- /mlfinlab/networks/dual_dash_graph.py: -------------------------------------------------------------------------------- 1 | """ 2 | This class takes in a Graph object and creates interactive visualisations using Plotly's Dash.
-------------------------------------------------------------------------------- /mlfinlab/networks/dual_dash_graph.py: --------------------------------------------------------------------------------
 1 | """
 2 | This class takes in a Graph object and creates interactive visualisations using Plotly's Dash.
 3 | The DualDashGraph class contains private functions used to generate the frontend components needed to create the UI.
 4 | 
 5 | Running run_server() will produce the warning "Warning: This is a development server. Do not use app.run_server
 6 | in production, use a production WSGI server like gunicorn instead.".
 7 | However, this is okay and the Dash server will run without a problem.
 8 | """
 9 | 
10 | import dash_bootstrap_components as dbc
11 | import dash_cytoscape as cyto
12 | import dash_html_components as html
13 | from dash import Dash
14 | from dash.dependencies import Input, Output, State
15 | from jupyter_dash import JupyterDash
16 | 
17 | class DualDashGraph:
18 | """
19 | The DualDashGraph class is the interface for comparing and highlighting the difference between two graphs.
20 | Two Graph class objects should be supplied - such as MST and ALMST graphs.
21 | """
22 | 
23 | def __init__(self, graph_one, graph_two, app_display='default'):
24 | """
25 | Initialises the dual graph interface and generates the interface layout.
26 | 
27 | :param graph_one: (Graph) The first graph for the comparison interface.
28 | :param graph_two: (Graph) The second graph for the comparison interface.
29 | :param app_display: (str) 'default' by default and 'jupyter notebook' for running Dash inside Jupyter Notebook.
30 | """
31 | 
32 | pass
33 | 
34 | @staticmethod
35 | def _select_other_graph_node(data, elements):
36 | """
37 | Callback function to select the other graph node when a graph node
38 | is selected by setting selected to True.
39 | 
40 | :param data: (Dict) Dictionary of "tapped" or selected node.
41 | :param elements: (Dict) Dictionary of elements.
42 | :return: (Dict) Returns the updated dictionary of elements.
43 | """
44 | 
45 | pass
46 | 
47 | def _generate_comparison_layout(self, graph_one, graph_two):
48 | """
49 | Returns and generates a dual comparison layout.
50 | 
51 | :param graph_one: (Graph) The first graph object for the dual interface.
52 | :param graph_two: (Graph) Comparison graph object for the dual interface.
53 | :return: (html.Div) Returns a Div containing the interface.
54 | """
55 | 
56 | pass
57 | 
58 | @staticmethod
59 | def _get_default_stylesheet(weights):
60 | """
61 | Returns the default stylesheet for initialisation.
62 | 
63 | :param weights: (List) A list of weights of the edges.
64 | :return: (List) A list of definitions used for Dash styling.
65 | """
66 | 
67 | pass
68 | 
69 | def _set_cyto_graph(self):
70 | """
71 | Updates and sets the two cytoscape graphs using the corresponding components.
72 | """
73 | 
74 | pass
75 | 
76 | def _update_elements_dual(self, graph, difference, graph_number):
77 | """
78 | Updates the elements needed for the Dash Cytoscape Graph object.
79 | 
80 | :param graph: (Graph) Graph object such as MST or ALMST.
81 | :param difference: (List) List of edges where the two graphs differ.
82 | :param graph_number: (Int) Graph number to update the correct graph.
83 | """
84 | 
85 | pass
86 | 
87 | def get_server(self):
88 | """
89 | Returns the comparison interface server.
90 | 
91 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
92 | Returns a Jupyter Dash object if DashGraph has been initialised for Jupyter Notebook.
93 | """
94 | 
95 | pass
96 | 
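# A hypothetical usage sketch of the comparison interface above (the graph
# variable names are illustrative; run_server is the entry point the
# get_server docstring refers to):
#
#   mst_graph = MST(input_matrix, 'distance')
#   almst_graph = ALMST(input_matrix, 'distance')
#   app = DualDashGraph(mst_graph, almst_graph).get_server()
#   app.run_server()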
-------------------------------------------------------------------------------- /mlfinlab/networks/graph.py: --------------------------------------------------------------------------------
  1 | """
  2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
  3 | """
  4 | 
  5 | from abc import ABC
  6 | 
  7 | import networkx as nx
  8 | from matplotlib import pyplot as plt
  9 | 
 10 | 
 11 | class Graph(ABC):
 12 | """
 13 | This Graph class is a parent class for different types of graphs such as an MST.
 14 | """
 15 | 
 16 | def __init__(self, matrix_type):
 17 | """
 18 | Initializes the Graph object and the Graph class attributes.
 19 | This includes the specific graph such as an MST stored as an attribute.
 20 | 
 21 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation").
 22 | """
 23 | 
 24 | pass
 25 | 
 26 | def get_matrix_type(self):
 27 | """
 28 | Returns the matrix type set at initialisation.
 29 | 
 30 | :return: (str) String of matrix type (e.g. "correlation" or "distance").
 31 | """
 32 | 
 33 | pass
 34 | 
 35 | def get_graph(self):
 36 | """
 37 | Returns the Graph stored as an attribute.
 38 | 
 39 | :return: (nx.Graph) Returns a NetworkX graph object.
 40 | """
 41 | 
 42 | pass
 43 | 
 44 | def get_difference(self, input_graph_two):
 45 | """
 46 | Given two Graphs with the same nodes, returns the differences in edge connections.
 47 | 
 48 | :param input_graph_two: (Graph) A graph to compare self.graph against.
 49 | :return: (List) A list of unique tuples showing different edge connections.
 50 | """
 51 | 
 52 | pass
 53 | 
 54 | def get_pos(self):
 55 | """
 56 | Returns the dictionary of the node coordinates.
 57 | 
 58 | :return: (Dict) Dictionary of node coordinates.
 59 | """
 60 | 
 61 | pass
 62 | 
 63 | def get_graph_plot(self):
 64 | """
 65 | Returns the graph of the MST with labels.
 66 | Assumes that the matrix contains stock names as headers.
 67 | 
 68 | :return: (AxesSubplot) Axes with graph plot. Call plt.show() to display this graph.
 69 | """
 70 | 
 71 | pass
 72 | 
 73 | def set_node_groups(self, industry_groups):
 74 | """
 75 | Sets the node industry group, by taking in a dictionary of industry group to a list of node indexes.
 76 | 
 77 | :param industry_groups: (Dict) Dictionary of the industry name to a list of node indexes.
 78 | """
 79 | 
 80 | pass
 81 | 
 82 | def set_node_size(self, market_caps):
 83 | """
 84 | Sets the node sizes, given a list of market cap values corresponding to node indexes.
 85 | 
 86 | :param market_caps: (List) List of numbers corresponding to node indexes.
 87 | """
 88 | 
 89 | pass
 90 | 
 91 | def get_node_sizes(self):
 92 | """
 93 | Returns the node sizes as a list.
 94 | 
 95 | :return: (List) List of numbers representing node sizes.
 96 | """
 97 | 
 98 | pass
 99 | 
100 | def get_node_colours(self):
101 | """
102 | Returns a map of industry group matched with list of nodes.
103 | 
104 | :return: (Dict) Dictionary of industry name to list of node indexes.
105 | """
106 | 
107 | pass
108 | 
-------------------------------------------------------------------------------- /mlfinlab/networks/mst.py: --------------------------------------------------------------------------------
 1 | """
 2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
 3 | """
 4 | 
 5 | import networkx as nx
 6 | from mlfinlab.networks.graph import Graph
 7 | 
 8 | 
 9 | class MST(Graph):
10 | """
11 | MST is a subclass of Graph which creates an MST Graph object.
12 | """
13 | 
14 | def __init__(self, matrix, matrix_type, mst_algorithm='kruskal'):
15 | """
16 | Creates an MST Graph object and stores the MST inside the graph attribute.
17 | 
18 | :param matrix: (pd.Dataframe) Input matrix such as a distance or correlation matrix.
19 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation").
20 | :param mst_algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim', or 'boruvka'.
21 | By default, MST algorithm uses Kruskal's.
22 | """
23 | 
24 | pass
25 | 
26 | @staticmethod
27 | def create_mst(matrix, algorithm='kruskal'):
28 | """
29 | This method converts the input matrix into an MST graph.
30 | 
31 | :param matrix: (pd.Dataframe) Input matrix.
32 | :param algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim', or 'boruvka'.
33 | By default, MST algorithm uses Kruskal's.
34 | """
35 | 
36 | pass
37 | 
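# A minimal sketch of MST construction on a distance matrix with NetworkX,
# along the lines the MST class docstrings describe (illustrative, not the
# library's code):
import networkx as nx
import pandas as pd

def mst_sketch(distance_matrix: pd.DataFrame, algorithm: str = 'kruskal') -> nx.Graph:
    """Build a weighted graph from a distance matrix and reduce it to its MST."""
    graph = nx.from_pandas_adjacency(distance_matrix)
    # algorithm may be 'kruskal', 'prim' or 'boruvka', as in the docstring above
    return nx.minimum_spanning_tree(graph, algorithm=algorithm)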
-------------------------------------------------------------------------------- /mlfinlab/networks/pmfg.py: --------------------------------------------------------------------------------
 1 | """
 2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
 3 | """
 4 | 
 5 | import heapq
 6 | import itertools
 7 | from itertools import count
 8 | import warnings
 9 | 
10 | import networkx as nx
11 | from matplotlib import pyplot as plt
12 | 
13 | from mlfinlab.networks.graph import Graph
14 | 
15 | 
16 | class PMFG(Graph):
17 | """
18 | PMFG class creates and stores the PMFG as an attribute.
19 | """
20 | 
21 | def __init__(self, input_matrix, matrix_type):
22 | """
23 | PMFG class creates the Planar Maximally Filtered Graph and stores it as an attribute.
24 | 
25 | :param input_matrix: (pd.Dataframe) Input distance matrix.
26 | :param matrix_type: (str) Matrix type name (e.g. "distance").
27 | """
28 | 
29 | pass
30 | 
31 | def get_disparity_measure(self):
32 | """
33 | Getter method for the dictionary of disparity measure values of cliques.
34 | 
35 | :return: (Dict) Returns a dictionary of clique to the disparity measure.
36 | """
37 | 
38 | pass
39 | 
40 | def _calculate_disparity(self):
41 | """
42 | Calculate disparity given in Tumminello M, Aste T, Di Matteo T, Mantegna RN.
43 | A tool for filtering information in complex systems.
44 | https://arxiv.org/pdf/cond-mat/0501335.pdf
45 | 
46 | :return: (Dict) Returns a dictionary of clique to the disparity measure.
47 | """
48 | 
49 | pass
50 | 
51 | def _generate_cliques(self):
52 | """
53 | Generate cliques from all of the nodes in the PMFG.
54 | """
55 | 
56 | pass
57 | 
58 | def create_pmfg(self, input_matrix):
59 | """
60 | Creates the PMFG matrix from the input matrix of all edges.
61 | 
62 | :param input_matrix: (pd.Dataframe) Input matrix with all edges.
63 | :return: (nx.Graph) Output PMFG matrix.
64 | """
65 | 
66 | pass
67 | 
68 | def get_mst_edges(self):
69 | """
70 | Returns the list of MST edges.
71 | 
72 | :return: (list) Returns a list of tuples of edges.
73 | """
74 | 
75 | pass
76 | 
77 | def edge_in_mst(self, node1, node2):
78 | """
79 | Checks whether the edge from node1 to node2 is a part of the MST.
80 | 
81 | :param node1: (str) Name of the first node in the edge.
82 | :param node2: (str) Name of the second node in the edge.
83 | :return: (bool) Returns True if the edge is in the MST, False otherwise.
84 | """
85 | 
86 | pass
87 | 
88 | def get_graph_plot(self):
89 | """
90 | Overrides parent get_graph_plot to plot it in a planar format.
91 | 
92 | Returns the graph of the PMFG with labels.
93 | Assumes that the matrix contains stock names as headers.
94 | 
95 | :return: (AxesSubplot) Axes with graph plot. Call plt.show() to display this graph.
96 | """
97 | 
98 | pass
99 | 
-------------------------------------------------------------------------------- /mlfinlab/networks/visualisations.py: --------------------------------------------------------------------------------
 1 | """
 2 | These methods allow the user to easily deploy graph visualisations given an input dataframe.
 3 | """
 4 | 
 5 | import warnings
 6 | import networkx as nx
 7 | 
 8 | from mlfinlab.networks.dash_graph import DashGraph, PMFGDash
 9 | from mlfinlab.networks.dual_dash_graph import DualDashGraph
10 | from mlfinlab.networks.mst import MST
11 | from mlfinlab.networks.almst import ALMST
12 | from mlfinlab.networks.pmfg import PMFG
13 | from mlfinlab.codependence import get_distance_matrix
14 | 
15 | 
16 | def generate_mst_server(log_returns_df, mst_algorithm='kruskal', distance_matrix_type='angular',
17 | jupyter=False, colours=None, sizes=None):
18 | """
19 | This method returns a Dash server ready to be run.
20 | 
21 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
22 | with stock names as columns.
23 | :param mst_algorithm: (str) A valid MST type such as 'kruskal', 'prim', or 'boruvka'.
24 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
25 | namely 'angular', 'abs_angular', 'squared_angular'.
26 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
27 | :param colours: (Dict) A dictionary of key string for category name and value of a list of indexes
28 | corresponding to the node indexes inputted in the initial dataframe.
29 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
30 | in the initial dataframe.
31 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
32 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
33 | """
34 | 
35 | pass
36 | 
37 | 
38 | def create_input_matrix(log_returns_df, distance_matrix_type):
39 | """
40 | This method returns the distance matrix ready to be inputted into the Graph class.
41 | 
42 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
43 | with stock names as columns.
44 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
45 | namely 'angular', 'abs_angular', 'squared_angular'.
46 | :return: (pd.Dataframe) A dataframe of a distance matrix.
47 | """
48 | 
49 | pass
50 | 
51 | 
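# A sketch of the 'angular' transform that create_input_matrix applies to a
# correlation matrix (the standard definition, assumed to match
# mlfinlab.codependence.get_distance_matrix):
import numpy as np
import pandas as pd

def angular_distance(corr_matrix: pd.DataFrame) -> pd.DataFrame:
    """d_ij = sqrt(0.5 * (1 - rho_ij)): maps correlation to a metric in [0, 1]."""
    return np.sqrt(0.5 * (1 - corr_matrix))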
 52 | def generate_almst_server(log_returns_df, distance_matrix_type='angular',
 53 | jupyter=False, colours=None, sizes=None):
 54 | """
 55 | This method returns a Dash server ready to be run.
 56 | 
 57 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
 58 | with stock names as columns.
 59 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
 60 | namely 'angular', 'abs_angular', 'squared_angular'.
 61 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
 62 | :param colours: (Dict) A dictionary of key string for category name and value of a list of indexes
 63 | corresponding to the node indexes inputted in the initial dataframe.
 64 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
 65 | in the initial dataframe.
 66 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
 67 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
 68 | """
 69 | 
 70 | pass
 71 | 
 72 | 
 73 | def generate_mst_almst_comparison(log_returns_df, distance_matrix_type='angular', jupyter=False):
 74 | """
 75 | This method returns a Dash server ready to be run.
 76 | 
 77 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
 78 | with stock names as columns.
 79 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
 80 | namely 'angular', 'abs_angular', 'squared_angular'.
 81 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
 82 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
 83 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
 84 | """
 85 | 
 86 | pass
 87 | 
 88 | 
 89 | def generate_pmfg_server(log_returns_df, input_type='distance',
 90 | jupyter=False, colours=None, sizes=None):
 91 | """
 92 | This method returns a PMFGDash server ready to be run.
 93 | 
 94 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
 95 | with stock names as columns.
 96 | :param input_type: (str) A valid input type, 'correlation' or 'distance'. Inputting correlation will add the edges
 97 | by largest to smallest, instead of smallest to largest.
 98 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
 99 | :param colours: (Dict) A dictionary of key string for category name and value of a list of indexes
100 | corresponding to the node indexes inputted in the initial dataframe.
101 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
102 | in the initial dataframe.
103 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
104 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
105 | """
106 | 
107 | pass
108 | 
109 | 
110 | def generate_central_peripheral_ranking(nx_graph):
111 | """
112 | Given a NetworkX graph, this method generates and returns a ranking of centrality.
113 | The input should be a distance based PMFG.
114 | 
115 | The ranking combines multiple centrality measures to calculate an overall ranking of how central or peripheral the
116 | nodes are.
117 | The smaller the ranking, the more peripheral the node is. The larger the ranking, the more central the node is.
118 | 
119 | The factors contributing to the ranking include Degree, Eccentricity, Closeness Centrality, Second Order Centrality,
120 | Eigenvector Centrality and Betweenness Centrality. The formulas for these measures can be found in the NetworkX
121 | documentation (https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html)
122 | 
123 | :param nx_graph: (nx.Graph) NetworkX graph object. You can call get_graph() on the MST, ALMST and PMFG to retrieve
124 | the nx.Graph.
125 | :return: (List) Returns a list of tuples of ranking value to node.
126 | """
127 | 
128 | pass
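# A compact sketch in the spirit of generate_central_peripheral_ranking above:
# sum several NetworkX centrality scores per node and sort. The exact measures
# and weighting the library combines are not reproduced here; this subset is
# illustrative only:
import networkx as nx

def centrality_ranking_sketch(nx_graph: nx.Graph) -> list:
    """Return a sorted list of (score, node) tuples; higher scores are more central."""
    measures = [
        nx.degree_centrality(nx_graph),
        nx.closeness_centrality(nx_graph),
        nx.eigenvector_centrality(nx_graph, max_iter=1000),
        nx.betweenness_centrality(nx_graph),
    ]
    scores = {node: sum(measure[node] for measure in measures) for node in nx_graph.nodes}
    return sorted((score, node) for node, score in scores.items())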
-------------------------------------------------------------------------------- /mlfinlab/regression/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Implementation of the historically weighted regression method based on relevance.
3 | """
4 | 
5 | from mlfinlab.regression.history_weight_regression import HistoryWeightRegression
6 | 
-------------------------------------------------------------------------------- /mlfinlab/sample_weights/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Contains the code for implementing sample weights and stacked sample weights.
3 | """
4 | 
5 | from mlfinlab.sample_weights.attribution import (get_weights_by_time_decay, get_weights_by_return,
6 | _apply_weight_by_return, get_stacked_weights_time_decay,
7 | get_stacked_weights_by_return)
8 | 
-------------------------------------------------------------------------------- /mlfinlab/sample_weights/attribution.py: --------------------------------------------------------------------------------
 1 | """
 2 | Logic regarding return and time decay attribution for sample weights from chapter 4.
 3 | Also contains stacked sample weights logic: return and time based sample weights for a multi-asset dataset.
 4 | """
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | from mlfinlab.sampling.concurrent import (num_concurrent_events, get_av_uniqueness_from_triple_barrier)
10 | from mlfinlab.util.multiprocess import mp_pandas_obj
11 | 
12 | def _apply_weight_by_return(label_endtime, num_conc_events, close_series, molecule):
13 | """
14 | Advances in Financial Machine Learning, Snippet 4.10, page 69.
15 | 
16 | Determination of Sample Weight by Absolute Return Attribution
17 | 
18 | Derives sample weights based on concurrency and return. Works on a set of
19 | datetime index values (molecule). This allows the program to parallelize the processing.
20 | 
21 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events).
22 | :param num_conc_events: (pd.Series) Number of concurrent labels (output from num_concurrent_events function).
23 | :param close_series: (pd.Series) Close prices.
24 | :param molecule: (an array) A set of datetime index values for processing.
25 | :return: (pd.Series) Sample weights based on return and concurrency for molecule.
26 | """
27 | 
28 | pass
29 | 
30 | 
31 | def get_weights_by_return(triple_barrier_events, close_series, num_threads=5, verbose=True):
32 | """
33 | Advances in Financial Machine Learning, Snippet 4.10 (part 2), page 69.
34 | 
35 | Determination of Sample Weight by Absolute Return Attribution
36 | 
37 | This function is the orchestrator for generating sample weights based on return using mp_pandas_obj.
38 | 
39 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events().
40 | :param close_series: (pd.Series) Close prices.
41 | :param num_threads: (int) The number of threads concurrently used by the function.
42 | :param verbose: (bool) Flag to report progress on asynch jobs.
43 | :return: (pd.Series) Sample weights based on return and concurrency.
44 | """
45 | 
46 | pass
47 | 
48 | 
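# A sketch of the absolute-return attribution from Snippet 4.10: each label is
# weighted by the absolute sum of log-returns over its lifespan, diluted by
# the number of concurrent labels at each bar (variable names are
# illustrative, not the library's implementation):
import numpy as np
import pandas as pd

def return_weights_sketch(label_endtime: pd.Series, num_conc_events: pd.Series,
                          close_series: pd.Series) -> pd.Series:
    """Weight each label by |sum_t r_t / c_t| over its [start, end] span."""
    log_ret = np.log(close_series).diff()
    weights = pd.Series(index=label_endtime.index, dtype=float)
    for t_in, t_out in label_endtime.items():
        weights.loc[t_in] = (log_ret.loc[t_in:t_out] / num_conc_events.loc[t_in:t_out]).sum()
    return weights.abs()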
 49 | def get_weights_by_time_decay(triple_barrier_events, close_series, num_threads=5, decay=1, verbose=True):
 50 | """
 51 | Advances in Financial Machine Learning, Snippet 4.11, page 70.
 52 | 
 53 | Implementation of Time Decay Factors.
 54 | 
 55 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events().
 56 | :param close_series: (pd.Series) Close prices.
 57 | :param num_threads: (int) The number of threads concurrently used by the function.
 58 | :param decay: (float) Decay factor
 59 | - decay = 1 means there is no time decay;
 60 | - 0 < decay < 1 means that weights decay linearly over time, but every observation still receives a strictly positive weight, regardless of how old;
 61 | - decay = 0 means that weights converge linearly to zero, as they become older;
 62 | - decay < 0 means that the oldest portion c of the observations receive zero weight (i.e. they are erased from memory).
 63 | :param verbose: (bool) Flag to report progress on asynch jobs.
 64 | :return: (pd.Series) Sample weights based on time decay factors.
 65 | """
 66 | 
 67 | pass
 68 | 
 69 | 
 70 | def get_stacked_weights_by_return(triple_barrier_events_dict: dict, close_series_dict: dict, num_threads: int = 5,
 71 | verbose: bool = True) -> dict:
 72 | """
 73 | Get return based sample weights for a multi-asset dataset. The function applies mlfinlab's get_weights_by_return
 74 | function to a multi-asset dataset.
 75 | 
 76 | :param triple_barrier_events_dict: (dict) Dictionary of asset_name: triple barrier event series.
 77 | :param close_series_dict: (dict) Dictionary of asset_name: close series used to form label events.
 78 | :param num_threads: (int) Number of threads used to get sample weights.
 79 | :param verbose: (bool) Flag to report progress on asynch jobs.
 80 | :return: (dict) Dictionary of asset_name: sample weight series.
 81 | """
 82 | 
 83 | pass
 84 | 
 85 | 
 86 | def get_stacked_weights_time_decay(triple_barrier_events_dict: dict, close_series_dict: dict, decay: float = 0.5,
 87 | num_threads: int = 5,
 88 | verbose: bool = True) -> dict:
 89 | """
 90 | Get time decay based sample weights for a multi-asset dataset. The function applies mlfinlab's
 91 | get_weights_by_time_decay function to a multi-asset dataset.
 92 | 
 93 | :param triple_barrier_events_dict: (dict) Dictionary of asset_name: triple barrier event series.
 94 | :param close_series_dict: (dict) Dictionary of asset_name: close series used to form label events.
 95 | :param decay: (float) Decay factor
 96 | - decay = 1 means there is no time decay;
 97 | - 0 < decay < 1 means that weights decay linearly over time, but every observation still receives a strictly positive weight, regardless of how old;
 98 | - decay = 0 means that weights converge linearly to zero, as they become older;
 99 | - decay < 0 means that the oldest portion c of the observations receive zero weight (i.e. they are erased from memory).
100 | :param num_threads: (int) Number of threads used to get sample weights.
101 | :param verbose: (bool) Flag to report progress on asynch jobs.
102 | :return: (dict) Dictionary of asset_name: sample weight series.
103 | """
104 | 
105 | pass
106 | 
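# A sketch of the time-decay weighting from Snippet 4.11: a piecewise-linear
# decay applied to the cumulative sum of average uniqueness, so the newest
# observation gets weight 1 (illustrative, not the library implementation):
import pandas as pd

def time_decay_sketch(av_uniqueness: pd.Series, decay: float = 1.0) -> pd.Series:
    """Map cumulative uniqueness to linearly decayed weights."""
    cum_uniqueness = av_uniqueness.sort_index().cumsum()
    if decay >= 0:
        slope = (1.0 - decay) / cum_uniqueness.iloc[-1]
    else:
        slope = 1.0 / ((decay + 1) * cum_uniqueness.iloc[-1])
    const = 1.0 - slope * cum_uniqueness.iloc[-1]
    weights = const + slope * cum_uniqueness
    weights[weights < 0] = 0  # observations older than the cutoff are erased
    return weights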
-------------------------------------------------------------------------------- /mlfinlab/sampling/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Contains the logic regarding the sequential bootstrapping from chapter 4, as well as the concurrent labels.
3 | """
4 | 
5 | from mlfinlab.sampling.bootstrapping import (get_ind_matrix, get_ind_mat_average_uniqueness, seq_bootstrap,
6 | get_ind_mat_label_uniqueness)
7 | from mlfinlab.sampling.concurrent import (num_concurrent_events, _get_average_uniqueness,
8 | get_av_uniqueness_from_triple_barrier)
9 | 
-------------------------------------------------------------------------------- /mlfinlab/sampling/bootstrapping.py: --------------------------------------------------------------------------------
 1 | """
 2 | Logic regarding sequential bootstrapping from chapter 4.
 3 | """
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | from numba import jit, prange
 8 | 
 9 | 
10 | def get_ind_matrix(samples_info_sets, price_bars):
11 | """
12 | Advances in Financial Machine Learning, Snippet 4.3, page 65.
13 | 
14 | Build an Indicator Matrix
15 | 
16 | Get indicator matrix. The book implementation uses bar_index as input, however there is no explanation
17 | how to form it. We decided that using triple_barrier_events and price bars by analogy with concurrency
18 | is the best option.
19 | 
20 | :param samples_info_sets: (pd.Series) Triple barrier events (t1) from labeling.get_events
21 | :param price_bars: (pd.DataFrame) Price bars which were used to form triple barrier events
22 | :return: (np.array) Indicator binary matrix indicating what (price) bars influence the label for each observation
23 | """
24 | 
25 | pass
26 | 
27 | 
28 | def get_ind_mat_average_uniqueness(ind_mat):
29 | """
30 | Advances in Financial Machine Learning, Snippet 4.4, page 65.
31 | 
32 | Compute Average Uniqueness
33 | 
34 | Average uniqueness from indicator matrix
35 | 
36 | :param ind_mat: (np.matrix) Indicator binary matrix
37 | :return: (float) Average uniqueness
38 | """
39 | 
40 | pass
41 | 
42 | 
43 | def get_ind_mat_label_uniqueness(ind_mat):
44 | """
45 | Advances in Financial Machine Learning, an adaptation of Snippet 4.4, page 65.
46 | 
47 | Returns the indicator matrix element uniqueness.
48 | 
49 | :param ind_mat: (np.matrix) Indicator binary matrix
50 | :return: (np.matrix) Element uniqueness
51 | """
52 | 
53 | pass
54 | 
55 | 
56 | @jit(parallel=True, nopython=True)
57 | def _bootstrap_loop_run(ind_mat, prev_concurrency): # pragma: no cover
58 | """
59 | Part of the Sequential Bootstrapping for-loop. Using the previously accumulated concurrency array, loops through
60 | all samples and generates the average uniqueness array of labels based on previously accumulated concurrency.
61 | 
62 | :param ind_mat: (np.array) Indicator matrix from get_ind_matrix function
63 | :param prev_concurrency: (np.array) Accumulated concurrency from previous iterations of sequential bootstrapping
64 | :return: (np.array) Label average uniqueness based on prev_concurrency
65 | """
66 | 
67 | pass
68 | 
69 | 
70 | def seq_bootstrap(ind_mat, sample_length=None, warmup_samples=None, compare=False, verbose=False,
71 | random_state=np.random.RandomState()):
72 | """
73 | Advances in Financial Machine Learning, Snippet 4.5, Snippet 4.6, page 65.
74 | 
75 | Return Sample from Sequential Bootstrap
76 | 
77 | Generate a sample via sequential bootstrap.
78 | Note: Moved from pd.DataFrame to np.matrix for performance increase.
79 | 
80 | :param ind_mat: (pd.DataFrame) Indicator matrix from triple barrier events
81 | :param sample_length: (int) Length of bootstrapped sample
82 | :param warmup_samples: (list) List of previously drawn samples
83 | :param compare: (boolean) Flag to print standard bootstrap uniqueness vs sequential bootstrap uniqueness
84 | :param verbose: (boolean) Flag to print updated probabilities on each step
85 | :param random_state: (np.random.RandomState) Random state
86 | :return: (array) Bootstrapped sample indexes
87 | """
88 | 
89 | pass
90 | 
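# A sketch of average uniqueness from an indicator matrix (Snippet 4.4):
# each bar's concurrency dilutes every label active at that bar (illustrative):
import numpy as np

def average_uniqueness_sketch(ind_mat: np.ndarray) -> float:
    """ind_mat is (num_bars x num_labels), 1 where a label spans a bar."""
    concurrency = ind_mat.sum(axis=1)               # labels active per bar
    active = concurrency > 0
    uniqueness = ind_mat[active] / concurrency[active, None]
    return float(uniqueness[ind_mat[active] == 1].mean())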
-------------------------------------------------------------------------------- /mlfinlab/sampling/concurrent.py: --------------------------------------------------------------------------------
 1 | """
 2 | Logic regarding concurrent labels from chapter 4.
 3 | """
 4 | 
 5 | import pandas as pd
 6 | 
 7 | from mlfinlab.util.multiprocess import mp_pandas_obj
 8 | 
 9 | 
10 | def num_concurrent_events(close_series_index, label_endtime, molecule):
11 | """
12 | Advances in Financial Machine Learning, Snippet 4.1, page 60.
13 | 
14 | Estimating the Uniqueness of a Label
15 | 
16 | This function uses close series prices and label endtime (when the first barrier is touched) to compute the number
17 | of concurrent events per bar.
18 | 
19 | :param close_series_index: (pd.Series) Close prices index
20 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events)
21 | :param molecule: (an array) A set of datetime index values for processing
22 | :return: (pd.Series) Number of concurrent labels for each datetime index
23 | """
24 | 
25 | pass
26 | 
27 | 
28 | def _get_average_uniqueness(label_endtime, num_conc_events, molecule):
29 | """
30 | Advances in Financial Machine Learning, Snippet 4.2, page 62.
31 | 
32 | Estimating the Average Uniqueness of a Label
33 | 
34 | This function uses the label endtime (when the first barrier is touched) and the number of concurrent events per
35 | bar to compute the average uniqueness over each event's lifespan.
36 | 
37 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events)
38 | :param num_conc_events: (pd.Series) Number of concurrent labels (output from num_concurrent_events function).
39 | :param molecule: (an array) A set of datetime index values for processing.
40 | :return: (pd.Series) Average uniqueness over event's lifespan.
41 | """
42 | 
43 | pass
44 | 
45 | 
46 | def get_av_uniqueness_from_triple_barrier(triple_barrier_events, close_series, num_threads, verbose=True):
47 | """
48 | This function is the orchestrator to derive average sample uniqueness from a dataset labeled by the triple barrier
49 | method.
50 | 
51 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events()
52 | :param close_series: (pd.Series) Close prices.
53 | :param num_threads: (int) The number of threads concurrently used by the function.
54 | :param verbose: (bool) Flag to report progress on asynch jobs
55 | :return: (pd.Series) Average uniqueness over event's lifespan for each index in triple_barrier_events
56 | """
57 | 
58 | pass
59 | 
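# A sketch of the concurrency count from Snippet 4.1: for every label span
# [t_in, t_out], increment each bar the label touches (illustrative):
import pandas as pd

def concurrency_sketch(close_series_index: pd.DatetimeIndex, label_endtime: pd.Series) -> pd.Series:
    """Number of labels active at each bar of the close series."""
    count = pd.Series(0, index=close_series_index)
    for t_in, t_out in label_endtime.items():
        count.loc[t_in:t_out] += 1
    return count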
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Structural breaks tests (CUSUM, Chow, SADF).
3 | """
4 | 
5 | from mlfinlab.structural_breaks.chow import get_chow_type_stat
6 | from mlfinlab.structural_breaks.cusum import get_chu_stinchcombe_white_statistics
7 | from mlfinlab.structural_breaks.sadf import get_sadf
8 | 
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/chow.py: --------------------------------------------------------------------------------
 1 | """
 2 | Explosiveness tests: Chow-Type Dickey-Fuller Test
 3 | """
 4 | 
 5 | import pandas as pd
 6 | from mlfinlab.structural_breaks.sadf import get_betas
 7 | from mlfinlab.util import mp_pandas_obj
 8 | 
 9 | 
10 | # pylint: disable=invalid-name
11 | 
12 | def _get_dfc_for_t(series: pd.Series, molecule: list) -> pd.Series:
13 | """
14 | Get Chow-Type Dickey-Fuller Test statistics for each index in molecule
15 | 
16 | :param series: (pd.Series) Series to test
17 | :param molecule: (list) Dates to test
18 | :return: (pd.Series) Statistics for each index from molecule
19 | """
20 | 
21 | pass
22 | 
23 | 
24 | def get_chow_type_stat(series: pd.Series, min_length: int = 20, num_threads: int = 8, verbose: bool = True) -> pd.Series:
25 | """
26 | Multithread implementation of the Chow-Type Dickey-Fuller Test, p. 251-252.
27 | 
28 | :param series: (pd.Series) Series to test
29 | :param min_length: (int) Minimum sample length used to estimate statistics
30 | :param num_threads: (int) Number of cores to use
31 | :param verbose: (bool) Flag to report progress on asynch jobs
32 | :return: (pd.Series) Chow-Type Dickey-Fuller Test statistics
33 | """
34 | 
35 | pass
36 | 
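# A sketch of the Chow-type DF statistic at a candidate break date tau
# (AFML p. 251-252): regress the series' first difference on its lagged level
# switched on after tau, and return the t-value of the break coefficient
# (illustrative OLS; not the library's _get_dfc_for_t):
import numpy as np
import pandas as pd

def chow_dfc_sketch(series: pd.Series, tau) -> float:
    """t-value of delta in: diff(y)_t = delta * y_{t-1} * 1{t > tau} + eps_t."""
    y = series.diff().dropna()
    lagged = series.shift(1).loc[y.index]
    dummy = (y.index > tau).astype(float)
    X = (lagged * dummy).values.reshape(-1, 1)
    beta = np.linalg.lstsq(X, y.values, rcond=None)[0]
    resid = y.values - X @ beta
    beta_var = resid.var() * np.linalg.inv(X.T @ X)
    return float(beta[0] / np.sqrt(beta_var[0, 0]))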
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/cusum.py: --------------------------------------------------------------------------------
 1 | """
 2 | Implementation of the Chu-Stinchcombe-White test
 3 | """
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | from mlfinlab.util import mp_pandas_obj
 8 | 
 9 | 
10 | def _get_values_diff(test_type, series, index, ind):
11 | """
12 | Gets the difference between two values given a test type.
13 | 
14 | :param test_type: (str) Type of the test ['one_sided', 'two_sided']
15 | :param series: (pd.Series) Series of values
16 | :param index: (pd.Index) Primary index
17 | :param ind: (pd.Index) Secondary index
18 | :return: (float) Difference between two values
19 | """
20 | 
21 | pass
22 | 
23 | 
24 | def _get_s_n_for_t(series: pd.Series, test_type: str, molecule: list) -> pd.Series:
25 | """
26 | Get the maximum S_n_t value for each value from molecule for the Chu-Stinchcombe-White test
27 | 
28 | :param series: (pd.Series) Series to get statistics for
29 | :param test_type: (str) Two-sided or one-sided test
30 | :param molecule: (list) Indices to get test statistics for
31 | :return: (pd.Series) Statistics
32 | """
33 | 
34 | pass
35 | 
36 | 
37 | def get_chu_stinchcombe_white_statistics(series: pd.Series, test_type: str = 'one_sided',
38 | num_threads: int = 8, verbose: bool = True) -> pd.Series:
39 | """
40 | Multithread Chu-Stinchcombe-White test implementation, p. 251.
41 | 
42 | :param series: (pd.Series) Series to get statistics for
43 | :param test_type: (str) Two-sided or one-sided test
44 | :param num_threads: (int) Number of cores
45 | :param verbose: (bool) Flag to report progress on asynch jobs
46 | :return: (pd.Series) Statistics
47 | """
48 | 
49 | pass
50 | 
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/sadf.py: --------------------------------------------------------------------------------
 1 | """
 2 | Explosiveness tests: SADF
 3 | """
 4 | 
 5 | from typing import Union, Tuple
 6 | import pandas as pd
 7 | import numpy as np
 8 | from mlfinlab.util.multiprocess import mp_pandas_obj
 9 | 
10 | 
11 | # pylint: disable=invalid-name
12 | 
13 | def _get_sadf_at_t(X: pd.DataFrame, y: pd.DataFrame, min_length: int, model: str, phi: float) -> float:
14 | """
15 | Advances in Financial Machine Learning, Snippet 17.2, page 258.
16 | 
17 | SADF's Inner Loop (get SADF value at t)
18 | 
19 | :param X: (pd.DataFrame) Lagged values, constants, trend coefficients
20 | :param y: (pd.DataFrame) Y values (either y or y.diff())
21 | :param min_length: (int) Minimum number of samples needed for estimation
22 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
23 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
24 | :return: (float) SADF statistics for y.index[-1]
25 | """
26 | 
27 | pass
28 | 
29 | 
30 | def _get_y_x(series: pd.Series, model: str, lags: Union[int, list],
31 | add_const: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
32 | """
33 | Advances in Financial Machine Learning, Snippet 17.2, page 258-259.
34 | 
35 | Preparing The Datasets
36 | 
37 | :param series: (pd.Series) Series to prepare for test statistics generation (for example log prices)
38 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
39 | :param lags: (int or list) Either number of lags to use or array of specified lags
40 | :param add_const: (bool) Flag to add constant
41 | :return: (pd.DataFrame, pd.DataFrame) Prepared y and X for SADF generation
42 | """
43 | 
44 | pass
45 | 
46 | 
47 | def _lag_df(df: pd.DataFrame, lags: Union[int, list]) -> pd.DataFrame:
48 | """
49 | Advances in Financial Machine Learning, Snippet 17.3, page 259.
50 | 
 51 | Apply Lags to DataFrame
 52 | 
 53 | :param df: (pd.DataFrame) DataFrame to apply lags to
 54 | :param lags: (int or list) Either number of lags to use or array of specified lags
 55 | :return: (pd.DataFrame) Dataframe with lags
 56 | """
 57 | 
 58 | pass
 59 | 
 60 | 
 61 | def get_betas(X: pd.DataFrame, y: pd.DataFrame) -> Tuple[np.array, np.array]:
 62 | """
 63 | Advances in Financial Machine Learning, Snippet 17.4, page 259.
 64 | 
 65 | Fitting The ADF Specification (get beta estimate and estimate variance)
 66 | 
 67 | :param X: (pd.DataFrame) Features (factors)
 68 | :param y: (pd.DataFrame) Outcomes
 69 | :return: (np.array, np.array) Betas and variances of estimates
 70 | """
 71 | 
 72 | pass
 73 | 
 74 | 
 75 | def _sadf_outer_loop(X: pd.DataFrame, y: pd.DataFrame, min_length: int, model: str, phi: float,
 76 | molecule: list) -> pd.Series:
 77 | """
 78 | This function gets SADF for t times from molecule
 79 | 
 80 | :param X: (pd.DataFrame) Features (factors)
 81 | :param y: (pd.DataFrame) Outcomes
 82 | :param min_length: (int) Minimum number of observations
 83 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
 84 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
 85 | :param molecule: (list) Indices to get SADF
 86 | :return: (pd.Series) SADF statistics
 87 | """
 88 | 
 89 | pass
 90 | 
 91 | def get_sadf(series: pd.Series, model: str, lags: Union[int, list], min_length: int, add_const: bool = False,
 92 | phi: float = 0, num_threads: int = 8, verbose: bool = True) -> pd.Series:
 93 | """
 94 | Advances in Financial Machine Learning, p. 258-259.
 95 | 
 96 | Multithread implementation of SADF
 97 | 
 98 | SADF fits the ADF regression at each end point t with backwards expanding start points. For the estimation
 99 | of SADF(t), the right side of the window is fixed at t. SADF recursively expands the beginning of the sample
100 | up to t - min_length, and returns the sup of this set.
101 | 
102 | When performing a sub- or super-martingale test, the variance of beta of a weak long-run bubble may be smaller than
103 | that of a strong short-run bubble, hence biasing the method towards long-run bubbles. To correct for this bias,
104 | the ADF statistic in samples with large lengths can be penalized with the coefficient phi in [0, 1] such that:
105 | 
106 | ADF_penalized = ADF / (sample_length ^ phi)
107 | 
108 | :param series: (pd.Series) Series for which SADF statistics are generated
109 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
110 | :param lags: (int or list) Either number of lags to use or array of specified lags
111 | :param min_length: (int) Minimum number of observations needed for estimation
112 | :param add_const: (bool) Flag to add constant
113 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
114 | :param num_threads: (int) Number of cores to use
115 | :param verbose: (bool) Flag to report progress on asynch jobs
116 | :return: (pd.Series) SADF statistics
117 | """
118 | 
119 | pass
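# A sketch of the OLS fit behind get_betas (Snippet 17.4): the beta estimate
# and its covariance from the normal equations (illustrative):
import numpy as np

def get_betas_sketch(X: np.ndarray, y: np.ndarray):
    """Return (b_mean, b_var) for y = X @ beta + eps."""
    xx_inv = np.linalg.inv(X.T @ X)
    b_mean = xx_inv @ X.T @ y
    err = y - X @ b_mean
    b_var = (err.T @ err) / (X.shape[0] - X.shape[1]) * xx_inv
    return b_mean, b_var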
-------------------------------------------------------------------------------- /mlfinlab/util/__init__.py: --------------------------------------------------------------------------------
 1 | """
 2 | Utility functions. In particular, Chapter 20 code on Multiprocessing and Vectorization.
 3 | """
 4 | 
 5 | from mlfinlab.util.fast_ewma import ewma
 6 | from mlfinlab.util.multiprocess import (expand_call, lin_parts, mp_pandas_obj, nested_parts,
 7 | process_jobs, process_jobs_, report_progress)
 8 | from mlfinlab.util.volatility import (get_daily_vol, get_garman_class_vol, get_yang_zhang_vol, get_parksinson_vol)
 9 | from mlfinlab.util.volume_classifier import get_bvc_buy_volume
10 | from mlfinlab.util.generate_dataset import get_classification_data
11 | 
-------------------------------------------------------------------------------- /mlfinlab/util/fast_ewma.py: --------------------------------------------------------------------------------
 1 | """
 2 | This module contains an implementation of an exponentially weighted moving average based on sample size.
 3 | The inspiration and context for this code came from a blog post written by Maksim Ivanov:
 4 | https://towardsdatascience.com/financial-machine-learning-part-0-bars-745897d4e4ba
 5 | """
 6 | 
 7 | # Imports
 8 | import numpy as np
 9 | from numba import jit
10 | from numba import float64
11 | from numba import int64
12 | 
13 | 
14 | @jit((float64[:], int64), nopython=False, nogil=True)
15 | def ewma(arr_in, window): # pragma: no cover
16 | """
17 | Exponentially weighted moving average specified by a decay ``window`` to provide better adjustments for
18 | small windows via:
19 | y[t] = (x[t] + (1-a)*x[t-1] + (1-a)^2*x[t-2] + ... + (1-a)^n*x[t-n]) /
20 | (1 + (1-a) + (1-a)^2 + ... + (1-a)^n).
21 | 
22 | :param arr_in: (np.ndarray), (float64) A single dimensional numpy array
23 | :param window: (int64) The decay window, or 'span'
24 | :return: (np.ndarray) The EWMA vector, same length / shape as ``arr_in``
25 | """
26 | 
27 | pass
28 | 
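# A sketch of the span-based EWMA formula quoted in the ewma docstring above,
# with the normalising weight accumulated incrementally (plain NumPy rather
# than the numba-compiled version; illustrative):
import numpy as np

def ewma_sketch(arr_in: np.ndarray, window: int) -> np.ndarray:
    alpha = 2.0 / (window + 1)  # span parameterisation
    out = np.empty_like(arr_in, dtype=np.float64)
    weight_sum = 1.0
    ewma_old = arr_in[0]
    out[0] = ewma_old
    for i in range(1, arr_in.shape[0]):
        weight_sum += (1 - alpha) ** i                  # denominator of the quoted formula
        ewma_old = ewma_old * (1 - alpha) + arr_in[i]   # unnormalised numerator
        out[i] = ewma_old / weight_sum
    return out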
-------------------------------------------------------------------------------- /mlfinlab/util/generate_dataset.py: --------------------------------------------------------------------------------
 1 | """
 2 | This module generates a synthetic classification dataset of INFORMED, REDUNDANT, and NOISE explanatory
 3 | variables based on the book Machine Learning for Asset Managers (code snippet 6.1)
 4 | """
 5 | import numpy as np
 6 | import pandas as pd
 7 | from sklearn.datasets import make_classification
 8 | 
 9 | # pylint: disable=invalid-name
10 | def get_classification_data(n_features=100, n_informative=25, n_redundant=25, n_samples=10000, random_state=0, sigma=.0):
11 | """
12 | A function to generate synthetic classification datasets
13 | 
14 | :param n_features: (int) Total number of features to be generated (i.e. informative + redundant + noisy).
15 | :param n_informative: (int) Number of informative features.
16 | :param n_redundant: (int) Number of redundant features.
17 | :param n_samples: (int) Number of samples (rows) to be generated.
18 | :param random_state: (int) Random seed.
19 | :param sigma: (float) This argument is used to introduce substitution effect to the redundant features in
20 | the dataset by adding gaussian noise. The lower the value of sigma, the greater the
21 | substitution effect.
22 | :return: (pd.DataFrame, pd.Series) X and y as features and labels respectively.
23 | """
24 | 
25 | pass
26 | 
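# A sketch of the construction snippet 6.1 describes: draw informative features
# with make_classification, then append redundant copies of randomly chosen
# informative columns plus gaussian noise scaled by sigma (illustrative, not
# the library's exact code):
import numpy as np
from sklearn.datasets import make_classification

def classification_data_sketch(n_features=100, n_informative=25, n_redundant=25,
                               n_samples=10000, random_state=0, sigma=0.0):
    rng = np.random.RandomState(random_state)
    X, y = make_classification(n_samples=n_samples, n_features=n_features - n_redundant,
                               n_informative=n_informative, n_redundant=0,
                               shuffle=False, random_state=random_state)
    informative_idx = rng.choice(range(n_informative), size=n_redundant)
    redundant = X[:, informative_idx] + rng.normal(size=(n_samples, n_redundant)) * sigma
    return np.hstack([X, redundant]), y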
-------------------------------------------------------------------------------- /mlfinlab/util/misc.py: --------------------------------------------------------------------------------
 1 | """
 2 | Various useful functions
 3 | """
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | 
 8 | def crop_data_frame_in_batches(df: pd.DataFrame, chunksize: int):
 9 | # pylint: disable=invalid-name
10 | """
11 | Splits df into chunks of chunksize
12 | 
13 | :param df: (pd.DataFrame) Dataframe to split
14 | :param chunksize: (int) Number of rows in chunk
15 | :return: (list) Chunks (pd.DataFrames)
16 | """
17 | 
18 | pass
19 | 
-------------------------------------------------------------------------------- /mlfinlab/util/volatility.py: --------------------------------------------------------------------------------
 1 | """
 2 | Various volatility estimators
 3 | """
 4 | import pandas as pd
 5 | import numpy as np
 6 | 
 7 | 
 8 | # pylint: disable=redefined-builtin
 9 | 
10 | def get_daily_vol(close, lookback=100):
11 | """
12 | Advances in Financial Machine Learning, Snippet 3.1, page 44.
13 | 
14 | Daily Volatility Estimates
15 | 
16 | Computes the daily volatility at intraday estimation points.
17 | 
18 | In practice we want to set profit taking and stop-loss limits that are a function of the risks involved
19 | in a bet. Otherwise, sometimes we will be aiming too high (tau ≫ sigma_t_i,0), and sometimes too low
20 | (tau ≪ sigma_t_i,0), considering the prevailing volatility. Snippet 3.1 computes the daily volatility
21 | at intraday estimation points, applying a span of lookback days to an exponentially weighted moving
22 | standard deviation.
23 | 
24 | See the pandas documentation for details on the pandas.Series.ewm function.
25 | Note: This function is used to compute dynamic thresholds for profit taking and stop loss limits.
26 | 
27 | :param close: (pd.Series) Closing prices
28 | :param lookback: (int) Lookback period to compute volatility
29 | :return: (pd.Series) Daily volatility value
30 | """
31 | 
32 | pass
33 | 
34 | 
35 | def get_parksinson_vol(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series:
36 | """
37 | Parkinson volatility estimator
38 | 
39 | :param high: (pd.Series) High prices
40 | :param low: (pd.Series) Low prices
41 | :param window: (int) Window used for estimation
42 | :return: (pd.Series) Parkinson volatility
43 | """
44 | 
45 | pass
46 | 
47 | 
48 | def get_garman_class_vol(open: pd.Series, high: pd.Series, low: pd.Series, close: pd.Series,
49 | window: int = 20) -> pd.Series:
50 | """
51 | Garman-Klass volatility estimator
52 | 
53 | :param open: (pd.Series) Open prices
54 | :param high: (pd.Series) High prices
55 | :param low: (pd.Series) Low prices
56 | :param close: (pd.Series) Close prices
57 | :param window: (int) Window used for estimation
58 | :return: (pd.Series) Garman-Klass volatility
59 | """
60 | 
61 | pass
62 | 
63 | 
64 | def get_yang_zhang_vol(open: pd.Series, high: pd.Series, low: pd.Series, close: pd.Series,
65 | window: int = 20) -> pd.Series:
66 | """
67 | 
68 | Yang-Zhang volatility estimator
69 | 
70 | :param open: (pd.Series) Open prices
71 | :param high: (pd.Series) High prices
72 | :param low: (pd.Series) Low prices
73 | :param close: (pd.Series) Close prices
74 | :param window: (int) Window used for estimation
75 | :return: (pd.Series) Yang-Zhang volatility
76 | """
77 | 
78 | pass
79 | 
-------------------------------------------------------------------------------- /mlfinlab/util/volume_classifier.py: --------------------------------------------------------------------------------
 1 | """
 2 | Volume classification methods (BVC and tick rule)
 3 | """
 4 | 
 5 | from scipy.stats import norm
 6 | import pandas as pd
 7 | 
 8 | 
 9 | def get_bvc_buy_volume(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series:
10 | """
11 | Calculates the BVC buy volume
12 | 
13 | :param close: (pd.Series) Close prices
14 | :param volume: (pd.Series) Bar volumes
15 | :param window: (int) Window for std estimation used in BVC calculation
16 | :return: (pd.Series) BVC buy volume
17 | """
18 | # .apply(norm.cdf) is used to omit Warning for norm.cdf(pd.Series with NaNs)
19 | 
20 | pass
21 | 
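# A sketch of the BVC (bulk volume classification) rule the docstring above
# describes: the share of each bar's volume classified as buy-initiated is the
# normal CDF of the standardised price change (illustrative):
import pandas as pd
from scipy.stats import norm

def bvc_buy_volume_sketch(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series:
    price_change = close.diff()
    standardised = price_change / price_change.rolling(window=window).std()
    return volume * standardised.apply(norm.cdf)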
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
 1 | # Production
 2 | numpy>=0.16.0
 3 | matplotlib>=3.0.0
 4 | pandas>=1.0.0
 5 | scikit-learn>=0.20.0
 6 | scipy>=1.2.0
 7 | statsmodels>=0.9.0
 8 | cython>=0.29
 9 | POT>=0.7.0
10 | numba>=0.40.0
11 | networkx>=2.2, <2.6
12 | dash>=1.0.0
13 | dash-cytoscape>=0.1.0
14 | dash-bootstrap-components>=0.10.0
15 | jupyter-dash>=0.2.0
16 | tensorflow>=2.0.0
17 | joblib>=1.0.0
18 | decorator>=4.0.0, <5.0.0
19 | analytics-python>=1.2.7
20 | getmac>=0.8.0
21 | 
22 | 
23 | # Develop
24 | codecov==2.1.11
25 | coverage==5.4
26 | pylint==2.6.0
27 | sphinx==3.4.3 # Docs
28 | hudsonthames-sphinx-theme==0.1.5 # Docs
29 | sphinx-rtd-theme==0.5.2 # Docs
30 | releases==1.6.3 # Docs
31 | 
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = mlfinlab
 3 | version = 1.3.0
 4 | author = Hudson and Thames Quantitative Research
 5 | author_email = research@hudsonthames.org
 6 | license = All Rights Reserved
 7 | license_file = LICENSE.txt
 8 | description = MlFinlab helps portfolio managers and traders who want to leverage the power of machine learning by providing reproducible, interpretable, and easy to use tools.
 9 | long_description = file: README.md
10 | long_description_content_type = text/markdown
11 | platform = any
12 | url = https://www.hudsonthames.org/
13 | project_urls =
14 | Documentation = https://mlfinlab.readthedocs.io/en/latest/
15 | Bug Reports = https://github.com/hudson-and-thames/mlfinlab/issues
16 | Project Boards = https://github.com/orgs/hudson-and-thames/projects
17 | Source = https://github.com/hudson-and-thames/mlfinlab
18 | Blog = https://hudsonthames.org/blog/
19 | Apprenticeship Program = https://hudsonthames.org/apprenticeship-program/
20 | classifiers =
21 | Development Status :: 5 - Production/Stable
22 | Intended Audience :: Developers
23 | Intended Audience :: Education
24 | Intended Audience :: Science/Research
25 | Intended Audience :: Financial and Insurance Industry
26 | License :: Other/Proprietary License
27 | Operating System :: OS Independent
28 | Programming Language :: Python
29 | Programming Language :: Python :: 3.6
30 | Programming Language :: Python :: 3.7
31 | Programming Language :: Python :: 3.8
32 | Topic :: Scientific/Engineering
33 | Topic :: Scientific/Engineering :: Artificial Intelligence
34 | Topic :: Office/Business :: Financial :: Investment
35 | keywords =
36 | machinelearning
37 | finance
38 | investment
39 | education
40 | 
41 | [options]
42 | include_package_data = True
43 | packages = find:
44 | python_requires =
45 | >=3.6, <3.9
46 | setup_requires =
47 | setuptools
48 | cython
49 | install_requires =
50 | numpy>=0.16.0
51 | matplotlib>=3.0.0
52 | pandas>=1.0.0
53 | scikit-learn>=0.20.0
54 | scipy>=1.2.0
55 | statsmodels>=0.9.0
56 | cython>=0.29
57 | POT>=0.7.0
58 | numba>=0.40.0
59 | networkx>=2.2, <2.6
60 | dash>=1.0.0
61 | dash-cytoscape>=0.1.0
62 | dash-bootstrap-components>=0.10.0
63 | jupyter-dash>=0.2.0
64 | tensorflow>=2.0.0
65 | joblib>=1.0.0
66 | decorator>=4.0.0, <5.0.0
67 | analytics-python>=1.2.7
68 | getmac>=0.8.0
69 | 
70 | 
71 | [options.packages.find]
72 | package_dir =
73 | mlfinlab
74 | exclude =
75 | contrib
76 | docs
77 | tests
78 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
 1 | # Always prefer setuptools over distutils
 2 | from setuptools import setup
 3 | 
 4 | setup()
 5 | 
 6 | # Create package
 7 | # python setup.py bdist_wheel
 8 | # python3 -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* (This is the test repo)
 9 | # twine upload dist/* (This is the official repo)
10 | 
--------------------------------------------------------------------------------