├── .bumpversion.cfg
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── custom.md
│   │   └── feature_request.md
│   ├── logo
│   │   ├── hudson_and_thames_logo.png
│   │   └── support.png
│   └── pull_request_template.md
├── .gitignore
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE.txt
├── README.md
├── docs
│   ├── Makefile
│   ├── make.bat
│   └── source
│       ├── _static
│       │   ├── .gitkeep
│       │   ├── favicon_mlfinlab.png
│       │   ├── ht_logo_black.png
│       │   ├── ht_logo_white.png
│       │   ├── logo_black.png
│       │   └── logo_white.png
│       ├── _templates
│       │   └── breadcrumbs.html
│       ├── additional_information
│       │   ├── analytics.rst
│       │   ├── contact.rst
│       │   ├── contributing.rst
│       │   ├── images
│       │   │   └── slack.png
│       │   ├── license.rst
│       │   └── privacy_gdpr.rst
│       ├── changelog.rst
│       ├── conf.py
│       ├── index.rst
│       └── requirements.txt
├── mlfinlab
│   ├── __init__.py
│   ├── backtest_statistics
│   │   ├── __init__.py
│   │   ├── backtests.py
│   │   └── statistics.py
│   ├── bet_sizing
│   │   ├── __init__.py
│   │   ├── bet_sizing.py
│   │   ├── ch10_snippets.py
│   │   └── ef3m.py
│   ├── clustering
│   │   ├── __init__.py
│   │   ├── feature_clusters.py
│   │   ├── hierarchical_clustering.py
│   │   └── onc.py
│   ├── codependence
│   │   ├── __init__.py
│   │   ├── codependence_matrix.py
│   │   ├── correlation.py
│   │   ├── gnpr_distance.py
│   │   ├── information.py
│   │   └── optimal_transport.py
│   ├── cross_validation
│   │   ├── __init__.py
│   │   ├── combinatorial.py
│   │   └── cross_validation.py
│   ├── data_generation
│   │   ├── __init__.py
│   │   ├── bootstrap.py
│   │   ├── correlated_random_walks.py
│   │   ├── corrgan.py
│   │   ├── data_verification.py
│   │   ├── hcbm.py
│   │   └── vines.py
│   ├── data_structures
│   │   ├── __init__.py
│   │   ├── base_bars.py
│   │   ├── imbalance_data_structures.py
│   │   ├── run_data_structures.py
│   │   ├── standard_data_structures.py
│   │   └── time_data_structures.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── dollar_bar_sample.csv
│   │   │   ├── stock_prices.csv
│   │   │   └── tick_data.csv
│   │   └── load_datasets.py
│   ├── ensemble
│   │   ├── __init__.py
│   │   └── sb_bagging.py
│   ├── feature_importance
│   │   ├── __init__.py
│   │   ├── fingerpint.py
│   │   ├── importance.py
│   │   └── orthogonal.py
│   ├── features
│   │   ├── __init__.py
│   │   └── fracdiff.py
│   ├── filters
│   │   ├── __init__.py
│   │   └── filters.py
│   ├── labeling
│   │   ├── __init__.py
│   │   ├── bull_bear.py
│   │   ├── excess_over_mean.py
│   │   ├── excess_over_median.py
│   │   ├── fixed_time_horizon.py
│   │   ├── labeling.py
│   │   ├── matrix_flags.py
│   │   ├── raw_return.py
│   │   ├── return_vs_benchmark.py
│   │   ├── tail_sets.py
│   │   └── trend_scanning.py
│   ├── microstructural_features
│   │   ├── __init__.py
│   │   ├── encoding.py
│   │   ├── entropy.py
│   │   ├── feature_generator.py
│   │   ├── first_generation.py
│   │   ├── misc.py
│   │   ├── second_generation.py
│   │   └── third_generation.py
│   ├── multi_product
│   │   ├── __init__.py
│   │   └── etf_trick.py
│   ├── networks
│   │   ├── __init__.py
│   │   ├── almst.py
│   │   ├── dash_graph.py
│   │   ├── dual_dash_graph.py
│   │   ├── graph.py
│   │   ├── mst.py
│   │   ├── pmfg.py
│   │   └── visualisations.py
│   ├── regression
│   │   ├── __init__.py
│   │   └── history_weight_regression.py
│   ├── sample_weights
│   │   ├── __init__.py
│   │   └── attribution.py
│   ├── sampling
│   │   ├── __init__.py
│   │   ├── bootstrapping.py
│   │   └── concurrent.py
│   ├── structural_breaks
│   │   ├── __init__.py
│   │   ├── chow.py
│   │   ├── cusum.py
│   │   └── sadf.py
│   └── util
│       ├── __init__.py
│       ├── fast_ewma.py
│       ├── generate_dataset.py
│       ├── misc.py
│       ├── multiprocess.py
│       ├── volatility.py
│       └── volume_classifier.py
├── requirements.txt
├── setup.cfg
└── setup.py
/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 1.2.0
3 | commit = True
4 | tag = True
5 | tag_name = {new_version}
6 |
7 | [bumpversion:file:setup.cfg]
8 |
9 | [bumpversion:file:docs/source/conf.py]
10 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: HudsonThames # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Custom issue template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/logo/hudson_and_thames_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/.github/logo/hudson_and_thames_logo.png
--------------------------------------------------------------------------------
/.github/logo/support.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/.github/logo/support.png
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Description
2 |
3 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
4 |
5 | Fixes # (issue)
6 |
7 | ## Type of change
8 |
9 | Please delete options that are not relevant.
10 |
11 | - [ ] Bug fix (non-breaking change which fixes an issue)
12 | - [ ] New feature (non-breaking change which adds functionality)
13 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
14 | - [ ] This change requires a documentation update
15 |
16 | # How Has This Been Tested?
17 |
18 | Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration.
19 |
20 | - [ ] Test A
21 | - [ ] Test B
22 |
23 | **Test Configuration**:
24 | * Operating system
25 | * IDE used
26 |
27 |
28 | # Checklist:
29 |
30 | - [ ] My code follows the style guidelines of this project
31 | - [ ] I have performed a self-review of my own code
32 | - [ ] I have commented my code, particularly in hard-to-understand areas
33 | - [ ] I have made corresponding changes to the documentation
34 | - [ ] My changes generate no new warnings
35 | - [ ] I have added tests that prove my fix is effective or that my feature works
36 | - [ ] New and existing unit tests pass locally with my changes
37 | - [ ] Any dependent changes have been merged and published in downstream modules
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | *.pyc
3 | __pycache__
4 | test_reports
5 | .coverage
6 | .DS_Store
7 | docs/build/
8 | .local/
9 | cover/
10 | *.pickle
11 | */.ipynb_checkpoints/*
12 | mlfinlab.egg-info/*
13 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/source/conf.py
11 |
12 | # Optionally build your docs in additional formats such as PDF
13 | formats: []
14 |
15 | # Optionally set the version of Python and requirements required to build your docs
16 | python:
17 | version: 3.8
18 | install:
19 | - requirements: docs/source/requirements.txt
20 |
21 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at hudsonthames19@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to MlFinLab:
2 |
3 | First off, we want to thank you for taking the time to contribute to the project.
4 |
5 | We make use of an [Apprenticeship Program](https://hudsonthames.org/mentorship/), which caters to ambitious students looking
6 | to make an impact on open-source and develop a portfolio of work based on financial machine learning.
7 |
8 | This allows us to establish organised collaboration and control the level of code quality.
9 |
10 | ## External Contributions:
11 |
12 | We do encourage external contributions sourced from members of our community via our [Slack Channel](https://www.patreon.com/HudsonThames).
13 |
14 | We have quite a rigorous process of unit testing, code style checks, and documentation.
15 |
16 |
17 | ## Raise an Issue
18 | We have created [templates](https://github.com/hudson-and-thames/mlfinlab/issues/new/choose) to aid in creating issues and PRs:
19 | * Bug report
20 | * Feature request
21 | * Custom issue template
22 | * Pull Request Template
23 |
24 | ---
25 |
26 | ## Contact us
27 | We host a booming community of like-minded data scientists and quants, join the
28 | [Slack Channel](https://www.patreon.com/HudsonThames) now! Open to sponsors of our package.
29 |
30 | The channel has the following benefits:
31 |
32 | * Community of like-minded individuals.
33 | * Ask questions about the package implementations and get community feedback.
34 | * Occasional presentations on topics within financial machine learning.
35 | * A papers channel where we share the papers which are freely available.
36 | * Access to members of our research group.
37 |
38 | You can also email us at research@hudsonthames.org
39 |
40 | Looking forward to hearing from you!
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
9 |
10 |
11 | # Welcome to Machine Learning Financial Laboratory!
12 |
13 |
14 |
15 |
16 |
17 | >This repo is public facing and exists for the sole purpose of providing users with an easy way to raise bugs, feature requests, and other issues.
18 |
19 |
20 |
21 |
22 |
23 | ## What is MlFinLab?
24 | The MlFinLab python library is the perfect toolbox that every financial machine learning researcher needs.
25 |
26 | It covers every step of ML strategy creation, starting from data structure generation and finishing with backtest statistics.
27 | We pride ourselves on the robustness of our codebase - every line of code in the modules is extensively tested and
28 | documented.
29 |
30 |
31 | ## Documentation, Example Notebooks and Lecture Videos
32 | For every technique present in the library we not only provide extensive documentation, with both theoretical explanations
33 | and detailed descriptions of available functions, but also supplement the modules with an ever-growing array of lecture videos and slides
34 | on the implemented methods.
35 |
36 | We want you to be able to use the tools right away. To achieve that, every module comes with a number of example notebooks
37 | which include detailed examples of the usage of the algorithms. Our goal is to show you the whole pipeline, starting from
38 | importing the libraries and ending with strategy performance metrics, so you can get the added value from the get-go.
39 |
40 |
52 |
53 |
54 | ### Included modules:
55 |
56 | - Backtest Overfitting Tools
57 | - Data Structures
58 | - Labeling
59 | - Sampling
60 | - Feature Engineering
61 | - Models
62 | - Clustering
63 | - Cross-Validation
64 | - Hyper-Parameter Tuning
65 | - Feature Importance
66 | - Bet Sizing
67 | - Synthetic Data Generation
68 | - Networks
69 | - Measures of Codependence
70 | - Useful Financial Features
71 |
72 |
73 | ## Licensing options
74 | This project is licensed under an all rights reserved [licence](https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt).
75 |
76 | * Business
77 | * Enterprise
78 |
79 |
80 | ## Community
81 | With the purchase of the library, our clients get access to the Hudson & Thames Slack community, where our engineers and other quants
82 | are always ready to answer your questions.
83 |
84 | Alternatively, you can email us at: research@hudsonthames.org.
85 |
86 |
92 |
93 |
94 | ## Who is Hudson & Thames?
95 | Hudson and Thames Quantitative Research is a company with the goal of bridging the gap between the advanced research developed in
96 | quantitative finance and its practical application. We have created three premium python libraries so you can effortlessly access the
97 | latest techniques and focus on what matters most: **creating your own winning strategy**.
98 |
99 |
100 | ### What was only possible with the help of huge R&D teams is now at your disposal, anywhere, anytime.
101 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/_static/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/.gitkeep
--------------------------------------------------------------------------------
/docs/source/_static/favicon_mlfinlab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/favicon_mlfinlab.png
--------------------------------------------------------------------------------
/docs/source/_static/ht_logo_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/ht_logo_black.png
--------------------------------------------------------------------------------
/docs/source/_static/ht_logo_white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/ht_logo_white.png
--------------------------------------------------------------------------------
/docs/source/_static/logo_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/logo_black.png
--------------------------------------------------------------------------------
/docs/source/_static/logo_white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/logo_white.png
--------------------------------------------------------------------------------
/docs/source/_templates/breadcrumbs.html:
--------------------------------------------------------------------------------
1 | {%- extends "sphinx_rtd_theme/breadcrumbs.html" %}
2 |
3 | {% block breadcrumbs_aside %}
4 | {% endblock %}
--------------------------------------------------------------------------------
/docs/source/additional_information/analytics.rst:
--------------------------------------------------------------------------------
1 | .. _additional_information-analytics:
2 |
3 | =========
4 | Analytics
5 | =========
6 |
7 | .. warning::
8 |
9 | * Please don't alter or change any of the code, as this is a violation of our license agreement.
10 | * We do provide a separate enterprise license for companies that want to white label or alter code.
11 | * All changes are flagged by the system.
12 |
13 | Please note that we have added standard web analytics to MLFinLab, using `Segment <https://segment.com>`__.
14 |
15 | We track the following:
16 |
17 | * City, Country, Region, City Geographic Coordinate
18 | * UserIDs (MAC address)
19 | * Function calls
20 | * Timestamps
21 |
22 | This allows our team to see how the package is being used by you, our client, so that we may improve the functionality and
23 | build more tools that you will love. An additional purpose is that we need to start tracking growth KPIs such as cohort
24 | retention and MAU, which we will compile into reports for investors, as we are aiming for VC funding in late 2021.
25 |
26 | The impact of the analytics is negligible.
27 |
28 | .. note::
29 |
30 | * We chose to use MAC addresses as they are anonymous tokens which allow us to track a machine and are not considered personal information under GDPR unless combined with other personal data which then identifies the natural person.
31 | * Your data is also anonymized by filtering it through ipinfo, which returns high-level location (City, Country, Region) data without sharing your IP address.
32 | * Segment is the tool we use to collect, clean, and control the data.
--------------------------------------------------------------------------------
/docs/source/additional_information/contact.rst:
--------------------------------------------------------------------------------
1 | .. _additional_information-contact:
2 |
3 | =========================
4 | Join the Slack Channel 🔑
5 | =========================
6 |
7 | We host a booming community of like-minded data scientists and quants. Join the Slack channel now! Available via the
8 | `H&T Client Portal `__.
9 |
10 | The channel has the following benefits:
11 |
12 | * Community of like-minded individuals.
13 | * Ask questions about the package implementations and get community feedback.
14 | * Occasional presentations on topics within financial machine learning.
15 | * A papers channel where we share the papers which are freely available.
16 | * Access to members of our research group.
17 |
18 | Looking forward to hearing from you!
19 |
20 | .. image:: ./images/slack.png
21 | :scale: 65 %
22 | :align: center
23 |
--------------------------------------------------------------------------------
/docs/source/additional_information/contributing.rst:
--------------------------------------------------------------------------------
1 | .. _additional_information-contributing:
2 |
3 | ============
4 | Contributing
5 | ============
6 |
7 | Areas of Contribution
8 | #####################
9 |
10 | Currently we have a live project board that follows the principles of Agile Project Management.
11 |
12 | At the time of writing, we are focusing our attention primarily on contributions from the current Researchers enrolled
13 | in our `Apprenticeship Program <https://hudsonthames.org/mentorship/>`_.
14 |
15 | There is of course room for the public to make contributions. The most useful are those that help to improve user experience.
16 | Good examples of this are writing tutorial notebooks which answer questions
17 | from the back of a chapter, mlfinlab recipes, improving docstrings, and adding new sphinx documentation.
18 |
19 | Raising Issues
20 | ##############
21 |
22 | We have created `templates`_ to aid in creating issues and PRs:
23 |
24 | * Bug report
25 | * Feature request
26 | * Custom issue template
27 | * Pull Request Template
28 |
29 | Please do create issues for new feature requests and bug fixes.
30 |
31 | .. _templates: https://github.com/hudson-and-thames/mlfinlab/issues/new/choose
32 |
--------------------------------------------------------------------------------
/docs/source/additional_information/images/slack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/additional_information/images/slack.png
--------------------------------------------------------------------------------
/docs/source/additional_information/privacy_gdpr.rst:
--------------------------------------------------------------------------------
1 | .. _additional_information-privacy_gdpr:
2 |
3 | =======================
4 | Privacy and GDPR Policy
5 | =======================
6 |
7 | .. note::
8 | Our Privacy and GDPR Policies can be downloaded directly from our website:
9 |
10 | * `Privacy Policy `_
11 | * `GDPR Policy `_
12 |
--------------------------------------------------------------------------------
/docs/source/changelog.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 | ..
5 | The Following are valid options
6 | * :release:`0.1.0 <2021-01-12>`
7 | * :support:`119` Upgrade to pandas 1.0
8 | * :feature:`50` Add a distutils command for marbles
9 | * :bug:`58` Fixed test failure on OSX
10 | ..
11 | For Help: https://releases.readthedocs.io/en/latest/index.html
12 |
13 | * :release:`1.3.0 <2021-07-09>`
14 | * :feature:`69` Added support for Python 3.6 and Python 3.7.
15 | * :feature:`69` Requirements versions are now non-fixed.
16 | * :support:`69` Migrated Optimal Mean Reversion Module from MlFinLab to ArbitrageLab.
17 | * :support:`69` Reflected Optimal Mean Reversion Module migration in the documentation.
18 |
19 | * :release:`1.2.0 <2021-06-23>`
20 | * :support:`64` Updated references in documentation.
21 | * :support:`63` Updated documentation theme to hudsonthames-sphinx-docs.
22 | * :bug:`66 major` Fixed issue with too many function calls in web analytics.
23 |
24 | * :release:`1.1.0 <2021-04-15>`
25 | * :feature:`56` MAE/MSE added as possible metrics for the Trend Scanning Module.
26 | * :feature:`58` Low silhouette scores check made optional in Feature Clusters Module.
27 | * :bug:`57 major` Fix purging bug in Purged KFold/Combinatorial Purged KFold.
28 | * :feature:`61` History Weighted Regression added to the Regression Module.
29 | * :support:`61` History Weighted Regression documentation.
30 | * :feature:`59` Code and unit tests style unified.
31 | * :support:`59` Documentation style unified.
32 | * :feature:`45` Added Pagan et al. and Lunde et al. Bull Bear Methods to the Labeling Module.
33 | * :support:`45` Added Pagan et al. and Lunde et al. Bull Bear Methods documentation.
34 | * :bug:`60 major` Fix structural break bug in the Chu-Stinchcombe-White test.
35 | * :feature:`46` Stacked Module with Cross Validation, Feature Importance, and Sampling methods added.
36 | * :feature:`46` Lambda code in Microstructural Features Module speed-up.
37 | * :support:`46` Stacked Module documentation.
38 |
39 | * :release:`1.0.1 <2021-02-19>`
40 | * :support:`55` Removed TensorFlow from requirements and adjusted installation guide.
41 |
42 | * :release:`1.0.0 <2021-02-16>`
43 | * :feature:`35` Debugged ETF Trick code.
44 | * :feature:`44` Added n_repeat parameter to MDA feature importance.
45 | * :feature:`50` Added t-student option to BVC classifier.
46 | * :bug:`50` Fix bug in Bar-based Kyle lambdas calculation.
47 | * :feature:`52` Migrated Portfolio Optimisation Module code from MlFinLab to PortfolioLab.
48 | * :support:`52` Migrated Portfolio Optimisation Module documentation from MlFinLab to PortfolioLab.
49 | * :feature:`52` Migrated Online Portfolio Selection Module code from MlFinLab to PortfolioLab.
50 | * :support:`52` Migrated Online Portfolio Selection Module documentation from MlFinLab to PortfolioLab.
51 | * :support:`52` Updated requirements versions (numpy==1.20.1, matplotlib==3.2.2,
52 | pandas==1.1.5, scikit-learn==0.24.1, scipy==1.6.0, statsmodels==0.12.2).
53 |
54 | * :release:`0.15.3 <2021-01-12>`
55 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # http://www.sphinx-doc.org/en/master/config
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('./../..'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'mlfinlab'
21 | copyright = '2019, Hudson & Thames Quantitative Research.'
22 | author = 'Hudson & Thames Quantitative Research'
23 |
24 | # The full version, including alpha/beta/rc tags
25 | release = '1.3.0'
26 |
27 |
28 | # -- General configuration ---------------------------------------------------
29 |
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 | 'sphinx.ext.autodoc',
35 | 'sphinx.ext.coverage',
36 | 'sphinx.ext.intersphinx',
37 | 'sphinx.ext.viewcode',
38 | 'releases'
39 | ]
40 |
41 |
42 | # Add any paths that contain templates here, relative to this directory.
43 | templates_path = ['_templates']
44 |
45 | master_doc = 'index'
46 |
47 | # List of patterns, relative to source directory, that match files and
48 | # directories to ignore when looking for source files.
49 | # This pattern also affects html_static_path and html_extra_path.
50 | exclude_patterns = []
51 |
52 |
53 | # -- Options for HTML output -------------------------------------------------
54 |
55 | # The theme to use for HTML and HTML Help pages. See the documentation for
56 | # a list of builtin themes.
57 | #
58 | html_theme = 'hudsonthames_sphinx_theme'
59 | add_module_names = False
60 |
61 | # Theme options are theme-specific and customize the look and feel of a theme
62 | # further. For a list of options available for each theme, see the
63 | # documentation.
64 | #
65 | # html_theme_options = {}
66 |
67 | html_context = {'logo': 'logo_white.png', 'theme_logo_only': True}
68 | html_favicon = '_static/favicon_mlfinlab.png'
69 |
70 | # Add any paths that contain custom static files (such as style sheets) here,
71 | # relative to this directory. They are copied after the builtin static files,
72 | # so a file named "default.css" will overwrite the builtin "default.css".
73 | html_static_path = ['_static']
74 | html_copy_source = True
75 |
76 | # 'releases' (changelog) settings
77 | releases_github_path = 'hudson-and-thames/mlfinlab_premium'
78 | releases_unstable_prehistory = True
79 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. image:: _static/logo_black.png
2 | :scale: 50 %
3 | :align: center
4 | :target: https://hudsonthames.org/
5 |
6 | |
7 |
8 | ================================================
9 | Machine Learning Financial Laboratory (mlfinlab)
10 | ================================================
11 |
12 | MlFinlab is a python package which helps portfolio managers and traders who want to leverage the power of machine learning
13 | by providing reproducible, interpretable, and easy-to-use tools.
14 |
15 | Adding MlFinLab to your company's pipeline is like adding a department of PhD researchers to your team.
16 |
17 | .. code-block::
18 |
19 | pip install mlfinlab
20 |
21 | We source all of our implementations from the most elite and peer-reviewed journals, including publications from:
22 |
23 | 1. `The Journal of Financial Data Science `_
24 | 2. `The Journal of Portfolio Management `_
25 | 3. `The Journal of Algorithmic Finance `_
26 | 4. `Cambridge University Press `_
27 |
28 |
29 | Documentation & Tutorials
30 | #########################
31 |
32 | We lower barriers to entry for all users by providing extensive `documentation `_
33 | and `tutorial notebooks `_, with code examples.
34 |
35 | Who is Hudson & Thames?
36 | #######################
37 |
38 | Hudson and Thames Quantitative Research is a company with a focus on implementing the most cutting-edge algorithms in
39 | quantitative finance. We productionalize all our tools in the form of libraries and provide capability to our clients.
40 |
41 | * `Website `_
42 | * `Github Group `_
43 | * `MlFinLab Documentation `_
44 |
45 | Contact us
46 | ##########
47 |
48 | The best place to contact the team is via the Slack channel. Alternatively you can email us at: research@hudsonthames.org.
49 |
50 | Looking forward to hearing from you!
51 |
52 | License
53 | #######
54 |
55 | This project is licensed under an all rights reserved licence and is NOT open-source. It may not be used for commercial purposes without a commercial license, which may be purchased from Hudson and Thames Quantitative Research.
56 |
57 | See the `LICENSE.txt `_ file for details.
58 |
59 | .. toctree::
60 | :maxdepth: 2
61 | :caption: Legal
62 | :hidden:
63 |
64 | additional_information/license
65 | additional_information/analytics
66 | additional_information/privacy_gdpr
67 |
--------------------------------------------------------------------------------
/docs/source/requirements.txt:
--------------------------------------------------------------------------------
1 | # Production
2 | numpy==1.18.5
3 | matplotlib==3.2.2
4 | pandas==1.1.5
5 | scikit-learn==0.24.1
6 | scipy==1.6.0
7 | statsmodels==0.12.2
8 | cython==0.29.17
9 | POT==0.7.0
10 | numba==0.52.0
11 | networkx==2.5
12 | dash==1.19.0
13 | dash-cytoscape==0.2.0
14 | dash-bootstrap-components==0.11.3
15 | jupyter-dash==0.4.0
16 | tensorflow==2.2.1
17 | joblib==1.0.1
18 | analytics-python==1.2.9
19 | getmac==0.8.2
20 |
21 |
22 | # Develop
23 | bump2version==1.0.1
24 | bumpversion==0.6.0
25 | codecov==2.1.11
26 | coverage==5.4
27 | pylint==2.6.0
28 | sphinx==3.4.3 # Docs
29 | hudsonthames-sphinx-theme==0.1.5 # Docs
30 | sphinx-rtd-theme==0.5.2 # Docs
31 | releases==1.6.3 # Docs
32 |
--------------------------------------------------------------------------------
/mlfinlab/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | MlFinlab helps portfolio managers and traders who want to leverage the power of machine learning by providing
3 | reproducible, interpretable, and easy to use tools.
4 |
5 | Adding MlFinLab to your company's pipeline is like adding a department of PhD researchers to your team.
6 | """
7 |
8 | import mlfinlab.cross_validation as cross_validation
9 | import mlfinlab.data_structures as data_structures
10 | import mlfinlab.datasets as datasets
11 | import mlfinlab.multi_product as multi_product
12 | import mlfinlab.filters.filters as filters
13 | import mlfinlab.labeling as labeling
14 | import mlfinlab.features.fracdiff as fracdiff
15 | import mlfinlab.sample_weights as sample_weights
16 | import mlfinlab.sampling as sampling
17 | import mlfinlab.bet_sizing as bet_sizing
18 | import mlfinlab.util as util
19 | import mlfinlab.structural_breaks as structural_breaks
20 | import mlfinlab.feature_importance as feature_importance
21 | import mlfinlab.ensemble as ensemble
22 | import mlfinlab.clustering as clustering
23 | import mlfinlab.microstructural_features as microstructural_features
24 | import mlfinlab.backtest_statistics.backtests as backtests
25 | import mlfinlab.backtest_statistics.statistics as backtest_statistics
26 | import mlfinlab.networks as networks
27 | import mlfinlab.data_generation as data_generation
28 | import mlfinlab.regression as regression
29 |
--------------------------------------------------------------------------------
/mlfinlab/backtest_statistics/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Implements general backtest statistics.
3 | """
4 |
5 | from mlfinlab.backtest_statistics.backtests import CampbellBacktesting
6 | from mlfinlab.backtest_statistics.statistics import (timing_of_flattening_and_flips, average_holding_period,
7 | bets_concentration, all_bets_concentration,
8 | drawdown_and_time_under_water, sharpe_ratio,
9 | information_ratio, probabilistic_sharpe_ratio,
10 | deflated_sharpe_ratio, minimum_track_record_length)
11 |
--------------------------------------------------------------------------------
/mlfinlab/bet_sizing/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions derived from Chapter 10: Bet Sizing
3 | Only the highest-level user functions are included in the __init__ file.
4 |
5 | This folder contains classes and functions for sizing bets based on a given investment strategy with given bet side
6 | confidence, e.g. the output from a machine learning model. The approaches implemented in this module are based on
7 | those described in Chapter 10 of "Advances in Financial Machine Learning" by Marcos López de Prado.
8 | """
9 |
10 | from mlfinlab.bet_sizing.bet_sizing import (bet_size_probability, bet_size_dynamic, bet_size_budget, bet_size_reserve,
11 | confirm_and_cast_to_df, get_concurrent_sides, cdf_mixture,
12 | single_bet_size_mixed)
13 | from mlfinlab.bet_sizing.ef3m import (M2N, centered_moment, raw_moment, most_likely_parameters)
14 |
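# A hedged usage sketch (the call shape below is assumed from the docstring,
# not a guaranteed signature; the shipped implementations live in
# bet_sizing.py and ef3m.py):
#
#   events = ...  # e.g. triple-barrier events with model probabilities 'prob'
#   bet_size = bet_size_probability(events, events['prob'], num_classes=2)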
--------------------------------------------------------------------------------
/mlfinlab/clustering/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Implements clustering module methods.
3 | """
4 |
5 | from mlfinlab.clustering.onc import get_onc_clusters
6 | from mlfinlab.clustering.feature_clusters import get_feature_clusters
7 | from mlfinlab.clustering.hierarchical_clustering import optimal_hierarchical_cluster
8 |
--------------------------------------------------------------------------------
/mlfinlab/clustering/feature_clusters.py:
--------------------------------------------------------------------------------
1 | """
2 | This module creates clustered subsets of features described in the paper Clustered Feature Importance (Presentation
3 | Slides) by Dr. Marcos Lopez de Prado (https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3517595), and is also explained
4 | in the book Machine Learning for Asset Managers, Snippet 6.5.2, page 84.
5 | """
6 |
7 | # Imports
8 | import numpy as np
9 | import pandas as pd
10 | import statsmodels.api as sm
11 | from scipy.spatial.distance import squareform
12 | from scipy.cluster.hierarchy import linkage, fcluster
13 | from statsmodels.regression.linear_model import OLS
14 |
15 | from mlfinlab.clustering.onc import get_onc_clusters
16 | from mlfinlab.codependence.codependence_matrix import get_dependence_matrix, get_distance_matrix
17 |
18 |
19 | # pylint: disable=invalid-name
20 | def get_feature_clusters(X: pd.DataFrame, dependence_metric: str, distance_metric: str = None,
21 | linkage_method: str = None, n_clusters: int = None, critical_threshold: float = 0.0) -> list:
22 | """
23 | Machine Learning for Asset Managers
24 | Snippet 6.5.2.1 , page 85. Step 1: Features Clustering
25 |
26 | Gets clustered features subsets from the given set of features.
27 |
28 | :param X: (pd.DataFrame) Dataframe of features.
29 | :param dependence_metric: (str) Method to be used for generating the dependence matrix, either 'linear' or
30 | 'information_variation' or 'mutual_information' or 'distance_correlation'.
31 | :param distance_metric: (str) The distance operator to be used for generating the distance matrix. The methods that
32 | can be applied are: 'angular', 'squared_angular', 'absolute_angular'. Set it to None if the
33 | features are to be generated as-is by the ONC algorithm.
34 | :param linkage_method: (str) Method of linkage to be used for clustering. Methods include: 'single', 'ward',
35 | 'complete', 'average', 'weighted', and 'centroid'. Set it to None if the features are to
36 | be generated as-is by the ONC algorithm.
37 | :param n_clusters: (int) Number of clusters to form. Must be less than the total number of features. If None, the
38 | optimal number of clusters decided by the ONC algorithm is used.
39 | :param critical_threshold: (float) Threshold for determining a low silhouette score in the dataset. It can be any real
40 | number in [-1, +1]. The default is 0, which means any feature with a silhouette score below 0
41 | will be identified as having a low silhouette, and the required transformation will be
42 | applied to correct it.
43 | :return: (list) Feature subsets.
44 | """
45 |
46 | pass
47 |
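# A rough sketch of the clustering step described above, assuming the 'linear'
# dependence metric and an 'angular' distance (the shipped implementation is
# stripped from this public repo):
def _feature_clusters_sketch(X: pd.DataFrame, n_clusters: int) -> list:
    corr = X.corr()  # linear dependence matrix
    dist = ((1 - corr) / 2.) ** 0.5  # angular distance transform
    link = linkage(squareform(dist.values, checks=False), method='ward')
    labels = fcluster(link, t=n_clusters, criterion='maxclust')  # cut the tree
    return [X.columns[labels == cluster].tolist() for cluster in range(1, n_clusters + 1)]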
48 |
49 | def _cluster_transformation(X: pd.DataFrame, clusters: dict, feats_to_transform: list) -> pd.DataFrame:
50 | """
51 | Machine Learning for Asset Managers
52 | Snippet 6.5.2.1 , page 85. Step 1: Features Clustering (last paragraph)
53 |
54 | Transforms a dataset to reduce the multicollinearity of the system by replacing the original feature with
55 | the residual from regression.
56 |
57 | :param X: (pd.DataFrame) Dataframe of features.
58 | :param clusters: (dict) Clusters generated by ONC algorithm.
59 | :param feats_to_transform: (list) Features that have low silhouette score and to be transformed.
60 | :return: (pd.DataFrame) Transformed features.
61 | """
62 |
63 | pass
64 |
65 |
66 | def _combine_features(X, clusters, exclude_key) -> np.array:
67 | """
68 | Combines features of each cluster linearly by following a minimum variance weighting scheme.
69 | The Minimum Variance weights are calculated without constraints, other than the weights sum to one.
70 |
71 | :param X: (pd.DataFrame) Dataframe of features.
72 | :param clusters: (dict) Clusters generated by ONC algorithm.
73 | :param exclude_key: (int) Key of the cluster which is to be excluded.
74 | :return: (np.array) Combined features for each cluster.
75 | """
76 |
77 | pass
78 |
79 |
80 | def _check_for_low_silhouette_scores(X: pd.DataFrame, dep_matrix: pd.DataFrame,
81 | critical_threshold: float = 0.0) -> pd.DataFrame:
82 | """
83 | Machine Learning for Asset Managers
84 | Snippet 6.5.2.1 , page 85. Step 1: Features Clustering (last paragraph)
85 |
86 | Checks whether the dataset contains features with a low silhouette due to one feature being a combination of
87 | multiple features across clusters. This is a problem because ONC cannot assign one feature to multiple
88 | clusters, so such a feature needs a transformation.
89 |
90 | :param X: (pd.DataFrame) Dataframe of features.
91 | :param dep_matrix: (pd.DataFrame) Dataframe with dependencies between features.
92 | :param critical_threshold: (float) Threshold for determining low silhouette score.
93 | :return: (pd.DataFrame) Dataframe of features.
94 | """
95 |
96 | pass
97 |
--------------------------------------------------------------------------------
/mlfinlab/clustering/hierarchical_clustering.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of hierarchical clustering algorithms.
3 | """
4 | import numpy as np
5 | import pandas as pd
6 | from scipy.cluster import hierarchy
7 |
8 |
9 | def optimal_hierarchical_cluster(mat: np.array, method: str = "ward") -> np.array:
10 | """
11 | Calculates the optimal clustering of a matrix.
12 |
13 | It calculates the hierarchy clusters from the distance of the matrix. Then it calculates
14 | the optimal leaf ordering of the hierarchy clusters, and returns the optimally clustered matrix.
15 |
16 | It is reproduced with modifications from the following blog post:
17 | `Marti, G. (2020) TF 2.0 DCGAN for 100x100 financial correlation matrices [Online].
18 | Available at: https://marti.ai/ml/2019/10/13/tf-dcgan-financial-correlation-matrices.html.
19 | (Accessed: 17 Aug 2020)
20 | `_
21 |
22 | This method relies on, and acts as a wrapper for, the `scipy.cluster.hierarchy` module.
24 |
25 | :param mat: (np.array/pd.DataFrame) Correlation matrix.
26 | :param method: (str) Method to calculate the hierarchy clusters. Can take the values
27 | ["single", "complete", "average", "weighted", "centroid", "median", "ward"].
28 | :return: (np.array) Optimal hierarchy cluster matrix.
29 | """
30 |
31 | pass
32 |
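# A minimal sketch of the procedure the docstring above describes (the shipped
# implementation is stripped from this public repo): distance transform,
# linkage, optimal leaf ordering, then reorder the matrix.
from scipy.spatial.distance import squareform  # added for the sketch below

def _optimal_cluster_sketch(mat: np.array, method: str = "ward") -> np.array:
    mat = np.asarray(mat)
    dist = np.sqrt(0.5 * (1 - mat))  # correlation -> angular-style distance
    condensed = squareform(dist, checks=False)  # condensed distance vector
    link = hierarchy.linkage(condensed, method=method)
    link = hierarchy.optimal_leaf_ordering(link, condensed)
    order = hierarchy.leaves_list(link)  # optimally ordered leaf indices
    return mat[np.ix_(order, order)]  # rows and columns reordered together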
--------------------------------------------------------------------------------
/mlfinlab/clustering/onc.py:
--------------------------------------------------------------------------------
1 | """
2 | Optimal Number of Clusters (ONC Algorithm)
3 | Detection of False Investment Strategies using Unsupervised Learning Methods
4 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3167017
5 | """
6 |
7 | from typing import Union
8 |
9 | import numpy as np
10 | import pandas as pd
11 |
12 | from sklearn.cluster import KMeans
13 | from sklearn.metrics import silhouette_samples
14 |
15 |
16 | def _improve_clusters(corr_mat: pd.DataFrame, clusters: dict, top_clusters: dict) -> Union[
17 | pd.DataFrame, dict, pd.Series]:
18 | """
19 | Improves the number of clusters using silhouette scores.
20 |
21 | :param corr_mat: (pd.DataFrame) Correlation matrix
22 | :param clusters: (dict) Clusters elements
23 | :param top_clusters: (dict) Improved clusters elements
24 | :return: (tuple) [ordered correlation matrix, clusters, silh scores]
25 | """
26 |
27 | pass
28 |
29 |
30 | def _cluster_kmeans_base(corr_mat: pd.DataFrame, max_num_clusters: int = 10, repeat: int = 10) -> Union[
31 | pd.DataFrame, dict, pd.Series]:
32 | """
33 | Initial clustering step using KMeans.
34 |
35 | :param corr_mat: (pd.DataFrame) Correlation matrix
36 | :param max_num_clusters: (int) Maximum number of clusters to search for.
37 | :param repeat: (int) Number of clustering algorithm repetitions.
38 | :return: (tuple) [ordered correlation matrix, clusters, silh scores]
39 | """
40 |
41 | pass
42 |
43 |
44 | def _check_improve_clusters(new_tstat_mean: float, mean_redo_tstat: float, old_cluster: tuple,
45 | new_cluster: tuple) -> tuple:
46 | """
47 | Checks cluster improvement condition based on t-statistic.
48 |
49 | :param new_tstat_mean: (float) Mean t-statistic of the new clusters
50 | :param mean_redo_tstat: (float) Average t-statistic for cluster improvement
51 | :param old_cluster: (tuple) Old cluster correlation matrix, optimized clusters, silh scores
52 | :param new_cluster: (tuple) New cluster correlation matrix, optimized clusters, silh scores
53 | :return: (tuple) Cluster
54 | """
55 |
56 | pass
57 |
58 |
59 | def cluster_kmeans_top(corr_mat: pd.DataFrame, repeat: int = 10) -> Union[pd.DataFrame, dict, pd.Series, bool]:
60 | """
61 | Improve the initial clustering by leaving clusters with high scores unchanged and modifying clusters with
62 | below average scores.
63 |
64 | :param corr_mat: (pd.DataFrame) Correlation matrix
65 | :param repeat: (int) Number of clustering algorithm repetitions.
66 | :return: (tuple) [correlation matrix, optimized clusters, silh scores, boolean to rerun ONC]
67 | """
68 |
69 | pass
70 |
71 |
72 | def get_onc_clusters(corr_mat: pd.DataFrame, repeat: int = 10) -> Union[pd.DataFrame, dict, pd.Series]:
73 | """
74 | Optimal Number of Clusters (ONC) algorithm described in the following paper:
75 | `Marcos Lopez de Prado, Michael J. Lewis, Detection of False Investment Strategies Using Unsupervised
76 | Learning Methods, 2015 `_;
77 | The code is based on the code provided by the authors of the paper.
78 |
79 | The algorithm searches for the optimal number of clusters using the correlation matrix of elements as an input.
80 |
81 | The correlation matrix is transformed to a matrix of distances, and the K-Means algorithm is applied multiple times
82 | with different numbers of clusters. The results are evaluated on the t-statistics of the silhouette scores.
83 |
84 | The output of the algorithm is the reordered correlation matrix (clustered elements are placed close to each other),
85 | optimal clustering, and silhouette scores.
86 |
87 | :param corr_mat: (pd.DataFrame) Correlation matrix of features
88 | :param repeat: (int) Number of clustering algorithm repetitions
89 | :return: (tuple) [correlation matrix, optimized clusters, silh scores]
90 | """
91 |
92 | pass
93 |
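# An illustrative sketch of the base clustering search described above (not
# the authors' full ONC recursion, whose implementation is stripped from this
# public repo): run K-Means over a range of cluster counts on the distance
# transform of the correlation matrix and keep the run whose silhouette
# scores have the highest t-statistic.
def _onc_base_sketch(corr_mat: pd.DataFrame, max_num_clusters: int = 10, repeat: int = 10) -> tuple:
    dist = ((1 - corr_mat.fillna(0)) / 2.) ** 0.5  # correlation -> distance
    best_t, best_labels, best_silh = -np.inf, None, None
    for _ in range(repeat):  # several random K-Means initialisations
        for num_clusters in range(2, max_num_clusters + 1):
            kmeans = KMeans(n_clusters=num_clusters, n_init=10).fit(dist)
            silh = silhouette_samples(dist, kmeans.labels_)
            t_stat = silh.mean() / silh.std()  # clustering quality measure
            if t_stat > best_t:
                best_t, best_labels, best_silh = t_stat, kmeans.labels_, silh
    clusters = {i: dist.columns[best_labels == i].tolist() for i in np.unique(best_labels)}
    new_order = [col for members in clusters.values() for col in members]
    return corr_mat.loc[new_order, new_order], clusters, pd.Series(best_silh, index=dist.index)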
--------------------------------------------------------------------------------
/mlfinlab/codependence/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Various codependence measures: mutual info, distance correlations, variation of information.
3 | """
4 |
5 | from mlfinlab.codependence.correlation import (angular_distance, absolute_angular_distance, squared_angular_distance,
6 | distance_correlation, kullback_leibler_distance, norm_distance)
7 | from mlfinlab.codependence.information import (get_mutual_info, get_optimal_number_of_bins, variation_of_information_score)
8 | from mlfinlab.codependence.codependence_matrix import (get_dependence_matrix, get_distance_matrix)
9 | from mlfinlab.codependence.gnpr_distance import (spearmans_rho, gpr_distance, gnpr_distance)
10 | from mlfinlab.codependence.optimal_transport import (optimal_transport_dependence)
11 |
--------------------------------------------------------------------------------
/mlfinlab/codependence/codependence_matrix.py:
--------------------------------------------------------------------------------
1 | """
2 | This implementation lets the user generate dependence and distance matrices based on the various methods of information
3 | codependence described in the Cornell lecture notes on Codependence:
4 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes
5 | """
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 | from mlfinlab.codependence.information import variation_of_information_score, get_mutual_info
11 | from mlfinlab.codependence.correlation import distance_correlation
12 | from mlfinlab.codependence.gnpr_distance import spearmans_rho, gpr_distance, gnpr_distance
13 | from mlfinlab.codependence.optimal_transport import optimal_transport_dependence
14 |
15 |
16 | # pylint: disable=invalid-name
17 |
18 | def get_dependence_matrix(df: pd.DataFrame, dependence_method: str, theta: float = 0.5,
19 | n_bins: int = None, normalize: bool = True,
20 | estimator: str = 'standard', target_dependence: str = 'comonotonicity',
21 | gaussian_corr: float = 0.7, var_threshold: float = 0.2) -> pd.DataFrame:
22 | """
23 | This function returns a dependence matrix for elements given in the dataframe using the chosen dependence method.
24 |
25 | List of supported algorithms to use for generating the dependence matrix: ``information_variation``,
26 | ``mutual_information``, ``distance_correlation``, ``spearmans_rho``, ``gpr_distance``, ``gnpr_distance``,
27 | ``optimal_transport``.
28 |
29 | :param df: (pd.DataFrame) Features.
30 | :param dependence_method: (str) Algorithm to be used for generating the dependence matrix.
31 | :param theta: (float) Type of information being tested in the GPR and GNPR distances. Falls in range [0, 1].
32 | (0.5 by default)
33 | :param n_bins: (int) Number of bins for discretization in ``information_variation`` and ``mutual_information``,
34 | if None the optimal number will be calculated. (None by default)
35 | :param normalize: (bool) Flag used to normalize the result to [0, 1] in ``information_variation`` and
36 | ``mutual_information``. (True by default)
37 | :param estimator: (str) Estimator to be used for calculation in ``mutual_information``.
38 | [``standard``, ``standard_copula``, ``copula_entropy``] (``standard`` by default)
39 | :param target_dependence: (str) Type of target dependence to use in ``optimal_transport``.
40 | [``comonotonicity``, ``countermonotonicity``, ``gaussian``,
41 | ``positive_negative``, ``different_variations``, ``small_variations``]
42 | (``comonotonicity`` by default)
43 | :param gaussian_corr: (float) Correlation coefficient to use when creating ``gaussian`` and
44 | ``small_variations`` copulas. [from 0 to 1] (0.7 by default)
45 | :param var_threshold: (float) Variation threshold to use in ``small_variations``.
46 | Sets the relative area of correlation in a copula. [from 0 to 1] (0.2 by default)
47 | :return: (pd.DataFrame) Dependence matrix.
48 | """
49 |
50 | pass
51 |
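# A hedged sketch of the pairwise construction the docstring above describes,
# shown for the 'distance_correlation' method (note that in this public repo
# the underlying codependence functions are themselves stripped stubs):
def _dependence_matrix_sketch(df: pd.DataFrame) -> pd.DataFrame:
    features = df.columns
    dep_matrix = pd.DataFrame(index=features, columns=features, dtype=float)
    for feature_a in features:
        for feature_b in features:  # fill each pairwise dependence entry
            dep_matrix.loc[feature_a, feature_b] = distance_correlation(df[feature_a], df[feature_b])
    return dep_matrix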
52 |
53 | def get_distance_matrix(X: pd.DataFrame, distance_metric: str = 'angular') -> pd.DataFrame:
54 | """
55 | Applies distance operator to a dependence matrix.
56 |
57 | This allows turning a correlation matrix into a distance matrix. The distances used are true metrics.
58 |
59 | List of supported distance metrics to use for generating the distance matrix: ``angular``, ``squared_angular``,
60 | and ``absolute_angular``.
61 |
62 | :param X: (pd.DataFrame) Dataframe to which the distance operator is applied.
63 | :param distance_metric: (str) The distance metric to be used for generating the distance matrix.
64 | :return: (pd.DataFrame) Distance matrix.
65 | """
66 |
67 | pass
68 |
--------------------------------------------------------------------------------
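A quick illustration of the pairwise construction the docstrings above describe: the sketch below assembles a dependence matrix by applying a symmetric pairwise measure to every column pair. It is a minimal sketch, not mlfinlab's implementation; `_pairwise_matrix` and the example lambda are hypothetical names.

```python
# A minimal sketch of assembling a dependence matrix from a pairwise measure.
import numpy as np
import pandas as pd


def _pairwise_matrix(df: pd.DataFrame, pair_func) -> pd.DataFrame:
    """Apply a symmetric pairwise dependence function to all column pairs."""
    cols = df.columns
    mat = pd.DataFrame(np.ones((len(cols), len(cols))), index=cols, columns=cols)
    for i, col_a in enumerate(cols):
        for col_b in cols[i + 1:]:
            value = pair_func(df[col_a].values, df[col_b].values)
            mat.loc[col_a, col_b] = value  # The measures here are symmetric,
            mat.loc[col_b, col_a] = value  # so fill both triangles at once
    return mat


# Example usage with absolute Pearson correlation as the pairwise measure:
# dep = _pairwise_matrix(returns_df, lambda a, b: abs(np.corrcoef(a, b)[0, 1]))
```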
/mlfinlab/codependence/correlation.py:
--------------------------------------------------------------------------------
1 | """
2 | Correlation based distances and various modifications (angular, absolute, squared) described in Cornell lecture notes:
3 | Codependence: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes
4 | """
5 |
6 | import numpy as np
7 | import pandas as pd
8 | from scipy.spatial.distance import squareform, pdist
9 |
10 |
11 | # pylint: disable=invalid-name
12 |
13 |
14 | def angular_distance(x: np.array, y: np.array) -> float:
15 | """
16 | Returns angular distance between two vectors. Angular distance is a slight modification of Pearson correlation which
17 | satisfies metric conditions.
18 |
19 | Formula used for calculation:
20 |
21 | Ang_Distance = (1/2 * (1 - Corr))^(1/2)
22 |
23 | Read Cornell lecture notes for more information about angular distance:
24 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
25 |
26 | :param x: (np.array/pd.Series) X vector.
27 | :param y: (np.array/pd.Series) Y vector.
28 | :return: (float) Angular distance.
29 | """
30 |
31 | pass
32 |
33 |
34 | def absolute_angular_distance(x: np.array, y: np.array) -> float:
35 | """
36 | Returns absolute angular distance between two vectors. It is a modification of angular distance where the absolute
37 | value of the Pearson correlation coefficient is used.
38 |
39 | Formula used for calculation:
40 |
41 | Abs_Ang_Distance = (1/2 * (1 - abs(Corr)))^(1/2)
42 |
43 | Read Cornell lecture notes for more information about absolute angular distance:
44 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
45 |
46 | :param x: (np.array/pd.Series) X vector.
47 | :param y: (np.array/pd.Series) Y vector.
48 | :return: (float) Absolute angular distance.
49 | """
50 |
51 | pass
52 |
53 |
54 | def squared_angular_distance(x: np.array, y: np.array) -> float:
55 | """
56 | Returns squared angular distance between two vectors. It is a modification of angular distance where the square of
57 | Pearson correlation coefficient is used.
58 |
59 | Formula used for calculation:
60 |
61 | Squared_Ang_Distance = (1/2 * (1 - (Corr)^2))^(1/2)
62 |
63 | Read Cornell lecture notes for more information about squared angular distance:
64 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
65 |
66 | :param x: (np.array/pd.Series) X vector.
67 | :param y: (np.array/pd.Series) Y vector.
68 | :return: (float) Squared angular distance.
69 | """
70 |
71 | pass
72 |
73 |
74 | def distance_correlation(x: np.array, y: np.array) -> float:
75 | """
76 | Returns distance correlation between two vectors. Distance correlation captures both linear and non-linear
77 | dependencies.
78 |
79 | Formula used for calculation:
80 |
81 | Distance_Corr[X, Y] = dCov[X, Y] / (dCov[X, X] * dCov[Y, Y])^(1/2)
82 |
83 | dCov[X, Y] is the average Hadamard product of the doubly-centered Euclidean distance matrices of X, Y.
84 |
85 | Read Cornell lecture notes for more information about distance correlation:
86 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
87 |
88 | :param x: (np.array/pd.Series) X vector.
89 | :param y: (np.array/pd.Series) Y vector.
90 | :return: (float) Distance correlation coefficient.
91 | """
92 |
93 | pass
94 |
95 | def kullback_leibler_distance(corr_a, corr_b):
96 | """
97 | Returns the Kullback-Leibler distance between two correlation matrices; all elements must be positive.
98 | Formula used for calculation:
99 | kullback_leibler_distance[X, Y] = 0.5 * ( Log( det(Y) / det(X) ) + tr((Y ^ -1).X) - n )
100 | Where n is the dimension of the space spanned by X.
101 | Read Don H. Johnson's research paper for more information on the Kullback-Leibler distance.
102 |
103 |
104 | :param corr_a: (np.array/pd.Series/pd.DataFrame) Numpy array of the first correlation matrix.
105 | :param corr_b: (np.array/pd.Series/pd.DataFrame) Numpy array of the second correlation matrix.
106 | :return: (np.float64) The Kullback-Leibler distance between the two matrices.
107 | """
108 |
109 | pass
110 |
111 |
112 | def norm_distance(matrix_a, matrix_b, r_val=2):
113 | """
114 | Returns the normalized distance between two matrices.
115 | This function is a wrapper for numpy's linear algebra method (numpy.linalg.norm).
116 | Link to documentation: https://numpy.org/doc/stable/reference/generated/numpy.linalg.norm.html
117 | Formula used to normalize the matrix:
118 | norm_distance[X, Y] = sum( abs(X - Y) ^ r ) ^ (1/r)
119 | Where r is a parameter. r=1 City block (L1 norm), r=2 Euclidean distance (L2 norm),
120 | r=inf Supremum (L_inf norm). For values of r < 1, the result is not really a mathematical 'norm'.
121 |
122 | :param matrix_a: (np.array/pd.Series/pd.DataFrame) Array of the first matrix.
123 | :param matrix_b: (np.array/pd.Series/pd.DataFrame) Array of the second matrix.
124 | :param r_val: (int/str) The r value of the normalization formula. (``2`` by default, any integer)
125 | :return: (np.float64) The normalized distance between the two matrices.
126 | """
127 |
128 | pass
--------------------------------------------------------------------------------
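The three angular-distance formulas documented above translate directly into code. Below is a minimal sketch (not the library's implementation), assuming Pearson correlation via `np.corrcoef`. Note that the absolute and squared variants deliberately treat strongly negatively correlated vectors as similar (distance near 0).

```python
# Sketches of the angular distance variants, following the docstring formulas.
import numpy as np


def _angular(x: np.ndarray, y: np.ndarray) -> float:
    corr = np.corrcoef(x, y)[0, 1]
    return np.sqrt(0.5 * (1 - corr))  # (1/2 * (1 - Corr))^(1/2)


def _absolute_angular(x: np.ndarray, y: np.ndarray) -> float:
    corr = np.corrcoef(x, y)[0, 1]
    return np.sqrt(0.5 * (1 - abs(corr)))  # (1/2 * (1 - |Corr|))^(1/2)


def _squared_angular(x: np.ndarray, y: np.ndarray) -> float:
    corr = np.corrcoef(x, y)[0, 1]
    return np.sqrt(0.5 * (1 - corr ** 2))  # (1/2 * (1 - Corr^2))^(1/2)
```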
/mlfinlab/codependence/gnpr_distance.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of distance using the Generic Non-Parametric Representation approach from "Some contributions to the
3 | clustering of financial time series and applications to credit default swaps" by Gautier Marti
4 | https://www.researchgate.net/publication/322714557
5 | """
6 | import numpy as np
7 | import pandas as pd
8 | from scipy.stats import spearmanr
9 | import ot
10 |
11 | # pylint: disable=invalid-name
12 |
13 |
14 | def spearmans_rho(x: np.array, y: np.array) -> float:
15 | """
16 | Calculates a statistical estimate of Spearman's rho - a copula-based dependence measure.
17 |
18 | Formula for calculation:
19 | rho = 1 - (6)/(T*(T^2-1)) * Sum((X_t-Y_t)^2)
20 |
21 | It is more robust to noise and can be defined if the variables have an infinite second moment.
22 | This statistic is described in more detail in the work by Gautier Marti
23 | https://www.researchgate.net/publication/322714557 (p.54)
24 |
25 | This method is a wrapper for the scipy spearmanr function. For more details about the function and its parameters,
26 | please visit scipy documentation
27 | https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.spearmanr.html
28 |
29 | :param x: (np.array/pd.Series) X vector
30 | :param y: (np.array/pd.Series) Y vector (same number of observations as X)
31 | :return: (float) Spearman's rho statistical estimate
32 | """
33 |
34 | # Coefficient calculation
35 |
36 | pass
37 |
38 |
39 | def gpr_distance(x: np.array, y: np.array, theta: float) -> float:
40 | """
41 | Calculates the distance between two Gaussians under the Generic Parametric Representation (GPR) approach.
42 |
43 | According to the original work https://www.researchgate.net/publication/322714557 (p.70):
44 | "This is a fast and good proxy for distance d_theta when the first two moments ... predominate". But it's not
45 | a good metric for heavy-tailed distributions.
46 |
47 | Parameter theta defines what type of information dependency is being tested:
48 | - for theta = 0 the distribution information is tested
49 | - for theta = 1 the dependence information is tested
50 | - for theta = 0.5 a mix of both information types is tested
51 |
52 | With theta in [0, 1] the distance lies in range [0, 1] and is a metric. (See original work for proof, p.71)
53 |
54 | :param x: (np.array/pd.Series) X vector.
55 | :param y: (np.array/pd.Series) Y vector (same number of observations as X).
56 | :param theta: (float) Type of information being tested. Falls in range [0, 1].
57 | :return: (float) Distance under GPR approach.
58 | """
59 |
60 | pass
61 |
62 |
63 | def gnpr_distance(x: np.array, y: np.array, theta: float, n_bins: int = 50) -> float:
64 | """
65 | Calculates the empirical distance between two random variables under the Generic Non-Parametric Representation
66 | (GNPR) approach.
67 |
68 | Formula for the distance is taken from https://www.researchgate.net/publication/322714557 (p.72).
69 |
70 | Parameter theta defines what type of information dependency is being tested:
71 | - for theta = 0 the distribution information is tested
72 | - for theta = 1 the dependence information is tested
73 | - for theta = 0.5 a mix of both information types is tested
74 |
75 | With theta in [0, 1] the distance lies in the range [0, 1] and is a metric.
76 | (See original work for proof, p.71)
77 |
78 | This method is modified as it uses 1D Optimal Transport Distance to measure
79 | distribution distance. This solves the issue of defining support and choosing
80 | a number of bins. The number of bins can be given as an input to speed up calculations.
81 | Big numbers of bins can take a long time to calculate.
82 |
83 | :param x: (np.array/pd.Series) X vector.
84 | :param y: (np.array/pd.Series) Y vector (same number of observations as X).
85 | :param theta: (float) Type of information being tested. Falls in range [0, 1].
86 | :param n_bins: (int) Number of bins to use to split the X and Y vector observations.
87 | (50 by default)
88 | :return: (float) Distance under GNPR approach.
89 | """
90 |
91 | pass
92 |
--------------------------------------------------------------------------------
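Since `spearmans_rho` is documented as a wrapper around scipy's `spearmanr`, a minimal usage sketch (with hypothetical data) looks like this:

```python
# Spearman's rho via scipy, as the docstring above describes.
import numpy as np
from scipy.stats import spearmanr

x = np.random.normal(size=500)
y = 0.7 * x + np.random.normal(size=500)  # Noisy monotone relationship
rho, p_value = spearmanr(x, y)  # spearmanr returns (correlation, p-value)
print(round(rho, 3))
```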
/mlfinlab/codependence/information.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementations of mutual information (I) and variation of information (VI) codependence measures from Cornell
3 | lecture slides: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes
4 | """
5 | import numpy as np
6 | import scipy.stats as ss
7 | from sklearn.metrics import mutual_info_score
8 |
9 |
10 | # pylint: disable=invalid-name
11 |
12 | def get_optimal_number_of_bins(num_obs: int, corr_coef: float = None) -> int:
13 | """
14 | Calculates optimal number of bins for discretization based on number of observations
15 | and correlation coefficient (univariate case).
16 |
17 | Algorithms used in this function were originally proposed in the works of Hacine-Gharbi et al. (2012)
18 | and Hacine-Gharbi and Ravier (2018). They are described in the Cornell lecture notes:
19 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes (p.26)
20 |
21 | :param num_obs: (int) Number of observations.
22 | :param corr_coef: (float) Correlation coefficient, used to estimate the number of bins for univariate case.
23 | :return: (int) Optimal number of bins.
24 | """
25 |
26 | pass
27 |
28 |
29 | def get_mutual_info(x: np.array, y: np.array, n_bins: int = None, normalize: bool = False,
30 | estimator: str = 'standard') -> float:
31 | """
32 | Returns mutual information (MI) between two vectors.
33 |
34 | This function uses the discretization with the optimal bins algorithm proposed in the works of
35 | Hacine-Gharbi et al. (2012) and Hacine-Gharbi and Ravier (2018).
36 |
37 | Read Cornell lecture notes for more information about the mutual information:
38 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
39 |
40 | This function supports multiple ways the mutual information can be estimated:
41 |
42 | 1. ``standard`` - the standard way of estimation - binning observations according to a given
43 | number of bins and applying the MI formula.
44 | 2. ``standard_copula`` - estimating the copula (as a normalized ranking of the observations) and
45 | applying the standard mutual information estimator on it.
46 | 3. ``copula_entropy`` - estimating the copula (as a normalized ranking of the observations) and
47 | calculating its entropy. Then MI estimator = (-1) * copula entropy.
48 |
49 | The last two estimators' implementation is taken from the blog post by Dr. Gautier Marti.
50 | Read this blog post for more information about the differences in the estimators:
51 | https://gmarti.gitlab.io/qfin/2020/07/01/mutual-information-is-copula-entropy.html
52 |
53 | :param x: (np.array) X vector.
54 | :param y: (np.array) Y vector.
55 | :param n_bins: (int) Number of bins for discretization, if None the optimal number will be calculated.
56 | (None by default)
57 | :param normalize: (bool) Flag used to normalize the result to [0, 1]. (False by default)
58 | :param estimator: (str) Estimator to be used for calculation. [``standard``, ``standard_copula``, ``copula_entropy``]
59 | (``standard`` by default)
60 | :return: (float) Mutual information score.
61 | """
62 |
63 | pass
64 |
65 |
66 | def variation_of_information_score(x: np.array, y: np.array, n_bins: int = None, normalize: bool = False) -> float:
67 | """
68 | Returns variation of information (VI) between two vectors.
69 |
70 | This function uses discretization with the optimal bins algorithm proposed in the works of
71 | Hacine-Gharbi et al. (2012) and Hacine-Gharbi and Ravier (2018).
72 |
73 | Read Cornell lecture notes for more information about the variation of information:
74 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
75 |
76 | :param x: (np.array) X vector.
77 | :param y: (np.array) Y vector.
78 | :param n_bins: (int) Number of bins for discretization, if None the optimal number will be calculated.
79 | (None by default)
80 | :param normalize: (bool) True to normalize the result to [0, 1]. (False by default)
81 | :return: (float) Variation of information score.
82 | """
83 |
84 | pass
85 |
--------------------------------------------------------------------------------
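The bin-count formulas referenced above are short enough to sketch. Assuming the Hacine-Gharbi et al. (2012) formula for the marginal case and the Hacine-Gharbi and Ravier (2018) formula for the bivariate case (as given in the Cornell notes), a rough version of the optimal-bin logic and a standard binned MI estimate could look like this; it is a sketch with hypothetical helper names, not the library code:

```python
# Optimal number of histogram bins, then a binned mutual information estimate.
import numpy as np
from sklearn.metrics import mutual_info_score


def _optimal_bins(num_obs: int, corr_coef: float = None) -> int:
    if corr_coef is None:  # Univariate (marginal) case, Hacine-Gharbi et al. (2012)
        z = (8 + 324 * num_obs + 12 * (36 * num_obs + 729 * num_obs ** 2) ** 0.5) ** (1 / 3)
        bins = round(z / 6 + 2 / (3 * z) + 1 / 3)
    else:  # Bivariate case, Hacine-Gharbi and Ravier (2018)
        bins = round(2 ** -0.5 * (1 + (1 + 24 * num_obs / (1 - corr_coef ** 2)) ** 0.5) ** 0.5)
    return int(bins)


def _mutual_info(x: np.ndarray, y: np.ndarray) -> float:
    n_bins = _optimal_bins(x.shape[0], corr_coef=np.corrcoef(x, y)[0, 1])
    contingency = np.histogram2d(x, y, bins=n_bins)[0]  # Joint histogram counts
    return mutual_info_score(None, None, contingency=contingency)
```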
/mlfinlab/codependence/optimal_transport.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementations of Optimal Copula Transport dependence measure proposed by Marti et al. : https://arxiv.org/abs/1610.09659
3 | And implemented in the blog post by Marti: https://gmarti.gitlab.io/qfin/2020/06/25/copula-optimal-transport-dependence.html
4 | """
5 | import numpy as np
6 | import scipy.stats as ss
7 | import ot
8 |
9 |
10 | # pylint: disable=invalid-name
11 |
12 | def _get_empirical_copula(x: np.array, y: np.array) -> np.array:
13 | """
14 | Calculate empirical copula using ranked observations.
15 |
16 | :param x: (np.array) X vector.
17 | :param y: (np.array) Y vector.
18 | :return: (np.array) Empirical copula.
19 | """
20 |
21 | pass
22 |
23 |
24 | def optimal_transport_dependence(x: np.array, y: np.array, target_dependence: str = 'comonotonicity',
25 | gaussian_corr: float = 0.7, var_threshold: float = 0.2) -> float:
26 | """
27 | Calculates optimal copula transport dependence between the empirical copula of the two vectors and a target copula.
28 |
29 | This implementation is based on the blog post by Marti:
30 | https://gmarti.gitlab.io/qfin/2020/06/25/copula-optimal-transport-dependence.html
31 |
32 | The target and forget copulas are used as reference points to determine where the empirical
33 | copula stands between them in the space of copulas. The forget copula used is the copula associated with
34 | independent random variables. The target copula is defined by the target_dependence parameter.
35 |
36 | Currently, these target_dependence copulas are supported:
37 |
38 | - ``comonotonicity`` - a comonotone copula.
39 | - ``countermonotonicity`` - a countermonotone copula.
40 | - ``gaussian`` - a Gaussian copula with a custom correlation coefficient.
41 | - ``positive_negative`` - a copula of both positive and negative correlation.
42 | - ``different_variations`` - a copula with some elements having extreme variations,
43 | while those of others are relatively small, and conversely.
44 | - ``small_variations`` - a copula with elements being positively correlated for small variations
45 | but uncorrelated otherwise.
46 | - ``v-shape`` - a copula that is seen with a vol index vs. a returns index: when returns of the index
47 | are extreme, vol is usually high; when returns are small in absolute value, vol is usually low.
48 |
49 | :param x: (np.array) X vector.
50 | :param y: (np.array) Y vector.
51 | :param target_dependence: (str) Type of target dependence to use when measuring distance.
52 | (``comonotonicity`` by default)
53 | :param gaussian_corr: (float) Correlation coefficient to use when creating ``gaussian`` and
54 | ``small_variations`` copulas. [from 0 to 1] (0.7 by default)
55 | :param var_threshold: (float) Variation threshold to use in ``small_variations``.
56 | Sets the relative area of correlation in a copula. [from 0 to 1] (0.2 by default)
57 | :return: (float) Optimal copula transport dependence.
58 | """
59 |
60 | pass
61 |
62 |
63 | def _compute_copula_ot_dependence(empirical: np.array, target: np.array, forget: np.array,
64 | n_obs: int) -> float:
65 | """
66 | Calculates optimal copula transport dependence measure.
67 |
68 | :param empirical: (np.array) Empirical copula.
69 | :param target: (np.array) Target copula.
70 | :param forget: (np.array) Forget copula.
71 | :param n_obs: (int) Number of observations.
72 | :return: (float) Optimal copula transport dependence.
73 | """
74 |
75 | pass
76 |
77 |
78 | def _create_target_copula(target_dependence: str, n_obs: int, gauss_corr: float,
79 | var_threshold: float) -> np.array:
80 | """
81 | Creates target copula with given dependence and number of observations.
82 |
83 | :param target_dependence: (str) Type of dependence to use for copula creation.[``comonotonicity``,
84 | ``countermonotonicity``, ``gaussian``, ``positive_negative``,
85 | ``different_variations``, ``small_variations``, ``v-shape``]
86 | :param n_obs: (int) Number of observations to use for copula creation.
87 | :param gauss_corr: (float) Correlation coefficient to use when creating ``gaussian`` and
88 | ``small_variations`` copulas.
89 | :param var_threshold: (float) Variation threshold to use in ``small_variations``.
90 | :return: (np.array) Resulting copula.
91 | """
92 |
93 | pass
94 |
--------------------------------------------------------------------------------
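The empirical copula described in `_get_empirical_copula`'s docstring is a rank transform. A minimal sketch, consistent with the docstring but not necessarily the library's exact code:

```python
# Empirical copula as normalized ranks of the two marginals.
import numpy as np
import scipy.stats as ss


def _empirical_copula(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    u = ss.rankdata(x) / len(x)  # Ranks scaled to (0, 1]
    v = ss.rankdata(y) / len(y)
    return np.column_stack((u, v))  # One copula observation per row
```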
/mlfinlab/cross_validation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions derived from Chapter 7: Cross Validation
3 | and stacked (multi-asset datasets) cross-validation functions.
4 | """
5 |
6 | from mlfinlab.cross_validation.cross_validation import (ml_get_train_times, ml_cross_val_score, stacked_ml_cross_val_score,
7 | PurgedKFold, StackedPurgedKFold)
8 | from mlfinlab.cross_validation.combinatorial import (CombinatorialPurgedKFold, StackedCombinatorialPurgedKFold)
9 |
--------------------------------------------------------------------------------
/mlfinlab/cross_validation/combinatorial.py:
--------------------------------------------------------------------------------
1 | """
2 | Implements the following classes from Chapter 12 of AFML:
3 |
4 | - Combinatorial Purged Cross-Validation class.
5 | - Stacked Combinatorial Purged Cross-Validation class.
6 | """
7 | # pylint: disable=too-many-locals, arguments-differ, invalid-name, unused-argument
8 |
9 | from itertools import combinations
10 | from typing import List
11 |
12 | import pandas as pd
13 | import numpy as np
14 | from scipy.special import comb
15 | from sklearn.model_selection import KFold
16 |
17 | from mlfinlab.cross_validation.cross_validation import ml_get_train_times
18 |
19 |
20 | def _get_number_of_backtest_paths(n_train_splits: int, n_test_splits: int) -> int:
21 | """
22 | Number of combinatorial paths for CPCV(N,k).
23 |
24 | :param n_train_splits: (int) Number of train splits.
25 | :param n_test_splits: (int) Number of test splits.
26 | :return: (int) Number of backtest paths for CPCV(N,k).
27 | """
28 |
29 | pass
30 |
31 |
32 | class CombinatorialPurgedKFold(KFold):
33 | """
34 | Advances in Financial Machine Learning, Chapter 12.
35 |
36 | Implements Combinatorial Purged Cross Validation (CPCV).
37 |
38 | The train is purged of observations overlapping test-label intervals.
39 | Test set is assumed contiguous (shuffle=False), w/o training samples in between.
40 | """
41 |
42 | def __init__(self,
43 | n_splits: int = 3,
44 | n_test_splits: int = 2,
45 | samples_info_sets: pd.Series = None,
46 | pct_embargo: float = 0.):
47 | """
48 | Initialize.
49 |
50 | :param n_splits: (int) The number of splits. Defaults to 3.
51 | :param samples_info_sets: (pd.Series) The information range on which each record is constructed from
52 | *samples_info_sets.index*: Time when the information extraction started.
53 | *samples_info_sets.value*: Time when the information extraction ended.
54 | :param pct_embargo: (float) Percent that determines the embargo size.
55 | """
56 |
57 | pass
58 |
59 | def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List:
60 | """
61 | Using start and end indices of test splits from KFolds and number of test_splits (self.n_test_splits),
62 | generates combinatorial test range splits.
63 |
64 | :param splits_indices: (dict) Test fold integer index: [start test index, end test index].
65 | :return: (list) Combinatorial test splits ([start index, end index]).
66 | """
67 |
68 | pass
69 |
70 | def _fill_backtest_paths(self, train_indices: list, test_splits: list):
71 | """
72 | Using start and end indices of test splits and purged/embargoed train indices from CPCV, find backtest path and
73 | place in the path where these indices should be used.
74 |
75 | :param test_splits: (list) List of lists with first element corresponding to test start index and second - test end.
76 | """
77 |
78 | pass
79 |
80 | def split(self,
81 | X: pd.DataFrame,
82 | y: pd.Series = None,
83 | groups=None) -> tuple:
84 | """
85 | The main method to call for the CombinatorialPurgedKFold class.
86 |
87 | :param X: (pd.DataFrame) Samples dataset that is to be split.
88 | :param y: (pd.Series) Sample labels series.
89 | :param groups: (array-like), with shape (n_samples,), optional
90 | Group labels for the samples used while splitting the dataset into
91 | train/test set.
92 | :return: (tuple) [train list of sample indices, and test list of sample indices].
93 | """
94 |
95 | pass
96 |
97 |
98 | class StackedCombinatorialPurgedKFold(KFold):
99 | """
100 | Advances in Financial Machine Learning, Chapter 12.
101 |
102 | Implements Stacked Combinatorial Purged Cross Validation (CPCV) for multi-asset datasets.
103 |
104 | The train is purged of observations overlapping test-label intervals.
105 | Test set is assumed contiguous (shuffle=False), w/o training samples in between.
106 | """
107 |
108 | def __init__(self,
109 | n_splits: int = 3,
110 | n_test_splits: int = 2,
111 | samples_info_sets_dict: dict = None,
112 | pct_embargo: float = 0.):
113 | """
114 | Initialize.
115 |
116 | :param n_splits: (int) The number of splits. Defaults to 3.
117 | :param samples_info_sets_dict: (dict) Dictionary of samples info sets.
118 | ASSET_1: SAMPLE_INFO_SETS, ASSET_2:...
119 |
120 | *samples_info_sets.index*: Time when the information extraction started.
121 | *samples_info_sets.value*: Time when the information extraction ended.
122 | :param pct_embargo: (float) Percent that determines the embargo size.
123 | """
124 |
125 | pass
126 |
127 | def _fill_backtest_paths(self, asset, train_indices: list, test_splits: list):
128 | """
129 | Using start and end indices of test splits and purged/embargoed train indices from CPCV, find backtest path and
130 | place in the path where these indices should be used.
131 |
132 | :param asset: (str) Asset for which backtest paths are filled.
133 | :param train_indices: (list) Purged/embargoed train indices from CPCV.
134 | :param test_splits: (list) List of lists with first element corresponding to test start index and second - test end.
135 | """
136 |
137 | pass
138 |
139 | def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List:
140 | """
141 | Using start and end indices of test splits from KFolds and number of test_splits (self.n_test_splits),
142 | generates combinatorial test range splits.
143 |
144 | :param splits_indices: (dict) Test fold integer index: [start test index, end test index].
145 | :return: (list) Combinatorial test splits ([start index, end index]).
146 | """
147 |
148 | pass
149 |
150 | def split(self,
151 | X_dict: dict,
152 | y_dict: dict = None,
153 | groups=None) -> tuple:
154 | """
155 | The main method to call for the StackedCombinatorialPurgedKFold class.
156 |
157 | :param X_dict: (dict) Dictionary of asset : X_{asset}.
158 | :param y_dict: (dict) Dictionary of asset : y_{asset}.
159 | :param groups: (array-like), with shape (n_samples,), optional
160 | Group labels for the samples used while splitting the dataset into
161 | train/test set.
162 | :return: (tuple) [train list of sample indices, and test list of sample indices].
163 | """
164 |
165 | pass
166 |
--------------------------------------------------------------------------------
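For `_get_number_of_backtest_paths`, AFML Chapter 12 gives the path count for CPCV with N groups and k test groups as phi(N, k) = (k / N) * C(N, N - k), which simplifies to C(N - 1, k - 1). A minimal sketch with a hypothetical helper name:

```python
# Number of backtest paths for CPCV(N, k), per AFML Chapter 12.
from scipy.special import comb


def _n_backtest_paths(n_splits: int, n_test_splits: int) -> int:
    return int(comb(n_splits, n_splits - n_test_splits) * n_test_splits / n_splits)


# Example: CPCV(6, 2) produces 5 backtest paths.
assert _n_backtest_paths(6, 2) == 5
```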
/mlfinlab/data_generation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Tools for synthetic data generation.
3 | """
4 |
5 | from mlfinlab.data_generation.corrgan import sample_from_corrgan
6 | from mlfinlab.data_generation.data_verification import (plot_pairwise_dist, plot_eigenvalues, plot_eigenvectors,
7 | plot_hierarchical_structure, plot_mst_degree_count, plot_stylized_facts,
8 | plot_time_series_dependencies, plot_optimal_hierarchical_cluster)
9 | from mlfinlab.data_generation.vines import (sample_from_cvine, sample_from_dvine, sample_from_ext_onion)
10 | from mlfinlab.data_generation.correlated_random_walks import generate_cluster_time_series
11 | from mlfinlab.data_generation.hcbm import (time_series_from_dist, generate_hcmb_mat)
12 | from mlfinlab.data_generation.bootstrap import (row_bootstrap, pair_bootstrap, block_bootstrap)
13 |
--------------------------------------------------------------------------------
/mlfinlab/data_generation/bootstrap.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of generating bootstrapped matrices from
3 | "Bootstrap validation of links of a minimum spanning tree" by F. Musciotto,
4 | L. Marotta, S. Miccichè, and R. N. Mantegna https://arxiv.org/pdf/1802.03395.pdf.
5 | """
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 |
11 | def row_bootstrap(mat, n_samples=1, size=None):
12 | """
13 | Uses the Row Bootstrap method to generate a new matrix of size equal to or smaller than the given matrix.
14 |
15 | It samples with replacement a random row from the given matrix. If the required bootstrapped
16 | columns' size is less than the columns of the original matrix, it randomly samples contiguous
17 | columns of the required size. It cannot generate a matrix greater than the original.
18 |
19 | It is inspired by the following paper:
20 | `Musciotto, F., Marotta, L., Miccichè, S. and Mantegna, R.N., 2018. Bootstrap validation of
21 | links of a minimum spanning tree. Physica A: Statistical Mechanics and its Applications,
22 | 512, pp.1032-1043. <https://arxiv.org/pdf/1802.03395.pdf>`_.
23 |
24 | :param mat: (pd.DataFrame/np.array) Matrix to sample from.
25 | :param n_samples: (int) Number of matrices to generate.
26 | :param size: (tuple) Size of the bootstrapped matrix.
27 | :return: (np.array) The generated bootstrapped matrices. Has shape (n_samples, size[0], size[1]).
28 | """
29 |
30 | pass
31 |
32 |
33 | def pair_bootstrap(mat, n_samples=1, size=None):
34 | """
35 | Uses the Pair Bootstrap method to generate a new correlation matrix of returns.
36 |
37 | It generates a correlation matrix based on the number of columns of the returns matrix given. It
38 | samples with replacement a pair of columns from the original matrix; the rows of the pairs generate
39 | a new row-bootstrapped matrix. The correlation value of the pair of assets is calculated and
40 | its value is used to fill the corresponding value in the generated correlation matrix.
41 |
42 | It is inspired by the following paper:
43 | `Musciotto, F., Marotta, L., Miccichè, S. and Mantegna, R.N., 2018. Bootstrap validation of
44 | links of a minimum spanning tree. Physica A: Statistical Mechanics and its Applications,
45 | 512, pp.1032-1043. <https://arxiv.org/pdf/1802.03395.pdf>`_.
46 |
47 | :param mat: (pd.DataFrame/np.array) Returns matrix to sample from.
48 | :param n_samples: (int) Number of matrices to generate.
49 | :param size: (int) Size of the bootstrapped correlation matrix.
50 | :return: (np.array) The generated bootstrapped correlation matrices. Has shape (n_samples, mat.shape[1], mat.shape[1]).
51 | """
52 |
53 | pass
54 |
55 |
56 | def block_bootstrap(mat, n_samples=1, size=None, block_size=None):
57 | """
58 | Uses the Block Bootstrap method to generate a new matrix of size equal to or smaller than the given matrix.
59 |
60 | It divides the original matrix into blocks of the given size. It samples with replacement random
61 | blocks to populate the bootstrapped matrix. It cannot generate a matrix greater than the original.
62 |
63 | It is inspired by the following paper:
64 | Künsch, H.R., 1989. The jackknife and the bootstrap for general stationary observations.
65 | Annals of Statistics, 17(3), pp.1217-1241.
66 |
67 | :param mat: (pd.DataFrame/np.array) Matrix to sample from.
68 | :param n_samples: (int) Number of matrices to generate.
69 | :param size: (tuple) Size of the bootstrapped matrix.
70 | :param block_size: (tuple) Size of the blocks.
71 | :return: (np.array) The generated bootstrapped matrices. Has shape (n_samples, size[0], size[1]).
72 | """
73 |
74 | pass
75 |
--------------------------------------------------------------------------------
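As a rough illustration of the row bootstrap described above, here is a sketch that samples rows with replacement and, when a smaller size is requested, takes a contiguous column window. This is an assumption based on the docstring, not the library's exact code:

```python
# Row bootstrap sketch: rows with replacement, contiguous column window.
import numpy as np


def _row_bootstrap(mat: np.ndarray, n_samples: int = 1, size: tuple = None) -> np.ndarray:
    size = size or mat.shape
    boot = np.zeros((n_samples, size[0], size[1]))
    for i in range(n_samples):
        rows = np.random.randint(0, mat.shape[0], size=size[0])  # Rows with replacement
        col_start = np.random.randint(0, mat.shape[1] - size[1] + 1)  # Contiguous columns
        boot[i] = mat[rows, col_start:col_start + size[1]]
    return boot
```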
/mlfinlab/data_generation/correlated_random_walks.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains methods for generating correlated random walks.
3 | """
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | def generate_cluster_time_series(n_series, t_samples=100, k_corr_clusters=1,
10 | d_dist_clusters=1, rho_main=0.1, rho_corr=0.3, price_start=100.0,
11 | dists_clusters=("normal", "normal", "student-t", "normal", "student-t")):
12 | """
13 | Generates a synthetic time series of correlation and distribution clusters.
14 |
15 | It is reproduced with modifications from the following paper:
16 | Donnat, P., Marti, G. and Very, P., 2016. Toward a generic representation of random
17 | variables for machine learning. Pattern Recognition Letters, 70, pp.24-31.
18 |
19 |
20 | www.datagrapple.com. (n.d.). DataGrapple - Tech: A GNPR tutorial: How to cluster random walks.
21 | [online] Available at: [Accessed 26 Aug. 2020].
22 |
23 |
24 | This method creates `n_series` time series of length `t_samples`. Each time series is divided
25 | into `k_corr_clusters` correlation clusters. Each correlation cluster is subdivided into
26 | `d_dist_clusters` distribution clusters.
27 | A main distribution is sampled from a normal distribution with mean = 0 and stdev = 1, adjusted
28 | by a `rho_main` factor. The correlation clusters are sampled from a given distribution, are generated
29 | once, and adjusted by a `rho_corr` factor. The distribution clusters are sampled from other
30 | given distributions, and adjusted by (1 - `rho_main` - `rho_corr`). They are sampled for each time series.
31 | These three series are added together to form a time series of returns. The final time series
32 | is the cumulative sum of the returns, with a start price given by `price_start`.
33 |
34 | :param n_series: (int) Number of time series to generate.
35 | :param t_samples: (int) Number of samples in each time series.
36 | :param k_corr_clusters: (int) Number of correlation clusters in each time series.
37 | :param d_dist_clusters: (int) Number of distribution clusters in each time series.
38 | :param rho_main: (float) Strength of the main time series distribution.
39 | :param rho_corr: (float) Strength of the correlation cluster distribution.
40 | :param price_start: (float) Starting price of the time series.
41 | :param dists_clusters: (list) List containing the names of the distributions to sample from.
42 | The following numpy distributions are available: "normal" = normal(0, 1), "normal_2" = normal(0, 2),
43 | "student-t" = standard_t(3)/sqrt(3), "laplace" = laplace(1/sqrt(2)). The first disitribution
44 | is used to sample for the correlation clusters (k_corr_clusters), the remaining ones are used
45 | to sample for the distribution clusters (d_dist_clusters).
46 | :return: (pd.DataFrame) Generated time series. Has size (t_samples, n_series).
47 | """
48 |
49 | pass
50 |
--------------------------------------------------------------------------------
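The return construction described in the docstring (main component, correlation-cluster component, and an idiosyncratic component, weighted by `rho_main`, `rho_corr` and their complement) can be illustrated with a toy single-series sketch:

```python
# Toy sketch of one correlated random walk built from three return components.
import numpy as np

t, rho_main, rho_corr, price_start = 100, 0.1, 0.3, 100.0
main = np.random.normal(size=t)     # Shared "market" component
cluster = np.random.normal(size=t)  # Correlation-cluster component
own = np.random.normal(size=t)      # Series-specific distribution component
returns = rho_main * main + rho_corr * cluster + (1 - rho_main - rho_corr) * own
prices = price_start + np.cumsum(returns)  # Final series is cumulated returns
```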
/mlfinlab/data_generation/corrgan.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019, Hudson and Thames Quantitative Research
2 | # All rights reserved
3 | # Read more: https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt
4 | """
5 | Implementation of sampling realistic financial correlation matrices from
6 | "CorrGAN: Sampling Realistic Financial Correlation Matrices using
7 | Generative Adversarial Networks" by Gautier Marti.
8 | https://arxiv.org/pdf/1910.09504.pdf
9 | """
10 | from os import listdir, path
11 | import numpy as np
12 | from scipy.cluster import hierarchy
13 | from statsmodels.stats.correlation_tools import corr_nearest
14 |
15 |
16 | def sample_from_corrgan(model_loc, dim=10, n_samples=1):
17 | """
18 | Samples correlation matrices from the pre-trained CorrGAN network.
19 |
20 | It is reproduced with modifications from the following paper:
21 | `Marti, G., 2020, May. CorrGAN: Sampling Realistic Financial Correlation Matrices Using
22 | Generative Adversarial Networks. In ICASSP 2020-2020 IEEE International Conference on
23 | Acoustics, Speech and Signal Processing (ICASSP) (pp. 8459-8463). IEEE.
24 | <https://arxiv.org/pdf/1910.09504.pdf>`_
25 |
26 | It loads the appropriate CorrGAN model for the required dimension, generates a matrix output
27 | from this network, symmetrizes this matrix, and finds the nearest correlation matrix
28 | that is positive semi-definite. Finally, it maximizes the sum of the similarities between
29 | adjacent leaves to arrange it with hierarchical clustering.
30 |
31 | The CorrGAN network was trained on the correlation profiles of the S&P 500 stocks. Therefore
32 | the output retains these properties. In addition, the final output retains the following
33 | 6 stylized facts:
34 |
35 | 1. Distribution of pairwise correlations is significantly shifted to the positive.
36 |
37 | 2. Eigenvalues follow the Marchenko-Pastur distribution, but for a very large first
38 | eigenvalue (the market).
39 |
40 | 3. Eigenvalues follow the Marchenko-Pastur distribution, but for a couple of other
41 | large eigenvalues (industries).
42 |
43 | 4. Perron-Frobenius property (first eigenvector has positive entries).
44 |
45 | 5. Hierarchical structure of correlations.
46 |
47 | 6. Scale-free property of the corresponding Minimum Spanning Tree (MST).
48 |
49 | :param model_loc: (str) Location of folder containing CorrGAN models.
50 | :param dim: (int) Dimension of correlation matrix to sample.
51 | In the range [2, 200].
52 | :param n_samples: (int) Number of samples to generate.
53 | :return: (np.array) Sampled correlation matrices of shape (n_samples, dim, dim).
54 | """
55 |
56 | pass
57 |
--------------------------------------------------------------------------------
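The GAN sampling itself needs a trained model, but the post-processing steps named in the docstring (symmetrize, project to the nearest PSD correlation matrix, reorder by hierarchical clustering) can be sketched on their own. This is an assumption-level sketch, not the library code:

```python
# Post-processing a raw generator output into a well-formed, seriated
# correlation matrix.
import numpy as np
from scipy.cluster import hierarchy
from statsmodels.stats.correlation_tools import corr_nearest


def _postprocess(raw: np.ndarray) -> np.ndarray:
    mat = (raw + raw.T) / 2  # Symmetrize
    np.fill_diagonal(mat, 1.0)
    mat = corr_nearest(mat)  # Nearest positive semi-definite correlation matrix
    condensed = (1 - mat)[np.triu_indices(len(mat), 1)]  # Condensed distance vector
    order = hierarchy.leaves_list(hierarchy.linkage(condensed))
    return mat[np.ix_(order, order)]  # Put similar leaves next to each other
```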
/mlfinlab/data_generation/hcbm.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of the Hierarchical Correlation Block Model (HCBM) matrix.
3 | "Clustering financial time series: How long is enough?" by Marti, G., Andler, S., Nielsen, F. and Donnat, P.
4 | https://www.ijcai.org/Proceedings/16/Papers/367.pdf
5 | """
6 | import numpy as np
7 | import pandas as pd
8 | from statsmodels.sandbox.distributions.multivariate import multivariate_t_rvs
9 |
10 |
11 | def _hcbm_mat_helper(mat, n_low=0, n_high=214, rho_low=0.1, rho_high=0.9, blocks=4, depth=4):
12 | """
13 | Helper function for `generate_hcmb_mat` that recursively places rho values into the HCBM matrix
14 | given as an input.
15 |
16 | Using a uniform distribution, we select the start and end locations of the blocks in the
17 | matrix. For each block, we recurse depth times and repeat splitting up the sub-matrix into
18 | blocks. Each depth level has a unique correlation (rho) value generated from a uniform
19 | distribution, bounded by `rho_low` and `rho_high`. This function modifies the `mat`
20 | parameter in place (works by side effect).
21 |
22 | It is reproduced with modifications from the following paper:
23 | `Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016.
24 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017.
25 | `_
26 |
27 | :param mat: (np.array) Parent HCBM matrix.
28 | :param n_low: (int) Start location of the HCBM matrix to work on.
29 | :param n_high: (int) End location of the HCBM matrix to work on.
30 | :param rho_low: (float) Lower correlation bound of the matrix. Must be greater or equal
31 | to 0.
32 | :param rho_high: (float) Upper correlation bound of the matrix. Must be less or equal to 1.
33 | :param blocks: (int) Maximum number of blocks to generate per level of depth.
34 | :param depth: (int) Depth of recursion for generating new blocks.
35 | """
36 |
37 | pass
38 |
39 |
40 | def generate_hcmb_mat(t_samples, n_size, rho_low=0.1, rho_high=0.9, blocks=4, depth=4, permute=False):
41 | """
42 | Generates a Hierarchical Correlation Block Model (HCBM) matrix of correlation values.
43 |
44 | Using a uniform distribution, we select the start and end locations of the blocks in the
45 | matrix. For each block, we recurse depth times and repeat splitting up the sub-matrix into
46 | blocks. Each depth level has a unique correlation (rho) value generated from a uniform
47 | distribution, bounded by `rho_low` and `rho_high`.
48 |
49 | It is reproduced with modifications from the following paper:
50 | `Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016.
51 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017.
52 | `_
53 |
54 | :param t_samples: (int) Number of HCBM matrices to generate.
55 | :param n_size: (int) Size of HCBM matrix.
56 | :param rho_low: (float) Lower correlation bound of the matrix. Must be greater or equal to 0.
57 | :param rho_high: (float) Upper correlation bound of the matrix. Must be less or equal to 1.
58 | :param blocks: (int) Number of blocks to generate per level of depth.
59 | :param depth: (int) Depth of recursion for generating new blocks.
60 | :param permute: (bool) Whether to permute the final HCBM matrix.
61 | :return: (np.array) Generated HCBM matrix of shape (t_samples, n_size, n_size).
62 | """
63 |
64 | pass
65 |
66 |
67 | def time_series_from_dist(corr, t_samples=1000, dist="normal", deg_free=3):
68 | """
69 | Generates a time series from a given correlation matrix.
70 |
71 | It uses multivariate sampling from distributions to create the time series. It supports
72 | normal and student-t distributions. This method relies on, and acts as a wrapper for, the
73 | `np.random.multivariate_normal` and
74 | `statsmodels.sandbox.distributions.multivariate.multivariate_t_rvs` functions; see the
75 | numpy and statsmodels documentation for details.
76 |
77 |
78 | It is reproduced with modifications from the following paper:
79 | `Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016.
80 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017.
81 | `_
82 |
83 | :param corr: (np.array) Correlation matrix.
84 | :param t_samples: (int) Number of samples in the time series.
85 | :param dist: (str) Type of distributions to use.
86 | Can take the values ["normal", "student"].
87 | :param deg_free: (int) Degrees of freedom. Only used for student-t distribution.
88 | :return: (pd.DataFrame) The resulting time series of shape (len(corr), t_samples).
89 | """
90 |
91 | pass
92 |
--------------------------------------------------------------------------------
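For the normal case, the multivariate sampling that `time_series_from_dist` wraps reduces to a couple of lines; a minimal sketch under that assumption (hypothetical helper name):

```python
# Sampling a time series from a correlation matrix (normal case only).
import numpy as np
import pandas as pd


def _series_from_corr(corr: np.ndarray, t_samples: int = 1000) -> pd.DataFrame:
    mean = np.zeros(len(corr))  # Zero-mean marginals
    samples = np.random.multivariate_normal(mean, corr, size=t_samples)
    return pd.DataFrame(samples)  # Shape (t_samples, len(corr))
```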
/mlfinlab/data_structures/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Logic regarding the various sampling techniques, in particular:
3 |
4 | * Time Bars
5 | * Tick Bars
6 | * Volume Bars
7 | * Dollar Bars
8 | * Tick Imbalance Bars (EMA and Const)
9 | * Volume Imbalance Bars (EMA and Const)
10 | * Dollar Imbalance Bars (EMA and Const)
11 | * Tick Run Bars (EMA and Const)
12 | * Volume Run Bars (EMA and Const)
13 | * Dollar Run Bars (EMA and Const)
14 | """
15 |
16 | from mlfinlab.data_structures.imbalance_data_structures import (get_ema_dollar_imbalance_bars, get_ema_volume_imbalance_bars,
17 | get_ema_tick_imbalance_bars, get_const_dollar_imbalance_bars,
18 | get_const_volume_imbalance_bars, get_const_tick_imbalance_bars)
19 | from mlfinlab.data_structures.run_data_structures import (get_ema_volume_run_bars, get_ema_tick_run_bars,
20 | get_ema_dollar_run_bars, get_const_volume_run_bars,
21 | get_const_tick_run_bars, get_const_dollar_run_bars)
22 | from mlfinlab.data_structures.standard_data_structures import (get_tick_bars, get_dollar_bars, get_volume_bars)
23 | from mlfinlab.data_structures.time_data_structures import get_time_bars
24 |
--------------------------------------------------------------------------------
/mlfinlab/data_structures/time_data_structures.py:
--------------------------------------------------------------------------------
1 | """
2 | Advances in Financial Machine Learning, Marcos Lopez de Prado
3 | Chapter 2: Financial Data Structures
4 |
5 | Time bars generation logic
6 | """
7 |
8 | # Imports
9 | from typing import Union, Iterable, Optional
10 | import numpy as np
11 | import pandas as pd
12 |
13 | from mlfinlab.data_structures.base_bars import BaseBars
14 |
15 |
16 | # pylint: disable=too-many-instance-attributes
17 | class TimeBars(BaseBars):
18 | """
19 | Contains all of the logic to construct the time bars. This class shouldn't be used directly.
20 | Use get_time_bars instead.
21 | """
22 |
23 | def __init__(self, resolution: str, num_units: int, batch_size: int = 20000000):
24 | """
25 | Constructor
26 |
27 | :param resolution: (str) Type of bar resolution: ['D', 'H', 'MIN', 'S']
28 | :param num_units: (int) Number of days, minutes, etc.
29 | :param batch_size: (int) Number of rows to read in from the csv, per batch
30 | """
31 |
32 | pass
33 |
34 | def _reset_cache(self):
35 | """
36 | Implementation of abstract method _reset_cache for time bars
37 | """
38 |
39 | pass
40 |
41 | def _extract_bars(self, data: Union[list, tuple, np.ndarray]) -> list:
42 | """
43 | For loop which compiles time bars.
44 | We investigated solving this in a vectorised manner but found that a for loop worked well.
45 |
46 | :param data: (tuple) Contains 3 columns - date_time, price, and volume.
47 | :return: (list) Extracted bars
48 | """
49 |
50 | pass
51 |
52 |
53 | def get_time_bars(file_path_or_df: Union[str, Iterable[str], pd.DataFrame], resolution: str = 'D', num_units: int = 1, batch_size: int = 20000000,
54 | verbose: bool = True, to_csv: bool = False, output_path: Optional[str] = None):
55 | """
56 | Creates Time Bars: date_time, open, high, low, close, volume, cum_buy_volume, cum_ticks, cum_dollar_value.
57 |
58 | :param file_path_or_df: (str, iterable of str, or pd.DataFrame) Path to the csv file(s) or Pandas Data Frame containing raw tick data
59 | in the format [date_time, price, volume]
60 | :param resolution: (str) Resolution type ('D', 'H', 'MIN', 'S')
61 | :param num_units: (int) Number of resolution units (3 days for example, 2 hours)
62 | :param batch_size: (int) The number of rows per batch. Less RAM = smaller batch size.
63 | :param verbose: (bool) Print out batch numbers (True or False)
64 | :param to_csv: (bool) Save bars to csv after every batch run (True or False)
65 | :param output_path: (str) Path to csv file, if to_csv is True
66 | :return: (pd.DataFrame) Dataframe of time bars, or None if to_csv=True
67 | """
68 |
69 | pass
70 |
--------------------------------------------------------------------------------
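For intuition, time-bar aggregation can be approximated in a few lines of pandas. This is a resample-based sketch; the class above instead iterates over batches with a for loop, so this is not equivalent to the library's batching logic:

```python
# Resample-based sketch of time bars from raw ticks.
import pandas as pd


def _time_bars(ticks: pd.DataFrame, rule: str = '1D') -> pd.DataFrame:
    # ticks: columns [date_time, price, volume]
    ticks = ticks.set_index(pd.to_datetime(ticks['date_time']))
    bars = ticks['price'].resample(rule).ohlc()  # open, high, low, close
    bars['volume'] = ticks['volume'].resample(rule).sum()
    bars['cum_ticks'] = ticks['price'].resample(rule).count()
    bars['cum_dollar_value'] = (ticks['price'] * ticks['volume']).resample(rule).sum()
    return bars.dropna()  # Drop empty intervals
```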
/mlfinlab/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Module implementing typical financial datasets load (stock prices, dollar bars, ticks).
3 | """
4 |
5 | from mlfinlab.datasets.load_datasets import (load_dollar_bar_sample, load_stock_prices, load_tick_sample,
6 | generate_multi_asset_data_set)
7 |
--------------------------------------------------------------------------------
/mlfinlab/datasets/data/tick_data.csv:
--------------------------------------------------------------------------------
1 | Date and Time,Price,Volume
2 | 2011/07/31 22:38:45.108,1205.0,1
3 | 2011/07/31 22:38:45.934,1005.0,1
4 | 2011/07/31 22:38:47.008,1304.75,6
5 | 2011/07/31 22:38:48.944,1904.75,1
6 | 2011/07/31 22:38:52.951,1304.75,20
7 | 2011/07/31 22:38:52.951,1304.75,1
8 | 2011/07/31 22:38:52.951,1304.75,5
9 | 2011/07/31 22:38:56.589,1304.5,1
10 | 2011/07/31 22:38:57.858,1304.5,1
11 | 2011/07/31 22:39:08.695,1304.5,1
12 | 2011/07/31 22:39:09.396,1304.5,1
13 | 2011/07/31 22:39:20.495,1304.5,1
14 | 2011/07/31 22:39:23.937,1304.5,1
15 | 2011/07/31 22:39:23.937,1304.5,5
16 | 2011/07/31 22:39:23.937,1304.5,1
17 | 2011/07/31 22:39:26.084,1304.5,1
18 | 2011/07/31 22:39:26.084,1304.5,1
19 | 2011/07/31 22:39:26.095,1304.5,4
20 | 2011/07/31 22:39:26.743,1304.5,11
21 | 2011/07/31 22:39:26.801,1304.5,9
22 | 2011/07/31 22:39:27.050,1304.5,1
23 | 2011/07/31 22:39:27.274,1304.5,1
24 | 2011/07/31 22:39:28.914,1304.5,1
25 | 2011/07/31 22:39:28.965,1304.5,6
26 | 2011/07/31 22:39:28.965,1304.5,1
27 | 2011/07/31 22:39:28.965,1304.5,1
28 | 2011/07/31 22:39:33.568,1304.75,1
29 | 2011/07/31 22:39:37.360,1304.5,1
30 | 2011/07/31 22:39:37.360,1304.5,1
31 | 2011/07/31 22:39:38.991,1304.5,1
32 | 2011/07/31 22:39:40.423,1304.5,1
33 | 2011/07/31 22:39:51.519,1304.5,1
34 | 2011/07/31 22:39:51.519,1304.5,4
35 | 2011/07/31 22:39:53.030,1304.5,1
36 | 2011/07/31 22:39:55.765,1304.5,1
37 | 2011/07/31 22:39:56.614,1304.5,1
38 | 2011/07/31 22:39:56.614,1304.5,1
39 | 2011/07/31 22:39:56.614,1304.5,5
40 | 2011/07/31 22:39:56.614,1304.5,1
41 | 2011/07/31 22:39:56.614,1304.5,1
42 | 2011/07/31 22:39:59.606,1304.5,10
43 | 2011/07/31 22:39:59.606,1304.5,2
44 | 2011/07/31 22:39:59.606,1304.5,4
45 | 2011/07/31 22:40:01.914,1304.5,1
46 | 2011/07/31 22:40:01.914,1304.5,1
47 | 2011/07/31 22:40:10.794,1304.75,6
48 | 2011/07/31 22:40:11.161,1304.5,4
49 | 2011/07/31 22:40:11.168,1304.75,4
50 | 2011/07/31 22:40:11.168,1304.75,1
51 | 2011/07/31 22:40:11.168,1304.75,1
52 | 2011/07/31 22:40:11.168,1304.75,1
53 | 2011/07/31 22:40:12.014,1304.5,2
54 | 2011/07/31 22:40:12.014,1304.5,3
55 | 2011/07/31 22:40:12.014,1304.5,1
56 | 2011/07/31 22:40:13.964,1304.75,1
57 | 2011/07/31 22:40:14.306,1304.75,1
58 | 2011/07/31 22:40:14.514,1304.75,1
59 | 2011/07/31 22:40:14.617,1304.75,1
60 | 2011/07/31 22:40:14.730,1304.75,1
61 | 2011/07/31 22:40:14.822,1304.75,1
62 | 2011/07/31 22:40:16.182,1305.0,9
63 | 2011/07/31 22:40:16.182,1305.0,1
64 | 2011/07/31 22:40:20.267,1304.75,1
65 | 2011/07/31 22:40:22.083,1305.0,1
66 | 2011/07/31 22:40:28.918,1304.75,1
67 | 2011/07/31 22:40:28.918,1304.75,1
68 | 2011/07/31 22:40:29.030,1305.0,5
69 | 2011/07/31 22:40:29.478,1305.0,3
70 | 2011/07/31 22:40:29.478,3305.0,1
71 | 2011/07/31 22:40:29.478,205.0,2
72 | 2011/07/31 22:40:29.478,1405.0,1
73 | 2011/07/31 22:40:29.478,1305.0,1
74 | 2011/07/31 22:40:29.478,1305.0,1
75 | 2011/07/31 22:40:29.478,1305.0,1
76 | 2011/07/31 22:40:29.478,1305.0,1
77 | 2011/07/31 22:40:29.478,1305.0,1
78 | 2011/07/31 22:40:29.478,1305.0,2
79 | 2011/07/31 22:40:29.478,1305.0,1
80 | 2011/07/31 22:40:29.478,1305.0,1
81 | 2011/07/31 22:40:29.478,1305.0,1
82 | 2011/07/31 22:40:29.478,1305.0,1
83 | 2011/07/31 22:40:29.478,1305.0,2
84 | 2011/07/31 22:40:29.541,1305.0,5
85 | 2011/07/31 22:40:29.940,1305.0,1
86 | 2011/07/31 22:40:30.694,1305.25,10
87 | 2011/07/31 22:40:31.492,1305.25,10
88 | 2011/07/31 22:40:31.576,1305.25,5
89 | 2011/07/31 22:40:31.576,1305.25,1
90 | 2011/07/31 22:40:31.576,1305.25,1
91 | 2011/07/31 22:40:31.576,1305.25,2
92 | 2011/07/31 22:40:31.576,1305.25,1
93 | 2011/07/31 22:40:33.213,1305.25,1
94 | 2011/07/31 22:40:41.016,1305.25,1
95 | 2011/07/31 22:40:41.849,1305.25,1
96 | 2011/07/31 22:40:42.779,1305.0,1
97 | 2011/07/31 22:40:44.921,1305.25,5
98 | 2011/07/31 22:40:44.921,1305.25,1
99 | 2011/07/31 22:40:44.921,1305.25,1
100 | 2011/07/31 22:40:44.921,1305.25,2
101 | 2011/07/31 22:40:44.921,1305.25,1
102 |
--------------------------------------------------------------------------------
/mlfinlab/datasets/load_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | The module implements various functions for loading tick, dollar bar, and stock data sets which can be
3 | used as sandbox data.
4 | """
5 |
6 | import os
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from mlfinlab.labeling.labeling import get_events, add_vertical_barrier, get_bins
12 | from mlfinlab.util.volatility import get_daily_vol
13 | from mlfinlab.filters.filters import cusum_filter
14 |
15 |
16 | def load_stock_prices() -> pd.DataFrame:
17 | """
18 | Loads a stock prices data set consisting of
19 | EEM, EWG, TIP, EWJ, EFA, IEF, EWQ, EWU, XLB, XLE, XLF, LQD, XLK, XLU, EPP, FXI, VGK, VPL, SPY, TLT, BND, CSJ,
20 | DIA, covering 2008 through 2016.
21 |
22 | :return: (pd.DataFrame) The stock_prices data frame.
23 | """
24 |
25 | pass
26 |
27 |
28 | def load_tick_sample() -> pd.DataFrame:
29 | """
30 | Loads E-Mini S&P 500 futures tick data sample.
31 |
32 | :return: (pd.DataFrame) Frame with tick data sample.
33 | """
34 |
35 | pass
36 |
37 |
38 | def load_dollar_bar_sample() -> pd.DataFrame:
39 | """
40 | Loads E-Mini S&P 500 futures dollar bars data sample.
41 |
42 | :return: (pd.DataFrame) Frame with dollar bar data sample.
43 | """
44 |
45 | pass
46 |
47 |
48 | def generate_multi_asset_data_set(start_date: pd.Timestamp = pd.Timestamp(2008, 1, 1),
49 | end_date: pd.Timestamp = pd.Timestamp(2020, 1, 1)) -> tuple:
50 | # pylint: disable=invalid-name
51 | """
52 | Generates a multi-asset dataset from stock prices, labelled by the triple-barrier method.
53 |
54 | :param start_date: (pd.Timestamp) Dataset start date.
55 | :param end_date: (pd.Timestamp) Dataset end date.
56 | :return: (tuple) Tuple of dictionaries (asset: data) for X, y, cont contract used to label the dataset.
57 | """
58 |
59 | pass
60 |
--------------------------------------------------------------------------------
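A minimal sketch of how such loaders typically resolve packaged CSVs, assuming the data files listed in the repository tree live in the `data` folder next to this module (`_load_sample` is a hypothetical helper):

```python
# Loading a packaged sample CSV by path relative to this module.
import os

import pandas as pd


def _load_sample(file_name: str) -> pd.DataFrame:
    data_path = os.path.join(os.path.dirname(__file__), 'data', file_name)
    return pd.read_csv(data_path, index_col=0, parse_dates=True)


# e.g. _load_sample('tick_data.csv') or _load_sample('stock_prices.csv')
```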
/mlfinlab/ensemble/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of Sequentially Bootstrapped Bagging Classifier using sklearn's library as base class.
3 | """
4 |
5 | from mlfinlab.ensemble.sb_bagging import (SequentiallyBootstrappedBaggingClassifier, SequentiallyBootstrappedBaggingRegressor)
6 |
--------------------------------------------------------------------------------
/mlfinlab/feature_importance/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Module which implements feature importance algorithms described in Chapter 8 and other interpretability tools
3 | from the Journal of Financial Data Science, as well as stacked feature importance
4 | functions (Stacked MDA/SFI).
5 | """
6 |
7 | from mlfinlab.feature_importance.importance import (mean_decrease_impurity, mean_decrease_accuracy,
8 | single_feature_importance, plot_feature_importance,
9 | stacked_mean_decrease_accuracy)
10 | from mlfinlab.feature_importance.orthogonal import (feature_pca_analysis, get_pca_rank_weighted_kendall_tau,
11 | get_orthogonal_features)
12 | from mlfinlab.feature_importance.fingerpint import (RegressionModelFingerprint, ClassificationModelFingerprint)
13 |
--------------------------------------------------------------------------------
/mlfinlab/feature_importance/fingerpint.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of an algorithm described in Yimou Li, David Turkington, Alireza Yazdani
3 | 'Beyond the Black Box: An Intuitive Approach to Investment Prediction with Machine Learning'
4 | (https://jfds.pm-research.com/content/early/2019/12/11/jfds.2019.1.023)
5 | """
6 |
7 | from abc import ABC, abstractmethod
8 | from typing import Tuple
9 | import pandas as pd
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | from sklearn.linear_model import LinearRegression
13 |
14 |
15 | # pylint: disable=invalid-name
16 | # pylint: disable=too-many-locals
17 |
18 | class AbstractModelFingerprint(ABC):
19 | """
20 | Model fingerprint constructor.
21 |
22 | This is an abstract base class for the RegressionModelFingerprint and ClassificationModelFingerprint classes.
23 | """
24 |
25 | def __init__(self):
26 | """
27 | Model fingerprint constructor.
28 | """
29 | pass
30 |
31 | def fit(self, model: object, X: pd.DataFrame, num_values: int = 50, pairwise_combinations: list = None) -> None:
32 | """
33 | Get linear, non-linear and pairwise effects estimation.
34 |
35 | :param model: (object) Trained model.
36 | :param X: (pd.DataFrame) Dataframe of features.
37 | :param num_values: (int) Number of values used to estimate feature effect.
38 | :param pairwise_combinations: (list) Tuples (feature_i, feature_j) to test pairwise effect.
39 | """
40 |
41 | pass
42 |
43 | def get_effects(self) -> Tuple:
44 | """
45 | Return computed linear, non-linear and pairwise effects. The model should be fit() before using this method.
46 |
47 | :return: (tuple) Linear, non-linear and pairwise effects, of type dictionary (raw values and normalised).
48 | """
49 |
50 | pass
51 |
52 | def plot_effects(self) -> plt.figure:
53 | """
54 | Plot each effect (normalized) on a bar plot (linear, non-linear). Also plots pairwise effects if calculated.
55 |
56 | :return: (plt.figure) Plot figure.
57 | """
58 |
59 | pass
60 |
61 | def _get_feature_values(self, X: pd.DataFrame, num_values: int) -> None:
62 | """
63 | Step 1 of the algorithm which generates possible feature values used in analysis.
64 |
65 | :param X: (pd.DataFrame) Dataframe of features.
66 | :param num_values: (int) Number of values used to estimate feature effect.
67 | """
68 |
69 | pass
70 |
71 | def _get_individual_partial_dependence(self, model: object, X: pd.DataFrame) -> None:
72 | """
73 | Get individual partial dependence function values for each column.
74 |
75 | :param model: (object) Trained model.
76 | :param X: (pd.DataFrame) Dataframe of features.
77 | """
78 |
79 | pass
80 |
81 | def _get_linear_effect(self, X: pd.DataFrame) -> dict:
82 | """
83 | Get linear effect estimates as the mean absolute deviation of the linear predictions around their average value.
84 |
85 | :param X: (pd.DataFrame) Dataframe of features.
86 | :return: (dict) Linear effect estimates for each feature column.
87 | """
88 |
89 | pass
90 |
91 | def _get_non_linear_effect(self, X: pd.DataFrame) -> dict:
92 | """
93 | Get non-linear effect estimates as the mean absolute deviation of the total marginal (single variable)
94 | effect around its corresponding linear effect.
95 |
96 | :param X: (pd.DataFrame) Dataframe of features.
97 | :return: (dict) Non-linear effect estimates for each feature column.
98 | """
99 |
100 | pass
101 |
102 | def _get_pairwise_effect(self, pairwise_combinations: list, model: object, X: pd.DataFrame, num_values) -> dict:
103 | """
104 | Get pairwise effect estimates as the de-meaned joint partial prediction of the two variables minus the de-meaned
105 | partial predictions of each variable independently.
106 |
107 | :param pairwise_combinations: (list) Tuples (feature_i, feature_j) to test pairwise effect.
108 | :param model: (object) Trained model.
109 | :param X: (pd.DataFrame) Dataframe of features.
110 | :param num_values: (int) Number of values used to estimate feature effect.
111 | :return: (dict) Raw and normalised pairwise effects.
112 | """
113 |
114 | pass
115 |
116 | @abstractmethod
117 | def _get_model_predictions(self, model: object, X_: pd.DataFrame):
118 | """
119 | Get model predictions based on problem type (predict for regression, predict_proba for classification).
120 |
121 | :param model: (object) Trained model.
122 | :param X_: (np.array) Feature set.
123 | :return: (np.array) Predictions.
124 | """
125 |
126 | pass
127 |
128 | @staticmethod
129 | def _normalize(effect: dict) -> dict:
130 | """
131 | Normalize effect values (sum equals 1).
132 |
133 | :param effect: (dict) Effect values.
134 | :return: (dict) Normalized effect values.
135 | """
136 |
137 | pass
138 |
139 |
140 | class RegressionModelFingerprint(AbstractModelFingerprint):
141 | """
142 | Regression Fingerprint class used for regression models.
143 | """
144 |
145 | def __init__(self):
146 | """
147 | Regression model fingerprint constructor.
148 | """
149 |
150 | pass
151 |
152 | def _get_model_predictions(self, model, X_):
153 | """
154 | Abstract method _get_model_predictions implementation.
155 |
156 | :param model: (object) Trained model.
157 | :param X_: (np.array) Feature set.
158 | :return: (np.array) Predictions.
159 | """
160 |
161 | pass
162 |
163 |
164 | class ClassificationModelFingerprint(AbstractModelFingerprint):
165 | """
166 | Classification Fingerprint class used for classification models.
167 | """
168 |
169 | def __init__(self):
170 | """
171 | Classification model fingerprint constructor.
172 | """
173 |
174 | pass
175 |
176 | def _get_model_predictions(self, model, X_):
177 | """
178 | Abstract method _get_model_predictions implementation.
179 |
180 | :param model: (object) Trained model.
181 | :param X_: (np.array) Feature set.
182 | :return: (np.array) Predictions.
183 | """
184 |
185 | pass
186 |
--------------------------------------------------------------------------------
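
The normalisation step above is simple enough to sketch. Below is a minimal stand-alone version of the
_normalize logic, assuming the effects live in a plain dict; the function name normalize_effect and the
example values are illustrative, not part of the library.

def normalize_effect(effect: dict) -> dict:
    """Scale raw effect values so that they sum to 1 (a sketch of the _normalize step)."""
    total = sum(effect.values())
    if total == 0:
        return {feature: 0.0 for feature in effect}
    return {feature: value / total for feature, value in effect.items()}

# Example: raw linear effects for three features.
raw = {'feature_1': 2.0, 'feature_2': 1.0, 'feature_3': 1.0}
print(normalize_effect(raw))  # {'feature_1': 0.5, 'feature_2': 0.25, 'feature_3': 0.25}
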
/mlfinlab/feature_importance/orthogonal.py:
--------------------------------------------------------------------------------
1 | """
2 | Module which implements feature PCA compression and PCA analysis of feature importance.
3 | """
4 |
5 | import pandas as pd
6 | import numpy as np
7 | from scipy.stats import weightedtau, kendalltau, spearmanr, pearsonr
8 |
9 |
10 | def _get_eigen_vector(dot_matrix, variance_thresh, num_features=None):
11 | """
12 | Advances in Financial Machine Learning, Snippet 8.5, page 119.
13 |
14 | Computation of Orthogonal Features
15 |
16 | Gets eigenvalues and eigenvectors from the matrix which together explain variance_thresh of total variance.
17 |
18 | :param dot_matrix: (np.array) Matrix for which eigenvalues/vectors should be computed.
19 | :param variance_thresh: (float) Fraction of overall variance which the compressed vectors should explain.
20 | :param num_features: (int) Manually set number of features, overrides variance_thresh. (None by default)
21 | :return: (pd.Series, pd.DataFrame) Eigenvalues, eigenvectors.
22 | """
23 |
24 | pass
25 |
26 |
27 | def _standardize_df(data_frame):
28 | """
29 | Helper function which standardizes a dataframe by subtracting the mean and dividing by the standard deviation.
30 |
31 | :param data_frame: (pd.DataFrame) Dataframe to standardize.
32 | :return: (pd.DataFrame) Standardized dataframe.
33 | """
34 |
35 | pass
36 |
37 |
38 | def get_orthogonal_features(feature_df, variance_thresh=.95, num_features=None):
39 | """
40 | Advances in Financial Machine Learning, Snippet 8.5, page 119.
41 |
42 | Computation of Orthogonal Features.
43 |
44 | Gets PCA orthogonal features.
45 |
46 | :param feature_df: (pd.DataFrame) Dataframe of features.
47 | :param variance_thresh: (float) Fraction of overall variance which the compressed vectors should explain.
48 | :param num_features: (int) Manually set number of features, overrides variance_thresh. (None by default)
49 | :return: (pd.DataFrame) Compressed PCA features which explain variance_thresh of the total variance.
50 | """
51 |
52 | pass
53 |
54 |
55 | def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
56 | """
57 | Advances in Financial Machine Learning, Snippet 8.6, page 121.
58 |
59 | Computes Weighted Kendall's Tau Between Feature Importance and Inverse PCA Ranking.
60 |
61 | :param feature_imp: (np.array): Feature mean importance.
62 | :param pca_rank: (np.array): PCA based feature importance rank.
63 | :return: (float): Weighted Kendall Tau of feature importance and inverse PCA rank with p_value.
64 | """
65 |
66 | pass
67 |
68 |
69 | def feature_pca_analysis(feature_df, feature_importance, variance_thresh=0.95):
70 | """
71 | Performs correlation analysis between feature importance (MDI for example, supervised) and PCA eigenvalues
72 | (unsupervised).
73 |
74 | A high correlation suggests that the pattern identified by the ML algorithm is probably not entirely overfit.
75 |
76 | :param feature_df: (pd.DataFrame): Features dataframe.
77 | :param feature_importance: (pd.DataFrame): Individual MDI feature importance.
78 | :param variance_thresh: (float) Fraction of overall variance which compressed vectors should explain in PCA compression.
79 | :return: (dict): Dictionary with kendall, spearman, pearson and weighted_kendall correlations and p_values.
80 | """
81 |
82 | pass
83 |
--------------------------------------------------------------------------------
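
Since the function bodies are omitted here, a minimal sketch of the PCA compression described above may
help: standardize the features, eigendecompose the dot matrix, and keep the smallest set of components
reaching variance_thresh. The name orthogonal_features_sketch is illustrative and the library's exact
implementation may differ.

import numpy as np
import pandas as pd

def orthogonal_features_sketch(feature_df: pd.DataFrame, variance_thresh: float = .95) -> pd.DataFrame:
    # Standardize: subtract the mean, divide by the standard deviation.
    data = (feature_df - feature_df.mean()) / feature_df.std()
    # Eigendecomposition of the dot matrix (eigh returns eigenvalues in ascending order).
    eigen_values, eigen_vectors = np.linalg.eigh(np.dot(data.T, data))
    idx = eigen_values.argsort()[::-1]  # sort descending by explained variance
    eigen_values, eigen_vectors = eigen_values[idx], eigen_vectors[:, idx]
    # Keep the smallest number of components explaining variance_thresh of total variance.
    cum_var = np.cumsum(eigen_values) / eigen_values.sum()
    num_components = int(np.searchsorted(cum_var, variance_thresh) + 1)
    # Project the standardized features onto the retained eigenvectors.
    return pd.DataFrame(np.dot(data, eigen_vectors[:, :num_components]), index=feature_df.index)
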
/mlfinlab/features/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions derived from Chapter 5: Fractional Differentiation.
3 | """
4 |
5 | from mlfinlab.features.fracdiff import (get_weights, frac_diff, get_weights_ffd, frac_diff_ffd, plot_min_ffd)
6 |
--------------------------------------------------------------------------------
/mlfinlab/filters/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Logic regarding the various types of filters:
3 |
4 | * CUSUM Filter
5 | * Z-score filter
6 | """
7 |
8 | from mlfinlab.filters.filters import cusum_filter
9 | from mlfinlab.filters.filters import z_score_filter
10 |
--------------------------------------------------------------------------------
/mlfinlab/filters/filters.py:
--------------------------------------------------------------------------------
1 | """
2 | Filters are used to filter events based on some kind of trigger. For example, a structural break filter can be
3 | used to filter events where a structural break occurs. This event is then used to measure the return from the event
4 | to some event horizon, say a day.
5 | """
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 |
11 | # Snippet 2.4, page 39, The Symmetric CUSUM Filter.
12 | def cusum_filter(raw_time_series, threshold, time_stamps=True):
13 | """
14 | Advances in Financial Machine Learning, Snippet 2.4, page 39.
15 |
16 | The Symmetric Dynamic/Fixed CUSUM Filter.
17 |
18 | The CUSUM filter is a quality-control method, designed to detect a shift in the mean value of a measured quantity
19 | away from a target value. The filter is set up to identify a sequence of upside or downside divergences from any
20 | reset level zero. We sample a bar t if and only if S_t >= threshold, at which point S_t is reset to 0.
21 |
22 | One practical aspect that makes CUSUM filters appealing is that multiple events are not triggered by raw_time_series
23 | hovering around a threshold level, which is a flaw suffered by popular market signals such as Bollinger Bands.
24 | It will require a full run of length threshold for raw_time_series to trigger an event.
25 |
26 | Once we have obtained this subset of event-driven bars, we will let the ML algorithm determine whether the occurrence
27 | of such events constitutes actionable intelligence. Below is an implementation of the Symmetric CUSUM filter.
28 |
29 | Note: As per the book this filter is applied to closing prices but we extended it to also work on other
30 | time series such as volatility.
31 |
32 | :param raw_time_series: (pd.Series) Close prices (or other time series, e.g. volatility).
33 | :param threshold: (float or pd.Series) When the abs(change) is larger than the threshold, the function captures
34 | it as an event; the threshold can be dynamic if passed as a pd.Series.
35 | :param time_stamps: (bool) Default is to return a DateTimeIndex, change to false to have it return a list.
36 | :return: (datetime index vector) Vector of datetimes when the events occurred. This is used later to sample.
37 | """
38 |
39 | pass
40 |
41 |
42 | def z_score_filter(raw_time_series, mean_window, std_window, z_score=3, time_stamps=True):
43 | """
44 | Filter which implements the z-score filter
45 | (https://stackoverflow.com/questions/22583391/peak-signal-detection-in-realtime-timeseries-data).
46 |
47 | :param raw_time_series: (pd.Series) Close prices (or other time series, e.g. volatility).
48 | :param mean_window: (int) Rolling mean window.
49 | :param std_window: (int) Rolling std window.
50 | :param z_score: (float) Number of standard deviations to trigger the event.
51 | :param time_stamps: (bool) Default is to return a DateTimeIndex, change to false to have it return a list.
52 | :return: (datetime index vector) Vector of datetimes when the events occurred. This is used later to sample.
53 | """
54 |
55 | pass
56 |
--------------------------------------------------------------------------------
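
A minimal sketch of the symmetric CUSUM recursion described above, assuming a fixed float threshold and
ignoring the dynamic-threshold and time_stamps switches; this is illustrative, not the library's exact
implementation.

import pandas as pd

def cusum_filter_sketch(raw_time_series: pd.Series, threshold: float) -> pd.DatetimeIndex:
    t_events = []
    s_pos, s_neg = 0.0, 0.0
    diff = raw_time_series.diff().dropna()
    for timestamp, change in diff.items():
        s_pos = max(0.0, s_pos + change)  # cumulative upside run, floored at 0
        s_neg = min(0.0, s_neg + change)  # cumulative downside run, capped at 0
        if s_neg < -threshold:
            s_neg = 0.0
            t_events.append(timestamp)
        elif s_pos > threshold:
            s_pos = 0.0
            t_events.append(timestamp)
    return pd.DatetimeIndex(t_events)
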
/mlfinlab/labeling/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Labeling techniques used in financial machine learning.
3 | """
4 |
5 | from mlfinlab.labeling.labeling import (add_vertical_barrier, apply_pt_sl_on_t1, barrier_touched, drop_labels,
6 | get_bins, get_events)
7 | from mlfinlab.labeling.trend_scanning import trend_scanning_labels
8 | from mlfinlab.labeling.tail_sets import TailSetLabels
9 | from mlfinlab.labeling.fixed_time_horizon import fixed_time_horizon
10 | from mlfinlab.labeling.matrix_flags import MatrixFlagLabels
11 | from mlfinlab.labeling.excess_over_median import excess_over_median
12 | from mlfinlab.labeling.raw_return import raw_return
13 | from mlfinlab.labeling.return_vs_benchmark import return_over_benchmark
14 | from mlfinlab.labeling.excess_over_mean import excess_over_mean
15 | from mlfinlab.labeling.bull_bear import (pagan_sossounov, lunde_timmermann)
16 |
--------------------------------------------------------------------------------
/mlfinlab/labeling/bull_bear.py:
--------------------------------------------------------------------------------
1 | """
2 | Detection of bull and bear markets.
3 | """
4 | import numpy as np
5 | import pandas as pd
6 |
7 |
8 | def pagan_sossounov(prices, window=8, censor=6, cycle=16, phase=4, threshold=0.2):
9 | """
10 | Pagan and Sossounov's labeling method. Sourced from: Pagan, Adrian R., and Kirill A. Sossounov. "A simple
11 | framework for analysing bull and bear markets." Journal of Applied Econometrics 18.1 (2003): 23-46.
12 |
13 |
14 | Returns a DataFrame with labels of 1 for Bull and -1 for Bear.
15 |
16 | :param prices: (pd.DataFrame) Close prices of all tickers in the market.
17 | :param window: (int) Rolling window length to determine local extrema. Paper suggests 8 months for monthly obs.
18 | :param censor: (int) Number of months to eliminate for start and end. Paper suggests 6 months for monthly obs.
19 | :param cycle: (int) Minimum length for a complete cycle. Paper suggests 16 months for monthly obs.
20 | :param phase: (int) Minimum length for a phase. Paper suggests 4 months for monthly obs.
21 | :param threshold: (double) Minimum threshold for phase change. Paper suggests 0.2.
22 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear.
23 | """
24 |
25 | pass
26 |
27 |
28 | def _alternation(price):
29 | """
30 | Helper function to check peak and trough alternation.
31 |
32 | :param price: (pd.DataFrame) Close prices of all tickers in the market.
33 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear.
34 | """
35 |
36 | pass
37 |
38 |
39 | def _apply_pagan_sossounov(price, window, censor, cycle, phase, threshold):
40 | """
41 | Helper function for Pagan and Sossounov labeling method.
42 |
43 | :param price: (pd.DataFrame) Close prices of all tickers in the market.
44 | :param window: (int) Rolling window length to determine local extrema. Paper suggests 8 months for monthly obs.
45 | :param censor: (int) Number of months to eliminate for start and end. Paper suggests 6 months for monthly obs.
46 | :param cycle: (int) Minimum length for a complete cycle. Paper suggests 16 months for monthly obs.
47 | :param phase: (int) Minimum length for a phase. Paper suggests 4 months for monthly obs.
48 | :param threshold: (double) Minimum threshold for phase change. Paper suggests 20%.
49 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear.
50 | """
51 |
52 | pass
53 |
54 |
55 | def lunde_timmermann(prices, bull_threshold=0.15, bear_threshold=0.15):
56 | """
57 | Lunde and Timmermann's labeling method. Sourced from: Lunde, Asger, and Allan Timmermann. "Duration dependence
58 | in stock prices: An analysis of bull and bear markets." Journal of Business & Economic Statistics 22.3 (2004): 253-273.
59 |
60 |
61 | Returns a DataFrame with labels of 1 for Bull and -1 for Bear.
62 |
63 | :param prices: (pd.DataFrame) Close prices of all tickers in the market.
64 | :param bull_threshold: (double) Threshold to identify bull market. Paper suggests 0.15.
65 | :param bear_threshold: (double) Threshold to identify bear market. Paper suggests 0.15.
66 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear.
67 | """
68 |
69 | pass
70 |
71 |
72 | def _apply_lunde_timmermann(price, bull_threshold, bear_threshold):
73 | """
74 | Helper function for Lunde and Timmermann labeling method.
75 |
76 | :param price: (pd.DataFrame) Close prices of all tickers in the market.
77 | :param bull_threshold: (double) Threshold to identify bull market. Paper suggests 0.15.
78 | :param bear_threshold: (double) Threshold to identify bear market. Paper suggests 0.15.
79 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear.
80 | """
81 |
82 | pass
83 |
--------------------------------------------------------------------------------
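
As a rough illustration of the first step of the Pagan-Sossounov procedure (locating candidate peaks and
troughs with a centered rolling window), here is a hypothetical sketch; the censoring, alternation, and
minimum phase/cycle rules described above still have to be applied on top of it.

import pandas as pd

def local_extrema_sketch(price: pd.Series, window: int = 8):
    # A point is a candidate peak (trough) if it is the max (min) of the
    # surrounding +/- window observations.
    roll = price.rolling(2 * window + 1, center=True)
    peaks = price[price == roll.max()].index
    troughs = price[price == roll.min()].index
    return peaks, troughs
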
/mlfinlab/labeling/excess_over_mean.py:
--------------------------------------------------------------------------------
1 | """
2 | Return in excess of mean method.
3 |
4 | Chapter 5, Machine Learning for Factor Investing, by Coqueret and Guida, (2020).
5 | """
6 | import numpy as np
7 |
8 |
9 | def excess_over_mean(prices, binary=False, resample_by=None, lag=True):
10 | """
11 | Return in excess of mean labeling method. Sourced from Chapter 5.5.1 of Machine Learning for Factor Investing,
12 | by Coqueret, G. and Guida, T. (2020).
13 |
14 | Returns a DataFrame containing returns of stocks over the mean of all stocks in the portfolio. Returns a DataFrame
15 | of signs of the returns if binary is True. In this case, an observation may be labeled as 0 if it itself is the
16 | mean.
17 |
18 | :param prices: (pd.DataFrame) Close prices of all tickers in the market that are used to establish the mean. NaN
19 | values are ok. Returns on each ticker are then compared to the mean for the given timestamp.
20 | :param binary: (bool) If False, the numerical value of excess returns over mean will be given. If True, then only
21 | the sign of the excess return over mean will be given (-1 or 1). A label of 0 will be given if
22 | the observation itself is equal to the mean.
23 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per
24 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period.
25 | For full details see the pandas documentation
26 | on date offset aliases.
27 | :param lag: (bool) If True, returns will be lagged to make them forward-looking.
28 | :return: (pd.DataFrame) Numerical returns in excess of the market mean return, or sign of return depending on
29 | whether binary is False or True respectively.
30 | """
31 |
32 | pass
33 |
--------------------------------------------------------------------------------
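
A minimal sketch of the cross-sectional logic, assuming no resampling; swapping mean(axis=1) for
median(axis=1) gives the excess-over-median labels of the next file. Names are illustrative.

import numpy as np
import pandas as pd

def excess_over_mean_sketch(prices: pd.DataFrame, binary: bool = False, lag: bool = True) -> pd.DataFrame:
    returns = prices.pct_change(fill_method=None)
    if lag:
        returns = returns.shift(-1)  # make the labels forward-looking
    excess = returns.sub(returns.mean(axis=1), axis=0)  # subtract the cross-sectional mean
    return np.sign(excess) if binary else excess
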
/mlfinlab/labeling/excess_over_median.py:
--------------------------------------------------------------------------------
1 | """
2 | Return in excess of median method.
3 |
4 | Described in "The benefits of tree-based models for stock selection", Zhu et al. (2012). Data labeled this way can be
5 | used in regression and classification models to predict stock returns over market.
6 | """
7 | import numpy as np
8 |
9 |
10 | def excess_over_median(prices, binary=False, resample_by=None, lag=True):
11 | """
12 | Return in excess of median labeling method. Sourced from "The benefits of tree-based models for stock selection"
13 | Zhu et al. (2012).
14 |
15 | Returns a DataFrame containing returns of stocks over the median of all stocks in the portfolio, or returns a
16 | DataFrame containing signs of those returns. In the latter case, an observation may be labeled as 0 if it itself is
17 | the median.
18 |
19 | :param prices: (pd.DataFrame) Close prices of all stocks in the market that are used to establish the median.
20 | Returns on each stock are then compared to the median for the given timestamp.
21 | :param binary: (bool) If False, the numerical value of excess returns over median will be given. If True, then only
22 | the sign of the excess return over median will be given (-1 or 1). A label of 0 will be given if
23 | the observation itself is the median. According to Zhu et al., categorical labels can alleviate
24 | issues with extreme outliers present with numerical labels.
25 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per
26 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period.
27 | For full details see the pandas documentation
28 | on date offset aliases.
29 | :param lag: (bool) If True, returns will be lagged to make them forward-looking.
30 | :return: (pd.DataFrame) Numerical returns in excess of the market median return, or sign of return depending on
31 | whether binary is False or True respectively.
32 | """
33 |
34 | pass
35 |
--------------------------------------------------------------------------------
/mlfinlab/labeling/fixed_time_horizon.py:
--------------------------------------------------------------------------------
1 | """
2 | Chapter 3.2 Fixed-Time Horizon Method, in Advances in Financial Machine Learning, by M. L. de Prado.
3 |
4 | Work "Classification-based Financial Markets Prediction using Deep Neural Networks" by Dixon et al. (2016) describes how
5 | labeling data this way can be used in training deep neural networks to predict price movements.
6 | """
7 |
8 | import warnings
9 | import pandas as pd
10 |
11 |
12 | def fixed_time_horizon(prices, threshold=0, resample_by=None, lag=True, standardized=False, window=None):
13 | """
14 | Fixed-Time Horizon Labeling Method.
15 |
16 | Originally described in the book Advances in Financial Machine Learning, Chapter 3.2, p.43-44.
17 |
18 | Returns 1 if return is greater than the threshold, -1 if less, and 0 if in between. If no threshold is
19 | provided then it will simply take the sign of the return.
20 |
21 | :param prices: (pd.Series or pd.DataFrame) Time-indexed stock prices used to calculate returns.
22 | :param threshold: (float or pd.Series) When the absolute value of return exceeds the threshold, the observation is
23 | labeled with 1 or -1, depending on the sign of the return. Otherwise, the observation is labeled 0.
24 | Can be dynamic if threshold is passed as a pd.Series, and threshold.index must match prices.index.
25 | If resampling is used, the index of threshold must match the index of prices after resampling.
26 | If threshold is negative, then the directionality of the labels will be reversed. If no threshold
27 | is provided, it is assumed to be 0 and the sign of the return is returned.
28 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per
29 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period.
30 | For full details see the pandas documentation
31 | on date offset aliases.
32 | :param lag: (bool) If True, returns will be lagged to make them forward-looking.
33 | :param standardized: (bool) Whether returns are scaled by mean and standard deviation.
34 | :param window: (int) If standardized is True, the rolling window period for calculating the mean and standard
35 | deviation of returns.
36 | :return: (pd.Series or pd.DataFrame) -1, 0, or 1 denoting whether the return for each observation is
37 | less/between/greater than the threshold at each corresponding time index. First or last row will be
38 | NaN, depending on lag.
39 | """
40 |
41 | pass
42 |
--------------------------------------------------------------------------------
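
A minimal sketch of the unstandardized case with a constant threshold (no resampling, no rolling
standardization); illustrative only.

import numpy as np
import pandas as pd

def fixed_time_horizon_sketch(prices: pd.Series, threshold: float = 0, lag: bool = True) -> pd.Series:
    returns = prices.pct_change(fill_method=None)
    if lag:
        returns = returns.shift(-1)  # label each bar with the next period's return
    labels = pd.Series(0.0, index=returns.index)
    labels[returns > threshold] = 1
    labels[returns < -threshold] = -1
    labels[returns.isna()] = np.nan  # first or last row has no return
    return labels
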
/mlfinlab/labeling/matrix_flags.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=no-self-use
2 | # pylint: disable=unnecessary-comprehension
3 | """
4 | Matrix Flag labeling method.
5 | """
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 |
11 | class MatrixFlagLabels:
12 | """
13 | The Matrix Flag labeling method is featured in the paper: Cervelló-Royo, R., Guijarro, F. and Michniuk, K., 2015.
14 | Stock market trading rule based on pattern recognition and technical analysis: Forecasting the DJIA index with
15 | intraday data.
16 |
17 | The method of applying a matrix template was first introduced, and explained in greater detail, in the paper:
18 | Leigh, W., Modani, N., Purvis, R. and Roberts, T., 2002. Stock market trading rule discovery using technical
19 | charting heuristics.
20 |
21 | Cervelló-Royo et al. expand on Leigh et al.'s work by proposing a new bull flag pattern which ameliorates some
22 | weaknesses in Leigh's original template. Additionally, they apply this bull flag labeling method to intraday
23 | candlestick data, rather than just closing prices.
24 |
25 | To find the total weight for a given day, the current price as well as the preceding ``window`` days of prices are
26 | used. The data window is split into 10 buckets, each containing a chronological tenth of the data window. Each point
27 | in a bucket is put into a decile corresponding to a position in a column, based on its percentile relative to the
28 | entire data window: bottom 10% on the lowest row, next 10% on the second lowest row, etc.
29 | The proportion of points in each decile is reported to finalize the column. The first tenth of the data is
30 | transformed to the leftmost column, the next tenth to the next column on the right and so on until finally a 10 by
31 | 10 matrix is achieved. This matrix is then multiplied element-wise with the 10 by 10 template, and the sum of all
32 | columns is the total weight for the day. If desired, the user can specify a threshold to determine positive and
33 | negative classes. The value of the threshold depends on how strict a classifier the user desires, and on the
34 | allowable values implied by the template matrix.
35 | """
36 |
37 | def __init__(self, prices, window, template_name=None):
38 | """
39 | :param prices: (pd.Series) Price data for one stock.
40 | :param window: (int) Length of preceding data window used when generating the fit matrix for one day.
41 | :param template_name: (str) Name of an available template in the template library. Allowable names:
42 | ``leigh_bear``, ``leigh_bull``, ``cervelloroyo_bear``, ``cervellororo_bull``.
43 | """
44 |
45 | pass
46 |
47 | def _init_template(self, name):
48 | """
49 | :param name: (str) Name of an available template in the template library. Allowable names: ``leigh_bear``,
50 | ``leigh_bull``, ``cervelloroyo_bear``, ``cervellororo_bull``.
51 | """
52 |
53 | pass
54 |
55 | def set_template(self, template):
56 | """
57 | :param template: (pd.DataFrame) Template to override the default template. Must be a 10 by 10 pd.DataFrame.
58 | NaN values not allowed, as they will not automatically be treated as zeros.
59 | """
60 |
61 | pass
62 |
63 | def _transform_data(self, row_num, window=30):
64 | """
65 | :param row_num: (int) Row number to use for the "current" data point to apply the window to. The data window
66 | contains the row corresponding to row_num, as well as the (self.window-1) preceding rows.
67 | :param window: (int) The number of rows preceding the current one to use for the window. In most cases this
68 | should be overridden with self.window.
69 | :return: (pd.DataFrame) Transformed 10 by 10 matrix, in which each column corresponds to a chronological tenth
70 | of the data window, and each row corresponds to a price decile relative to the entire data window.
71 | The template matrix is then applied to this output matrix.
72 | """
73 |
74 | pass
75 |
76 | def _apply_template_to_matrix(self, matrix, template):
77 | """
78 | :param matrix: (pd.DataFrame) Processed 10 by 10 matrix, where each column represents a chronological tenth
79 | of the data, and each row represents a decile relative to the entire data window.
80 | :param template: (pd.DataFrame) Template to apply the processed matrix to.
81 | :return: (float) The total score for the day. Consists of the sum of all elements of the result from
82 | multiplying the matrix element-wise with the template.
83 | """
84 |
85 | pass
86 |
87 | def apply_labeling_matrix(self, threshold=None):
88 | """
89 | :param threshold: (float) If None, labels will be returned numerically as the score for the day. If not None,
90 | then labels are returned categorically, with the positive category for labels that are equal to
91 | or exceed the threshold.
92 | :return: (pd.Series) Total scores for the data series on each eligible day (meaning for indices self.window and
93 | onwards).
94 | """
95 |
96 | pass
97 |
--------------------------------------------------------------------------------
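
To make the 10 by 10 construction concrete, here is a hypothetical sketch of the data transform and the
template application, assuming the window holds at least 10 observations. The row orientation (which
decile maps to which row) is a convention choice, and the library's exact implementation may differ.

import numpy as np
import pandas as pd

def fit_matrix_sketch(window_data: pd.Series) -> pd.DataFrame:
    """Columns are chronological tenths of the window; rows are price deciles."""
    matrix = np.zeros((10, 10))
    # Decile edges are computed over the *entire* data window.
    edges = np.quantile(window_data, np.linspace(0.1, 1.0, 10))
    tenths = np.array_split(window_data.to_numpy(), 10)  # chronological tenths
    for col, bucket in enumerate(tenths):
        rows = np.clip(np.searchsorted(edges, bucket), 0, 9)  # decile of each point
        for row in rows:
            matrix[row, col] += 1 / len(bucket)  # proportion of the bucket per decile
    return pd.DataFrame(matrix)

def total_weight_sketch(matrix: pd.DataFrame, template: pd.DataFrame) -> float:
    # Element-wise multiplication with the template, then sum of all elements.
    return float((matrix * template).sum().sum())
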
/mlfinlab/labeling/raw_return.py:
--------------------------------------------------------------------------------
1 | """
2 | Labeling Raw Returns.
3 |
4 | Most basic form of labeling based on raw return of each observation relative to its previous value.
5 | """
6 |
7 | import numpy as np
8 |
9 |
10 | def raw_return(prices, binary=False, logarithmic=False, resample_by=None, lag=True):
11 | """
12 | Raw returns labeling method.
13 |
14 | This is the most basic and ubiquitous labeling method used as a precursor to almost any kind of financial data
15 | analysis or machine learning. User can specify simple or logarithmic returns, numerical or binary labels, a
16 | resample period, and whether returns are lagged to be forward looking.
17 |
18 | :param prices: (pd.Series or pd.DataFrame) Time-indexed price data on stocks with which to calculate return.
19 | :param binary: (bool) If False, will return numerical returns. If True, will return the sign of the raw return.
20 | :param logarithmic: (bool) If False, will calculate simple returns. If True, will calculate logarithmic returns.
21 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per
22 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period.
23 | For full details see the pandas documentation
24 | on date offset aliases.
25 | :param lag: (bool) If True, returns will be lagged to make them forward-looking.
26 | :return: (pd.Series or pd.DataFrame) Raw returns on market data. User can specify whether returns will be based on
27 | simple or logarithmic return, and whether the output will be numerical or categorical.
28 | """
29 |
30 | pass
31 |
--------------------------------------------------------------------------------
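
A minimal sketch covering the simple/logarithmic and numerical/binary switches, assuming no resampling;
illustrative only.

import numpy as np
import pandas as pd

def raw_return_sketch(prices, binary=False, logarithmic=False, lag=True):
    returns = np.log(prices).diff() if logarithmic else prices.pct_change(fill_method=None)
    if lag:
        returns = returns.shift(-1)  # make the labels forward-looking
    return np.sign(returns) if binary else returns
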
/mlfinlab/labeling/return_vs_benchmark.py:
--------------------------------------------------------------------------------
1 | """
2 | Return in excess of a given benchmark.
3 |
4 | Chapter 5, Machine Learning for Factor Investing, by Coqueret and Guida, (2020).
5 |
6 | Work "Evaluating multiple classifiers for stock price direction prediction" by Ballings et al. (2015) uses this method
7 | to label yearly returns over a predetermined value to compare the performance of several machine learning algorithms.
8 | """
9 | import numpy as np
10 | import pandas as pd
11 |
12 |
13 | def return_over_benchmark(prices, benchmark=0, binary=False, resample_by=None, lag=True):
14 | """
15 | Return over benchmark labeling method. Sourced from Chapter 5.5.1 of Machine Learning for Factor Investing,
16 | by Coqueret, G. and Guida, T. (2020).
17 |
18 | Returns a Series or DataFrame of numerical or categorical returns over a given benchmark. The time index of the
19 | benchmark must match those of the price observations.
20 |
21 | :param prices: (pd.Series or pd.DataFrame) Time indexed prices to compare returns against a benchmark.
22 | :param benchmark: (pd.Series or float) Benchmark of returns to compare the returns from prices against for labeling.
23 | Can be a constant value, or a Series matching the index of prices. If no benchmark is given, then it
24 | is assumed to have a constant value of 0.
25 | :param binary: (bool) If False, labels are given by their numerical value of return over benchmark. If True,
26 | labels are given according to the sign of their excess return.
27 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per
28 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period.
29 | For full details see the pandas documentation
30 | on date offset aliases.
31 | :param lag: (bool) If True, returns will be lagged to make them forward-looking.
32 | :return: (pd.Series or pd.DataFrame) Excess returns over benchmark. If binary, the labels are -1 if the
33 | return is below the benchmark, 1 if above, and 0 if it exactly matches the benchmark.
34 | """
35 |
36 | pass
37 |
--------------------------------------------------------------------------------
/mlfinlab/labeling/tail_sets.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=missing-module-docstring
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | class TailSetLabels:
7 | """
8 | Tail set labels are a classification labeling technique introduced in: Huerta, R., Corbacho, F. and Elkan, C., 2013.
9 | "Nonlinear support vector machines can systematically identify stocks with high and low future returns." Algorithmic Finance, 2(1), pp.45-58.
10 |
11 | A tail set is defined to be a group of stocks whose volatility-adjusted return is in the highest or lowest
12 | quantile, for example the highest or lowest 5%.
13 |
14 | A classification model is then fit using these labels to determine which stocks to buy and sell in a long / short
15 | portfolio.
16 | """
17 |
18 | def __init__(self, prices, n_bins, vol_adj=None, window=None):
19 | """
20 | :param prices: (pd.DataFrame) Asset prices.
21 | :param n_bins: (int) Number of bins to determine the quantiles for defining the tail sets. The top and
22 | bottom quantiles are considered to be the positive and negative tail sets, respectively.
23 | :param vol_adj: (str) Whether to take volatility adjusted returns. Allowable inputs are ``None``,
24 | ``mean_abs_dev``, and ``stdev``.
25 | :param window: (int) Window period used in the calculation of the volatility adjusted returns, if vol_adj is not
26 | None. Has no impact if vol_adj is None.
27 | """
28 |
29 | pass
30 |
31 | def get_tail_sets(self):
32 | """
33 | Computes the tail sets (positive and negative) and then returns a tuple with 3 elements: the positive set, the
34 | negative set, and the full matrix set.
35 |
36 | The positive and negative sets are each a series of lists with the names of the securities that fall within each
37 | set at a specific timestamp.
38 |
39 | For the full matrix, a value of 1 indicates the volatility adjusted returns were in the top quantile, and a value
40 | of -1 that they were in the bottom quantile.
41 | :return: (tuple) positive set, negative set, full matrix set.
42 | """
43 |
44 | pass
45 |
46 | def _vol_adjusted_rets(self):
47 | """
48 | Computes the volatility adjusted returns. This is simply the log returns divided by a volatility estimate. We
49 | have provided 2 techniques for volatility estimation: the mean absolute deviation and the traditional standard
50 | deviation.
51 | """
52 |
53 | pass
54 |
55 | def _extract_tail_sets(self, row):
56 | """
57 | Method used in a .apply() setting to transform each row in a DataFrame to the positive and negative tail sets.
58 |
59 | This method splits the data into quantiles determined by the user, with n_bins.
60 |
61 | :param row: (pd.Series) Vol adjusted returns for a given date.
62 | :return: (pd.Series) Tail set with positive and negative labels.
63 | """
64 |
65 | pass
66 |
67 | @staticmethod
68 | def _positive_tail_set(row):
69 | """
70 | Takes as input a row from the vol_adj_ret DataFrame and then returns a list of names of the securities in the
71 | positive tail set, for this specific row date.
72 |
73 | This method is used in an apply() setting.
74 |
75 | :param row: (pd.Series) Labeled row of several stocks where each is already labeled with +1 (positive tail set),
76 | -1 (negative tail set), or 0.
77 | :return: (list) Securities in the positive tail set.
78 | """
79 |
80 | pass
81 |
82 | @staticmethod
83 | def _negative_tail_set(row):
84 | """
85 | Takes as input a row from the vol_adj_ret DataFrame and then returns a list of names of the securities in the
86 | negative tail set, for this specific row date.
87 |
88 | This method is used in an apply() setting.
89 |
90 | :param row: (pd.Series) Labeled row of several stocks where each is already labeled with +1 (positive tail set),
91 | -1 (negative tail set), or 0.
92 | :return: (list) Securities in the negative tail set.
93 | """
94 |
95 | pass
96 |
--------------------------------------------------------------------------------
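
A hypothetical sketch of the row-wise extraction step using pd.qcut: bin the cross-section of
vol-adjusted returns into n_bins quantiles and keep only the extreme bins.

import pandas as pd

def extract_tail_sets_sketch(row: pd.Series, n_bins: int) -> pd.Series:
    # Quantile-bin the vol-adjusted returns for one date; top bin -> +1, bottom bin -> -1.
    row_filtered = row.dropna()
    bins = pd.qcut(row_filtered, q=n_bins, labels=False, duplicates='drop')
    labels = pd.Series(0, index=row.index)
    labels[bins[bins == bins.max()].index] = 1
    labels[bins[bins == 0].index] = -1
    return labels
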
/mlfinlab/labeling/trend_scanning.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of Trend-Scanning labels described in
3 | "Advances in Financial Machine Learning: Lecture 3/10".
4 | """
5 |
6 | import pandas as pd
7 | import numpy as np
8 |
9 | from mlfinlab.structural_breaks.sadf import get_betas
10 |
11 |
12 | def trend_scanning_labels(price_series: pd.Series, t_events: list = None, observation_window: int = 20,
13 | look_forward: bool = True, min_sample_length: int = 5, step: int = 1) -> pd.DataFrame:
14 | """
15 | Trend scanning is both a classification and
16 | regression labeling technique.
17 |
18 | It can be used in the following ways:
19 |
20 | 1. Classification: By taking the sign of t-value for a given observation we can set {-1, 1} labels to define the
21 | trends as either downward or upward.
22 | 2. Classification: By adding a minimum t-value threshold you can generate {-1, 0, 1} labels for downward, no-trend,
23 | upward.
24 | 3. The t-values can be used as sample weights in classification problems.
25 | 4. Regression: The t-values can be used in a regression setting to determine the magnitude of the trend.
26 |
27 | The output of this algorithm is a DataFrame with t1 (time stamp for the farthest observation), t-value, returns for
28 | the trend, and bin.
29 |
30 | This function allows using both forward-looking and backward-looking windows (use the look_forward parameter).
31 |
32 | :param price_series: (pd.Series) Close prices used to label the data set
33 | :param t_events: (list) Filtered events, array of pd.Timestamps
34 | :param observation_window: (int) Maximum look forward window used to get the trend value
35 | :param look_forward: (bool) True if using a forward-looking window, False if using a backward-looking one
36 | :param min_sample_length: (int) Minimum sample length used to fit regression
37 | :param step: (int) Optimal t-value index is searched every 'step' indices
38 | :return: (pd.DataFrame) Consists of t1, t-value, ret, bin (label information). t1 - label end time, t_value - trend
39 | t-value, ret - price change %, bin - label value based on the sign of the price change.
40 | """
41 | # pylint: disable=invalid-name
42 |
43 | pass
44 |
--------------------------------------------------------------------------------
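
A sketch of the core computation under simplifying assumptions (forward-looking window only, prices
passed as a plain array): compute the t-statistic of an OLS slope for each admissible window length and
keep the one maximising |t|. Helper names are illustrative.

import numpy as np

def tvalue_of_slope(y: np.ndarray) -> float:
    """t-statistic of the OLS slope of y regressed on a linear time trend."""
    x = np.arange(y.shape[0], dtype=float)
    x_dev = x - x.mean()
    s_xx = (x_dev ** 2).sum()
    beta = (x_dev * (y - y.mean())).sum() / s_xx
    resid = y - (y.mean() + beta * x_dev)
    sigma2 = (resid ** 2).sum() / (y.shape[0] - 2)  # residual variance, n - 2 dof
    return beta / np.sqrt(sigma2 / s_xx)

def best_trend_window(prices: np.ndarray, min_len: int = 5, max_len: int = 20, step: int = 1):
    # Scan forward windows of increasing length; keep the |t|-maximising span.
    t_values = {span: tvalue_of_slope(prices[:span]) for span in range(min_len, max_len + 1, step)}
    span = max(t_values, key=lambda k: abs(t_values[k]))
    return span, t_values[span]
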
/mlfinlab/microstructural_features/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions derived from Chapter 19: Market Microstructural features
3 | """
4 |
5 | from mlfinlab.microstructural_features.encoding import quantile_mapping, encode_array, encode_tick_rule_array, \
6 | sigma_mapping
7 | from mlfinlab.microstructural_features.entropy import get_lempel_ziv_entropy, get_shannon_entropy, get_plug_in_entropy, \
8 | get_konto_entropy
9 | from mlfinlab.microstructural_features.feature_generator import MicrostructuralFeaturesGenerator
10 | from mlfinlab.microstructural_features.first_generation import get_corwin_schultz_estimator, get_roll_measure, \
11 | get_roll_impact, get_bekker_parkinson_vol
12 | from mlfinlab.microstructural_features.misc import get_avg_tick_size, vwap
13 | from mlfinlab.microstructural_features.second_generation import get_bar_based_kyle_lambda, get_bar_based_amihud_lambda, \
14 | get_bar_based_hasbrouck_lambda, get_trades_based_kyle_lambda, get_trades_based_amihud_lambda, \
15 | get_trades_based_hasbrouck_lambda
16 | from mlfinlab.microstructural_features.third_generation import get_vpin
17 |
--------------------------------------------------------------------------------
/mlfinlab/microstructural_features/encoding.py:
--------------------------------------------------------------------------------
1 | """
2 | Various functions for message encoding (quantile, sigma).
3 | """
4 | import numpy as np
5 |
6 |
7 | def encode_tick_rule_array(tick_rule_array: list) -> str:
8 | """
9 | Encode array of tick signs (-1, 1, 0)
10 |
11 | :param tick_rule_array: (list) Tick rules
12 | :return: (str) Encoded message
13 | """
14 |
15 | pass
16 |
17 |
18 | def _get_ascii_table() -> list:
19 | """
20 | Get all ASCII symbols
21 |
22 | :return: (list) ASCII symbols
23 | """
24 |
25 | pass
26 |
27 |
28 | def quantile_mapping(array: list, num_letters: int = 26) -> dict:
29 | """
30 | Generate dictionary of quantile-letters based on values from array and dictionary length (num_letters).
31 |
32 | :param array: (list) Values to split on quantiles
33 | :param num_letters: (int) Number of letters (quantiles) to encode
34 | :return: (dict) Dict of quantile-symbol
35 | """
36 |
37 | pass
38 |
39 |
40 | def sigma_mapping(array: list, step: float = 0.01) -> dict:
41 | """
42 | Generate dictionary of sigma encoded letters based on values from array and discretization step.
43 |
44 | :param array: (list) Values to discretize into sigma-sized steps
45 | :param step: (float) Discretization step (sigma)
46 | :return: (dict) Dict of value-symbol
47 | """
48 |
49 | pass
50 |
51 |
52 | def _find_nearest(array: list, value: float) -> float:
53 | """
54 | Find the nearest element from array to value.
55 |
56 | :param array: (list) Values
57 | :param value: (float) Value for which the nearest element needs to be found
58 | :return: (float) The element of array nearest to the value
59 | """
60 |
61 | pass
62 |
63 |
64 | def _get_letter_from_encoding(value: float, encoding_dict: dict) -> str:
65 | """
66 | Get letter for float/int value from encoding dict.
67 |
68 | :param value: (float/int) Value to use
69 | :param encoding_dict: (dict) Used dictionary
70 | :return: (str) Letter from encoding dict
71 | """
72 |
73 | pass
74 |
75 |
76 | def encode_array(array: list, encoding_dict: dict) -> str:
77 | """
78 | Encode array with strings using the encoding dict. In case of multiple occurrences of the minimum distance
79 | to an encoding value, the first occurrence is used.
80 |
81 | :param array: (list) Values to encode
82 | :param encoding_dict: (dict) Dict of quantile-symbol
83 | :return: (str) Encoded message
84 | """
85 |
86 | pass
87 |
--------------------------------------------------------------------------------
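
A minimal sketch of the quantile mapping, assuming letters are drawn from the ASCII alphabet (the
library's ASCII table may differ); ties between quantile edges would overwrite keys, which the sketch
ignores.

import string
import numpy as np

def quantile_mapping_sketch(array, num_letters: int = 26) -> dict:
    # Map the i-th quantile edge of the data to the i-th letter.
    letters = string.ascii_letters[:num_letters]
    return {np.quantile(array, (i + 1) / num_letters): letter
            for i, letter in enumerate(letters)}
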
/mlfinlab/microstructural_features/entropy.py:
--------------------------------------------------------------------------------
1 | """
2 | Entropy calculation module (Shannon, Lempel-Ziv, Plug-In, Konto)
3 | """
4 |
5 | import math
6 | from typing import Tuple
7 |
8 | import numpy as np
9 | from numba import njit
10 |
11 |
12 | def get_shannon_entropy(message: str) -> float:
13 | """
14 | Advances in Financial Machine Learning, page 263-264.
15 |
16 | Get Shannon entropy from message
17 |
18 | :param message: (str) Encoded message
19 | :return: (float) Shannon entropy
20 | """
21 |
22 | pass
23 |
24 |
25 | def get_lempel_ziv_entropy(message: str) -> float:
26 | """
27 | Advances in Financial Machine Learning, Snippet 18.2, page 266.
28 |
29 | Get Lempel-Ziv entropy estimate
30 |
31 | :param message: (str) Encoded message
32 | :return: (float) Lempel-Ziv entropy
33 | """
34 |
35 | pass
36 |
37 |
38 | def _prob_mass_function(message: str, word_length: int) -> dict:
39 | """
40 | Advances in Financial Machine Learning, Snippet 18.1, page 266.
41 |
42 | Compute probability mass function for a one-dimensional discrete random variable
43 |
44 | :param message: (str or array) Encoded message
45 | :param word_length: (int) Approximate word length
46 | :return: (dict) Dict of pmf for each word from message
47 | """
48 |
49 | pass
50 |
51 |
52 | def get_plug_in_entropy(message: str, word_length: int = None) -> float:
53 | """
54 | Advances in Financial Machine Learning, Snippet 18.1, page 265.
55 |
56 | Get Plug-in entropy estimator
57 |
58 | :param message: (str or array) Encoded message
59 | :param word_length: (int) Approximate word length
60 | :return: (float) Plug-in entropy
61 | """
62 |
63 | pass
64 |
65 |
66 | @njit()
67 | def _match_length(message: str, start_index: int, window: int) -> Tuple[int, str]: # pragma: no cover
68 | """
69 | Advances in Financial Machine Learning, Snippet 18.3, page 267.
70 |
71 | Function That Computes the Length of the Longest Match
72 |
73 | :param message: (str or array) Encoded message
74 | :param start_index: (int) Start index for search
75 | :param window: (int) Window length
76 | :return: (int, str) Match length and matched string
77 | """
78 |
79 | pass
80 |
81 |
82 | def get_konto_entropy(message: str, window: int = 0) -> float:
83 | """
84 | Advances in Financial Machine Learning, Snippet 18.4, page 268.
85 |
86 | Implementations of Algorithms Discussed in Gao et al. [2008]
87 |
88 | Get Kontoyiannis entropy
89 |
90 | :param message: (str or array) Encoded message
91 | :param window: (int) Expanding window length, can be negative
92 | :return: (float) Kontoyiannis entropy
93 | """
94 |
95 | pass
96 |
--------------------------------------------------------------------------------
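
Sketches of the two simplest estimators above: the Shannon entropy and the Lempel-Ziv estimate of AFML
Snippet 18.2. Names are illustrative.

import math

def shannon_entropy_sketch(message: str) -> float:
    # H = -sum over characters c of p(c) * log2 p(c).
    probs = [message.count(char) / len(message) for char in set(message)]
    return -sum(prob * math.log2(prob) for prob in probs)

def lempel_ziv_entropy_sketch(message: str) -> float:
    # Count non-redundant substrings in one left-to-right pass, divide by message length.
    i, library = 1, {message[0]}
    while i < len(message):
        j = i
        while j < len(message) and message[i:j + 1] in library:
            j += 1
        library.add(message[i:j + 1])
        i = j + 1
    return len(library) / len(message)
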
/mlfinlab/microstructural_features/feature_generator.py:
--------------------------------------------------------------------------------
1 | """
2 | Inter-bar feature generator which uses trades data and bars index to calculate inter-bar features
3 | """
4 |
5 | import pandas as pd
6 | import numpy as np
7 | from mlfinlab.microstructural_features.entropy import get_shannon_entropy, get_plug_in_entropy, get_lempel_ziv_entropy, \
8 | get_konto_entropy
9 | from mlfinlab.microstructural_features.encoding import encode_array
10 | from mlfinlab.microstructural_features.second_generation import get_trades_based_kyle_lambda, \
11 | get_trades_based_amihud_lambda, get_trades_based_hasbrouck_lambda
12 | from mlfinlab.microstructural_features.misc import get_avg_tick_size, vwap
13 | from mlfinlab.microstructural_features.encoding import encode_tick_rule_array
14 | from mlfinlab.util.misc import crop_data_frame_in_batches
15 |
16 |
17 | # pylint: disable=too-many-instance-attributes
18 |
19 | class MicrostructuralFeaturesGenerator:
20 | """
21 | Class which is used to generate inter-bar features when bars are already compressed.
22 |
23 | :param trades_input: (str or pd.DataFrame) Path to the csv file or Pandas DataFrame containing raw tick data
24 | in the format [date_time, price, volume]
25 | :param tick_num_series: (pd.Series) Series of tick number where bar was formed.
26 | :param batch_size: (int) Number of rows to read in from the csv, per batch.
27 | :param volume_encoding: (dict) Dictionary of encoding scheme for trades size used to calculate entropy on encoded messages
28 | :param pct_encoding: (dict) Dictionary of encoding scheme for log returns used to calculate entropy on encoded messages
29 |
30 | """
31 |
32 | def __init__(self, trades_input: (str, pd.DataFrame), tick_num_series: pd.Series, batch_size: int = 2e7,
33 | volume_encoding: dict = None, pct_encoding: dict = None):
34 | """
35 | Constructor
36 |
37 | :param trades_input: (str or pd.DataFrame) Path to the csv file or Pandas DataFrame containing raw tick data
38 | in the format [date_time, price, volume]
39 | :param tick_num_series: (pd.Series) Series of tick number where bar was formed.
40 | :param batch_size: (int) Number of rows to read in from the csv, per batch.
41 | :param volume_encoding: (dict) Dictionary of encoding scheme for trades size used to calculate entropy on encoded messages
42 | :param pct_encoding: (dict) Dictionary of encoding scheme for log returns used to calculate entropy on encoded messages
43 | """
44 |
45 |
46 | pass
47 |
48 | def get_features(self, verbose=True, to_csv=False, output_path=None):
49 | """
50 | Reads a csv file of ticks or pd.DataFrame in batches and then constructs corresponding microstructural inter-bar features:
51 | average tick size, tick rule sum, VWAP, Kyle lambda, Amihud lambda, Hasbrouck lambda, tick/volume/pct Shannon, Lempel-Ziv,
52 | Plug-in entropies if corresponding mapping dictionaries are provided (self.volume_encoding, self.pct_encoding).
53 | The csv file must have only 3 columns: date_time, price, & volume.
54 |
55 | :param verbose: (bool) Flag whether to print message on each processed batch or not
56 | :param to_csv: (bool) Flag for writing the results of bars generation to local csv file, or to in-memory DataFrame
57 | :param output_path: (str) Path to results file, if to_csv = True
58 | :return: (DataFrame or None) Microstructural features for bar index
59 | """
60 |
61 | pass
62 |
63 | def _reset_cache(self):
64 | """
65 | Reset price_diff, trade_size, tick_rule, log_ret arrays to empty when bar is formed and features are
66 | calculated
67 |
68 | :return: None
69 | """
70 |
71 | pass
72 |
73 | def _extract_bars(self, data):
74 | """
75 | For loop which calculates features for formed bars using trades data
76 |
77 | :param data: (tuple) Contains 3 columns - date_time, price, and volume.
78 | """
79 |
80 | pass
81 |
82 | def _get_bar_features(self, date_time: pd.Timestamp, list_bars: list) -> list:
83 | """
84 | Calculate inter-bar features: lambdas, entropies, avg_tick_size, vwap
85 |
86 | :param date_time: (pd.Timestamp) When bar was formed
87 | :param list_bars: (list) Previously formed bars
88 | :return: (list) Inter-bar features
89 | """
90 |
91 | pass
92 |
93 | def _apply_tick_rule(self, price: float) -> int:
94 | """
95 | Advances in Financial Machine Learning, page 29.
96 |
97 | Applies the tick rule
98 |
99 | :param price: (float) Price at time t
100 | :return: (int) The signed tick
101 | """
102 |
103 | pass
104 |
105 | def _get_price_diff(self, price: float) -> float:
106 | """
107 | Get price difference between ticks
108 |
109 | :param price: (float) Price at time t
110 | :return: (float) Price difference
111 | """
112 |
113 | pass
114 |
115 | def _get_log_ret(self, price: float) -> float:
116 | """
117 | Get log return between ticks
118 |
119 | :param price: (float) Price at time t
120 | :return: (float) Log return
121 | """
122 |
123 | pass
124 |
125 | @staticmethod
126 | def _assert_csv(test_batch):
127 | """
128 | Tests that the csv file read has the format: date_time, price, and volume.
129 | If not then the user needs to create such a file. This format is in place to remove any unwanted overhead.
130 |
131 | :param test_batch: (pd.DataFrame) the first row of the dataset.
132 | :return: (None)
133 | """
134 |
135 | pass
136 |
--------------------------------------------------------------------------------
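
A sketch of the tick rule referenced in _apply_tick_rule, with the usual convention of carrying the
previous signed tick forward on an unchanged price; the starting sign of +1 is an assumption.

def tick_rule_sketch(prices: list) -> list:
    signs, last_sign = [], 1  # conventionally start with +1 (an assumption)
    prev_price = prices[0]
    for price in prices:
        if price > prev_price:
            last_sign = 1
        elif price < prev_price:
            last_sign = -1
        signs.append(last_sign)  # unchanged price: previous sign carried forward
        prev_price = price
    return signs
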
/mlfinlab/microstructural_features/first_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | First generation features (Roll Measure/Impact, Corwin-Schultz spread estimator)
3 | """
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | def get_roll_measure(close_prices: pd.Series, window: int = 20) -> pd.Series:
10 | """
11 | Advances in Financial Machine Learning, page 282.
12 |
13 | Get Roll Measure
14 |
15 | The Roll Measure gives an estimate of the effective bid-ask spread
16 | without using quote data.
17 |
18 | :param close_prices: (pd.Series) Close prices
19 | :param window: (int) Estimation window
20 | :return: (pd.Series) Roll measure
21 | """
22 |
23 | pass
24 |
25 |
26 | def get_roll_impact(close_prices: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series:
27 | """
28 | Get Roll Impact.
29 |
30 | Derived from the Roll Measure, it takes into account the dollar volume traded.
31 |
32 | :param close_prices: (pd.Series) Close prices
33 | :param dollar_volume: (pd.Series) Dollar volume series
34 | :param window: (int) Estimation window
35 | :return: (pd.Series) Roll impact
36 | """
37 |
38 | pass
39 |
40 |
41 | # Corwin-Schultz algorithm
42 | def _get_beta(high: pd.Series, low: pd.Series, window: int) -> pd.Series:
43 | """
44 | Advances in Financial Machine Learning, Snippet 19.1, page 285.
45 |
46 | Get beta estimate from Corwin-Schultz algorithm
47 |
48 | :param high: (pd.Series) High prices
49 | :param low: (pd.Series) Low prices
50 | :param window: (int) Estimation window
51 | :return: (pd.Series) Beta estimates
52 | """
53 |
54 | pass
55 |
56 |
57 | def _get_gamma(high: pd.Series, low: pd.Series) -> pd.Series:
58 | """
59 | Advances in Financial Machine Learning, Snippet 19.1, page 285.
60 |
61 | Get gamma estimate from Corwin-Schultz algorithm.
62 |
63 | :param high: (pd.Series) High prices
64 | :param low: (pd.Series) Low prices
65 | :return: (pd.Series) Gamma estimates
66 | """
67 |
68 | pass
69 |
70 |
71 | def _get_alpha(beta: pd.Series, gamma: pd.Series) -> pd.Series:
72 | """
73 | Advances in Financial Machine Learning, Snippet 19.1, page 285.
74 |
75 | Get alpha from Corwin-Schultz algorithm.
76 |
77 | :param beta: (pd.Series) Beta estimates
78 | :param gamma: (pd.Series) Gamma estimates
79 | :return: (pd.Series) Alphas
80 | """
81 |
82 | pass
83 |
84 |
85 | def get_corwin_schultz_estimator(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series:
86 | """
87 | Advances in Financial Machine Learning, Snippet 19.1, page 285.
88 |
89 | Get Corwin-Schultz spread estimator using high-low prices
90 |
91 | :param high: (pd.Series) High prices
92 | :param low: (pd.Series) Low prices
93 | :param window: (int) Estimation window
94 | :return: (pd.Series) Corwin-Schultz spread estimators
95 | """
96 | # Note: S < 0 iff alpha < 0
97 |
98 | pass
99 |
100 |
101 | def get_bekker_parkinson_vol(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series:
102 | """
103 | Advances in Financial Machine Learning, Snippet 19.2, page 286.
104 |
105 | Get Bekker-Parkinson volatility from gamma and beta in Corwin-Schultz algorithm.
106 |
107 | :param high: (pd.Series) High prices
108 | :param low: (pd.Series) Low prices
109 | :param window: (int) Estimation window
110 | :return: (pd.Series) Bekker-Parkinson volatility estimates
111 | """
112 | # pylint: disable=invalid-name
113 |
114 | pass
115 |
--------------------------------------------------------------------------------
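
A minimal sketch of the Roll measure: twice the square root of the absolute first-order serial
covariance of price changes, estimated on a rolling window. Illustrative, not the library's exact code.

import numpy as np
import pandas as pd

def roll_measure_sketch(close: pd.Series, window: int = 20) -> pd.Series:
    price_diff = close.diff()
    # Roll (1984): spread ~ 2 * sqrt(|cov(dp_t, dp_{t-1})|).
    serial_cov = price_diff.rolling(window=window).cov(price_diff.shift(1))
    return 2 * np.sqrt(np.abs(serial_cov))
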
/mlfinlab/microstructural_features/misc.py:
--------------------------------------------------------------------------------
1 | """
2 | Various miscellaneous microstructural features (VWAP, average tick size)
3 | """
4 |
5 | import numpy as np
6 |
7 |
8 | def vwap(dollar_volume: list, volume: list) -> float:
9 | """
10 | Get Volume Weighted Average Price (VWAP).
11 |
12 | :param dollar_volume: (list) Dollar volumes
13 | :param volume: (list) Trades sizes
14 | :return: (float) VWAP value
15 | """
16 |
17 | pass
18 |
19 |
20 | def get_avg_tick_size(tick_size_arr: list) -> float:
21 | """
22 | Get average tick size in a bar.
23 |
24 | :param tick_size_arr: (list) Trade sizes
25 | :return: (float) Average trade size
26 | """
27 |
28 | pass
29 |
--------------------------------------------------------------------------------
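
Both helpers are one-liners; a sketch of the VWAP computation:

import numpy as np

def vwap_sketch(dollar_volume: list, volume: list) -> float:
    # VWAP = total dollars traded / total size traded.
    return np.sum(dollar_volume) / np.sum(volume)
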
/mlfinlab/microstructural_features/second_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | Second generation models features: Kyle lambda, Amihud Lambda, Hasbrouck lambda (bar and trade based)
3 | """
4 |
5 | from typing import List
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from mlfinlab.structural_breaks.sadf import get_betas
10 |
11 | # pylint: disable=invalid-name
12 | def get_bar_based_kyle_lambda(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series:
13 | """
14 | Advances in Financial Machine Learning, p. 286-288.
15 |
16 | Get Kyle lambda from bars data
17 |
18 | :param close: (pd.Series) Close prices
19 | :param volume: (pd.Series) Bar volume
20 | :param window: (int) Rolling window used for estimation
21 | :return: (pd.Series) Kyle lambdas
22 | """
23 |
24 | pass
25 |
26 |
27 | def get_bar_based_amihud_lambda(close: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series:
28 | """
29 | Advances in Financial Machine Learning, p.288-289.
30 |
31 | Get Amihud lambda from bars data
32 |
33 | :param close: (pd.Series) Close prices
34 | :param dollar_volume: (pd.Series) Dollar volumes
35 | :param window: (int) Rolling window used for estimation
36 | :return: (pd.Series) Amihud lambda
37 | """
38 |
39 | pass
40 |
41 | def get_bar_based_hasbrouck_lambda(close: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series:
42 | """
43 | Advances in Financial Machine Learning, p.289-290.
44 |
45 | Get Hasbrouck lambda from bars data
46 |
47 | :param close: (pd.Series) Close prices
48 | :param dollar_volume: (pd.Series) Dollar volumes
49 | :param window: (int) Rolling window used for estimation
50 | :return: (pd.Series) Hasbrouck lambda
51 | """
52 |
53 | pass
54 |
55 |
56 | def get_trades_based_kyle_lambda(price_diff: list, volume: list, aggressor_flags: list) -> List[float]:
57 | """
58 | Advances in Financial Machine Learning, p.286-288.
59 |
60 | Get Kyle lambda from trades data
61 |
62 | :param price_diff: (list) Price diffs
63 | :param volume: (list) Trades sizes
64 | :param aggressor_flags: (list) Trade directions [-1, 1] (tick rule or aggressor side can be used to define)
65 | :return: (list) Kyle lambda for a bar and t-value
66 | """
67 |
68 | pass
69 |
70 |
71 | def get_trades_based_amihud_lambda(log_ret: list, dollar_volume: list) -> List[float]:
72 | """
73 | Advances in Financial Machine Learning, p.288-289.
74 |
75 | Get Amihud lambda from trades data
76 |
77 | :param log_ret: (list) Log returns
78 | :param dollar_volume: (list) Dollar volumes (price * size)
79 | :return: (float) Amihud lambda for a bar
80 | """
81 |
82 | pass
83 |
84 |
85 | def get_trades_based_hasbrouck_lambda(log_ret: list, dollar_volume: list, aggressor_flags: list) -> List[float]:
86 | """
87 | Advances in Financial Machine Learning, p.289-290.
88 |
89 | Get Hasbrouck lambda from trades data
90 |
91 | :param log_ret: (list) Log returns
92 | :param dollar_volume: (list) Dollar volumes (price * size)
93 | :param aggressor_flags: (list) Trade directions [-1, 1] (tick rule or aggressor side can be used to define)
94 | :return: (list) Hasbrouck lambda for a bar and t value
95 | """
96 |
97 | pass
98 |
--------------------------------------------------------------------------------
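
A sketch of the simplest of the estimators above, the bar-based Amihud lambda: absolute return per unit
of dollar volume, smoothed over a rolling window. The trade-based versions are regression-based and are
not shown; this is illustrative, not the library's exact code.

import pandas as pd

def amihud_lambda_sketch(close: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series:
    abs_ret = close.pct_change().abs()  # Amihud (2002) illiquidity: |return| per dollar traded
    return (abs_ret / dollar_volume).rolling(window=window).mean()
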
/mlfinlab/microstructural_features/third_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | Third generation models implementation (VPIN)
3 | """
4 | import pandas as pd
5 |
6 |
7 | def get_vpin(volume: pd.Series, buy_volume: pd.Series, window: int = 1) -> pd.Series:
8 | """
9 | Advances in Financial Machine Learning, p. 292-293.
10 |
11 | Get Volume-Synchronized Probability of Informed Trading (VPIN) from bars
12 |
13 | :param volume: (pd.Series) Bar volume
14 | :param buy_volume: (pd.Series) Bar volume classified as buy (either tick rule, BVC or aggressor side methods applied)
15 | :param window: (int) Estimation window
16 | :return: (pd.Series) VPIN series
17 | """
18 |
19 | pass
20 |
--------------------------------------------------------------------------------
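
A minimal sketch of VPIN as described above: the absolute order-flow imbalance per bar, averaged over
the estimation window. Illustrative only.

import pandas as pd

def vpin_sketch(volume: pd.Series, buy_volume: pd.Series, window: int = 1) -> pd.Series:
    sell_volume = volume - buy_volume
    imbalance = (buy_volume - sell_volume).abs()  # |V_buy - V_sell| per bar
    return (imbalance / volume).rolling(window=window).mean()
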
/mlfinlab/multi_product/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Functionality relating to the ETF trick and stitching futures contracts together.
3 | """
4 |
5 | from mlfinlab.multi_product.etf_trick import (ETFTrick, get_futures_roll_series)
6 |
--------------------------------------------------------------------------------
/mlfinlab/networks/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Tools to visualise and filter networks of complex systems.
3 | """
4 |
5 | from mlfinlab.networks.dash_graph import DashGraph, PMFGDash
6 | from mlfinlab.networks.dual_dash_graph import DualDashGraph
7 | from mlfinlab.networks.graph import Graph
8 | from mlfinlab.networks.mst import MST
9 | from mlfinlab.networks.almst import ALMST
10 | from mlfinlab.networks.pmfg import PMFG
11 | from mlfinlab.networks.visualisations import (generate_mst_server, create_input_matrix, generate_almst_server,
12 | generate_mst_almst_comparison)
13 |
--------------------------------------------------------------------------------
/mlfinlab/networks/almst.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
3 | """
4 |
5 | import heapq
6 | import itertools
7 | from itertools import count
8 |
9 | import networkx as nx
10 | import numpy as np
11 | import pandas as pd
12 | from mlfinlab.networks.graph import Graph
13 |
14 |
15 | class ALMST(Graph):
16 | """
17 | ALMST is a subclass of Graph which creates an ALMST Graph object.
18 | The ALMST class converts a distance matrix input into an ALMST matrix. This is then used to create an nx.Graph object.
19 | """
20 |
21 | def __init__(self, matrix, matrix_type, mst_algorithm='kruskal'):
22 | """
23 | Initialises the ALMST and sets the self.graph attribute as the ALMST graph.
24 |
25 | :param matrix: (pd.Dataframe) Input matrix, such as a distance or correlation matrix.
26 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation").
27 | :param mst_algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim'.
28 | By default, the MST algorithm uses Kruskal's.
29 | """
30 |
31 | pass
32 |
33 | @staticmethod
34 | def create_almst_kruskals(matrix):
35 | """
36 | This method converts the input matrix into an ALMST matrix.
37 |
38 | Note: currently only works with a distance input matrix.
39 |
40 | :param matrix: (pd.Dataframe) Input matrix.
41 | :return: (pd.Dataframe) ALMST matrix with all other edges as 0 values.
42 | """
43 |
44 | pass
45 |
46 | @staticmethod
47 | def _generate_ordered_heap(matrix, clusters):
48 | """
49 | Given the matrix of edges and the list of clusters, generates a heap ordered by the average distance between the clusters.
50 |
51 | :param matrix: (pd.Dataframe) Input distance matrix.
52 | :param clusters: (List) A list of clusters, where each element is a list of nodes within the cluster.
53 | :return: (Heap) Returns a heap ordered by the average distance between the clusters.
54 | """
55 |
56 | pass
57 |
58 | @staticmethod
59 | def _calculate_average_distance(matrix, clusters, c_x, c_y):
60 | """
61 | Given two clusters, calculates the average distance between the two.
62 |
63 | :param matrix: (pd.Dataframe) Input matrix with all edges.
64 | :param clusters: (List) List of clusters.
65 | :param c_x: (int) Cluster x, where x is the index of the cluster.
66 | :param c_y: (int) Cluster y, where y is the index of the cluster.
67 | """
68 |
69 | pass
70 |
71 | @staticmethod
72 | def _get_min_edge(node, cluster, matrix):
73 | """
74 | Returns the minimum edge tuple given a node and a cluster.
75 |
76 | :param node: (str) String of the node name.
77 | :param cluster: (list) List of node names.
78 | :param matrix: (pd.DataFrame) A matrix of all edges.
79 | :return: (tuple) A tuple of average distance from node to the cluster, and the minimum edge nodes, i and j.
80 | """
81 |
82 | pass
83 |
84 | @staticmethod
85 | def _get_min_edge_clusters(cluster_one, cluster_two, matrix):
86 | """
87 | Returns a tuple of the minimum edge and the average length for two clusters.
88 |
89 | :param cluster_one: (list) List of node names.
90 | :param cluster_two: (list) List of node names.
91 | :param matrix: (pd.DataFrame) A matrix of all edges.
92 | :return: (tuple) A tuple of average distance between the clusters, and the minimum edge nodes, i and j.
93 | """
94 |
95 | pass
96 |
97 | @staticmethod
98 | def create_almst(matrix):
99 | """
100 | Creates and returns an ALMST given an input matrix, using Prim's algorithm.
101 |
102 | :param matrix: (pd.Dataframe) Input distance matrix of all edges.
103 | :return: (pd.Dataframe) Returns the ALMST in matrix format.
104 | """
105 |
106 | pass
107 |
108 | @staticmethod
109 | def _add_next_edge(visited, children, matrix, almst_matrix):
110 | """
111 | Adds the next edge with the minimum average distance.
112 |
113 | :param visited: (Set) A set of visited nodes.
114 | :param children: (Set) A set of children or frontier nodes, to be visited.
115 | :param matrix: (pd.Dataframe) Input distance matrix of all edges.
116 | :param almst_matrix: (pd.Dataframe) The ALMST matrix.
117 |
118 | :return: (Tuple) Returns the sets visited and children, and the matrix almst_matrix.
119 | """
120 |
121 | pass
122 |
--------------------------------------------------------------------------------
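
As a concrete reading of the _get_min_edge contract, a hypothetical sketch: scan the distances from node to every member of cluster and return the average together with the endpoints of the smallest edge (assuming matrix is a symmetric distance DataFrame):

    import numpy as np

    def min_edge_sketch(node, cluster, matrix):
        distances = [matrix.loc[node, other] for other in cluster]  # node-to-member distances
        nearest = cluster[int(np.argmin(distances))]                # endpoint of the minimum edge
        return np.mean(distances), node, nearest
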
/mlfinlab/networks/dual_dash_graph.py:
--------------------------------------------------------------------------------
1 | """
2 | This class takes in a Graph object and creates interactive visualisations using Plotly's Dash.
3 | The DualDashGraph class contains private functions used to generate the frontend components needed to create the UI.
4 |
5 | Running run_server() will produce the warning "Warning: This is a development server. Do not use app.run_server
6 | in production, use a production WSGI server like gunicorn instead.".
7 | This warning is expected; the Dash server will still run without a problem.
8 | """
9 |
10 | import dash_bootstrap_components as dbc
11 | import dash_cytoscape as cyto
12 | import dash_html_components as html
13 | from dash import Dash
14 | from dash.dependencies import Input, Output, State
15 | from jupyter_dash import JupyterDash
16 |
17 | class DualDashGraph:
18 | """
19 | The DualDashGraph class is the interface for comparing and highlighting the differences between two graphs.
20 | Two Graph class objects should be supplied - such as MST and ALMST graphs.
21 | """
22 |
23 | def __init__(self, graph_one, graph_two, app_display='default'):
24 | """
25 | Initialises the dual graph interface and generates the interface layout.
26 |
27 | :param graph_one: (Graph) The first graph for the comparison interface.
28 | :param graph_two: (Graph) The second graph for the comparison interface.
29 | :param app_display: (str) 'default' by default and 'jupyter notebook' for running Dash inside Jupyter Notebook.
30 | """
31 |
32 | pass
33 |
34 | @staticmethod
35 | def _select_other_graph_node(data, elements):
36 | """
37 | Callback function to select the other graph node when a graph node
38 | is selected by setting selected to True.
39 |
40 | :param data: (Dict) Dictionary of "tapped" or selected node.
41 | :param elements: (Dict) Dictionary of elements.
42 | :return: (Dict) Returns the updated dictionary of elements.
43 | """
44 |
45 | pass
46 |
47 | def _generate_comparison_layout(self, graph_one, graph_two):
48 | """
49 | Returns and generates a dual comparison layout.
50 |
51 | :param graph_one: (Graph) The first graph object for the dual interface.
52 | :param graph_two: (Graph) Comparison graph object for the dual interface.
53 | :return: (html.Div) Returns a Div containing the interface.
54 | """
55 |
56 | pass
57 |
58 | @staticmethod
59 | def _get_default_stylesheet(weights):
60 | """
61 | Returns the default stylesheet for initialisation.
62 |
63 | :param weights: (List) A list of weights of the edges.
64 | :return: (List) A List of definitions used for Dash styling.
65 | """
66 |
67 | pass
68 |
69 | def _set_cyto_graph(self):
70 | """
71 | Updates and sets the two cytoscape graphs using the corresponding components.
72 | """
73 |
74 | pass
75 |
76 | def _update_elements_dual(self, graph, difference, graph_number):
77 | """
78 | Updates the elements needed for the Dash Cytoscape Graph object.
79 |
80 | :param graph: (Graph) Graph object such as MST or ALMST.
81 | :param difference: (List) List of edges where the two graphs differ.
82 | :param graph_number: (Int) Graph number to update the correct graph.
83 | """
84 |
85 | pass
86 |
87 | def get_server(self):
88 | """
89 | Returns the comparison interface server
90 |
91 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
92 | Returns a Jupyter Dash object if DualDashGraph has been initialised for Jupyter Notebook.
93 | """
94 |
95 | pass
96 |
--------------------------------------------------------------------------------
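
The app_display parameter suggests the usual pattern of choosing between a standalone Dash app and a notebook-embedded one. A minimal sketch of that selection (make_app is hypothetical; the real class also wires in the layout and callbacks):

    from dash import Dash
    from jupyter_dash import JupyterDash

    def make_app(app_display: str = 'default'):
        # JupyterDash renders inside a notebook; Dash serves a standalone page.
        if app_display == 'jupyter notebook':
            return JupyterDash(__name__)
        return Dash(__name__)
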
/mlfinlab/networks/graph.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
3 | """
4 |
5 | from abc import ABC
6 |
7 | import networkx as nx
8 | from matplotlib import pyplot as plt
9 |
10 |
11 | class Graph(ABC):
12 | """
13 | This Graph class is a parent class for different types of graphs such as a MST.
14 | """
15 |
16 | def __init__(self, matrix_type):
17 | """
18 | Initializes the Graph object and the Graph class attributes.
19 | This includes the specific graph such as a MST stored as an attribute.
20 |
21 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation").
22 | """
23 |
24 | pass
25 |
26 | def get_matrix_type(self):
27 | """
28 | Returns the matrix type set at initialisation.
29 |
30 | :return: (str) String of matrix type (eg. "correlation" or "distance").
31 | """
32 |
33 | pass
34 |
35 | def get_graph(self):
36 | """
37 | Returns the Graph stored as an attribute.
38 |
39 | :return: (nx.Graph) Returns a NetworkX graph object.
40 | """
41 |
42 | pass
43 |
44 | def get_difference(self, input_graph_two):
45 | """
46 | Given two Graphs with the same nodes, returns the differences in edge connections.
47 |
48 | :param input_graph_two: (Graph) A graph to compare self.graph against.
49 | :return: (List) A list of unique tuples showing different edge connections.
50 | """
51 |
52 | pass
53 |
54 | def get_pos(self):
55 | """
56 | Returns the dictionary of node coordinates.
57 |
58 | :return: (Dict) Dictionary of node coordinates.
59 | """
60 |
61 | pass
62 |
63 | def get_graph_plot(self):
64 | """
65 | Returns the graph of the MST with labels.
66 | Assumes that the matrix contains stock names as headers.
67 |
68 | :return: (AxesSubplot) Axes with graph plot. Call plt.show() to display this graph.
69 | """
70 |
71 | pass
72 |
73 | def set_node_groups(self, industry_groups):
74 | """
75 | Sets the node industry group, by taking in a dictionary of industry group to a list of node indexes.
76 |
77 | :param industry_groups: (Dict) Dictionary of the industry name to a list of node indexes.
78 | """
79 |
80 | pass
81 |
82 | def set_node_size(self, market_caps):
83 | """
84 | Sets the node sizes, given a list of market cap values corresponding to node indexes.
85 |
86 | :param market_caps: (List) List of numbers corresponding to node indexes.
87 | """
88 |
89 | pass
90 |
91 | def get_node_sizes(self):
92 | """
93 | Returns the node sizes as a list.
94 |
95 | :return: (List) List of numbers representing node sizes.
96 | """
97 |
98 | pass
99 |
100 | def get_node_colours(self):
101 | """
102 | Returns a map of industry group matched with list of nodes.
103 |
104 | :return: (Dict) Dictionary of industry name to list of node indexes.
105 | """
106 |
107 | pass
108 |
--------------------------------------------------------------------------------
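
A plausible sketch of get_graph_plot built from the imports at the top of the file; the spring layout is an assumption, and the library may position nodes differently:

    import networkx as nx
    from matplotlib import pyplot as plt

    def graph_plot_sketch(nx_graph):
        axes = plt.gca()
        pos = nx.spring_layout(nx_graph, seed=0)  # deterministic node placement
        nx.draw(nx_graph, pos, ax=axes, with_labels=True, node_size=300, font_size=8)
        return axes  # call plt.show() to display the plot
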
/mlfinlab/networks/mst.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
3 | """
4 |
5 | import networkx as nx
6 | from mlfinlab.networks.graph import Graph
7 |
8 |
9 | class MST(Graph):
10 | """
11 | MST is a subclass of Graph which creates an MST Graph object.
12 | """
13 |
14 | def __init__(self, matrix, matrix_type, mst_algorithm='kruskal'):
15 | """
16 | Creates an MST Graph object and stores the MST inside the graph attribute.
17 |
18 | :param matrix: (pd.Dataframe) Input matrix, such as a distance or correlation matrix.
19 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation").
20 | :param mst_algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim', or 'boruvka'.
21 | By default, the MST algorithm uses Kruskal's.
22 | """
23 |
24 | pass
25 |
26 | @staticmethod
27 | def create_mst(matrix, algorithm='kruskal'):
28 | """
29 | This method converts the input matrix into an MST graph.
30 |
31 | :param matrix: (pd.Dataframe) Input matrix.
32 | :param algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim', or 'boruvka'.
33 | By default, the MST algorithm uses Kruskal's.
34 | """
35 |
36 | pass
37 |
--------------------------------------------------------------------------------
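
NetworkX implements all three algorithms named in the docstring, so create_mst plausibly reduces to nx.minimum_spanning_tree. A minimal sketch, assuming matrix is a symmetric distance DataFrame:

    import networkx as nx

    def create_mst_sketch(matrix, algorithm='kruskal'):
        graph = nx.from_pandas_adjacency(matrix)  # edge weights taken from the matrix entries
        return nx.minimum_spanning_tree(graph, weight='weight', algorithm=algorithm)

The algorithm argument accepts 'kruskal', 'prim', or 'boruvka', matching the docstring.
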
/mlfinlab/networks/pmfg.py:
--------------------------------------------------------------------------------
1 | """
2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
3 | """
4 |
5 | import heapq
6 | import itertools
7 | from itertools import count
8 | import warnings
9 |
10 | import networkx as nx
11 | from matplotlib import pyplot as plt
12 |
13 | from mlfinlab.networks.graph import Graph
14 |
15 |
16 | class PMFG(Graph):
17 | """
18 | PMFG class creates and stores the PMFG as an attribute.
19 | """
20 |
21 | def __init__(self, input_matrix, matrix_type):
22 | """
23 | PMFG class creates the Planar Maximally Filtered Graph and stores it as an attribute.
24 |
25 | :param input_matrix: (pd.Dataframe) Input distance matrix
26 | :param matrix_type: (str) Matrix type name (e.g. "distance").
27 | """
28 |
29 | pass
30 |
31 | def get_disparity_measure(self):
32 | """
33 | Getter method for the dictionary of disparity measure values of cliques.
34 |
35 | :return: (Dict) Returns a dictionary of clique to the disparity measure.
36 | """
37 |
38 | pass
39 |
40 | def _calculate_disparity(self):
41 | """
42 | Calculate disparity given in Tumminello M, Aste T, Di Matteo T, Mantegna RN.
43 | A tool for filtering information in complex systems.
44 | https://arxiv.org/pdf/cond-mat/0501335.pdf
45 |
46 | :return: (Dict) Returns a dictionary of clique to the disparity measure.
47 | """
48 |
49 | pass
50 |
51 | def _generate_cliques(self):
52 | """
53 | Generate cliques from all of the nodes in the PMFG.
54 | """
55 |
56 | pass
57 |
58 | def create_pmfg(self, input_matrix):
59 | """
60 | Creates the PMFG from the input matrix of all edges.
61 |
62 | :param input_matrix: (pd.Dataframe) Input matrix with all edges
63 | :return: (nx.Graph) Output PMFG graph
64 | """
65 |
66 | pass
67 |
68 | def get_mst_edges(self):
69 | """
70 | Returns the list of MST edges.
71 |
72 | :return: (list) Returns a list of tuples of edges.
73 | """
74 |
75 | pass
76 |
77 | def edge_in_mst(self, node1, node2):
78 | """
79 | Checks whether the edge from node1 to node2 is a part of the MST.
80 |
81 | :param node1: (str) Name of the first node in the edge.
82 | :param node2: (str) Name of the second node in the edge.
83 | :return: (bool) Returns true if the edge is in the MST. False otherwise.
84 | """
85 |
86 | pass
87 |
88 | def get_graph_plot(self):
89 | """
90 | Overrides parent get_graph_plot to plot it in a planar format.
91 |
92 | Returns the graph of the MST with labels.
93 | Assumes that the matrix contains stock names as headers.
94 |
95 | :return: (AxesSubplot) Axes with graph plot. Call plt.show() to display this graph.
96 | """
97 |
98 | pass
99 |
--------------------------------------------------------------------------------
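
The standard PMFG construction (Tumminello et al.) sorts edges by ascending distance and keeps each one only if the graph stays planar, stopping once the 3(n - 2) edges a planar maximal graph can hold are present. A hypothetical sketch using nx.check_planarity:

    import networkx as nx

    def pmfg_sketch(dist_df):
        # All upper-triangle edges, sorted from smallest to largest distance.
        edges = sorted((dist_df.loc[u, v], u, v)
                       for i, u in enumerate(dist_df.index)
                       for v in dist_df.columns[i + 1:])
        pmfg = nx.Graph()
        pmfg.add_nodes_from(dist_df.index)
        for weight, u, v in edges:
            pmfg.add_edge(u, v, weight=weight)
            if not nx.check_planarity(pmfg)[0]:
                pmfg.remove_edge(u, v)          # reject edges that break planarity
            if pmfg.number_of_edges() == 3 * (len(dist_df.index) - 2):
                break                           # a PMFG has exactly 3(n - 2) edges
        return pmfg
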
/mlfinlab/networks/visualisations.py:
--------------------------------------------------------------------------------
1 | """
2 | These methods allow the user to easily deploy graph visualisations given an input dataframe.
3 | """
4 |
5 | import warnings
6 | import networkx as nx
7 |
8 | from mlfinlab.networks.dash_graph import DashGraph, PMFGDash
9 | from mlfinlab.networks.dual_dash_graph import DualDashGraph
10 | from mlfinlab.networks.mst import MST
11 | from mlfinlab.networks.almst import ALMST
12 | from mlfinlab.networks.pmfg import PMFG
13 | from mlfinlab.codependence import get_distance_matrix
14 |
15 |
16 | def generate_mst_server(log_returns_df, mst_algorithm='kruskal', distance_matrix_type='angular',
17 | jupyter=False, colours=None, sizes=None):
18 | """
19 | This method returns a Dash server ready to be run.
20 |
21 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
22 | with stock names as columns.
23 | :param mst_algorithm: (str) A valid MST type such as 'kruskal', 'prim', or 'boruvka'.
24 | :param distance_matrix_type: (str) A valid subtype of a distance matrix,
25 | namely 'angular', 'abs_angular', 'squared_angular'.
26 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
27 | :param colours: (Dict) A dictionary with category names as keys and lists of node indexes
28 | (as inputted in the initial dataframe) as values.
29 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
30 | in the initial dataframe.
31 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
32 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
33 | """
34 |
35 | pass
36 |
37 |
38 | def create_input_matrix(log_returns_df, distance_matrix_type):
39 | """
40 | This method returns the distance matrix ready to be inputted into the Graph class.
41 |
42 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
43 | with stock names as columns.
44 | :param distance_matrix_type: (str) A valid subtype of a distance matrix,
45 | namely 'angular', 'abs_angular', 'squared_angular'.
46 | :return: (pd.Dataframe) A dataframe of a distance matrix.
47 | """
48 |
49 | pass
50 |
51 |
52 | def generate_almst_server(log_returns_df, distance_matrix_type='angular',
53 | jupyter=False, colours=None, sizes=None):
54 | """
55 | This method returns a Dash server ready to be run.
56 |
57 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
58 | with stock names as columns.
59 | :param distance_matrix_type: (str) A valid subtype of a distance matrix,
60 | namely 'angular', 'abs_angular', 'squared_angular'.
61 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
62 | :param colours: (Dict) A dictionary with category names as keys and lists of node indexes
63 | (as inputted in the initial dataframe) as values.
64 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
65 | in the initial dataframe.
66 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
67 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
68 | """
69 |
70 | pass
71 |
72 |
73 | def generate_mst_almst_comparison(log_returns_df, distance_matrix_type='angular', jupyter=False):
74 | """
75 | This method returns a Dash server ready to be run.
76 |
77 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
78 | with stock names as columns.
79 | :param distance_matrix_type: (str) A valid subtype of a distance matrix,
80 | namely 'angular', 'abs_angular', 'squared_angular'.
81 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
82 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
83 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
84 | """
85 |
86 | pass
87 |
88 |
89 | def generate_pmfg_server(log_returns_df, input_type='distance',
90 | jupyter=False, colours=None, sizes=None):
91 | """
92 | This method returns a PMFGDash server ready to be run.
93 |
94 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
95 | with stock names as columns.
96 | :param input_type: (str) A valid input type, 'correlation' or 'distance'. A correlation input adds the edges
97 | from largest to smallest, instead of smallest to largest.
98 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
99 | :param colours: (Dict) A dictionary with category names as keys and lists of node indexes
100 | (as inputted in the initial dataframe) as values.
101 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
102 | in the initial dataframe.
103 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
104 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
105 | """
106 |
107 | pass
108 |
109 |
110 | def generate_central_peripheral_ranking(nx_graph):
111 | """
112 | Given a NetworkX graph, this method generates and returns a ranking of centrality.
113 | The input should be a distance based PMFG.
114 |
115 | The ranking combines multiple centrality measures to calculate an overall ranking of how central or peripheral the
116 | nodes are.
117 | The smaller the ranking, the more peripheral the node is. The larger the ranking, the more central the node is.
118 |
119 | The factors contributing to the ranking include Degree, Eccentricity, Closeness Centrality, Second Order Centrality,
120 | Eigenvector Centrality and Betweenness Centrality. The formulas for these measures can be found in the NetworkX
121 | documentation (https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html).
122 |
123 | :param nx_graph: (nx.Graph) NetworkX graph object. You can call get_graph() on the MST, ALMST and PMFG to retrieve
124 | the nx.Graph.
125 | :return: (List) Returns a list of tuples of ranking value to node.
126 | """
127 |
128 | pass
--------------------------------------------------------------------------------
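
For reference, the angular distance behind create_input_matrix is d_ij = sqrt(0.5 * (1 - rho_ij)), with rho the correlation of log returns; the module imports get_distance_matrix from mlfinlab.codependence for this step. A minimal sketch of the 'angular' case:

    import numpy as np

    def input_matrix_sketch(log_returns_df):
        corr = log_returns_df.corr()       # correlation of log returns
        return np.sqrt(0.5 * (1 - corr))   # angular distance matrix
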
/mlfinlab/regression/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of historically weighted regression method based on relevance.
3 | """
4 |
5 | from mlfinlab.regression.history_weight_regression import HistoryWeightRegression
6 |
--------------------------------------------------------------------------------
/mlfinlab/sample_weights/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the code for implementing sample weights and stacked sample weights.
3 | """
4 |
5 | from mlfinlab.sample_weights.attribution import (get_weights_by_time_decay, get_weights_by_return,
6 | _apply_weight_by_return, get_stacked_weights_time_decay,
7 | get_stacked_weights_by_return)
8 |
--------------------------------------------------------------------------------
/mlfinlab/sample_weights/attribution.py:
--------------------------------------------------------------------------------
1 | """
2 | Logic regarding return and time decay attribution for sample weights from chapter 4,
3 | as well as stacked sample weights logic: return- and time-based sample weights for a multi-asset dataset.
4 | """
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from mlfinlab.sampling.concurrent import (num_concurrent_events, get_av_uniqueness_from_triple_barrier)
10 | from mlfinlab.util.multiprocess import mp_pandas_obj
11 |
12 | def _apply_weight_by_return(label_endtime, num_conc_events, close_series, molecule):
13 | """
14 | Advances in Financial Machine Learning, Snippet 4.10, page 69.
15 |
16 | Determination of Sample Weight by Absolute Return Attribution
17 |
18 | Derives sample weights based on concurrency and return. Works on a set of
19 | datetime index values (molecule). This allows the program to parallelize the processing.
20 |
21 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events).
22 | :param num_conc_events: (pd.Series) Number of concurrent labels (output from num_concurrent_events function).
23 | :param close_series: (pd.Series) Close prices.
24 | :param molecule: (an array) A set of datetime index values for processing.
25 | :return: (pd.Series) Sample weights based on return and concurrency for molecule.
26 | """
27 |
28 | pass
29 |
30 |
31 | def get_weights_by_return(triple_barrier_events, close_series, num_threads=5, verbose=True):
32 | """
33 | Advances in Financial Machine Learning, Snippet 4.10(part 2), page 69.
34 |
35 | Determination of Sample Weight by Absolute Return Attribution
36 |
37 | This function is the orchestrator for generating sample weights based on return, using mp_pandas_obj.
38 |
39 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events().
40 | :param close_series: (pd.Series) Close prices.
41 | :param num_threads: (int) The number of threads concurrently used by the function.
42 | :param verbose: (bool) Flag to report progress on asynch jobs.
43 | :return: (pd.Series) Sample weights based on return and concurrency.
44 | """
45 |
46 | pass
47 |
48 |
49 | def get_weights_by_time_decay(triple_barrier_events, close_series, num_threads=5, decay=1, verbose=True):
50 | """
51 | Advances in Financial Machine Learning, Snippet 4.11, page 70.
52 |
53 | Implementation of Time Decay Factors.
54 |
55 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events().
56 | :param close_series: (pd.Series) Close prices.
57 | :param num_threads: (int) The number of threads concurrently used by the function.
58 | :param decay: (float) Decay factor
59 | - decay = 1 means there is no time decay;
60 | - 0 < decay < 1 means that weights decay linearly over time, but every observation still receives a strictly positive weight, regardless of how old;
61 | - decay = 0 means that weights converge linearly to zero, as they become older;
62 | - decay < 0 means that the oldest portion c of the observations receives zero weight (i.e. they are erased from memory).
63 | :param verbose: (bool) Flag to report progress on asynch jobs.
64 | :return: (pd.Series) Sample weights based on time decay factors.
65 | """
66 |
67 | pass
68 |
69 |
70 | def get_stacked_weights_by_return(triple_barrier_events_dict: dict, close_series_dict: dict, num_threads: int = 5,
71 | verbose: bool = True) -> dict:
72 | """
73 | Get return-based sample weights for a multi-asset dataset. The function applies mlfinlab's get_weights_by_return
74 | function to a multi-asset dataset.
75 |
76 | :param triple_barrier_events_dict: (dict) Dictionary of asset_name: triple barrier event series.
77 | :param close_series_dict: (dict) Dictionary of asset_name: close series used to form label events.
78 | :param num_threads: (int) Number of threads used to get sample weights.
79 | :param verbose: (bool) Flag to report progress on asynch jobs.
80 | :return: (dict) Dictionary of asset_name: sample weight series.
81 | """
82 |
83 | pass
84 |
85 |
86 | def get_stacked_weights_time_decay(triple_barrier_events_dict: dict, close_series_dict: dict, decay: float = 0.5,
87 | num_threads: int = 5,
88 | verbose: bool = True) -> dict:
89 | """
90 | Get time-decay-based sample weights for a multi-asset dataset. The function applies mlfinlab's get_weights_by_time_decay
91 | function to a multi-asset dataset.
92 |
93 | :param triple_barrier_events_dict: (dict) Dictionary of asset_name: triple barrier event series.
94 | :param close_series_dict: (dict) Dictionary of asset_name: close series used to form label events.
95 | :param decay: (float) Decay factor
96 | - decay = 1 means there is no time decay;
97 | - 0 < decay < 1 means that weights decay linearly over time, but every observation still receives a strictly positive weight, regardless of how old;
98 | - decay = 0 means that weights converge linearly to zero, as they become older;
99 | - decay < 0 means that the oldest portion c of the observations receives zero weight (i.e. they are erased from memory).
100 | :param num_threads: (int) Number of threads used to get sample weights.
101 | :param verbose: (bool) Flag to report progress on asynch jobs.
102 | :return: (dict) Dictionary of asset_name: sample weight series.
103 | """
104 |
105 | pass
106 |
--------------------------------------------------------------------------------
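
To ground the decay parameter description, a sketch of AFML Snippet 4.11: weights decay linearly in cumulative uniqueness rather than chronological time. It assumes av_uniqueness is the per-event average uniqueness series (e.g. from get_av_uniqueness_from_triple_barrier); time_decay_sketch is illustrative:

    def time_decay_sketch(av_uniqueness, decay=1):
        # The newest observation ends at weight 1; older ones decay in cumulative uniqueness.
        clw = av_uniqueness.sort_index().cumsum()
        if decay >= 0:
            slope = (1 - decay) / clw.iloc[-1]
        else:
            slope = 1 / ((decay + 1) * clw.iloc[-1])
        const = 1 - slope * clw.iloc[-1]
        weights = const + slope * clw
        weights[weights < 0] = 0  # decay < 0 erases the oldest portion entirely
        return weights
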
/mlfinlab/sampling/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the logic regarding the sequential bootstrapping from chapter 4, as well as the concurrent labels.
3 | """
4 |
5 | from mlfinlab.sampling.bootstrapping import (get_ind_matrix, get_ind_mat_average_uniqueness, seq_bootstrap,
6 | get_ind_mat_label_uniqueness)
7 | from mlfinlab.sampling.concurrent import (num_concurrent_events, _get_average_uniqueness,
8 | get_av_uniqueness_from_triple_barrier)
9 |
--------------------------------------------------------------------------------
/mlfinlab/sampling/bootstrapping.py:
--------------------------------------------------------------------------------
1 | """
2 | Logic regarding sequential bootstrapping from chapter 4.
3 | """
4 |
5 | import pandas as pd
6 | import numpy as np
7 | from numba import jit, prange
8 |
9 |
10 | def get_ind_matrix(samples_info_sets, price_bars):
11 | """
12 | Advances in Financial Machine Learning, Snippet 4.3, page 65.
13 |
14 | Build an Indicator Matrix
15 |
16 | Get indicator matrix. The book implementation uses bar_index as input; however, there is no explanation
17 | of how to form it. We decided that using triple_barrier_events and price bars, by analogy with concurrency,
18 | is the best option.
19 |
20 | :param samples_info_sets: (pd.Series) Triple barrier events (t1) from labeling.get_events
21 | :param price_bars: (pd.DataFrame) Price bars which were used to form triple barrier events
22 | :return: (np.array) Indicator binary matrix indicating what (price) bars influence the label for each observation
23 | """
24 |
25 | pass
26 |
27 |
28 | def get_ind_mat_average_uniqueness(ind_mat):
29 | """
30 | Advances in Financial Machine Learning, Snippet 4.4, page 65.
31 |
32 | Compute Average Uniqueness
33 |
34 | Average uniqueness from indicator matrix
35 |
36 | :param ind_mat: (np.matrix) Indicator binary matrix
37 | :return: (float) Average uniqueness
38 | """
39 |
40 | pass
41 |
42 |
43 | def get_ind_mat_label_uniqueness(ind_mat):
44 | """
45 | Advances in Financial Machine Learning, an adaptation of Snippet 4.4, page 65.
46 |
47 | Returns the indicator matrix element uniqueness.
48 |
49 | :param ind_mat: (np.matrix) Indicator binary matrix
50 | :return: (np.matrix) Element uniqueness
51 | """
52 |
53 | pass
54 |
55 |
56 | @jit(parallel=True, nopython=True)
57 | def _bootstrap_loop_run(ind_mat, prev_concurrency): # pragma: no cover
58 | """
59 | Part of the Sequential Bootstrapping for-loop. Using the previously accumulated concurrency array, loops through all samples
60 | and generates the average uniqueness array of labels based on the previously accumulated concurrency.
61 |
62 | :param ind_mat: (np.array) Indicator matrix from the get_ind_matrix function
63 | :param prev_concurrency: (np.array) Accumulated concurrency from previous iterations of sequential bootstrapping
64 | :return: (np.array) Label average uniqueness based on prev_concurrency
65 | """
66 |
67 | pass
68 |
69 |
70 | def seq_bootstrap(ind_mat, sample_length=None, warmup_samples=None, compare=False, verbose=False,
71 | random_state=np.random.RandomState()):
72 | """
73 | Advances in Financial Machine Learning, Snippet 4.5, Snippet 4.6, page 65.
74 |
75 | Return Sample from Sequential Bootstrap
76 |
77 | Generate a sample via sequential bootstrap.
78 | Note: Moved from pd.DataFrame to np.matrix for performance increase
79 |
80 | :param ind_mat: (np.matrix) Indicator matrix from triple barrier events
81 | :param sample_length: (int) Length of bootstrapped sample
82 | :param warmup_samples: (list) List of previously drawn samples
83 | :param compare: (bool) Flag to print standard bootstrap uniqueness vs sequential bootstrap uniqueness
84 | :param verbose: (bool) Flag to print updated probabilities on each step
85 | :param random_state: (np.random.RandomState) Random state
86 | :return: (array) Bootstrapped samples indexes
87 | """
88 |
89 | pass
90 |
--------------------------------------------------------------------------------
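
A sketch of the average uniqueness computation (Snippet 4.4), assuming ind_mat has shape (n_bars, n_labels) as produced by get_ind_matrix:

    import numpy as np

    def average_uniqueness_sketch(ind_mat):
        concurrency = ind_mat.sum(axis=1)                            # labels active at each bar
        active = concurrency > 0                                     # skip bars with no labels
        uniqueness = ind_mat[active] / concurrency[active][:, None]  # per-bar uniqueness
        return uniqueness[ind_mat[active] > 0].mean()                # mean over active entries
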
/mlfinlab/sampling/concurrent.py:
--------------------------------------------------------------------------------
1 | """
2 | Logic regarding concurrent labels from chapter 4.
3 | """
4 |
5 | import pandas as pd
6 |
7 | from mlfinlab.util.multiprocess import mp_pandas_obj
8 |
9 |
10 | def num_concurrent_events(close_series_index, label_endtime, molecule):
11 | """
12 | Advances in Financial Machine Learning, Snippet 4.1, page 60.
13 |
14 | Estimating the Uniqueness of a Label
15 |
16 | This function uses close series prices and label endtime (when the first barrier is touched) to compute the number
17 | of concurrent events per bar.
18 |
19 | :param close_series_index: (pd.Series) Close prices index
20 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events)
21 | :param molecule: (an array) A set of datetime index values for processing
22 | :return: (pd.Series) Number of concurrent labels for each datetime index
23 | """
24 |
25 | pass
26 |
27 |
28 | def _get_average_uniqueness(label_endtime, num_conc_events, molecule):
29 | """
30 | Advances in Financial Machine Learning, Snippet 4.2, page 62.
31 |
32 | Estimating the Average Uniqueness of a Label
33 |
34 | This function uses label endtime (when the first barrier is touched) and the number of concurrent events
35 | to compute the average uniqueness of each label over its lifespan.
36 |
37 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events)
38 | :param num_conc_events: (pd.Series) Number of concurrent labels (output from num_concurrent_events function).
39 | :param molecule: (an array) A set of datetime index values for processing.
40 | :return: (pd.Series) Average uniqueness over event's lifespan.
41 | """
42 |
43 | pass
44 |
45 |
46 | def get_av_uniqueness_from_triple_barrier(triple_barrier_events, close_series, num_threads, verbose=True):
47 | """
48 | This function is the orchestrator to derive average sample uniqueness from a dataset labeled by the triple barrier
49 | method.
50 |
51 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events()
52 | :param close_series: (pd.Series) Close prices.
53 | :param num_threads: (int) The number of threads concurrently used by the function.
54 | :param verbose: (bool) Flag to report progress on asynch jobs
55 | :return: (pd.Series) Average uniqueness over event's lifespan for each index in triple_barrier_events
56 | """
57 |
58 | pass
59 |
--------------------------------------------------------------------------------
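
A sketch of the counting step in num_concurrent_events, assuming label_endtime is a pd.Series mapping each event's start time to its end time (t1):

    import pandas as pd

    def num_concurrent_events_sketch(close_series_index, label_endtime):
        count = pd.Series(0, index=close_series_index)
        for t_in, t_out in label_endtime.items():  # each label spans [t_in, t_out]
            count.loc[t_in:t_out] += 1             # every bar in that span gains one concurrent label
        return count
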
/mlfinlab/structural_breaks/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Structural breaks test (CUSUM, Chow, SADF).
3 | """
4 |
5 | from mlfinlab.structural_breaks.chow import get_chow_type_stat
6 | from mlfinlab.structural_breaks.cusum import get_chu_stinchcombe_white_statistics
7 | from mlfinlab.structural_breaks.sadf import get_sadf
8 |
--------------------------------------------------------------------------------
/mlfinlab/structural_breaks/chow.py:
--------------------------------------------------------------------------------
1 | """
2 | Explosiveness tests: Chow-Type Dickey-Fuller Test
3 | """
4 |
5 | import pandas as pd
6 | from mlfinlab.structural_breaks.sadf import get_betas
7 | from mlfinlab.util import mp_pandas_obj
8 |
9 |
10 | # pylint: disable=invalid-name
11 |
12 | def _get_dfc_for_t(series: pd.Series, molecule: list) -> pd.Series:
13 | """
14 | Get Chow-Type Dickey-Fuller Test statistics for each index in molecule
15 |
16 | :param series: (pd.Series) Series to test
17 | :param molecule: (list) Dates to test
18 | :return: (pd.Series) Statistics for each index from molecule
19 | """
20 |
21 | pass
22 |
23 |
24 | def get_chow_type_stat(series: pd.Series, min_length: int = 20, num_threads: int = 8, verbose: bool = True) -> pd.Series:
25 | """
26 | Multithread implementation of Chow-Type Dickey-Fuller Test, p.251-252
27 |
28 | :param series: (pd.Series) Series to test
29 | :param min_length: (int) Minimum sample length used to estimate statistics
30 | :param num_threads: (int) Number of cores to use
31 | :param verbose: (bool) Flag to report progress on asynch jobs
32 | :return: (pd.Series) Chow-Type Dickey-Fuller Test statistics
33 | """
34 |
35 | pass
36 |
--------------------------------------------------------------------------------
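
The Chow-type test fits Δy_t = δ·y_{t-1}·D_t[τ*] + ε_t, where the dummy D switches on at the candidate break date, and DFC is the t-ratio of δ. A hypothetical single-date sketch (the library loops this over molecule and routes the regression through get_betas):

    import numpy as np
    import pandas as pd

    def dfc_sketch(series: pd.Series, break_date) -> float:
        y_diff = series.diff().dropna()
        y_lag = series.shift(1).loc[y_diff.index]
        dummy = (y_diff.index >= break_date).astype(float)  # 0 before the break, 1 from it on
        x = (y_lag * dummy).values
        y = y_diff.values
        beta = (x @ y) / (x @ x)                            # single-regressor OLS
        residuals = y - beta * x
        beta_var = (residuals @ residuals / (len(y) - 1)) / (x @ x)
        return beta / np.sqrt(beta_var)
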
/mlfinlab/structural_breaks/cusum.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of Chu-Stinchcombe-White test
3 | """
4 |
5 | import pandas as pd
6 | import numpy as np
7 | from mlfinlab.util import mp_pandas_obj
8 |
9 |
10 | def _get_values_diff(test_type, series, index, ind):
11 | """
12 | Gets the difference between two values given a test type.
13 | :param test_type: (str) Type of the test ['one_sided', 'two_sided']
14 | :param series: (pd.Series) Series of values
15 | :param index: (pd.Index) Primary index
16 | :param ind: (pd.Index) Secondary index
17 | :return: (float) Difference between 2 values
18 | """
19 |
20 | pass
21 |
22 |
23 | def _get_s_n_for_t(series: pd.Series, test_type: str, molecule: list) -> pd.Series:
24 | """
25 | Get maximum S_n_t value for each value from molecule for Chu-Stinchcombe-White test
26 |
27 | :param series: (pd.Series) Series to get statistics for
28 | :param test_type: (str) Two-sided or one-sided test
29 | :param molecule: (list) Indices to get test statistics for
30 | :return: (pd.Series) Statistics
31 | """
32 |
33 | pass
34 |
35 |
36 | def get_chu_stinchcombe_white_statistics(series: pd.Series, test_type: str = 'one_sided',
37 | num_threads: int = 8, verbose: bool = True) -> pd.Series:
38 | """
39 | Multithread Chu-Stinchcombe-White test implementation, p.251
40 |
41 | :param series: (pd.Series) Series to get statistics for
42 | :param test_type: (str) Two-sided or one-sided test
43 | :param num_threads: (int) Number of cores
44 | :param verbose: (bool) Flag to report progress on asynch jobs
45 | :return: (pd.Series) Statistics
46 | """
47 |
48 | pass
49 |
--------------------------------------------------------------------------------
/mlfinlab/structural_breaks/sadf.py:
--------------------------------------------------------------------------------
1 | """
2 | Explosiveness tests: SADF
3 | """
4 |
5 | from typing import Union, Tuple
6 | import pandas as pd
7 | import numpy as np
8 | from mlfinlab.util.multiprocess import mp_pandas_obj
9 |
10 |
11 | # pylint: disable=invalid-name
12 |
13 | def _get_sadf_at_t(X: pd.DataFrame, y: pd.DataFrame, min_length: int, model: str, phi: float) -> float:
14 | """
15 | Advances in Financial Machine Learning, Snippet 17.2, page 258.
16 |
17 | SADF's Inner Loop (get SADF value at t)
18 |
19 | :param X: (pd.DataFrame) Lagged values, constants, trend coefficients
20 | :param y: (pd.DataFrame) Y values (either y or y.diff())
21 | :param min_length: (int) Minimum number of samples needed for estimation
22 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
23 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
24 | :return: (float) SADF statistics for y.index[-1]
25 | """
26 |
27 | pass
28 |
29 |
30 | def _get_y_x(series: pd.Series, model: str, lags: Union[int, list],
31 | add_const: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
32 | """
33 | Advances in Financial Machine Learning, Snippet 17.2, page 258-259.
34 |
35 | Preparing The Datasets
36 |
37 | :param series: (pd.Series) Series to prepare for test statistics generation (for example log prices)
38 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
39 | :param lags: (int or list) Either number of lags to use or array of specified lags
40 | :param add_const: (bool) Flag to add constant
41 | :return: (pd.DataFrame, pd.DataFrame) Prepared y and X for SADF generation
42 | """
43 |
44 | pass
45 |
46 |
47 | def _lag_df(df: pd.DataFrame, lags: Union[int, list]) -> pd.DataFrame:
48 | """
49 | Advances in Financial Machine Learning, Snippet 17.3, page 259.
50 |
51 | Apply Lags to DataFrame
52 |
53 | :param df: (pd.DataFrame) Dataframe to apply lags to
54 | :param lags: (int or list) Lag(s) to use
55 | :return: (pd.DataFrame) Dataframe with lags
56 | """
57 |
58 | pass
59 |
60 |
61 | def get_betas(X: pd.DataFrame, y: pd.DataFrame) -> Tuple[np.array, np.array]:
62 | """
63 | Advances in Financial Machine Learning, Snippet 17.4, page 259.
64 |
65 | Fitting The ADF Specification (get beta estimate and estimate variance)
66 |
67 | :param X: (pd.DataFrame) Features(factors)
68 | :param y: (pd.DataFrame) Outcomes
69 | :return: (np.array, np.array) Betas and variances of estimates
70 | """
71 |
72 | pass
73 |
74 |
75 | def _sadf_outer_loop(X: pd.DataFrame, y: pd.DataFrame, min_length: int, model: str, phi: float,
76 | molecule: list) -> pd.Series:
77 | """
78 | This function gets SADF for t times from molecule
79 |
80 | :param X: (pd.DataFrame) Features(factors)
81 | :param y: (pd.DataFrame) Outcomes
82 | :param min_length: (int) Minimum number of observations
83 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
84 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
85 | :param molecule: (list) Indices to get SADF
86 | :return: (pd.Series) SADF statistics
87 | """
88 |
89 | pass
90 |
91 | def get_sadf(series: pd.Series, model: str, lags: Union[int, list], min_length: int, add_const: bool = False,
92 | phi: float = 0, num_threads: int = 8, verbose: bool = True) -> pd.Series:
93 | """
94 | Advances in Financial Machine Learning, p. 258-259.
95 |
96 | Multithread implementation of SADF
97 |
98 | SADF fits the ADF regression at each end point t with backwards expanding start points. For the estimation
99 | of SADF(t), the right side of the window is fixed at t. SADF recursively expands the beginning of the sample
100 | up to t - min_length, and returns the sup of this set.
101 |
102 | When applied to a sub- or super-martingale test, the variance of the beta of a weak long-run bubble may be smaller than
103 | that of a strong short-run bubble, hence biasing the method towards long-run bubbles. To correct for this bias,
104 | the ADF statistic in samples with large lengths can be penalized with the coefficient phi in [0, 1] such that:
105 |
106 | ADF_penalized = ADF / (sample_length ^ phi)
107 |
108 | :param series: (pd.Series) Series for which SADF statistics are generated
109 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
110 | :param lags: (int or list) Either number of lags to use or array of specified lags
111 | :param min_length: (int) Minimum number of observations needed for estimation
112 | :param add_const: (bool) Flag to add constant
113 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
114 | :param num_threads: (int) Number of cores to use
115 | :param verbose: (bool) Flag to report progress on asynch jobs
116 | :return: (pd.Series) SADF statistics
117 | """
118 |
119 | pass
--------------------------------------------------------------------------------
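
get_betas is ordinary least squares together with the covariance of the estimates; a numpy sketch (the degrees-of-freedom convention is an assumption):

    import numpy as np

    def get_betas_sketch(X, y):
        xx_inv = np.linalg.inv(np.dot(X.T, X))
        beta = np.dot(xx_inv, np.dot(X.T, y))         # OLS coefficients
        err = y - np.dot(X, beta)                     # residuals
        dof = X.shape[0] - X.shape[1]
        beta_var = np.dot(err.T, err) / dof * xx_inv  # covariance of the estimates
        return beta, beta_var
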
/mlfinlab/util/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions. In particular, Chapter 20 code on Multiprocessing and Vectorization.
3 | """
4 |
5 | from mlfinlab.util.fast_ewma import ewma
6 | from mlfinlab.util.multiprocess import (expand_call, lin_parts, mp_pandas_obj, nested_parts,
7 | process_jobs, process_jobs_, report_progress)
8 | from mlfinlab.util.volatility import (get_daily_vol, get_garman_class_vol, get_yang_zhang_vol, get_parksinson_vol)
9 | from mlfinlab.util.volume_classifier import get_bvc_buy_volume
10 | from mlfinlab.util.generate_dataset import get_classification_data
11 |
--------------------------------------------------------------------------------
/mlfinlab/util/fast_ewma.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains an implementation of an exponentially weighted moving average based on sample size.
3 | The inspiration and context for this code came from a blog post written by Maksim Ivanov:
4 | https://towardsdatascience.com/financial-machine-learning-part-0-bars-745897d4e4ba
5 | """
6 |
7 | # Imports
8 | import numpy as np
9 | from numba import jit
10 | from numba import float64
11 | from numba import int64
12 |
13 |
14 | @jit((float64[:], int64), nopython=False, nogil=True)
15 | def ewma(arr_in, window): # pragma: no cover
16 | """
17 | Exponentially weighted moving average specified by a decay ``window`` to provide better adjustments for
18 | small windows via:
19 | y[t] = (x[t] + (1-a)*x[t-1] + (1-a)^2*x[t-2] + ... + (1-a)^n*x[t-n]) /
20 | (1 + (1-a) + (1-a)^2 + ... + (1-a)^n).
21 |
22 | :param arr_in: (np.ndarray), (float64) A single dimensional numpy array
23 | :param window: (int64) The decay window, or 'span'
24 | :return: (np.ndarray) The EWMA vector, same length / shape as ``arr_in``
25 | """
26 |
27 | pass
28 |
--------------------------------------------------------------------------------
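
The docstring formula maps onto a running numerator/denominator recursion with alpha = 2 / (window + 1). A plain-Python sketch (the library version is numba-compiled for speed):

    import numpy as np

    def ewma_sketch(arr_in, window):
        alpha = 2.0 / (window + 1)
        decay = 1.0 - alpha
        out = np.empty_like(arr_in)
        num, den = 0.0, 0.0
        for i in range(arr_in.shape[0]):
            num = arr_in[i] + decay * num  # x[t] + (1-a)*x[t-1] + (1-a)^2*x[t-2] + ...
            den = 1.0 + decay * den        # 1 + (1-a) + (1-a)^2 + ...
            out[i] = num / den
        return out
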
/mlfinlab/util/generate_dataset.py:
--------------------------------------------------------------------------------
1 | '''
2 | This module generates a synthetic classification dataset of INFORMED, REDUNDANT, and NOISE explanatory
3 | variables based on the book Machine Learning for Asset Managers (code snippet 6.1).
4 | '''
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.datasets import make_classification
8 |
9 | # pylint: disable=invalid-name
10 | def get_classification_data(n_features=100, n_informative=25, n_redundant=25, n_samples=10000, random_state=0, sigma=.0):
11 | """
12 | A function to generate synthetic classification datasets
13 |
14 | :param n_features: (int) Total number of features to be generated (i.e. informative + redundant + noisy).
15 | :param n_informative: (int) Number of informative features.
16 | :param n_redundant: (int) Number of redundant features.
17 | :param n_samples: (int) Number of samples (rows) to be generated.
18 | :param random_state: (int) Random seed.
19 | :param sigma: (float) This argument is used to introduce a substitution effect to the redundant features in
20 | the dataset by adding Gaussian noise. The lower the value of sigma, the greater the
21 | substitution effect.
22 | :return: (pd.DataFrame, pd.Series) X and y as features and labels respectively.
23 | """
24 |
25 | pass
26 |
--------------------------------------------------------------------------------
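
A sketch of the snippet 6.1 recipe: draw informative plus noise features with make_classification, then build each redundant feature as a randomly chosen informative feature plus Gaussian noise scaled by sigma. The I_/N_/R_ column naming follows the book's convention and is an assumption here:

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_classification

    def classification_data_sketch(n_features=100, n_informative=25, n_redundant=25,
                                   n_samples=10000, random_state=0, sigma=0.0):
        np.random.seed(random_state)
        X, y = make_classification(n_samples=n_samples, n_features=n_features - n_redundant,
                                   n_informative=n_informative, n_redundant=0,
                                   shuffle=False, random_state=random_state)
        cols = ['I_' + str(i) for i in range(n_informative)]
        cols += ['N_' + str(i) for i in range(n_features - n_informative - n_redundant)]
        X, y = pd.DataFrame(X, columns=cols), pd.Series(y)
        picks = np.random.choice(range(n_informative), size=n_redundant)
        for k, j in enumerate(picks):
            # Redundant feature = informative feature + scaled Gaussian noise.
            X['R_' + str(k)] = X['I_' + str(j)] + np.random.normal(size=X.shape[0]) * sigma
        return X, y
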
/mlfinlab/util/misc.py:
--------------------------------------------------------------------------------
1 | """
2 | Various useful functions
3 | """
4 |
5 | import pandas as pd
6 | import numpy as np
7 |
8 | def crop_data_frame_in_batches(df: pd.DataFrame, chunksize: int):
9 | # pylint: disable=invalid-name
10 | """
11 | Splits df into chunks of chunksize
12 |
13 | :param df: (pd.DataFrame) Dataframe to split
14 | :param chunksize: (int) Number of rows in chunk
15 | :return: (list) Chunks (pd.DataFrames)
16 | """
17 |
18 | pass
19 |
--------------------------------------------------------------------------------
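
The docstring fully pins down the behaviour; a one-line sketch:

    def crop_sketch(df, chunksize):
        # Slice the frame into consecutive row blocks of at most chunksize rows.
        return [df.iloc[i:i + chunksize] for i in range(0, df.shape[0], chunksize)]
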
/mlfinlab/util/volatility.py:
--------------------------------------------------------------------------------
1 | """
2 | Various volatility estimators
3 | """
4 | import pandas as pd
5 | import numpy as np
6 |
7 |
8 | # pylint: disable=redefined-builtin
9 |
10 | def get_daily_vol(close, lookback=100):
11 | """
12 | Advances in Financial Machine Learning, Snippet 3.1, page 44.
13 |
14 | Daily Volatility Estimates
15 |
16 | Computes the daily volatility at intraday estimation points.
17 |
18 | In practice we want to set profit taking and stop-loss limits that are a function of the risks involved
19 | in a bet. Otherwise, sometimes we will be aiming too high (tau ≫ sigma_t_i,0), and sometimes too low
20 | (tau ≪ sigma_t_i,0), considering the prevailing volatility. Snippet 3.1 computes the daily volatility
21 | at intraday estimation points, applying a span of lookback days to an exponentially weighted moving
22 | standard deviation.
23 |
24 | See the pandas documentation for details on the pandas.Series.ewm function.
25 | Note: This function is used to compute dynamic thresholds for profit taking and stop loss limits.
26 |
27 | :param close: (pd.Series) Closing prices
28 | :param lookback: (int) Lookback period to compute volatility
29 | :return: (pd.Series) Daily volatility value
30 | """
31 |
32 | pass
33 |
34 |
35 | def get_parksinson_vol(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series:
36 | """
37 | Parkinson volatility estimator
38 |
39 | :param high: (pd.Series): High prices
40 | :param low: (pd.Series): Low prices
41 | :param window: (int): Window used for estimation
42 | :return: (pd.Series): Parkinson volatility
43 | """
44 |
45 | pass
46 |
47 |
48 | def get_garman_class_vol(open: pd.Series, high: pd.Series, low: pd.Series, close: pd.Series,
49 | window: int = 20) -> pd.Series:
50 | """
51 | Garman-Klass volatility estimator
52 |
53 | :param open: (pd.Series): Open prices
54 | :param high: (pd.Series): High prices
55 | :param low: (pd.Series): Low prices
56 | :param close: (pd.Series): Close prices
57 | :param window: (int): Window used for estimation
58 | :return: (pd.Series): Garman-Klass volatility
59 | """
60 |
61 | pass
62 |
63 |
64 | def get_yang_zhang_vol(open: pd.Series, high: pd.Series, low: pd.Series, close: pd.Series,
65 | window: int = 20) -> pd.Series:
66 | """
67 |
68 | Yang-Zhang volatility estimator
69 |
70 | :param open: (pd.Series): Open prices
71 | :param high: (pd.Series): High prices
72 | :param low: (pd.Series): Low prices
73 | :param close: (pd.Series): Close prices
74 | :param window: (int): Window used for estimation
75 | :return: (pd.Series): Yang-Zhang volatility
76 | """
77 |
78 | pass
79 |
--------------------------------------------------------------------------------
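
For the simplest of the range-based estimators above: Parkinson volatility is sqrt( mean( ln(H/L)^2 ) / (4 ln 2) ) over the window. A minimal sketch:

    import numpy as np
    import pandas as pd

    def parkinson_sketch(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series:
        log_range_sq = np.log(high / low) ** 2  # squared log high-low range
        return np.sqrt(log_range_sq.rolling(window=window).mean() / (4 * np.log(2)))
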
/mlfinlab/util/volume_classifier.py:
--------------------------------------------------------------------------------
1 | """
2 | Volume classification methods (BVC and tick rule)
3 | """
4 |
5 | from scipy.stats import norm
6 | import pandas as pd
7 |
8 |
9 | def get_bvc_buy_volume(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series:
10 | """
11 | Calculates the BVC buy volume
12 |
13 | :param close: (pd.Series): Close prices
14 | :param volume: (pd.Series): Bar volumes
15 | :param window: (int): Window for std estimation used in the BVC calculation
16 | :return: (pd.Series) BVC buy volume
17 | """
18 | # .apply(norm.cdf) is used to omit Warning for norm.cdf(pd.Series with NaNs)
19 |
20 | pass
21 |
--------------------------------------------------------------------------------
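
BVC attributes a fraction of each bar's volume to buyers via the standard normal CDF of standardized price changes. A sketch consistent with the in-file note about .apply(norm.cdf):

    from scipy.stats import norm
    import pandas as pd

    def bvc_sketch(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series:
        price_change = close.diff()
        z = price_change / price_change.rolling(window=window).std()  # standardized change
        return volume * z.apply(norm.cdf)  # buy fraction of each bar's volume
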
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Production
2 | numpy>=1.16.0
3 | matplotlib>=3.0.0
4 | pandas>=1.0.0
5 | scikit-learn>=0.20.0
6 | scipy>=1.2.0
7 | statsmodels>=0.9.0
8 | cython>=0.29
9 | POT>=0.7.0
10 | numba>=0.40.0
11 | networkx>=2.2, <2.6
12 | dash>=1.0.0
13 | dash-cytoscape>=0.1.0
14 | dash-bootstrap-components>=0.10.0
15 | jupyter-dash>=0.2.0
16 | tensorflow>=2.0.0
17 | joblib>=1.0.0
18 | decorator>=4.0.0, <5.0.0
19 | analytics-python>=1.2.7
20 | getmac>=0.8.0
21 |
22 |
23 | # Develop
24 | codecov==2.1.11
25 | coverage==5.4
26 | pylint==2.6.0
27 | sphinx==3.4.3 # Docs
28 | hudsonthames-sphinx-theme==0.1.5 # Docs
29 | sphinx-rtd-theme==0.5.2 # Docs
30 | releases==1.6.3 # Docs
31 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = mlfinlab
3 | version = 1.3.0
4 | author = Hudson and Thames Quantitative Research
5 | author_email = research@hudsonthames.org
6 | license = All Rights Reserved
7 | license-file = LICENSE.txt
8 | description = MlFinlab helps portfolio managers and traders who want to leverage the power of machine learning by providing reproducible, interpretable, and easy to use tools.
9 | long_description = file: README.md
10 | long_description_content_type = text/markdown
11 | platform = any
12 | url = https://www.hudsonthames.org/
13 | project_urls =
14 | Documentation = https://mlfinlab.readthedocs.io/en/latest/
15 | Bug Reports = https://github.com/hudson-and-thames/mlfinlab/issues
16 | Project Boards = https://github.com/orgs/hudson-and-thames/projects
17 | Source = https://github.com/hudson-and-thames/mlfinlab
18 | Blog = https://hudsonthames.org/blog/
19 | Apprenticeship Program = https://hudsonthames.org/apprenticeship-program/
20 | classifiers =
21 | Development Status :: 5 - Production/Stable
22 | Intended Audience :: Developers
23 | Intended Audience :: Education
24 | Intended Audience :: Science/Research
25 | Intended Audience :: Financial and Insurance Industry
26 | License :: Other/Proprietary License
27 | Operating System :: OS Independent
28 | Programming Language :: Python
29 | Programming Language :: Python :: 3.6
30 | Programming Language :: Python :: 3.7
31 | Programming Language :: Python :: 3.8
32 | Topic :: Scientific/Engineering
33 | Topic :: Scientific/Engineering :: Artificial Intelligence
34 | Topic :: Office/Business :: Financial :: Investment
35 | keywords =
36 | machinelearning
37 | finance
38 | investment
39 | education
40 |
41 | [options]
42 | include_package_data = True
43 | packages = find:
44 | python_requires =
45 | >=3.6, <3.9
46 | setup_requires =
47 | setuptools
48 | cython
49 | install_requires =
50 | numpy>=1.16.0
51 | matplotlib>=3.0.0
52 | pandas>=1.0.0
53 | scikit-learn>=0.20.0
54 | scipy>=1.2.0
55 | statsmodels>=0.9.0
56 | cython>=0.29
57 | POT>=0.7.0
58 | numba>=0.40.0
59 | networkx>=2.2, <2.6
60 | dash>=1.0.0
61 | dash-cytoscape>=0.1.0
62 | dash-bootstrap-components>=0.10.0
63 | jupyter-dash>=0.2.0
64 | tensorflow>=2.0.0
65 | joblib>=1.0.0
66 | decorator>=4.0.0, <5.0.0
67 | analytics-python>=1.2.7
68 | getmac>=0.8.0
69 |
70 |
71 | [options.packages.find]
72 | package_dir =
73 | mlfinlab
74 | exclude =
75 | contrib
76 | docs
77 | tests
78 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Always prefer setuptools over distutils
2 | from setuptools import setup
3 |
4 | setup()
5 |
6 | # Create package
7 | # python setup.py bdist_wheel
8 | # python3 -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* (This is the test repo)
9 | # twine upload dist/* (This is the official repo)
10 |
--------------------------------------------------------------------------------