├── .bumpversion.cfg ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── logo │ ├── hudson_and_thames_logo.png │ └── support.png └── pull_request_template.md ├── .gitignore ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ ├── .gitkeep │ ├── favicon_mlfinlab.png │ ├── ht_logo_black.png │ ├── ht_logo_white.png │ ├── logo_black.png │ └── logo_white.png │ ├── _templates │ └── breadcrumbs.html │ ├── additional_information │ ├── analytics.rst │ ├── contact.rst │ ├── contributing.rst │ ├── images │ │ └── slack.png │ ├── license.rst │ └── privacy_gdpr.rst │ ├── changelog.rst │ ├── conf.py │ ├── index.rst │ └── requirements.txt ├── mlfinlab ├── __init__.py ├── backtest_statistics │ ├── __init__.py │ ├── backtests.py │ └── statistics.py ├── bet_sizing │ ├── __init__.py │ ├── bet_sizing.py │ ├── ch10_snippets.py │ └── ef3m.py ├── clustering │ ├── __init__.py │ ├── feature_clusters.py │ ├── hierarchical_clustering.py │ └── onc.py ├── codependence │ ├── __init__.py │ ├── codependence_matrix.py │ ├── correlation.py │ ├── gnpr_distance.py │ ├── information.py │ └── optimal_transport.py ├── cross_validation │ ├── __init__.py │ ├── combinatorial.py │ └── cross_validation.py ├── data_generation │ ├── __init__.py │ ├── bootstrap.py │ ├── correlated_random_walks.py │ ├── corrgan.py │ ├── data_verification.py │ ├── hcbm.py │ └── vines.py ├── data_structures │ ├── __init__.py │ ├── base_bars.py │ ├── imbalance_data_structures.py │ ├── run_data_structures.py │ ├── standard_data_structures.py │ └── time_data_structures.py ├── datasets │ ├── __init__.py │ ├── data │ │ ├── dollar_bar_sample.csv │ │ ├── stock_prices.csv │ │ └── tick_data.csv │ └── load_datasets.py ├── ensemble │ ├── __init__.py │ └── sb_bagging.py ├── feature_importance │ ├── __init__.py │ ├── fingerpint.py │ ├── importance.py │ └── orthogonal.py ├── features │ ├── __init__.py │ └── fracdiff.py ├── filters │ ├── __init__.py │ └── filters.py ├── labeling │ ├── __init__.py │ ├── bull_bear.py │ ├── excess_over_mean.py │ ├── excess_over_median.py │ ├── fixed_time_horizon.py │ ├── labeling.py │ ├── matrix_flags.py │ ├── raw_return.py │ ├── return_vs_benchmark.py │ ├── tail_sets.py │ └── trend_scanning.py ├── microstructural_features │ ├── __init__.py │ ├── encoding.py │ ├── entropy.py │ ├── feature_generator.py │ ├── first_generation.py │ ├── misc.py │ ├── second_generation.py │ └── third_generation.py ├── multi_product │ ├── __init__.py │ └── etf_trick.py ├── networks │ ├── __init__.py │ ├── almst.py │ ├── dash_graph.py │ ├── dual_dash_graph.py │ ├── graph.py │ ├── mst.py │ ├── pmfg.py │ └── visualisations.py ├── regression │ ├── __init__.py │ └── history_weight_regression.py ├── sample_weights │ ├── __init__.py │ └── attribution.py ├── sampling │ ├── __init__.py │ ├── bootstrapping.py │ └── concurrent.py ├── structural_breaks │ ├── __init__.py │ ├── chow.py │ ├── cusum.py │ └── sadf.py └── util │ ├── __init__.py │ ├── fast_ewma.py │ ├── generate_dataset.py │ ├── misc.py │ ├── multiprocess.py │ ├── volatility.py │ └── volume_classifier.py ├── requirements.txt ├── setup.cfg └── setup.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.2.0 3 | commit = True 4 | tag = True 5 | tag_name = {new_version} 6 | 7 | [bumpversion:file:setup.cfg] 8 | 9 | [bumpversion:file:docs/source/conf.py] 10 | 
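# Usage sketch (illustrative comment; bump2version is pinned in docs/source/requirements.txt):
#   bump2version patch   ->  1.2.0 becomes 1.2.1, then commits and tags {new_version}
#   bump2version minor   ->  1.2.0 becomes 1.3.0
# The [bumpversion:file:...] sections above keep setup.cfg and docs/source/conf.py in sync.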
-------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: HudsonThames # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/logo/hudson_and_thames_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/.github/logo/hudson_and_thames_logo.png -------------------------------------------------------------------------------- /.github/logo/support.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/.github/logo/support.png -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. 4 | 5 | Fixes # (issue) 6 | 7 | ## Type of change 8 | 9 | Please delete options that are not relevant. 10 | 11 | - [ ] Bug fix (non-breaking change which fixes an issue) 12 | - [ ] New feature (non-breaking change which adds functionality) 13 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 14 | - [ ] This change requires a documentation update 15 | 16 | # How Has This Been Tested? 17 | 18 | Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration 19 | 20 | - [ ] Test A 21 | - [ ] Test B 22 | 23 | **Test Configuration**: 24 | * Operating system 25 | * IDE used 26 | 27 | 28 | # Checklist: 29 | 30 | - [ ] My code follows the style guidelines of this project 31 | - [ ] I have performed a self-review of my own code 32 | - [ ] I have commented my code, particularly in hard-to-understand areas 33 | - [ ] I have made corresponding changes to the documentation 34 | - [ ] My changes generate no new warnings 35 | - [ ] I have added tests that prove my fix is effective or that my feature works 36 | - [ ] New and existing unit tests pass locally with my changes 37 | - [ ] Any dependent changes have been merged and published in downstream modules 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | *.pyc 3 | __pycache__ 4 | test_reports 5 | .coverage 6 | .DS_Store 7 | docs/build/ 8 | .local/ 9 | cover/ 10 | *.pickle 11 | */.ipynb_checkpoints/* 12 | mlfinlab.egg-info/* 13 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: [] 14 | 15 | # Optionally set the version of Python and requirements required to build your docs 16 | python: 17 | version: 3.8 18 | install: 19 | - requirements: docs/source/requirements.txt 20 | 21 | 
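# A rough local equivalent of this Read the Docs build (a sketch, assuming a Python 3.8 environment):
#   pip install -r docs/source/requirements.txt
#   make -C docs html
# The rendered pages land in docs/build/, which .gitignore already excludes.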
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at hudsonthames19@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing to MlFinLab:
2 | 
3 | First off, we want to thank you for taking the time to contribute to the project.
4 | 
5 | We make use of an [Apprenticeship Program](https://hudsonthames.org/mentorship/), which caters to ambitious students looking
6 | to make an impact on open-source and develop a portfolio of work based on financial machine learning.
7 | 
8 | This allows us to establish organised collaboration and control the level of code quality.
9 | 
10 | ## External Contributions:
11 | 
12 | We do encourage external contributions sourced by members of our community ([Slack Channel](https://www.patreon.com/HudsonThames)).
13 | 
14 | We have quite a rigorous process of unit testing, code style checks, and documentation.
15 | 
16 | 
17 | ## Raise an Issue
18 | We have created [templates](https://github.com/hudson-and-thames/mlfinlab/issues/new/choose) to aid in creating issues and PRs:
19 | * Bug report
20 | * Feature request
21 | * Custom issue template
22 | * Pull Request Template
23 | 
24 | ---
25 | 
26 | ## Contact us
27 | We host a booming community of like-minded data scientists and quants, join the
28 | [Slack Channel](https://www.patreon.com/HudsonThames) now! Open to sponsors of our package.
29 | 
30 | The channel has the following benefits:
31 | 
32 | * Community of like-minded individuals.
33 | * Ask questions about the package implementations and get community feedback.
34 | * Occasional presentations on topics within financial machine learning.
35 | * A papers channel where we share the papers which are freely available.
36 | * Access to members of our research group.
37 | 
38 | You can also email us at research@hudsonthames.org.
39 | 
40 | Looking forward to hearing from you!
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | 3 | 5 | 6 | 7 |
8 |
9 | 10 | 11 | # Welcome to Machine Learning Financial Laboratory! 12 | 13 |
14 |
15 |
16 | 
17 | >This repo is public-facing and exists for the sole purpose of providing users with an easy way to raise bugs, feature requests, and other issues.
18 | 
19 | 
20 |
21 |
22 | 
23 | ## What is MlFinLab?
24 | The MlFinLab Python library is the perfect toolbox for every financial machine learning researcher.
25 | 
26 | It covers every step of ML strategy creation, starting from data structure generation and finishing with backtest statistics.
27 | We pride ourselves on the robustness of our codebase - every line of code in the modules is extensively tested and
28 | documented.
29 | 
30 | 
31 | ## Documentation, Example Notebooks and Lecture Videos
32 | For every technique in the library, we not only provide extensive documentation, with both theoretical explanations
33 | and detailed descriptions of the available functions, but also supplement the modules with an ever-growing array of lecture videos and slides
34 | on the implemented methods.
35 | 
36 | We want you to be able to use the tools right away. To achieve that, every module comes with a number of example notebooks
37 | with detailed examples of how to use the algorithms. Our goal is to show you the whole pipeline, starting from
38 | importing the libraries and ending with strategy performance metrics, so you can get the added value from the get-go.
39 | 
40 | 
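The sketch below is purely illustrative: the function and module names follow the library's documentation, but since this public repo only ships stubs, treat the exact signatures as indicative rather than authoritative.

```python
import pandas as pd

from mlfinlab.data_structures.standard_data_structures import get_dollar_bars
from mlfinlab.filters.filters import cusum_filter
from mlfinlab.labeling.labeling import add_vertical_barrier

# 1. Structure raw tick data into dollar bars
bars = get_dollar_bars('tick_data.csv', threshold=70_000_000)

# 2. Sample informative events with a CUSUM filter on closing prices
close = bars.set_index(pd.to_datetime(bars['date_time']))['close']
events = cusum_filter(close, threshold=0.01)

# 3. Attach vertical barriers as a first step of triple-barrier labeling
vertical_barriers = add_vertical_barrier(t_events=events, close=close, num_days=1)
```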
41 | 42 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
52 | 53 | 54 | ### Included modules: 55 | 56 | - Backtest Overfitting Tools 57 | - Data Structures 58 | - Labeling 59 | - Sampling 60 | - Feature Engineering 61 | - Models 62 | - Clustering 63 | - Cross-Validation 64 | - Hyper-Parameter Tuning 65 | - Feature Importance 66 | - Bet Sizing 67 | - Synthetic Data Generation 68 | - Networks 69 | - Measures of Codependence 70 | - Useful Financial Features 71 | 72 | 73 | ## Licensing options 74 | This project is licensed under an all rights reserved [licence](https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt). 75 | 76 | * Business 77 | * Enterprise 78 | 79 | 80 | ## Community 81 | With the purchase of the library, our clients get access to the Hudson & Thames Slack community, where our engineers and other quants 82 | are always ready to answer your questions. 83 | 84 | Alternatively, you can email us at: research@hudsonthames.org. 85 | 86 |
87 | 88 | 90 | 91 |
92 | 93 | 94 | ## Who is Hudson & Thames? 95 | Hudson and Thames Quantitative Research is a company with the goal of bridging the gap between the advanced research developed in 96 | quantitative finance and its practical application. We have created three premium python libraries so you can effortlessly access the 97 | latest techniques and focus on what matters most: **creating your own winning strategy**. 98 | 99 | 100 | ### What was only possible with the help of huge R&D teams is now at your disposal, anywhere, anytime. 101 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
-------------------------------------------------------------------------------- /docs/source/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/.gitkeep -------------------------------------------------------------------------------- /docs/source/_static/favicon_mlfinlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/favicon_mlfinlab.png -------------------------------------------------------------------------------- /docs/source/_static/ht_logo_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/ht_logo_black.png -------------------------------------------------------------------------------- /docs/source/_static/ht_logo_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/ht_logo_white.png -------------------------------------------------------------------------------- /docs/source/_static/logo_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/logo_black.png -------------------------------------------------------------------------------- /docs/source/_static/logo_white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/_static/logo_white.png -------------------------------------------------------------------------------- /docs/source/_templates/breadcrumbs.html: --------------------------------------------------------------------------------
1 | {%- extends "sphinx_rtd_theme/breadcrumbs.html" %}
2 | 
3 | {% block breadcrumbs_aside %}
4 | {% endblock %}
-------------------------------------------------------------------------------- /docs/source/additional_information/analytics.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-analytics:
2 | 
3 | =========
4 | Analytics
5 | =========
6 | 
7 | .. warning::
8 | 
9 |     * Please don't alter or change any of the code as this is a violation of our license agreement.
10 |     * We do provide a separate enterprise license for companies that want to white-label or alter code.
11 |     * All changes are flagged by the system.
12 | 
13 | Please note that we have added standard web analytics to MLFinLab, using `Segment <https://segment.com/>`__.
14 | 
15 | We track the following:
16 | 
17 | * City, Country, Region, City Geographic Coordinate
18 | * UserIDs (MAC address)
19 | * Function calls
20 | * Timestamps
21 | 
22 | This allows our team to see how the package is being used by you, our client, so that we may improve the functionality and
23 | build more tools that you will love. An additional purpose is that we need to start tracking growth KPIs such as cohort
24 | retention and MAU, and we will compile these into reports for investors, as we are aiming for VC funding in late 2021.
25 | 
26 | The impact of the analytics is negligible.
27 | 
28 | .. note::
29 | 
30 |     * We chose to use MAC Addresses as it is an anonymous token which allows us to track a machine and is not considered personal information under GDPR unless it is combined with other personal data which then identifies the natural person.
31 |     * Your data is also anonymized by filtering it through ipinfo, which returns high-level location (City, Country, Region) data without sharing your IP address.
32 |     * Segment is the tool we use to collect, clean, and control the data.
-------------------------------------------------------------------------------- /docs/source/additional_information/contact.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-contact:
2 | 
3 | =========================
4 | Join the Slack Channel 🔑
5 | =========================
6 | 
7 | We host a booming community of like-minded data scientists and quants, join the Slack channel now! Available via
8 | `H&T Client Portal `__.
9 | 
10 | The channel has the following benefits:
11 | 
12 | * Community of like-minded individuals.
13 | * Ask questions about the package implementations and get community feedback.
14 | * Occasional presentations on topics within financial machine learning.
15 | * A papers channel where we share the papers which are freely available.
16 | * Access to members of our research group.
17 | 
18 | Looking forward to hearing from you!
19 | 
20 | .. image:: ./images/slack.png
21 |    :scale: 65 %
22 |    :align: center
23 | 
-------------------------------------------------------------------------------- /docs/source/additional_information/contributing.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-contributing:
2 | 
3 | ============
4 | Contributing
5 | ============
6 | 
7 | Areas of Contribution
8 | #####################
9 | 
10 | Currently, we have a live project board that follows the principles of Agile Project Management.
11 | 
12 | At the time of writing, we are focusing our attention primarily on contributions by the current Researchers enrolled
13 | in our `Apprenticeship Program <https://hudsonthames.org/mentorship/>`_.
14 | 
15 | There is, of course, room for the public to make contributions. The most useful are those that help to improve user experience.
16 | Good examples of this are writing tutorial notebooks which answer questions
17 | from the back of a chapter, writing mlfinlab recipes, improving docstrings, and adding new Sphinx documentation.
18 | 
19 | Raising Issues
20 | ##############
21 | 
22 | We have created `templates`_ to aid in creating issues and PRs:
23 | 
24 | * Bug report
25 | * Feature request
26 | * Custom issue template
27 | * Pull Request Template
28 | 
29 | Please do create issues for new feature requests and bug fixes.
30 | 
31 | .. _templates: https://github.com/hudson-and-thames/mlfinlab/issues/new/choose
32 | 
-------------------------------------------------------------------------------- /docs/source/additional_information/images/slack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hudson-and-thames/mlfinlab/79dcc7120ec84110578f75b025a75850eb72fc73/docs/source/additional_information/images/slack.png -------------------------------------------------------------------------------- /docs/source/additional_information/privacy_gdpr.rst: --------------------------------------------------------------------------------
1 | .. _additional_information-privacy_gdpr:
2 | 
3 | =======================
4 | Privacy and GDPR Policy
5 | =======================
6 | 
7 | .. note::
8 |     Our Privacy and GDPR Policies can be downloaded directly from our website:
9 | 
10 |     * `Privacy Policy `_
11 |     * `GDPR Policy `_
12 | 
-------------------------------------------------------------------------------- /docs/source/changelog.rst: --------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 | ..
5 |     The following are valid options
6 |     * :release:`0.1.0 <2021-01-12>`
7 |     * :support:`119` Upgrade to pandas 1.0
8 |     * :feature:`50` Add a distutils command for marbles
9 |     * :bug:`58` Fixed test failure on OSX
10 | ..
11 |     For Help: https://releases.readthedocs.io/en/latest/index.html
12 | 
13 | * :release:`1.3.0 <2021-07-09>`
14 | * :feature:`69` Added support for Python 3.6 and Python 3.7.
15 | * :feature:`69` Requirements versions are now non-fixed.
16 | * :support:`69` Migrated Optimal Mean Reversion Module from MlFinLab to ArbitrageLab.
17 | * :support:`69` Reflected Optimal Mean Reversion Module migration in the documentation.
18 | 
19 | * :release:`1.2.0 <2021-06-23>`
20 | * :support:`64` Updated references in documentation.
21 | * :support:`63` Updated documentation theme to hudsonthames-sphinx-docs.
22 | * :bug:`66 major` Fixed issue with too many function calls in web analytics.
23 | 
24 | * :release:`1.1.0 <2021-04-15>`
25 | * :feature:`56` MAE/MSE added as possible metrics for the Trend Scanning Module.
26 | * :feature:`58` Low silhouette scores check made optional in Feature Clusters Module.
27 | * :bug:`57 major` Fix purging bug in Purged KFold/Combinatorial Purged KFold.
28 | * :feature:`61` History Weighted Regression added to the Regression Module.
29 | * :support:`61` History Weighted Regression documentation.
30 | * :feature:`59` Code and unit tests style unified.
31 | * :support:`59` Documentation style unified.
32 | * :feature:`45` Added Pagan et al. and Lunde et al. Bull Bear Methods to the Labeling Module.
33 | * :support:`45` Added Pagan et al. and Lunde et al. Bull Bear Methods documentation.
34 | * :bug:`60 major` Fix structural break bug in the Chu-Stinchcombe-White test.
35 | * :feature:`46` Stacked Module with Cross Validation, Feature Importance, and Sampling methods added.
36 | * :feature:`46` Lambda code in Microstructural Features Module speed-up.
37 | * :support:`46` Stacked Module documentation.
38 | 
39 | * :release:`1.0.1 <2021-02-19>`
40 | * :support:`55` Removed TensorFlow from requirements and adjusted installation guide.
41 | 
42 | * :release:`1.0.0 <2021-02-16>`
43 | * :feature:`35` Debugged ETF Trick code.
44 | * :feature:`44` Added n_repeat parameter to MDA feature importance.
45 | * :feature:`50` Added t-student option to BVC classifier.
46 | * :bug:`50` Fix bug in Bar-based Kyle lambdas calculation. 47 | * :feature:`52` Migrated Portfolio Optimisation Module code from MlFinLab to PortfolioLab. 48 | * :support:`52` Migrated Portfolio Optimisation Module documentation from MlFinLab to PortfolioLab. 49 | * :feature:`52` Migrated Online Portfolio Selection Module code from MlFinLab to PortfolioLab. 50 | * :support:`52` Migrated Online Portfolio Selection Module documentation from MlFinLab to PortfolioLab. 51 | * :support:`52` Updated requirements versions (numpy==1.20.1, matplotlib==3.2.2, 52 | pandas==1.1.5, scikit-learn==0.24.1, scipy==1.6.0, statsmodels==0.12.2). 53 | 54 | * :release:`0.15.3 <2021-01-12>` 55 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('./../..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'mlfinlab' 21 | copyright = '2019, Hudson & Thames Quantitative Research.' 22 | author = 'Hudson & Thames Quantitative Research' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '1.3.0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.intersphinx', 37 | 'sphinx.ext.viewcode', 38 | 'releases' 39 | ] 40 | 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | master_doc = 'index' 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = [] 51 | 52 | 53 | # -- Options for HTML output ------------------------------------------------- 54 | 55 | # The theme to use for HTML and HTML Help pages. See the documentation for 56 | # a list of builtin themes. 57 | # 58 | html_theme = 'hudsonthames_sphinx_theme' 59 | add_module_names = False 60 | 61 | # Theme options are theme-specific and customize the look and feel of a theme 62 | # further. For a list of options available for each theme, see the 63 | # documentation. 64 | # 65 | # html_theme_options = {} 66 | 67 | html_context = {'logo': 'logo_white.png', 'theme_logo_only': True} 68 | html_favicon = '_static/favicon_mlfinlab.png' 69 | 70 | # Add any paths that contain custom static files (such as style sheets) here, 71 | # relative to this directory. 
They are copied after the builtin static files,
72 | # so a file named "default.css" will overwrite the builtin "default.css".
73 | html_static_path = ['_static']
74 | html_copy_source = True
75 | 
76 | # 'releases' (changelog) settings
77 | releases_github_path = 'hudson-and-thames/mlfinlab_premium'
78 | releases_unstable_prehistory = True
79 | 
-------------------------------------------------------------------------------- /docs/source/index.rst: --------------------------------------------------------------------------------
1 | .. image:: _static/logo_black.png
2 |    :scale: 50 %
3 |    :align: center
4 |    :target: https://hudsonthames.org/
5 | 
6 | |
7 | 
8 | ================================================
9 | Machine Learning Financial Laboratory (mlfinlab)
10 | ================================================
11 | 
12 | MlFinlab is a Python package which helps portfolio managers and traders who want to leverage the power of machine learning
13 | by providing reproducible, interpretable, and easy-to-use tools.
14 | 
15 | Adding MlFinLab to your company's pipeline is like adding a department of PhD researchers to your team.
16 | 
17 | .. code-block::
18 | 
19 |    pip install mlfinlab
20 | 
21 | We source all of our implementations from the most elite and peer-reviewed journals, including publications from:
22 | 
23 | 1. `The Journal of Financial Data Science `_
24 | 2. `The Journal of Portfolio Management `_
25 | 3. `The Journal of Algorithmic Finance `_
26 | 4. `Cambridge University Press `_
27 | 
28 | 
29 | Documentation & Tutorials
30 | #########################
31 | 
32 | We lower barriers to entry for all users by providing extensive `documentation `_
33 | and `tutorial notebooks `_, with code examples.
34 | 
35 | Who is Hudson & Thames?
36 | #######################
37 | 
38 | Hudson and Thames Quantitative Research is a company with a focus on implementing the most cutting-edge algorithms in
39 | quantitative finance. We productionalize all our tools in the form of libraries and provide capability to our clients.
40 | 
41 | * `Website `_
42 | * `Github Group `_
43 | * `MlFinLab Documentation `_
44 | 
45 | Contact us
46 | ##########
47 | 
48 | The best place to contact the team is via the Slack channel. Alternatively, you can email us at: research@hudsonthames.org.
49 | 
50 | Looking forward to hearing from you!
51 | 
52 | License
53 | #######
54 | 
55 | This project is licensed under an all rights reserved licence and is NOT open-source, and may not be used for commercial purposes without a commercial license which may be purchased from Hudson and Thames Quantitative Research.
56 | 
57 | See the `LICENSE.txt `_ file for details.
58 | 
59 | .. toctree::
60 |     :maxdepth: 2
61 |     :caption: Legal
62 |     :hidden:
63 | 
64 |     additional_information/license
65 |     additional_information/analytics
66 |     additional_information/privacy_gdpr
67 | 
-------------------------------------------------------------------------------- /docs/source/requirements.txt: --------------------------------------------------------------------------------
1 | # Production
2 | numpy==1.18.5
3 | matplotlib==3.2.2
4 | pandas==1.1.5
5 | scikit-learn==0.24.1
6 | scipy==1.6.0
7 | statsmodels==0.12.2
8 | cython==0.29.17
9 | POT==0.7.0
10 | numba==0.52.0
11 | networkx==2.5
12 | dash==1.19.0
13 | dash-cytoscape==0.2.0
14 | dash-bootstrap-components==0.11.3
15 | jupyter-dash==0.4.0
16 | tensorflow==2.2.1
17 | joblib==1.0.1
18 | analytics-python==1.2.9
19 | getmac==0.8.2
20 | 
21 | 
22 | # Develop
23 | bump2version==1.0.1
24 | bumpversion==0.6.0
25 | codecov==2.1.11
26 | coverage==5.4
27 | pylint==2.6.0
28 | sphinx==3.4.3 # Docs
29 | hudsonthames-sphinx-theme==0.1.5 # Docs
30 | sphinx-rtd-theme==0.5.2 # Docs
31 | releases==1.6.3 # Docs
32 | 
-------------------------------------------------------------------------------- /mlfinlab/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | MlFinlab helps portfolio managers and traders who want to leverage the power of machine learning by providing
3 | reproducible, interpretable, and easy-to-use tools.
4 | 
5 | Adding MlFinLab to your company's pipeline is like adding a department of PhD researchers to your team.
6 | """
7 | 
8 | import mlfinlab.cross_validation as cross_validation
9 | import mlfinlab.data_structures as data_structures
10 | import mlfinlab.datasets as datasets
11 | import mlfinlab.multi_product as multi_product
12 | import mlfinlab.filters.filters as filters
13 | import mlfinlab.labeling as labeling
14 | import mlfinlab.features.fracdiff as fracdiff
15 | import mlfinlab.sample_weights as sample_weights
16 | import mlfinlab.sampling as sampling
17 | import mlfinlab.bet_sizing as bet_sizing
18 | import mlfinlab.util as util
19 | import mlfinlab.structural_breaks as structural_breaks
20 | import mlfinlab.feature_importance as feature_importance
21 | import mlfinlab.ensemble as ensemble
22 | import mlfinlab.clustering as clustering
23 | import mlfinlab.microstructural_features as microstructural_features
24 | import mlfinlab.backtest_statistics.backtests as backtests
25 | import mlfinlab.backtest_statistics.statistics as backtest_statistics
26 | import mlfinlab.networks as networks
27 | import mlfinlab.data_generation as data_generation
28 | import mlfinlab.regression as regression
29 | 
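# Example namespace usage (illustrative comment; the implementations behind
# these modules are stripped in this public repo):
#
#   import mlfinlab as ml
#   ml.labeling             # labeling tools (triple-barrier, trend scanning, ...)
#   ml.backtest_statistics  # Sharpe ratio, PSR, DSR and other statistics
#   ml.data_structures      # time, standard, imbalance and run bar generators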
3 | """ 4 | 5 | from mlfinlab.backtest_statistics.backtests import CampbellBacktesting 6 | from mlfinlab.backtest_statistics.statistics import (timing_of_flattening_and_flips, average_holding_period, 7 | bets_concentration, all_bets_concentration, 8 | drawdown_and_time_under_water, sharpe_ratio, 9 | information_ratio, probabilistic_sharpe_ratio, 10 | deflated_sharpe_ratio, minimum_track_record_length) 11 | -------------------------------------------------------------------------------- /mlfinlab/bet_sizing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 10: Bet Sizing 3 | Only the highest-level user functions are included in the __init__ file. 4 | 5 | This folder contains classes and functions for sizing bets based on a given investment strategy with given bet side 6 | confidence, e.g. the output from a machine learning model. The approaches implemented in this module are based on 7 | those described in Chapter 10 of "Advances in Financial Machine Learning" by Marcos López de Prado. 8 | """ 9 | 10 | from mlfinlab.bet_sizing.bet_sizing import (bet_size_probability, bet_size_dynamic, bet_size_budget, bet_size_reserve, 11 | confirm_and_cast_to_df, get_concurrent_sides, cdf_mixture, 12 | single_bet_size_mixed) 13 | from mlfinlab.bet_sizing.ef3m import (M2N, centered_moment, raw_moment, most_likely_parameters) 14 | -------------------------------------------------------------------------------- /mlfinlab/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements clustering module methods. 3 | """ 4 | 5 | from mlfinlab.clustering.onc import get_onc_clusters 6 | from mlfinlab.clustering.feature_clusters import get_feature_clusters 7 | from mlfinlab.clustering.hierarchical_clustering import optimal_hierarchical_cluster 8 | -------------------------------------------------------------------------------- /mlfinlab/clustering/feature_clusters.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module creates clustered subsets of features described in the paper Clustered Feature Importance (Presentation 3 | Slides) by Dr. Marcos Lopez de Prado. https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3517595 and is also explained 4 | in the book Machine Learning for Asset Managers Snippet 6.5.2 page 84. 5 | """ 6 | 7 | #Imports 8 | import numpy as np 9 | import pandas as pd 10 | import statsmodels.api as sm 11 | from scipy.spatial.distance import squareform 12 | from scipy.cluster.hierarchy import linkage, fcluster 13 | from statsmodels.regression.linear_model import OLS 14 | 15 | from mlfinlab.clustering.onc import get_onc_clusters 16 | from mlfinlab.codependence.codependence_matrix import get_dependence_matrix, get_distance_matrix 17 | 18 | 19 | # pylint: disable=invalid-name 20 | def get_feature_clusters(X: pd.DataFrame, dependence_metric: str, distance_metric: str = None, 21 | linkage_method: str = None, n_clusters: int = None, critical_threshold: float = 0.0) -> list: 22 | """ 23 | Machine Learning for Asset Managers 24 | Snippet 6.5.2.1 , page 85. Step 1: Features Clustering 25 | 26 | Gets clustered features subsets from the given set of features. 27 | 28 | :param X: (pd.DataFrame) Dataframe of features. 29 | :param dependence_metric: (str) Method to be use for generating dependence_matrix, either 'linear' or 30 | 'information_variation' or 'mutual_information' or 'distance_correlation'. 
31 |     :param distance_metric: (str) The distance operator to be used for generating the distance matrix. The methods that
32 |                              can be applied are: 'angular', 'squared_angular', 'absolute_angular'. Set it to None if the
33 |                              features are to be generated as is by the ONC algorithm.
34 |     :param linkage_method: (str) Method of linkage to be used for clustering. Methods include: 'single', 'ward',
35 |                            'complete', 'average', 'weighted', and 'centroid'. Set it to None if the features are to
36 |                            be generated as is by the ONC algorithm.
37 |     :param n_clusters: (int) Number of clusters to form. Must be less than the total number of features. If None, then it
38 |                        returns the optimal number of clusters decided by the ONC Algorithm.
39 |     :param critical_threshold: (float) Threshold for determining a low silhouette score in the dataset. It can be any real number
40 |                                in [-1, +1], default is 0, which means any feature that has a silhouette score below 0 will be
41 |                                identified as having a low silhouette, and hence the required transformation will be
42 |                                applied to correct it.
43 |     :return: (list) Feature subsets.
44 |     """
45 | 
46 |     pass
47 | 
48 | 
49 | def _cluster_transformation(X: pd.DataFrame, clusters: dict, feats_to_transform: list) -> pd.DataFrame:
50 |     """
51 |     Machine Learning for Asset Managers
52 |     Snippet 6.5.2.1, page 85. Step 1: Features Clustering (last paragraph)
53 | 
54 |     Transforms a dataset to reduce the multicollinearity of the system by replacing the original feature with
55 |     the residual from regression.
56 | 
57 |     :param X: (pd.DataFrame) Dataframe of features.
58 |     :param clusters: (dict) Clusters generated by the ONC algorithm.
59 |     :param feats_to_transform: (list) Features that have a low silhouette score and are to be transformed.
60 |     :return: (pd.DataFrame) Transformed features.
61 |     """
62 | 
63 |     pass
64 | 
65 | 
66 | def _combine_features(X, clusters, exclude_key) -> np.array:
67 |     """
68 |     Combines features of each cluster linearly by following a minimum variance weighting scheme.
69 |     The Minimum Variance weights are calculated without constraints, other than that the weights sum to one.
70 | 
71 |     :param X: (pd.DataFrame) Dataframe of features.
72 |     :param clusters: (dict) Clusters generated by the ONC algorithm.
73 |     :param exclude_key: (int) Key of the cluster which is to be excluded.
74 |     :return: (np.array) Combined features for each cluster.
75 |     """
76 | 
77 |     pass
78 | 
79 | 
80 | def _check_for_low_silhouette_scores(X: pd.DataFrame, dep_matrix: pd.DataFrame,
81 |                                      critical_threshold: float = 0.0) -> pd.DataFrame:
82 |     """
83 |     Machine Learning for Asset Managers
84 |     Snippet 6.5.2.1, page 85. Step 1: Features Clustering (last paragraph)
85 | 
86 |     Checks whether the dataset contains features with a low silhouette score due to one feature being a combination of
87 |     multiple features across clusters. This is a problem, because ONC cannot assign one feature to multiple
88 |     clusters, and it needs a transformation.
89 | 
90 |     :param X: (pd.DataFrame) Dataframe of features.
91 |     :param dep_matrix: (pd.DataFrame) Dataframe with dependencies between features.
92 |     :param critical_threshold: (float) Threshold for determining a low silhouette score.
93 |     :return: (pd.DataFrame) Dataframe of features.
94 |     """
95 | 
96 |     pass
97 | 
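# --- Illustrative usage sketch (not part of the library; the functions above
# are stubs in this public repo). A toy version of the hierarchical route the
# get_feature_clusters docstring describes: 'linear' dependence matrix ->
# 'angular' distance -> linkage -> flat clusters.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.normal(size=(500, 6)), columns=list('ABCDEF'))
    dep = X_demo.corr()                            # 'linear' dependence matrix
    dist = ((1 - dep) / 2.) ** 0.5                 # 'angular' distance metric
    link = linkage(squareform(dist.values, checks=False), method='ward')
    demo_clusters = fcluster(link, t=3, criterion='maxclust')  # e.g. n_clusters=3
    print(dict(zip(X_demo.columns, demo_clusters)))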
3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.cluster import hierarchy 7 | 8 | 9 | def optimal_hierarchical_cluster(mat: np.array, method: str = "ward") -> np.array: 10 | """ 11 | Calculates the optimal clustering of a matrix. 12 | 13 | It calculates the hierarchy clusters from the distance of the matrix. Then it calculates 14 | the optimal leaf ordering of the hierarchy clusters, and returns the optimally clustered matrix. 15 | 16 | It is reproduced with modifications from the following blog post: 17 | `Marti, G. (2020) TF 2.0 DCGAN for 100x100 financial correlation matrices [Online]. 18 | Available at: https://marti.ai/ml/2019/10/13/tf-dcgan-financial-correlation-matrices.html. 19 | (Accessed: 17 Aug 2020) 20 | `_ 21 | 22 | This method relies and acts as a wrapper for the `scipy.cluster.hierarchy` module. 23 | ``_ 24 | 25 | :param mat: (np.array/pd.DataFrame) Correlation matrix. 26 | :param method: (str) Method to calculate the hierarchy clusters. Can take the values 27 | ["single", "complete", "average", "weighted", "centroid", "median", "ward"]. 28 | :return: (np.array) Optimal hierarchy cluster matrix. 29 | """ 30 | 31 | pass 32 | -------------------------------------------------------------------------------- /mlfinlab/clustering/onc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimal Number of Clusters (ONC Algorithm) 3 | Detection of False Investment Strategies using Unsupervised Learning Methods 4 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3167017 5 | """ 6 | 7 | from typing import Union 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_samples 14 | 15 | 16 | def _improve_clusters(corr_mat: pd.DataFrame, clusters: dict, top_clusters: dict) -> Union[ 17 | pd.DataFrame, dict, pd.Series]: 18 | """ 19 | Improve number clusters using silh scores 20 | 21 | :param corr_mat: (pd.DataFrame) Correlation matrix 22 | :param clusters: (dict) Clusters elements 23 | :param top_clusters: (dict) Improved clusters elements 24 | :return: (tuple) [ordered correlation matrix, clusters, silh scores] 25 | """ 26 | 27 | pass 28 | 29 | 30 | def _cluster_kmeans_base(corr_mat: pd.DataFrame, max_num_clusters: int = 10, repeat: int = 10) -> Union[ 31 | pd.DataFrame, dict, pd.Series]: 32 | """ 33 | Initial clustering step using KMeans. 34 | 35 | :param corr_mat: (pd.DataFrame) Correlation matrix 36 | :param max_num_clusters: (int) Maximum number of clusters to search for. 37 | :param repeat: (int) Number of clustering algorithm repetitions. 38 | :return: (tuple) [ordered correlation matrix, clusters, silh scores] 39 | """ 40 | 41 | pass 42 | 43 | 44 | def _check_improve_clusters(new_tstat_mean: float, mean_redo_tstat: float, old_cluster: tuple, 45 | new_cluster: tuple) -> tuple: 46 | """ 47 | Checks cluster improvement condition based on t-statistic. 
48 | 
49 |     :param new_tstat_mean: (float) T-statistic
50 |     :param mean_redo_tstat: (float) Average t-statistic for cluster improvement
51 |     :param old_cluster: (tuple) Old cluster correlation matrix, optimized clusters, silhouette scores
52 |     :param new_cluster: (tuple) New cluster correlation matrix, optimized clusters, silhouette scores
53 |     :return: (tuple) Cluster
54 |     """
55 | 
56 |     pass
57 | 
58 | 
59 | def cluster_kmeans_top(corr_mat: pd.DataFrame, repeat: int = 10) -> Union[pd.DataFrame, dict, pd.Series, bool]:
60 |     """
61 |     Improves the initial clustering by leaving clusters with high scores unchanged and modifying clusters with
62 |     below-average scores.
63 | 
64 |     :param corr_mat: (pd.DataFrame) Correlation matrix
65 |     :param repeat: (int) Number of clustering algorithm repetitions.
66 |     :return: (tuple) [correlation matrix, optimized clusters, silhouette scores, boolean to rerun ONC]
67 |     """
68 | 
69 |     pass
70 | 
71 | 
72 | def get_onc_clusters(corr_mat: pd.DataFrame, repeat: int = 10) -> Union[pd.DataFrame, dict, pd.Series]:
73 |     """
74 |     Optimal Number of Clusters (ONC) algorithm described in the following paper:
75 |     `Marcos Lopez de Prado, Michael J. Lewis, Detection of False Investment Strategies Using Unsupervised
76 |     Learning Methods, 2015 `_;
77 |     The code is based on the code provided by the authors of the paper.
78 | 
79 |     The algorithm searches for the optimal number of clusters using the correlation matrix of elements as an input.
80 | 
81 |     The correlation matrix is transformed into a matrix of distances, and the K-Means algorithm is applied multiple times
82 |     with a different number of clusters. The results are evaluated on the t-statistics of the silhouette scores.
83 | 
84 |     The output of the algorithm is the reordered correlation matrix (clustered elements are placed close to each other),
85 |     the optimal clustering, and silhouette scores.
86 | 
87 |     :param corr_mat: (pd.DataFrame) Correlation matrix of features
88 |     :param repeat: (int) Number of clustering algorithm repetitions
89 |     :return: (tuple) [correlation matrix, optimized clusters, silhouette scores]
90 |     """
91 | 
92 |     pass
93 | 
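# --- Illustrative usage sketch (not part of the library; the functions above
# are stubs in this public repo). A toy version of the silhouette-based search
# the docstrings describe: correlation -> distance, KMeans for each candidate
# number of clusters, t-statistic of the silhouette scores as the criterion.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    demo_corr = pd.DataFrame(np.corrcoef(rng.normal(size=(120, 8)), rowvar=False))
    demo_dist = ((1 - demo_corr) / 2.) ** 0.5  # correlation to distance transform

    best_tstat, best_labels = -np.inf, None
    for num_clusters in range(2, len(demo_dist)):
        kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(demo_dist)
        silh = silhouette_samples(demo_dist, kmeans.labels_)
        tstat = silh.mean() / silh.std()  # higher means a better-separated clustering
        if tstat > best_tstat:
            best_tstat, best_labels = tstat, kmeans.labels_
    print(best_tstat, best_labels)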
3 | """ 4 | 5 | from mlfinlab.codependence.correlation import (angular_distance, absolute_angular_distance, squared_angular_distance, 6 | distance_correlation, kullback_leibler_distance, norm_distance) 7 | from mlfinlab.codependence.information import (get_mutual_info, get_optimal_number_of_bins, variation_of_information_score) 8 | from mlfinlab.codependence.codependence_matrix import (get_dependence_matrix, get_distance_matrix) 9 | from mlfinlab.codependence.gnpr_distance import (spearmans_rho, gpr_distance, gnpr_distance) 10 | from mlfinlab.codependence.optimal_transport import (optimal_transport_dependence) 11 | -------------------------------------------------------------------------------- /mlfinlab/codependence/codependence_matrix.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implementation lets user generate dependence and distance matrix based on the various methods of Information 3 | Codependence described in Cornell lecture notes on Codependence: 4 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from mlfinlab.codependence.information import variation_of_information_score, get_mutual_info 11 | from mlfinlab.codependence.correlation import distance_correlation 12 | from mlfinlab.codependence.gnpr_distance import spearmans_rho, gpr_distance, gnpr_distance 13 | from mlfinlab.codependence.optimal_transport import optimal_transport_dependence 14 | 15 | 16 | # pylint: disable=invalid-name 17 | 18 | def get_dependence_matrix(df: pd.DataFrame, dependence_method: str, theta: float = 0.5, 19 | n_bins: int = None, normalize: bool = True, 20 | estimator: str = 'standard', target_dependence: str = 'comonotonicity', 21 | gaussian_corr: float = 0.7, var_threshold: float = 0.2) -> pd.DataFrame: 22 | """ 23 | This function returns a dependence matrix for elements given in the dataframe using the chosen dependence method. 24 | 25 | List of supported algorithms to use for generating the dependence matrix: ``information_variation``, 26 | ``mutual_information``, ``distance_correlation``, ``spearmans_rho``, ``gpr_distance``, ``gnpr_distance``, 27 | ``optimal_transport``. 28 | 29 | :param df: (pd.DataFrame) Features. 30 | :param dependence_method: (str) Algorithm to be use for generating dependence_matrix. 31 | :param theta: (float) Type of information being tested in the GPR and GNPR distances. Falls in range [0, 1]. 32 | (0.5 by default) 33 | :param n_bins: (int) Number of bins for discretization in ``information_variation`` and ``mutual_information``, 34 | if None the optimal number will be calculated. (None by default) 35 | :param normalize: (bool) Flag used to normalize the result to [0, 1] in ``information_variation`` and 36 | ``mutual_information``. (True by default) 37 | :param estimator: (str) Estimator to be used for calculation in ``mutual_information``. 38 | [``standard``, ``standard_copula``, ``copula_entropy``] (``standard`` by default) 39 | :param target_dependence: (str) Type of target dependence to use in ``optimal_transport``. 40 | [``comonotonicity``, ``countermonotonicity``, ``gaussian``, 41 | ``positive_negative``, ``different_variations``, ``small_variations``] 42 | (``comonotonicity`` by default) 43 | :param gaussian_corr: (float) Correlation coefficient to use when creating ``gaussian`` and 44 | ``small_variations`` copulas. 
[from 0 to 1] (0.7 by default)
45 |     :param var_threshold: (float) Variation threshold to use in ``small_variations``.
46 |                           Sets the relative area of correlation in a copula. [from 0 to 1] (0.2 by default)
47 |     :return: (pd.DataFrame) Dependence matrix.
48 |     """
49 | 
50 |     pass
51 | 
52 | 
53 | def get_distance_matrix(X: pd.DataFrame, distance_metric: str = 'angular') -> pd.DataFrame:
54 |     """
55 |     Applies a distance operator to a dependence matrix.
56 | 
57 |     This allows turning a correlation matrix into a distance matrix. The distances used are true metrics.
58 | 
59 |     List of supported distance metrics to use for generating the distance matrix: ``angular``, ``squared_angular``,
60 |     and ``absolute_angular``.
61 | 
62 |     :param X: (pd.DataFrame) Dataframe to which the distance operator is to be applied.
63 |     :param distance_metric: (str) The distance metric to be used for generating the distance matrix.
64 |     :return: (pd.DataFrame) Distance matrix.
65 |     """
66 | 
67 |     pass
68 | 
-------------------------------------------------------------------------------- /mlfinlab/codependence/correlation.py: --------------------------------------------------------------------------------
1 | """
2 | Correlation-based distances and various modifications (angular, absolute, squared) described in Cornell lecture notes:
3 | Codependence: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes
4 | """
5 | 
6 | import numpy as np
7 | import pandas as pd
8 | from scipy.spatial.distance import squareform, pdist
9 | 
10 | 
11 | # pylint: disable=invalid-name
12 | 
13 | 
14 | def angular_distance(x: np.array, y: np.array) -> float:
15 |     """
16 |     Returns the angular distance between two vectors. Angular distance is a slight modification of the Pearson correlation
17 |     which satisfies metric conditions.
18 | 
19 |     Formula used for calculation:
20 | 
21 |     Ang_Distance = (1/2 * (1 - Corr))^(1/2)
22 | 
23 |     Read Cornell lecture notes for more information about angular distance:
24 |     https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
25 | 
26 |     :param x: (np.array/pd.Series) X vector.
27 |     :param y: (np.array/pd.Series) Y vector.
28 |     :return: (float) Angular distance.
29 |     """
30 | 
31 |     pass
32 | 
33 | 
34 | def absolute_angular_distance(x: np.array, y: np.array) -> float:
35 |     """
36 |     Returns the absolute angular distance between two vectors. It is a modification of angular distance where the absolute
37 |     value of the Pearson correlation coefficient is used.
38 | 
39 |     Formula used for calculation:
40 | 
41 |     Abs_Ang_Distance = (1/2 * (1 - abs(Corr)))^(1/2)
42 | 
43 |     Read Cornell lecture notes for more information about absolute angular distance:
44 |     https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
45 | 
46 |     :param x: (np.array/pd.Series) X vector.
47 |     :param y: (np.array/pd.Series) Y vector.
48 |     :return: (float) Absolute angular distance.
49 |     """
50 | 
51 |     pass
52 | 
53 | 
54 | def squared_angular_distance(x: np.array, y: np.array) -> float:
55 |     """
56 |     Returns the squared angular distance between two vectors. It is a modification of angular distance where the square of
57 |     the Pearson correlation coefficient is used.
58 | 
59 |     Formula used for calculation:
60 | 
61 |     Squared_Ang_Distance = (1/2 * (1 - (Corr)^2))^(1/2)
62 | 
63 |     Read Cornell lecture notes for more information about squared angular distance:
64 |     https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes.
65 | 
66 |     :param x: (np.array/pd.Series) X vector.
67 | :param y: (np.array/pd.Series) Y vector. 68 | :return: (float) Squared angular distance. 69 | """ 70 | 71 | pass 72 | 73 | 74 | def distance_correlation(x: np.array, y: np.array) -> float: 75 | """ 76 | Returns distance correlation between two vectors. Distance correlation captures both linear and non-linear 77 | dependencies. 78 | 79 | Formula used for calculation: 80 | 81 | Distance_Corr[X, Y] = dCov[X, Y] / (dCov[X, X] * dCov[Y, Y])^(1/2) 82 | 83 | dCov[X, Y] is the average Hadamard product of the doubly-centered Euclidean distance matrices of X, Y. 84 | 85 | Read Cornell lecture notes for more information about distance correlation: 86 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes. 87 | 88 | :param x: (np.array/pd.Series) X vector. 89 | :param y: (np.array/pd.Series) Y vector. 90 | :return: (float) Distance correlation coefficient. 91 | """ 92 | 93 | pass 94 | 95 | def kullback_leibler_distance(corr_a, corr_b): 96 | """ 97 | Returns the Kullback-Leibler distance between two correlation matrices; all elements must be positive. 98 | Formula used for calculation: 99 | kullback_leibler_distance[X, Y] = 0.5 * ( Log( det(Y) / det(X) ) + tr((Y ^ -1).X) - n ) 100 | Where n is the dimension of the space spanned by X. 101 | Read Don H. Johnson's research paper for more information on the Kullback-Leibler distance. 102 | 103 | 104 | :param corr_a: (np.array/pd.Series/pd.DataFrame) Numpy array of the first correlation matrix. 105 | :param corr_b: (np.array/pd.Series/pd.DataFrame) Numpy array of the second correlation matrix. 106 | :return: (np.float64) The Kullback-Leibler distance between the two matrices. 107 | """ 108 | 109 | pass 110 | 111 | 112 | def norm_distance(matrix_a, matrix_b, r_val=2): 113 | """ 114 | Returns the normalized distance between two matrices. 115 | This function is a wrapper for numpy's linear algebra method (numpy.linalg.norm). 116 | 117 | Formula used to normalize the matrix: 118 | norm_distance[X, Y] = sum( abs(X - Y) ^ r ) ^ (1/r) 119 | Where r is a parameter. r=1 City block (L1 norm), r=2 Euclidean distance (L2 norm), 120 | r=inf Supremum (L_inf norm). For values of r < 1, the result is not really a mathematical ‘norm’. 121 | 122 | :param matrix_a: (np.array/pd.Series/pd.DataFrame) Array of the first matrix. 123 | :param matrix_b: (np.array/pd.Series/pd.DataFrame) Array of the second matrix. 124 | :param r_val: (int/str) The r value of the normalization formula. (``2`` by default, Any Integer) 125 | :return: (np.float64) The norm distance between the two matrices. 126 | """ 127 | 128 | pass -------------------------------------------------------------------------------- /mlfinlab/codependence/gnpr_distance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of distance using the Generic Non-Parametric Representation approach from "Some contributions to the 3 | clustering of financial time series and applications to credit default swaps" by Gautier Marti 4 | https://www.researchgate.net/publication/322714557 5 | """ 6 | import numpy as np 7 | import pandas as pd 8 | from scipy.stats import spearmanr 9 | import ot 10 | 11 | # pylint: disable=invalid-name 12 | 13 | 14 | def spearmans_rho(x: np.array, y: np.array) -> float: 15 | """ 16 | Calculates a statistical estimate of Spearman's rho - a copula-based dependence measure.
17 | 18 | Formula for calculation: 19 | rho = 1 - (6)/(T*(T^2-1)) * Sum((X_t-Y_t)^2) 20 | 21 | It is more robust to noise and can be defined if the variables have an infinite second moment. 22 | This statistic is described in more detail in the work by Gautier Marti 23 | https://www.researchgate.net/publication/322714557 (p.54) 24 | 25 | This method is a wrapper for the scipy spearmanr function. For more details about the function and its parameters, 26 | please visit scipy documentation 27 | https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.spearmanr.html 28 | 29 | :param x: (np.array/pd.Series) X vector 30 | :param y: (np.array/pd.Series) Y vector (same number of observations as X) 31 | :return: (float) Spearman's rho statistical estimate 32 | """ 33 | 34 | # Coefficient calculation 35 | 36 | pass 37 | 38 | 39 | def gpr_distance(x: np.array, y: np.array, theta: float) -> float: 40 | """ 41 | Calculates the distance between two Gaussians under the Generic Parametric Representation (GPR) approach. 42 | 43 | According to the original work https://www.researchgate.net/publication/322714557 (p.70): 44 | "This is a fast and good proxy for distance d_theta when the first two moments ... predominate". But it's not 45 | a good metric for heavy-tailed distributions. 46 | 47 | Parameter theta defines what type of information dependency is being tested: 48 | - for theta = 0 the distribution information is tested 49 | - for theta = 1 the dependence information is tested 50 | - for theta = 0.5 a mix of both information types is tested 51 | 52 | With theta in [0, 1] the distance lies in the range [0, 1] and is a metric. (See original work for proof, p.71) 53 | 54 | :param x: (np.array/pd.Series) X vector. 55 | :param y: (np.array/pd.Series) Y vector (same number of observations as X). 56 | :param theta: (float) Type of information being tested. Falls in range [0, 1]. 57 | :return: (float) Distance under GPR approach. 58 | """ 59 | 60 | pass 61 | 62 | 63 | def gnpr_distance(x: np.array, y: np.array, theta: float, n_bins: int = 50) -> float: 64 | """ 65 | Calculates the empirical distance between two random variables under the Generic Non-Parametric Representation 66 | (GNPR) approach. 67 | 68 | Formula for the distance is taken from https://www.researchgate.net/publication/322714557 (p.72). 69 | 70 | Parameter theta defines what type of information dependency is being tested: 71 | - for theta = 0 the distribution information is tested 72 | - for theta = 1 the dependence information is tested 73 | - for theta = 0.5 a mix of both information types is tested 74 | 75 | With theta in [0, 1] the distance lies in the range [0, 1] and is a metric. 76 | (See original work for proof, p.71) 77 | 78 | This method is modified as it uses the 1D Optimal Transport Distance to measure 79 | distribution distance. This solves the issue of defining support and choosing 80 | a number of bins. The number of bins can be given as an input to speed up calculations; 81 | a big number of bins can take a long time to calculate. 82 | 83 | :param x: (np.array/pd.Series) X vector. 84 | :param y: (np.array/pd.Series) Y vector (same number of observations as X). 85 | :param theta: (float) Type of information being tested. Falls in range [0, 1]. 86 | :param n_bins: (int) Number of bins to use to split the X and Y vector observations. 87 | (50 by default) 88 | :return: (float) Distance under GNPR approach.
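Example of the intended usage (an illustrative sketch only - the random vectors below are hypothetical and not part of the library)::

    import numpy as np
    from mlfinlab.codependence.gnpr_distance import spearmans_rho, gnpr_distance

    x = np.random.normal(size=1000)
    y = 0.5 * x + np.random.normal(size=1000)

    rho = spearmans_rho(x, y)              # Copula-based dependence estimate.
    dist = gnpr_distance(x, y, theta=0.5)  # Mix of distribution and dependence information.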
89 | """ 90 | 91 | pass 92 | -------------------------------------------------------------------------------- /mlfinlab/codependence/information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementations of mutual information (I) and variation of information (VI) codependence measures from Cornell 3 | lecture slides: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes 4 | """ 5 | import numpy as np 6 | import scipy.stats as ss 7 | from sklearn.metrics import mutual_info_score 8 | 9 | 10 | # pylint: disable=invalid-name 11 | 12 | def get_optimal_number_of_bins(num_obs: int, corr_coef: float = None) -> int: 13 | """ 14 | Calculates optimal number of bins for discretization based on number of observations 15 | and correlation coefficient (univariate case). 16 | 17 | Algorithms used in this function were originally proposed in the works of Hacine-Gharbi et al. (2012) 18 | and Hacine-Gharbi and Ravier (2018). They are described in the Cornell lecture notes: 19 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes (p.26) 20 | 21 | :param num_obs: (int) Number of observations. 22 | :param corr_coef: (float) Correlation coefficient, used to estimate the number of bins for univariate case. 23 | :return: (int) Optimal number of bins. 24 | """ 25 | 26 | pass 27 | 28 | 29 | def get_mutual_info(x: np.array, y: np.array, n_bins: int = None, normalize: bool = False, 30 | estimator: str = 'standard') -> float: 31 | """ 32 | Returns mutual information (MI) between two vectors. 33 | 34 | This function uses the discretization with the optimal bins algorithm proposed in the works of 35 | Hacine-Gharbi et al. (2012) and Hacine-Gharbi and Ravier (2018). 36 | 37 | Read Cornell lecture notes for more information about the mutual information: 38 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes. 39 | 40 | This function supports multiple ways the mutual information can be estimated: 41 | 42 | 1. ``standard`` - the standard way of estimation - binning observations according to a given 43 | number of bins and applying the MI formula. 44 | 2. ``standard_copula`` - estimating the copula (as a normalized ranking of the observations) and 45 | applying the standard mutual information estimator on it. 46 | 3. ``copula_entropy`` - estimating the copula (as a normalized ranking of the observations) and 47 | calculating its entropy. Then MI estimator = (-1) * copula entropy. 48 | 49 | The last two estimators' implementation is taken from the blog post by Dr. Gautier Marti. 50 | Read this blog post for more information about the differences in the estimators: 51 | https://gmarti.gitlab.io/qfin/2020/07/01/mutual-information-is-copula-entropy.html 52 | 53 | :param x: (np.array) X vector. 54 | :param y: (np.array) Y vector. 55 | :param n_bins: (int) Number of bins for discretization, if None the optimal number will be calculated. 56 | (None by default) 57 | :param normalize: (bool) Flag used to normalize the result to [0, 1]. (False by default) 58 | :param estimator: (str) Estimator to be used for calculation. [``standard``, ``standard_copula``, ``copula_entropy``] 59 | (``standard`` by default) 60 | :return: (float) Mutual information score. 61 | """ 62 | 63 | pass 64 | 65 | 66 | def variation_of_information_score(x: np.array, y: np.array, n_bins: int = None, normalize: bool = False) -> float: 67 | """ 68 | Returns variantion of information (VI) between two vectors. 
69 | 70 | This function uses discretization with the optimal-bins algorithm proposed in the works of 71 | Hacine-Gharbi et al. (2012) and Hacine-Gharbi and Ravier (2018). 72 | 73 | Read Cornell lecture notes for more information about the variation of information: 74 | https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3512994&download=yes. 75 | 76 | :param x: (np.array) X vector. 77 | :param y: (np.array) Y vector. 78 | :param n_bins: (int) Number of bins for discretization, if None the optimal number will be calculated. 79 | (None by default) 80 | :param normalize: (bool) True to normalize the result to [0, 1]. (False by default) 81 | :return: (float) Variation of information score. 82 | """ 83 | 84 | pass 85 | -------------------------------------------------------------------------------- /mlfinlab/codependence/optimal_transport.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementations of the Optimal Copula Transport dependence measure proposed by Marti et al.: https://arxiv.org/abs/1610.09659 3 | and implemented in the blog post by Marti: https://gmarti.gitlab.io/qfin/2020/06/25/copula-optimal-transport-dependence.html 4 | """ 5 | import numpy as np 6 | import scipy.stats as ss 7 | import ot 8 | 9 | 10 | # pylint: disable=invalid-name 11 | 12 | def _get_empirical_copula(x: np.array, y: np.array) -> np.array: 13 | """ 14 | Calculate the empirical copula using ranked observations. 15 | 16 | :param x: (np.array) X vector. 17 | :param y: (np.array) Y vector. 18 | :return: (np.array) Empirical copula. 19 | """ 20 | 21 | pass 22 | 23 | 24 | def optimal_transport_dependence(x: np.array, y: np.array, target_dependence: str = 'comonotonicity', 25 | gaussian_corr: float = 0.7, var_threshold: float = 0.2) -> float: 26 | """ 27 | Calculates optimal copula transport dependence between the empirical copula of the two vectors and a target copula. 28 | 29 | This implementation is based on the blog post by Marti: 30 | https://gmarti.gitlab.io/qfin/2020/06/25/copula-optimal-transport-dependence.html 31 | 32 | The target and forget copulas are used as reference points: the measure reflects where the empirical 33 | copula stands between them in the space of copulas. The forget copula used is the copula associated with 34 | independent random variables. The target copula is defined by the target_dependence parameter. 35 | 36 | Currently, these target_dependence copulas are supported: 37 | 38 | - ``comonotonicity`` - a comonotone copula. 39 | - ``countermonotonicity`` - a countermonotone copula. 40 | - ``gaussian`` - a Gaussian copula with a custom correlation coefficient. 41 | - ``positive_negative`` - a copula of both positive and negative correlation. 42 | - ``different_variations`` - a copula with some elements having extreme variations, 43 | while those of others are relatively small, and conversely. 44 | - ``small_variations`` - a copula with elements being positively correlated for small variations 45 | but uncorrelated otherwise. 46 | - ``v-shape`` - a copula that is seen with a vol index vs. a returns index: when returns of the index 47 | are extreme, vol is usually high; when returns are small in absolute value, vol is usually low. 48 | 49 | :param x: (np.array) X vector. 50 | :param y: (np.array) Y vector. 51 | :param target_dependence: (str) Type of target dependence to use when measuring distance. 52 | (``comonotonicity`` by default) 53 | :param gaussian_corr: (float) Correlation coefficient to use when creating ``gaussian`` and 54 | ``small_variations`` copulas.
[from 0 to 1] (0.7 by default) 55 | :param var_threshold: (float) Variation threshold to use in ``small_variations``. 56 | Sets the relative area of correlation in a copula. [from 0 to 1] (0.2 by default) 57 | :return: (float) Optimal copula transport dependence. 58 | """ 59 | 60 | pass 61 | 62 | 63 | def _compute_copula_ot_dependence(empirical: np.array, target: np.array, forget: np.array, 64 | n_obs: int) -> float: 65 | """ 66 | Calculates the optimal copula transport dependence measure. 67 | 68 | :param empirical: (np.array) Empirical copula. 69 | :param target: (np.array) Target copula. 70 | :param forget: (np.array) Forget copula. 71 | :param n_obs: (int) Number of observations. 72 | :return: (float) Optimal copula transport dependence. 73 | """ 74 | 75 | pass 76 | 77 | 78 | def _create_target_copula(target_dependence: str, n_obs: int, gauss_corr: float, 79 | var_threshold: float) -> np.array: 80 | """ 81 | Creates a target copula with given dependence and number of observations. 82 | 83 | :param target_dependence: (str) Type of dependence to use for copula creation. [``comonotonicity``, 84 | ``countermonotonicity``, ``gaussian``, ``positive_negative``, 85 | ``different_variations``, ``small_variations``, ``v-shape``] 86 | :param n_obs: (int) Number of observations to use for copula creation. 87 | :param gauss_corr: (float) Correlation coefficient to use when creating ``gaussian`` and 88 | ``small_variations`` copulas. 89 | :param var_threshold: (float) Variation threshold to use in ``small_variations``. 90 | :return: (np.array) Resulting copula. 91 | """ 92 | 93 | pass 94 | -------------------------------------------------------------------------------- /mlfinlab/cross_validation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 7: Cross Validation 3 | and stacked (multi-asset datasets) cross-validation functions. 4 | """ 5 | 6 | from mlfinlab.cross_validation.cross_validation import (ml_get_train_times, ml_cross_val_score, stacked_ml_cross_val_score, 7 | PurgedKFold, StackedPurgedKFold) 8 | from mlfinlab.cross_validation.combinatorial import (CombinatorialPurgedKFold, StackedCombinatorialPurgedKFold) 9 | -------------------------------------------------------------------------------- /mlfinlab/cross_validation/combinatorial.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the following classes from Chapter 12 of AFML: 3 | 4 | - Combinatorial Purged Cross-Validation class. 5 | - Stacked Combinatorial Purged Cross-Validation class. 6 | """ 7 | # pylint: disable=too-many-locals, arguments-differ, invalid-name, unused-argument 8 | 9 | from itertools import combinations 10 | from typing import List 11 | 12 | import pandas as pd 13 | import numpy as np 14 | from scipy.special import comb 15 | from sklearn.model_selection import KFold 16 | 17 | from mlfinlab.cross_validation.cross_validation import ml_get_train_times 18 | 19 | 20 | def _get_number_of_backtest_paths(n_train_splits: int, n_test_splits: int) -> int: 21 | """ 22 | Number of combinatorial paths for CPCV(N,K). 23 | 24 | :param n_train_splits: (int) Number of train splits. 25 | :param n_test_splits: (int) Number of test splits. 26 | :return: (int) Number of backtest paths for CPCV(N,K). 27 | """ 28 | 29 | pass 30 | 31 | 32 | class CombinatorialPurgedKFold(KFold): 33 | """ 34 | Advances in Financial Machine Learning, Chapter 12.
35 | 36 | Implements Combinatorial Purged Cross Validation (CPCV). 37 | 38 | The train is purged of observations overlapping test-label intervals. 39 | Test set is assumed contiguous (shuffle=False), w/o training samples in between. 40 | """ 41 | 42 | def __init__(self, 43 | n_splits: int = 3, 44 | n_test_splits: int = 2, 45 | samples_info_sets: pd.Series = None, 46 | pct_embargo: float = 0.): 47 | """ 48 | Initialize. 49 | 50 | :param n_splits: (int) The number of splits. Defaults to 3. :param n_test_splits: (int) The number of test splits. Defaults to 2. 51 | :param samples_info_sets: (pd.Series) The information range on which each record is constructed from: 52 | *samples_info_sets.index*: Time when the information extraction started. 53 | *samples_info_sets.value*: Time when the information extraction ended. 54 | :param pct_embargo: (float) Percent that determines the embargo size. 55 | """ 56 | 57 | pass 58 | 59 | def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List: 60 | """ 61 | Using start and end indices of test splits from KFolds and the number of test splits (self.n_test_splits), 62 | generates combinatorial test range splits. 63 | 64 | :param splits_indices: (dict) Test fold integer index: [start test index, end test index]. 65 | :return: (list) Combinatorial test splits ([start index, end index]). 66 | """ 67 | 68 | pass 69 | 70 | def _fill_backtest_paths(self, train_indices: list, test_splits: list): 71 | """ 72 | Using start and end indices of test splits and purged/embargoed train indices from CPCV, find the backtest path and 73 | place in the path where these indices should be used. 74 | 75 | :param train_indices: (list) Purged/embargoed train indices for this CPCV split. :param test_splits: (list) List of lists with first element corresponding to test start index and second - test end. 76 | """ 77 | 78 | pass 79 | 80 | def split(self, 81 | X: pd.DataFrame, 82 | y: pd.Series = None, 83 | groups=None) -> tuple: 84 | """ 85 | The main method to call for the CombinatorialPurgedKFold class. 86 | 87 | :param X: (pd.DataFrame) Samples dataset that is to be split. 88 | :param y: (pd.Series) Sample labels series. 89 | :param groups: (array-like), with shape (n_samples,), optional 90 | Group labels for the samples used while splitting the dataset into 91 | train/test set. 92 | :return: (tuple) [train list of sample indices, and test list of sample indices]. 93 | """ 94 | 95 | pass 96 | 97 | 98 | class StackedCombinatorialPurgedKFold(KFold): 99 | """ 100 | Advances in Financial Machine Learning, Chapter 12. 101 | 102 | Implements Stacked Combinatorial Purged Cross Validation (CPCV). It implements CPCV for multi-asset datasets. 103 | 104 | The train is purged of observations overlapping test-label intervals. 105 | Test set is assumed contiguous (shuffle=False), w/o training samples in between. 106 | """ 107 | 108 | def __init__(self, 109 | n_splits: int = 3, 110 | n_test_splits: int = 2, 111 | samples_info_sets_dict: dict = None, 112 | pct_embargo: float = 0.): 113 | """ 114 | Initialize. 115 | 116 | :param n_splits: (int) The number of splits. Defaults to 3. :param n_test_splits: (int) The number of test splits. Defaults to 2. 117 | :param samples_info_sets_dict: (dict) Dictionary of samples info sets. 118 | ASSET_1: SAMPLE_INFO_SETS, ASSET_2:... 119 | 120 | *samples_info_sets.index*: Time when the information extraction started. 121 | *samples_info_sets.value*: Time when the information extraction ended. 122 | :param pct_embargo: (float) Percent that determines the embargo size.
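Example of the intended usage (an illustrative sketch; the asset names and series variables are hypothetical, and the path count follows the AFML Chapter 12 formula K/N * C(N, N-K))::

    samples_info_sets_dict = {
        'ASSET_1': samples_info_sets_asset_1,  # pd.Series of label end times, indexed by start times.
        'ASSET_2': samples_info_sets_asset_2,
    }
    cv = StackedCombinatorialPurgedKFold(n_splits=6, n_test_splits=2,
                                         samples_info_sets_dict=samples_info_sets_dict,
                                         pct_embargo=0.01)
    # CPCV(6, 2) yields 2/6 * C(6, 4) = 5 backtest paths.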
123 | """ 124 | 125 | pass 126 | 127 | def _fill_backtest_paths(self, asset, train_indices: list, test_splits: list): 128 | """ 129 | Using start and end indices of test splits and purged/embargoed train indices from CPCV, find backtest path and 130 | place in the path where these indices should be used. 131 | 132 | :param asset: (str) Asset for which backtest paths are filled. 133 | :param train_indices: (list) List of lists with first element corresponding to train start index, second - test end. 134 | :param test_splits: (list) List of lists with first element corresponding to test start index and second - test end. 135 | """ 136 | 137 | pass 138 | 139 | def _generate_combinatorial_test_ranges(self, splits_indices: dict) -> List: 140 | """ 141 | Using start and end indices of test splits from KFolds and number of test_splits (self.n_test_splits), 142 | generates combinatorial test ranges splits. 143 | 144 | :param splits_indices: (dict) Test fold integer index: [start test index, end test index]. 145 | :return: (list) Combinatorial test splits ([start index, end index]). 146 | """ 147 | 148 | pass 149 | 150 | def split(self, 151 | X_dict: dict, 152 | y_dict: dict = None, 153 | groups=None) -> tuple: 154 | """ 155 | The main method to call for the PurgedKFold class. 156 | 157 | :param X_dict: (dict) Dictionary of asset : X_{asset}. 158 | :param y_dict: (dict) Dictionary of asset : y_{asset}. 159 | :param groups: (array-like), with shape (n_samples,), optional 160 | Group labels for the samples used while splitting the dataset into 161 | train/test set. 162 | :return: (tuple) [train list of sample indices, and test list of sample indices]. 163 | """ 164 | 165 | pass 166 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for synthetic data generation. 3 | """ 4 | 5 | from mlfinlab.data_generation.corrgan import sample_from_corrgan 6 | from mlfinlab.data_generation.data_verification import (plot_pairwise_dist, plot_eigenvalues, plot_eigenvectors, 7 | plot_hierarchical_structure, plot_mst_degree_count, plot_stylized_facts, 8 | plot_time_series_dependencies, plot_optimal_hierarchical_cluster) 9 | from mlfinlab.data_generation.vines import (sample_from_cvine, sample_from_dvine, sample_from_ext_onion) 10 | from mlfinlab.data_generation.correlated_random_walks import generate_cluster_time_series 11 | from mlfinlab.data_generation.hcbm import (time_series_from_dist, generate_hcmb_mat) 12 | from mlfinlab.data_generation.bootstrap import (row_bootstrap, pair_bootstrap, block_bootstrap) 13 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/bootstrap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of generating bootstrapped matrices from 3 | "Bootstrap validation of links of a minimum spanning tree" by F. Musciotto, 4 | L. Marotta, S. Miccichè, and R. N. Mantegna https://arxiv.org/pdf/1802.03395.pdf. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | def row_bootstrap(mat, n_samples=1, size=None): 12 | """ 13 | Uses the Row Bootstrap method to generate a new matrix of size equal or smaller than the given matrix. 14 | 15 | It samples with replacement a random row from the given matrix. 
If the required number of bootstrapped 16 | columns is less than the number of columns of the original matrix, it randomly samples contiguous 17 | columns of the required size. It cannot generate a matrix greater than the original. 18 | 19 | It is inspired by the following paper: 20 | Musciotto, F., Marotta, L., Miccichè, S. and Mantegna, R.N., 2018. Bootstrap validation of 21 | links of a minimum spanning tree. Physica A: Statistical Mechanics and its Applications, 22 | 512, pp.1032-1043. 23 | 24 | :param mat: (pd.DataFrame/np.array) Matrix to sample from. 25 | :param n_samples: (int) Number of matrices to generate. 26 | :param size: (tuple) Size of the bootstrapped matrix. 27 | :return: (np.array) The generated bootstrapped matrices. Has shape (n_samples, size[0], size[1]). 28 | """ 29 | 30 | pass 31 | 32 | 33 | def pair_bootstrap(mat, n_samples=1, size=None): 34 | """ 35 | Uses the Pair Bootstrap method to generate a new correlation matrix of returns. 36 | 37 | It generates a correlation matrix based on the number of columns of the returns matrix given. It 38 | samples with replacement a pair of columns from the original matrix; the rows of the pair generate 39 | a new row-bootstrapped matrix. The correlation value of the pair of assets is calculated and 40 | its value is used to fill the corresponding value in the generated correlation matrix. 41 | 42 | It is inspired by the following paper: 43 | Musciotto, F., Marotta, L., Miccichè, S. and Mantegna, R.N., 2018. Bootstrap validation of 44 | links of a minimum spanning tree. Physica A: Statistical Mechanics and its Applications, 45 | 512, pp.1032-1043. 46 | 47 | :param mat: (pd.DataFrame/np.array) Returns matrix to sample from. 48 | :param n_samples: (int) Number of matrices to generate. 49 | :param size: (int) Size of the bootstrapped correlation matrix. 50 | :return: (np.array) The generated bootstrapped correlation matrices. Has shape (n_samples, mat.shape[1], mat.shape[1]). 51 | """ 52 | 53 | pass 54 | 55 | 56 | def block_bootstrap(mat, n_samples=1, size=None, block_size=None): 57 | """ 58 | Uses the Block Bootstrap method to generate a new matrix of size equal to or smaller than the given matrix. 59 | 60 | It divides the original matrix into blocks of the given size. It samples with replacement random 61 | blocks to populate the bootstrapped matrix. It cannot generate a matrix greater than the original. 62 | 63 | It is inspired by the following paper: 64 | Künsch, H.R., 1989. The jackknife and the bootstrap for general stationary observations. 65 | Annals of Statistics, 17(3), pp.1217-1241. 66 | 67 | :param mat: (pd.DataFrame/np.array) Matrix to sample from. 68 | :param n_samples: (int) Number of matrices to generate. 69 | :param size: (tuple) Size of the bootstrapped matrix. 70 | :param block_size: (tuple) Size of the blocks. 71 | :return: (np.array) The generated bootstrapped matrices. Has shape (n_samples, size[0], size[1]). 72 | """ 73 | 74 | pass 75 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/correlated_random_walks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains methods for generating correlated random walks.
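Example of the intended usage (an illustrative sketch; the parameter values are arbitrary)::

    from mlfinlab.data_generation.correlated_random_walks import generate_cluster_time_series

    df = generate_cluster_time_series(n_series=10, t_samples=200,
                                      k_corr_clusters=2, d_dist_clusters=2)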
3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | def generate_cluster_time_series(n_series, t_samples=100, k_corr_clusters=1, 10 | d_dist_clusters=1, rho_main=0.1, rho_corr=0.3, price_start=100.0, 11 | dists_clusters=("normal", "normal", "student-t", "normal", "student-t")): 12 | """ 13 | Generates a synthetic time series of correlation and distribution clusters. 14 | 15 | It is reproduced with modifications from the following paper: 16 | `Donnat, P., Marti, G. and Very, P., 2016. Toward a generic representation of random 17 | variables for machine learning. Pattern Recognition Letters, 70, pp.24-31. 18 | `_ 19 | 20 | `www.datagrapple.com. (n.d.). DataGrapple - Tech: A GNPR tutorial: How to cluster random walks. 21 | [online] Available at: [Accessed 26 Aug. 2020]. 22 | `_ 23 | 24 | This method creates `n_series` time series of length `t_samples`. Each time series is divided 25 | into `k_corr_clusters` correlation clusters. Each correlation cluster is subdivided into 26 | `d_dist_clusters` distribution clusters. 27 | A main distribution is sampled from a normal distribution with mean = 0 and stdev = 1, adjusted 28 | by a `rho_main` factor. The correlation clusters are sampled from a given distribution, are generated 29 | once, and adjusted by a `rho_corr` factor. The distribution clusters are sampled from other 30 | given distributions, and adjusted by (1 - `rho_main` - `rho_corr`). They are sampled for each time series. 31 | These three series are added together to form a time series of returns. The final time series 32 | is the cumulative sum of the returns, with a start price given by `price_start`. 33 | 34 | :param n_series: (int) Number of time series to generate. 35 | :param t_samples: (int) Number of samples in each time series. 36 | :param k_corr_clusters: (int) Number of correlation clusters in each time series. 37 | :param d_dist_clusters: (int) Number of distribution clusters in each time series. 38 | :param rho_main: (float): Strength of main time series distribution. 39 | :param rho_corr: (float): Strength of correlation cluster distribution. 40 | :param price_start: (float) Starting price of the time series. 41 | :param dists_clusters: (list) List containing the names of the distributions to sample from. 42 | The following numpy distributions are available: "normal" = normal(0, 1), "normal_2" = normal(0, 2), 43 | "student-t" = standard_t(3)/sqrt(3), "laplace" = laplace(1/sqrt(2)). The first disitribution 44 | is used to sample for the correlation clusters (k_corr_clusters), the remaining ones are used 45 | to sample for the distribution clusters (d_dist_clusters). 46 | :return: (pd.DataFrame) Generated time series. Has size (t_samples, n_series). 47 | """ 48 | 49 | pass 50 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/corrgan.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019, Hudson and Thames Quantitative Research 2 | # All rights reserved 3 | # Read more: https://github.com/hudson-and-thames/mlfinlab/blob/master/LICENSE.txt 4 | """ 5 | Implementation of sampling realistic financial correlation matrices from 6 | "CorrGAN: Sampling Realistic Financial Correlation Matrices using 7 | Generative Adversarial Networks" by Gautier Marti. 
8 | https://arxiv.org/pdf/1910.09504.pdf 9 | """ 10 | from os import listdir, path 11 | import numpy as np 12 | from scipy.cluster import hierarchy 13 | from statsmodels.stats.correlation_tools import corr_nearest 14 | 15 | 16 | def sample_from_corrgan(model_loc, dim=10, n_samples=1): 17 | """ 18 | Samples correlation matrices from the pre-trained CorrGAN network. 19 | 20 | It is reproduced with modifications from the following paper: 21 | Marti, G., 2020, May. CorrGAN: Sampling Realistic Financial Correlation Matrices Using 22 | Generative Adversarial Networks. In ICASSP 2020-2020 IEEE International Conference on 23 | Acoustics, Speech and Signal Processing (ICASSP) (pp. 8459-8463). IEEE. 24 | 25 | 26 | It loads the appropriate CorrGAN model for the required dimension, generates a matrix output 27 | from this network, symmetrizes this matrix, and finds the nearest correlation matrix 28 | that is positive semi-definite. Finally, it maximizes the sum of the similarities between 29 | adjacent leaves to arrange it with hierarchical clustering. 30 | 31 | The CorrGAN network was trained on the correlation profiles of the S&P 500 stocks. Therefore 32 | the output retains these properties. In addition, the final output retains the following 33 | 6 stylized facts: 34 | 35 | 1. Distribution of pairwise correlations is significantly shifted to the positive. 36 | 37 | 2. Eigenvalues follow the Marchenko-Pastur distribution, but for a very large first 38 | eigenvalue (the market). 39 | 40 | 3. Eigenvalues follow the Marchenko-Pastur distribution, but for a couple of other 41 | large eigenvalues (industries). 42 | 43 | 4. Perron-Frobenius property (first eigenvector has positive entries). 44 | 45 | 5. Hierarchical structure of correlations. 46 | 47 | 6. Scale-free property of the corresponding Minimum Spanning Tree (MST). 48 | 49 | :param model_loc: (str) Location of folder containing CorrGAN models. 50 | :param dim: (int) Dimension of correlation matrix to sample. 51 | In the range [2, 200]. 52 | :param n_samples: (int) Number of samples to generate. 53 | :return: (np.array) Sampled correlation matrices of shape (n_samples, dim, dim). 54 | """ 55 | 56 | pass 57 | -------------------------------------------------------------------------------- /mlfinlab/data_generation/hcbm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of the Hierarchical Correlation Block Model (HCBM) matrix. 3 | "Clustering financial time series: How long is enough?" by Marti, G., Andler, S., Nielsen, F. and Donnat, P. 4 | https://www.ijcai.org/Proceedings/16/Papers/367.pdf 5 | """ 6 | import numpy as np 7 | import pandas as pd 8 | from statsmodels.sandbox.distributions.multivariate import multivariate_t_rvs 9 | 10 | 11 | def _hcbm_mat_helper(mat, n_low=0, n_high=214, rho_low=0.1, rho_high=0.9, blocks=4, depth=4): 12 | """ 13 | Helper function for `generate_hcmb_mat` that recursively places rho values into the HCBM matrix 14 | given as an input. 15 | 16 | By using a uniform distribution we select the start and end locations of the blocks in the 17 | matrix. For each block, we recurse depth times and repeat splitting up the sub-matrix into 18 | blocks. Each depth level has a unique correlation (rho) value generated from a uniform 19 | distribution, bounded by `rho_low` and `rho_high`. This function modifies the 20 | `mat` parameter in place (it works by side effect). 21 | 22 | It is reproduced with modifications from the following paper: 23 | Marti, G., Andler, S., Nielsen, F.
and Donnat, P., 2016. 24 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017. 25 | 26 | 27 | :param mat: (np.array) Parent HCBM matrix. 28 | :param n_low: (int) Start location of the HCBM matrix to work on. 29 | :param n_high: (int) End location of the HCBM matrix to work on. 30 | :param rho_low: (float) Lower correlation bound of the matrix. Must be greater than or equal 31 | to 0. 32 | :param rho_high: (float) Upper correlation bound of the matrix. Must be less than or equal to 1. 33 | :param blocks: (int) Maximum number of blocks to generate per level of depth. 34 | :param depth: (int) Depth of recursion for generating new blocks. 35 | """ 36 | 37 | pass 38 | 39 | 40 | def generate_hcmb_mat(t_samples, n_size, rho_low=0.1, rho_high=0.9, blocks=4, depth=4, permute=False): 41 | """ 42 | Generates a Hierarchical Correlation Block Model (HCBM) matrix of correlation values. 43 | 44 | By using a uniform distribution we select the start and end locations of the blocks in the 45 | matrix. For each block, we recurse depth times and repeat splitting up the sub-matrix into 46 | blocks. Each depth level has a unique correlation (rho) value generated from a uniform 47 | distribution, bounded by `rho_low` and `rho_high`. 48 | 49 | It is reproduced with modifications from the following paper: 50 | Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016. 51 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017. 52 | 53 | 54 | :param t_samples: (int) Number of HCBM matrices to generate. 55 | :param n_size: (int) Size of the HCBM matrix. 56 | :param rho_low: (float) Lower correlation bound of the matrix. Must be greater than or equal to 0. 57 | :param rho_high: (float) Upper correlation bound of the matrix. Must be less than or equal to 1. 58 | :param blocks: (int) Number of blocks to generate per level of depth. 59 | :param depth: (int) Depth of recursion for generating new blocks. 60 | :param permute: (bool) Whether to permute the final HCBM matrix. 61 | :return: (np.array) Generated HCBM matrix of shape (t_samples, n_size, n_size). 62 | """ 63 | 64 | pass 65 | 66 | 67 | def time_series_from_dist(corr, t_samples=1000, dist="normal", deg_free=3): 68 | """ 69 | Generates a time series from a given correlation matrix. 70 | 71 | It uses multivariate sampling from distributions to create the time series. It supports 72 | normal and student-t distributions. This method relies on and acts as a wrapper for the 73 | `np.random.multivariate_normal` and 74 | `statsmodels.sandbox.distributions.multivariate.multivariate_t_rvs` modules. 75 | 76 | 77 | 78 | It is reproduced with modifications from the following paper: 79 | Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016. 80 | Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017. 81 | 82 | 83 | :param corr: (np.array) Correlation matrix. 84 | :param t_samples: (int) Number of samples in the time series. 85 | :param dist: (str) Type of distributions to use. 86 | Can take the values ["normal", "student"]. 87 | :param deg_free: (int) Degrees of freedom. Only used for the student-t distribution. 88 | :return: (pd.DataFrame) The resulting time series of shape (len(corr), t_samples).
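Example (a minimal sketch of the underlying sampling idea, not the library implementation; the 2x2 correlation matrix is hypothetical)::

    import numpy as np

    corr = np.array([[1.0, 0.8],
                     [0.8, 1.0]])
    # Draw 1000 multivariate normal samples with the given correlation structure.
    series = np.random.multivariate_normal(mean=np.zeros(2), cov=corr, size=1000)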
89 | """ 90 | 91 | pass 92 | -------------------------------------------------------------------------------- /mlfinlab/data_structures/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logic regarding the various sampling techniques, in particular: 3 | 4 | * Time Bars 5 | * Tick Bars 6 | * Volume Bars 7 | * Dollar Bars 8 | * Tick Imbalance Bars (EMA and Const) 9 | * Volume Imbalance Bars (EMA and Const) 10 | * Dollar Imbalance Bars (EMA and Const) 11 | * Tick Run Bars (EMA and Const) 12 | * Volume Run Bars (EMA and Const) 13 | * Dollar Run Bars (EMA and Const) 14 | """ 15 | 16 | from mlfinlab.data_structures.imbalance_data_structures import (get_ema_dollar_imbalance_bars, get_ema_volume_imbalance_bars, 17 | get_ema_tick_imbalance_bars, get_const_dollar_imbalance_bars, 18 | get_const_volume_imbalance_bars, get_const_tick_imbalance_bars) 19 | from mlfinlab.data_structures.run_data_structures import (get_ema_volume_run_bars, get_ema_tick_run_bars, 20 | get_ema_dollar_run_bars, get_const_volume_run_bars, 21 | get_const_tick_run_bars, get_const_dollar_run_bars) 22 | from mlfinlab.data_structures.standard_data_structures import (get_tick_bars, get_dollar_bars, get_volume_bars) 23 | from mlfinlab.data_structures.time_data_structures import get_time_bars 24 | -------------------------------------------------------------------------------- /mlfinlab/data_structures/time_data_structures.py: -------------------------------------------------------------------------------- 1 | """ 2 | Advances in Financial Machine Learning, Marcos Lopez de Prado 3 | Chapter 2: Financial Data Structures 4 | 5 | Time bars generation logic 6 | """ 7 | 8 | # Imports 9 | from typing import Union, Iterable, Optional 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from mlfinlab.data_structures.base_bars import BaseBars 14 | 15 | 16 | # pylint: disable=too-many-instance-attributes 17 | class TimeBars(BaseBars): 18 | """ 19 | Contains all of the logic to construct the time bars. This class shouldn't be used directly. 20 | Use get_time_bars instead 21 | """ 22 | 23 | def __init__(self, resolution: str, num_units: int, batch_size: int = 20000000): 24 | """ 25 | Constructor 26 | 27 | :param resolution: (str) Type of bar resolution: ['D', 'H', 'MIN', 'S'] 28 | :param num_units: (int) Number of days, minutes, etc. 29 | :param batch_size: (int) Number of rows to read in from the csv, per batch 30 | """ 31 | 32 | pass 33 | 34 | def _reset_cache(self): 35 | """ 36 | Implementation of abstract method _reset_cache for time bars 37 | """ 38 | 39 | pass 40 | 41 | def _extract_bars(self, data: Union[list, tuple, np.ndarray]) -> list: 42 | """ 43 | For loop which compiles time bars. 44 | We did investigate the use of trying to solve this in a vectorised manner but found that a For loop worked well. 45 | 46 | :param data: (tuple) Contains 3 columns - date_time, price, and volume. 47 | :return: (list) Extracted bars 48 | """ 49 | 50 | pass 51 | 52 | 53 | def get_time_bars(file_path_or_df: Union[str, Iterable[str], pd.DataFrame], resolution: str = 'D', num_units: int = 1, batch_size: int = 20000000, 54 | verbose: bool = True, to_csv: bool = False, output_path: Optional[str] = None): 55 | """ 56 | Creates Time Bars: date_time, open, high, low, close, volume, cum_buy_volume, cum_ticks, cum_dollar_value. 
57 | 58 | :param file_path_or_df: (str, iterable of str, or pd.DataFrame) Path to the csv file(s) or Pandas DataFrame containing raw tick data 59 | in the format [date_time, price, volume] 60 | :param resolution: (str) Resolution type ('D', 'H', 'MIN', 'S') 61 | :param num_units: (int) Number of resolution units (3 days for example, 2 hours) 62 | :param batch_size: (int) The number of rows per batch. Less RAM = smaller batch size. 63 | :param verbose: (bool) Print out batch numbers (True or False) 64 | :param to_csv: (bool) Save bars to csv after every batch run (True or False) 65 | :param output_path: (str) Path to csv file, if to_csv is True 66 | :return: (pd.DataFrame) Dataframe of time bars, if to_csv=True return None 67 | """ 68 | 69 | pass 70 | -------------------------------------------------------------------------------- /mlfinlab/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module implementing loading of typical financial datasets (stock prices, dollar bars, ticks). 3 | """ 4 | 5 | from mlfinlab.datasets.load_datasets import (load_dollar_bar_sample, load_stock_prices, load_tick_sample, 6 | generate_multi_asset_data_set) 7 | -------------------------------------------------------------------------------- /mlfinlab/datasets/data/tick_data.csv: -------------------------------------------------------------------------------- 1 | Date and Time,Price,Volume 2 | 2011/07/31 22:38:45.108,1205.0,1 3 | 2011/07/31 22:38:45.934,1005.0,1 4 | 2011/07/31 22:38:47.008,1304.75,6 5 | 2011/07/31 22:38:48.944,1904.75,1 6 | 2011/07/31 22:38:52.951,1304.75,20 7 | 2011/07/31 22:38:52.951,1304.75,1 8 | 2011/07/31 22:38:52.951,1304.75,5 9 | 2011/07/31 22:38:56.589,1304.5,1 10 | 2011/07/31 22:38:57.858,1304.5,1 11 | 2011/07/31 22:39:08.695,1304.5,1 12 | 2011/07/31 22:39:09.396,1304.5,1 13 | 2011/07/31 22:39:20.495,1304.5,1 14 | 2011/07/31 22:39:23.937,1304.5,1 15 | 2011/07/31 22:39:23.937,1304.5,5 16 | 2011/07/31 22:39:23.937,1304.5,1 17 | 2011/07/31 22:39:26.084,1304.5,1 18 | 2011/07/31 22:39:26.084,1304.5,1 19 | 2011/07/31 22:39:26.095,1304.5,4 20 | 2011/07/31 22:39:26.743,1304.5,11 21 | 2011/07/31 22:39:26.801,1304.5,9 22 | 2011/07/31 22:39:27.050,1304.5,1 23 | 2011/07/31 22:39:27.274,1304.5,1 24 | 2011/07/31 22:39:28.914,1304.5,1 25 | 2011/07/31 22:39:28.965,1304.5,6 26 | 2011/07/31 22:39:28.965,1304.5,1 27 | 2011/07/31 22:39:28.965,1304.5,1 28 | 2011/07/31 22:39:33.568,1304.75,1 29 | 2011/07/31 22:39:37.360,1304.5,1 30 | 2011/07/31 22:39:37.360,1304.5,1 31 | 2011/07/31 22:39:38.991,1304.5,1 32 | 2011/07/31 22:39:40.423,1304.5,1 33 | 2011/07/31 22:39:51.519,1304.5,1 34 | 2011/07/31 22:39:51.519,1304.5,4 35 | 2011/07/31 22:39:53.030,1304.5,1 36 | 2011/07/31 22:39:55.765,1304.5,1 37 | 2011/07/31 22:39:56.614,1304.5,1 38 | 2011/07/31 22:39:56.614,1304.5,1 39 | 2011/07/31 22:39:56.614,1304.5,5 40 | 2011/07/31 22:39:56.614,1304.5,1 41 | 2011/07/31 22:39:56.614,1304.5,1 42 | 2011/07/31 22:39:59.606,1304.5,10 43 | 2011/07/31 22:39:59.606,1304.5,2 44 | 2011/07/31 22:39:59.606,1304.5,4 45 | 2011/07/31 22:40:01.914,1304.5,1 46 | 2011/07/31 22:40:01.914,1304.5,1 47 | 2011/07/31 22:40:10.794,1304.75,6 48 | 2011/07/31 22:40:11.161,1304.5,4 49 | 2011/07/31 22:40:11.168,1304.75,4 50 | 2011/07/31 22:40:11.168,1304.75,1 51 | 2011/07/31 22:40:11.168,1304.75,1 52 | 2011/07/31 22:40:11.168,1304.75,1 53 | 2011/07/31 22:40:12.014,1304.5,2 54 | 2011/07/31 22:40:12.014,1304.5,3 55 | 2011/07/31 22:40:12.014,1304.5,1 56 | 2011/07/31 22:40:13.964,1304.75,1 57 |
2011/07/31 22:40:14.306,1304.75,1 58 | 2011/07/31 22:40:14.514,1304.75,1 59 | 2011/07/31 22:40:14.617,1304.75,1 60 | 2011/07/31 22:40:14.730,1304.75,1 61 | 2011/07/31 22:40:14.822,1304.75,1 62 | 2011/07/31 22:40:16.182,1305.0,9 63 | 2011/07/31 22:40:16.182,1305.0,1 64 | 2011/07/31 22:40:20.267,1304.75,1 65 | 2011/07/31 22:40:22.083,1305.0,1 66 | 2011/07/31 22:40:28.918,1304.75,1 67 | 2011/07/31 22:40:28.918,1304.75,1 68 | 2011/07/31 22:40:29.030,1305.0,5 69 | 2011/07/31 22:40:29.478,1305.0,3 70 | 2011/07/31 22:40:29.478,3305.0,1 71 | 2011/07/31 22:40:29.478,205.0,2 72 | 2011/07/31 22:40:29.478,1405.0,1 73 | 2011/07/31 22:40:29.478,1305.0,1 74 | 2011/07/31 22:40:29.478,1305.0,1 75 | 2011/07/31 22:40:29.478,1305.0,1 76 | 2011/07/31 22:40:29.478,1305.0,1 77 | 2011/07/31 22:40:29.478,1305.0,1 78 | 2011/07/31 22:40:29.478,1305.0,2 79 | 2011/07/31 22:40:29.478,1305.0,1 80 | 2011/07/31 22:40:29.478,1305.0,1 81 | 2011/07/31 22:40:29.478,1305.0,1 82 | 2011/07/31 22:40:29.478,1305.0,1 83 | 2011/07/31 22:40:29.478,1305.0,2 84 | 2011/07/31 22:40:29.541,1305.0,5 85 | 2011/07/31 22:40:29.940,1305.0,1 86 | 2011/07/31 22:40:30.694,1305.25,10 87 | 2011/07/31 22:40:31.492,1305.25,10 88 | 2011/07/31 22:40:31.576,1305.25,5 89 | 2011/07/31 22:40:31.576,1305.25,1 90 | 2011/07/31 22:40:31.576,1305.25,1 91 | 2011/07/31 22:40:31.576,1305.25,2 92 | 2011/07/31 22:40:31.576,1305.25,1 93 | 2011/07/31 22:40:33.213,1305.25,1 94 | 2011/07/31 22:40:41.016,1305.25,1 95 | 2011/07/31 22:40:41.849,1305.25,1 96 | 2011/07/31 22:40:42.779,1305.0,1 97 | 2011/07/31 22:40:44.921,1305.25,5 98 | 2011/07/31 22:40:44.921,1305.25,1 99 | 2011/07/31 22:40:44.921,1305.25,1 100 | 2011/07/31 22:40:44.921,1305.25,2 101 | 2011/07/31 22:40:44.921,1305.25,1 102 | -------------------------------------------------------------------------------- /mlfinlab/datasets/load_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module implementing various functions loading tick, dollar, stock data sets which can be used as 3 | sandbox data. 4 | """ 5 | 6 | import os 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from mlfinlab.labeling.labeling import get_events, add_vertical_barrier, get_bins 12 | from mlfinlab.util.volatility import get_daily_vol 13 | from mlfinlab.filters.filters import cusum_filter 14 | 15 | 16 | def load_stock_prices() -> pd.DataFrame: 17 | """ 18 | Loads stock prices data sets consisting of 19 | EEM, EWG, TIP, EWJ, EFA, IEF, EWQ, EWU, XLB, XLE, XLF, LQD, XLK, XLU, EPP, FXI, VGK, VPL, SPY, TLT, BND, CSJ, 20 | DIA starting from 2008 till 2016. 21 | 22 | :return: (pd.DataFrame) The stock_prices data frame. 23 | """ 24 | 25 | pass 26 | 27 | 28 | def load_tick_sample() -> pd.DataFrame: 29 | """ 30 | Loads E-Mini S&P 500 futures tick data sample. 31 | 32 | :return: (pd.DataFrame) Frame with tick data sample. 33 | """ 34 | 35 | pass 36 | 37 | 38 | def load_dollar_bar_sample() -> pd.DataFrame: 39 | """ 40 | Loads E-Mini S&P 500 futures dollar bars data sample. 41 | 42 | :return: (pd.DataFrame) Frame with dollar bar data sample. 43 | """ 44 | 45 | pass 46 | 47 | 48 | def generate_multi_asset_data_set(start_date: pd.Timestamp = pd.Timestamp(2008, 1, 1), 49 | end_date: pd.Timestamp = pd.Timestamp(2020, 1, 1)) -> tuple: 50 | # pylint: disable=invalid-name 51 | """ 52 | Generates multi-asset dataset from stock prices labelled by triple-barrier method. 53 | 54 | :param start_date: (pd.Timestamp) Dataset start date. 55 | :param end_date: (pd.Timestamp) Dataset end date. 
56 | :return: (tuple) Tuple of dictionaries (asset: data) for X, y, cont contract used to label the dataset. 57 | """ 58 | 59 | pass 60 | -------------------------------------------------------------------------------- /mlfinlab/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Sequentially Bootstrapped Bagging Classifier using sklearn's library as base class. 3 | """ 4 | 5 | from mlfinlab.ensemble.sb_bagging import (SequentiallyBootstrappedBaggingClassifier, SequentiallyBootstrappedBaggingRegressor) 6 | -------------------------------------------------------------------------------- /mlfinlab/feature_importance/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module which implements feature importance algorithms described in Chapter 8 and other interpretability tools 3 | from the Journal of Financial Data Science. 4 | And Stacked feature importance functions (Stacked MDA/SFI). 5 | """ 6 | 7 | from mlfinlab.feature_importance.importance import (mean_decrease_impurity, mean_decrease_accuracy, 8 | single_feature_importance, plot_feature_importance, 9 | stacked_mean_decrease_accuracy) 10 | from mlfinlab.feature_importance.orthogonal import (feature_pca_analysis, get_pca_rank_weighted_kendall_tau, 11 | get_orthogonal_features) 12 | from mlfinlab.feature_importance.fingerpint import (RegressionModelFingerprint, ClassificationModelFingerprint) 13 | -------------------------------------------------------------------------------- /mlfinlab/feature_importance/fingerpint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of an algorithm described in Yimou Li, David Turkington, Alireza Yazdani 3 | 'Beyond the Black Box: An Intuitive Approach to Investment Prediction with Machine Learning' 4 | (https://jfds.pm-research.com/content/early/2019/12/11/jfds.2019.1.023) 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from typing import Tuple 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from sklearn.linear_model import LinearRegression 13 | 14 | 15 | # pylint: disable=invalid-name 16 | # pylint: disable=too-many-locals 17 | 18 | class AbstractModelFingerprint(ABC): 19 | """ 20 | Model fingerprint constructor. 21 | 22 | This is an abstract base class for the RegressionModelFingerprint and ClassificationModelFingerprint classes. 23 | """ 24 | 25 | def __init__(self): 26 | """ 27 | Model fingerprint constructor. 28 | """ 29 | pass 30 | 31 | def fit(self, model: object, X: pd.DataFrame, num_values: int = 50, pairwise_combinations: list = None) -> None: 32 | """ 33 | Get linear, non-linear and pairwise effects estimation. 34 | 35 | :param model: (object) Trained model. 36 | :param X: (pd.DataFrame) Dataframe of features. 37 | :param num_values: (int) Number of values used to estimate feature effect. 38 | :param pairwise_combinations: (list) Tuples (feature_i, feature_j) to test pairwise effect. 39 | """ 40 | 41 | pass 42 | 43 | def get_effects(self) -> Tuple: 44 | """ 45 | Return computed linear, non-linear and pairwise effects. The model should be fit() before using this method. 46 | 47 | :return: (tuple) Linear, non-linear and pairwise effects, of type dictionary (raw values and normalised). 48 | """ 49 | 50 | pass 51 | 52 | def plot_effects(self) -> plt.figure: 53 | """ 54 | Plot each effect (normalized) on a bar plot (linear, non-linear). 
Also plots pairwise effects if calculated. 55 | 56 | :return: (plt.figure) Plot figure. 57 | """ 58 | 59 | pass 60 | 61 | def _get_feature_values(self, X: pd.DataFrame, num_values: int) -> None: 62 | """ 63 | Step 1 of the algorithm, which generates the possible feature values used in the analysis. 64 | 65 | :param X: (pd.DataFrame) Dataframe of features. 66 | :param num_values: (int) Number of values used to estimate feature effect. 67 | """ 68 | 69 | pass 70 | 71 | def _get_individual_partial_dependence(self, model: object, X: pd.DataFrame) -> None: 72 | """ 73 | Get individual partial dependence function values for each column. 74 | 75 | :param model: (object) Trained model. 76 | :param X: (pd.DataFrame) Dataframe of features. 77 | """ 78 | 79 | pass 80 | 81 | def _get_linear_effect(self, X: pd.DataFrame) -> dict: 82 | """ 83 | Get linear effect estimates as the mean absolute deviation of the linear predictions around their average value. 84 | 85 | :param X: (pd.DataFrame) Dataframe of features. 86 | :return: (dict) Linear effect estimates for each feature column. 87 | """ 88 | 89 | pass 90 | 91 | def _get_non_linear_effect(self, X: pd.DataFrame) -> dict: 92 | """ 93 | Get non-linear effect estimates as the mean absolute deviation of the total marginal (single variable) 94 | effect around its corresponding linear effect. 95 | 96 | :param X: (pd.DataFrame) Dataframe of features. 97 | :return: (dict) Non-linear effect estimates for each feature column. 98 | """ 99 | 100 | pass 101 | 102 | def _get_pairwise_effect(self, pairwise_combinations: list, model: object, X: pd.DataFrame, num_values) -> dict: 103 | """ 104 | Get pairwise effect estimates as the de-meaned joint partial prediction of the two variables minus the de-meaned 105 | partial predictions of each variable independently. 106 | 107 | :param pairwise_combinations: (list) Tuples (feature_i, feature_j) to test pairwise effect. 108 | :param model: (object) Trained model. 109 | :param X: (pd.DataFrame) Dataframe of features. 110 | :param num_values: (int) Number of values used to estimate feature effect. 111 | :return: (dict) Raw and normalised pairwise effects. 112 | """ 113 | 114 | pass 115 | 116 | @abstractmethod 117 | def _get_model_predictions(self, model: object, X_: pd.DataFrame): 118 | """ 119 | Get model predictions based on problem type (predict for regression, predict_proba for classification). 120 | 121 | :param model: (object) Trained model. 122 | :param X_: (np.array) Feature set. 123 | :return: (np.array) Predictions. 124 | """ 125 | 126 | pass 127 | 128 | @staticmethod 129 | def _normalize(effect: dict) -> dict: 130 | """ 131 | Normalize effect values (sum equals 1). 132 | 133 | :param effect: (dict) Effect values. 134 | :return: (dict) Normalized effect values. 135 | """ 136 | 137 | pass 138 | 139 | 140 | class RegressionModelFingerprint(AbstractModelFingerprint): 141 | """ 142 | Regression Fingerprint class used for regression-type models. 143 | """ 144 | 145 | def __init__(self): 146 | """ 147 | Regression model fingerprint constructor. 148 | """ 149 | 150 | pass 151 | 152 | def _get_model_predictions(self, model, X_): 153 | """ 154 | Abstract method _get_model_predictions implementation. 155 | 156 | :param model: (object) Trained model. 157 | :param X_: (np.array) Feature set. 158 | :return: (np.array) Predictions. 159 | """ 160 | 161 | pass 162 | 163 | 164 | class ClassificationModelFingerprint(AbstractModelFingerprint): 165 | """ 166 | Classification Fingerprint class used for classification-type models.
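Example of the intended usage (an illustrative sketch; ``X_train``, ``y_train`` and the feature names are hypothetical)::

    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier().fit(X_train, y_train)
    fingerprint = ClassificationModelFingerprint()
    fingerprint.fit(clf, X_train, num_values=50,
                    pairwise_combinations=[('feature_1', 'feature_2')])
    linear_effect, non_linear_effect, pairwise_effect = fingerprint.get_effects()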
167 | """ 168 | 169 | def __init__(self): 170 | """ 171 | Classification model fingerprint constructor. 172 | """ 173 | 174 | pass 175 | 176 | def _get_model_predictions(self, model, X_): 177 | """ 178 | Abstract method _get_model_predictions implementation. 179 | 180 | :param model: (object) Trained model. 181 | :param X_: (np.array) Feature set. 182 | :return: (np.array) Predictions. 183 | """ 184 | 185 | pass 186 | -------------------------------------------------------------------------------- /mlfinlab/feature_importance/orthogonal.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module which implements feature PCA compression and PCA analysis of feature importance. 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from scipy.stats import weightedtau, kendalltau, spearmanr, pearsonr 8 | 9 | 10 | def _get_eigen_vector(dot_matrix, variance_thresh, num_features=None): 11 | """ 12 | Advances in Financial Machine Learning, Snippet 8.5, page 119. 13 | 14 | Computation of Orthogonal Features 15 | 16 | Gets eigen values and eigen vector from matrix which explain % variance_thresh of total variance. 17 | 18 | :param dot_matrix: (np.array): Matrix for which eigen values/vectors should be computed. 19 | :param variance_thresh: (float): Percentage % of overall variance which compressed vectors should explain. 20 | :param num_features: (int) Manually set number of features, overrides variance_thresh. (None by default) 21 | :return: (pd.Series, pd.DataFrame): Eigenvalues, Eigenvectors. 22 | """ 23 | 24 | pass 25 | 26 | 27 | def _standardize_df(data_frame): 28 | """ 29 | Helper function which divides df by std and extracts mean. 30 | 31 | :param data_frame: (pd.DataFrame): Dataframe to standardize 32 | :return: (pd.DataFrame): Standardized dataframe 33 | """ 34 | 35 | pass 36 | 37 | 38 | def get_orthogonal_features(feature_df, variance_thresh=.95, num_features=None): 39 | """ 40 | Advances in Financial Machine Learning, Snippet 8.5, page 119. 41 | 42 | Computation of Orthogonal Features. 43 | 44 | Gets PCA orthogonal features. 45 | 46 | :param feature_df: (pd.DataFrame): Dataframe of features. 47 | :param variance_thresh: (float): Percentage % of overall variance which compressed vectors should explain. 48 | :param num_features: (int) Manually set number of features, overrides variance_thresh. (None by default) 49 | :return: (pd.DataFrame): Compressed PCA features which explain %variance_thresh of variance. 50 | """ 51 | 52 | pass 53 | 54 | 55 | def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank): 56 | """ 57 | Advances in Financial Machine Learning, Snippet 8.6, page 121. 58 | 59 | Computes Weighted Kendall's Tau Between Feature Importance and Inverse PCA Ranking. 60 | 61 | :param feature_imp: (np.array): Feature mean importance. 62 | :param pca_rank: (np.array): PCA based feature importance rank. 63 | :return: (float): Weighted Kendall Tau of feature importance and inverse PCA rank with p_value. 64 | """ 65 | 66 | pass 67 | 68 | 69 | def feature_pca_analysis(feature_df, feature_importance, variance_thresh=0.95): 70 | """ 71 | Performs correlation analysis between feature importance (MDI for example, supervised) and PCA eigenvalues 72 | (unsupervised). 73 | 74 | High correlation means that probably the pattern identified by the ML algorithm is not entirely overfit. 75 | 76 | :param feature_df: (pd.DataFrame): Features dataframe. 77 | :param feature_importance: (pd.DataFrame): Individual MDI feature importance. 
78 | :param variance_thresh: (float): Percentage % of overall variance which compressed vectors should explain in PCA compression. 79 | :return: (dict): Dictionary with kendall, spearman, pearson and weighted_kendall correlations and p_values. 80 | """ 81 | 82 | pass 83 | -------------------------------------------------------------------------------- /mlfinlab/features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 5: Fractional Differentiation. 3 | """ 4 | 5 | from mlfinlab.features.fracdiff import (get_weights, frac_diff, get_weights_ffd, frac_diff_ffd, plot_min_ffd) 6 | -------------------------------------------------------------------------------- /mlfinlab/filters/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logic regarding the various types of filters: 3 | 4 | * CUSUM Filter 5 | * Z-score filter 6 | """ 7 | 8 | from mlfinlab.filters.filters import cusum_filter 9 | from mlfinlab.filters.filters import z_score_filter 10 | -------------------------------------------------------------------------------- /mlfinlab/filters/filters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Filters are used to filter events based on some kind of trigger. For example a structural break filter can be 3 | used to filter events where a structural break occurs. This event is then used to measure the return from the event 4 | to some event horizon, say a day. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | 11 | # Snippet 2.4, page 39, The Symmetric CUSUM Filter. 12 | def cusum_filter(raw_time_series, threshold, time_stamps=True): 13 | """ 14 | Advances in Financial Machine Learning, Snippet 2.4, page 39. 15 | 16 | The Symmetric Dynamic/Fixed CUSUM Filter. 17 | 18 | The CUSUM filter is a quality-control method, designed to detect a shift in the mean value of a measured quantity 19 | away from a target value. The filter is set up to identify a sequence of upside or downside divergences from any 20 | reset level zero. We sample a bar t if and only if S_t >= threshold, at which point S_t is reset to 0. 21 | 22 | One practical aspect that makes CUSUM filters appealing is that multiple events are not triggered by raw_time_series 23 | hovering around a threshold level, which is a flaw suffered by popular market signals such as Bollinger Bands. 24 | It will require a full run of length threshold for raw_time_series to trigger an event. 25 | 26 | Once we have obtained this subset of event-driven bars, we will let the ML algorithm determine whether the occurrence 27 | of such events constitutes actionable intelligence. Below is an implementation of the Symmetric CUSUM filter. 28 | 29 | Note: As per the book this filter is applied to closing prices but we extended it to also work on other 30 | time series such as volatility. 31 | 32 | :param raw_time_series: (pd.Series) Close prices (or other time series, e.g. volatility). 33 | :param threshold: (float or pd.Series) When the abs(change) is larger than the threshold, the function captures 34 | it as an event, can be dynamic if threshold is pd.Series 35 | :param time_stamps: (bool) Default is to return a DateTimeIndex, change to false to have it return a list. 36 | :return: (datetime index vector) Vector of datetimes when the events occurred. This is used later to sample. 
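A minimal sketch of the symmetric CUSUM recursion this function implements (illustrative only; assumes a fixed float threshold and ignores the dynamic pd.Series case)::

    s_pos, s_neg, events = 0, 0, []
    for stamp, diff in raw_time_series.diff().dropna().items():
        s_pos, s_neg = max(0, s_pos + diff), min(0, s_neg + diff)
        if s_pos >= threshold:
            s_pos = 0  # reset the upside run once it triggers an event
            events.append(stamp)
        elif s_neg <= -threshold:
            s_neg = 0  # reset the downside run
            events.append(stamp)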
37 | """ 38 | 39 | pass 40 | 41 | 42 | def z_score_filter(raw_time_series, mean_window, std_window, z_score=3, time_stamps=True): 43 | """ 44 | Filter which implements z_score filter 45 | (https://stackoverflow.com/questions/22583391/peak-signal-detection-in-realtime-timeseries-data) 46 | 47 | :param raw_time_series: (pd.Series) Close prices (or other time series, e.g. volatility). 48 | :param mean_window: (int): Rolling mean window 49 | :param std_window: (int): Rolling std window 50 | :param z_score: (float): Number of standard deviations to trigger the event 51 | :param time_stamps: (bool) Default is to return a DateTimeIndex, change to false to have it return a list. 52 | :return: (datetime index vector) Vector of datetimes when the events occurred. This is used later to sample. 53 | """ 54 | 55 | pass 56 | -------------------------------------------------------------------------------- /mlfinlab/labeling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Labeling techniques used in financial machine learning. 3 | """ 4 | 5 | from mlfinlab.labeling.labeling import (add_vertical_barrier, apply_pt_sl_on_t1, barrier_touched, drop_labels, 6 | get_bins, get_events) 7 | from mlfinlab.labeling.trend_scanning import trend_scanning_labels 8 | from mlfinlab.labeling.tail_sets import TailSetLabels 9 | from mlfinlab.labeling.fixed_time_horizon import fixed_time_horizon 10 | from mlfinlab.labeling.matrix_flags import MatrixFlagLabels 11 | from mlfinlab.labeling.excess_over_median import excess_over_median 12 | from mlfinlab.labeling.raw_return import raw_return 13 | from mlfinlab.labeling.return_vs_benchmark import return_over_benchmark 14 | from mlfinlab.labeling.excess_over_mean import excess_over_mean 15 | from mlfinlab.labeling.bull_bear import (pagan_sossounov, lunde_timmermann) 16 | -------------------------------------------------------------------------------- /mlfinlab/labeling/bull_bear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Detection of bull and bear markets. 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def pagan_sossounov(prices, window=8, censor=6, cycle=16, phase=4, threshold=0.2): 9 | """ 10 | Pagan and Sossounov's labeling method. Sourced from `Pagan, Adrian R., and Kirill A. Sossounov. "A simple framework 11 | for analysing bull and bear markets." Journal of applied econometrics 18.1 (2003): 23-46. 12 | `__ 13 | 14 | Returns a DataFrame with labels of 1 for Bull and -1 for Bear. 15 | 16 | :param prices: (pd.DataFrame) Close prices of all tickers in the market. 17 | :param window: (int) Rolling window length to determine local extrema. Paper suggests 8 months for monthly obs. 18 | :param censor: (int) Number of months to eliminate for start and end. Paper suggests 6 months for monthly obs. 19 | :param cycle: (int) Minimum length for a complete cycle. Paper suggests 16 months for monthly obs. 20 | :param phase: (int) Minimum length for a phase. Paper suggests 4 months for monthly obs. 21 | :param threshold: (double) Minimum threshold for phase change. Paper suggests 0.2. 22 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 23 | """ 24 | 25 | pass 26 | 27 | 28 | def _alternation(price): 29 | """ 30 | Helper function to check peak and trough alternation. 31 | 32 | :param price: (pd.DataFrame) Close prices of all tickers in the market. 33 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 
34 | """ 35 | 36 | pass 37 | 38 | 39 | def _apply_pagan_sossounov(price, window, censor, cycle, phase, threshold): 40 | """ 41 | Helper function for Pagan and Sossounov labeling method. 42 | 43 | :param price: (pd.DataFrame) Close prices of all tickers in the market. 44 | :param window: (int) Rolling window length to determine local extrema. Paper suggests 8 months for monthly obs. 45 | :param censor: (int) Number of months to eliminate for start and end. Paper suggests 6 months for monthly obs. 46 | :param cycle: (int) Minimum length for a complete cycle. Paper suggests 16 months for monthly obs. 47 | :param phase: (int) Minimum length for a phase. Paper suggests 4 months for monthly obs. 48 | :param threshold: (double) Minimum threshold for phase change. Paper suggests 20%. 49 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 50 | """ 51 | 52 | pass 53 | 54 | 55 | def lunde_timmermann(prices, bull_threshold=0.15, bear_threshold=0.15): 56 | """ 57 | Lunde and Timmermann's labeling method. Sourced from `Lunde, Asger, and Allan Timmermann. "Duration dependence 58 | in stock prices: An analysis of bull and bear markets." Journal of Business & Economic Statistics 22.3 (2004): 253-273. 59 | `__ 60 | 61 | Returns a DataFrame with labels of 1 for Bull and -1 for Bear. 62 | 63 | :param prices: (pd.DataFrame) Close prices of all tickers in the market. 64 | :param bull_threshold: (double) Threshold to identify bull market. Paper suggests 0.15. 65 | :param bear_threshold: (double) Threshold to identify bear market. Paper suggests 0.15. 66 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 67 | """ 68 | 69 | pass 70 | 71 | 72 | def _apply_lunde_timmermann(price, bull_threshold, bear_threshold): 73 | """ 74 | Helper function for Lunde and Timmermann labeling method. 75 | 76 | :param price: (pd.DataFrame) Close prices of all tickers in the market. 77 | :param bull_threshold: (double) Threshold to identify bull market. Paper suggests 0.15. 78 | :param bear_threshold: (double) Threshold to identify bear market. Paper suggests 0.15. 79 | :return: (pd.DataFrame) Labeled pd.DataFrame. 1 for Bull, -1 for Bear. 80 | """ 81 | 82 | pass 83 | -------------------------------------------------------------------------------- /mlfinlab/labeling/excess_over_mean.py: -------------------------------------------------------------------------------- 1 | """ 2 | Return in excess of mean method. 3 | 4 | Chapter 5, Machine Learning for Factor Investing, by Coqueret and Guida, (2020). 5 | """ 6 | import numpy as np 7 | 8 | 9 | def excess_over_mean(prices, binary=False, resample_by=None, lag=True): 10 | """ 11 | Return in excess of mean labeling method. Sourced from Chapter 5.5.1 of Machine Learning for Factor Investing, 12 | by Coqueret, G. and Guida, T. (2020). 13 | 14 | Returns a DataFrame containing returns of stocks over the mean of all stocks in the portfolio. Returns a DataFrame 15 | of signs of the returns if binary is True. In this case, an observation may be labeled as 0 if it itself is the 16 | mean. 17 | 18 | :param prices: (pd.DataFrame) Close prices of all tickers in the market that are used to establish the mean. NaN 19 | values are ok. Returns on each ticker are then compared to the mean for the given timestamp. 20 | :param binary: (bool) If False, the numerical value of excess returns over mean will be given. If True, then only 21 | the sign of the excess return over mean will be given (-1 or 1). 
A label of 0 will be given if 22 | the observation itself is equal to the mean. 23 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 24 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 25 | For full details see `here. 26 | `_ 27 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 28 | :return: (pd.DataFrame) Numerical returns in excess of the market mean return, or sign of return depending on 29 | whether binary is False or True respectively. 30 | """ 31 | 32 | pass 33 | -------------------------------------------------------------------------------- /mlfinlab/labeling/excess_over_median.py: -------------------------------------------------------------------------------- 1 | """ 2 | Return in excess of median method. 3 | 4 | Described in "The benefits of tree-based models for stock selection", Zhu et al. (2012). Data labeled this way can be 5 | used in regression and classification models to predict stock returns over market. 6 | """ 7 | import numpy as np 8 | 9 | 10 | def excess_over_median(prices, binary=False, resample_by=None, lag=True): 11 | """ 12 | Return in excess of median labeling method. Sourced from "The benefits of tree-based models for stock selection" 13 | Zhu et al. (2012). 14 | 15 | Returns a DataFrame containing returns of stocks over the median of all stocks in the portfolio, or returns a 16 | DataFrame containing signs of those returns. In the latter case, an observation may be labeled as 0 if it itself is 17 | the median. 18 | 19 | :param prices: (pd.DataFrame) Close prices of all stocks in the market that are used to establish the median. 20 | Returns on each stock are then compared to the median for the given timestamp. 21 | :param binary: (bool) If False, the numerical value of excess returns over median will be given. If True, then only 22 | the sign of the excess return over median will be given (-1 or 1). A label of 0 will be given if 23 | the observation itself is the median. According to Zhu et al., categorical labels can alleviate 24 | issues with extreme outliers present with numerical labels. 25 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 26 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 27 | For full details see `here. 28 | `_ 29 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 30 | :return: (pd.DataFrame) Numerical returns in excess of the market median return, or sign of return depending on 31 | whether binary is False or True respectively. 32 | """ 33 | 34 | pass 35 | -------------------------------------------------------------------------------- /mlfinlab/labeling/fixed_time_horizon.py: -------------------------------------------------------------------------------- 1 | """ 2 | Chapter 3.2 Fixed-Time Horizon Method, in Advances in Financial Machine Learning, by M. L. de Prado. 3 | 4 | The work "Classification-based Financial Markets Prediction using Deep Neural Networks" by Dixon et al. (2016) describes how 5 | labeling data this way can be used in training deep neural networks to predict price movements. 6 | """ 7 | 8 | import warnings 9 | import pandas as pd 10 | 11 | 12 | def fixed_time_horizon(prices, threshold=0, resample_by=None, lag=True, standardized=False, window=None): 13 | """ 14 | Fixed-Time Horizon Labeling Method.
15 | 16 | Originally described in the book Advances in Financial Machine Learning, Chapter 3.2, p.43-44. 17 | 18 | Returns 1 if return is greater than the threshold, -1 if less, and 0 if in between. If no threshold is 19 | provided, then it will simply take the sign of the return. 20 | 21 | :param prices: (pd.Series or pd.DataFrame) Time-indexed stock prices used to calculate returns. 22 | :param threshold: (float or pd.Series) When the absolute value of return exceeds the threshold, the observation is 23 | labeled with 1 or -1, depending on the sign of the return. If return is less, it's labeled as 0. 24 | Can be dynamic if threshold is inputted as a pd.Series, and threshold.index must match prices.index. 25 | If resampling is used, the index of threshold must match the index of prices after resampling. 26 | If threshold is negative, then the directionality of the labels will be reversed. If no threshold 27 | is provided, it is assumed to be 0 and the sign of the return is returned. 28 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 29 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 30 | For full details see `here. 31 | `_ 32 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 33 | :param standardized: (bool) Whether returns are scaled by mean and standard deviation. 34 | :param window: (int) If standardized is True, the rolling window period for calculating the mean and standard 35 | deviation of returns. 36 | :return: (pd.Series or pd.DataFrame) -1, 0, or 1 denoting whether the return for each observation is 37 | less/between/greater than the threshold at each corresponding time index. First or last row will be 38 | NaN, depending on lag. 39 | """ 40 | 41 | pass 42 | -------------------------------------------------------------------------------- /mlfinlab/labeling/matrix_flags.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=no-self-use 2 | # pylint: disable=unnecessary-comprehension 3 | """ 4 | Matrix Flag labeling method. 5 | """ 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | class MatrixFlagLabels: 12 | """ 13 | The Matrix Flag labeling method is featured in the paper: Cervelló-Royo, R., Guijarro, F. and Michniuk, K., 2015. 14 | Stock market trading rule based on pattern recognition and technical analysis: Forecasting the DJIA index with 15 | intraday data. 16 | 17 | The method of applying a matrix template was first introduced, and explained in greater detail, in the paper: 18 | Leigh, W., Modani, N., Purvis, R. and Roberts, T., 2002. Stock market trading rule discovery using technical 19 | charting heuristics. 20 | 21 | Cervelló-Royo et al. expand on Leigh et al.'s work by proposing a new bull flag pattern which ameliorates some 22 | weaknesses in Leigh's original template. Additionally, they apply this bull flag labeling method to intraday 23 | candlestick data, rather than just closing prices. 24 | 25 | To find the total weight for a given day, the current price as well as the preceding window days of prices are 26 | used. The data window is split into 10 buckets, each containing a chronological tenth of the data window. Each point 27 | in a bucket is put into a decile corresponding to a position in a column based on percentile relative to the entire 28 | data window. Bottom 10% on the lowest row, next 10% on the second lowest row, etc.
29 | The proportion of points in each decile is reported to finalize the column. The first tenth of the data is 30 | transformed to the leftmost column, the next tenth to the next column on the right and so on until finally a 10 by 31 | 10 matrix is achieved. This matrix is then multiplied element-wise with the 10 by 10 template, and the sum of all 32 | columns is the total weight for the day. If desired, the user can specify a threshold to determine positive and 33 | negative classes. The value of the threshold depends on how strict of a classifier the user desires, and the 34 | allowable values based on the template matrix. 35 | """ 36 | 37 | def __init__(self, prices, window, template_name=None): 38 | """ 39 | :param prices: (pd.Series) Price data for one stock. 40 | :param window: (int) Length of preceding data window used when generating the fit matrix for one day. 41 | :param template_name: (str) Name of an available template in the template library. Allowable names: 42 | ``leigh_bear``, ``leigh_bull``, ``cervelloroyo_bear``, ``cervellororo_bull``. 43 | """ 44 | 45 | pass 46 | 47 | def _init_template(self, name): 48 | """ 49 | :param name: (str) Name of an available template in the template library. Allowable names: ``leigh_bear``, 50 | ``leigh_bull``, ``cervelloroyo_bear``, ``cervellororo_bull``. 51 | """ 52 | 53 | pass 54 | 55 | def set_template(self, template): 56 | """ 57 | :param template: (pd.DataFrame) Template to override the default template. Must be a 10 by 10 pd.DataFrame. 58 | NaN values not allowed, as they will not automatically be treated as zeros. 59 | """ 60 | 61 | pass 62 | 63 | def _transform_data(self, row_num, window=30): 64 | """ 65 | :param row_num: (int) Row number to use for the "current" data point to apply the window to. The data window 66 | contains the row corresponding to row_num, as well as the (self.window-1) preceding rows. 67 | :param window: (int) The number of rows preceding the current one to use for window. Override with 68 | self.window in most cases. 69 | :return: (pd.DataFrame) Transformed 10 by 10 matrix, in which each column corresponds to a chronological tenth 70 | of the data window, and each row corresponds to a price decile relative to the entire data window. 71 | The template matrix is then applied to this output matrix. 72 | """ 73 | 74 | pass 75 | 76 | def _apply_template_to_matrix(self, matrix, template): 77 | """ 78 | :param matrix: (pd.DataFrame) Processed 10 by 10 matrix, where each column represents a chronological tenth 79 | of the data, and each row represents a decile relative to the entire data window. 80 | :param template: (pd.DataFrame) Template to apply the processed matrix to. 81 | :return: (float) The total score for the day. Consists of the sum of the column sums of the result from 82 | multiplying the matrix element-wise with the template. 83 | """ 84 | 85 | pass 86 | 87 | def apply_labeling_matrix(self, threshold=None): 88 | """ 89 | :param threshold: (float) If None, labels will be returned numerically as the score for the day. If not None, 90 | then labels are returned categorically, with the positive category for labels that are equal to 91 | or exceed the threshold. 92 | :return: (pd.Series) Total scores for the data series on each eligible day (meaning for indices self.window and 93 | onwards).
94 | """ 95 | 96 | pass 97 | -------------------------------------------------------------------------------- /mlfinlab/labeling/raw_return.py: -------------------------------------------------------------------------------- 1 | """ 2 | Labeling Raw Returns. 3 | 4 | Most basic form of labeling based on raw return of each observation relative to its previous value. 5 | """ 6 | 7 | import numpy as np 8 | 9 | 10 | def raw_return(prices, binary=False, logarithmic=False, resample_by=None, lag=True): 11 | """ 12 | Raw returns labeling method. 13 | 14 | This is the most basic and ubiquitous labeling method used as a precursor to almost any kind of financial data 15 | analysis or machine learning. User can specify simple or logarithmic returns, numerical or binary labels, a 16 | resample period, and whether returns are lagged to be forward looking. 17 | 18 | :param prices: (pd.Series or pd.DataFrame) Time-indexed price data on stocks with which to calculate return. 19 | :param binary: (bool) If False, will return numerical returns. If True, will return the sign of the raw return. 20 | :param logarithmic: (bool) If False, will calculate simple returns. If True, will calculate logarithmic returns. 21 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 22 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 23 | For full details see `here. 24 | `_ 25 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 26 | :return: (pd.Series or pd.DataFrame) Raw returns on market data. User can specify whether returns will be based on 27 | simple or logarithmic return, and whether the output will be numerical or categorical. 28 | """ 29 | 30 | pass 31 | -------------------------------------------------------------------------------- /mlfinlab/labeling/return_vs_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Return in excess of a given benchmark. 3 | 4 | Chapter 5, Machine Learning for Factor Investing, by Coqueret and Guida, (2020). 5 | 6 | Work "Evaluating multiple classifiers for stock price direction prediction" by Ballings et al. (2015) uses this method 7 | to label yearly returns over a predetermined value to compare the performance of several machine learning algorithms. 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | 12 | 13 | def return_over_benchmark(prices, benchmark=0, binary=False, resample_by=None, lag=True): 14 | """ 15 | Return over benchmark labeling method. Sourced from Chapter 5.5.1 of Machine Learning for Factor Investing, 16 | by Coqueret, G. and Guida, T. (2020). 17 | 18 | Returns a Series or DataFrame of numerical or categorical returns over a given benchmark. The time index of the 19 | benchmark must match those of the price observations. 20 | 21 | :param prices: (pd.Series or pd.DataFrame) Time indexed prices to compare returns against a benchmark. 22 | :param benchmark: (pd.Series or float) Benchmark of returns to compare the returns from prices against for labeling. 23 | Can be a constant value, or a Series matching the index of prices. If no benchmark is given, then it 24 | is assumed to have a constant value of 0. 25 | :param binary: (bool) If False, labels are given by their numerical value of return over benchmark. If True, 26 | labels are given according to the sign of their excess return. 
27 | :param resample_by: (str) If not None, the resampling period for price data prior to calculating returns. 'B' = per 28 | business day, 'W' = week, 'M' = month, etc. Will take the last observation for each period. 29 | For full details see `here. 30 | `_ 31 | :param lag: (bool) If True, returns will be lagged to make them forward-looking. 32 | :return: (pd.Series or pd.DataFrame) Excess returns over benchmark. If binary, the labels are -1 if the 33 | return is below the benchmark, 1 if above, and 0 if it exactly matches the benchmark. 34 | """ 35 | 36 | pass 37 | -------------------------------------------------------------------------------- /mlfinlab/labeling/tail_sets.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-module-docstring 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | class TailSetLabels: 7 | """ 8 | Tail set labels are a classification labeling technique introduced in the following paper: Nonlinear support vector 9 | machines can systematically identify stocks with high and low future returns. Algorithmic Finance, 2(1), pp.45-58. 10 | 11 | A tail set is defined to be a group of stocks whose volatility-adjusted return is in the highest or lowest 12 | quantile, for example the highest or lowest 5%. 13 | 14 | A classification model is then fit using these labels to determine which stocks to buy and sell in a long / short 15 | portfolio. 16 | """ 17 | 18 | def __init__(self, prices, n_bins, vol_adj=None, window=None): 19 | """ 20 | :param prices: (pd.DataFrame) Asset prices. 21 | :param n_bins: (int) Number of bins to determine the quantiles for defining the tail sets. The top and 22 | bottom quantiles are considered to be the positive and negative tail sets, respectively. 23 | :param vol_adj: (str) Whether to take volatility adjusted returns. Allowable inputs are ``None``, 24 | ``mean_abs_dev``, and ``stdev``. 25 | :param window: (int) Window period used in the calculation of the volatility adjusted returns, if vol_adj is not 26 | None. Has no impact if vol_adj is None. 27 | """ 28 | 29 | pass 30 | 31 | def get_tail_sets(self): 32 | """ 33 | Computes the tail sets (positive and negative) and then returns a tuple with 3 elements, positive set, negative 34 | set, full matrix set. 35 | 36 | The positive and negative sets are each a series of lists with the names of the securities that fall within each 37 | set at a specific timestamp. 38 | 39 | For the full matrix a value of 1 indicates the volatility adjusted returns were in the top quantile, a value of 40 | -1 for the bottom quantile. 41 | :return: (tuple) positive set, negative set, full matrix set. 42 | """ 43 | 44 | pass 45 | 46 | def _vol_adjusted_rets(self): 47 | """ 48 | Computes the volatility adjusted returns. This is simply the log returns divided by a volatility estimate. We 49 | have provided 2 techniques for volatility estimation: an exponential moving average and the traditional standard 50 | deviation. 51 | """ 52 | 53 | pass 54 | 55 | def _extract_tail_sets(self, row): 56 | """ 57 | Method used in a .apply() setting to transform each row in a DataFrame to the positive and negative tail sets. 58 | 59 | This method splits the data into quantiles determined by the user, with n_bins. 60 | 61 | :param row: (pd.Series) Vol adjusted returns for a given date. 62 | :return: (pd.Series) Tail set with positive and negative labels. 
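A minimal sketch of the quantile split described above (illustrative only; pd.qcut-style binning stands in for the actual implementation)::

    bins = pd.qcut(row, q=self.n_bins, labels=False)  # bin indices 0 .. n_bins - 1
    labels = pd.Series(0, index=row.index)
    labels[bins == 0] = -1                 # bottom quantile -> negative tail set
    labels[bins == self.n_bins - 1] = 1    # top quantile -> positive tail set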
63 | """ 64 | 65 | pass 66 | 67 | @staticmethod 68 | def _positive_tail_set(row): 69 | """ 70 | Takes as input a row from the vol_adj_ret DataFrame and then returns a list of names of the securities in the 71 | positive tail set, for this specific row date. 72 | 73 | This method is used in an apply() setting. 74 | 75 | :param row: (pd.Series) Labeled row of several stocks where each is already labeled with +1 (positive tail set), 76 | -1 (negative tail set), or 0. 77 | :return: (list) Securities in the positive tail set. 78 | """ 79 | 80 | pass 81 | 82 | @staticmethod 83 | def _negative_tail_set(row): 84 | """ 85 | Takes as input a row from the vol_adj_ret DataFrame and then returns a list of names of the securities in the 86 | negative tail set, for this specific row date. 87 | 88 | This method is used in an apply() setting. 89 | 90 | :param row: (pd.Series) Labeled row of several stocks where each is already labeled with +1 (positive tail set), 91 | -1 (negative tail set), or 0. 92 | :return: (list) Securities in the negative tail set. 93 | """ 94 | 95 | pass 96 | -------------------------------------------------------------------------------- /mlfinlab/labeling/trend_scanning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Trend-Scanning labels described in `Advances in Financial Machine Learning: Lecture 3/10 3 | `_ 4 | """ 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from mlfinlab.structural_breaks.sadf import get_betas 10 | 11 | 12 | def trend_scanning_labels(price_series: pd.Series, t_events: list = None, observation_window: int = 20, 13 | look_forward: bool = True, min_sample_length: int = 5, step: int = 1) -> pd.DataFrame: 14 | """ 15 | `Trend scanning `_ is both a classification and 16 | regression labeling technique. 17 | 18 | That can be used in the following ways: 19 | 20 | 1. Classification: By taking the sign of t-value for a given observation we can set {-1, 1} labels to define the 21 | trends as either downward or upward. 22 | 2. Classification: By adding a minimum t-value threshold you can generate {-1, 0, 1} labels for downward, no-trend, 23 | upward. 24 | 3. The t-values can be used as sample weights in classification problems. 25 | 4. Regression: The t-values can be used in a regression setting to determine the magnitude of the trend. 26 | 27 | The output of this algorithm is a DataFrame with t1 (time stamp for the farthest observation), t-value, returns for 28 | the trend, and bin. 29 | 30 | This function allows using both forward-looking and backward-looking window (use the look_forward parameter). 31 | 32 | :param price_series: (pd.Series) Close prices used to label the data set 33 | :param t_events: (list) Filtered events, array of pd.Timestamps 34 | :param observation_window: (int) Maximum look forward window used to get the trend value 35 | :param look_forward: (bool) True if using a forward-looking window, False if using a backward-looking one 36 | :param min_sample_length: (int) Minimum sample length used to fit regression 37 | :param step: (int) Optimal t-value index is searched every 'step' indices 38 | :return: (pd.DataFrame) Consists of t1, t-value, ret, bin (label information). 
t1 - label end time, t-value, 39 | ret - price change %, bin - label value based on price change sign 40 | """ 41 | # pylint: disable=invalid-name 42 | 43 | pass 44 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions derived from Chapter 19: Market Microstructural features 3 | """ 4 | 5 | from mlfinlab.microstructural_features.encoding import quantile_mapping, encode_array, encode_tick_rule_array, \ 6 | sigma_mapping 7 | from mlfinlab.microstructural_features.entropy import get_lempel_ziv_entropy, get_shannon_entropy, get_plug_in_entropy, \ 8 | get_konto_entropy 9 | from mlfinlab.microstructural_features.feature_generator import MicrostructuralFeaturesGenerator 10 | from mlfinlab.microstructural_features.first_generation import get_corwin_schultz_estimator, get_roll_measure, \ 11 | get_roll_impact, get_bekker_parkinson_vol 12 | from mlfinlab.microstructural_features.misc import get_avg_tick_size, vwap 13 | from mlfinlab.microstructural_features.second_generation import get_bar_based_kyle_lambda, get_bar_based_amihud_lambda, \ 14 | get_bar_based_hasbrouck_lambda, get_trades_based_kyle_lambda, get_trades_based_amihud_lambda, \ 15 | get_trades_based_hasbrouck_lambda 16 | from mlfinlab.microstructural_features.third_generation import get_vpin 17 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various functions for message encoding (quantile) 3 | """ 4 | import numpy as np 5 | 6 | 7 | def encode_tick_rule_array(tick_rule_array: list) -> str: 8 | """ 9 | Encode array of tick signs (-1, 1, 0) 10 | 11 | :param tick_rule_array: (list) Tick rules 12 | :return: (str) Encoded message 13 | """ 14 | 15 | pass 16 | 17 | 18 | def _get_ascii_table() -> list: 19 | """ 20 | Get all ASCII symbols 21 | 22 | :return: (list) ASCII symbols 23 | """ 24 | 25 | pass 26 | 27 | 28 | def quantile_mapping(array: list, num_letters: int = 26) -> dict: 29 | """ 30 | Generate dictionary of quantile-letters based on values from array and dictionary length (num_letters). 31 | 32 | :param array: (list) Values to split on quantiles 33 | :param num_letters: (int) Number of letters (quantiles) to encode 34 | :return: (dict) Dict of quantile-symbol 35 | """ 36 | 37 | pass 38 | 39 | 40 | def sigma_mapping(array: list, step: float = 0.01) -> dict: 41 | """ 42 | Generate dictionary of sigma encoded letters based on values from array and discretization step. 43 | 44 | :param array: (list) Values to split on quantiles 45 | :param step: (float) Discretization step (sigma) 46 | :return: (dict) Dict of value-symbol 47 | """ 48 | 49 | pass 50 | 51 | 52 | def _find_nearest(array: list, value: float) -> float: 53 | """ 54 | Find the nearest element from array to value. 55 | 56 | :param array: (list) Values 57 | :param value: (float) Value for which the nearest element needs to be found 58 | :return: (float) The element in array nearest to the value 59 | """ 60 | 61 | pass 62 | 63 | 64 | def _get_letter_from_encoding(value: float, encoding_dict: dict) -> str: 65 | """ 66 | Get letter for float/int value from encoding dict.
67 | 68 | :param value: (float/int) Value to use 69 | :param encoding_dict: (dict) Used dictionary 70 | :return: (str) Letter from encoding dict 71 | """ 72 | 73 | pass 74 | 75 | 76 | def encode_array(array: list, encoding_dict: dict) -> str: 77 | """ 78 | Encode array with strings using encoding dict. In case of multiple occurrences of the minimum values, 79 | the indices corresponding to the first occurrence are returned 80 | 81 | :param array: (list) Values to encode 82 | :param encoding_dict: (dict) Dict of quantile-symbol 83 | :return: (str) Encoded message 84 | """ 85 | 86 | pass 87 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/entropy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entropy calculation module (Shannon, Lempel-Ziv, Plug-In, Konto) 3 | """ 4 | 5 | import math 6 | from typing import Union 7 | 8 | import numpy as np 9 | from numba import njit 10 | 11 | 12 | def get_shannon_entropy(message: str) -> float: 13 | """ 14 | Advances in Financial Machine Learning, page 263-264. 15 | 16 | Get Shannon entropy from message 17 | 18 | :param message: (str) Encoded message 19 | :return: (float) Shannon entropy 20 | """ 21 | 22 | pass 23 | 24 | 25 | def get_lempel_ziv_entropy(message: str) -> float: 26 | """ 27 | Advances in Financial Machine Learning, Snippet 18.2, page 266. 28 | 29 | Get Lempel-Ziv entropy estimate 30 | 31 | :param message: (str) Encoded message 32 | :return: (float) Lempel-Ziv entropy 33 | """ 34 | 35 | pass 36 | 37 | 38 | def _prob_mass_function(message: str, word_length: int) -> dict: 39 | """ 40 | Advances in Financial Machine Learning, Snippet 18.1, page 266. 41 | 42 | Compute probability mass function for a one-dim discrete rv 43 | 44 | :param message: (str or array) Encoded message 45 | :param word_length: (int) Approximate word length 46 | :return: (dict) Dict of pmf for each word from message 47 | """ 48 | 49 | pass 50 | 51 | 52 | def get_plug_in_entropy(message: str, word_length: int = None) -> float: 53 | """ 54 | Advances in Financial Machine Learning, Snippet 18.1, page 265. 55 | 56 | Get Plug-in entropy estimator 57 | 58 | :param message: (str or array) Encoded message 59 | :param word_length: (int) Approximate word length 60 | :return: (float) Plug-in entropy 61 | """ 62 | 63 | pass 64 | 65 | 66 | @njit() 67 | def _match_length(message: str, start_index: int, window: int) -> Union[int, str]: # pragma: no cover 68 | """ 69 | Advances in Financial Machine Learning, Snippet 18.3, page 267. 70 | 71 | Function That Computes the Length of the Longest Match 72 | 73 | :param message: (str or array) Encoded message 74 | :param start_index: (int) Start index for search 75 | :param window: (int) Window length 76 | :return: (int, str) Match length and matched string 77 | """ 78 | 79 | pass 80 | 81 | 82 | def get_konto_entropy(message: str, window: int = 0) -> float: 83 | """ 84 | Advances in Financial Machine Learning, Snippet 18.4, page 268.
85 | 86 | Implementations of Algorithms Discussed in Gao et al.[2008] 87 | 88 | Get Kontoyiannis entropy 89 | 90 | :param message: (str or array) Encoded message 91 | :param window: (int) Expanding window length, can be negative 92 | :return: (float) Kontoyiannis entropy 93 | """ 94 | 95 | pass 96 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/feature_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inter-bar feature generator which uses trades data and bars index to calculate inter-bar features 3 | """ 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from mlfinlab.microstructural_features.entropy import get_shannon_entropy, get_plug_in_entropy, get_lempel_ziv_entropy, \ 8 | get_konto_entropy 9 | from mlfinlab.microstructural_features.encoding import encode_array 10 | from mlfinlab.microstructural_features.second_generation import get_trades_based_kyle_lambda, \ 11 | get_trades_based_amihud_lambda, get_trades_based_hasbrouck_lambda 12 | from mlfinlab.microstructural_features.misc import get_avg_tick_size, vwap 13 | from mlfinlab.microstructural_features.encoding import encode_tick_rule_array 14 | from mlfinlab.util.misc import crop_data_frame_in_batches 15 | 16 | 17 | # pylint: disable=too-many-instance-attributes 18 | 19 | class MicrostructuralFeaturesGenerator: 20 | """ 21 | Class which is used to generate inter-bar features when bars are already compressed. 22 | 23 | :param trades_input: (str or pd.DataFrame) Path to the csv file or Pandas DataFrame containing raw tick data 24 | in the format[date_time, price, volume] 25 | :param tick_num_series: (pd.Series) Series of tick number where bar was formed. 26 | :param batch_size: (int) Number of rows to read in from the csv, per batch. 27 | :param volume_encoding: (dict) Dictionary of encoding scheme for trades size used to calculate entropy on encoded messages 28 | :param pct_encoding: (dict) Dictionary of encoding scheme for log returns used to calculate entropy on encoded messages 29 | 30 | """ 31 | 32 | def __init__(self, trades_input: (str, pd.DataFrame), tick_num_series: pd.Series, batch_size: int = 2e7, 33 | volume_encoding: dict = None, pct_encoding: dict = None): 34 | """ 35 | Constructor 36 | 37 | :param trades_input: (str or pd.DataFrame) Path to the csv file or Pandas DataFrame containing raw tick data 38 | in the format[date_time, price, volume] 39 | :param tick_num_series: (pd.Series) Series of tick number where bar was formed. 40 | :param batch_size: (int) Number of rows to read in from the csv, per batch. 41 | :param volume_encoding: (dict) Dictionary of encoding scheme for trades size used to calculate entropy on encoded messages 42 | :param pct_encoding: (dict) Dictionary of encoding scheme for log returns used to calculate entropy on encoded messages 43 | """ 44 | 45 | 46 | pass 47 | 48 | def get_features(self, verbose=True, to_csv=False, output_path=None): 49 | """ 50 | Reads a csv file of ticks or pd.DataFrame in batches and then constructs corresponding microstructural intra-bar features: 51 | average tick size, tick rule sum, VWAP, Kyle lambda, Amihud lambda, Hasbrouck lambda, tick/volume/pct Shannon, Lempel-Ziv, 52 | Plug-in entropies if corresponding mapping dictionaries are provided (self.volume_encoding, self.pct_encoding). 53 | The csv file must have only 3 columns: date_time, price, & volume. 
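A hedged usage sketch (the csv path, the bar index series and the number of letters are placeholders; quantile_mapping is the encoding helper from this module)::

    from mlfinlab.microstructural_features import quantile_mapping

    volume_encoding = quantile_mapping(trades['volume'].values, num_letters=10)
    generator = MicrostructuralFeaturesGenerator('trades.csv', bar_tick_numbers,
                                                 volume_encoding=volume_encoding)
    features = generator.get_features(verbose=False)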
54 | 55 | :param verbose: (bool) Flag whether to print message on each processed batch or not 56 | :param to_csv: (bool) Flag for writing the results of bars generation to local csv file, or to in-memory DataFrame 57 | :param output_path: (str) Path to results file, if to_csv = True 58 | :return: (DataFrame or None) Microstructural features for bar index 59 | """ 60 | 61 | pass 62 | 63 | def _reset_cache(self): 64 | """ 65 | Reset price_diff, trade_size, tick_rule, log_ret arrays to empty when bar is formed and features are 66 | calculated 67 | 68 | :return: None 69 | """ 70 | 71 | pass 72 | 73 | def _extract_bars(self, data): 74 | """ 75 | For loop which calculates features for formed bars using trades data 76 | 77 | :param data: (tuple) Contains 3 columns - date_time, price, and volume. 78 | """ 79 | 80 | pass 81 | 82 | def _get_bar_features(self, date_time: pd.Timestamp, list_bars: list) -> list: 83 | """ 84 | Calculate inter-bar features: lambdas, entropies, avg_tick_size, vwap 85 | 86 | :param date_time: (pd.Timestamp) When bar was formed 87 | :param list_bars: (list) Previously formed bars 88 | :return: (list) Inter-bar features 89 | """ 90 | 91 | pass 92 | 93 | def _apply_tick_rule(self, price: float) -> int: 94 | """ 95 | Advances in Financial Machine Learning, page 29. 96 | 97 | Applies the tick rule 98 | 99 | :param price: (float) Price at time t 100 | :return: (int) The signed tick 101 | """ 102 | 103 | pass 104 | 105 | def _get_price_diff(self, price: float) -> float: 106 | """ 107 | Get price difference between ticks 108 | 109 | :param price: (float) Price at time t 110 | :return: (float) Price difference 111 | """ 112 | 113 | pass 114 | 115 | def _get_log_ret(self, price: float) -> float: 116 | """ 117 | Get log return between ticks 118 | 119 | :param price: (float) Price at time t 120 | :return: (float) Log return 121 | """ 122 | 123 | pass 124 | 125 | @staticmethod 126 | def _assert_csv(test_batch): 127 | """ 128 | Tests that the csv file read has the format: date_time, price, and volume. 129 | If not, then the user needs to create such a file. This format is in place to remove any unwanted overhead. 130 | 131 | :param test_batch: (pd.DataFrame) The first row of the dataset. 132 | :return: (None) 133 | """ 134 | 135 | pass 136 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/first_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | First generation features (Roll Measure/Impact, Corwin-Schultz spread estimator) 3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | def get_roll_measure(close_prices: pd.Series, window: int = 20) -> pd.Series: 10 | """ 11 | Advances in Financial Machine Learning, page 282. 12 | 13 | Get Roll Measure 14 | 15 | Roll Measure gives the estimate of effective bid-ask spread 16 | without using quote-data. 17 | 18 | :param close_prices: (pd.Series) Close prices 19 | :param window: (int) Estimation window 20 | :return: (pd.Series) Roll measure 21 | """ 22 | 23 | pass 24 | 25 | 26 | def get_roll_impact(close_prices: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series: 27 | """ 28 | Get Roll Impact. 29 | 30 | Derived from the Roll Measure, it takes into account dollar volume traded.
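A minimal sketch under that reading (illustrative only, not necessarily the verbatim implementation)::

    roll_impact = get_roll_measure(close_prices, window) / dollar_volume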
31 | 32 | :param close_prices: (pd.Series) Close prices 33 | :param dollar_volume: (pd.Series) Dollar volume series 34 | :param window: (int) Estimation window 35 | :return: (pd.Series) Roll impact 36 | """ 37 | 38 | pass 39 | 40 | 41 | # Corwin-Schultz algorithm 42 | def _get_beta(high: pd.Series, low: pd.Series, window: int) -> pd.Series: 43 | """ 44 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 45 | 46 | Get beta estimate from Corwin-Schultz algorithm 47 | 48 | :param high: (pd.Series) High prices 49 | :param low: (pd.Series) Low prices 50 | :param window: (int) Estimation window 51 | :return: (pd.Series) Beta estimates 52 | """ 53 | 54 | pass 55 | 56 | 57 | def _get_gamma(high: pd.Series, low: pd.Series) -> pd.Series: 58 | """ 59 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 60 | 61 | Get gamma estimate from Corwin-Schultz algorithm. 62 | 63 | :param high: (pd.Series) High prices 64 | :param low: (pd.Series) Low prices 65 | :return: (pd.Series) Gamma estimates 66 | """ 67 | 68 | pass 69 | 70 | 71 | def _get_alpha(beta: pd.Series, gamma: pd.Series) -> pd.Series: 72 | """ 73 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 74 | 75 | Get alpha from Corwin-Schultz algorithm. 76 | 77 | :param beta: (pd.Series) Beta estimates 78 | :param gamma: (pd.Series) Gamma estimates 79 | :return: (pd.Series) Alphas 80 | """ 81 | 82 | pass 83 | 84 | 85 | def get_corwin_schultz_estimator(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series: 86 | """ 87 | Advances in Financial Machine Learning, Snippet 19.1, page 285. 88 | 89 | Get Corwin-Schultz spread estimator using high-low prices 90 | 91 | :param high: (pd.Series) High prices 92 | :param low: (pd.Series) Low prices 93 | :param window: (int) Estimation window 94 | :return: (pd.Series) Corwin-Schultz spread estimators 95 | """ 96 | # Note: S<0 iff alpha<0 97 | 98 | pass 99 | 100 | 101 | def get_bekker_parkinson_vol(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series: 102 | """ 103 | Advances in Financial Machine Learning, Snippet 19.2, page 286. 104 | 105 | Get Bekker-Parkinson volatility from gamma and beta in Corwin-Schultz algorithm. 106 | 107 | :param high: (pd.Series) High prices 108 | :param low: (pd.Series) Low prices 109 | :param window: (int) Estimation window 110 | :return: (pd.Series) Bekker-Parkinson volatility estimates 111 | """ 112 | # pylint: disable=invalid-name 113 | 114 | pass 115 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various miscellaneous microstructural features (VWAP, average tick size) 3 | """ 4 | 5 | import numpy as np 6 | 7 | 8 | def vwap(dollar_volume: list, volume: list) -> float: 9 | """ 10 | Get Volume Weighted Average Price (VWAP). 11 | 12 | :param dollar_volume: (list) Dollar volumes 13 | :param volume: (list) Trades sizes 14 | :return: (float) VWAP value 15 | """ 16 | 17 | pass 18 | 19 | 20 | def get_avg_tick_size(tick_size_arr: list) -> float: 21 | """ 22 | Get average tick size in a bar.
23 | 24 | :param tick_size_arr: (list) Trade sizes 25 | :return: (float) Average trade size 26 | """ 27 | 28 | pass 29 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/second_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Second generation models features: Kyle lambda, Amihud Lambda, Hasbrouck lambda (bar and trade based) 3 | """ 4 | 5 | from typing import List 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from mlfinlab.structural_breaks.sadf import get_betas 10 | 11 | # pylint: disable=invalid-name 12 | def get_bar_based_kyle_lambda(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series: 13 | """ 14 | Advances in Financial Machine Learning, p. 286-288. 15 | 16 | Get Kyle lambda from bars data 17 | 18 | :param close: (pd.Series) Close prices 19 | :param volume: (pd.Series) Bar volume 20 | :param window: (int) Rolling window used for estimation 21 | :return: (pd.Series) Kyle lambdas 22 | """ 23 | 24 | pass 25 | 26 | 27 | def get_bar_based_amihud_lambda(close: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series: 28 | """ 29 | Advances in Financial Machine Learning, p.288-289. 30 | 31 | Get Amihud lambda from bars data 32 | 33 | :param close: (pd.Series) Close prices 34 | :param dollar_volume: (pd.Series) Dollar volumes 35 | :param window: (int) rolling window used for estimation 36 | :return: (pd.Series) of Amihud lambda 37 | """ 38 | 39 | pass 40 | 41 | def get_bar_based_hasbrouck_lambda(close: pd.Series, dollar_volume: pd.Series, window: int = 20) -> pd.Series: 42 | """ 43 | Advances in Financial Machine Learning, p.289-290. 44 | 45 | Get Hasbrouck lambda from bars data 46 | 47 | :param close: (pd.Series) Close prices 48 | :param dollar_volume: (pd.Series) Dollar volumes 49 | :param window: (int) Rolling window used for estimation 50 | :return: (pd.Series) Hasbrouck lambda 51 | """ 52 | 53 | pass 54 | 55 | 56 | def get_trades_based_kyle_lambda(price_diff: list, volume: list, aggressor_flags: list) -> List[float]: 57 | """ 58 | Advances in Financial Machine Learning, p.286-288. 59 | 60 | Get Kyle lambda from trades data 61 | 62 | :param price_diff: (list) Price diffs 63 | :param volume: (list) Trades sizes 64 | :param aggressor_flags: (list) Trade directions [-1, 1] (tick rule or aggressor side can be used to define) 65 | :return: (list) Kyle lambda for a bar and t-value 66 | """ 67 | 68 | pass 69 | 70 | 71 | def get_trades_based_amihud_lambda(log_ret: list, dollar_volume: list) -> List[float]: 72 | """ 73 | Advances in Financial Machine Learning, p.288-289. 74 | 75 | Get Amihud lambda from trades data 76 | 77 | :param log_ret: (list) Log returns 78 | :param dollar_volume: (list) Dollar volumes (price * size) 79 | :return: (float) Amihud lambda for a bar 80 | """ 81 | 82 | pass 83 | 84 | 85 | def get_trades_based_hasbrouck_lambda(log_ret: list, dollar_volume: list, aggressor_flags: list) -> List[float]: 86 | """ 87 | Advances in Financial Machine Learning, p.289-290. 
88 | 89 | Get Hasbrouck lambda from trades data 90 | 91 | :param log_ret: (list) Log returns 92 | :param dollar_volume: (list) Dollar volumes (price * size) 93 | :param aggressor_flags: (list) Trade directions [-1, 1] (tick rule or aggressor side can be used to define) 94 | :return: (list) Hasbrouck lambda for a bar and t-value 95 | """ 96 | 97 | pass 98 | -------------------------------------------------------------------------------- /mlfinlab/microstructural_features/third_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Third generation models implementation (VPIN) 3 | """ 4 | import pandas as pd 5 | 6 | 7 | def get_vpin(volume: pd.Series, buy_volume: pd.Series, window: int = 1) -> pd.Series: 8 | """ 9 | Advances in Financial Machine Learning, p. 292-293. 10 | 11 | Get Volume-Synchronized Probability of Informed Trading (VPIN) from bars 12 | 13 | :param volume: (pd.Series) Bar volume 14 | :param buy_volume: (pd.Series) Bar volume classified as buy (either tick rule, BVC or aggressor side methods applied) 15 | :param window: (int) Estimation window 16 | :return: (pd.Series) VPIN series 17 | """ 18 | 19 | pass 20 | -------------------------------------------------------------------------------- /mlfinlab/multi_product/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality relating to the ETF trick and stitching futures contracts together. 3 | """ 4 | 5 | from mlfinlab.multi_product.etf_trick import (ETFTrick, get_futures_roll_series) 6 | -------------------------------------------------------------------------------- /mlfinlab/networks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools to visualise and filter networks of complex systems. 3 | """ 4 | 5 | from mlfinlab.networks.dash_graph import DashGraph, PMFGDash 6 | from mlfinlab.networks.dual_dash_graph import DualDashGraph 7 | from mlfinlab.networks.graph import Graph 8 | from mlfinlab.networks.mst import MST 9 | from mlfinlab.networks.almst import ALMST 10 | from mlfinlab.networks.pmfg import PMFG 11 | from mlfinlab.networks.visualisations import (generate_mst_server, create_input_matrix, generate_almst_server, 12 | generate_mst_almst_comparison) 13 | -------------------------------------------------------------------------------- /mlfinlab/networks/almst.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices. 3 | """ 4 | 5 | import heapq 6 | import itertools 7 | from itertools import count 8 | 9 | import networkx as nx 10 | import numpy as np 11 | import pandas as pd 12 | from mlfinlab.networks.graph import Graph 13 | 14 | 15 | class ALMST(Graph): 16 | """ 17 | ALMST is a subclass of Graph which creates an ALMST Graph object. 18 | The ALMST class converts a distance matrix input into an ALMST matrix. This is then used to create a nx.Graph object. 19 | """ 20 | 21 | def __init__(self, matrix, matrix_type, mst_algorithm='kruskal'): 22 | """ 23 | Initialises the ALMST and sets the self.graph attribute as the ALMST graph. 24 | 25 | :param matrix: (pd.Dataframe) Input matrix, such as a distance or correlation matrix. 26 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation"). 27 | :param mst_algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim'.
28 | By default, the MST algorithm uses Kruskal's. 29 | """ 30 | 31 | pass 32 | 33 | @staticmethod 34 | def create_almst_kruskals(matrix): 35 | """ 36 | This method converts the input matrix into an ALMST matrix. 37 | 38 | ! Currently only works with distance input matrix 39 | 40 | :param matrix: (pd.Dataframe) Input matrix. 41 | :return: (pd.Dataframe) ALMST matrix with all other edges as 0 values. 42 | """ 43 | 44 | pass 45 | 46 | @staticmethod 47 | def _generate_ordered_heap(matrix, clusters): 48 | """ 49 | Given the matrix of edges, and the list of clusters, generate a heap ordered by the average distance between the clusters. 50 | 51 | :param matrix: (pd.Dataframe) Input matrix of the distance matrix. 52 | :param clusters: (List) A list of clusters, where each list contains a list of nodes within the cluster. 53 | :return: (Heap) Returns a heap ordered by the average distance between the clusters. 54 | """ 55 | 56 | pass 57 | 58 | @staticmethod 59 | def _calculate_average_distance(matrix, clusters, c_x, c_y): 60 | """ 61 | Given two clusters, calculates the average distance between the two. 62 | 63 | :param matrix: (pd.Dataframe) Input matrix with all edges. 64 | :param clusters: (List) List of clusters. 65 | :param c_x: (int) Cluster x, where x is the index of the cluster. 66 | :param c_y: (int) Cluster y, where y is the index of the cluster. 67 | """ 68 | 69 | pass 70 | 71 | @staticmethod 72 | def _get_min_edge(node, cluster, matrix): 73 | """ 74 | Returns the minimum edge tuple given a node and a cluster. 75 | 76 | :param node: (str) String of the node name. 77 | :param cluster: (list) List of node names. 78 | :param matrix: (pd.DataFrame) A matrix of all edges. 79 | :return: (tuple) A tuple of average distance from node to the cluster, and the minimum edge nodes, i and j. 80 | """ 81 | 82 | pass 83 | 84 | @staticmethod 85 | def _get_min_edge_clusters(cluster_one, cluster_two, matrix): 86 | """ 87 | Returns a tuple of the minimum edge and the average length for two clusters. 88 | 89 | :param cluster_one: (list) List of node names. 90 | :param cluster_two: (list) List of node names. 91 | :param matrix: (pd.DataFrame) A matrix of all edges. 92 | :return: (tuple) A tuple of average distance between the clusters, and the minimum edge nodes, i and j. 93 | """ 94 | 95 | pass 96 | 97 | @staticmethod 98 | def create_almst(matrix): 99 | """ 100 | Creates and returns an ALMST given an input matrix using Prim's algorithm. 101 | 102 | :param matrix: (pd.Dataframe) Input distance matrix of all edges. 103 | :return: (pd.Dataframe) Returns the ALMST in matrix format. 104 | """ 105 | 106 | pass 107 | 108 | @staticmethod 109 | def _add_next_edge(visited, children, matrix, almst_matrix): 110 | """ 111 | Adds the next edge with the minimum average distance. 112 | 113 | :param visited: (Set) A set of visited nodes. 114 | :param children: (Set) A set of children or frontier nodes, to be visited. 115 | :param matrix: (pd.Dataframe) Input distance matrix of all edges. 116 | :param almst_matrix: (pd.Dataframe) The ALMST matrix. 117 | 118 | :return: (Tuple) Returns the sets visited and children, and the matrix almst_matrix. 119 | """ 120 | 121 | pass 122 | -------------------------------------------------------------------------------- /mlfinlab/networks/dual_dash_graph.py: -------------------------------------------------------------------------------- 1 | """ 2 | This class takes in a Graph object and creates interactive visualisations using Plotly's Dash.
-------------------------------------------------------------------------------- /mlfinlab/networks/dual_dash_graph.py: --------------------------------------------------------------------------------
 1 | """
 2 | This class takes in a Graph object and creates interactive visualisations using Plotly's Dash.
 3 | The DualDashGraph class contains private functions used to generate the frontend components needed to create the UI.
 4 | 
 5 | Running run_server() will produce the warning "Warning: This is a development server. Do not use app.run_server
 6 | in production, use a production WSGI server like gunicorn instead.".
 7 | However, this is okay and the Dash server will run without a problem.
 8 | """
 9 | 
10 | import dash_bootstrap_components as dbc
11 | import dash_cytoscape as cyto
12 | import dash_html_components as html
13 | from dash import Dash
14 | from dash.dependencies import Input, Output, State
15 | from jupyter_dash import JupyterDash
16 | 
17 | class DualDashGraph:
18 | """
19 | The DualDashGraph class is the interface for comparing and highlighting the difference between two graphs.
20 | Two Graph class objects should be supplied - such as MST and ALMST graphs.
21 | """
22 | 
23 | def __init__(self, graph_one, graph_two, app_display='default'):
24 | """
25 | Initialises the dual graph interface and generates the interface layout.
26 | 
27 | :param graph_one: (Graph) The first graph for the comparison interface.
28 | :param graph_two: (Graph) The second graph for the comparison interface.
29 | :param app_display: (str) 'default' by default and 'jupyter notebook' for running Dash inside Jupyter Notebook.
30 | """
31 | 
32 | pass
33 | 
34 | @staticmethod
35 | def _select_other_graph_node(data, elements):
36 | """
37 | Callback function to select the other graph node when a graph node
38 | is selected by setting selected to True.
39 | 
40 | :param data: (Dict) Dictionary of "tapped" or selected node.
41 | :param elements: (Dict) Dictionary of elements.
42 | :return: (Dict) Returns the updated dictionary of elements.
43 | """
44 | 
45 | pass
46 | 
47 | def _generate_comparison_layout(self, graph_one, graph_two):
48 | """
49 | Returns and generates a dual comparison layout.
50 | 
51 | :param graph_one: (Graph) The first graph object for the dual interface.
52 | :param graph_two: (Graph) Comparison graph object for the dual interface.
53 | :return: (html.Div) Returns a Div containing the interface.
54 | """
55 | 
56 | pass
57 | 
58 | @staticmethod
59 | def _get_default_stylesheet(weights):
60 | """
61 | Returns the default stylesheet for initialisation.
62 | 
63 | :param weights: (List) A list of weights of the edges.
64 | :return: (List) A list of definitions used for Dash styling.
65 | """
66 | 
67 | pass
68 | 
69 | def _set_cyto_graph(self):
70 | """
71 | Updates and sets the two cytoscape graphs using the corresponding components.
72 | """
73 | 
74 | pass
75 | 
76 | def _update_elements_dual(self, graph, difference, graph_number):
77 | """
78 | Updates the elements needed for the Dash Cytoscape Graph object.
79 | 
80 | :param graph: (Graph) Graph object such as MST or ALMST.
81 | :param difference: (List) List of edges where the two graphs differ.
82 | :param graph_number: (Int) Graph number to update the correct graph.
83 | """
84 | 
85 | pass
86 | 
87 | def get_server(self):
88 | """
89 | Returns the comparison interface server.
90 | 
91 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
92 | Returns a Jupyter Dash object if DashGraph has been initialised for Jupyter Notebook.
93 | """
94 | 
95 | pass
96 | 
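# A hypothetical usage sketch of the comparison interface above (the graph
# variable names are illustrative; run_server is the entry point the
# get_server docstring refers to):
#
#   mst_graph = MST(input_matrix, 'distance')
#   almst_graph = ALMST(input_matrix, 'distance')
#   app = DualDashGraph(mst_graph, almst_graph).get_server()
#   app.run_server()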
-------------------------------------------------------------------------------- /mlfinlab/networks/graph.py: --------------------------------------------------------------------------------
  1 | """
  2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
  3 | """
  4 | 
  5 | from abc import ABC
  6 | 
  7 | import networkx as nx
  8 | from matplotlib import pyplot as plt
  9 | 
 10 | 
 11 | class Graph(ABC):
 12 | """
 13 | This Graph class is a parent class for different types of graphs such as an MST.
 14 | """
 15 | 
 16 | def __init__(self, matrix_type):
 17 | """
 18 | Initializes the Graph object and the Graph class attributes.
 19 | This includes the specific graph such as an MST stored as an attribute.
 20 | 
 21 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation").
 22 | """
 23 | 
 24 | pass
 25 | 
 26 | def get_matrix_type(self):
 27 | """
 28 | Returns the matrix type set at initialisation.
 29 | 
 30 | :return: (str) String of matrix type (e.g. "correlation" or "distance").
 31 | """
 32 | 
 33 | pass
 34 | 
 35 | def get_graph(self):
 36 | """
 37 | Returns the Graph stored as an attribute.
 38 | 
 39 | :return: (nx.Graph) Returns a NetworkX graph object.
 40 | """
 41 | 
 42 | pass
 43 | 
 44 | def get_difference(self, input_graph_two):
 45 | """
 46 | Given two Graphs with the same nodes, returns the differences in edge connections.
 47 | 
 48 | :param input_graph_two: (Graph) A graph to compare self.graph against.
 49 | :return: (List) A list of unique tuples showing different edge connections.
 50 | """
 51 | 
 52 | pass
 53 | 
 54 | def get_pos(self):
 55 | """
 56 | Returns the dictionary of the node coordinates.
 57 | 
 58 | :return: (Dict) Dictionary of node coordinates.
 59 | """
 60 | 
 61 | pass
 62 | 
 63 | def get_graph_plot(self):
 64 | """
 65 | Returns the graph of the MST with labels.
 66 | Assumes that the matrix contains stock names as headers.
 67 | 
 68 | :return: (AxesSubplot) Axes with graph plot. Call plt.show() to display this graph.
 69 | """
 70 | 
 71 | pass
 72 | 
 73 | def set_node_groups(self, industry_groups):
 74 | """
 75 | Sets the node industry group, by taking in a dictionary of industry group to a list of node indexes.
 76 | 
 77 | :param industry_groups: (Dict) Dictionary of the industry name to a list of node indexes.
 78 | """
 79 | 
 80 | pass
 81 | 
 82 | def set_node_size(self, market_caps):
 83 | """
 84 | Sets the node sizes, given a list of market cap values corresponding to node indexes.
 85 | 
 86 | :param market_caps: (List) List of numbers corresponding to node indexes.
 87 | """
 88 | 
 89 | pass
 90 | 
 91 | def get_node_sizes(self):
 92 | """
 93 | Returns the node sizes as a list.
 94 | 
 95 | :return: (List) List of numbers representing node sizes.
 96 | """
 97 | 
 98 | pass
 99 | 
100 | def get_node_colours(self):
101 | """
102 | Returns a map of industry group matched with list of nodes.
103 | 
104 | :return: (Dict) Dictionary of industry name to list of node indexes.
105 | """
106 | 
107 | pass
108 | 
-------------------------------------------------------------------------------- /mlfinlab/networks/mst.py: --------------------------------------------------------------------------------
 1 | """
 2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
 3 | """
 4 | 
 5 | import networkx as nx
 6 | from mlfinlab.networks.graph import Graph
 7 | 
 8 | 
 9 | class MST(Graph):
10 | """
11 | MST is a subclass of Graph which creates an MST Graph object.
12 | """
13 | 
14 | def __init__(self, matrix, matrix_type, mst_algorithm='kruskal'):
15 | """
16 | Creates an MST Graph object and stores the MST inside the graph attribute.
17 | 
18 | :param matrix: (pd.Dataframe) Input matrix such as a distance or correlation matrix.
19 | :param matrix_type: (str) Name of the matrix type (e.g. "distance" or "correlation").
20 | :param mst_algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim', or 'boruvka'.
21 | By default, MST algorithm uses Kruskal's.
22 | """
23 | 
24 | pass
25 | 
26 | @staticmethod
27 | def create_mst(matrix, algorithm='kruskal'):
28 | """
29 | This method converts the input matrix into an MST graph.
30 | 
31 | :param matrix: (pd.Dataframe) Input matrix.
32 | :param algorithm: (str) Valid MST algorithm types include 'kruskal', 'prim', or 'boruvka'.
33 | By default, MST algorithm uses Kruskal's.
34 | """
35 | 
36 | pass
37 | 
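# A minimal sketch of MST construction on a distance matrix with NetworkX,
# along the lines the MST class docstrings describe (illustrative, not the
# library's code):
import networkx as nx
import pandas as pd

def mst_sketch(distance_matrix: pd.DataFrame, algorithm: str = 'kruskal') -> nx.Graph:
    """Build a weighted graph from a distance matrix and reduce it to its MST."""
    graph = nx.from_pandas_adjacency(distance_matrix)
    # algorithm may be 'kruskal', 'prim' or 'boruvka', as in the docstring above
    return nx.minimum_spanning_tree(graph, algorithm=algorithm)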
-------------------------------------------------------------------------------- /mlfinlab/networks/pmfg.py: --------------------------------------------------------------------------------
 1 | """
 2 | This file contains Graph classes, which create NetworkX's Graph objects from matrices.
 3 | """
 4 | 
 5 | import heapq
 6 | import itertools
 7 | from itertools import count
 8 | import warnings
 9 | 
10 | import networkx as nx
11 | from matplotlib import pyplot as plt
12 | 
13 | from mlfinlab.networks.graph import Graph
14 | 
15 | 
16 | class PMFG(Graph):
17 | """
18 | PMFG class creates and stores the PMFG as an attribute.
19 | """
20 | 
21 | def __init__(self, input_matrix, matrix_type):
22 | """
23 | PMFG class creates the Planar Maximally Filtered Graph and stores it as an attribute.
24 | 
25 | :param input_matrix: (pd.Dataframe) Input distance matrix.
26 | :param matrix_type: (str) Matrix type name (e.g. "distance").
27 | """
28 | 
29 | pass
30 | 
31 | def get_disparity_measure(self):
32 | """
33 | Getter method for the dictionary of disparity measure values of cliques.
34 | 
35 | :return: (Dict) Returns a dictionary of clique to the disparity measure.
36 | """
37 | 
38 | pass
39 | 
40 | def _calculate_disparity(self):
41 | """
42 | Calculate disparity given in Tumminello M, Aste T, Di Matteo T, Mantegna RN.
43 | A tool for filtering information in complex systems.
44 | https://arxiv.org/pdf/cond-mat/0501335.pdf
45 | 
46 | :return: (Dict) Returns a dictionary of clique to the disparity measure.
47 | """
48 | 
49 | pass
50 | 
51 | def _generate_cliques(self):
52 | """
53 | Generate cliques from all of the nodes in the PMFG.
54 | """
55 | 
56 | pass
57 | 
58 | def create_pmfg(self, input_matrix):
59 | """
60 | Creates the PMFG matrix from the input matrix of all edges.
61 | 
62 | :param input_matrix: (pd.Dataframe) Input matrix with all edges.
63 | :return: (nx.Graph) Output PMFG matrix.
64 | """
65 | 
66 | pass
67 | 
68 | def get_mst_edges(self):
69 | """
70 | Returns the list of MST edges.
71 | 
72 | :return: (list) Returns a list of tuples of edges.
73 | """
74 | 
75 | pass
76 | 
77 | def edge_in_mst(self, node1, node2):
78 | """
79 | Checks whether the edge from node1 to node2 is a part of the MST.
80 | 
81 | :param node1: (str) Name of the first node in the edge.
82 | :param node2: (str) Name of the second node in the edge.
83 | :return: (bool) Returns True if the edge is in the MST, False otherwise.
84 | """
85 | 
86 | pass
87 | 
88 | def get_graph_plot(self):
89 | """
90 | Overrides parent get_graph_plot to plot it in a planar format.
91 | 
92 | Returns the graph of the PMFG with labels.
93 | Assumes that the matrix contains stock names as headers.
94 | 
95 | :return: (AxesSubplot) Axes with graph plot. Call plt.show() to display this graph.
96 | """
97 | 
98 | pass
99 | 
-------------------------------------------------------------------------------- /mlfinlab/networks/visualisations.py: --------------------------------------------------------------------------------
 1 | """
 2 | These methods allow the user to easily deploy graph visualisations given an input dataframe.
 3 | """
 4 | 
 5 | import warnings
 6 | import networkx as nx
 7 | 
 8 | from mlfinlab.networks.dash_graph import DashGraph, PMFGDash
 9 | from mlfinlab.networks.dual_dash_graph import DualDashGraph
10 | from mlfinlab.networks.mst import MST
11 | from mlfinlab.networks.almst import ALMST
12 | from mlfinlab.networks.pmfg import PMFG
13 | from mlfinlab.codependence import get_distance_matrix
14 | 
15 | 
16 | def generate_mst_server(log_returns_df, mst_algorithm='kruskal', distance_matrix_type='angular',
17 | jupyter=False, colours=None, sizes=None):
18 | """
19 | This method returns a Dash server ready to be run.
20 | 
21 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
22 | with stock names as columns.
23 | :param mst_algorithm: (str) A valid MST type such as 'kruskal', 'prim', or 'boruvka'.
24 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
25 | namely 'angular', 'abs_angular', 'squared_angular'.
26 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
27 | :param colours: (Dict) A dictionary of key string for category name and value of a list of indexes
28 | corresponding to the node indexes inputted in the initial dataframe.
29 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
30 | in the initial dataframe.
31 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
32 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
33 | """
34 | 
35 | pass
36 | 
37 | 
38 | def create_input_matrix(log_returns_df, distance_matrix_type):
39 | """
40 | This method returns the distance matrix ready to be inputted into the Graph class.
41 | 
42 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
43 | with stock names as columns.
44 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
45 | namely 'angular', 'abs_angular', 'squared_angular'.
46 | :return: (pd.Dataframe) A dataframe of a distance matrix.
47 | """
48 | 
49 | pass
50 | 
51 | 
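# A sketch of the 'angular' transform that create_input_matrix applies to a
# correlation matrix (the standard definition, assumed to match
# mlfinlab.codependence.get_distance_matrix):
import numpy as np
import pandas as pd

def angular_distance(corr_matrix: pd.DataFrame) -> pd.DataFrame:
    """d_ij = sqrt(0.5 * (1 - rho_ij)): maps correlation to a metric in [0, 1]."""
    return np.sqrt(0.5 * (1 - corr_matrix))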
 52 | def generate_almst_server(log_returns_df, distance_matrix_type='angular',
 53 | jupyter=False, colours=None, sizes=None):
 54 | """
 55 | This method returns a Dash server ready to be run.
 56 | 
 57 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
 58 | with stock names as columns.
 59 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
 60 | namely 'angular', 'abs_angular', 'squared_angular'.
 61 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
 62 | :param colours: (Dict) A dictionary of key string for category name and value of a list of indexes
 63 | corresponding to the node indexes inputted in the initial dataframe.
 64 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
 65 | in the initial dataframe.
 66 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
 67 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
 68 | """
 69 | 
 70 | pass
 71 | 
 72 | 
 73 | def generate_mst_almst_comparison(log_returns_df, distance_matrix_type='angular', jupyter=False):
 74 | """
 75 | This method returns a Dash server ready to be run.
 76 | 
 77 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
 78 | with stock names as columns.
 79 | :param distance_matrix_type: (str) A valid sub type of a distance matrix,
 80 | namely 'angular', 'abs_angular', 'squared_angular'.
 81 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
 82 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
 83 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
 84 | """
 85 | 
 86 | pass
 87 | 
 88 | 
 89 | def generate_pmfg_server(log_returns_df, input_type='distance',
 90 | jupyter=False, colours=None, sizes=None):
 91 | """
 92 | This method returns a PMFGDash server ready to be run.
 93 | 
 94 | :param log_returns_df: (pd.Dataframe) An input dataframe of log returns
 95 | with stock names as columns.
 96 | :param input_type: (str) A valid input type, 'correlation' or 'distance'. Inputting correlation will add the edges
 97 | by largest to smallest, instead of smallest to largest.
 98 | :param jupyter: (bool) True if the user would like to run inside jupyter notebook. False otherwise.
 99 | :param colours: (Dict) A dictionary of key string for category name and value of a list of indexes
100 | corresponding to the node indexes inputted in the initial dataframe.
101 | :param sizes: (List) A list of numbers, where the positions correspond to the node indexes inputted
102 | in the initial dataframe.
103 | :return: (Dash) Returns the Dash app object, which can be run using run_server.
104 | Returns a Jupyter Dash object if the parameter jupyter is set to True.
105 | """
106 | 
107 | pass
108 | 
109 | 
110 | def generate_central_peripheral_ranking(nx_graph):
111 | """
112 | Given a NetworkX graph, this method generates and returns a ranking of centrality.
113 | The input should be a distance based PMFG.
114 | 
115 | The ranking combines multiple centrality measures to calculate an overall ranking of how central or peripheral the
116 | nodes are.
117 | The smaller the ranking, the more peripheral the node is. The larger the ranking, the more central the node is.
118 | 
119 | The factors contributing to the ranking include Degree, Eccentricity, Closeness Centrality, Second Order Centrality,
120 | Eigenvector Centrality and Betweenness Centrality. The formulas for these measures can be found in the NetworkX
121 | documentation (https://networkx.github.io/documentation/stable/reference/algorithms/centrality.html)
122 | 
123 | :param nx_graph: (nx.Graph) NetworkX graph object. You can call get_graph() on the MST, ALMST and PMFG to retrieve
124 | the nx.Graph.
125 | :return: (List) Returns a list of tuples of ranking value to node.
126 | """
127 | 
128 | pass
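# A compact sketch in the spirit of generate_central_peripheral_ranking above:
# sum several NetworkX centrality scores per node and sort. The exact measures
# and weighting the library combines are not reproduced here; this subset is
# illustrative only:
import networkx as nx

def centrality_ranking_sketch(nx_graph: nx.Graph) -> list:
    """Return a sorted list of (score, node) tuples; higher scores are more central."""
    measures = [
        nx.degree_centrality(nx_graph),
        nx.closeness_centrality(nx_graph),
        nx.eigenvector_centrality(nx_graph, max_iter=1000),
        nx.betweenness_centrality(nx_graph),
    ]
    scores = {node: sum(measure[node] for measure in measures) for node in nx_graph.nodes}
    return sorted((score, node) for node, score in scores.items())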
-------------------------------------------------------------------------------- /mlfinlab/regression/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Implementation of the historically weighted regression method based on relevance.
3 | """
4 | 
5 | from mlfinlab.regression.history_weight_regression import HistoryWeightRegression
6 | 
-------------------------------------------------------------------------------- /mlfinlab/sample_weights/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Contains the code for implementing sample weights and stacked sample weights.
3 | """
4 | 
5 | from mlfinlab.sample_weights.attribution import (get_weights_by_time_decay, get_weights_by_return,
6 | _apply_weight_by_return, get_stacked_weights_time_decay,
7 | get_stacked_weights_by_return)
8 | 
-------------------------------------------------------------------------------- /mlfinlab/sample_weights/attribution.py: --------------------------------------------------------------------------------
 1 | """
 2 | Logic regarding return and time decay attribution for sample weights from chapter 4.
 3 | Also contains stacked sample weights logic: return and time based sample weights for a multi-asset dataset.
 4 | """
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | from mlfinlab.sampling.concurrent import (num_concurrent_events, get_av_uniqueness_from_triple_barrier)
10 | from mlfinlab.util.multiprocess import mp_pandas_obj
11 | 
12 | def _apply_weight_by_return(label_endtime, num_conc_events, close_series, molecule):
13 | """
14 | Advances in Financial Machine Learning, Snippet 4.10, page 69.
15 | 
16 | Determination of Sample Weight by Absolute Return Attribution
17 | 
18 | Derives sample weights based on concurrency and return. Works on a set of
19 | datetime index values (molecule). This allows the program to parallelize the processing.
20 | 
21 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events).
22 | :param num_conc_events: (pd.Series) Number of concurrent labels (output from num_concurrent_events function).
23 | :param close_series: (pd.Series) Close prices.
24 | :param molecule: (an array) A set of datetime index values for processing.
25 | :return: (pd.Series) Sample weights based on return and concurrency for molecule.
26 | """
27 | 
28 | pass
29 | 
30 | 
31 | def get_weights_by_return(triple_barrier_events, close_series, num_threads=5, verbose=True):
32 | """
33 | Advances in Financial Machine Learning, Snippet 4.10 (part 2), page 69.
34 | 
35 | Determination of Sample Weight by Absolute Return Attribution
36 | 
37 | This function is the orchestrator for generating sample weights based on return using mp_pandas_obj.
38 | 
39 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events().
40 | :param close_series: (pd.Series) Close prices.
41 | :param num_threads: (int) The number of threads concurrently used by the function.
42 | :param verbose: (bool) Flag to report progress on asynch jobs.
43 | :return: (pd.Series) Sample weights based on return and concurrency.
44 | """
45 | 
46 | pass
47 | 
48 | 
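# A sketch of the absolute-return attribution from Snippet 4.10: each label is
# weighted by the absolute sum of log-returns over its lifespan, diluted by
# the number of concurrent labels at each bar (variable names are
# illustrative, not the library's implementation):
import numpy as np
import pandas as pd

def return_weights_sketch(label_endtime: pd.Series, num_conc_events: pd.Series,
                          close_series: pd.Series) -> pd.Series:
    """Weight each label by |sum_t r_t / c_t| over its [start, end] span."""
    log_ret = np.log(close_series).diff()
    weights = pd.Series(index=label_endtime.index, dtype=float)
    for t_in, t_out in label_endtime.items():
        weights.loc[t_in] = (log_ret.loc[t_in:t_out] / num_conc_events.loc[t_in:t_out]).sum()
    return weights.abs()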
 49 | def get_weights_by_time_decay(triple_barrier_events, close_series, num_threads=5, decay=1, verbose=True):
 50 | """
 51 | Advances in Financial Machine Learning, Snippet 4.11, page 70.
 52 | 
 53 | Implementation of Time Decay Factors.
 54 | 
 55 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events().
 56 | :param close_series: (pd.Series) Close prices.
 57 | :param num_threads: (int) The number of threads concurrently used by the function.
 58 | :param decay: (float) Decay factor
 59 | - decay = 1 means there is no time decay;
 60 | - 0 < decay < 1 means that weights decay linearly over time, but every observation still receives a strictly positive weight, regardless of how old;
 61 | - decay = 0 means that weights converge linearly to zero, as they become older;
 62 | - decay < 0 means that the oldest portion c of the observations receive zero weight (i.e. they are erased from memory).
 63 | :param verbose: (bool) Flag to report progress on asynch jobs.
 64 | :return: (pd.Series) Sample weights based on time decay factors.
 65 | """
 66 | 
 67 | pass
 68 | 
 69 | 
 70 | def get_stacked_weights_by_return(triple_barrier_events_dict: dict, close_series_dict: dict, num_threads: int = 5,
 71 | verbose: bool = True) -> dict:
 72 | """
 73 | Get return based sample weights for a multi-asset dataset. The function applies mlfinlab's get_weights_by_return
 74 | function to a multi-asset dataset.
 75 | 
 76 | :param triple_barrier_events_dict: (dict) Dictionary of asset_name: triple barrier event series.
 77 | :param close_series_dict: (dict) Dictionary of asset_name: close series used to form label events.
 78 | :param num_threads: (int) Number of threads used to get sample weights.
 79 | :param verbose: (bool) Flag to report progress on asynch jobs.
 80 | :return: (dict) Dictionary of asset_name: sample weight series.
 81 | """
 82 | 
 83 | pass
 84 | 
 85 | 
 86 | def get_stacked_weights_time_decay(triple_barrier_events_dict: dict, close_series_dict: dict, decay: float = 0.5,
 87 | num_threads: int = 5,
 88 | verbose: bool = True) -> dict:
 89 | """
 90 | Get time decay based sample weights for a multi-asset dataset. The function applies mlfinlab's
 91 | get_weights_by_time_decay function to a multi-asset dataset.
 92 | 
 93 | :param triple_barrier_events_dict: (dict) Dictionary of asset_name: triple barrier event series.
 94 | :param close_series_dict: (dict) Dictionary of asset_name: close series used to form label events.
 95 | :param decay: (float) Decay factor
 96 | - decay = 1 means there is no time decay;
 97 | - 0 < decay < 1 means that weights decay linearly over time, but every observation still receives a strictly positive weight, regardless of how old;
 98 | - decay = 0 means that weights converge linearly to zero, as they become older;
 99 | - decay < 0 means that the oldest portion c of the observations receive zero weight (i.e. they are erased from memory).
100 | :param num_threads: (int) Number of threads used to get sample weights.
101 | :param verbose: (bool) Flag to report progress on asynch jobs.
102 | :return: (dict) Dictionary of asset_name: sample weight series.
103 | """
104 | 
105 | pass
106 | 
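# A sketch of the time-decay weighting from Snippet 4.11: a piecewise-linear
# decay applied to the cumulative sum of average uniqueness, so the newest
# observation gets weight 1 (illustrative, not the library implementation):
import pandas as pd

def time_decay_sketch(av_uniqueness: pd.Series, decay: float = 1.0) -> pd.Series:
    """Map cumulative uniqueness to linearly decayed weights."""
    cum_uniqueness = av_uniqueness.sort_index().cumsum()
    if decay >= 0:
        slope = (1.0 - decay) / cum_uniqueness.iloc[-1]
    else:
        slope = 1.0 / ((decay + 1) * cum_uniqueness.iloc[-1])
    const = 1.0 - slope * cum_uniqueness.iloc[-1]
    weights = const + slope * cum_uniqueness
    weights[weights < 0] = 0  # observations older than the cutoff are erased
    return weights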
-------------------------------------------------------------------------------- /mlfinlab/sampling/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Contains the logic regarding the sequential bootstrapping from chapter 4, as well as the concurrent labels.
3 | """
4 | 
5 | from mlfinlab.sampling.bootstrapping import (get_ind_matrix, get_ind_mat_average_uniqueness, seq_bootstrap,
6 | get_ind_mat_label_uniqueness)
7 | from mlfinlab.sampling.concurrent import (num_concurrent_events, _get_average_uniqueness,
8 | get_av_uniqueness_from_triple_barrier)
9 | 
-------------------------------------------------------------------------------- /mlfinlab/sampling/bootstrapping.py: --------------------------------------------------------------------------------
 1 | """
 2 | Logic regarding sequential bootstrapping from chapter 4.
 3 | """
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | from numba import jit, prange
 8 | 
 9 | 
10 | def get_ind_matrix(samples_info_sets, price_bars):
11 | """
12 | Advances in Financial Machine Learning, Snippet 4.3, page 65.
13 | 
14 | Build an Indicator Matrix
15 | 
16 | Get indicator matrix. The book implementation uses bar_index as input, however there is no explanation
17 | how to form it. We decided that using triple_barrier_events and price bars by analogy with concurrency
18 | is the best option.
19 | 
20 | :param samples_info_sets: (pd.Series) Triple barrier events (t1) from labeling.get_events
21 | :param price_bars: (pd.DataFrame) Price bars which were used to form triple barrier events
22 | :return: (np.array) Indicator binary matrix indicating what (price) bars influence the label for each observation
23 | """
24 | 
25 | pass
26 | 
27 | 
28 | def get_ind_mat_average_uniqueness(ind_mat):
29 | """
30 | Advances in Financial Machine Learning, Snippet 4.4, page 65.
31 | 
32 | Compute Average Uniqueness
33 | 
34 | Average uniqueness from indicator matrix
35 | 
36 | :param ind_mat: (np.matrix) Indicator binary matrix
37 | :return: (float) Average uniqueness
38 | """
39 | 
40 | pass
41 | 
42 | 
43 | def get_ind_mat_label_uniqueness(ind_mat):
44 | """
45 | Advances in Financial Machine Learning, an adaptation of Snippet 4.4, page 65.
46 | 
47 | Returns the indicator matrix element uniqueness.
48 | 
49 | :param ind_mat: (np.matrix) Indicator binary matrix
50 | :return: (np.matrix) Element uniqueness
51 | """
52 | 
53 | pass
54 | 
55 | 
56 | @jit(parallel=True, nopython=True)
57 | def _bootstrap_loop_run(ind_mat, prev_concurrency): # pragma: no cover
58 | """
59 | Part of the Sequential Bootstrapping for-loop. Using the previously accumulated concurrency array, loops through
60 | all samples and generates the average uniqueness array of labels based on previously accumulated concurrency.
61 | 
62 | :param ind_mat: (np.array) Indicator matrix from get_ind_matrix function
63 | :param prev_concurrency: (np.array) Accumulated concurrency from previous iterations of sequential bootstrapping
64 | :return: (np.array) Label average uniqueness based on prev_concurrency
65 | """
66 | 
67 | pass
68 | 
69 | 
70 | def seq_bootstrap(ind_mat, sample_length=None, warmup_samples=None, compare=False, verbose=False,
71 | random_state=np.random.RandomState()):
72 | """
73 | Advances in Financial Machine Learning, Snippet 4.5, Snippet 4.6, page 65.
74 | 
75 | Return Sample from Sequential Bootstrap
76 | 
77 | Generate a sample via sequential bootstrap.
78 | Note: Moved from pd.DataFrame to np.matrix for performance increase.
79 | 
80 | :param ind_mat: (pd.DataFrame) Indicator matrix from triple barrier events
81 | :param sample_length: (int) Length of bootstrapped sample
82 | :param warmup_samples: (list) List of previously drawn samples
83 | :param compare: (boolean) Flag to print standard bootstrap uniqueness vs sequential bootstrap uniqueness
84 | :param verbose: (boolean) Flag to print updated probabilities on each step
85 | :param random_state: (np.random.RandomState) Random state
86 | :return: (array) Bootstrapped sample indexes
87 | """
88 | 
89 | pass
90 | 
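# A sketch of average uniqueness from an indicator matrix (Snippet 4.4):
# each bar's concurrency dilutes every label active at that bar (illustrative):
import numpy as np

def average_uniqueness_sketch(ind_mat: np.ndarray) -> float:
    """ind_mat is (num_bars x num_labels), 1 where a label spans a bar."""
    concurrency = ind_mat.sum(axis=1)               # labels active per bar
    active = concurrency > 0
    uniqueness = ind_mat[active] / concurrency[active, None]
    return float(uniqueness[ind_mat[active] == 1].mean())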
-------------------------------------------------------------------------------- /mlfinlab/sampling/concurrent.py: --------------------------------------------------------------------------------
 1 | """
 2 | Logic regarding concurrent labels from chapter 4.
 3 | """
 4 | 
 5 | import pandas as pd
 6 | 
 7 | from mlfinlab.util.multiprocess import mp_pandas_obj
 8 | 
 9 | 
10 | def num_concurrent_events(close_series_index, label_endtime, molecule):
11 | """
12 | Advances in Financial Machine Learning, Snippet 4.1, page 60.
13 | 
14 | Estimating the Uniqueness of a Label
15 | 
16 | This function uses close series prices and label endtime (when the first barrier is touched) to compute the number
17 | of concurrent events per bar.
18 | 
19 | :param close_series_index: (pd.Series) Close prices index
20 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events)
21 | :param molecule: (an array) A set of datetime index values for processing
22 | :return: (pd.Series) Number of concurrent labels for each datetime index
23 | """
24 | 
25 | pass
26 | 
27 | 
28 | def _get_average_uniqueness(label_endtime, num_conc_events, molecule):
29 | """
30 | Advances in Financial Machine Learning, Snippet 4.2, page 62.
31 | 
32 | Estimating the Average Uniqueness of a Label
33 | 
34 | This function uses the label endtime (when the first barrier is touched) and the number of concurrent events per
35 | bar to compute the average uniqueness over each event's lifespan.
36 | 
37 | :param label_endtime: (pd.Series) Label endtime series (t1 for triple barrier events)
38 | :param num_conc_events: (pd.Series) Number of concurrent labels (output from num_concurrent_events function).
39 | :param molecule: (an array) A set of datetime index values for processing.
40 | :return: (pd.Series) Average uniqueness over event's lifespan.
41 | """
42 | 
43 | pass
44 | 
45 | 
46 | def get_av_uniqueness_from_triple_barrier(triple_barrier_events, close_series, num_threads, verbose=True):
47 | """
48 | This function is the orchestrator to derive average sample uniqueness from a dataset labeled by the triple barrier
49 | method.
50 | 
51 | :param triple_barrier_events: (pd.DataFrame) Events from labeling.get_events()
52 | :param close_series: (pd.Series) Close prices.
53 | :param num_threads: (int) The number of threads concurrently used by the function.
54 | :param verbose: (bool) Flag to report progress on asynch jobs
55 | :return: (pd.Series) Average uniqueness over event's lifespan for each index in triple_barrier_events
56 | """
57 | 
58 | pass
59 | 
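# A sketch of the concurrency count from Snippet 4.1: for every label span
# [t_in, t_out], increment each bar the label touches (illustrative):
import pandas as pd

def concurrency_sketch(close_series_index: pd.DatetimeIndex, label_endtime: pd.Series) -> pd.Series:
    """Number of labels active at each bar of the close series."""
    count = pd.Series(0, index=close_series_index)
    for t_in, t_out in label_endtime.items():
        count.loc[t_in:t_out] += 1
    return count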
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/__init__.py: --------------------------------------------------------------------------------
1 | """
2 | Structural breaks tests (CUSUM, Chow, SADF).
3 | """
4 | 
5 | from mlfinlab.structural_breaks.chow import get_chow_type_stat
6 | from mlfinlab.structural_breaks.cusum import get_chu_stinchcombe_white_statistics
7 | from mlfinlab.structural_breaks.sadf import get_sadf
8 | 
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/chow.py: --------------------------------------------------------------------------------
 1 | """
 2 | Explosiveness tests: Chow-Type Dickey-Fuller Test
 3 | """
 4 | 
 5 | import pandas as pd
 6 | from mlfinlab.structural_breaks.sadf import get_betas
 7 | from mlfinlab.util import mp_pandas_obj
 8 | 
 9 | 
10 | # pylint: disable=invalid-name
11 | 
12 | def _get_dfc_for_t(series: pd.Series, molecule: list) -> pd.Series:
13 | """
14 | Get Chow-Type Dickey-Fuller Test statistics for each index in molecule
15 | 
16 | :param series: (pd.Series) Series to test
17 | :param molecule: (list) Dates to test
18 | :return: (pd.Series) Statistics for each index from molecule
19 | """
20 | 
21 | pass
22 | 
23 | 
24 | def get_chow_type_stat(series: pd.Series, min_length: int = 20, num_threads: int = 8, verbose: bool = True) -> pd.Series:
25 | """
26 | Multithread implementation of the Chow-Type Dickey-Fuller Test, p. 251-252.
27 | 
28 | :param series: (pd.Series) Series to test
29 | :param min_length: (int) Minimum sample length used to estimate statistics
30 | :param num_threads: (int) Number of cores to use
31 | :param verbose: (bool) Flag to report progress on asynch jobs
32 | :return: (pd.Series) Chow-Type Dickey-Fuller Test statistics
33 | """
34 | 
35 | pass
36 | 
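# A sketch of the Chow-type DF statistic at a candidate break date tau
# (AFML p. 251-252): regress the series' first difference on its lagged level
# switched on after tau, and return the t-value of the break coefficient
# (illustrative OLS; not the library's _get_dfc_for_t):
import numpy as np
import pandas as pd

def chow_dfc_sketch(series: pd.Series, tau) -> float:
    """t-value of delta in: diff(y)_t = delta * y_{t-1} * 1{t > tau} + eps_t."""
    y = series.diff().dropna()
    lagged = series.shift(1).loc[y.index]
    dummy = (y.index > tau).astype(float)
    X = (lagged * dummy).values.reshape(-1, 1)
    beta = np.linalg.lstsq(X, y.values, rcond=None)[0]
    resid = y.values - X @ beta
    beta_var = resid.var() * np.linalg.inv(X.T @ X)
    return float(beta[0] / np.sqrt(beta_var[0, 0]))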
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/cusum.py: --------------------------------------------------------------------------------
 1 | """
 2 | Implementation of the Chu-Stinchcombe-White test
 3 | """
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | from mlfinlab.util import mp_pandas_obj
 8 | 
 9 | 
10 | def _get_values_diff(test_type, series, index, ind):
11 | """
12 | Gets the difference between two values given a test type.
13 | 
14 | :param test_type: (str) Type of the test ['one_sided', 'two_sided']
15 | :param series: (pd.Series) Series of values
16 | :param index: (pd.Index) Primary index
17 | :param ind: (pd.Index) Secondary index
18 | :return: (float) Difference between two values
19 | """
20 | 
21 | pass
22 | 
23 | 
24 | def _get_s_n_for_t(series: pd.Series, test_type: str, molecule: list) -> pd.Series:
25 | """
26 | Get the maximum S_n_t value for each value from molecule for the Chu-Stinchcombe-White test
27 | 
28 | :param series: (pd.Series) Series to get statistics for
29 | :param test_type: (str) Two-sided or one-sided test
30 | :param molecule: (list) Indices to get test statistics for
31 | :return: (pd.Series) Statistics
32 | """
33 | 
34 | pass
35 | 
36 | 
37 | def get_chu_stinchcombe_white_statistics(series: pd.Series, test_type: str = 'one_sided',
38 | num_threads: int = 8, verbose: bool = True) -> pd.Series:
39 | """
40 | Multithread Chu-Stinchcombe-White test implementation, p. 251.
41 | 
42 | :param series: (pd.Series) Series to get statistics for
43 | :param test_type: (str) Two-sided or one-sided test
44 | :param num_threads: (int) Number of cores
45 | :param verbose: (bool) Flag to report progress on asynch jobs
46 | :return: (pd.Series) Statistics
47 | """
48 | 
49 | pass
50 | 
-------------------------------------------------------------------------------- /mlfinlab/structural_breaks/sadf.py: --------------------------------------------------------------------------------
 1 | """
 2 | Explosiveness tests: SADF
 3 | """
 4 | 
 5 | from typing import Union, Tuple
 6 | import pandas as pd
 7 | import numpy as np
 8 | from mlfinlab.util.multiprocess import mp_pandas_obj
 9 | 
10 | 
11 | # pylint: disable=invalid-name
12 | 
13 | def _get_sadf_at_t(X: pd.DataFrame, y: pd.DataFrame, min_length: int, model: str, phi: float) -> float:
14 | """
15 | Advances in Financial Machine Learning, Snippet 17.2, page 258.
16 | 
17 | SADF's Inner Loop (get SADF value at t)
18 | 
19 | :param X: (pd.DataFrame) Lagged values, constants, trend coefficients
20 | :param y: (pd.DataFrame) Y values (either y or y.diff())
21 | :param min_length: (int) Minimum number of samples needed for estimation
22 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
23 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
24 | :return: (float) SADF statistics for y.index[-1]
25 | """
26 | 
27 | pass
28 | 
29 | 
30 | def _get_y_x(series: pd.Series, model: str, lags: Union[int, list],
31 | add_const: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
32 | """
33 | Advances in Financial Machine Learning, Snippet 17.2, page 258-259.
34 | 
35 | Preparing The Datasets
36 | 
37 | :param series: (pd.Series) Series to prepare for test statistics generation (for example log prices)
38 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
39 | :param lags: (int or list) Either number of lags to use or array of specified lags
40 | :param add_const: (bool) Flag to add constant
41 | :return: (pd.DataFrame, pd.DataFrame) Prepared y and X for SADF generation
42 | """
43 | 
44 | pass
45 | 
46 | 
47 | def _lag_df(df: pd.DataFrame, lags: Union[int, list]) -> pd.DataFrame:
48 | """
49 | Advances in Financial Machine Learning, Snippet 17.3, page 259.
50 | 
 51 | Apply Lags to DataFrame
 52 | 
 53 | :param df: (pd.DataFrame) DataFrame to apply lags to
 54 | :param lags: (int or list) Either number of lags to use or array of specified lags
 55 | :return: (pd.DataFrame) Dataframe with lags
 56 | """
 57 | 
 58 | pass
 59 | 
 60 | 
 61 | def get_betas(X: pd.DataFrame, y: pd.DataFrame) -> Tuple[np.array, np.array]:
 62 | """
 63 | Advances in Financial Machine Learning, Snippet 17.4, page 259.
 64 | 
 65 | Fitting The ADF Specification (get beta estimate and estimate variance)
 66 | 
 67 | :param X: (pd.DataFrame) Features (factors)
 68 | :param y: (pd.DataFrame) Outcomes
 69 | :return: (np.array, np.array) Betas and variances of estimates
 70 | """
 71 | 
 72 | pass
 73 | 
 74 | 
 75 | def _sadf_outer_loop(X: pd.DataFrame, y: pd.DataFrame, min_length: int, model: str, phi: float,
 76 | molecule: list) -> pd.Series:
 77 | """
 78 | This function gets SADF for t times from molecule
 79 | 
 80 | :param X: (pd.DataFrame) Features (factors)
 81 | :param y: (pd.DataFrame) Outcomes
 82 | :param min_length: (int) Minimum number of observations
 83 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
 84 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
 85 | :param molecule: (list) Indices to get SADF
 86 | :return: (pd.Series) SADF statistics
 87 | """
 88 | 
 89 | pass
 90 | 
 91 | def get_sadf(series: pd.Series, model: str, lags: Union[int, list], min_length: int, add_const: bool = False,
 92 | phi: float = 0, num_threads: int = 8, verbose: bool = True) -> pd.Series:
 93 | """
 94 | Advances in Financial Machine Learning, p. 258-259.
 95 | 
 96 | Multithread implementation of SADF
 97 | 
 98 | SADF fits the ADF regression at each end point t with backwards expanding start points. For the estimation
 99 | of SADF(t), the right side of the window is fixed at t. SADF recursively expands the beginning of the sample
100 | up to t - min_length, and returns the sup of this set.
101 | 
102 | When performing a sub- or super-martingale test, the variance of beta of a weak long-run bubble may be smaller than
103 | that of a strong short-run bubble, hence biasing the method towards long-run bubbles. To correct for this bias,
104 | the ADF statistic in samples with large lengths can be penalized with the coefficient phi in [0, 1] such that:
105 | 
106 | ADF_penalized = ADF / (sample_length ^ phi)
107 | 
108 | :param series: (pd.Series) Series for which SADF statistics are generated
109 | :param model: (str) Either 'linear', 'quadratic', 'sm_poly_1', 'sm_poly_2', 'sm_exp', 'sm_power'
110 | :param lags: (int or list) Either number of lags to use or array of specified lags
111 | :param min_length: (int) Minimum number of observations needed for estimation
112 | :param add_const: (bool) Flag to add constant
113 | :param phi: (float) Coefficient to penalize large sample lengths when computing SMT, in [0, 1]
114 | :param num_threads: (int) Number of cores to use
115 | :param verbose: (bool) Flag to report progress on asynch jobs
116 | :return: (pd.Series) SADF statistics
117 | """
118 | 
119 | pass
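# A sketch of the OLS fit behind get_betas (Snippet 17.4): the beta estimate
# and its covariance from the normal equations (illustrative):
import numpy as np

def get_betas_sketch(X: np.ndarray, y: np.ndarray):
    """Return (b_mean, b_var) for y = X @ beta + eps."""
    xx_inv = np.linalg.inv(X.T @ X)
    b_mean = xx_inv @ X.T @ y
    err = y - X @ b_mean
    b_var = (err.T @ err) / (X.shape[0] - X.shape[1]) * xx_inv
    return b_mean, b_var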
-------------------------------------------------------------------------------- /mlfinlab/util/__init__.py: --------------------------------------------------------------------------------
 1 | """
 2 | Utility functions. In particular, Chapter 20 code on Multiprocessing and Vectorization.
 3 | """
 4 | 
 5 | from mlfinlab.util.fast_ewma import ewma
 6 | from mlfinlab.util.multiprocess import (expand_call, lin_parts, mp_pandas_obj, nested_parts,
 7 | process_jobs, process_jobs_, report_progress)
 8 | from mlfinlab.util.volatility import (get_daily_vol, get_garman_class_vol, get_yang_zhang_vol, get_parksinson_vol)
 9 | from mlfinlab.util.volume_classifier import get_bvc_buy_volume
10 | from mlfinlab.util.generate_dataset import get_classification_data
11 | 
-------------------------------------------------------------------------------- /mlfinlab/util/fast_ewma.py: --------------------------------------------------------------------------------
 1 | """
 2 | This module contains an implementation of an exponentially weighted moving average based on sample size.
 3 | The inspiration and context for this code came from a blog post written by Maksim Ivanov:
 4 | https://towardsdatascience.com/financial-machine-learning-part-0-bars-745897d4e4ba
 5 | """
 6 | 
 7 | # Imports
 8 | import numpy as np
 9 | from numba import jit
10 | from numba import float64
11 | from numba import int64
12 | 
13 | 
14 | @jit((float64[:], int64), nopython=False, nogil=True)
15 | def ewma(arr_in, window): # pragma: no cover
16 | """
17 | Exponentially weighted moving average specified by a decay ``window`` to provide better adjustments for
18 | small windows via:
19 | y[t] = (x[t] + (1-a)*x[t-1] + (1-a)^2*x[t-2] + ... + (1-a)^n*x[t-n]) /
20 | (1 + (1-a) + (1-a)^2 + ... + (1-a)^n).
21 | 
22 | :param arr_in: (np.ndarray), (float64) A single dimensional numpy array
23 | :param window: (int64) The decay window, or 'span'
24 | :return: (np.ndarray) The EWMA vector, same length / shape as ``arr_in``
25 | """
26 | 
27 | pass
28 | 
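# A sketch of the span-based EWMA formula quoted in the ewma docstring above,
# with the normalising weight accumulated incrementally (plain NumPy rather
# than the numba-compiled version; illustrative):
import numpy as np

def ewma_sketch(arr_in: np.ndarray, window: int) -> np.ndarray:
    alpha = 2.0 / (window + 1)  # span parameterisation
    out = np.empty_like(arr_in, dtype=np.float64)
    weight_sum = 1.0
    ewma_old = arr_in[0]
    out[0] = ewma_old
    for i in range(1, arr_in.shape[0]):
        weight_sum += (1 - alpha) ** i                  # denominator of the quoted formula
        ewma_old = ewma_old * (1 - alpha) + arr_in[i]   # unnormalised numerator
        out[i] = ewma_old / weight_sum
    return out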
-------------------------------------------------------------------------------- /mlfinlab/util/generate_dataset.py: --------------------------------------------------------------------------------
 1 | """
 2 | This module generates a synthetic classification dataset of INFORMED, REDUNDANT, and NOISE explanatory
 3 | variables based on the book Machine Learning for Asset Managers (code snippet 6.1)
 4 | """
 5 | import numpy as np
 6 | import pandas as pd
 7 | from sklearn.datasets import make_classification
 8 | 
 9 | # pylint: disable=invalid-name
10 | def get_classification_data(n_features=100, n_informative=25, n_redundant=25, n_samples=10000, random_state=0, sigma=.0):
11 | """
12 | A function to generate synthetic classification datasets
13 | 
14 | :param n_features: (int) Total number of features to be generated (i.e. informative + redundant + noisy).
15 | :param n_informative: (int) Number of informative features.
16 | :param n_redundant: (int) Number of redundant features.
17 | :param n_samples: (int) Number of samples (rows) to be generated.
18 | :param random_state: (int) Random seed.
19 | :param sigma: (float) This argument is used to introduce substitution effect to the redundant features in
20 | the dataset by adding gaussian noise. The lower the value of sigma, the greater the
21 | substitution effect.
22 | :return: (pd.DataFrame, pd.Series) X and y as features and labels respectively.
23 | """
24 | 
25 | pass
26 | 
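# A sketch of the construction snippet 6.1 describes: draw informative features
# with make_classification, then append redundant copies of randomly chosen
# informative columns plus gaussian noise scaled by sigma (illustrative, not
# the library's exact code):
import numpy as np
from sklearn.datasets import make_classification

def classification_data_sketch(n_features=100, n_informative=25, n_redundant=25,
                               n_samples=10000, random_state=0, sigma=0.0):
    rng = np.random.RandomState(random_state)
    X, y = make_classification(n_samples=n_samples, n_features=n_features - n_redundant,
                               n_informative=n_informative, n_redundant=0,
                               shuffle=False, random_state=random_state)
    informative_idx = rng.choice(range(n_informative), size=n_redundant)
    redundant = X[:, informative_idx] + rng.normal(size=(n_samples, n_redundant)) * sigma
    return np.hstack([X, redundant]), y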
-------------------------------------------------------------------------------- /mlfinlab/util/misc.py: --------------------------------------------------------------------------------
 1 | """
 2 | Various useful functions
 3 | """
 4 | 
 5 | import pandas as pd
 6 | import numpy as np
 7 | 
 8 | def crop_data_frame_in_batches(df: pd.DataFrame, chunksize: int):
 9 | # pylint: disable=invalid-name
10 | """
11 | Splits df into chunks of chunksize
12 | 
13 | :param df: (pd.DataFrame) Dataframe to split
14 | :param chunksize: (int) Number of rows in chunk
15 | :return: (list) Chunks (pd.DataFrames)
16 | """
17 | 
18 | pass
19 | 
-------------------------------------------------------------------------------- /mlfinlab/util/volatility.py: --------------------------------------------------------------------------------
 1 | """
 2 | Various volatility estimators
 3 | """
 4 | import pandas as pd
 5 | import numpy as np
 6 | 
 7 | 
 8 | # pylint: disable=redefined-builtin
 9 | 
10 | def get_daily_vol(close, lookback=100):
11 | """
12 | Advances in Financial Machine Learning, Snippet 3.1, page 44.
13 | 
14 | Daily Volatility Estimates
15 | 
16 | Computes the daily volatility at intraday estimation points.
17 | 
18 | In practice we want to set profit taking and stop-loss limits that are a function of the risks involved
19 | in a bet. Otherwise, sometimes we will be aiming too high (tau ≫ sigma_t_i,0), and sometimes too low
20 | (tau ≪ sigma_t_i,0), considering the prevailing volatility. Snippet 3.1 computes the daily volatility
21 | at intraday estimation points, applying a span of lookback days to an exponentially weighted moving
22 | standard deviation.
23 | 
24 | See the pandas documentation for details on the pandas.Series.ewm function.
25 | Note: This function is used to compute dynamic thresholds for profit taking and stop loss limits.
26 | 
27 | :param close: (pd.Series) Closing prices
28 | :param lookback: (int) Lookback period to compute volatility
29 | :return: (pd.Series) Daily volatility value
30 | """
31 | 
32 | pass
33 | 
34 | 
35 | def get_parksinson_vol(high: pd.Series, low: pd.Series, window: int = 20) -> pd.Series:
36 | """
37 | Parkinson volatility estimator
38 | 
39 | :param high: (pd.Series) High prices
40 | :param low: (pd.Series) Low prices
41 | :param window: (int) Window used for estimation
42 | :return: (pd.Series) Parkinson volatility
43 | """
44 | 
45 | pass
46 | 
47 | 
48 | def get_garman_class_vol(open: pd.Series, high: pd.Series, low: pd.Series, close: pd.Series,
49 | window: int = 20) -> pd.Series:
50 | """
51 | Garman-Klass volatility estimator
52 | 
53 | :param open: (pd.Series) Open prices
54 | :param high: (pd.Series) High prices
55 | :param low: (pd.Series) Low prices
56 | :param close: (pd.Series) Close prices
57 | :param window: (int) Window used for estimation
58 | :return: (pd.Series) Garman-Klass volatility
59 | """
60 | 
61 | pass
62 | 
63 | 
64 | def get_yang_zhang_vol(open: pd.Series, high: pd.Series, low: pd.Series, close: pd.Series,
65 | window: int = 20) -> pd.Series:
66 | """
67 | 
68 | Yang-Zhang volatility estimator
69 | 
70 | :param open: (pd.Series) Open prices
71 | :param high: (pd.Series) High prices
72 | :param low: (pd.Series) Low prices
73 | :param close: (pd.Series) Close prices
74 | :param window: (int) Window used for estimation
75 | :return: (pd.Series) Yang-Zhang volatility
76 | """
77 | 
78 | pass
79 | 
-------------------------------------------------------------------------------- /mlfinlab/util/volume_classifier.py: --------------------------------------------------------------------------------
 1 | """
 2 | Volume classification methods (BVC and tick rule)
 3 | """
 4 | 
 5 | from scipy.stats import norm
 6 | import pandas as pd
 7 | 
 8 | 
 9 | def get_bvc_buy_volume(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series:
10 | """
11 | Calculates the BVC buy volume
12 | 
13 | :param close: (pd.Series) Close prices
14 | :param volume: (pd.Series) Bar volumes
15 | :param window: (int) Window for std estimation used in BVC calculation
16 | :return: (pd.Series) BVC buy volume
17 | """
18 | # .apply(norm.cdf) is used to omit Warning for norm.cdf(pd.Series with NaNs)
19 | 
20 | pass
21 | 
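# A sketch of the BVC (bulk volume classification) rule the docstring above
# describes: the share of each bar's volume classified as buy-initiated is the
# normal CDF of the standardised price change (illustrative):
import pandas as pd
from scipy.stats import norm

def bvc_buy_volume_sketch(close: pd.Series, volume: pd.Series, window: int = 20) -> pd.Series:
    price_change = close.diff()
    standardised = price_change / price_change.rolling(window=window).std()
    return volume * standardised.apply(norm.cdf)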
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
 1 | # Production
 2 | numpy>=0.16.0
 3 | matplotlib>=3.0.0
 4 | pandas>=1.0.0
 5 | scikit-learn>=0.20.0
 6 | scipy>=1.2.0
 7 | statsmodels>=0.9.0
 8 | cython>=0.29
 9 | POT>=0.7.0
10 | numba>=0.40.0
11 | networkx>=2.2, <2.6
12 | dash>=1.0.0
13 | dash-cytoscape>=0.1.0
14 | dash-bootstrap-components>=0.10.0
15 | jupyter-dash>=0.2.0
16 | tensorflow>=2.0.0
17 | joblib>=1.0.0
18 | decorator>=4.0.0, <5.0.0
19 | analytics-python>=1.2.7
20 | getmac>=0.8.0
21 | 
22 | 
23 | # Develop
24 | codecov==2.1.11
25 | coverage==5.4
26 | pylint==2.6.0
27 | sphinx==3.4.3 # Docs
28 | hudsonthames-sphinx-theme==0.1.5 # Docs
29 | sphinx-rtd-theme==0.5.2 # Docs
30 | releases==1.6.3 # Docs
31 | 
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = mlfinlab
 3 | version = 1.3.0
 4 | author = Hudson and Thames Quantitative Research
 5 | author_email = research@hudsonthames.org
 6 | license = All Rights Reserved
 7 | license_file = LICENSE.txt
 8 | description = MlFinlab helps portfolio managers and traders who want to leverage the power of machine learning by providing reproducible, interpretable, and easy to use tools.
 9 | long_description = file: README.md
10 | long_description_content_type = text/markdown
11 | platform = any
12 | url = https://www.hudsonthames.org/
13 | project_urls =
14 | Documentation = https://mlfinlab.readthedocs.io/en/latest/
15 | Bug Reports = https://github.com/hudson-and-thames/mlfinlab/issues
16 | Project Boards = https://github.com/orgs/hudson-and-thames/projects
17 | Source = https://github.com/hudson-and-thames/mlfinlab
18 | Blog = https://hudsonthames.org/blog/
19 | Apprenticeship Program = https://hudsonthames.org/apprenticeship-program/
20 | classifiers =
21 | Development Status :: 5 - Production/Stable
22 | Intended Audience :: Developers
23 | Intended Audience :: Education
24 | Intended Audience :: Science/Research
25 | Intended Audience :: Financial and Insurance Industry
26 | License :: Other/Proprietary License
27 | Operating System :: OS Independent
28 | Programming Language :: Python
29 | Programming Language :: Python :: 3.6
30 | Programming Language :: Python :: 3.7
31 | Programming Language :: Python :: 3.8
32 | Topic :: Scientific/Engineering
33 | Topic :: Scientific/Engineering :: Artificial Intelligence
34 | Topic :: Office/Business :: Financial :: Investment
35 | keywords =
36 | machinelearning
37 | finance
38 | investment
39 | education
40 | 
41 | [options]
42 | include_package_data = True
43 | packages = find:
44 | python_requires =
45 | >=3.6, <3.9
46 | setup_requires =
47 | setuptools
48 | cython
49 | install_requires =
50 | numpy>=0.16.0
51 | matplotlib>=3.0.0
52 | pandas>=1.0.0
53 | scikit-learn>=0.20.0
54 | scipy>=1.2.0
55 | statsmodels>=0.9.0
56 | cython>=0.29
57 | POT>=0.7.0
58 | numba>=0.40.0
59 | networkx>=2.2, <2.6
60 | dash>=1.0.0
61 | dash-cytoscape>=0.1.0
62 | dash-bootstrap-components>=0.10.0
63 | jupyter-dash>=0.2.0
64 | tensorflow>=2.0.0
65 | joblib>=1.0.0
66 | decorator>=4.0.0, <5.0.0
67 | analytics-python>=1.2.7
68 | getmac>=0.8.0
69 | 
70 | 
71 | [options.packages.find]
72 | package_dir =
73 | mlfinlab
74 | exclude =
75 | contrib
76 | docs
77 | tests
78 | 
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
 1 | # Always prefer setuptools over distutils
 2 | from setuptools import setup
 3 | 
 4 | setup()
 5 | 
 6 | # Create package
 7 | # python setup.py bdist_wheel
 8 | # python3 -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* (This is the test repo)
 9 | # twine upload dist/* (This is the official repo)
10 | 
--------------------------------------------------------------------------------