├── .gitattributes ├── .gitignore ├── README.md ├── conf ├── README.md ├── base │ ├── catalog.yml │ ├── catalog_neptune.yml │ ├── logging.yml │ ├── neptune.yml │ ├── parameters.yml │ └── parameters │ │ ├── data_engineering.yml │ │ ├── data_science.yml │ │ └── model_evaluation.yml └── local │ └── .gitkeep ├── data ├── 03_primary │ └── .gitkeep ├── 04_feature │ └── .gitkeep ├── 05_model_input │ └── .gitkeep ├── 06_models │ └── .gitkeep ├── 07_model_output │ └── .gitkeep └── 08_reporting │ └── .gitkeep ├── docs ├── images │ ├── 01_DS_Pipeline_Overview.png │ └── 05_Anomaly_Detection_Pipeline_Blueprint.png └── source │ ├── conf.py │ └── index.rst ├── logs ├── .gitkeep └── journals │ └── .gitkeep ├── notebooks └── .gitkeep ├── pyproject.toml ├── setup.cfg └── src ├── anomaly_detection_pipeline_kedro ├── __init__.py ├── __main__.py ├── hooks.py ├── pipeline_registry.py ├── pipelines │ ├── __init__.py │ ├── data_engineering │ │ ├── README.md │ │ ├── __init__.py │ │ ├── nodes.py │ │ └── pipeline.py │ ├── data_science │ │ ├── README.md │ │ ├── __init__.py │ │ ├── nodes.py │ │ └── pipeline.py │ └── model_evaluation │ │ ├── README.md │ │ ├── __init__.py │ │ ├── nodes.py │ │ └── pipeline.py └── settings.py ├── requirements.in ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── pipelines ├── __init__.py ├── data_engineering │ ├── __init__.py │ └── test_pipeline.py ├── data_science │ ├── __init__.py │ └── test_pipeline.py └── model_evaluation │ ├── __init__.py │ └── test_pipeline.py └── test_run.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ########################## 2 | # KEDRO PROJECT 3 | 4 | # ignore all local configuration 5 | conf/local/** 6 | !conf/local/.gitkeep 7 | .telemetry 8 | 9 | # ignore potentially sensitive credentials files 10 | conf/**/*credentials* 11 | 12 | # ignore everything in the following folders 13 | data/** 14 | logs/** 15 | 16 | # except their sub-folders 17 | !data/**/ 18 | !logs/**/ 19 | 20 | # also keep all .gitkeep files 21 | !.gitkeep 22 | 23 | # also keep the example dataset 24 | !data/01_raw/iris.csv 25 | 26 | .ipython 27 | 28 | ########################## 29 | # Common files 30 | 31 | # IntelliJ 32 | .idea/ 33 | *.iml 34 | out/ 35 | .idea_modules/ 36 | 37 | ### macOS 38 | *.DS_Store 39 | .AppleDouble 40 | .LSOverride 41 | .Trashes 42 | 43 | # Vim 44 | *~ 45 | .*.swo 46 | .*.swp 47 | 48 | # emacs 49 | *~ 50 | \#*\# 51 | /.emacs.desktop 52 | /.emacs.desktop.lock 53 | *.elc 54 | 55 | # JIRA plugin 56 | atlassian-ide-plugin.xml 57 | 58 | # C extensions 59 | *.so 60 | 61 | ### Python template 62 | # Byte-compiled / optimized / DLL files 63 | __pycache__/ 64 | *.py[cod] 65 | *$py.class 66 | 67 | # Distribution / packaging 68 | .Python 69 | build/ 70 | develop-eggs/ 71 | dist/ 72 | downloads/ 73 | eggs/ 74 | .eggs/ 75 | lib/ 76 | lib64/ 77 | parts/ 78 | sdist/ 79 | var/ 80 | wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | MANIFEST 85 | 86 | # PyInstaller 87 | # Usually these files are written by a python script from a template 88 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
89 | *.manifest 90 | *.spec 91 | 92 | # Installer logs 93 | pip-log.txt 94 | pip-delete-this-directory.txt 95 | 96 | # Unit test / coverage reports 97 | htmlcov/ 98 | .tox/ 99 | .coverage 100 | .coverage.* 101 | .cache 102 | nosetests.xml 103 | coverage.xml 104 | *.cover 105 | .hypothesis/ 106 | 107 | # Translations 108 | *.mo 109 | *.pot 110 | 111 | # Django stuff: 112 | *.log 113 | .static_storage/ 114 | .media/ 115 | local_settings.py 116 | 117 | # Flask stuff: 118 | instance/ 119 | .webassets-cache 120 | 121 | # Scrapy stuff: 122 | .scrapy 123 | 124 | # Sphinx documentation 125 | docs/_build/ 126 | 127 | # PyBuilder 128 | target/ 129 | 130 | # Jupyter Notebook 131 | .ipynb_checkpoints 132 | 133 | # IPython 134 | .ipython/profile_default/history.sqlite 135 | .ipython/profile_default/startup/README 136 | 137 | # pyenv 138 | .python-version 139 | 140 | # celery beat schedule file 141 | celerybeat-schedule 142 | 143 | # SageMath parsed files 144 | *.sage.py 145 | 146 | # Neptune logs 147 | .neptune 148 | 149 | # Environments 150 | .env 151 | .envrc 152 | .venv 153 | env/ 154 | venv/ 155 | ENV/ 156 | env.bak/ 157 | venv.bak/ 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | 165 | # Ignore media files 166 | media/ 167 | 168 | # Ignore all notebooks 169 | *.ipynb -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building and Managing an Isolation Forest Anomaly Detection Pipeline with Kedro 2 | 3 | ## Overview 4 | An anomaly (fraud) detection pipeline for credit card transaction data, built with the Isolation Forest machine learning model and the Kedro framework 5 | 6 | Link to article: https://neptune.ai/blog/data-science-pipelines-with-kedro 7 | 8 | ## Objective 9 | Develop a data science pipeline to detect anomalous (fraudulent) credit card transactions with the use of: 10 | - **Isolation Forest** machine learning model - For unsupervised anomaly detection 11 | - **Kedro** - An open-source Python framework for creating reproducible, maintainable, and modular data science code. This framework helps to accelerate data pipelining, enhance data science prototyping, and promote pipeline reproducibility. 12 | 13 | ## Motivation 14 | - Explore how unsupervised anomaly detection works, and better understand the concept and implementation of Isolation Forest 15 | - Leverage the Kedro framework to optimally structure data science pipeline projects 16 | 17 | ## Data 18 | The [credit card transaction data](https://github.com/Fraud-Detection-Handbook/simulated-data-transformed) is obtained from a collaboration between Worldline and the Machine Learning Group. It is a realistic simulation of real-world credit card transactions, designed to include complicated fraud detection issues. 19 | 20 | ## General Pipeline Structure 21 | ![Alt text](/docs/images/01_DS_Pipeline_Overview.png?raw=true) 22 | 23 | ## Anomaly Detection Pipeline Structure 24 | ![Alt text](/docs/images/05_Anomaly_Detection_Pipeline_Blueprint.png?raw=true) 25 | 26 | ## Steps 27 | 1. Change to the project directory on the command line - `cd C:/Anomaly-Detection-Pipeline-Kedro` 28 | 2. Activate the Conda virtual environment (create one first if you have not done so) - `conda activate env_kedro` 29 | 3. Execute a pipeline run with `kedro run` (see the note and sketch below this list)
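The pipelines registered in `src/anomaly_detection_pipeline_kedro/pipeline_registry.py` can also be run individually, e.g. `kedro run --pipeline de` for data engineering (likewise `ds` for data science and `me` for model evaluation).

To get a feel for the core modelling step that the `ds` pipeline wraps, here is a minimal standalone sketch of the same scikit-learn API; the two-feature toy data below is made up purely for illustration (the pipeline itself trains on the features listed in `conf/base/parameters.yml`):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(42)
X_train = rng.normal(size=(1000, 2))                     # mostly "normal" transactions
X_test = np.vstack([rng.normal(size=(95, 2)),            # normal points...
                    rng.normal(8.0, 1.0, size=(5, 2))])  # ...plus a few obvious outliers

# Same estimator settings as the pipeline's train_model node
clf = IsolationForest(random_state=42, bootstrap=True, contamination=0.009)
clf.fit(X_train)

preds = clf.predict(X_test)          # -1 = anomaly, 1 = normal
flags = (preds == -1).astype(int)    # map to 1 = fraud, 0 = no fraud, as in the predict node
scores = -clf.score_samples(X_test)  # flip sign so that higher = more anomalous
```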
30 | 31 | Please see the [walkthrough article](https://neptune.ai/blog/data-science-pipelines-with-kedro) for details 32 | -------------------------------------------------------------------------------- /conf/README.md: -------------------------------------------------------------------------------- 1 | # What is this for? 2 | 3 | This folder should be used to store configuration files used by Kedro or by separate tools. 4 | 5 | This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own instructions in the [Instructions](#Instructions) section. 6 | 7 | ## Local configuration 8 | 9 | The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). 10 | 11 | > *Note:* Please do not check in any local configuration to version control. 12 | 13 | ## Base configuration 14 | 15 | The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members. 16 | 17 | WARNING: Please do not put access credentials in the base configuration folder. 18 | 19 | ## Instructions 20 | 21 | 22 | 23 | 24 | 25 | ## Find out more 26 | You can find out more about configuration from the [user guide documentation](https://kedro.readthedocs.io/en/stable/04_user_guide/03_configuration.html). 27 | -------------------------------------------------------------------------------- /conf/base/catalog.yml: -------------------------------------------------------------------------------- 1 | raw_daily_data: 2 | type: PartitionedDataSet 3 | path: data/01_raw # path to the location of partitions 4 | dataset: pandas.CSVDataSet 5 | layer: raw 6 | 7 | 8 | merged_data: 9 | type: pandas.CSVDataSet 10 | filepath: data/02_intermediate/merged_data.csv 11 | layer: intermediate 12 | 13 | 14 | processed_data: 15 | type: pandas.CSVDataSet 16 | filepath: data/03_primary/processed_data.csv 17 | layer: primary 18 | 19 | 20 | train_data: 21 | type: pandas.CSVDataSet 22 | filepath: data/05_model_input/train.csv 23 | layer: model_input 24 | 25 | 26 | test_data: 27 | type: pandas.CSVDataSet 28 | filepath: data/05_model_input/test.csv 29 | layer: model_input 30 | 31 | 32 | test_labels: 33 | type: pandas.CSVDataSet 34 | filepath: data/05_model_input/test_labels.csv 35 | layer: model_input 36 | 37 | 38 | ml_model: 39 | type: pickle.PickleDataSet 40 | filepath: data/06_models/ml_model.pkl 41 | backend: pickle 42 | layer: models 43 | 44 | 45 | predictions: 46 | type: pandas.CSVDataSet 47 | filepath: data/07_model_output/predictions.csv 48 | layer: model_output 49 | 50 | 51 | evaluation_plot: 52 | type: matplotlib.MatplotlibWriter 53 | filepath: data/08_reporting/auc_plots.png 54 | -------------------------------------------------------------------------------- /conf/base/catalog_neptune.yml: -------------------------------------------------------------------------------- 1 | # You can log files to Neptune via NeptuneFileDataSet 2 | # 3 | # example_artifact: 4 | # type: kedro_neptune.NeptuneFileDataSet 5 | # filepath: data/06_models/clf_model.pkl 6 | # 7 | # If you want to log an existing Kedro DataSet to Neptune, add @neptune to the DataSet name 8 | # 9 | # example_iris_data@neptune: 10 | # type: kedro_neptune.NeptuneFileDataSet 11 | # filepath: data/01_raw/iris.csv 12 | # 13 | # You can use 
kedro_neptune.NeptuneFileDataSet in any catalog including conf/base/catalog.yml 14 | # 15 | -------------------------------------------------------------------------------- /conf/base/logging.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | simple: 5 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 6 | json_formatter: 7 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 8 | class: pythonjsonlogger.jsonlogger.JsonFormatter 9 | 10 | handlers: 11 | console: 12 | class: logging.StreamHandler 13 | level: INFO 14 | formatter: simple 15 | stream: ext://sys.stdout 16 | 17 | info_file_handler: 18 | class: logging.handlers.RotatingFileHandler 19 | level: INFO 20 | formatter: simple 21 | filename: logs/info.log 22 | maxBytes: 10485760 # 10MB 23 | backupCount: 20 24 | encoding: utf8 25 | delay: True 26 | 27 | error_file_handler: 28 | class: logging.handlers.RotatingFileHandler 29 | level: ERROR 30 | formatter: simple 31 | filename: logs/errors.log 32 | maxBytes: 10485760 # 10MB 33 | backupCount: 20 34 | encoding: utf8 35 | delay: True 36 | 37 | journal_file_handler: 38 | class: kedro.versioning.journal.JournalFileHandler 39 | level: INFO 40 | base_dir: logs/journals 41 | formatter: json_formatter 42 | 43 | loggers: 44 | anyconfig: 45 | level: WARNING 46 | handlers: [console, info_file_handler, error_file_handler] 47 | propagate: no 48 | 49 | kedro.io: 50 | level: INFO 51 | handlers: [console, info_file_handler, error_file_handler] 52 | propagate: no 53 | 54 | kedro.pipeline: 55 | level: INFO 56 | handlers: [console, info_file_handler, error_file_handler] 57 | propagate: no 58 | 59 | kedro.journal: 60 | level: INFO 61 | handlers: [journal_file_handler] 62 | propagate: no 63 | 64 | root: 65 | level: INFO 66 | handlers: [console, info_file_handler, error_file_handler] 67 | -------------------------------------------------------------------------------- /conf/base/neptune.yml: -------------------------------------------------------------------------------- 1 | neptune: 2 | #GLOBAL CONFIG 3 | project: Anomaly-Detection-Pipeline-Kedro 4 | base_namespace: kedro 5 | enabled: true 6 | 7 | #LOGGING 8 | upload_source_files: 9 | - '**/*.py' 10 | - conf/base/*.yml 11 | -------------------------------------------------------------------------------- /conf/base/parameters.yml: -------------------------------------------------------------------------------- 1 | predictor_cols: 2 | - 'TX_DATE' 3 | - 'TX_AMOUNT' 4 | - 'TX_DURING_WEEKEND' 5 | - 'TX_DURING_NIGHT' 6 | - 'CUSTOMER_ID_NB_TX_1DAY_WINDOW' 7 | - 'CUSTOMER_ID_NB_TX_7DAY_WINDOW' 8 | - 'CUSTOMER_ID_NB_TX_30DAY_WINDOW' 9 | - 'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW' 10 | - 'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW' 11 | - 'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW' 12 | - 'TERMINAL_ID_NB_TX_1DAY_WINDOW' 13 | - 'TERMINAL_ID_NB_TX_7DAY_WINDOW' 14 | - 'TERMINAL_ID_NB_TX_30DAY_WINDOW' 15 | - 'TERMINAL_ID_RISK_1DAY_WINDOW' 16 | - 'TERMINAL_ID_RISK_7DAY_WINDOW' 17 | - 'TERMINAL_ID_RISK_30DAY_WINDOW' 18 | - 'TX_FRAUD' 19 | 20 | contamination_value: 0.009 -------------------------------------------------------------------------------- /conf/base/parameters/data_engineering.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'data_engineering' 2 | # using Kedro 0.17.7. 
3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://kedro.readthedocs.io/en/0.17.7/04_kedro_project_setup/02_configuration.html#parameters 6 | -------------------------------------------------------------------------------- /conf/base/parameters/data_science.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'data_science' 2 | # using Kedro 0.17.7. 3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://kedro.readthedocs.io/en/0.17.7/04_kedro_project_setup/02_configuration.html#parameters 6 | -------------------------------------------------------------------------------- /conf/base/parameters/model_evaluation.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'model_evaluation' 2 | # using Kedro 0.17.7. 3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://kedro.readthedocs.io/en/0.17.7/04_kedro_project_setup/02_configuration.html#parameters 6 | -------------------------------------------------------------------------------- /conf/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/conf/local/.gitkeep -------------------------------------------------------------------------------- /data/03_primary/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/03_primary/.gitkeep -------------------------------------------------------------------------------- /data/04_feature/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/04_feature/.gitkeep -------------------------------------------------------------------------------- /data/05_model_input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/05_model_input/.gitkeep -------------------------------------------------------------------------------- /data/06_models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/06_models/.gitkeep -------------------------------------------------------------------------------- /data/07_model_output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/07_model_output/.gitkeep -------------------------------------------------------------------------------- /data/08_reporting/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/08_reporting/.gitkeep -------------------------------------------------------------------------------- /docs/images/01_DS_Pipeline_Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/docs/images/01_DS_Pipeline_Overview.png -------------------------------------------------------------------------------- /docs/images/05_Anomaly_Detection_Pipeline_Blueprint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/docs/images/05_Anomaly_Detection_Pipeline_Blueprint.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | # anomaly_detection_pipeline_kedro documentation build 5 | # configuration file, created by sphinx-quickstart. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import re 21 | 22 | from kedro.framework.cli.utils import find_stylesheets 23 | from recommonmark.transform import AutoStructify 24 | 25 | from anomaly_detection_pipeline_kedro import __version__ as release 26 | 27 | # -- Project information ----------------------------------------------------- 28 | 29 | project = "anomaly_detection_pipeline_kedro" 30 | author = "Kedro" 31 | 32 | # The short X.Y version. 33 | version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # If your documentation needs a minimal Sphinx version, state it here. 38 | # 39 | # needs_sphinx = '1.0' 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 43 | # ones. 44 | extensions = [ 45 | "sphinx.ext.autodoc", 46 | "sphinx.ext.napoleon", 47 | "sphinx_autodoc_typehints", 48 | "sphinx.ext.doctest", 49 | "sphinx.ext.todo", 50 | "sphinx.ext.coverage", 51 | "sphinx.ext.mathjax", 52 | "sphinx.ext.ifconfig", 53 | "sphinx.ext.viewcode", 54 | "sphinx.ext.mathjax", 55 | "nbsphinx", 56 | "recommonmark", 57 | "sphinx_copybutton", 58 | ] 59 | 60 | # enable autosummary plugin (table of contents for modules/classes/class 61 | # methods) 62 | autosummary_generate = True 63 | 64 | # Add any paths that contain templates here, relative to this directory. 65 | templates_path = ["_templates"] 66 | 67 | # The suffix(es) of source filenames. 68 | # You can specify multiple suffix as a list of string: 69 | # 70 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"} 71 | 72 | # The master toctree document. 
73 | master_doc = "index" 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This pattern also affects html_static_path and html_extra_path . 85 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = "sphinx" 89 | 90 | # -- Options for HTML output ------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | # 95 | html_theme = "sphinx_rtd_theme" 96 | 97 | # Theme options are theme-specific and customize the look and feel of a theme 98 | # further. For a list of options available for each theme, see the 99 | # documentation. 100 | # 101 | html_theme_options = {"collapse_navigation": False, "style_external_links": True} 102 | 103 | # Add any paths that contain custom static files (such as style sheets) here, 104 | # relative to this directory. They are copied after the builtin static files, 105 | # so a file named "default.css" will overwrite the builtin "default.css". 106 | html_static_path = ["_static"] 107 | 108 | # Custom sidebar templates, must be a dictionary that maps document names 109 | # to template names. 110 | # 111 | # The default sidebars (for documents that don't match any pattern) are 112 | # defined by theme itself. Builtin themes are using these templates by 113 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 114 | # 'searchbox.html']``. 115 | # 116 | # html_sidebars = {} 117 | 118 | html_show_sourcelink = False 119 | 120 | # -- Options for HTMLHelp output --------------------------------------------- 121 | 122 | # Output file base name for HTML help builder. 123 | htmlhelp_basename = "anomaly_detection_pipeline_kedrodoc" 124 | 125 | # -- Options for LaTeX output ------------------------------------------------ 126 | 127 | latex_elements = { 128 | # The paper size ('letterpaper' or 'a4paper'). 129 | # 130 | # 'papersize': 'letterpaper', 131 | # 132 | # The font size ('10pt', '11pt' or '12pt'). 133 | # 134 | # 'pointsize': '10pt', 135 | # 136 | # Additional stuff for the LaTeX preamble. 137 | # 138 | # 'preamble': '', 139 | # 140 | # Latex figure (float) alignment 141 | # 142 | # 'figure_align': 'htbp', 143 | } 144 | 145 | # Grouping the document tree into LaTeX files. List of tuples 146 | # (source start file, target name, title, 147 | # author, documentclass [howto, manual, or own class]). 148 | latex_documents = [ 149 | ( 150 | master_doc, 151 | "anomaly_detection_pipeline_kedro.tex", 152 | "anomaly_detection_pipeline_kedro Documentation", 153 | "Kedro", 154 | "manual", 155 | ) 156 | ] 157 | 158 | # -- Options for manual page output ------------------------------------------ 159 | 160 | # One entry per manual page. List of tuples 161 | # (source start file, name, description, authors, manual section). 
162 | man_pages = [ 163 | ( 164 | master_doc, 165 | "anomaly_detection_pipeline_kedro", 166 | "anomaly_detection_pipeline_kedro Documentation", 167 | [author], 168 | 1, 169 | ) 170 | ] 171 | 172 | # -- Options for Texinfo output ---------------------------------------------- 173 | 174 | # Grouping the document tree into Texinfo files. List of tuples 175 | # (source start file, target name, title, author, 176 | # dir menu entry, description, category) 177 | texinfo_documents = [ 178 | ( 179 | master_doc, 180 | "anomaly_detection_pipeline_kedro", 181 | "anomaly_detection_pipeline_kedro Documentation", 182 | author, 183 | "anomaly_detection_pipeline_kedro", 184 | "Project anomaly_detection_pipeline_kedro codebase.", 185 | "Data-Science", 186 | ) 187 | ] 188 | 189 | # -- Options for todo extension ---------------------------------------------- 190 | 191 | # If true, `todo` and `todoList` produce output, else they produce nothing. 192 | todo_include_todos = False 193 | 194 | # -- Extension configuration ------------------------------------------------- 195 | 196 | # nbsphinx_prolog = """ 197 | # see here for prolog/epilog details: 198 | # https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html 199 | # """ 200 | 201 | # -- NBconvert kernel config ------------------------------------------------- 202 | nbsphinx_kernel_name = "python3" 203 | 204 | 205 | def remove_arrows_in_examples(lines): 206 | for i, line in enumerate(lines): 207 | lines[i] = line.replace(">>>", "") 208 | 209 | 210 | def autodoc_process_docstring(app, what, name, obj, options, lines): 211 | remove_arrows_in_examples(lines) 212 | 213 | 214 | def skip(app, what, name, obj, skip, options): 215 | if name == "__init__": 216 | return False 217 | return skip 218 | 219 | 220 | def setup(app): 221 | app.connect("autodoc-process-docstring", autodoc_process_docstring) 222 | app.connect("autodoc-skip-member", skip) 223 | # add Kedro stylesheets 224 | for stylesheet in find_stylesheets(): 225 | app.add_css_file(stylesheet) 226 | # enable rendering RST tables in Markdown 227 | app.add_config_value("recommonmark_config", {"enable_eval_rst": True}, True) 228 | app.add_transform(AutoStructify) 229 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. anomaly_detection_pipeline_kedro documentation master file, created by sphinx-quickstart. 2 | You can adapt this file completely to your liking, but it should at least 3 | contain the root `toctree` directive. 4 | 5 | Welcome to project anomaly_detection_pipeline_kedro's API docs! 6 | ============================================= 7 | 8 | .. 
toctree:: 9 | :maxdepth: 4 10 | 11 | modules 12 | 13 | 14 | Indices and tables 15 | ================== 16 | 17 | * :ref:`genindex` 18 | * :ref:`modindex` 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/logs/.gitkeep -------------------------------------------------------------------------------- /logs/journals/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/logs/journals/.gitkeep -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/notebooks/.gitkeep -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.kedro] 2 | package_name = "anomaly_detection_pipeline_kedro" 3 | project_name = "Anomaly Detection Pipeline (Kedro)" 4 | project_version = "0.17.7" 5 | 6 | [tool.isort] 7 | multi_line_output = 3 8 | include_trailing_comma = true 9 | force_grid_wrap = 0 10 | use_parentheses = true 11 | line_length = 88 12 | known_third_party = "kedro" 13 | 14 | [tool.pytest.ini_options] 15 | addopts = """ 16 | --cov-report term-missing \ 17 | --cov src/anomaly_detection_pipeline_kedro -ra""" 18 | 19 | [tool.coverage.report] 20 | fail_under = 0 21 | show_missing = true 22 | exclude_lines = ["pragma: no cover", "raise NotImplementedError"] 23 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=88 3 | extend-ignore=E203 4 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/__init__.py: -------------------------------------------------------------------------------- 1 | """Anomaly Detection Pipeline (Kedro) 2 | """ 3 | 4 | __version__ = "0.1" 5 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/__main__.py: -------------------------------------------------------------------------------- 1 | """Anomaly Detection Pipeline (Kedro) file for ensuring the package is executable 2 | as `Anomaly-Detection-Pipeline-Kedro` and `python -m anomaly_detection_pipeline_kedro` 3 | """ 4 | import importlib 5 | from pathlib import Path 6 | 7 | from kedro.framework.cli.utils import KedroCliError, load_entry_points 8 | from kedro.framework.project import configure_project 9 | 10 | 11 | def _find_run_command(package_name): 12 | try: 13 | project_cli = importlib.import_module(f"{package_name}.cli") 14 | # fail gracefully if cli.py does not exist 15 | except ModuleNotFoundError as exc: 16 | if f"{package_name}.cli" not in str(exc): 17 | raise 18 | plugins = load_entry_points("project") 19 | run = _find_run_command_in_plugins(plugins) if plugins else None 20 | if run: 21 | # use run command from installed plugin if it exists 22 | 
return run 23 | # use run command from the framework project 24 | from kedro.framework.cli.project import run 25 | 26 | return run 27 | # fail badly if cli.py exists, but has no `cli` in it 28 | if not hasattr(project_cli, "cli"): 29 | raise KedroCliError(f"Cannot load commands from {package_name}.cli") 30 | return project_cli.run 31 | 32 | 33 | def _find_run_command_in_plugins(plugins): 34 | for group in plugins: 35 | if "run" in group.commands: 36 | return group.commands["run"] 37 | 38 | 39 | def main(): 40 | package_name = Path(__file__).parent.name 41 | configure_project(package_name) 42 | run = _find_run_command(package_name) 43 | run() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/hooks.py: -------------------------------------------------------------------------------- 1 | """Project hooks.""" 2 | from typing import Any, Dict, Iterable, Optional 3 | 4 | from kedro.config import ConfigLoader 5 | from kedro.framework.hooks import hook_impl 6 | from kedro.io import DataCatalog 7 | from kedro.versioning import Journal 8 | 9 | 10 | class ProjectHooks: 11 | @hook_impl 12 | def register_config_loader( 13 | self, conf_paths: Iterable[str], env: str, extra_params: Dict[str, Any], 14 | ) -> ConfigLoader: 15 | return ConfigLoader(conf_paths) 16 | 17 | @hook_impl 18 | def register_catalog( 19 | self, 20 | catalog: Optional[Dict[str, Dict[str, Any]]], 21 | credentials: Dict[str, Dict[str, Any]], 22 | load_versions: Dict[str, str], 23 | save_version: str, 24 | journal: Journal, 25 | ) -> DataCatalog: 26 | return DataCatalog.from_config( 27 | catalog, credentials, load_versions, save_version, journal 28 | ) 29 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipeline_registry.py: -------------------------------------------------------------------------------- 1 | """Project pipelines.""" 2 | from typing import Dict 3 | from kedro.pipeline import Pipeline, pipeline 4 | 5 | from anomaly_detection_pipeline_kedro.pipelines import ( 6 | data_engineering as de, 7 | data_science as ds, 8 | model_evaluation as me 9 | ) 10 | 11 | def register_pipelines() -> Dict[str, Pipeline]: 12 | """Register the project's pipelines. 13 | 14 | Returns: 15 | A mapping from a pipeline name to a ``Pipeline`` object. 
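Registered names are ``de``, ``ds``, ``me``, and ``__default__`` (the three chained together), so an individual pipeline can be selected with e.g. ``kedro run --pipeline de``.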
16 | """ 17 | data_engineering_pipeline = de.create_pipeline() 18 | data_science_pipeline = ds.create_pipeline() 19 | model_evaluation_pipeline = me.create_pipeline() 20 | 21 | return { 22 | "de": data_engineering_pipeline, 23 | "ds": data_science_pipeline, 24 | "me": model_evaluation_pipeline, 25 | "__default__": data_engineering_pipeline + data_science_pipeline + model_evaluation_pipeline 26 | } 27 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/anomaly_detection_pipeline_kedro/pipelines/__init__.py -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline data_engineering 2 | 3 | > *Note:* This is a `README.md` boilerplate generated using `Kedro 0.17.7`. 4 | 5 | ## Overview 6 | 7 | 10 | 11 | ## Pipeline inputs 12 | 13 | 16 | 17 | ## Pipeline outputs 18 | 19 | 22 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_engineering' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from .pipeline import create_pipeline 7 | 8 | __all__ = ["create_pipeline"] 9 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/nodes.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_engineering' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from typing import Any, Callable, Dict 7 | import pandas as pd 8 | from datetime import timedelta, datetime as dt 9 | 10 | 11 | def merge_data(partitioned_input: Dict[str, Callable[[], Any]]) -> pd.DataFrame: 12 | """Concatenate input partitions into one pandas DataFrame. 13 | 14 | Args: 15 | partitioned_input: A dictionary with partition ids as keys and load functions as values. 16 | 17 | Returns: 18 | Pandas DataFrame representing a concatenation of all loaded partitions. 
19 | """ 20 | merged_df = pd.DataFrame() 21 | 22 | for partition_id, partition_load_func in sorted(partitioned_input.items()): 23 | partition_data = partition_load_func() # load actual partition data 24 | merged_df = pd.concat([merged_df, partition_data], ignore_index=True, sort=True) # concat with existing result 25 | 26 | return merged_df 27 | 28 | 29 | def process_data(merged_df: pd.DataFrame, predictor_cols: list) -> pd.DataFrame: 30 | """Process the merged dataset 31 | 32 | Args: 33 | merged_df (pd.DataFrame): Dataframe containing the consolidated credit card transaction data 34 | 35 | Returns: 36 | pd.DataFrame: Pandas dataframe representing the processed dataset 37 | """ 38 | # Generate date column 39 | merged_df['TX_DATETIME'] = pd.to_datetime(merged_df['TX_DATETIME'], infer_datetime_format=True) 40 | merged_df['TX_DATE'] = merged_df['TX_DATETIME'].dt.date 41 | 42 | # Only keep columns which are meaningful and predictive (based on domain knowledge) 43 | processed_df = merged_df[predictor_cols] 44 | 45 | return processed_df 46 | 47 | 48 | def train_test_split(processed_df: pd.DataFrame) -> pd.DataFrame: 49 | """Split processed dataset in train and test sets 50 | 51 | Args: 52 | processed_df (pd.DataFrame): Dataframe containing the processed transaction dataset 53 | 54 | Returns: 55 | Pandas dataframes of the training data, test data, and test labels (if any) 56 | """ 57 | # Perform chronological train test split (80:20) i.e. 8 weeks:2 weeks 58 | processed_df['TX_DATE'] = pd.to_datetime(processed_df['TX_DATE'], infer_datetime_format=True) 59 | split_date = processed_df['TX_DATE'].min() + timedelta(days=(8*7)) 60 | train_df = processed_df.loc[processed_df['TX_DATE'] <= split_date] 61 | test_df = processed_df.loc[processed_df['TX_DATE'] > split_date] 62 | 63 | # Drop date column 64 | train_df.drop(columns=['TX_DATE'], inplace=True) 65 | test_df.drop(columns=['TX_DATE'], inplace=True) 66 | 67 | # Drop actual label in dataset if any (supposed to be unsupervised training) 68 | if 'TX_FRAUD' in train_df.columns: 69 | train_df = train_df.drop(columns=['TX_FRAUD']) 70 | 71 | # Store test labels (if any) for subsequent model evaluation 72 | if 'TX_FRAUD' in test_df.columns: 73 | test_labels = test_df[['TX_FRAUD']] 74 | test_df = test_df.drop(columns=['TX_FRAUD']) 75 | else: 76 | test_labels = pd.DataFrame() # Empty dataframe if no test labels present 77 | 78 | return train_df, test_df, test_labels -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_engineering' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from kedro.pipeline import Pipeline, node, pipeline 7 | from .nodes import merge_data, process_data, train_test_split 8 | 9 | def create_pipeline(**kwargs) -> Pipeline: 10 | return pipeline([ 11 | 12 | node( 13 | func=merge_data, 14 | inputs="raw_daily_data", 15 | outputs="merged_data", 16 | name="node_merge_raw_daily_data" 17 | ), 18 | 19 | node( 20 | func=process_data, 21 | inputs=["merged_data", "params:predictor_cols"], 22 | outputs="processed_data", 23 | name="node_process_data" 24 | ), 25 | 26 | node( 27 | func=train_test_split, 28 | inputs="processed_data", 29 | outputs=["train_data", "test_data", "test_labels"], 30 | name="node_train_test_split" 31 | ), 32 | ]) 33 | -------------------------------------------------------------------------------- 
/src/anomaly_detection_pipeline_kedro/pipelines/data_science/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline data_science 2 | 3 | > *Note:* This is a `README.md` boilerplate generated using `Kedro 0.17.7`. 4 | 5 | ## Overview 6 | 7 | 10 | 11 | ## Pipeline inputs 12 | 13 | 16 | 17 | ## Pipeline outputs 18 | 19 | 22 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_science/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_science' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from .pipeline import create_pipeline 7 | 8 | __all__ = ["create_pipeline"] 9 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_science/nodes.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_science' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import sklearn 9 | from sklearn.ensemble import IsolationForest 10 | 11 | def train_model(train_df: pd.DataFrame, contamination_value: float): 12 | # Initialize isolation forest classifier model 13 | clf = IsolationForest(random_state=42, 14 | bootstrap=True, 15 | contamination=contamination_value) 16 | 17 | # Fit model on training dataset 18 | clf.fit(train_df.values) 19 | 20 | return clf 21 | 22 | 23 | def predict(ml_model, test_df: pd.DataFrame): 24 | # Generate predictions on test dataset 25 | preds = ml_model.predict(test_df.values) 26 | 27 | # Modify predictions to match TX_FRAUD label (1 = fraud, 0 = no fraud) 28 | preds_mod = np.array(list(map(lambda x: 1*(x == -1), preds))) 29 | 30 | # Get anomaly scores that led to predictions 31 | anomaly_scores = ml_model.score_samples(test_df) 32 | 33 | # Convert anomaly scores to positive values 34 | anomaly_scores_mod = np.array([-x for x in anomaly_scores]) 35 | 36 | test_df['ANOMALY_SCORE'] = anomaly_scores_mod 37 | test_df['ANOMALY'] = preds_mod 38 | 39 | return test_df 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_science/pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_science' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from kedro.pipeline import Pipeline, node, pipeline 7 | from .nodes import train_model, predict 8 | 9 | 10 | def create_pipeline(**kwargs) -> Pipeline: 11 | return pipeline([ 12 | 13 | node( 14 | func=train_model, 15 | inputs=["train_data", "params:contamination_value"], 16 | outputs="ml_model", 17 | name="node_train_model" 18 | ), 19 | 20 | node( 21 | func=predict, 22 | inputs=["ml_model", "test_data"], 23 | outputs="predictions", 24 | name="node_predict" 25 | ), 26 | 27 | ]) 28 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline model_evaluation 2 | 3 | > *Note:* This is a `README.md` boilerplate generated using `Kedro 0.17.7`. 
4 | 5 | ## Overview 6 | 7 | 10 | 11 | ## Pipeline inputs 12 | 13 | 16 | 17 | ## Pipeline outputs 18 | 19 | 22 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from .pipeline import create_pipeline 7 | 8 | __all__ = ["create_pipeline"] 9 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/nodes.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | from sklearn.metrics import roc_curve, precision_recall_curve, auc 10 | import logging 11 | import neptune.new as neptune 12 | 13 | # run = neptune.init(project='kennethleung.ty/Anomaly-Detection-Pipeline-Kedro', api_token='') 14 | # (module-level init is not needed and would fail with an empty api_token; the run handler is injected by the kedro-neptune plugin as the `neptune_run` node input) 15 | 16 | def evaluate_model(predictions: pd.DataFrame, test_labels: pd.DataFrame, 17 | neptune_run): # Neptune run handler supplied by kedro-neptune 18 | def get_auc(labels, scores): 19 | fpr, tpr, thr = roc_curve(labels, scores) 20 | auc_score = auc(fpr, tpr) 21 | return fpr, tpr, auc_score 22 | 23 | def get_aucpr(labels, scores): 24 | precision, recall, thr = precision_recall_curve(labels, scores) 25 | aucpr_score = np.trapz(recall, precision) 26 | return precision, recall, aucpr_score 27 | 28 | def plot_metric(ax, x, y, x_label, y_label, plot_label, style="-"): 29 | ax.plot(x, y, style, label=plot_label) 30 | ax.legend() 31 | ax.set_xlabel(x_label) 32 | ax.set_ylabel(y_label) 33 | 34 | def prediction_summary(labels, predicted_score, info, plot_baseline=True, axes=None): 35 | if axes is None: 36 | axes = [plt.subplot(1, 2, 1), plt.subplot(1, 2, 2)] 37 | 38 | fpr, tpr, auc_score = get_auc(labels, predicted_score) 39 | plot_metric(axes[0], fpr, tpr, "False positive rate", 40 | "True positive rate", "{} AUC = {:.4f}".format(info, auc_score)) 41 | if plot_baseline: 42 | plot_metric(axes[0], [0, 1], [0, 1], "False positive rate", 43 | "True positive rate", "Baseline AUC = 0.5", "r--") 44 | 45 | precision, recall, aucpr_score = get_aucpr(labels, predicted_score) 46 | plot_metric(axes[1], recall, precision, "Recall", 47 | "Precision", "{} AUCPR = {:.4f}".format(info, aucpr_score)) 48 | 49 | if plot_baseline: 50 | thr = sum(labels)/len(labels) 51 | plot_metric(axes[1], [0, 1], [thr, thr], "Recall", 52 | "Precision", "Baseline AUCPR = {:.4f}".format(thr), "r--") 53 | 54 | plt.show() 55 | return axes 56 | 57 | _, _, auc_score = get_auc(test_labels['TX_FRAUD'].values, predictions['ANOMALY_SCORE'].values) 58 | _, _, aucpr_score = get_aucpr(test_labels['TX_FRAUD'].values, predictions['ANOMALY_SCORE'].values) 59 | 60 | # log = logging.getLogger(__name__) 61 | # log.info("AUC-ROC Score: %0.2f%%", auc_score) 62 | # log.info("AUC-PR Score: %0.2f%%", aucpr_score) 63 | 64 | # Log scores into Neptune 65 | neptune_run['nodes/report/auc_roc_score'].log(auc_score) 66 | neptune_run['nodes/report/auc_pr_score'].log(aucpr_score) 67 | 68 | fig = plt.figure() 69 | fig.set_figheight(4.5) 70 | fig.set_figwidth(4.5*2) 71 | axes = prediction_summary(test_labels['TX_FRAUD'].values, predictions['ANOMALY_SCORE'].values, "Isolation Forest") 72 | 73 | # Log AUC plots to Neptune (the same figure is also persisted locally via the `evaluation_plot` catalog entry) 74 | 
neptune_run['nodes/report/auc_plots'].upload(fig) 75 | 76 | return fig -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from kedro.pipeline import Pipeline, node, pipeline 7 | from .nodes import evaluate_model 8 | 9 | 10 | def create_pipeline(**kwargs) -> Pipeline: 11 | return pipeline([ 12 | node( 13 | func=evaluate_model, 14 | inputs=["predictions", "test_labels", "neptune_run"], 15 | outputs="evaluation_plot", 16 | name="node_model_evaluation" 17 | ), 18 | ]) 19 | --------------------------------------------------------------------------------
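A quick way to smoke-test `evaluate_model` outside Kedro and without Neptune credentials (not part of the repo; the stub below only mimics the two handler methods the node touches):

```python
import numpy as np
import pandas as pd

from anomaly_detection_pipeline_kedro.pipelines.model_evaluation.nodes import evaluate_model

class _StubField:
    def log(self, value):        # mimics the neptune field's .log()
        print("logged:", value)
    def upload(self, obj):       # mimics the neptune field's .upload()
        print("uploaded:", type(obj).__name__)

class _StubRun:
    def __getitem__(self, key):  # any namespace path yields a stub field
        return _StubField()

# Synthetic labels/scores with both classes present so the ROC/PR curves are defined
labels = pd.DataFrame({"TX_FRAUD": np.r_[np.ones(5, dtype=int), np.zeros(495, dtype=int)]})
scores = pd.DataFrame({"ANOMALY_SCORE": np.random.rand(500)})
fig = evaluate_model(scores, labels, _StubRun())
```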
/src/anomaly_detection_pipeline_kedro/settings.py: -------------------------------------------------------------------------------- 1 | """Project settings.""" 2 | from anomaly_detection_pipeline_kedro.hooks import ProjectHooks 3 | 4 | # Instantiate and list your project hooks here 5 | HOOKS = (ProjectHooks(),) 6 | 7 | # List the installed plugins for which to disable auto-registry 8 | # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) 9 | 10 | # Define where to store data from a KedroSession. Defaults to BaseSessionStore. 11 | # from kedro.framework.session.store import ShelveStore 12 | # SESSION_STORE_CLASS = ShelveStore 13 | 14 | # Define keyword arguments to be passed to `SESSION_STORE_CLASS` constructor 15 | # SESSION_STORE_ARGS = { 16 | #     "path": "./sessions" 17 | # } 18 | 19 | # Define custom context class. Defaults to `KedroContext` 20 | # CONTEXT_CLASS = KedroContext 21 | 22 | # Define the configuration folder. Defaults to `conf` 23 | # CONF_ROOT = "conf" 24 | -------------------------------------------------------------------------------- /src/requirements.in: -------------------------------------------------------------------------------- 1 | black==21.5b1 2 | flake8>=3.7.9, <4.0 3 | ipython~=7.10 4 | ipython~=7.16.3; python_version == '3.6' 5 | ipython>=7.31.1, <8.0; python_version > '3.6' 6 | isort~=5.0 7 | jupyter~=1.0 8 | jupyter_client>=5.1, <7.0 9 | jupyterlab~=3.0 10 | kedro==0.17.7 11 | kedro-telemetry~=0.1.0 12 | nbstripout~=0.4 13 | pytest-cov~=3.0 14 | pytest-mock>=1.7.1, <2.0 15 | pytest~=6.2 16 | wheel>=0.35, <0.37 17 | -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | black==21.5b1 2 | flake8>=3.7.9, <4.0 3 | ipython~=7.10 4 | ipython~=7.16.3; python_version == '3.6' 5 | ipython>=7.31.1, <8.0; python_version > '3.6' 6 | isort~=5.0 7 | jupyter~=1.0 8 | jupyter_client>=5.1, <7.0 9 | jupyterlab~=3.0 10 | kedro==0.17.7 11 | kedro-telemetry~=0.1.0 12 | matplotlib>=3.5 13 | nbstripout~=0.4 14 | numpy>=1.21 15 | pandas>=1.3 16 | pytest-cov~=3.0 17 | pytest-mock>=1.7.1, <2.0 18 | pytest~=6.2 19 | scikit-learn>=1.0 20 | wheel~=0.38.1 21 | -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | entry_point = ( 4 |     "Anomaly-Detection-Pipeline-Kedro = anomaly_detection_pipeline_kedro.__main__:main" 5 | ) 6 | 7 | 8 | # get the dependencies and installs 9 | with open("requirements.txt", encoding="utf-8") as f: 10 |     # Make sure we strip all comments and options (e.g. "--extra-index-url") 11 |     # that arise from a modified pip.conf file that configures global options 12 |     # when running kedro build-reqs 13 |     requires = [] 14 |     for line in f: 15 |         req = line.split("#", 1)[0].strip() 16 |         if req and not req.startswith("--"): 17 |             requires.append(req) 18 | 19 | setup( 20 |     name="anomaly_detection_pipeline_kedro", 21 |     version="0.1", 22 |     packages=find_packages(exclude=["tests"]), 23 |     entry_points={"console_scripts": [entry_point]}, 24 |     install_requires=requires, 25 |     extras_require={ 26 |         "docs": [ 27 |             "docutils<0.18.0", 28 |             "sphinx~=3.4.3", 29 |             "sphinx_rtd_theme==0.5.1", 30 |             "nbsphinx==0.8.1", 31 |             "nbstripout~=0.4", 32 |             "recommonmark==0.7.1", 33 |             "sphinx-autodoc-typehints==1.11.1", 34 |             "sphinx_copybutton==0.3.1", 35 |             "ipykernel>=5.3, <7.0", 36 |         ] 37 |     }, 38 | ) 39 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/data_engineering/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/data_engineering/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/data_engineering/test_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate test file for pipeline 'data_engineering' 3 | generated using Kedro 0.17.7. 4 | Please add your pipeline tests here. 5 | 6 | Kedro recommends using `pytest` framework, more info about it can be found 7 | in the official documentation: 8 | https://docs.pytest.org/en/latest/getting-started.html 9 | """ 10 | -------------------------------------------------------------------------------- /src/tests/pipelines/data_science/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/data_science/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/data_science/test_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate test file for pipeline 'data_science' 3 | generated using Kedro 0.17.7. 4 | Please add your pipeline tests here. 5 | 6 | Kedro recommends using `pytest` framework, more info about it can be found 7 | in the official documentation: 8 | https://docs.pytest.org/en/latest/getting-started.html 9 | """ 10 | -------------------------------------------------------------------------------- /src/tests/pipelines/model_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/model_evaluation/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/model_evaluation/test_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate test file for pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7. 4 | Please add your pipeline tests here. 5 | 6 | Kedro recommends using `pytest` framework, more info about it can be found 7 | in the official documentation: 8 | https://docs.pytest.org/en/latest/getting-started.html 9 | """ 10 | -------------------------------------------------------------------------------- /src/tests/test_run.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains an example test. 3 | 4 | Tests should be placed in ``src/tests``, in modules that mirror your 5 | project's structure, and in files named test_*.py. They are simply functions 6 | named ``test_*`` which test a unit of logic. 7 | 8 | To run the tests, run ``kedro test`` from the project root directory. 
9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pytest 14 | from kedro.framework.context import KedroContext 15 | 16 | 17 | @pytest.fixture 18 | def project_context(): 19 | return KedroContext( 20 | package_name="anomaly_detection_pipeline_kedro", project_path=Path.cwd() 21 | ) 22 | 23 | 24 | # The tests below are here for the demonstration purpose 25 | # and should be replaced with the ones testing the project 26 | # functionality 27 | class TestProjectContext: 28 | def test_package_name(self, project_context): 29 | assert project_context.package_name == "anomaly_detection_pipeline_kedro" 30 | --------------------------------------------------------------------------------