├── .gitattributes ├── .gitignore ├── README.md ├── conf ├── README.md ├── base │ ├── catalog.yml │ ├── catalog_neptune.yml │ ├── logging.yml │ ├── neptune.yml │ ├── parameters.yml │ └── parameters │ │ ├── data_engineering.yml │ │ ├── data_science.yml │ │ └── model_evaluation.yml └── local │ └── .gitkeep ├── data ├── 03_primary │ └── .gitkeep ├── 04_feature │ └── .gitkeep ├── 05_model_input │ └── .gitkeep ├── 06_models │ └── .gitkeep ├── 07_model_output │ └── .gitkeep └── 08_reporting │ └── .gitkeep ├── docs ├── images │ ├── 01_DS_Pipeline_Overview.png │ └── 05_Anomaly_Detection_Pipeline_Blueprint.png └── source │ ├── conf.py │ └── index.rst ├── logs ├── .gitkeep └── journals │ └── .gitkeep ├── notebooks └── .gitkeep ├── pyproject.toml ├── setup.cfg └── src ├── anomaly_detection_pipeline_kedro ├── __init__.py ├── __main__.py ├── hooks.py ├── pipeline_registry.py ├── pipelines │ ├── __init__.py │ ├── data_engineering │ │ ├── README.md │ │ ├── __init__.py │ │ ├── nodes.py │ │ └── pipeline.py │ ├── data_science │ │ ├── README.md │ │ ├── __init__.py │ │ ├── nodes.py │ │ └── pipeline.py │ └── model_evaluation │ │ ├── README.md │ │ ├── __init__.py │ │ ├── nodes.py │ │ └── pipeline.py └── settings.py ├── requirements.in ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── pipelines ├── __init__.py ├── data_engineering │ ├── __init__.py │ └── test_pipeline.py ├── data_science │ ├── __init__.py │ └── test_pipeline.py └── model_evaluation │ ├── __init__.py │ └── test_pipeline.py └── test_run.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ########################## 2 | # KEDRO PROJECT 3 | 4 | # ignore all local configuration 5 | conf/local/** 6 | !conf/local/.gitkeep 7 | .telemetry 8 | 9 | # ignore potentially sensitive credentials files 10 | conf/**/*credentials* 11 | 12 | # ignore everything in the following folders 13 | data/** 14 | logs/** 15 | 16 | # except their sub-folders 17 | !data/**/ 18 | !logs/**/ 19 | 20 | # also keep all .gitkeep files 21 | !.gitkeep 22 | 23 | # also keep the example dataset 24 | !data/01_raw/iris.csv 25 | 26 | .ipython 27 | 28 | ########################## 29 | # Common files 30 | 31 | # IntelliJ 32 | .idea/ 33 | *.iml 34 | out/ 35 | .idea_modules/ 36 | 37 | ### macOS 38 | *.DS_Store 39 | .AppleDouble 40 | .LSOverride 41 | .Trashes 42 | 43 | # Vim 44 | *~ 45 | .*.swo 46 | .*.swp 47 | 48 | # emacs 49 | *~ 50 | \#*\# 51 | /.emacs.desktop 52 | /.emacs.desktop.lock 53 | *.elc 54 | 55 | # JIRA plugin 56 | atlassian-ide-plugin.xml 57 | 58 | # C extensions 59 | *.so 60 | 61 | ### Python template 62 | # Byte-compiled / optimized / DLL files 63 | __pycache__/ 64 | *.py[cod] 65 | *$py.class 66 | 67 | # Distribution / packaging 68 | .Python 69 | build/ 70 | develop-eggs/ 71 | dist/ 72 | downloads/ 73 | eggs/ 74 | .eggs/ 75 | lib/ 76 | lib64/ 77 | parts/ 78 | sdist/ 79 | var/ 80 | wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | MANIFEST 85 | 86 | # PyInstaller 87 | # Usually these files are written by a python script from a template 88 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
89 | *.manifest 90 | *.spec 91 | 92 | # Installer logs 93 | pip-log.txt 94 | pip-delete-this-directory.txt 95 | 96 | # Unit test / coverage reports 97 | htmlcov/ 98 | .tox/ 99 | .coverage 100 | .coverage.* 101 | .cache 102 | nosetests.xml 103 | coverage.xml 104 | *.cover 105 | .hypothesis/ 106 | 107 | # Translations 108 | *.mo 109 | *.pot 110 | 111 | # Django stuff: 112 | *.log 113 | .static_storage/ 114 | .media/ 115 | local_settings.py 116 | 117 | # Flask stuff: 118 | instance/ 119 | .webassets-cache 120 | 121 | # Scrapy stuff: 122 | .scrapy 123 | 124 | # Sphinx documentation 125 | docs/_build/ 126 | 127 | # PyBuilder 128 | target/ 129 | 130 | # Jupyter Notebook 131 | .ipynb_checkpoints 132 | 133 | # IPython 134 | .ipython/profile_default/history.sqlite 135 | .ipython/profile_default/startup/README 136 | 137 | # pyenv 138 | .python-version 139 | 140 | # celery beat schedule file 141 | celerybeat-schedule 142 | 143 | # SageMath parsed files 144 | *.sage.py 145 | 146 | # Neptune logs 147 | .neptune 148 | 149 | # Environments 150 | .env 151 | .envrc 152 | .venv 153 | env/ 154 | venv/ 155 | ENV/ 156 | env.bak/ 157 | venv.bak/ 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | 165 | # Ignore media files 166 | media/ 167 | 168 | # Ignore all notebooks 169 | *.ipynb -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building and Managing an Isolation Forest Anomaly Detection Pipeline with Kedro 2 | 3 | ## Overview 4 | An anomaly (fraud) detection pipeline for credit card transaction data, built with the Isolation Forest machine learning model and the Kedro framework 5 | 6 | Link to article: https://neptune.ai/blog/data-science-pipelines-with-kedro 7 | 8 | ## Objective 9 | Develop a data science pipeline to detect anomalous (fraudulent) credit card transactions with the use of: 10 | - **Isolation Forest** machine learning model - For unsupervised anomaly detection 11 | - **Kedro** - An open-source Python framework for creating reproducible, maintainable, and modular data science code. This framework helps to accelerate data pipelining, enhance data science prototyping, and promote pipeline reproducibility. 12 | 13 | ## Motivation 14 | - Explore how unsupervised anomaly detection works, and better understand the concept and implementation of Isolation Forest 15 | - Leverage the Kedro framework to optimally structure data science pipeline projects 16 | 17 | ## Data 18 | The [credit card transaction data](https://github.com/Fraud-Detection-Handbook/simulated-data-transformed) is obtained from a collaboration between Worldline and the Machine Learning Group. It is a realistic simulation of real-world credit card transactions, designed to include complicated fraud detection issues. 19 | 20 | ## General Pipeline Structure 21 | ![Alt text](/docs/images/01_DS_Pipeline_Overview.png?raw=true) 22 | 23 | ## Anomaly Detection Pipeline Structure 24 | ![Alt text](/docs/images/05_Anomaly_Detection_Pipeline_Blueprint.png?raw=true) 25 | 26 | ## Steps 27 | 1. Change to the project directory on the command line - `cd C:/Anomaly-Detection-Pipeline-Kedro` 28 | 2. Activate the Conda virtual environment (create one first if you have not done so) - `conda activate env_kedro` 29 | 3. Execute a pipeline run with `kedro run` (see the note and sketch below this list)
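The pipelines registered in `src/anomaly_detection_pipeline_kedro/pipeline_registry.py` can also be run individually, e.g. `kedro run --pipeline de` for data engineering (likewise `ds` for data science and `me` for model evaluation).

To get a feel for the core modelling step that the `ds` pipeline wraps, here is a minimal standalone sketch of the same scikit-learn API; the two-feature toy data below is made up purely for illustration (the pipeline itself trains on the features listed in `conf/base/parameters.yml`):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(42)
X_train = rng.normal(size=(1000, 2))                     # mostly "normal" transactions
X_test = np.vstack([rng.normal(size=(95, 2)),            # normal points...
                    rng.normal(8.0, 1.0, size=(5, 2))])  # ...plus a few obvious outliers

# Same estimator settings as the pipeline's train_model node
clf = IsolationForest(random_state=42, bootstrap=True, contamination=0.009)
clf.fit(X_train)

preds = clf.predict(X_test)          # -1 = anomaly, 1 = normal
flags = (preds == -1).astype(int)    # map to 1 = fraud, 0 = no fraud, as in the predict node
scores = -clf.score_samples(X_test)  # flip sign so that higher = more anomalous
```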
30 | 31 | Please see the [walkthrough article](https://neptune.ai/blog/data-science-pipelines-with-kedro) for details 32 | -------------------------------------------------------------------------------- /conf/README.md: -------------------------------------------------------------------------------- 1 | # What is this for? 2 | 3 | This folder should be used to store configuration files used by Kedro or by separate tools. 4 | 5 | This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own instructions in the [Instructions](#Instructions) section. 6 | 7 | ## Local configuration 8 | 9 | The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). 10 | 11 | > *Note:* Please do not check in any local configuration to version control. 12 | 13 | ## Base configuration 14 | 15 | The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members. 16 | 17 | WARNING: Please do not put access credentials in the base configuration folder. 18 | 19 | ## Instructions 20 | 21 | 22 | 23 | 24 | 25 | ## Find out more 26 | You can find out more about configuration from the [user guide documentation](https://kedro.readthedocs.io/en/stable/04_user_guide/03_configuration.html). 27 | -------------------------------------------------------------------------------- /conf/base/catalog.yml: -------------------------------------------------------------------------------- 1 | raw_daily_data: 2 | type: PartitionedDataSet 3 | path: data/01_raw # path to the location of partitions 4 | dataset: pandas.CSVDataSet 5 | layer: raw 6 | 7 | 8 | merged_data: 9 | type: pandas.CSVDataSet 10 | filepath: data/02_intermediate/merged_data.csv 11 | layer: intermediate 12 | 13 | 14 | processed_data: 15 | type: pandas.CSVDataSet 16 | filepath: data/03_primary/processed_data.csv 17 | layer: primary 18 | 19 | 20 | train_data: 21 | type: pandas.CSVDataSet 22 | filepath: data/05_model_input/train.csv 23 | layer: model_input 24 | 25 | 26 | test_data: 27 | type: pandas.CSVDataSet 28 | filepath: data/05_model_input/test.csv 29 | layer: model_input 30 | 31 | 32 | test_labels: 33 | type: pandas.CSVDataSet 34 | filepath: data/05_model_input/test_labels.csv 35 | layer: model_input 36 | 37 | 38 | ml_model: 39 | type: pickle.PickleDataSet 40 | filepath: data/06_models/ml_model.pkl 41 | backend: pickle 42 | layer: models 43 | 44 | 45 | predictions: 46 | type: pandas.CSVDataSet 47 | filepath: data/07_model_output/predictions.csv 48 | layer: model_output 49 | 50 | 51 | evaluation_plot: 52 | type: matplotlib.MatplotlibWriter 53 | filepath: data/08_reporting/auc_plots.png 54 | -------------------------------------------------------------------------------- /conf/base/catalog_neptune.yml: -------------------------------------------------------------------------------- 1 | # You can log files to Neptune via NeptuneFileDataSet 2 | # 3 | # example_artifact: 4 | # type: kedro_neptune.NeptuneFileDataSet 5 | # filepath: data/06_models/clf_model.pkl 6 | # 7 | # If you want to log an existing Kedro DataSet to Neptune, add @neptune to the DataSet name 8 | # 9 | # example_iris_data@neptune: 10 | # type: kedro_neptune.NeptuneFileDataSet 11 | # filepath: data/01_raw/iris.csv 12 | # 13 | # You can use 
kedro_neptune.NeptuneFileDataSet in any catalog including conf/base/catalog.yml 14 | # 15 | -------------------------------------------------------------------------------- /conf/base/logging.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | simple: 5 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 6 | json_formatter: 7 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 8 | class: pythonjsonlogger.jsonlogger.JsonFormatter 9 | 10 | handlers: 11 | console: 12 | class: logging.StreamHandler 13 | level: INFO 14 | formatter: simple 15 | stream: ext://sys.stdout 16 | 17 | info_file_handler: 18 | class: logging.handlers.RotatingFileHandler 19 | level: INFO 20 | formatter: simple 21 | filename: logs/info.log 22 | maxBytes: 10485760 # 10MB 23 | backupCount: 20 24 | encoding: utf8 25 | delay: True 26 | 27 | error_file_handler: 28 | class: logging.handlers.RotatingFileHandler 29 | level: ERROR 30 | formatter: simple 31 | filename: logs/errors.log 32 | maxBytes: 10485760 # 10MB 33 | backupCount: 20 34 | encoding: utf8 35 | delay: True 36 | 37 | journal_file_handler: 38 | class: kedro.versioning.journal.JournalFileHandler 39 | level: INFO 40 | base_dir: logs/journals 41 | formatter: json_formatter 42 | 43 | loggers: 44 | anyconfig: 45 | level: WARNING 46 | handlers: [console, info_file_handler, error_file_handler] 47 | propagate: no 48 | 49 | kedro.io: 50 | level: INFO 51 | handlers: [console, info_file_handler, error_file_handler] 52 | propagate: no 53 | 54 | kedro.pipeline: 55 | level: INFO 56 | handlers: [console, info_file_handler, error_file_handler] 57 | propagate: no 58 | 59 | kedro.journal: 60 | level: INFO 61 | handlers: [journal_file_handler] 62 | propagate: no 63 | 64 | root: 65 | level: INFO 66 | handlers: [console, info_file_handler, error_file_handler] 67 | -------------------------------------------------------------------------------- /conf/base/neptune.yml: -------------------------------------------------------------------------------- 1 | neptune: 2 | #GLOBAL CONFIG 3 | project: Anomaly-Detection-Pipeline-Kedro 4 | base_namespace: kedro 5 | enabled: true 6 | 7 | #LOGGING 8 | upload_source_files: 9 | - '**/*.py' 10 | - conf/base/*.yml 11 | -------------------------------------------------------------------------------- /conf/base/parameters.yml: -------------------------------------------------------------------------------- 1 | predictor_cols: 2 | - 'TX_DATE' 3 | - 'TX_AMOUNT' 4 | - 'TX_DURING_WEEKEND' 5 | - 'TX_DURING_NIGHT' 6 | - 'CUSTOMER_ID_NB_TX_1DAY_WINDOW' 7 | - 'CUSTOMER_ID_NB_TX_7DAY_WINDOW' 8 | - 'CUSTOMER_ID_NB_TX_30DAY_WINDOW' 9 | - 'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW' 10 | - 'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW' 11 | - 'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW' 12 | - 'TERMINAL_ID_NB_TX_1DAY_WINDOW' 13 | - 'TERMINAL_ID_NB_TX_7DAY_WINDOW' 14 | - 'TERMINAL_ID_NB_TX_30DAY_WINDOW' 15 | - 'TERMINAL_ID_RISK_1DAY_WINDOW' 16 | - 'TERMINAL_ID_RISK_7DAY_WINDOW' 17 | - 'TERMINAL_ID_RISK_30DAY_WINDOW' 18 | - 'TX_FRAUD' 19 | 20 | contamination_value: 0.009 -------------------------------------------------------------------------------- /conf/base/parameters/data_engineering.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'data_engineering' 2 | # using Kedro 0.17.7. 
3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://kedro.readthedocs.io/en/0.17.7/04_kedro_project_setup/02_configuration.html#parameters 6 | -------------------------------------------------------------------------------- /conf/base/parameters/data_science.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'data_science' 2 | # using Kedro 0.17.7. 3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://kedro.readthedocs.io/en/0.17.7/04_kedro_project_setup/02_configuration.html#parameters 6 | -------------------------------------------------------------------------------- /conf/base/parameters/model_evaluation.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'model_evaluation' 2 | # using Kedro 0.17.7. 3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://kedro.readthedocs.io/en/0.17.7/04_kedro_project_setup/02_configuration.html#parameters 6 | -------------------------------------------------------------------------------- /conf/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/conf/local/.gitkeep -------------------------------------------------------------------------------- /data/03_primary/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/03_primary/.gitkeep -------------------------------------------------------------------------------- /data/04_feature/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/04_feature/.gitkeep -------------------------------------------------------------------------------- /data/05_model_input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/05_model_input/.gitkeep -------------------------------------------------------------------------------- /data/06_models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/06_models/.gitkeep -------------------------------------------------------------------------------- /data/07_model_output/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/07_model_output/.gitkeep -------------------------------------------------------------------------------- /data/08_reporting/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/data/08_reporting/.gitkeep -------------------------------------------------------------------------------- /docs/images/01_DS_Pipeline_Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/docs/images/01_DS_Pipeline_Overview.png -------------------------------------------------------------------------------- /docs/images/05_Anomaly_Detection_Pipeline_Blueprint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/docs/images/05_Anomaly_Detection_Pipeline_Blueprint.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | # anomaly_detection_pipeline_kedro documentation build 5 | # configuration file, created by sphinx-quickstart. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import re 21 | 22 | from kedro.framework.cli.utils import find_stylesheets 23 | from recommonmark.transform import AutoStructify 24 | 25 | from anomaly_detection_pipeline_kedro import __version__ as release 26 | 27 | # -- Project information ----------------------------------------------------- 28 | 29 | project = "anomaly_detection_pipeline_kedro" 30 | author = "Kedro" 31 | 32 | # The short X.Y version. 33 | version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # If your documentation needs a minimal Sphinx version, state it here. 38 | # 39 | # needs_sphinx = '1.0' 40 | 41 | # Add any Sphinx extension module names here, as strings. They can be 42 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 43 | # ones. 44 | extensions = [ 45 | "sphinx.ext.autodoc", 46 | "sphinx.ext.napoleon", 47 | "sphinx_autodoc_typehints", 48 | "sphinx.ext.doctest", 49 | "sphinx.ext.todo", 50 | "sphinx.ext.coverage", 51 | "sphinx.ext.mathjax", 52 | "sphinx.ext.ifconfig", 53 | "sphinx.ext.viewcode", 54 | "sphinx.ext.mathjax", 55 | "nbsphinx", 56 | "recommonmark", 57 | "sphinx_copybutton", 58 | ] 59 | 60 | # enable autosummary plugin (table of contents for modules/classes/class 61 | # methods) 62 | autosummary_generate = True 63 | 64 | # Add any paths that contain templates here, relative to this directory. 65 | templates_path = ["_templates"] 66 | 67 | # The suffix(es) of source filenames. 68 | # You can specify multiple suffix as a list of string: 69 | # 70 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"} 71 | 72 | # The master toctree document. 
73 | master_doc = "index" 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This pattern also affects html_static_path and html_extra_path . 85 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = "sphinx" 89 | 90 | # -- Options for HTML output ------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | # 95 | html_theme = "sphinx_rtd_theme" 96 | 97 | # Theme options are theme-specific and customize the look and feel of a theme 98 | # further. For a list of options available for each theme, see the 99 | # documentation. 100 | # 101 | html_theme_options = {"collapse_navigation": False, "style_external_links": True} 102 | 103 | # Add any paths that contain custom static files (such as style sheets) here, 104 | # relative to this directory. They are copied after the builtin static files, 105 | # so a file named "default.css" will overwrite the builtin "default.css". 106 | html_static_path = ["_static"] 107 | 108 | # Custom sidebar templates, must be a dictionary that maps document names 109 | # to template names. 110 | # 111 | # The default sidebars (for documents that don't match any pattern) are 112 | # defined by theme itself. Builtin themes are using these templates by 113 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 114 | # 'searchbox.html']``. 115 | # 116 | # html_sidebars = {} 117 | 118 | html_show_sourcelink = False 119 | 120 | # -- Options for HTMLHelp output --------------------------------------------- 121 | 122 | # Output file base name for HTML help builder. 123 | htmlhelp_basename = "anomaly_detection_pipeline_kedrodoc" 124 | 125 | # -- Options for LaTeX output ------------------------------------------------ 126 | 127 | latex_elements = { 128 | # The paper size ('letterpaper' or 'a4paper'). 129 | # 130 | # 'papersize': 'letterpaper', 131 | # 132 | # The font size ('10pt', '11pt' or '12pt'). 133 | # 134 | # 'pointsize': '10pt', 135 | # 136 | # Additional stuff for the LaTeX preamble. 137 | # 138 | # 'preamble': '', 139 | # 140 | # Latex figure (float) alignment 141 | # 142 | # 'figure_align': 'htbp', 143 | } 144 | 145 | # Grouping the document tree into LaTeX files. List of tuples 146 | # (source start file, target name, title, 147 | # author, documentclass [howto, manual, or own class]). 148 | latex_documents = [ 149 | ( 150 | master_doc, 151 | "anomaly_detection_pipeline_kedro.tex", 152 | "anomaly_detection_pipeline_kedro Documentation", 153 | "Kedro", 154 | "manual", 155 | ) 156 | ] 157 | 158 | # -- Options for manual page output ------------------------------------------ 159 | 160 | # One entry per manual page. List of tuples 161 | # (source start file, name, description, authors, manual section). 
162 | man_pages = [ 163 | ( 164 | master_doc, 165 | "anomaly_detection_pipeline_kedro", 166 | "anomaly_detection_pipeline_kedro Documentation", 167 | [author], 168 | 1, 169 | ) 170 | ] 171 | 172 | # -- Options for Texinfo output ---------------------------------------------- 173 | 174 | # Grouping the document tree into Texinfo files. List of tuples 175 | # (source start file, target name, title, author, 176 | # dir menu entry, description, category) 177 | texinfo_documents = [ 178 | ( 179 | master_doc, 180 | "anomaly_detection_pipeline_kedro", 181 | "anomaly_detection_pipeline_kedro Documentation", 182 | author, 183 | "anomaly_detection_pipeline_kedro", 184 | "Project anomaly_detection_pipeline_kedro codebase.", 185 | "Data-Science", 186 | ) 187 | ] 188 | 189 | # -- Options for todo extension ---------------------------------------------- 190 | 191 | # If true, `todo` and `todoList` produce output, else they produce nothing. 192 | todo_include_todos = False 193 | 194 | # -- Extension configuration ------------------------------------------------- 195 | 196 | # nbsphinx_prolog = """ 197 | # see here for prolog/epilog details: 198 | # https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html 199 | # """ 200 | 201 | # -- NBconvert kernel config ------------------------------------------------- 202 | nbsphinx_kernel_name = "python3" 203 | 204 | 205 | def remove_arrows_in_examples(lines): 206 | for i, line in enumerate(lines): 207 | lines[i] = line.replace(">>>", "") 208 | 209 | 210 | def autodoc_process_docstring(app, what, name, obj, options, lines): 211 | remove_arrows_in_examples(lines) 212 | 213 | 214 | def skip(app, what, name, obj, skip, options): 215 | if name == "__init__": 216 | return False 217 | return skip 218 | 219 | 220 | def setup(app): 221 | app.connect("autodoc-process-docstring", autodoc_process_docstring) 222 | app.connect("autodoc-skip-member", skip) 223 | # add Kedro stylesheets 224 | for stylesheet in find_stylesheets(): 225 | app.add_css_file(stylesheet) 226 | # enable rendering RST tables in Markdown 227 | app.add_config_value("recommonmark_config", {"enable_eval_rst": True}, True) 228 | app.add_transform(AutoStructify) 229 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. anomaly_detection_pipeline_kedro documentation master file, created by sphinx-quickstart. 2 | You can adapt this file completely to your liking, but it should at least 3 | contain the root `toctree` directive. 4 | 5 | Welcome to project anomaly_detection_pipeline_kedro's API docs! 6 | ============================================= 7 | 8 | .. 
toctree:: 9 | :maxdepth: 4 10 | 11 | modules 12 | 13 | 14 | Indices and tables 15 | ================== 16 | 17 | * :ref:`genindex` 18 | * :ref:`modindex` 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/logs/.gitkeep -------------------------------------------------------------------------------- /logs/journals/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/logs/journals/.gitkeep -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/notebooks/.gitkeep -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.kedro] 2 | package_name = "anomaly_detection_pipeline_kedro" 3 | project_name = "Anomaly Detection Pipeline (Kedro)" 4 | project_version = "0.17.7" 5 | 6 | [tool.isort] 7 | multi_line_output = 3 8 | include_trailing_comma = true 9 | force_grid_wrap = 0 10 | use_parentheses = true 11 | line_length = 88 12 | known_third_party = "kedro" 13 | 14 | [tool.pytest.ini_options] 15 | addopts = """ 16 | --cov-report term-missing \ 17 | --cov src/anomaly_detection_pipeline_kedro -ra""" 18 | 19 | [tool.coverage.report] 20 | fail_under = 0 21 | show_missing = true 22 | exclude_lines = ["pragma: no cover", "raise NotImplementedError"] 23 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=88 3 | extend-ignore=E203 4 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/__init__.py: -------------------------------------------------------------------------------- 1 | """Anomaly Detection Pipeline (Kedro) 2 | """ 3 | 4 | __version__ = "0.1" 5 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/__main__.py: -------------------------------------------------------------------------------- 1 | """Anomaly Detection Pipeline (Kedro) file for ensuring the package is executable 2 | as `Anomaly-Detection-Pipeline-Kedro` and `python -m anomaly_detection_pipeline_kedro` 3 | """ 4 | import importlib 5 | from pathlib import Path 6 | 7 | from kedro.framework.cli.utils import KedroCliError, load_entry_points 8 | from kedro.framework.project import configure_project 9 | 10 | 11 | def _find_run_command(package_name): 12 | try: 13 | project_cli = importlib.import_module(f"{package_name}.cli") 14 | # fail gracefully if cli.py does not exist 15 | except ModuleNotFoundError as exc: 16 | if f"{package_name}.cli" not in str(exc): 17 | raise 18 | plugins = load_entry_points("project") 19 | run = _find_run_command_in_plugins(plugins) if plugins else None 20 | if run: 21 | # use run command from installed plugin if it exists 22 | 
return run 23 | # use run command from the framework project 24 | from kedro.framework.cli.project import run 25 | 26 | return run 27 | # fail badly if cli.py exists, but has no `cli` in it 28 | if not hasattr(project_cli, "cli"): 29 | raise KedroCliError(f"Cannot load commands from {package_name}.cli") 30 | return project_cli.run 31 | 32 | 33 | def _find_run_command_in_plugins(plugins): 34 | for group in plugins: 35 | if "run" in group.commands: 36 | return group.commands["run"] 37 | 38 | 39 | def main(): 40 | package_name = Path(__file__).parent.name 41 | configure_project(package_name) 42 | run = _find_run_command(package_name) 43 | run() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/hooks.py: -------------------------------------------------------------------------------- 1 | """Project hooks.""" 2 | from typing import Any, Dict, Iterable, Optional 3 | 4 | from kedro.config import ConfigLoader 5 | from kedro.framework.hooks import hook_impl 6 | from kedro.io import DataCatalog 7 | from kedro.versioning import Journal 8 | 9 | 10 | class ProjectHooks: 11 | @hook_impl 12 | def register_config_loader( 13 | self, conf_paths: Iterable[str], env: str, extra_params: Dict[str, Any], 14 | ) -> ConfigLoader: 15 | return ConfigLoader(conf_paths) 16 | 17 | @hook_impl 18 | def register_catalog( 19 | self, 20 | catalog: Optional[Dict[str, Dict[str, Any]]], 21 | credentials: Dict[str, Dict[str, Any]], 22 | load_versions: Dict[str, str], 23 | save_version: str, 24 | journal: Journal, 25 | ) -> DataCatalog: 26 | return DataCatalog.from_config( 27 | catalog, credentials, load_versions, save_version, journal 28 | ) 29 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipeline_registry.py: -------------------------------------------------------------------------------- 1 | """Project pipelines.""" 2 | from typing import Dict 3 | from kedro.pipeline import Pipeline, pipeline 4 | 5 | from anomaly_detection_pipeline_kedro.pipelines import ( 6 | data_engineering as de, 7 | data_science as ds, 8 | model_evaluation as me 9 | ) 10 | 11 | def register_pipelines() -> Dict[str, Pipeline]: 12 | """Register the project's pipelines. 13 | 14 | Returns: 15 | A mapping from a pipeline name to a ``Pipeline`` object. 
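Registered names are ``de``, ``ds``, ``me``, and ``__default__`` (the three chained together), so an individual pipeline can be selected with e.g. ``kedro run --pipeline de``.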
16 | """ 17 | data_engineering_pipeline = de.create_pipeline() 18 | data_science_pipeline = ds.create_pipeline() 19 | model_evaluation_pipeline = me.create_pipeline() 20 | 21 | return { 22 | "de": data_engineering_pipeline, 23 | "ds": data_science_pipeline, 24 | "me": model_evaluation_pipeline, 25 | "__default__": data_engineering_pipeline + data_science_pipeline + model_evaluation_pipeline 26 | } 27 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/anomaly_detection_pipeline_kedro/pipelines/__init__.py -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline data_engineering 2 | 3 | > *Note:* This is a `README.md` boilerplate generated using `Kedro 0.17.7`. 4 | 5 | ## Overview 6 | 7 | 10 | 11 | ## Pipeline inputs 12 | 13 | 16 | 17 | ## Pipeline outputs 18 | 19 | 22 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_engineering' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from .pipeline import create_pipeline 7 | 8 | __all__ = ["create_pipeline"] 9 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/nodes.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_engineering' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from typing import Any, Callable, Dict 7 | import pandas as pd 8 | from datetime import timedelta, datetime as dt 9 | 10 | 11 | def merge_data(partitioned_input: Dict[str, Callable[[], Any]]) -> pd.DataFrame: 12 | """Concatenate input partitions into one pandas DataFrame. 13 | 14 | Args: 15 | partitioned_input: A dictionary with partition ids as keys and load functions as values. 16 | 17 | Returns: 18 | Pandas DataFrame representing a concatenation of all loaded partitions. 
19 | """ 20 | merged_df = pd.DataFrame() 21 | 22 | for partition_id, partition_load_func in sorted(partitioned_input.items()): 23 | partition_data = partition_load_func() # load actual partition data 24 | merged_df = pd.concat([merged_df, partition_data], ignore_index=True, sort=True) # concat with existing result 25 | 26 | return merged_df 27 | 28 | 29 | def process_data(merged_df: pd.DataFrame, predictor_cols: list) -> pd.DataFrame: 30 | """Process the merged dataset 31 | 32 | Args: 33 | merged_df (pd.DataFrame): Dataframe containing the consolidated credit card transaction data 34 | 35 | Returns: 36 | pd.DataFrame: Pandas dataframe representing the processed dataset 37 | """ 38 | # Generate date column 39 | merged_df['TX_DATETIME'] = pd.to_datetime(merged_df['TX_DATETIME'], infer_datetime_format=True) 40 | merged_df['TX_DATE'] = merged_df['TX_DATETIME'].dt.date 41 | 42 | # Only keep columns which are meaningful and predictive (based on domain knowledge) 43 | processed_df = merged_df[predictor_cols] 44 | 45 | return processed_df 46 | 47 | 48 | def train_test_split(processed_df: pd.DataFrame) -> pd.DataFrame: 49 | """Split processed dataset in train and test sets 50 | 51 | Args: 52 | processed_df (pd.DataFrame): Dataframe containing the processed transaction dataset 53 | 54 | Returns: 55 | Pandas dataframes of the training data, test data, and test labels (if any) 56 | """ 57 | # Perform chronological train test split (80:20) i.e. 8 weeks:2 weeks 58 | processed_df['TX_DATE'] = pd.to_datetime(processed_df['TX_DATE'], infer_datetime_format=True) 59 | split_date = processed_df['TX_DATE'].min() + timedelta(days=(8*7)) 60 | train_df = processed_df.loc[processed_df['TX_DATE'] <= split_date] 61 | test_df = processed_df.loc[processed_df['TX_DATE'] > split_date] 62 | 63 | # Drop date column 64 | train_df.drop(columns=['TX_DATE'], inplace=True) 65 | test_df.drop(columns=['TX_DATE'], inplace=True) 66 | 67 | # Drop actual label in dataset if any (supposed to be unsupervised training) 68 | if 'TX_FRAUD' in train_df.columns: 69 | train_df = train_df.drop(columns=['TX_FRAUD']) 70 | 71 | # Store test labels (if any) for subsequent model evaluation 72 | if 'TX_FRAUD' in test_df.columns: 73 | test_labels = test_df[['TX_FRAUD']] 74 | test_df = test_df.drop(columns=['TX_FRAUD']) 75 | else: 76 | test_labels = pd.DataFrame() # Empty dataframe if no test labels present 77 | 78 | return train_df, test_df, test_labels -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_engineering/pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_engineering' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from kedro.pipeline import Pipeline, node, pipeline 7 | from .nodes import merge_data, process_data, train_test_split 8 | 9 | def create_pipeline(**kwargs) -> Pipeline: 10 | return pipeline([ 11 | 12 | node( 13 | func=merge_data, 14 | inputs="raw_daily_data", 15 | outputs="merged_data", 16 | name="node_merge_raw_daily_data" 17 | ), 18 | 19 | node( 20 | func=process_data, 21 | inputs=["merged_data", "params:predictor_cols"], 22 | outputs="processed_data", 23 | name="node_process_data" 24 | ), 25 | 26 | node( 27 | func=train_test_split, 28 | inputs="processed_data", 29 | outputs=["train_data", "test_data", "test_labels"], 30 | name="node_train_test_split" 31 | ), 32 | ]) 33 | -------------------------------------------------------------------------------- 
/src/anomaly_detection_pipeline_kedro/pipelines/data_science/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline data_science 2 | 3 | > *Note:* This is a `README.md` boilerplate generated using `Kedro 0.17.7`. 4 | 5 | ## Overview 6 | 7 | 10 | 11 | ## Pipeline inputs 12 | 13 | 16 | 17 | ## Pipeline outputs 18 | 19 | 22 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_science/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_science' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from .pipeline import create_pipeline 7 | 8 | __all__ = ["create_pipeline"] 9 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_science/nodes.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_science' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import sklearn 9 | from sklearn.ensemble import IsolationForest 10 | 11 | def train_model(train_df: pd.DataFrame, contamination_value: float): 12 | # Initialize isolation forest classifier model 13 | clf = IsolationForest(random_state=42, 14 | bootstrap=True, 15 | contamination=contamination_value) 16 | 17 | # Fit model on training dataset 18 | clf.fit(train_df.values) 19 | 20 | return clf 21 | 22 | 23 | def predict(ml_model, test_df: pd.DataFrame): 24 | # Generate predictions on test dataset 25 | preds = ml_model.predict(test_df.values) 26 | 27 | # Modify predictions to match TX_FRAUD label (1 = fraud, 0 = no fraud) 28 | preds_mod = np.array(list(map(lambda x: 1*(x == -1), preds))) 29 | 30 | # Get anomaly scores that led to predictions 31 | anomaly_scores = ml_model.score_samples(test_df) 32 | 33 | # Convert anomaly scores to positive values 34 | anomaly_scores_mod = np.array([-x for x in anomaly_scores]) 35 | 36 | test_df['ANOMALY_SCORE'] = anomaly_scores_mod 37 | test_df['ANOMALY'] = preds_mod 38 | 39 | return test_df 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/data_science/pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'data_science' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from kedro.pipeline import Pipeline, node, pipeline 7 | from .nodes import train_model, predict 8 | 9 | 10 | def create_pipeline(**kwargs) -> Pipeline: 11 | return pipeline([ 12 | 13 | node( 14 | func=train_model, 15 | inputs=["train_data", "params:contamination_value"], 16 | outputs="ml_model", 17 | name="node_train_model" 18 | ), 19 | 20 | node( 21 | func=predict, 22 | inputs=["ml_model", "test_data"], 23 | outputs="predictions", 24 | name="node_predict" 25 | ), 26 | 27 | ]) 28 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline model_evaluation 2 | 3 | > *Note:* This is a `README.md` boilerplate generated using `Kedro 0.17.7`. 
4 | 5 | ## Overview 6 | 7 | 10 | 11 | ## Pipeline inputs 12 | 13 | 16 | 17 | ## Pipeline outputs 18 | 19 | 22 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from .pipeline import create_pipeline 7 | 8 | __all__ = ["create_pipeline"] 9 | -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/nodes.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | from sklearn.metrics import roc_curve, precision_recall_curve, auc 10 | import logging 11 | import neptune.new as neptune 12 | 13 | # run = neptune.init(project='kennethleung.ty/Anomaly-Detection-Pipeline-Kedro', api_token='') 14 | # (module-level init is not needed and would fail with an empty api_token; the run handler is injected by the kedro-neptune plugin as the `neptune_run` node input) 15 | 16 | def evaluate_model(predictions: pd.DataFrame, test_labels: pd.DataFrame, 17 | neptune_run): # Neptune run handler supplied by kedro-neptune 18 | def get_auc(labels, scores): 19 | fpr, tpr, thr = roc_curve(labels, scores) 20 | auc_score = auc(fpr, tpr) 21 | return fpr, tpr, auc_score 22 | 23 | def get_aucpr(labels, scores): 24 | precision, recall, thr = precision_recall_curve(labels, scores) 25 | aucpr_score = np.trapz(recall, precision) 26 | return precision, recall, aucpr_score 27 | 28 | def plot_metric(ax, x, y, x_label, y_label, plot_label, style="-"): 29 | ax.plot(x, y, style, label=plot_label) 30 | ax.legend() 31 | ax.set_xlabel(x_label) 32 | ax.set_ylabel(y_label) 33 | 34 | def prediction_summary(labels, predicted_score, info, plot_baseline=True, axes=None): 35 | if axes is None: 36 | axes = [plt.subplot(1, 2, 1), plt.subplot(1, 2, 2)] 37 | 38 | fpr, tpr, auc_score = get_auc(labels, predicted_score) 39 | plot_metric(axes[0], fpr, tpr, "False positive rate", 40 | "True positive rate", "{} AUC = {:.4f}".format(info, auc_score)) 41 | if plot_baseline: 42 | plot_metric(axes[0], [0, 1], [0, 1], "False positive rate", 43 | "True positive rate", "Baseline AUC = 0.5", "r--") 44 | 45 | precision, recall, aucpr_score = get_aucpr(labels, predicted_score) 46 | plot_metric(axes[1], recall, precision, "Recall", 47 | "Precision", "{} AUCPR = {:.4f}".format(info, aucpr_score)) 48 | 49 | if plot_baseline: 50 | thr = sum(labels)/len(labels) 51 | plot_metric(axes[1], [0, 1], [thr, thr], "Recall", 52 | "Precision", "Baseline AUCPR = {:.4f}".format(thr), "r--") 53 | 54 | plt.show() 55 | return axes 56 | 57 | _, _, auc_score = get_auc(test_labels['TX_FRAUD'].values, predictions['ANOMALY_SCORE'].values) 58 | _, _, aucpr_score = get_aucpr(test_labels['TX_FRAUD'].values, predictions['ANOMALY_SCORE'].values) 59 | 60 | # log = logging.getLogger(__name__) 61 | # log.info("AUC-ROC Score: %0.2f%%", auc_score) 62 | # log.info("AUC-PR Score: %0.2f%%", aucpr_score) 63 | 64 | # Log scores into Neptune 65 | neptune_run['nodes/report/auc_roc_score'].log(auc_score) 66 | neptune_run['nodes/report/auc_pr_score'].log(aucpr_score) 67 | 68 | fig = plt.figure() 69 | fig.set_figheight(4.5) 70 | fig.set_figwidth(4.5*2) 71 | axes = prediction_summary(test_labels['TX_FRAUD'].values, predictions['ANOMALY_SCORE'].values, "Isolation Forest") 72 | 73 | # Log AUC plots to Neptune (the same figure is also persisted locally via the `evaluation_plot` catalog entry) 74 | 
neptune_run['nodes/report/auc_plots'].upload(fig) 75 | 76 | return fig -------------------------------------------------------------------------------- /src/anomaly_detection_pipeline_kedro/pipelines/model_evaluation/pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7 4 | """ 5 | 6 | from kedro.pipeline import Pipeline, node, pipeline 7 | from .nodes import evaluate_model 8 | 9 | 10 | def create_pipeline(**kwargs) -> Pipeline: 11 | return pipeline([ 12 | node( 13 | func=evaluate_model, 14 | inputs=["predictions", "test_labels", "neptune_run"], 15 | outputs="evaluation_plot", 16 | name="node_model_evaluation" 17 | ), 18 | ]) 19 | --------------------------------------------------------------------------------
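A quick way to smoke-test `evaluate_model` outside Kedro and without Neptune credentials (not part of the repo; the stub below only mimics the two handler methods the node touches):

```python
import numpy as np
import pandas as pd

from anomaly_detection_pipeline_kedro.pipelines.model_evaluation.nodes import evaluate_model

class _StubField:
    def log(self, value):        # mimics the neptune field's .log()
        print("logged:", value)
    def upload(self, obj):       # mimics the neptune field's .upload()
        print("uploaded:", type(obj).__name__)

class _StubRun:
    def __getitem__(self, key):  # any namespace path yields a stub field
        return _StubField()

# Synthetic labels/scores with both classes present so the ROC/PR curves are defined
labels = pd.DataFrame({"TX_FRAUD": np.r_[np.ones(5, dtype=int), np.zeros(495, dtype=int)]})
scores = pd.DataFrame({"ANOMALY_SCORE": np.random.rand(500)})
fig = evaluate_model(scores, labels, _StubRun())
```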
/src/anomaly_detection_pipeline_kedro/settings.py: -------------------------------------------------------------------------------- 1 | """Project settings.""" 2 | from anomaly_detection_pipeline_kedro.hooks import ProjectHooks 3 | 4 | # Instantiate and list your project hooks here 5 | HOOKS = (ProjectHooks(),) 6 | 7 | # List the installed plugins for which to disable auto-registry 8 | # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) 9 | 10 | # Define where to store data from a KedroSession. Defaults to BaseSessionStore. 11 | # from kedro.framework.session.store import ShelveStore 12 | # SESSION_STORE_CLASS = ShelveStore 13 | 14 | # Define keyword arguments to be passed to `SESSION_STORE_CLASS` constructor 15 | # SESSION_STORE_ARGS = { 16 | #     "path": "./sessions" 17 | # } 18 | 19 | # Define custom context class. Defaults to `KedroContext` 20 | # CONTEXT_CLASS = KedroContext 21 | 22 | # Define the configuration folder. Defaults to `conf` 23 | # CONF_ROOT = "conf" 24 | -------------------------------------------------------------------------------- /src/requirements.in: -------------------------------------------------------------------------------- 1 | black==21.5b1 2 | flake8>=3.7.9, <4.0 3 | ipython~=7.10 4 | ipython~=7.16.3; python_version == '3.6' 5 | ipython>=7.31.1, <8.0; python_version > '3.6' 6 | isort~=5.0 7 | jupyter~=1.0 8 | jupyter_client>=5.1, <7.0 9 | jupyterlab~=3.0 10 | kedro==0.17.7 11 | kedro-telemetry~=0.1.0 12 | nbstripout~=0.4 13 | pytest-cov~=3.0 14 | pytest-mock>=1.7.1, <2.0 15 | pytest~=6.2 16 | wheel>=0.35, <0.37 17 | -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | black==21.5b1 2 | flake8>=3.7.9, <4.0 3 | ipython~=7.10 4 | ipython~=7.16.3; python_version == '3.6' 5 | ipython>=7.31.1, <8.0; python_version > '3.6' 6 | isort~=5.0 7 | jupyter~=1.0 8 | jupyter_client>=5.1, <7.0 9 | jupyterlab~=3.0 10 | kedro==0.17.7 11 | kedro-telemetry~=0.1.0 12 | matplotlib>=3.5 13 | nbstripout~=0.4 14 | numpy>=1.21 15 | pandas>=1.3 16 | pytest-cov~=3.0 17 | pytest-mock>=1.7.1, <2.0 18 | pytest~=6.2 19 | scikit-learn>=1.0 20 | wheel~=0.38.1 21 | -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | entry_point = ( 4 |     "Anomaly-Detection-Pipeline-Kedro = anomaly_detection_pipeline_kedro.__main__:main" 5 | ) 6 | 7 | 8 | # get the dependencies and installs 9 | with open("requirements.txt", encoding="utf-8") as f: 10 |     # Make sure we strip all comments and options (e.g. "--extra-index-url") 11 |     # that arise from a modified pip.conf file that configures global options 12 |     # when running kedro build-reqs 13 |     requires = [] 14 |     for line in f: 15 |         req = line.split("#", 1)[0].strip() 16 |         if req and not req.startswith("--"): 17 |             requires.append(req) 18 | 19 | setup( 20 |     name="anomaly_detection_pipeline_kedro", 21 |     version="0.1", 22 |     packages=find_packages(exclude=["tests"]), 23 |     entry_points={"console_scripts": [entry_point]}, 24 |     install_requires=requires, 25 |     extras_require={ 26 |         "docs": [ 27 |             "docutils<0.18.0", 28 |             "sphinx~=3.4.3", 29 |             "sphinx_rtd_theme==0.5.1", 30 |             "nbsphinx==0.8.1", 31 |             "nbstripout~=0.4", 32 |             "recommonmark==0.7.1", 33 |             "sphinx-autodoc-typehints==1.11.1", 34 |             "sphinx_copybutton==0.3.1", 35 |             "ipykernel>=5.3, <7.0", 36 |         ] 37 |     }, 38 | ) 39 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/data_engineering/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/data_engineering/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/data_engineering/test_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate test file for pipeline 'data_engineering' 3 | generated using Kedro 0.17.7. 4 | Please add your pipeline tests here. 5 | 6 | Kedro recommends using `pytest` framework, more info about it can be found 7 | in the official documentation: 8 | https://docs.pytest.org/en/latest/getting-started.html 9 | """ 10 | -------------------------------------------------------------------------------- /src/tests/pipelines/data_science/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/data_science/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/data_science/test_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate test file for pipeline 'data_science' 3 | generated using Kedro 0.17.7. 4 | Please add your pipeline tests here. 5 | 6 | Kedro recommends using `pytest` framework, more info about it can be found 7 | in the official documentation: 8 | https://docs.pytest.org/en/latest/getting-started.html 9 | """ 10 | -------------------------------------------------------------------------------- /src/tests/pipelines/model_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kennethleungty/Anomaly-Detection-Pipeline-Kedro/1544367a3b8cc8695284dcc6d1189314887c322b/src/tests/pipelines/model_evaluation/__init__.py -------------------------------------------------------------------------------- /src/tests/pipelines/model_evaluation/test_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a boilerplate test file for pipeline 'model_evaluation' 3 | generated using Kedro 0.17.7. 4 | Please add your pipeline tests here. 5 | 6 | Kedro recommends using `pytest` framework, more info about it can be found 7 | in the official documentation: 8 | https://docs.pytest.org/en/latest/getting-started.html 9 | """ 10 | -------------------------------------------------------------------------------- /src/tests/test_run.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains an example test. 3 | 4 | Tests should be placed in ``src/tests``, in modules that mirror your 5 | project's structure, and in files named test_*.py. They are simply functions 6 | named ``test_*`` which test a unit of logic. 7 | 8 | To run the tests, run ``kedro test`` from the project root directory. 
9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pytest 14 | from kedro.framework.context import KedroContext 15 | 16 | 17 | @pytest.fixture 18 | def project_context(): 19 | return KedroContext( 20 | package_name="anomaly_detection_pipeline_kedro", project_path=Path.cwd() 21 | ) 22 | 23 | 24 | # The tests below are here for the demonstration purpose 25 | # and should be replaced with the ones testing the project 26 | # functionality 27 | class TestProjectContext: 28 | def test_package_name(self, project_context): 29 | assert project_context.package_name == "anomaly_detection_pipeline_kedro" 30 | --------------------------------------------------------------------------------