├── src
    ├── bin
    │   ├── test
    │   │   ├── __init__.py
    │   │   ├── test_contrib_util.py
    │   │   └── contrib_util.py
    │   ├── algos_contrib
    │   │   ├── __init__.py
    │   │   ├── tests
    │   │   │   ├── __init__.py
    │   │   │   ├── test_mds.py
    │   │   │   ├── test_tf_binary.py
    │   │   │   ├── test_savgol_filter.py
    │   │   │   ├── test_example_algo.py
    │   │   │   ├── test_correlation_matrix.py
    │   │   │   ├── test_collaborativefilter.py
    │   │   │   ├── test_agglomerative_clustering.py
    │   │   │   ├── test_nmf.py
    │   │   │   ├── test_min_max_scaler.py
    │   │   │   ├── test_truncated_svd.py
    │   │   │   ├── test_linear_svc.py
    │   │   │   ├── test_latent_dirichlet_allocation.py
    │   │   │   ├── test_CustomDecisionTreeClassifier.py
    │   │   │   ├── test_extra_trees_classifier.py
    │   │   │   ├── test_orthogonal_matching_pursuit.py
    │   │   │   ├── test_IsolationForest.py
    │   │   │   ├── test_tsne.py
    │   │   │   └── test_svr.py
    │   │   ├── ExampleAlgo.py
    │   │   ├── SVR.py
    │   │   ├── LinearSVC.py
    │   │   ├── NMF.py
    │   │   ├── TruncatedSVD.py
    │   │   ├── AdaBoostRegressor.py
    │   │   ├── ExtraTreesRegressor.py
    │   │   ├── SavgolFilter.py
    │   │   ├── BaggingRegressor.py
    │   │   ├── QuantileTransformer.py
    │   │   ├── LatentDirichletAllocation.py
    │   │   ├── OrthogonalMatchingPursuit.py
    │   │   ├── ExtraTreesClassifier.py
    │   │   ├── CorrelationMatrix.py
    │   │   ├── MinMaxScaler.py
    │   │   ├── MDS.py
    │   │   ├── TSNE.py
    │   │   ├── CustomDecisionTreeClassifier.py
    │   │   ├── AgglomerativeClustering.py
    │   │   ├── CollaborativeFilter.py
    │   │   ├── TFBinary.py
    │   │   └── IsolationForest.py
    │   ├── README.md
    │   ├── test.py
    │   └── link_mltk.py
    ├── default
    │   ├── data
    │   │   └── ui
    │   │   │   ├── views
    │   │   │       └── README.md
    │   │   │   └── nav
    │   │   │       └── default.xml
    │   ├── app.conf
    │   └── algos.conf
    └── metadata
    │   └── default.meta
├── .gitignore
├── requirements_1.2.txt
├── tox.ini
├── CONTRIBUTING.md
├── README.md
└── LICENSE


/src/bin/test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/bin/README.md:
--------------------------------------------------------------------------------
1 | This is where you put any scripts you want to add to this app.
2 | 


--------------------------------------------------------------------------------
/src/default/data/ui/views/README.md:
--------------------------------------------------------------------------------
1 | Add all the views that your app needs in this directory
2 | 


--------------------------------------------------------------------------------
/src/bin/test.py:
--------------------------------------------------------------------------------
1 | from link_mltk import add_mltk
2 | add_mltk()
3 | 
4 | from test.util import check_signatures
5 | 
6 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_mds.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.MDS import MDS
2 | from test.contrib_util import AlgoTestUtils
3 | 
4 | 
5 | def test_algo():
6 |     AlgoTestUtils.assert_algo_basic(MDS, serializable=False)
7 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/ExampleAlgo.py:
--------------------------------------------------------------------------------
 1 | from base import BaseAlgo
 2 | 
 3 | 
 4 | class ExampleAlgo(BaseAlgo):
 5 |     def __init__(self, options):
 6 |         pass
 7 | 
 8 |     def fit(self, df, options):
 9 |         return df
10 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_tf_binary.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.TFBinary import TFBinary
2 | from test.contrib_util import AlgoTestUtils
3 | 
4 | 
5 | def test_algo():
6 |     AlgoTestUtils.assert_algo_basic(TFBinary, serializable=False)
7 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_savgol_filter.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.SavgolFilter import SavgolFilter
2 | from test.contrib_util import AlgoTestUtils
3 | 
4 | 
5 | def test_algo():
6 |     AlgoTestUtils.assert_algo_basic(SavgolFilter, serializable=False)
7 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_example_algo.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.ExampleAlgo import ExampleAlgo
2 | from test.contrib_util import AlgoTestUtils
3 | 
4 | 
5 | def test_algo():
6 |     AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False)
7 | 
8 | 


--------------------------------------------------------------------------------
/src/default/data/ui/nav/default.xml:
--------------------------------------------------------------------------------
1 | <nav search_view="search" color="#65A637">
2 |   <view name="search" default='true' />
3 |   <view name="datasets" />
4 |   <view name="reports" />
5 |   <view name="alerts" />
6 |   <view name="dashboards" />
7 | </nav>
8 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_correlation_matrix.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.CorrelationMatrix import CorrelationMatrix
2 | from test.contrib_util import AlgoTestUtils
3 | 
4 | 
5 | def test_algo():
6 |     AlgoTestUtils.assert_algo_basic(CorrelationMatrix, serializable=False)
7 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_collaborativefilter.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.CollaborativeFilter import CollaborativeFilter
2 | from test.contrib_util import AlgoTestUtils
3 | 
4 | 
5 | def test_algo():
6 |     AlgoTestUtils.assert_algo_basic(CollaborativeFilter, serializable=False)
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Jupyter Notebook
 2 | .ipynb_checkpoints
 3 | 
 4 | # macOS
 5 | .DS_Store
 6 | 
 7 | # Editors
 8 | *.swp
 9 | *.swo
10 | 
11 | # Python stuff
12 | *.egg-info
13 | .tox
14 | **/.cache
15 | **/.pytest_cache
16 | **/*.pyc
17 | 
18 | # IntelliJ
19 | **/.idea
20 | 
21 | target
22 | 


--------------------------------------------------------------------------------
/requirements_1.2.txt:
--------------------------------------------------------------------------------
 1 | attrs==17.4.0
 2 | funcsigs==1.0.2
 3 | mock==2.0.0
 4 | more-itertools==4.1.0
 5 | numpy==1.10.4
 6 | pandas==0.17.1
 7 | pluggy==0.6.0
 8 | psutil==3.4.2
 9 | py==1.5.3
10 | pytest==3.5.0
11 | scikit-learn==0.17
12 | scipy==0.17.0
13 | six==1.11.0
14 | statsmodels==0.6.1
15 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_agglomerative_clustering.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.AgglomerativeClustering import AgglomerativeClustering
2 | from test.contrib_util import AlgoTestUtils
3 | 
4 | 
5 | def test_algo():
6 |     AlgoTestUtils.assert_algo_basic(AgglomerativeClustering, serializable=False)
7 | 


--------------------------------------------------------------------------------
/src/default/app.conf:
--------------------------------------------------------------------------------
 1 | #
 2 | # Splunk app configuration file
 3 | #
 4 | 
 5 | [install]
 6 | is_configured = 1
 7 | 
 8 | [ui]
 9 | is_visible = false
10 | label = mltk-algo-contrib
11 | 
12 | [launcher]
13 | author = github.com/splunk/mltk-algo-contrib
14 | description =
15 | version = 1.0
16 | 
17 | 


--------------------------------------------------------------------------------
/src/metadata/default.meta:
--------------------------------------------------------------------------------
 1 | 
 2 | # Application-level permissions
 3 | 
 4 | []
 5 | access = read : [ * ], write : [ admin, power ]
 6 | 
 7 | ### EVENT TYPES
 8 | 
 9 | [eventtypes]
10 | export = system
11 | 
12 | 
13 | ### PROPS
14 | 
15 | [props]
16 | export = system
17 | 
18 | 
19 | ### TRANSFORMS
20 | 
21 | [transforms]
22 | export = system
23 | 
24 | 
25 | ### LOOKUPS
26 | 
27 | [lookups]
28 | export = system
29 | 
30 | 
31 | ### VIEWSTATES: even normal users should be able to create shared viewstates
32 | 
33 | [viewstates]
34 | access = read : [ * ], write : [ * ]
35 | export = system
36 | 
37 | 
38 | [algos]
39 | export = system
40 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_nmf.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.NMF import NMF
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | 
 6 | def test_algo():
 7 |     input_df = pd.DataFrame({
 8 |         'a': [1, 2, 3],
 9 |         'b': [4, 5, 6],
10 |         'c': ['a', 'b', 'c'],
11 |     })
12 |     options = {
13 |         'feature_variables': ['a', 'b', 'c'],
14 |     }
15 |     required_methods = (
16 |         '__init__',
17 |         'fit',
18 |         'partial_fit',
19 |         'apply',
20 |         'summary',
21 |         'register_codecs',
22 |     )
23 |     AlgoTestUtils.assert_algo_basic(NMF, required_methods, input_df, options)
24 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_min_max_scaler.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.MinMaxScaler import MinMaxScaler
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | 
 6 | def test_algo():
 7 |     input_df = pd.DataFrame({
 8 |         'a': [1, 2, 3],
 9 |         'b': [4, 5, 6],
10 |         'c': ['a', 'b', 'c'],
11 |     })
12 |     options = {
13 | 	'feature_variables': ['a', 'b', 'c'],
14 |     }
15 |     required_methods = (
16 |         '__init__',
17 |         'fit',
18 |         'partial_fit',
19 |         'apply',
20 |         'summary',
21 |         'register_codecs',
22 |     )
23 |     AlgoTestUtils.assert_algo_basic(MinMaxScaler, required_methods, input_df, options)
24 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_truncated_svd.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.TruncatedSVD import TruncatedSVD
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | 
 6 | def test_algo():
 7 |     input_df = pd.DataFrame({
 8 |         'a': [1, 2, 3],
 9 |         'b': [4, 5, 6],
10 |         'c': ['a', 'b', 'c'],
11 |     })
12 |     options = {
13 |         'feature_variables': ['a', 'b', 'c'],
14 |     }
15 |     required_methods = (
16 |         '__init__',
17 |         'fit',
18 |         'partial_fit',
19 |         'apply',
20 |         'summary',
21 |         'register_codecs',
22 |     )
23 |     AlgoTestUtils.assert_algo_basic(TruncatedSVD, required_methods, input_df, options)
24 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_linear_svc.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.LinearSVC import LinearSVC
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | 
 6 | 
 7 | 
 8 | def test_algo():
 9 |     input_df = pd.DataFrame({
10 |         'a': [1, 2, 3],
11 |         'b': [4, 5, 6],
12 |         'c': ['a', 'b', 'c'],
13 |     })
14 |     options = {
15 |         'target_variable': ['a'],
16 |         'feature_variables': ['b', 'c'],
17 |     }
18 |     required_methods = (
19 |         '__init__',
20 |         'fit',
21 |         'partial_fit',
22 |         'apply',
23 |         'summary',
24 |         'register_codecs',
25 |     )
26 |     AlgoTestUtils.assert_algo_basic(LinearSVC, required_methods ,  input_df, options)
27 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_latent_dirichlet_allocation.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.LatentDirichletAllocation import LatentDirichletAllocation
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | 
 6 | def test_algo():
 7 |     input_df = pd.DataFrame({
 8 |         'a': [1, 2, 3],
 9 |         'b': [4, 5, 6],
10 |         'c': ['a', 'b', 'c'],
11 |     })
12 |     options = {
13 |         'feature_variables': ['b', 'c'],
14 |     }
15 |     required_methods = (
16 |         '__init__',
17 |         'fit',
18 |         'partial_fit',
19 |         'apply',
20 |         'summary',
21 |         'register_codecs',
22 |     )
23 |     AlgoTestUtils.assert_algo_basic(LatentDirichletAllocation, required_methods, input_df, options)
24 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.CustomDecisionTreeClassifier import CustomDecisionTreeClassifier
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | def test_algo():
 6 |     input_df = pd.DataFrame({
 7 |         'a': [1, 2, 3],
 8 |         'b': [4, 5, 6],
 9 |         'c': ['a', 'b', 'c'],
10 |     })
11 |     options = {
12 |         'target_variable': ['a'],
13 |         'feature_variables': ['b', 'c'],
14 |     }
15 |     required_methods = (
16 |         '__init__',
17 |         'fit',
18 |         'apply',
19 |         'summary',
20 |         'register_codecs',
21 |     )
22 |     AlgoTestUtils.assert_algo_basic(CustomDecisionTreeClassifier, required_methods ,  input_df, options)
23 |     


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_extra_trees_classifier.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.OrthogonalMatchingPursuit import OrthogonalMatchingPursuit
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | 
 6 | 
 7 | 
 8 | def test_algo():
 9 |     input_df = pd.DataFrame({
10 |         'a': [1, 2, 3],
11 |         'b': [4, 5, 6],
12 |         'c': ['a', 'b', 'c'],
13 |     })
14 |     options = {
15 |         'target_variable': ['a'],
16 |         'feature_variables': ['b', 'c'],
17 |     }
18 |     required_methods = (
19 |         '__init__',
20 |         'fit',
21 |         'partial_fit',
22 |         'apply',
23 |         'summary',
24 |         'register_codecs',
25 |     )
26 |     AlgoTestUtils.assert_algo_basic(OrthogonalMatchingPursuit, required_methods ,  input_df, options)


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from algos_contrib.OrthogonalMatchingPursuit import OrthogonalMatchingPursuit
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | 
 6 | 
 7 | 
 8 | def test_algo():
 9 |     input_df = pd.DataFrame({
10 |         'a': [1, 2, 3],
11 |         'b': [4, 5, 6],
12 |         'c': ['a', 'b', 'c'],
13 |     })
14 |     options = {
15 |         'target_variable': ['a'],
16 |         'feature_variables': ['b', 'c'],
17 |     }
18 |     required_methods = (
19 |         '__init__',
20 |         'fit',
21 |         'partial_fit',
22 |         'apply',
23 |         'summary',
24 |         'register_codecs',
25 |     )
26 |     AlgoTestUtils.assert_algo_basic(OrthogonalMatchingPursuit, required_methods ,  input_df, options)


--------------------------------------------------------------------------------
/src/bin/algos_contrib/SVR.py:
--------------------------------------------------------------------------------
 1 | from sklearn.svm import SVR as _SVR
 2 | 
 3 | from base import BaseAlgo, RegressorMixin
 4 | from util.param_util import convert_params
 5 | 
 6 | 
 7 | class SVR(RegressorMixin, BaseAlgo):
 8 | 
 9 |     def __init__(self, options):
10 |         self.handle_options(options)
11 | 
12 |         params = options.get('params', {})
13 |         out_params = convert_params(
14 |             params,
15 |             floats=['C', 'gamma'],
16 |             strs=['kernel'],
17 |             ints=['degree'],
18 |         )
19 | 
20 |         self.estimator = _SVR(**out_params)
21 | 
22 |     @staticmethod
23 |     def register_codecs():
24 |         from codec.codecs import SimpleObjectCodec
25 |         from codec import codecs_manager
26 |         codecs_manager.add_codec('algos_contrib.SVR', 'SVR', SimpleObjectCodec)
27 |         codecs_manager.add_codec('sklearn.svm.classes', 'SVR', SimpleObjectCodec)
28 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_IsolationForest.py:
--------------------------------------------------------------------------------
 1 | from algos_contrib.IsolationForest import IsolationForest
 2 | from test.contrib_util import AlgoTestUtils
 3 | import pandas as pd
 4 | 
 5 | def test_algo():
 6 |     AlgoTestUtils.assert_algo_basic(IsolationForest, serializable=False)
 7 | 
 8 | def test_algo_options():
 9 |     input_df = pd.DataFrame({
10 |         'a': [5.1, 4.9, 4.7, 4.6],
11 |         'b': [3.5, 3.0, 3.1, 3.2],
12 |         'c': [1.4, 1.4, 1.5, 1.6],
13 |         'd': [0.2, 0.2, 0.2, 0.4],
14 |         'e': ['Iris Setosa','Iris Setosa','Iris Versicolor','Iris Virginica']
15 |     })
16 |     options = {
17 |         'target_variables' : [],
18 |         'feature_variables': ['a','b','c','d'],
19 |     }
20 |     required_methods = (
21 |         '__init__',
22 |         'fit',
23 |         'apply',
24 |         'register_codecs',
25 |     )
26 |     AlgoTestUtils.assert_algo_basic(IsolationForest, required_methods=required_methods, input_df=input_df, options=options, serializable=False)


--------------------------------------------------------------------------------
/src/bin/algos_contrib/LinearSVC.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from sklearn.svm import LinearSVC as _LinearSVC
 4 | 
 5 | from codec import codecs_manager
 6 | from base import BaseAlgo, ClassifierMixin
 7 | from util.param_util import convert_params
 8 | 
 9 | 
10 | class LinearSVC(ClassifierMixin, BaseAlgo):
11 | 
12 |     def __init__(self, options):
13 |         self.handle_options(options)
14 | 
15 |         out_params = convert_params(
16 |             options.get('params', {}),
17 |             floats=['gamma', 'C', 'tol', 'intercept_scaling'],
18 |             ints=['random_state','max_iter'],
19 |             strs=['penalty', 'loss', 'multi_class'],
20 |             bools=['dual', 'fit_intercept'],
21 |         )
22 | 
23 |         self.estimator = _LinearSVC(**out_params)
24 | 
25 |     @staticmethod
26 |     def register_codecs():
27 |         from codec.codecs import SimpleObjectCodec
28 |         codecs_manager.add_codec('algos_contrib.LinearSVC', 'LinearSVC', SimpleObjectCodec)
29 |         codecs_manager.add_codec('sklearn.svm.classes', 'LinearSVC', SimpleObjectCodec)
30 | 


--------------------------------------------------------------------------------
/src/bin/link_mltk.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ Small utility to add the MLTK bin path to the system path.
 3 | This makes it easy to import algorithms or utilities from the MLTK."""
 4 | import os
 5 | import sys
 6 | 
 7 | 
 8 | def check_splunk_home(splunk_home):
 9 |     """ Check SPLUNK_HOME and raise if not set."""
10 |     if not splunk_home:
11 |         raise RuntimeError('No $SPLUNK_HOME provided. Please set SPLUNK_HOME.')
12 | 
13 | 
14 | def get_mltk_bin_path(splunk_home):
15 |     """ Create the path to the MLTK bin folder."""
16 |     check_splunk_home(splunk_home)
17 |     mltk_path = os.path.join(splunk_home, 'etc', 'apps', 'Splunk_ML_Toolkit', 'bin')
18 | 
19 |     if not os.path.exists(mltk_path):
20 |         raise RuntimeError('MLTK bin folder not found at {}: is MLTK installed?'.format(mltk_path))
21 | 
22 |     return mltk_path
23 | 
24 | 
25 | def add_mltk():
26 |     """ Adds MLTK bin path to sys.path """
27 |     splunk_home = os.environ.get('SPLUNK_HOME', None)
28 |     mltk_bin_path = get_mltk_bin_path(splunk_home)
29 |     sys.path.insert(0, mltk_bin_path)
30 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py27
 3 | skipsdist = True
 4 | skip_install = True
 5 | tox_pip_extensions_ext_venv_update = true
 6 | 
 7 | [testenv]
 8 | passenv =
 9 |     SPLUNK_HOME
10 | setenv =
11 |     PYTHONPATH = {env:SPLUNK_HOME}/etc/apps/Splunk_ML_Toolkit/bin
12 |     APP_NAME = {env:APP_NAME:SA_mltk_contrib_app}
13 |     BUILD_DIR = {toxinidir}/target
14 | deps = -r{toxinidir}/requirements_1.2.txt
15 | commands = pytest {posargs}
16 | 
17 | [testenv:package-macos]
18 | platform = darwin
19 | deps =
20 | changedir = {env:BUILD_DIR}
21 | whitelist_externals = /bin/bash
22 | commands =
23 |     /bin/bash -c 'tar -C {toxinidir} -s ",^src/,{env:APP_NAME}/," -cvzf {env:APP_NAME}.tgz src/\{bin,default,metadata\}'
24 | 
25 | [testenv:package-linux]
26 | platform = linux
27 | deps =
28 | changedir = {env:BUILD_DIR}
29 | whitelist_externals = /bin/bash
30 | commands =
31 |     /bin/bash -c 'tar -C {toxinidir} --transform="s,^src/,{env:APP_NAME}/," -cvzf {env:APP_NAME}.tgz src/\{bin,default,metadata\}'
32 | 
33 | [testenv:clean]
34 | deps =
35 | whitelist_externals = /bin/rm
36 | commands =
37 |     /bin/rm -rf {env:BUILD_DIR}
38 | 
39 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/NMF.py:
--------------------------------------------------------------------------------
 1 | from sklearn.decomposition import NMF as _NMF
 2 | from base import BaseAlgo, TransformerMixin
 3 | from codec import codecs_manager
 4 | from util.param_util import convert_params
 5 | 
 6 | class NMF(TransformerMixin, BaseAlgo):
 7 | 
 8 |     def __init__(self, options):
 9 |         self.handle_options(options)
10 |         out_params = convert_params(
11 |             options.get('params', {}),
12 |             floats=['beta_loss','tol','alpha','l1_ratio'],
13 |             strs=['init','solver'],
14 |             ints=['k','max_iter','random_state'],
15 |             bools=['versbose','shuffle'],
16 |             aliases={'k': 'n_components'}
17 |         )
18 | 
19 |         self.estimator = _NMF(**out_params)
20 | 
21 |     def rename_output(self, default_names, new_names):
22 |         if new_names is None:
23 |             new_names = 'NMF'
24 |         output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
25 |         return output_names
26 | 
27 |     @staticmethod
28 |     def register_codecs():
29 |         from codec.codecs import SimpleObjectCodec
30 |         codecs_manager.add_codec('algos_contrib.NMF', 'NMF', SimpleObjectCodec)
31 |         codecs_manager.add_codec('sklearn.decomposition.nmf', 'NMF', SimpleObjectCodec)
32 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/TruncatedSVD.py:
--------------------------------------------------------------------------------
 1 | from sklearn.decomposition import TruncatedSVD as _TruncatedSVD
 2 | from base import BaseAlgo, TransformerMixin
 3 | from codec import codecs_manager
 4 | from util.param_util import convert_params
 5 | 
 6 | class TruncatedSVD(TransformerMixin, BaseAlgo):
 7 | 
 8 |     def __init__(self, options):
 9 |         self.handle_options(options)
10 |         out_params = convert_params(
11 |             options.get('params', {}),
12 |             floats=['tol'],
13 |             strs=['algorithm'],
14 |             ints=['k','n_iter','random_state'],
15 |             aliases={'k': 'n_components'}
16 |         )
17 | 
18 |         self.estimator = _TruncatedSVD(**out_params)
19 | 
20 |     def rename_output(self, default_names, new_names):
21 |         if new_names is None:
22 |             new_names = 'SVD'
23 |         output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
24 |         return output_names
25 | 
26 |     @staticmethod
27 |     def register_codecs():
28 |         from codec.codecs import SimpleObjectCodec
29 |         codecs_manager.add_codec('algos_contrib.TruncatedSVD', 'TruncatedSVD', SimpleObjectCodec)
30 |         codecs_manager.add_codec('sklearn.decomposition.truncated_svd', 'TruncatedSVD', SimpleObjectCodec)
31 | 


--------------------------------------------------------------------------------
/src/default/algos.conf:
--------------------------------------------------------------------------------
 1 | # Here is where algorithms are registered.
 2 | [default]
 3 | 
 4 | ########################################################################
 5 | # Due to the layering of configuration files in Splunk, we have to
 6 | # override the package name in every section.
 7 | ########################################################################
 8 | 
 9 | 
10 | [AgglomerativeClustering]
11 | package=algos_contrib
12 | 
13 | [CorrelationMatrix]
14 | package=algos_contrib
15 | 
16 | [ExampleAlgo]
17 | package=algos_contrib
18 | 
19 | [SVR]
20 | package=algos_contrib
21 | 
22 | [SavgolFilter]
23 | package=algos_contrib
24 | 
25 | [TSNE]
26 | package=algos_contrib
27 | 
28 | [MDS]
29 | package=algos_contrib
30 | 
31 | [OrthogonalMatchingPursuit]
32 | package=algos_contrib
33 | 
34 | [TruncatedSVD]
35 | package=algos_contrib
36 | 
37 | [LatentDirichletAllocation]
38 | package=algos_contrib
39 | 
40 | [NMF]
41 | package=algos_contrib
42 | 
43 | [CollaborativeFilter]
44 | package=algos_contrib
45 | 
46 | [CustomDecisionTreeClassifier]
47 | package=algos_contrib
48 | 
49 | [TFBinary]
50 | package = algos_contrib
51 | 
52 | [MinMaxScaler]
53 | package = algos_contrib
54 | 
55 | [LinearSVC]
56 | package = algos_contrib
57 | 
58 | [ExtraTreesClassifier]
59 | package = algos_contrib
60 | 
61 | [IsolationForest]
62 | package = algos_contrib


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_tsne.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from algos_contrib.TSNE import TSNE
 3 | from test.contrib_util import AlgoTestUtils
 4 | 
 5 | algo_options = {'feature_variables': ['Review']}
 6 | 
 7 | 
 8 | def test_algo():
 9 |     AlgoTestUtils.assert_algo_basic(TSNE, serializable=False)
10 | 
11 | 
12 | def test_valid_params():
13 |     algo_options['params'] = {'k': '1'}
14 |     TSNE_algo = TSNE(algo_options)
15 |     assert TSNE_algo.estimator.n_components == 1
16 | 
17 | 
18 | def test_invalid_params_k_not_int():
19 |     algo_options['params'] = {'k': '0.1'}
20 |     with pytest.raises((RuntimeError, ValueError)) as excinfo:
21 |         _ = TSNE(algo_options)
22 |         assert excinfo.match('Invalid value for k: must be an int')
23 | 
24 | 
25 | def test_invalid_params_k_not_valid():
26 |     algo_options['params'] = {'k': '0'}
27 |     with pytest.raises((RuntimeError, ValueError)) as excinfo:
28 |         _ = TSNE(algo_options)
29 |         assert excinfo.match('Invalid value for k: k must be greater than or equal to 1')
30 | 
31 | 
32 | def test_default_parameter_values():
33 |     algo_options['params'] = {'k': '1'}
34 |     TSNE_algo = TSNE(algo_options)
35 |     assert TSNE_algo.estimator.n_iter == 200
36 |     assert TSNE_algo.estimator.perplexity == 30.0
37 |     assert TSNE_algo.estimator.early_exaggeration == 4.0
38 |     assert TSNE_algo.estimator.learning_rate == 100
39 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # CONTRIBUTING
 2 | 
 3 | By submitting a Contribution to this Work, You agree that Your Contribution is made subject to the primary LICENSE 
 4 | file applicable to this Work.  In addition, You represent that: (i) You are the copyright owner of the Contribution 
 5 | or (ii) You have the requisite rights to make the Contribution.
 6 | 
 7 | ## Definitions:
 8 | 
 9 | “You” shall mean: (i) yourself if you are making a Contribution on your own behalf; or (ii) your company, 
10 | if you are making a Contribution on behalf of your company.  If you are making a Contribution on behalf of your 
11 | company, you represent that you have the requisite authority to do so.    
12 |  
13 | "Contribution" shall mean any original work of authorship, including any modifications or additions to an existing 
14 | work, that is intentionally submitted by You for inclusion in, or documentation of, this project/repository.  For the 
15 | purposes of this definition, "submitted" means any form of electronic, verbal, or written communication submitted for 
16 | inclusion in this project/repository, including but not limited to communication on electronic mailing lists, source 
17 | code control systems, and issue tracking systems that are managed by, or on behalf of, the maintainers of 
18 | the project/repository. 
19 | 
20 | “Work” shall mean the collective software, content, and documentation in this project/repository.
21 | 
22 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_svr.py:
--------------------------------------------------------------------------------
 1 | from algos_contrib.SVR import SVR
 2 | from test.contrib_util import AlgoTestUtils
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | 
 8 | def test_algo_basic():
 9 |     input_df = pd.DataFrame({
10 |         'a': [1, 2, 3],
11 |         'b': [4, 5, 6],
12 |         'c': ['a', 'b', 'c'],
13 |     })
14 |     options = {
15 |         'target_variable': ['a'],
16 |         'feature_variables': ['b', 'c'],
17 |     }
18 |     required_methods = (
19 |         '__init__',
20 |         'fit',
21 |         'partial_fit',
22 |         'apply',
23 |         'summary',
24 |         'register_codecs',
25 |     )
26 |     AlgoTestUtils.assert_algo_basic(SVR, required_methods, input_df, options)
27 | 
28 | 
29 | def test_prediction():
30 |     training_df = pd.DataFrame({
31 |         'y': [1, 2, 3],
32 |         'x1': [4, 5, 6],
33 |         'x2': [7, 8, 9],
34 |     })
35 |     options = {
36 |         'target_variable': ['y'],
37 |         'feature_variables': ['x1', 'x2'],
38 |     }
39 |     test_df = pd.DataFrame({
40 |         'x1': [4],
41 |         'x2': [7],
42 |     })
43 | 
44 |     svr = SVR(options)
45 |     svr.feature_variables = options['feature_variables']
46 |     svr.target_variable = options['target_variable'][0]
47 |     svr.fit(training_df, options)
48 |     output = svr.apply(test_df, options)
49 |     np.testing.assert_approx_equal(output['predicted(y)'].values, np.array([1.1]))
50 | 
51 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/AdaBoostRegressor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from pandas import DataFrame
 4 | from sklearn.ensemble import AdaBoostRegressor as _AdaBoostRegressor
 5 | 
 6 | from base import RegressorMixin, BaseAlgo
 7 | from util.param_util import convert_params
 8 | from util.algo_util import handle_max_features
 9 | from codec import codecs_manager
10 | 
11 | 
12 | class AdaBoostRegressor(RegressorMixin, BaseAlgo):
13 |     def __init__(self, options):
14 |         self.handle_options(options)
15 |         params = options.get('params', {})
16 |         out_params = convert_params(
17 |             params,
18 |             strs=['loss', 'max_features'],
19 |             floats=['learning_rate'],
20 |             ints=['n_estimators'],
21 |         )
22 | 
23 |         self.estimator = _AdaBoostRegressor(**out_params)
24 | 
25 | 
26 |     @staticmethod
27 |     def register_codecs():
28 |         from codec.codecs import SimpleObjectCodec, TreeCodec
29 | 
30 |         codecs_manager.add_codec('algos.AdaBoostRegressor', 'AdaBoostRegressor', SimpleObjectCodec)
31 |         codecs_manager.add_codec('sklearn.ensemble.classes', 'AdaBoostRegressor', SimpleObjectCodec)
32 |         codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeRegressor', SimpleObjectCodec)
33 |         codecs_manager.add_codec('sklearn.ensemble.weight_boosting', 'AdaBoostRegressor', SimpleObjectCodec)
34 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
35 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/ExtraTreesRegressor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from pandas import DataFrame
 4 | from sklearn.ensemble import ExtraTreesRegressor as _ExtraTreesRegressor
 5 | 
 6 | from base import RegressorMixin, BaseAlgo
 7 | from util.param_util import convert_params
 8 | from util.algo_util import handle_max_features
 9 | from codec import codecs_manager
10 | 
11 | 
12 | class ExtraTreesRegressor(RegressorMixin, BaseAlgo):
13 |     def __init__(self, options):
14 |         self.handle_options(options)
15 |         params = options.get('params', {})
16 |         out_params = convert_params(
17 |             params,
18 |             floats=['max_samples', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'min_impurity_split'],
19 |             bools=['bootstrap', 'oob_score', 'warm_start'],
20 |             ints=['n_estimators', 'max_depth', 'max_leaf_nodes', 'min_impurity_decrease'],
21 |             strs=['criterion'],
22 |         )
23 | 
24 |         self.estimator = _ExtraTreesRegressor(**out_params)
25 | 
26 | 
27 |     @staticmethod
28 |     def register_codecs():
29 |         from codec.codecs import SimpleObjectCodec, TreeCodec
30 | 
31 |         codecs_manager.add_codec('algos.ExtraTreesRegressor', 'ExtraTreesRegressor', SimpleObjectCodec)
32 |         codecs_manager.add_codec('sklearn.ensemble.forest', 'ExtraTreesRegressor', SimpleObjectCodec)
33 |         codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeRegressor', SimpleObjectCodec)
34 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
35 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/SavgolFilter.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from scipy.signal import savgol_filter
 3 | 
 4 | from base import BaseAlgo
 5 | from util.param_util import convert_params
 6 | from util import df_util
 7 | 
 8 | 
 9 | class SavgolFilter(BaseAlgo):
10 | 
11 |     def __init__(self, options):
12 |         # set parameters
13 |         params = options.get('params', {})
14 |         out_params = convert_params(
15 |             params,
16 |             ints=['window_length', 'polyorder', 'deriv']
17 |         )
18 | 
19 |         # set defaults for parameters
20 |         if 'window_length' in out_params:
21 |             self.window_length = out_params['window_length']
22 |         else:
23 |             self.window_length = 5
24 | 
25 |         if 'polyorder' in out_params:
26 |             self.polyorder = out_params['polyorder']
27 |         else:
28 |             self.polyorder = 2
29 | 
30 |         if 'deriv' in out_params:
31 |             self.deriv = out_params['deriv']
32 |         else:
33 |             self.deriv = 0
34 | 
35 |     def fit(self, df, options):
36 |         X = df.copy()
37 |         X, nans, columns = df_util.prepare_features(X, self.feature_variables)
38 | 
39 |         def f(x):
40 |             return savgol_filter(x, self.window_length, self.polyorder, self.deriv)
41 | 
42 |         y_hat = np.apply_along_axis(f, 0, X)
43 | 
44 |         names = ['SG_%s' % col for col in columns]
45 |         output_df = df_util.create_output_dataframe(y_hat, nans, names)
46 |         df = df_util.merge_predictions(df, output_df)
47 | 
48 |         return df


--------------------------------------------------------------------------------
/src/bin/algos_contrib/BaggingRegressor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from pandas import DataFrame
 4 | from sklearn.ensemble import BaggingRegressor as _BaggingRegressor
 5 | 
 6 | from base import RegressorMixin, BaseAlgo
 7 | from util.param_util import convert_params
 8 | from util.algo_util import handle_max_features
 9 | from codec import codecs_manager
10 | 
11 | 
12 | class BaggingRegressor(RegressorMixin, BaseAlgo):
13 |     def __init__(self, options):
14 |         self.handle_options(options)
15 |         params = options.get('params', {})
16 |         out_params = convert_params(
17 |             params,
18 |             floats=['max_samples', 'max_features'],
19 |             bools=['bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'],
20 |             ints=['n_estimators'],
21 |         )
22 | 
23 |         self.estimator = _BaggingRegressor(**out_params)
24 |         
25 | 
26 |     @staticmethod
27 |     def register_codecs():
28 |         from codec.codecs import SimpleObjectCodec, TreeCodec
29 | 
30 |         codecs_manager.add_codec('algos.BaggingRegressor', 'BaggingRegressor', SimpleObjectCodec)
31 |         codecs_manager.add_codec('sklearn.ensemble.classes', 'BaggingRegressor', SimpleObjectCodec)
32 |         codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeRegressor', SimpleObjectCodec)
33 |         codecs_manager.add_codec('sklearn.ensemble.weight_boosting', 'BaggingRegressor', SimpleObjectCodec)
34 |         codecs_manager.add_codec('sklearn.ensemble.bagging', 'BaggingRegressor', SimpleObjectCodec)
35 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
36 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/QuantileTransformer.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | 
 4 | import pandas as pd
 5 | from sklearn.preprocessing import QuantileTransformer as _QuantileTransformer
 6 | 
 7 | from base import BaseAlgo, TransformerMixin
 8 | from codec import codecs_manager
 9 | from util.param_util import convert_params
10 | from util import df_util
11 | 
12 | 
13 | class QuantileTransformer(TransformerMixin, BaseAlgo):
14 | 
15 |     def __init__(self, options):
16 |         self.handle_options(options)
17 | 
18 |         out_params = convert_params(
19 |             options.get('params', {}),
20 |             bools=['copy'],
21 |             ints=['n_quantiles'],
22 |             strs=['output_distribution']
23 |         )
24 |         self.estimator = _QuantileTransformer(**out_params)
25 |         self.columns = None
26 | 
27 |     def rename_output(self, default_names, new_names=None):
28 |         if new_names is None:
29 |             new_names = 'QT'
30 |         output_names = [new_names + '_' + feature for feature in self.columns]
31 |         return output_names
32 | 
33 |     def summary(self, options):
34 |         if len(options) != 2:  # only model name and mlspl_limits
35 |             raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
36 |         return pd.DataFrame({'fields': self.columns})
37 | 
38 |     @staticmethod
39 |     def register_codecs():
40 |         from codec.codecs import SimpleObjectCodec
41 |         codecs_manager.add_codec('algos.QuantileTransformer', 'QuantileTransformer', SimpleObjectCodec)
42 |         codecs_manager.add_codec('sklearn.preprocessing.data', 'QuantileTransformer', SimpleObjectCodec)
43 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/LatentDirichletAllocation.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Once newer version of sklearn is used will need to change k alias from n_topics to n_components
 3 | https://stackoverflow.com/a/48121678
 4 | '''
 5 | 
 6 | from sklearn.decomposition import LatentDirichletAllocation as _LatentDirichletAllocation
 7 | from base import BaseAlgo, TransformerMixin
 8 | from codec import codecs_manager
 9 | from util.param_util import convert_params
10 | 
11 | class LatentDirichletAllocation(TransformerMixin, BaseAlgo):
12 | 
13 |     def __init__(self, options):
14 |         self.handle_options(options)
15 |         out_params = convert_params(
16 |             options.get('params', {}),
17 |             floats=['doc_topic_prior','learning_decay','learning_offset','perp_tol','mean_change_tol'],
18 |             strs=['learning_method'],
19 |             ints=['k','max_iter','batch_size','evaluate_every','total_samples','max_doc_update_iter','n_jobs','verbose','random_state'],
20 |             aliases={'k': 'n_topics'}
21 |         )
22 | 
23 |         self.estimator = _LatentDirichletAllocation(**out_params)
24 | 
25 |     def rename_output(self, default_names, new_names):
26 |         if new_names is None:
27 |             new_names = 'LDA'
28 |         output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
29 |         return output_names
30 | 
31 |     @staticmethod
32 |     def register_codecs():
33 |         from codec.codecs import SimpleObjectCodec
34 |         codecs_manager.add_codec('algos_contrib.LatentDirichletAllocation', 'LatentDirichletAllocation', SimpleObjectCodec)
35 |         codecs_manager.add_codec('sklearn.decomposition.online_lda', 'LatentDirichletAllocation', SimpleObjectCodec)
36 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/OrthogonalMatchingPursuit.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from sklearn.linear_model import OrthogonalMatchingPursuit as _OrthogonalMatchingPursuit
 3 | from base import RegressorMixin, BaseAlgo
 4 | from util.param_util import convert_params
 5 | from util import df_util
 6 | 
 7 | 
 8 | class OrthogonalMatchingPursuit(RegressorMixin, BaseAlgo):
 9 |     def __init__(self, options):
10 |         self.handle_options(options)
11 | 
12 |         params = options.get('params', {})
13 |         out_params = convert_params(
14 |             params,
15 |             floats=['tol'],
16 |             strs=['kernel'],
17 |             ints=['n_nonzero_coefs'],
18 |             bools=['fit_intercept', 'normalize'],
19 |         )
20 | 
21 |         self.estimator = _OrthogonalMatchingPursuit(**out_params)
22 | 
23 |     def summary(self, options):
24 |         if len(options) != 2:  # only model name and mlspl_limits
25 |             raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
26 |         df = pd.DataFrame({'feature': self.columns,
27 |                            'coefficient': self.estimator.coef_.ravel()})
28 |         idf = pd.DataFrame({'feature': ['_intercept'],
29 |                             'coefficient': [self.estimator.intercept_]})
30 |         return pd.concat([df, idf])
31 | 
32 |     @staticmethod
33 |     def register_codecs():
34 |         from codec.codecs import SimpleObjectCodec
35 |         from codec import codecs_manager
36 |         codecs_manager.add_codec('algos_contrib.OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuit', SimpleObjectCodec)
37 |         codecs_manager.add_codec('sklearn.linear_model.omp', 'OrthogonalMatchingPursuit', SimpleObjectCodec)
38 | 
39 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/ExtraTreesClassifier.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from pandas import DataFrame
 4 | from sklearn.ensemble import ExtraTreesClassifier as _ExtraTreesClassifier
 5 | 
 6 | from base import ClassifierMixin, BaseAlgo
 7 | from codec import codecs_manager
 8 | from util.param_util import convert_params
 9 | from util.algo_util import handle_max_features
10 | 
11 | 
12 | class ExtraTreesClassifier(ClassifierMixin, BaseAlgo):
13 | 
14 |     def __init__(self, options):
15 |         self.handle_options(options)
16 | 
17 |         out_params = convert_params(
18 |             options.get('params', {}),
19 |             ints=['random_state', 'n_estimators', 'max_depth',
20 |                   'min_samples_split', 'max_leaf_nodes'],
21 |             strs=['max_features', 'criterion'],
22 |         )
23 | 
24 |         if 'max_depth' not in out_params:
25 |             out_params.setdefault('max_leaf_nodes', 2000)
26 | 
27 |         if 'max_features' in out_params:
28 |             out_params['max_features'] = handle_max_features(out_params['max_features'])
29 | 
30 |         self.estimator = _ExtraTreesClassifier(class_weight='balanced',
31 |                                                  **out_params)
32 | 
33 |     def summary(self, options):
34 |         if len(options) != 2:  # only model name and mlspl_limits
35 |             raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
36 |         df = DataFrame({
37 |             'feature': self.columns,
38 |             'importance': self.estimator.feature_importances_.ravel()
39 |         })
40 |         return df
41 | 
42 |     @staticmethod
43 |     def register_codecs():
44 |         from codec.codecs import SimpleObjectCodec, TreeCodec
45 |         codecs_manager.add_codec('algos_contrib.ExtraTreesClassifier',
46 |                                  'ExtraTreesClassifier', SimpleObjectCodec)
47 |         codecs_manager.add_codec('sklearn.ensemble.forest',
48 |                                  'ExtraTreesClassifier', SimpleObjectCodec)
49 |         codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeClassifier',
50 |                                  SimpleObjectCodec)
51 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
52 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/CorrelationMatrix.py:
--------------------------------------------------------------------------------
 1 | from base import BaseAlgo
 2 | 
 3 | 
 4 | class CorrelationMatrix(BaseAlgo):
 5 |     """Compute and return a correlation matrix."""
 6 | 
 7 |     def __init__(self, options):
 8 |         """Check for valid correlation type, and save it to an attribute on self."""
 9 | 
10 |         feature_variables = options.get('feature_variables', {})
11 |         target_variable = options.get('target_variable', {})
12 | 
13 |         if len(feature_variables) == 0:
14 |             raise RuntimeError('You must supply one or more fields')
15 | 
16 |         if len(target_variable) > 0:
17 |             raise RuntimeError('CorrelationMatrix does not support the from clause')
18 | 
19 |         valid_methods = ['spearman', 'kendall', 'pearson']
20 | 
21 |         # Check to see if parameters exist
22 |         params = options.get('params', {})
23 | 
24 |         # Check if method is in parameters in search
25 |         if 'method' in params:
26 |             if params['method'] not in valid_methods:
27 |                 error_msg = 'Invalid value for method: must be one of {}'.format(
28 |                     ', '.join(valid_methods))
29 |                 raise RuntimeError(error_msg)
30 | 
31 |             # Assign method to self for later usage
32 |             self.method = params['method']
33 | 
34 |         # Assign default method and ensure no other parameters are present
35 |         else:
36 |             # Default method for correlation
37 |             self.method = 'pearson'
38 | 
39 |             # Check for bad parameters
40 |             if len(params) > 0:
41 |                 raise RuntimeError('The only valid parameter is method.')
42 | 
43 |     def fit(self, df, options):
44 |         """Compute the correlations and return a DataFrame."""
45 | 
46 |         # df contains all the search results, including hidden fields
47 |         # but the requested requested are saved as self.feature_variables
48 |         requested_columns = df[self.feature_variables]
49 | 
50 |         # Get correlations
51 |         correlations = requested_columns.corr(method=self.method)
52 | 
53 |         # Reset index so that all the data are in columns
54 |         # (this is necessary for the corr method)
55 |         output_df = correlations.reset_index()
56 | 
57 |         return output_df
58 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/MinMaxScaler.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import pandas as pd
 4 | from sklearn.preprocessing import MinMaxScaler as _MinMaxScaler
 5 | 
 6 | from base import BaseAlgo, TransformerMixin
 7 | from codec import codecs_manager
 8 | from util.param_util import convert_params
 9 | from util import df_util
10 | 
11 | 
12 | class MinMaxScaler(TransformerMixin, BaseAlgo):
13 | 
14 |     def __init__(self, options):
15 |         self.handle_options(options)
16 | 
17 |         out_params = convert_params(
18 |             options.get('params', {}),
19 |             bools=['copy'],
20 |             strs=['feature_range']
21 |         )
22 |         self.estimator = _MinMaxScaler(**out_params)
23 |         self.columns = None
24 | 
25 |     def rename_output(self, default_names, new_names=None):
26 |         if new_names is None:
27 |             new_names = 'MMS'
28 |         output_names = [new_names + '_' + feature for feature in self.columns]
29 |         return output_names
30 | 
31 |     def partial_fit(self, df, options):
32 |         # Make a copy of data, to not alter original dataframe
33 |         X = df.copy()
34 | 
35 |         X, _, columns = df_util.prepare_features(
36 |             X=X,
37 |             variables=self.feature_variables,
38 |             mlspl_limits=options.get('mlspl_limits'),
39 |         )
40 |         if self.columns is not None:
41 |             df_util.handle_new_categorical_values(X, None, options, self.columns)
42 |             if X.empty:
43 |                 return
44 |         else:
45 |             self.columns = columns
46 |         self.estimator.partial_fit(X)
47 | 
48 |     def summary(self, options):
49 |         if len(options) != 2:  # only model name and mlspl_limits
50 |             raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
51 |         return pd.DataFrame({'fields': self.columns,
52 |                              'mean': self.estimator.mean_,
53 |                              'var': self.estimator.var_,
54 |                              'scale': self.estimator.scale_})
55 | 
56 |     @staticmethod
57 |     def register_codecs():
58 |         from codec.codecs import SimpleObjectCodec
59 |         codecs_manager.add_codec('algos_contrib.MinMaxScaler', 'MinMaxScaler', SimpleObjectCodec)
60 |         codecs_manager.add_codec('sklearn.preprocessing.data', 'MinMaxScaler', SimpleObjectCodec)
61 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/MDS.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from sklearn.manifold import MDS as _MDS
 4 | 
 5 | from base import BaseAlgo, TransformerMixin
 6 | from codec import codecs_manager
 7 | from util.param_util import convert_params
 8 | 
 9 | from util import df_util
10 | 
11 | class MDS(TransformerMixin, BaseAlgo):
12 | 
13 |     def __init__(self, options):
14 |         self.handle_options(options)
15 |         out_params = convert_params(
16 |             options.get('params', {}),
17 |             ints=['k', 'max_iter', 'n_init', 'n_jobs'],
18 |             floats=['eps'],
19 |             bools=['metric'],
20 |             aliases={'k': 'n_components'}
21 |         )
22 | 
23 |         if 'max_iter' not in out_params:
24 |             out_params.setdefault('max_iter', 300)
25 | 
26 |         if 'n_init' not in out_params:
27 |             out_params.setdefault('n_init', 4)
28 | 
29 |         if 'n_jobs' not in out_params:
30 |             out_params.setdefault('n_jobs', 1)
31 | 
32 |         if 'eps' not in out_params:
33 |             out_params.setdefault('eps', 0.001)
34 | 
35 |         if 'metric' not in out_params:
36 |             out_params.setdefault('metric', True)
37 | 
38 |         self.estimator = _MDS(**out_params)
39 | 
40 |     def rename_output(self, default_names, new_names):
41 |         if new_names is None:
42 |             new_names = 'MDS'
43 |         output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
44 |         return output_names
45 | 
46 |     def apply(self, df, options):
47 |         # Make a copy of data, to not alter original dataframe
48 |         X = df.copy()
49 | 
50 |         # Prepare the features
51 |         X, nans, _ = df_util.prepare_features(
52 |             X=X,
53 |             variables=self.feature_variables,
54 |             final_columns=self.columns,
55 |         )
56 | 
57 |         # Call the transform method
58 |         y_hat = self.estimator.fit_transform(X.values)
59 | 
60 |         # Assign output_name
61 |         output_name = options.get('output_name', None)
62 |         default_names = self.make_output_names(
63 |             output_name=output_name,
64 |             n_names=y_hat.shape[1],
65 |         )
66 |         output_names = self.rename_output(default_names, output_name)
67 | 
68 |         # Create output dataframe
69 |         output = df_util.create_output_dataframe(
70 |             y_hat=y_hat,
71 |             nans=nans,
72 |             output_names=output_names,
73 |         )
74 | 
75 |         # Merge with original dataframe
76 |         output = df_util.merge_predictions(df, output)
77 |         return output
78 | 
79 |     @staticmethod
80 |     def register_codecs():
81 |         from codec.codecs import SimpleObjectCodec
82 |         codecs_manager.add_codec('algos_contrib.MDS', 'MDS', SimpleObjectCodec)
83 |         codecs_manager.add_codec('sklearn.manifold.MDS', 'MDS', SimpleObjectCodec)
84 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/TSNE.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from sklearn.manifold import TSNE as _TSNE
 4 | 
 5 | from base import BaseAlgo, TransformerMixin
 6 | from codec import codecs_manager
 7 | from util.param_util import convert_params
 8 | 
 9 | from util import df_util
10 | 
11 | class TSNE(TransformerMixin, BaseAlgo):
12 | 
13 |     def __init__(self, options):
14 |         self.handle_options(options)
15 |         out_params = convert_params(
16 |             options.get('params', {}),
17 |             ints=['k', 'n_iter'],
18 |             floats=['perplexity', 'early_exaggeration', 'learning_rate'],
19 |             aliases={'k': 'n_components'}
20 |         )
21 | 
22 |         if out_params['n_components'] < 1:
23 |             msg = 'Invalid value for k: k must be greater than or equal to 1, but found k="{}".'
24 |             raise RuntimeError(msg.format(out_params['n_components']))
25 | 
26 |         if 'n_iter' not in out_params:
27 |             out_params.setdefault('n_iter', 200)
28 | 
29 |         if 'perplexity' not in out_params:
30 |             out_params.setdefault('perplexity', 30.0)
31 | 
32 |         if 'early_exaggeration' not in out_params:
33 |             out_params.setdefault('early_exaggeration', 4.0)
34 | 
35 |         if 'learning_rate' not in out_params:
36 |             out_params.setdefault('learning_rate', 100)
37 | 
38 |         self.estimator = _TSNE(**out_params)
39 | 
40 |     def rename_output(self, default_names, new_names):
41 |         if new_names is None:
42 |             new_names = 'TSNE'
43 |         output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
44 |         return output_names
45 | 
46 |     def apply(self, df, options):
47 |         # Make a copy of data, to not alter original dataframe
48 |         X = df.copy()
49 | 
50 |         # Prepare the features
51 |         X, nans, _ = df_util.prepare_features(
52 |             X=X,
53 |             variables=self.feature_variables,
54 |             final_columns=self.columns,
55 |         )
56 | 
57 |         # Call the transform method
58 |         y_hat = self.estimator.fit_transform(X.values)
59 | 
60 |         # Assign output_name
61 |         output_name = options.get('output_name', None)
62 |         default_names = self.make_output_names(
63 |             output_name=output_name,
64 |             n_names=y_hat.shape[1],
65 |         )
66 |         output_names = self.rename_output(default_names, output_name)
67 | 
68 |         # Create output dataframe
69 |         output = df_util.create_output_dataframe(
70 |             y_hat=y_hat,
71 |             nans=nans,
72 |             output_names=output_names,
73 |         )
74 | 
75 |         # Merge with original dataframe
76 |         output = df_util.merge_predictions(df, output)
77 |         return output
78 | 
79 |     @staticmethod
80 |     def register_codecs():
81 |         from codec.codecs import SimpleObjectCodec
82 |         codecs_manager.add_codec('algos_contrib.TSNE', 'TSNE', SimpleObjectCodec)
83 |         codecs_manager.add_codec('sklearn.manifold.t_sne', 'TSNE', SimpleObjectCodec)
84 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/CustomDecisionTreeClassifier.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from sklearn.tree import DecisionTreeClassifier as _DecisionTreeClassifier
 4 | from base import ClassifierMixin, BaseAlgo
 5 | from codec import codecs_manager
 6 | from util.param_util import convert_params
 7 | from util.algo_util import tree_summary
 8 | 
 9 | #This algorithm is an updated version of DecisionTreecClassifier from MLTK and class weight parameter has been added to it
10 | 
11 | class CustomDecisionTreeClassifier(ClassifierMixin, BaseAlgo):
12 |     def __init__(self, options):
13 |         self.handle_options(options)
14 | 
15 |         out_params = convert_params(
16 |             options.get('params', {}),
17 |             ints=['random_state', 'max_depth', 'min_samples_split', 'max_leaf_nodes'],
18 |             strs=['criterion', 'splitter', 'max_features', 'class_weight'],
19 |         )
20 | 
21 |         # whitelist valid values for criterion, as error raised by sklearn for invalid values is uninformative
22 |         if 'criterion' in out_params:
23 |             try:
24 |                 assert (out_params['criterion'] in ['gini', 'entropy'])
25 |             except AssertionError:
26 |                 raise RuntimeError('Invalid value for option criterion: "%s"' % out_params['criterion'])
27 | 
28 |         # whitelist valid values for splitter, as error raised by sklearn for invalid values is uninformative
29 |         if 'splitter' in out_params:
30 |             try:
31 |                 assert (out_params['splitter'] in ['best', 'random'])
32 |             except AssertionError:
33 |                 raise RuntimeError('Invalid value for option splitter: "%s"' % out_params['splitter'])
34 | 
35 |         if 'max_depth' not in out_params:
36 |             out_params.setdefault('max_leaf_nodes', 2000)
37 | 
38 |         # EAFP... convert max_features to int or float if it is a number.
39 |         try:
40 |             out_params['max_features'] = float(out_params['max_features'])
41 |             max_features_int = int(out_params['max_features'])
42 |             if out_params['max_features'] == max_features_int:
43 |                 out_params['max_features'] = max_features_int
44 |         except:
45 |             pass
46 | 
47 |         if 'class_weight' in out_params:
48 |             try:
49 |                 from ast import literal_eval
50 |                 out_params['class_weight'] = literal_eval(out_params['class_weight'])
51 |             except Exception:
52 |                 raise RuntimeError('Invalid value for option class_weight: "%s"' % out_params['class_weight'])
53 | 
54 |         self.estimator = _DecisionTreeClassifier(**out_params)
55 | 
56 |     def summary(self, options):
57 |         if 'args' in options:
58 |             raise RuntimeError('Summarization does not take values other than parameters')
59 |         return tree_summary(self, options)
60 | 
61 |     @staticmethod
62 |     def register_codecs():
63 |         from codec.codecs import SimpleObjectCodec, TreeCodec
64 |         codecs_manager.add_codec('algos_contrib.CustomDecisionTreeClassifier', 'CustomDecisionTreeClassifier', SimpleObjectCodec)
65 |         codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeClassifier', SimpleObjectCodec)
66 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
67 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/AgglomerativeClustering.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.cluster import AgglomerativeClustering as AgClustering
 3 | from sklearn.metrics import silhouette_samples
 4 | 
 5 | from base import BaseAlgo
 6 | from util.param_util import convert_params
 7 | from util import df_util
 8 | 
 9 | 
10 | class AgglomerativeClustering(BaseAlgo):
11 |     """Use scikit-learn's AgglomerativeClustering algorithm to cluster data."""
12 | 
13 |     def __init__(self, options):
14 | 
15 |         feature_variables = options.get('feature_variables', {})
16 |         target_variable = options.get('target_variable', {})
17 | 
18 |         # Ensure fields are present
19 |         if len(feature_variables) == 0:
20 |             raise RuntimeError('You must supply one or more fields')
21 | 
22 |         # No from clause allowed
23 |         if len(target_variable) > 0:
24 |             raise RuntimeError('AgglomerativeClustering does not support the from clause')
25 | 
26 |         # Convert params & alias k to n_clusters
27 |         params = options.get('params', {})
28 |         out_params = convert_params(
29 |             params,
30 |             ints=['k'],
31 |             strs=['linkage', 'affinity'],
32 |             aliases={'k': 'n_clusters'}
33 |         )
34 | 
35 |         # Check for valid linkage
36 |         if 'linkage' in out_params:
37 |             valid_linkage = ['ward', 'complete', 'average']
38 |             if out_params['linkage'] not in valid_linkage:
39 |                 raise RuntimeError('linkage must be one of: {}'.format(', '.join(valid_linkage)))
40 | 
41 |         # Check for valid affinity
42 |         if 'affinity' in out_params:
43 |             valid_affinity = ['l1', 'l2', 'cosine', 'manhattan',
44 |                               'precomputed', 'euclidean']
45 | 
46 |             if out_params['affinity'] not in valid_affinity:
47 |                 raise RuntimeError('affinity must be one of: {}'.format(', '.join(valid_affinity)))
48 | 
49 |         # Check for invalid affinity & linkage combination
50 |         if 'linkage' in out_params and 'affinity' in out_params:
51 |             if out_params['linkage'] == 'ward':
52 |                 if out_params['affinity'] != 'euclidean':
53 |                     raise RuntimeError('ward linkage (default) must use euclidean affinity (default)')
54 | 
55 |         # Initialize the estimator
56 |         self.estimator = AgClustering(**out_params)
57 | 
58 |     def fit(self, df, options):
59 |         """Do the clustering & merge labels with original data."""
60 |         # Make a copy of the input data
61 |         X = df.copy()
62 | 
63 |         # Use the df_util prepare_features method to
64 |         # - drop null columns & rows
65 |         # - convert categorical columns into dummy indicator columns
66 |         # X is our cleaned data, nans is a mask of the null value locations
67 |         X, nans, columns = df_util.prepare_features(X, self.feature_variables)
68 | 
69 |         # Do the actual clustering
70 |         y_hat = self.estimator.fit_predict(X.values)
71 | 
72 |         # attach silhouette coefficient score for each row
73 |         silhouettes = silhouette_samples(X, y_hat)
74 | 
75 |         # Combine the two arrays, and transpose them.
76 |         y_hat = np.vstack([y_hat, silhouettes]).T
77 | 
78 |         # Assign default output names
79 |         default_name = 'cluster'
80 | 
81 |         # Get the value from the as-clause if present
82 |         output_name = options.get('output_name', default_name)
83 | 
84 |         # There are two columns - one for the labels, for the silhouette scores
85 |         output_names = [output_name, 'silhouette_score']
86 | 
87 |         # Use the predictions & nans-mask to create a new dataframe
88 |         output_df = df_util.create_output_dataframe(y_hat, nans, output_names)
89 | 
90 |         # Merge the dataframe with the original input data
91 |         df = df_util.merge_predictions(df, output_df)
92 |         return df
93 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/CollaborativeFilter.py:
--------------------------------------------------------------------------------
  1 |  
  2 | from base import BaseAlgo
  3 | import pandas as pd
  4 | import numpy as np
  5 | 
  6 | from sklearn.metrics.pairwise import pairwise_distances
  7 | from cexc import get_logger
  8 | from util import df_util
  9 | from util.param_util import convert_params
 10 | 
 11 | # Everyone's favorite in memory collaborative filter, not a scaleable solution for millions of users and millions of items
 12 | # https://en.wikipedia.org/wiki/Collaborative_filtering
 13 | # please check out more scaleable solutions in KNN or "Recommender Systems: The Textbook"
 14 | # TODO add coldstart solution  for nulls
 15 | # TODO currently we assume a |fillnull value=0 is run in splunk prior to calling the algorithm
 16 | 
 17 | # We ASSUME rows are users, columns are items. 
 18 | # TODO I seem to cause splunk memory issues with wide tables, so I should consider doing an XYSERIES like reshape 
 19 | # TODO and consider taking in a table of USERID, ITEM , RATING from splunk. Yucky.
 20 | 
 21 | # TODO There are many many many other distance metrics that could be a good fit.
 22 | 
 23 | 
 24 | class CollaborativeFilter(BaseAlgo):
 25 |     def __init__(self, options):
 26 | 
 27 | 
 28 |        # set parameters
 29 |         params = options.get('params', {})
 30 |         out_params = convert_params(
 31 |             params,
 32 |             strs=['user_field','rating_type','coldstart_field']
 33 |         )
 34 | 
 35 |         # set defaults for parameters
 36 |         if 'user_field' in out_params:
 37 |             self.user_field = out_params['user_field']
 38 |         else:
 39 |             self.user_field = "SME"
 40 | 
 41 |         self.rating_type="item"
 42 |         if 'rating_type' in out_params:
 43 |             if out_params['rating_type'] == "item":
 44 |                 self.rating_type="item"
 45 |             elif out_params['rating_type'] == "user":
 46 |                 self.rating_type="user"
 47 | 
 48 | 
 49 |     def fit(self, df, options):
 50 |         # df contains all the search results, including hidden fields
 51 |         # but the requested requested are saved as self.feature_variables
 52 |         logger = get_logger('MyCustomLogging')
 53 | 
 54 |         X=df.copy()
 55 | 
 56 |         # it is always best practice to prepare your data.
 57 |         # splunk has a number of hidden fields that are exposed as part of the search protocole, and we really only
 58 |         # want the features that are valid field names.
 59 | 
 60 | 
 61 |         #Make sure to turn off get_dummies
 62 |         X, _, self.columns = df_util.prepare_features(
 63 |             X=X,
 64 |             variables=self.feature_variables,
 65 |             get_dummies=False,
 66 |             mlspl_limits=options.get('mlspl_limits'),
 67 |         )
 68 | 
 69 |         # test if user field is in the list
 70 |         logger.debug("The user field is %s",self.user_field )
 71 |         try: 
 72 |             my_list_index=(X[self.user_field].values)
 73 |         except:
 74 |             raise RuntimeError('You must specify user field that exists. You sent %s',self.user_field)
 75 | 
 76 |         X=X.drop([self.user_field],axis=1)
 77 |         my_list_header=(X.columns.values)
 78 | 
 79 |         #ratings as a matrix , clean that data up!
 80 |         X=X.replace([np.inf, -np.inf], "nan").replace("nan","0")
 81 |         matrix=X.values
 82 |         # force type for Numpy Math
 83 |         matrix=matrix.astype(np.float64)
 84 | 
 85 |         # should consider erroring out when you have super sparse user data
 86 |         # TODO add other methods via parameter
 87 |         user_sim = pairwise_distances(matrix, metric='cosine')
 88 |         item_sim = pairwise_distances(matrix.T, metric='cosine')
 89 | 
 90 |         #item prediction
 91 |         item_sim= matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)])
 92 | 
 93 |         #user sim
 94 |         mean_user_rating = matrix.mean(axis=1)
 95 |         matrix_diff = (matrix - mean_user_rating[:, np.newaxis])
 96 |         user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T
 97 | 
 98 |         # add back into the matrix the header row
 99 |         if self.rating_type == "item":
100 |             output_df=pd.DataFrame(item_sim,columns=my_list_header, index=my_list_index)
101 |         if self.rating_type == "user":
102 |             output_df=pd.DataFrame(user_sim,columns=my_list_header, index=my_list_index)        
103 |         output_df[self.user_field]=pd.Series(my_list_index).values
104 | 
105 |         return output_df  
106 |         
107 | 
108 |  
109 | 
110 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/TFBinary.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''
  3 | Copy of existing TFIDF algo but with 2 boolean options added and 3 options set
  4 | so that binary output is achieved.
  5 | '''
  6 | 
  7 | from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizer
  8 | 
  9 | from base import BaseAlgo
 10 | from codec import codecs_manager
 11 | from util import df_util
 12 | from util.param_util import convert_params
 13 | 
 14 | 
 15 | class TFBinary(BaseAlgo):
 16 | 
 17 |     def handle_options(self, options):
 18 |         if len(options.get('feature_variables', [])) != 1 or len(options.get('target_variable', [])) > 0:
 19 |             raise RuntimeError('Syntax error: You must specify exactly one field')
 20 | 
 21 |     def __init__(self, options):
 22 |         self.handle_options(options)
 23 | 
 24 |         out_params = convert_params(
 25 |             options.get('params', {}),
 26 |             ints=['max_features'],
 27 |             bools=['use_idf','binary'],
 28 |             strs=['max_df', 'min_df',
 29 |                   'ngram_range', 'stop_words',
 30 |                   'analyzer', 'norm', 'token_pattern'],
 31 |         )
 32 | 
 33 |         for doc_freq, default_val in [('max_df', 1.0), ('min_df', 1)]:
 34 |             if doc_freq in out_params:
 35 |                 # EAFP... convert max_df/min_df to float/int if it is a number.
 36 |                 try:
 37 |                     float_val = float(out_params[doc_freq])
 38 |                     int_val = int(float_val)
 39 |                 except ValueError:
 40 |                     raise RuntimeError('Syntax Error: {doc_freq} requires a numeric value, e.g. {doc_freq}=1.0'.format(doc_freq=doc_freq))
 41 |                 if float_val == 1.0:
 42 |                     out_params[doc_freq] = default_val
 43 |                 elif float_val == int_val:
 44 |                     out_params[doc_freq] = int_val
 45 |                 else:
 46 |                     out_params[doc_freq] = float_val
 47 | 
 48 |         if 'ngram_range' in out_params.keys():
 49 |             try:
 50 |                 out_params['ngram_range'] = tuple(int(i) for i in out_params['ngram_range'].split('-'))
 51 |                 assert len(out_params['ngram_range']) == 2
 52 |             except:
 53 |                 raise RuntimeError('Syntax Error: ngram_range requires a range, e.g. ngram_range=1-5')
 54 | 
 55 |         # TODO: Maybe let the user know that we make this change.
 56 |         out_params.setdefault('max_features', 100)
 57 | 
 58 |         # Binary defaults
 59 |         out_params.setdefault('use_idf', False)
 60 |         out_params.setdefault('norm', None)
 61 |         out_params.setdefault('binary', True)
 62 | 
 63 |         self.estimator = _TfidfVectorizer(**out_params)
 64 | 
 65 |     def fit(self, df, options):
 66 |         # Make a copy of data, to not alter original dataframe
 67 |         X = df.copy()
 68 | 
 69 |         # Make sure to turn off get_dummies
 70 |         X, _, self.columns = df_util.prepare_features(
 71 |             X=X,
 72 |             variables=self.feature_variables,
 73 |             get_dummies=False,
 74 |             mlspl_limits=options.get('mlspl_limits'),
 75 |         )
 76 | 
 77 |         X = X.values.ravel().astype('str')
 78 |         self.estimator.fit(X)
 79 | 
 80 |     def make_output_names(self, options):
 81 |         default_name = self.feature_variables[0] + '_tfbin'
 82 |         output_name = options.get('output_name', default_name)
 83 |         feature_names = self.estimator.get_feature_names()
 84 |         output_names = [output_name + '_' + str(index) + '_' + word
 85 |                         for (index, word) in enumerate(feature_names)]
 86 |         return output_names
 87 | 
 88 |     def apply(self, df, options):
 89 |         # Make a copy of data, to not alter original dataframe
 90 |         X = df.copy()
 91 | 
 92 |         # Make sure to turn off get_dummies
 93 |         X, nans, _ = df_util.prepare_features(
 94 |             X=X,
 95 |             variables=self.feature_variables,
 96 |             final_columns=self.columns,
 97 |             get_dummies=False,
 98 |             mlspl_limits=options.get('mlspl_limits'),
 99 |         )
100 | 
101 |         X = X.values.ravel().astype('str')
102 |         y_hat = self.estimator.transform(X)
103 | 
104 |         # Convert the returned sparse matrix into array
105 |         y_hat = y_hat.toarray()
106 | 
107 |         output_names = self.make_output_names(options)
108 | 
109 |         output = df_util.create_output_dataframe(
110 |             y_hat=y_hat,
111 |             output_names=output_names,
112 |             nans=nans,
113 |         )
114 | 
115 |         df = df_util.merge_predictions(df, output)
116 |         return df
117 | 
118 |     @staticmethod
119 |     def register_codecs():
120 |         from codec.codecs import SimpleObjectCodec
121 |         codecs_manager.add_codec('algos_contrib.TFBinary', 'TFBinary', SimpleObjectCodec)
122 |         codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfVectorizer', SimpleObjectCodec)
123 |         codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfTransformer', SimpleObjectCodec)
124 |         codecs_manager.add_codec('scipy.sparse.dia', 'dia_matrix', SimpleObjectCodec)
125 | 


--------------------------------------------------------------------------------
/src/bin/test/test_contrib_util.py:
--------------------------------------------------------------------------------
  1 | import mock
  2 | import io
  3 | import pandas as pd
  4 | import pytest
  5 | import sys
  6 | 
  7 | from base import BaseAlgo
  8 | from util.base_util import MLSPLNotImplementedError
  9 | 
 10 | from contrib_util import AlgoTestUtils
 11 | 
 12 | 
 13 | @pytest.fixture
 14 | def min_algo_cls():
 15 |     class MinimalAlgo(BaseAlgo):
 16 |         pass
 17 |     return MinimalAlgo
 18 | 
 19 | 
 20 | @pytest.fixture
 21 | def serializable_algo_cls():
 22 |     class SerializableAlgo(BaseAlgo):
 23 |         def __init__(self, options):
 24 |             pass
 25 | 
 26 |         def fit(self, df, options):
 27 |             pass
 28 | 
 29 |         def apply(self, df, options):
 30 |             return df
 31 | 
 32 |         @classmethod
 33 |         def register_codecs(cls):
 34 |             from codec.codecs import SimpleObjectCodec
 35 |             from codec import codecs_manager
 36 |             codecs_manager.add_codec('test.test_contrib_util', 'SerializableAlgo', SimpleObjectCodec)
 37 | 
 38 |     # Add the class to this module so that encoder and decoder can access it.
 39 |     # This is only necessary for a fixture function.  Normally, these classes will be defined within a module.
 40 |     setattr(sys.modules[__name__], 'SerializableAlgo', SerializableAlgo)
 41 |     return SerializableAlgo
 42 | 
 43 | 
 44 | mock_algo_conf = """
 45 | [MinimalAlgo]
 46 | package=algos_contrib
 47 | """
 48 | 
 49 | 
 50 | mock_algo_conf_no_package = """
 51 | [MinimalAlgo]
 52 | """
 53 | 
 54 | 
 55 | def test_method_signature(min_algo_cls):
 56 |     AlgoTestUtils.assert_method_signature(min_algo_cls, 'fit', ['self', 'df', 'options'])
 57 | 
 58 | 
 59 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
 60 | def test_registered(mock_get_algos_conf_fp, min_algo_cls):
 61 |     AlgoTestUtils.assert_registered(min_algo_cls)
 62 | 
 63 | 
 64 | def test_serializable(serializable_algo_cls):
 65 |     AlgoTestUtils.assert_serializable(serializable_algo_cls, input_df=pd.DataFrame({}), options={})
 66 | 
 67 | 
 68 | def test_base_algo_method_signatures_default_methods(min_algo_cls):
 69 |     AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls)
 70 | 
 71 | 
 72 | def test_base_algo_method_signatures_all_methods(min_algo_cls):
 73 |     AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[
 74 |         '__init__',
 75 |         'fit',
 76 |         'partial_fit',
 77 |         'apply',
 78 |         'register_codecs',
 79 |     ])
 80 | 
 81 | 
 82 | def test_base_algo_method_signatures_extra_methods(min_algo_cls):
 83 |     with pytest.raises(AssertionError) as e:
 84 |         extra_args = [
 85 |             'extra1',
 86 |             'extra2',
 87 |         ]
 88 |         AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[
 89 |             '__init__',
 90 |             'fit',
 91 |             'partial_fit',
 92 |             'apply',
 93 |             'register_codecs',
 94 |         ] + extra_args)
 95 |     assert e.match('{}.*not in BaseAlgo'.format(extra_args))
 96 | 
 97 | 
 98 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
 99 | def test_algo_basic(mock_get_algos_conf_fp, min_algo_cls):
100 |     AlgoTestUtils.assert_algo_basic(min_algo_cls, serializable=False)
101 | 
102 | 
103 | def test_no_base_algo():
104 |     class NoBaseAlgo(object):
105 |         pass
106 | 
107 |     with pytest.raises(AssertionError) as e:
108 |         AlgoTestUtils.assert_base_algo_method_signatures(NoBaseAlgo)
109 |     assert e.match('must inherit from BaseAlgo')
110 | 
111 | 
112 | def test_method_signature_non_existent(min_algo_cls):
113 |     bad_method = 'foot'
114 |     with pytest.raises(AssertionError) as e:
115 |         AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options'])
116 |     e.match("{}.*does not exist".format(bad_method))
117 | 
118 | 
119 | def test_method_signature_not_callable(min_algo_cls):
120 |     bad_method = 'fit'
121 | 
122 |     # Make fit a property.
123 |     min_algo_cls.fit = 'fit'
124 | 
125 |     with pytest.raises(AssertionError) as e:
126 |         AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options'])
127 |     e.match("{}.*not callable".format(bad_method))
128 | 
129 | 
130 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
131 | def test_unregistered(mock_get_algos_conf_fp):
132 |     class UnregisteredAlgo(BaseAlgo):
133 |         pass
134 | 
135 |     with pytest.raises(AssertionError) as e:
136 |         AlgoTestUtils.assert_registered(UnregisteredAlgo)
137 |     assert e.match('{}.*not registered'.format(UnregisteredAlgo.__name__))
138 | 
139 | 
140 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf_no_package))
141 | def test_registered_with_missing_package_option(mock_get_algos_conf_fp, min_algo_cls):
142 |     with pytest.raises(AssertionError) as e:
143 |         AlgoTestUtils.assert_registered(min_algo_cls)
144 |     assert e.match('{}.*must override.*package'.format(min_algo_cls.__name__))
145 | 
146 | 
147 | def test_not_serializable(min_algo_cls):
148 |     with pytest.raises(MLSPLNotImplementedError) as e:
149 |         AlgoTestUtils.assert_serializable(min_algo_cls, input_df=pd.DataFrame({}), options={})
150 |     assert e.match('does not support saving')
151 | 
152 | 
153 | 


--------------------------------------------------------------------------------
/src/bin/algos_contrib/IsolationForest.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from sklearn.ensemble import IsolationForest as _IsolationForest
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | from base import ClustererMixin, BaseAlgo
  8 | from codec import codecs_manager
  9 | from codec.codecs import BaseCodec
 10 | from codec.flatten import flatten, expand
 11 | from util import df_util
 12 | from util.param_util import convert_params
 13 | from cexc import get_messages_logger,get_logger
 14 | 
 15 | class IsolationForest(ClustererMixin, BaseAlgo):
 16 |     """
 17 |     This is the implementation wrapper around Isolation Forest from scikit-learn. It inherits methods from ClustererMixin and BaseAlgo.
 18 |     """
 19 |     def __init__(self,options):
 20 |         self.handle_options(options)
 21 |         out_params = convert_params(
 22 |             options.get('params',{}),
 23 |             ints = ['n_estimators','n_jobs','random_state','verbose'],
 24 |             floats = ['max_samples','contamination','max_features'],
 25 |             bools = ['bootstrap']
 26 |             )
 27 |         self.return_scores = out_params.pop('anomaly_score', True)
 28 | 
 29 |         # whitelist n_estimators > 0
 30 |         if 'n_estimators' in out_params and out_params['n_estimators']<=0:
 31 |             msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".'
 32 |             raise RuntimeError(msg.format(out_params['n_estimators']))
 33 |         
 34 |         # whitelist max_samples > 0 and < 1
 35 |         if 'max_samples' in out_params and out_params['max_samples']<0 and out_params['max_samples']>1:
 36 |             msg = 'Invalid value error: max_samples must be greater than 0 and a float, but found max_samples="{}".'
 37 |             raise RuntimeError(msg.format(out_params['max_samples']))
 38 |         
 39 |         #   whitelist contamination should be in (0.0, 0.5] as error raised by sklearn for values out of range
 40 |         if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5):
 41 |             msg = (
 42 |                 'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
 43 |                 'but found contamination="{}".'
 44 |             )
 45 |             raise RuntimeError(msg.format(out_params['contamination']))
 46 | 
 47 |         # whitelist max_features > 0 and < 1
 48 |         if 'max_features' in out_params and out_params['max_features']<0 and out_params['max_features']>1:
 49 |             msg = 'Invalid value error: max_features must be greater than 0, but found max_features="{}".'
 50 |             raise RuntimeError(msg.format(out_params['max_features']))
 51 | 
 52 |         
 53 |         self.estimator = _IsolationForest(**out_params)    
 54 | 
 55 | 
 56 |     def apply(self, df, options):
 57 |         # Make a copy of data, to not alter original dataframe
 58 |         logger = get_logger('IsolationForest Logger')
 59 |         X = df.copy()
 60 | 
 61 |         X, nans, _ = df_util.prepare_features(
 62 |             X=X,
 63 |             variables=self.feature_variables,
 64 |             final_columns=self.columns,
 65 |             mlspl_limits=options.get('mlspl_limits'),
 66 |         )
 67 | 
 68 |         # Multiplying the result by -1 to represent Outliers with 1 and Inliers/Normal points with 1.
 69 |         y_hat = self.estimator.predict(X.values)*-1
 70 |         # Printing the accuracy for prediction of outliers
 71 |         accuracy = "Accuracy: {}".format(str(round((list(y_hat).count(-1)*100)/y_hat.shape[0], 2)))
 72 |         logger.debug(accuracy)
 73 |         
 74 |         y_hat = y_hat.astype('str')
 75 | 
 76 |         #Assign output_name
 77 |         default_name = 'isOutlier'
 78 |         new_name = options.get('output_name', None)
 79 |         output_name = self.rename_output(default_names=default_name, new_names=new_name)
 80 | 
 81 |         # Create output dataframe
 82 |         output = df_util.create_output_dataframe(
 83 |             y_hat=y_hat, nans=nans, output_names=output_name
 84 |         )
 85 |         # Merge with original dataframe
 86 |         output = df_util.merge_predictions(df, output)
 87 |         return output
 88 | 
 89 |     def rename_output(self, default_names, new_names=None):
 90 |         """Utility hook to rename output.
 91 | 
 92 |         The default behavior is to take the default_names passed in and simply
 93 |         return them. If however a particular algo needs to rename the columns of
 94 |         the output, this method can be overridden.
 95 |         """
 96 |         return new_names if new_names is not None else default_names    
 97 | 
 98 | 
 99 |     @staticmethod
100 |     def register_codecs():
101 |         from codec.codecs import SimpleObjectCodec, TreeCodec
102 |         codecs_manager.add_codec('algos.IsolationForest', 'IsolationForest', SimpleObjectCodec)
103 |         codecs_manager.add_codec('sklearn.ensemble.iforest', 'IsolationForest', SimpleObjectCodec)
104 |         codecs_manager.add_codec('sklearn.tree.tree','ExtraTreeRegressor', ExtraTreeRegressorCodec)
105 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
106 | 
107 | 
108 | class ExtraTreeRegressorCodec(BaseCodec):
109 |     """
110 |     This is an ExtraTreeRegressor Codec for saving the Isolation Forest base estimator to memory/file.
111 |     """
112 |     @classmethod
113 |     def encode(cls, obj):
114 |         import sklearn.tree
115 |         assert type(obj) == sklearn.tree.tree.ExtraTreeRegressor
116 |         state = obj.__getstate__()
117 |         return {
118 |             '__mlspl_type': [type(obj).__module__, type(obj).__name__],
119 |             'state': state
120 |         }
121 | 
122 |     @classmethod
123 |     def decode(cls,obj):
124 |         from sklearn.tree.tree import ExtraTreeRegressor
125 |         state = obj['state']
126 |         t = ExtraTreeRegressor.__new__(ExtraTreeRegressor)
127 |         t.__setstate__(state)
128 |         return t


--------------------------------------------------------------------------------
/src/bin/test/contrib_util.py:
--------------------------------------------------------------------------------
  1 | """ Utility methods for use in testing."""
  2 | import ConfigParser
  3 | import json
  4 | import os
  5 | from inspect import getargspec
  6 | 
  7 | import pandas as pd
  8 | 
  9 | from base import BaseAlgo
 10 | from codec import MLSPLDecoder, MLSPLEncoder
 11 | 
 12 | 
 13 | PACKAGE_NAME='algos_contrib'
 14 | 
 15 | 
 16 | class AlgoTestUtils(object):
 17 |     """
 18 |     Helper methods for testing algorithm implementations
 19 |     """
 20 |     @staticmethod
 21 |     def assert_method_signature(algo_cls, method_name, args):
 22 |         """
 23 |         Assert the signature of the specified method
 24 | 
 25 |         Args:
 26 |             algo_cls (class): a custom algorithm class to check
 27 |             method_name (str): the name of the method
 28 |             args (list): expected arguments to the named method
 29 | 
 30 |         Returns:
 31 |             (bool): True if the method is callable and has the specified arguments, False otherwise.
 32 | 
 33 |         Raises:
 34 |             AssertionError
 35 |         """
 36 |         method = getattr(algo_cls, method_name, None)
 37 |         assert method, "Method '{}' does not exist".format(method_name)
 38 |         assert callable(method), "Method '{}' is not callable".format(method_name)
 39 |         found_args = getargspec(method).args
 40 |         msg = 'Method {} has signature: {} - but should have {}'.format(method, args, found_args)
 41 |         assert found_args == args, msg
 42 | 
 43 |     @classmethod
 44 |     def assert_registered(cls, algo_cls):
 45 |         """
 46 |         Assert that the algorithm is registered in the algos.conf configuration file.
 47 | 
 48 |         Args:
 49 |             algo_cls (class): a custom algorithm class to check
 50 | 
 51 |         Returns:
 52 |             (bool): True if the method is registered in algos.conf file.
 53 | 
 54 |         Raises:
 55 |             AssertionError
 56 |         """
 57 |         config = ConfigParser.RawConfigParser()
 58 |         with cls.get_algos_conf_fp() as f:
 59 |             config.readfp(f)
 60 |         algo_name = algo_cls.__name__
 61 |         try:
 62 |             package_name = config.get(algo_name, 'package')
 63 |         except ConfigParser.NoSectionError:
 64 |             assert False, "'{}' not registered in algos.conf".format(algo_name)
 65 |         except ConfigParser.NoOptionError:
 66 |             assert False, "'{}' must override 'package' option in algos.conf".format(algo_name)
 67 | 
 68 |         assert package_name == PACKAGE_NAME, "The package name must be '{}'".format(PACKAGE_NAME)
 69 | 
 70 |     @staticmethod
 71 |     def assert_serializable(algo_cls, input_df, options):
 72 |         """
 73 |         Assert that the model created by the algorithm is serializable.
 74 | 
 75 |         Args:
 76 |             algo_cls (class): a custom algorithm class to check
 77 |             input_df (pandas Dataframe): input dataframe for the algorithm being tested
 78 |             options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm
 79 | 
 80 |         Returns:
 81 |             (bool): True if the the model is serializable, False otherwise.
 82 | 
 83 |         Raises:
 84 |             AssertionError
 85 |         """
 86 |         assert hasattr(algo_cls, 'register_codecs')
 87 |         algo_cls.register_codecs()
 88 | 
 89 |         algo_inst = algo_cls(options)
 90 |         algo_inst.feature_variables = ['b', 'c']
 91 |         algo_inst.target_variable = 'a'
 92 |         algo_inst.fit(input_df.copy(), options)
 93 | 
 94 |         encoded = json.dumps(algo_inst, cls=MLSPLEncoder)
 95 |         decoded = json.loads(encoded, cls=MLSPLDecoder)
 96 | 
 97 |         orig_y = algo_inst.apply(input_df.copy(), options)
 98 |         decoded_y = decoded.apply(input_df.copy(), options)
 99 |         pd.util.testing.assert_frame_equal(orig_y, decoded_y)
100 | 
101 |     @classmethod
102 |     def assert_base_algo_method_signatures(cls, algo_cls, required_methods=None):
103 |         """
104 |         Assert that the signatures of algorithm's methods adhere to the API.
105 | 
106 |         Args:
107 |             algo_cls (class): a custom algorithm class to check.
108 |             required_methods (list): list of required method names.
109 |                                      '__init__' and 'fit' are always required, so
110 |                                      they do not need to be included.
111 | 
112 | 
113 |         Returns:
114 |             (bool): True if the methods adhere to the API, False otherwise.
115 | 
116 |         Raises:
117 |             AssertionError
118 |         """
119 |         method_args_map = {
120 |             '__init__': ['self', 'options'],
121 |             'fit': ['self', 'df', 'options'],
122 |             'partial_fit': ['self', 'df', 'options'],
123 |             'apply': ['self', 'df', 'options'],
124 |             'summary': ['self', 'options'],
125 |             'register_codecs': [],
126 |         }
127 | 
128 |         if required_methods is None:
129 |             required_methods = []
130 | 
131 |         assert issubclass(algo_cls, BaseAlgo), 'Algorithms must inherit from BaseAlgo.'
132 | 
133 |         required_method_set = set(required_methods)
134 |         extra_methods = required_method_set - method_args_map.viewkeys()
135 |         assert extra_methods == set(), "'{}' not in BaseAlgo".format(", ".join(extra_methods))
136 | 
137 |         # __init__ and fit are always required.
138 |         required_method_set.add('__init__')
139 |         required_method_set.add('fit')
140 | 
141 |         for required_method in required_method_set:
142 |             cls.assert_method_signature(algo_cls, required_method, method_args_map[required_method])
143 | 
144 |     @classmethod
145 |     def assert_algo_basic(cls, algo_cls, required_methods=None, input_df=None, options=None, serializable=True):
146 |         """
147 |         Assert signatures of methods, registration, and serialization
148 | 
149 |         Args:
150 |             algo_cls (class): a custom algorithm class to check.
151 |             input_df (pandas Dataframe): input dataframe for the algorithm being tested
152 |             options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm
153 |             serializable (bool): whether to check serializability or not.
154 | 
155 |         Returns:
156 |             (bool): True if the methods adhere to the API, False otherwise.
157 | 
158 |         Raises:
159 |             AssertionError
160 |         """
161 |         cls.assert_base_algo_method_signatures(algo_cls, required_methods)
162 |         cls.assert_registered(algo_cls)
163 |         if serializable:
164 |             # The input and options are required for serializability test.
165 |             assert input_df is not None
166 |             assert options is not None
167 |             cls.assert_serializable(algo_cls, input_df, options)
168 | 
169 |     @staticmethod
170 |     def get_algos_conf_fp():
171 |         """
172 |         Get a reference (pointer) to algos.conf file open for read
173 | 
174 |         This method mainly exists to aid testing.
175 | 
176 |         Returns:
177 |             (File): algos.conf file pointer
178 |         """
179 |         algos_file_path = os.path.join(os.path.dirname(__file__), '..', '..', 'default', 'algos.conf')
180 |         return open(algos_file_path)
181 | 
182 | 
183 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # mltk-algo-contrib
  2 | 
  3 | This repo contains custom algorithms for use with the [Splunk Machine Learning Toolkit](https://splunkbase.splunk.com/app/2890/). The repo itself is also a Splunk app.
  4 | Custom algorithms can be added to the Splunk Machine Learning toolkit by adhering to the [ML-SPL API](http://docs.splunk.com/Documentation/MLApp/latest/API/Introduction).
  5 | The API is a thin wrapper around machine learning estimators provided by libraries such as:
  6 | * [scikit-learn](scikit-learn.org)
  7 | * [statsmodels](http://www.statsmodels.org/).
  8 | * [scipy](https://www.scipy.org)
  9 | 
 10 | and custom algorithms.
 11 | 
 12 | Note that this repo is a collection of custom *algorithms* only, and not any libraries. Any libraries required
 13 | should only be added to live environments manually and not to this repo.
 14 | 
 15 | A comprehensive guide to using the ML-SPL API can be found [here](http://docs.splunk.com/Documentation/MLApp/latest/API/Introduction).
 16 | 
 17 | A very simple example:
 18 | 
 19 | ```python
 20 | from base import BaseAlgo
 21 | 
 22 | 
 23 | class CustomAlgorithm(BaseAlgo):
 24 |     def __init__(self, options):
 25 |         # Option checking & initializations here
 26 |         pass
 27 | 
 28 |     def fit(self, df, options):
 29 |         # Fit an estimator to df, a pandas DataFrame of the search results
 30 |         pass
 31 | 
 32 |     def partial_fit(self, df, options):
 33 |         # Incrementally fit a model
 34 |         pass
 35 | 
 36 |     def apply(self, df, options):
 37 |         # Apply a saved model
 38 |         # Modify df, a pandas DataFrame of the search results
 39 |         return df
 40 | 
 41 |     @staticmethod
 42 |     def register_codecs():
 43 |         # Add codecs to the codec manager
 44 |         pass
 45 | 
 46 | ```
 47 | 
 48 | # Dependencies
 49 | 
 50 | To use the custom algorithms contained in this app, you must also have installed:
 51 | 
 52 |  - [Splunk Machine Learning Toolkit](https://splunkbase.splunk.com/app/2890/) 
 53 |  - Python for Scientific Computing Add-on
 54 |     - [Linux64](https://splunkbase.splunk.com/app/2882/)
 55 |     - [Linux32](https://splunkbase.splunk.com/app/2884/)
 56 |     - [Windows64](https://splunkbase.splunk.com/app/2883/)
 57 |     - [macOS](https://splunkbase.splunk.com/app/2881/)
 58 | 
 59 | # Usage
 60 | This repository is contains public contributions and Splunk is not responsible for guaranteeing
 61 | the correctness or validity of the algorithms. Splunk is in no way responsible for the vetting of
 62 | the contents of contributed algorithms.
 63 | 
 64 | # Deploying
 65 | 
 66 | To use the custom algorithms in this repository, you must deploy them as a Splunk app.
 67 | 
 68 | There are two ways to do this.
 69 | 
 70 | ### Manual copying
 71 | 
 72 | You can simple copy the following directories under src:
 73 |   * bin
 74 |   * default
 75 |   * metadata
 76 | 
 77 | to:
 78 |   * ${SPLUNK_HOME}/etc/apps/SA_mltk_contrib_app (you will need to create the directory first):
 79 | 
 80 | OR
 81 | 
 82 | ### Build and install
 83 | 
 84 | #### 1. Build the app:
 85 | 
 86 | You will need to install tox. See [Test Prerequisites](#prereq)
 87 | 
 88 | ```bash
 89 | tox -e package-macos        # if on Mac
 90 | tox -e package-linux        # if on Linux
 91 | ```
 92 | 
 93 |   * The resulting gzipped tarball will be in the `target` directory (e.g. target/SA_mltk_contrib_app.tgz).
 94 |     * The location of the gzipped tarball can be overridden by `BUILD_DIR` environment variable.
 95 |   * The default app name will be `SA_mltk_contrib_app`, but this can be overridden by the `APP_NAME` environment variable.
 96 | 
 97 | * **NOTE**: You can run `tox -e clean` to remove the `target` directory.
 98 | 
 99 | #### 2. Install the tarball:
100 | 
101 |   * You can do one of the followings with the tarball from step 1:
102 |     * Manually untar it in `${SPLUNK_HOME}/etc/apps` directory
103 |     * Install it using the GUI:
104 |       * https://docs.splunk.com/Documentation/AddOns/released/Overview/Singleserverinstall
105 | 
106 | # Contributing
107 | 
108 | This repository was specifically made for your contributions! See [Contributing](https://github.com/splunk/mltk-algo-contrib/blob/master/CONTRIBUTING.md) for more details.
109 | 
110 | ## Developing
111 | 
112 | To start developing, you will need to have Splunk installed. If you don't, read more [here](http://docs.splunk.com/Documentation/Splunk/latest/Installation/InstallonLinux).
113 | 
114 | 1. clone the repo and cd into the directory:
115 | 
116 | ```bash
117 | git clone https://github.com/splunk/mltk-algo-contrib.git
118 | cd mltk-algo-contrib
119 | ```
120 | 
121 | 2. symlink the `src` directory to the apps folder in Splunk and restart splunkd:
122 | 
123 | ```bash
124 | ln -s "$(pwd)/src" $SPLUNK_HOME/etc/apps/SA_mltk_contrib_app
125 | $SPLUNK_HOME/bin/splunk restart
126 | ```
127 |   * _This will eliminate the need to deploy the app to test changes._
128 | 
129 | 3. Add your new algorithm(s) to `src/bin/algos_contrib`.
130 |   (See SVR.py for an example.)
131 |   
132 | 4. Add a new stanza to `src/default/algos.conf`
133 | 
134 | ```bash
135 | [<your_algo>]
136 | package=algos_contrib
137 | ```
138 | 
139 |   * **NOTE**: Due to the way configuration file layering works in Splunk,
140 |   the package name must be overridden in each section, and not
141 |   in the _default_ section.
142 |     
143 | 5. Add your tests to `src/bin/algos_contrib/tests/test_<your_algo>.py`
144 |   (See test_svr.py for an example.)
145 | 
146 | ## Running Tests
147 | 
148 | <a name=prereq></a>
149 | ### Prerequisites
150 | 
151 | 1. Install *tox*:
152 |    * http://tox.readthedocs.io/en/latest/install.html
153 |    ```bash
154 |    pip install tox
155 |    ```
156 | 
157 | 2. Install *tox-pip-extensions*:
158 |    * https://github.com/tox-dev/tox-pip-extensions
159 |    ```bash
160 |    pip install tox-pip-extensions
161 |    ```
162 |    * **NOTE**: You only need this if you do not want to
163 |    recreate the virtualenv(s) manually with `tox -r`
164 |    everytime you update requirements*.txt file, but
165 |    this is recommended for convenience.
166 | 
167 | 3. You must also have the following environment variable set to your
168 | Splunk installation directory (e.g. /opt/splunk):
169 |    * SPLUNK_HOME
170 | 
171 | ### Using tox
172 | 
173 | To run all tests, run the following command in the root source directory:
174 | 
175 | ```bash
176 | tox
177 | ```
178 | 
179 | To run a single test, you can provide the directory or a file as a parameter:
180 | 
181 | ```bash
182 | tox src/bin/algos_contrib/tests/
183 | tox src/bin/algos_contrib/tests/test_example_algo.py
184 | ...
185 | ```
186 | 
187 | Basically, any arguments passed to *tox* will be passed as an argument to the *pytest* command.
188 | To pass in options, use double dashes (--):
189 | 
190 | ```bash
191 | tox -- -k "example"     # Run tests that has keyword 'example'
192 | tox -- -x               # Stop after the first failure
193 | tox -- -s               # Show stdout/stderr (i.e. disable capturing)
194 | ...
195 | ```
196 | 
197 | ### Using Python REPL (Interactive Interpreter)
198 | 
199 | ```python
200 | $ python   # from src/bin directory
201 | >>> # Add the MLTK to our sys.path
202 | >>> from link_mltk import add_mltk
203 | >>> add_mltk()
204 | >>>
205 | >>> # Import our algorithm class
206 | >>> from algos_contrib.ExampleAlgo import ExampleAlgo
207 | ... (some warning from Splunk may show up)
208 | >>>
209 | >>> # Use utilities to catch common mistakes
210 | >>> from test.contrib_util import AlgoTestUtils
211 | >>> AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False)
212 | ```
213 | 
214 | ### Package/File Naming
215 | 
216 | Files and packages under _test_ directory should avoid having names
217 | that conflict with files or directories directly under:
218 | ```bash
219 | $SPLUNK_HOME/etc/apps/Splunk_ML_Toolkit/bin
220 | ```
221 | 
222 | ## Pull requests
223 | 
224 | Once you've finished what you're adding, make a pull request.
225 | 
226 | ## Bugs? Issues?
227 | 
228 | Please file issues with any information that might be needed to:
229 |  - reproduce what you're experiencing
230 |  - understand the problem fully
231 | 
232 | # License
233 | 
234 | The algorithms hosted, as well as the app itself, is licensed under the permissive Apache 2.0 license.
235 | 
236 | **Any additions to this repository must be under one of these licenses:**
237 |  - MIT
238 |  - BSD
239 |  - Apache 2.0
240 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------