├── src
│   ├── bin
│   │   ├── test
│   │   │   ├── __init__.py
│   │   │   ├── test_contrib_util.py
│   │   │   └── contrib_util.py
│   │   ├── algos_contrib
│   │   │   ├── __init__.py
│   │   │   ├── tests
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_mds.py
│   │   │   │   ├── test_tf_binary.py
│   │   │   │   ├── test_savgol_filter.py
│   │   │   │   ├── test_example_algo.py
│   │   │   │   ├── test_correlation_matrix.py
│   │   │   │   ├── test_collaborativefilter.py
│   │   │   │   ├── test_agglomerative_clustering.py
│   │   │   │   ├── test_nmf.py
│   │   │   │   ├── test_min_max_scaler.py
│   │   │   │   ├── test_truncated_svd.py
│   │   │   │   ├── test_linear_svc.py
│   │   │   │   ├── test_latent_dirichlet_allocation.py
│   │   │   │   ├── test_CustomDecisionTreeClassifier.py
│   │   │   │   ├── test_extra_trees_classifier.py
│   │   │   │   ├── test_orthogonal_matching_pursuit.py
│   │   │   │   ├── test_IsolationForest.py
│   │   │   │   ├── test_tsne.py
│   │   │   │   └── test_svr.py
│   │   │   ├── ExampleAlgo.py
│   │   │   ├── SVR.py
│   │   │   ├── LinearSVC.py
│   │   │   ├── NMF.py
│   │   │   ├── TruncatedSVD.py
│   │   │   ├── AdaBoostRegressor.py
│   │   │   ├── ExtraTreesRegressor.py
│   │   │   ├── SavgolFilter.py
│   │   │   ├── BaggingRegressor.py
│   │   │   ├── QuantileTransformer.py
│   │   │   ├── LatentDirichletAllocation.py
│   │   │   ├── OrthogonalMatchingPursuit.py
│   │   │   ├── ExtraTreesClassifier.py
│   │   │   ├── CorrelationMatrix.py
│   │   │   ├── MinMaxScaler.py
│   │   │   ├── MDS.py
│   │   │   ├── TSNE.py
│   │   │   ├── CustomDecisionTreeClassifier.py
│   │   │   ├── AgglomerativeClustering.py
│   │   │   ├── CollaborativeFilter.py
│   │   │   └── IsolationForest.py
│   │   ├── README.md
│   │   ├── test.py
│   │   └── link_mltk.py
│   ├── default
│   │   ├── data
│   │   │   └── ui
│   │   │       ├── views
│   │   │       │   └── README.md
│   │   │       └── nav
│   │   │           └── default.xml
│   │   ├── app.conf
│   │   └── algos.conf
│   └── metadata
│       └── default.meta
├── .gitignore
├── requirements_1.2.txt
├── tox.ini
├── CONTRIBUTING.md
├── README.md
└── LICENSE
/src/bin/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/bin/README.md:
--------------------------------------------------------------------------------
1 | This is where you put any scripts you want to add to this app.
2 |
--------------------------------------------------------------------------------
/src/default/data/ui/views/README.md:
--------------------------------------------------------------------------------
1 | Add all the views that your app needs in this directory
2 |
--------------------------------------------------------------------------------
/src/bin/test.py:
--------------------------------------------------------------------------------
1 | from link_mltk import add_mltk
2 | add_mltk()
3 |
4 | from test.contrib_util import AlgoTestUtils
5 |
6 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_mds.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.MDS import MDS
2 | from test.contrib_util import AlgoTestUtils
3 |
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(MDS, serializable=False)
7 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/ExampleAlgo.py:
--------------------------------------------------------------------------------
1 | from base import BaseAlgo
2 |
3 |
4 | class ExampleAlgo(BaseAlgo):
5 | def __init__(self, options):
6 | pass
7 |
8 | def fit(self, df, options):
9 | return df
10 |
--------------------------------------------------------------------------------
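
Once registered in algos.conf (shown later in this listing), an algorithm such as the one above is invoked from a Splunk search with the MLTK's fit command, along the lines of "| fit ExampleAlgo field_a field_b", where field_a and field_b are placeholder field names, not fields defined by this repo.
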
/src/bin/algos_contrib/tests/test_tf_binary.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.TFBinary import TFBinary
2 | from test.contrib_util import AlgoTestUtils
3 |
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(TFBinary, serializable=False)
7 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_savgol_filter.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.SavgolFilter import SavgolFilter
2 | from test.contrib_util import AlgoTestUtils
3 |
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(SavgolFilter, serializable=False)
7 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_example_algo.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.ExampleAlgo import ExampleAlgo
2 | from test.contrib_util import AlgoTestUtils
3 |
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False)
7 |
8 |
--------------------------------------------------------------------------------
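
The tests above all share the same shape. A minimal sketch of a test for a new contributed algorithm, assuming a hypothetical algos_contrib/MyAlgo.py module that is not part of this repository:

    from algos_contrib.MyAlgo import MyAlgo  # hypothetical module and class
    from test.contrib_util import AlgoTestUtils


    def test_algo():
        # serializable=False follows the pattern above for algorithms that
        # do not round-trip through the model codecs
        AlgoTestUtils.assert_algo_basic(MyAlgo, serializable=False)
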
/src/default/data/ui/nav/default.xml:
--------------------------------------------------------------------------------
(XML nav definition; markup stripped in extraction)
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_correlation_matrix.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.CorrelationMatrix import CorrelationMatrix
2 | from test.contrib_util import AlgoTestUtils
3 |
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(CorrelationMatrix, serializable=False)
7 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_collaborativefilter.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.CollaborativeFilter import CollaborativeFilter
2 | from test.contrib_util import AlgoTestUtils
3 |
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(CollaborativeFilter, serializable=False)
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Jupyter Notebook
2 | .ipynb_checkpoints
3 |
4 | # macOS
5 | .DS_Store
6 |
7 | # Editors
8 | *.swp
9 | *.swo
10 |
11 | # Python stuff
12 | *.egg-info
13 | .tox
14 | **/.cache
15 | **/.pytest_cache
16 | **/*.pyc
17 |
18 | # IntelliJ
19 | **/.idea
20 |
21 | target
22 |
--------------------------------------------------------------------------------
/requirements_1.2.txt:
--------------------------------------------------------------------------------
1 | attrs==17.4.0
2 | funcsigs==1.0.2
3 | mock==2.0.0
4 | more-itertools==4.1.0
5 | numpy==1.10.4
6 | pandas==0.17.1
7 | pluggy==0.6.0
8 | psutil==3.4.2
9 | py==1.5.3
10 | pytest==3.5.0
11 | scikit-learn==0.17
12 | scipy==0.17.0
13 | six==1.11.0
14 | statsmodels==0.6.1
15 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_agglomerative_clustering.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.AgglomerativeClustering import AgglomerativeClustering
2 | from test.contrib_util import AlgoTestUtils
3 |
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(AgglomerativeClustering, serializable=False)
7 |
--------------------------------------------------------------------------------
/src/default/app.conf:
--------------------------------------------------------------------------------
1 | #
2 | # Splunk app configuration file
3 | #
4 |
5 | [install]
6 | is_configured = 1
7 |
8 | [ui]
9 | is_visible = false
10 | label = mltk-algo-contrib
11 |
12 | [launcher]
13 | author = github.com/splunk/mltk-algo-contrib
14 | description =
15 | version = 1.0
16 |
17 |
--------------------------------------------------------------------------------
/src/metadata/default.meta:
--------------------------------------------------------------------------------
1 |
2 | # Application-level permissions
3 |
4 | []
5 | access = read : [ * ], write : [ admin, power ]
6 |
7 | ### EVENT TYPES
8 |
9 | [eventtypes]
10 | export = system
11 |
12 |
13 | ### PROPS
14 |
15 | [props]
16 | export = system
17 |
18 |
19 | ### TRANSFORMS
20 |
21 | [transforms]
22 | export = system
23 |
24 |
25 | ### LOOKUPS
26 |
27 | [lookups]
28 | export = system
29 |
30 |
31 | ### VIEWSTATES: even normal users should be able to create shared viewstates
32 |
33 | [viewstates]
34 | access = read : [ * ], write : [ * ]
35 | export = system
36 |
37 |
38 | [algos]
39 | export = system
40 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_nmf.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.NMF import NMF
3 | from test.contrib_util import AlgoTestUtils
4 |
5 |
6 | def test_algo():
7 | input_df = pd.DataFrame({
8 | 'a': [1, 2, 3],
9 | 'b': [4, 5, 6],
10 | 'c': ['a', 'b', 'c'],
11 | })
12 | options = {
13 | 'feature_variables': ['a', 'b', 'c'],
14 | }
15 | required_methods = (
16 | '__init__',
17 | 'fit',
18 | 'partial_fit',
19 | 'apply',
20 | 'summary',
21 | 'register_codecs',
22 | )
23 | AlgoTestUtils.assert_algo_basic(NMF, required_methods, input_df, options)
24 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_min_max_scaler.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.MinMaxScaler import MinMaxScaler
3 | from test.contrib_util import AlgoTestUtils
4 |
5 |
6 | def test_algo():
7 | input_df = pd.DataFrame({
8 | 'a': [1, 2, 3],
9 | 'b': [4, 5, 6],
10 | 'c': ['a', 'b', 'c'],
11 | })
12 | options = {
13 | 'feature_variables': ['a', 'b', 'c'],
14 | }
15 | required_methods = (
16 | '__init__',
17 | 'fit',
18 | 'partial_fit',
19 | 'apply',
20 | 'summary',
21 | 'register_codecs',
22 | )
23 | AlgoTestUtils.assert_algo_basic(MinMaxScaler, required_methods, input_df, options)
24 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_truncated_svd.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.TruncatedSVD import TruncatedSVD
3 | from test.contrib_util import AlgoTestUtils
4 |
5 |
6 | def test_algo():
7 | input_df = pd.DataFrame({
8 | 'a': [1, 2, 3],
9 | 'b': [4, 5, 6],
10 | 'c': ['a', 'b', 'c'],
11 | })
12 | options = {
13 | 'feature_variables': ['a', 'b', 'c'],
14 | }
15 | required_methods = (
16 | '__init__',
17 | 'fit',
18 | 'partial_fit',
19 | 'apply',
20 | 'summary',
21 | 'register_codecs',
22 | )
23 | AlgoTestUtils.assert_algo_basic(TruncatedSVD, required_methods, input_df, options)
24 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_linear_svc.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.LinearSVC import LinearSVC
3 | from test.contrib_util import AlgoTestUtils
4 |
5 |
6 |
7 |
8 | def test_algo():
9 | input_df = pd.DataFrame({
10 | 'a': [1, 2, 3],
11 | 'b': [4, 5, 6],
12 | 'c': ['a', 'b', 'c'],
13 | })
14 | options = {
15 | 'target_variable': ['a'],
16 | 'feature_variables': ['b', 'c'],
17 | }
18 | required_methods = (
19 | '__init__',
20 | 'fit',
21 | 'partial_fit',
22 | 'apply',
23 | 'summary',
24 | 'register_codecs',
25 | )
26 |     AlgoTestUtils.assert_algo_basic(LinearSVC, required_methods, input_df, options)
27 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_latent_dirichlet_allocation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.LatentDirichletAllocation import LatentDirichletAllocation
3 | from test.contrib_util import AlgoTestUtils
4 |
5 |
6 | def test_algo():
7 | input_df = pd.DataFrame({
8 | 'a': [1, 2, 3],
9 | 'b': [4, 5, 6],
10 | 'c': ['a', 'b', 'c'],
11 | })
12 | options = {
13 | 'feature_variables': ['b', 'c'],
14 | }
15 | required_methods = (
16 | '__init__',
17 | 'fit',
18 | 'partial_fit',
19 | 'apply',
20 | 'summary',
21 | 'register_codecs',
22 | )
23 | AlgoTestUtils.assert_algo_basic(LatentDirichletAllocation, required_methods, input_df, options)
24 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.CustomDecisionTreeClassifier import CustomDecisionTreeClassifier
3 | from test.contrib_util import AlgoTestUtils
4 |
5 | def test_algo():
6 | input_df = pd.DataFrame({
7 | 'a': [1, 2, 3],
8 | 'b': [4, 5, 6],
9 | 'c': ['a', 'b', 'c'],
10 | })
11 | options = {
12 | 'target_variable': ['a'],
13 | 'feature_variables': ['b', 'c'],
14 | }
15 | required_methods = (
16 | '__init__',
17 | 'fit',
18 | 'apply',
19 | 'summary',
20 | 'register_codecs',
21 | )
22 |     AlgoTestUtils.assert_algo_basic(CustomDecisionTreeClassifier, required_methods, input_df, options)
23 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_extra_trees_classifier.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.ExtraTreesClassifier import ExtraTreesClassifier
3 | from test.contrib_util import AlgoTestUtils
4 | 
5 | 
6 | def test_algo():
7 |     input_df = pd.DataFrame({
8 |         'a': [1, 2, 3],
9 |         'b': [4, 5, 6],
10 |         'c': ['a', 'b', 'c'],
11 |     })
12 |     options = {
13 |         'target_variable': ['a'],
14 |         'feature_variables': ['b', 'c'],
15 |     }
16 |     required_methods = (
17 |         '__init__',
18 |         'fit',
19 |         'apply',
20 |         'summary',
21 |         'register_codecs',
22 |     )
23 |     AlgoTestUtils.assert_algo_basic(ExtraTreesClassifier, required_methods, input_df, options)
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from algos_contrib.OrthogonalMatchingPursuit import OrthogonalMatchingPursuit
3 | from test.contrib_util import AlgoTestUtils
4 |
5 |
6 |
7 |
8 | def test_algo():
9 | input_df = pd.DataFrame({
10 | 'a': [1, 2, 3],
11 | 'b': [4, 5, 6],
12 | 'c': ['a', 'b', 'c'],
13 | })
14 | options = {
15 | 'target_variable': ['a'],
16 | 'feature_variables': ['b', 'c'],
17 | }
18 | required_methods = (
19 | '__init__',
20 | 'fit',
21 | 'partial_fit',
22 | 'apply',
23 | 'summary',
24 | 'register_codecs',
25 | )
26 |     AlgoTestUtils.assert_algo_basic(OrthogonalMatchingPursuit, required_methods, input_df, options)
--------------------------------------------------------------------------------
/src/bin/algos_contrib/SVR.py:
--------------------------------------------------------------------------------
1 | from sklearn.svm import SVR as _SVR
2 |
3 | from base import BaseAlgo, RegressorMixin
4 | from util.param_util import convert_params
5 |
6 |
7 | class SVR(RegressorMixin, BaseAlgo):
8 |
9 | def __init__(self, options):
10 | self.handle_options(options)
11 |
12 | params = options.get('params', {})
13 | out_params = convert_params(
14 | params,
15 | floats=['C', 'gamma'],
16 | strs=['kernel'],
17 | ints=['degree'],
18 | )
19 |
20 | self.estimator = _SVR(**out_params)
21 |
22 | @staticmethod
23 | def register_codecs():
24 | from codec.codecs import SimpleObjectCodec
25 | from codec import codecs_manager
26 | codecs_manager.add_codec('algos_contrib.SVR', 'SVR', SimpleObjectCodec)
27 | codecs_manager.add_codec('sklearn.svm.classes', 'SVR', SimpleObjectCodec)
28 |
--------------------------------------------------------------------------------
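
The convert_params helper from the MLTK's util.param_util casts the raw string parameters supplied in a search into typed keyword arguments. A hedged illustration of the whitelist above, with the behavior inferred from how the helper is used throughout this repo:

    # Parameters arrive from the search pipeline as strings:
    params = {'C': '2.5', 'kernel': 'rbf', 'degree': '3'}

    # convert_params(params, floats=['C', 'gamma'], strs=['kernel'], ints=['degree'])
    # is then expected to yield {'C': 2.5, 'kernel': 'rbf', 'degree': 3},
    # raising an error for unknown or unconvertible values rather than
    # passing them through to the estimator.
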
/src/bin/algos_contrib/tests/test_IsolationForest.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.IsolationForest import IsolationForest
2 | from test.contrib_util import AlgoTestUtils
3 | import pandas as pd
4 |
5 | def test_algo():
6 | AlgoTestUtils.assert_algo_basic(IsolationForest, serializable=False)
7 |
8 | def test_algo_options():
9 | input_df = pd.DataFrame({
10 | 'a': [5.1, 4.9, 4.7, 4.6],
11 | 'b': [3.5, 3.0, 3.1, 3.2],
12 | 'c': [1.4, 1.4, 1.5, 1.6],
13 | 'd': [0.2, 0.2, 0.2, 0.4],
14 | 'e': ['Iris Setosa','Iris Setosa','Iris Versicolor','Iris Virginica']
15 | })
16 | options = {
17 |         'target_variable': [],
18 | 'feature_variables': ['a','b','c','d'],
19 | }
20 | required_methods = (
21 | '__init__',
22 | 'fit',
23 | 'apply',
24 | 'register_codecs',
25 | )
26 | AlgoTestUtils.assert_algo_basic(IsolationForest, required_methods=required_methods, input_df=input_df, options=options, serializable=False)
--------------------------------------------------------------------------------
/src/bin/algos_contrib/LinearSVC.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from sklearn.svm import LinearSVC as _LinearSVC
4 |
5 | from codec import codecs_manager
6 | from base import BaseAlgo, ClassifierMixin
7 | from util.param_util import convert_params
8 |
9 |
10 | class LinearSVC(ClassifierMixin, BaseAlgo):
11 |
12 | def __init__(self, options):
13 | self.handle_options(options)
14 |
15 | out_params = convert_params(
16 | options.get('params', {}),
17 |             floats=['C', 'tol', 'intercept_scaling'],
18 | ints=['random_state','max_iter'],
19 | strs=['penalty', 'loss', 'multi_class'],
20 | bools=['dual', 'fit_intercept'],
21 | )
22 |
23 | self.estimator = _LinearSVC(**out_params)
24 |
25 | @staticmethod
26 | def register_codecs():
27 | from codec.codecs import SimpleObjectCodec
28 | codecs_manager.add_codec('algos_contrib.LinearSVC', 'LinearSVC', SimpleObjectCodec)
29 | codecs_manager.add_codec('sklearn.svm.classes', 'LinearSVC', SimpleObjectCodec)
30 |
--------------------------------------------------------------------------------
/src/bin/link_mltk.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """ Small utility to add the MLTK bin path to the system path.
3 | This makes it easy to import algorithms or utilities from the MLTK."""
4 | import os
5 | import sys
6 |
7 |
8 | def check_splunk_home(splunk_home):
9 | """ Check SPLUNK_HOME and raise if not set."""
10 | if not splunk_home:
11 | raise RuntimeError('No $SPLUNK_HOME provided. Please set SPLUNK_HOME.')
12 |
13 |
14 | def get_mltk_bin_path(splunk_home):
15 | """ Create the path to the MLTK bin folder."""
16 | check_splunk_home(splunk_home)
17 | mltk_path = os.path.join(splunk_home, 'etc', 'apps', 'Splunk_ML_Toolkit', 'bin')
18 |
19 | if not os.path.exists(mltk_path):
20 | raise RuntimeError('MLTK bin folder not found at {}: is MLTK installed?'.format(mltk_path))
21 |
22 | return mltk_path
23 |
24 |
25 | def add_mltk():
26 | """ Adds MLTK bin path to sys.path """
27 | splunk_home = os.environ.get('SPLUNK_HOME', None)
28 | mltk_bin_path = get_mltk_bin_path(splunk_home)
29 | sys.path.insert(0, mltk_bin_path)
30 |
--------------------------------------------------------------------------------
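
A minimal usage sketch for the module above, mirroring test.py: add_mltk() must run before importing anything that lives in the MLTK bin folder.

    from link_mltk import add_mltk
    add_mltk()  # prepends $SPLUNK_HOME/etc/apps/Splunk_ML_Toolkit/bin to sys.path

    # These imports only resolve after add_mltk() has run:
    from base import BaseAlgo
    from util.param_util import convert_params
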
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27
3 | skipsdist = True
4 | skip_install = True
5 | tox_pip_extensions_ext_venv_update = true
6 |
7 | [testenv]
8 | passenv =
9 | SPLUNK_HOME
10 | setenv =
11 | PYTHONPATH = {env:SPLUNK_HOME}/etc/apps/Splunk_ML_Toolkit/bin
12 | APP_NAME = {env:APP_NAME:SA_mltk_contrib_app}
13 | BUILD_DIR = {toxinidir}/target
14 | deps = -r{toxinidir}/requirements_1.2.txt
15 | commands = pytest {posargs}
16 |
17 | [testenv:package-macos]
18 | platform = darwin
19 | deps =
20 | changedir = {env:BUILD_DIR}
21 | whitelist_externals = /bin/bash
22 | commands =
23 | /bin/bash -c 'tar -C {toxinidir} -s ",^src/,{env:APP_NAME}/," -cvzf {env:APP_NAME}.tgz src/\{bin,default,metadata\}'
24 |
25 | [testenv:package-linux]
26 | platform = linux
27 | deps =
28 | changedir = {env:BUILD_DIR}
29 | whitelist_externals = /bin/bash
30 | commands =
31 | /bin/bash -c 'tar -C {toxinidir} --transform="s,^src/,{env:APP_NAME}/," -cvzf {env:APP_NAME}.tgz src/\{bin,default,metadata\}'
32 |
33 | [testenv:clean]
34 | deps =
35 | whitelist_externals = /bin/rm
36 | commands =
37 | /bin/rm -rf {env:BUILD_DIR}
38 |
39 |
--------------------------------------------------------------------------------
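
Given these settings, a typical workflow (assuming SPLUNK_HOME is exported and the MLTK is installed) would be:

    tox                    # run the py27 test environment (pytest)
    tox -e package-linux   # build target/SA_mltk_contrib_app.tgz (package-macos on a Mac)
    tox -e clean           # remove the build directory
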
/src/bin/algos_contrib/NMF.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import NMF as _NMF
2 | from base import BaseAlgo, TransformerMixin
3 | from codec import codecs_manager
4 | from util.param_util import convert_params
5 |
6 | class NMF(TransformerMixin, BaseAlgo):
7 |
8 | def __init__(self, options):
9 | self.handle_options(options)
10 | out_params = convert_params(
11 | options.get('params', {}),
12 | floats=['beta_loss','tol','alpha','l1_ratio'],
13 | strs=['init','solver'],
14 | ints=['k','max_iter','random_state'],
15 |             bools=['verbose','shuffle'],
16 | aliases={'k': 'n_components'}
17 | )
18 |
19 | self.estimator = _NMF(**out_params)
20 |
21 | def rename_output(self, default_names, new_names):
22 | if new_names is None:
23 | new_names = 'NMF'
24 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
25 | return output_names
26 |
27 | @staticmethod
28 | def register_codecs():
29 | from codec.codecs import SimpleObjectCodec
30 | codecs_manager.add_codec('algos_contrib.NMF', 'NMF', SimpleObjectCodec)
31 | codecs_manager.add_codec('sklearn.decomposition.nmf', 'NMF', SimpleObjectCodec)
32 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/TruncatedSVD.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import TruncatedSVD as _TruncatedSVD
2 | from base import BaseAlgo, TransformerMixin
3 | from codec import codecs_manager
4 | from util.param_util import convert_params
5 |
6 | class TruncatedSVD(TransformerMixin, BaseAlgo):
7 |
8 | def __init__(self, options):
9 | self.handle_options(options)
10 | out_params = convert_params(
11 | options.get('params', {}),
12 | floats=['tol'],
13 | strs=['algorithm'],
14 | ints=['k','n_iter','random_state'],
15 | aliases={'k': 'n_components'}
16 | )
17 |
18 | self.estimator = _TruncatedSVD(**out_params)
19 |
20 | def rename_output(self, default_names, new_names):
21 | if new_names is None:
22 | new_names = 'SVD'
23 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
24 | return output_names
25 |
26 | @staticmethod
27 | def register_codecs():
28 | from codec.codecs import SimpleObjectCodec
29 | codecs_manager.add_codec('algos_contrib.TruncatedSVD', 'TruncatedSVD', SimpleObjectCodec)
30 | codecs_manager.add_codec('sklearn.decomposition.truncated_svd', 'TruncatedSVD', SimpleObjectCodec)
31 |
--------------------------------------------------------------------------------
/src/default/algos.conf:
--------------------------------------------------------------------------------
1 | # Here is where algorithms are registered.
2 | [default]
3 |
4 | ########################################################################
5 | # Due to the layering of configuration files in Splunk, we have to
6 | # override the package name in every section.
7 | ########################################################################
8 |
9 |
10 | [AgglomerativeClustering]
11 | package = algos_contrib
12 | 
13 | [CorrelationMatrix]
14 | package = algos_contrib
15 | 
16 | [ExampleAlgo]
17 | package = algos_contrib
18 | 
19 | [SVR]
20 | package = algos_contrib
21 | 
22 | [SavgolFilter]
23 | package = algos_contrib
24 | 
25 | [TSNE]
26 | package = algos_contrib
27 | 
28 | [MDS]
29 | package = algos_contrib
30 | 
31 | [OrthogonalMatchingPursuit]
32 | package = algos_contrib
33 | 
34 | [TruncatedSVD]
35 | package = algos_contrib
36 | 
37 | [LatentDirichletAllocation]
38 | package = algos_contrib
39 | 
40 | [NMF]
41 | package = algos_contrib
42 | 
43 | [CollaborativeFilter]
44 | package = algos_contrib
45 | 
46 | [CustomDecisionTreeClassifier]
47 | package = algos_contrib
48 |
49 | [TFBinary]
50 | package = algos_contrib
51 |
52 | [MinMaxScaler]
53 | package = algos_contrib
54 |
55 | [LinearSVC]
56 | package = algos_contrib
57 |
58 | [ExtraTreesClassifier]
59 | package = algos_contrib
60 |
61 | [IsolationForest]
62 | package = algos_contrib
--------------------------------------------------------------------------------
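
A hypothetical example of registering an additional algorithm: add a stanza named after the class and apply the package override described in the comment above (MyAlgo is a placeholder, not a module in this repository):

    [MyAlgo]
    package = algos_contrib
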
/src/bin/algos_contrib/tests/test_tsne.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from algos_contrib.TSNE import TSNE
3 | from test.contrib_util import AlgoTestUtils
4 |
5 | algo_options = {'feature_variables': ['Review']}
6 |
7 |
8 | def test_algo():
9 | AlgoTestUtils.assert_algo_basic(TSNE, serializable=False)
10 |
11 |
12 | def test_valid_params():
13 | algo_options['params'] = {'k': '1'}
14 | TSNE_algo = TSNE(algo_options)
15 | assert TSNE_algo.estimator.n_components == 1
16 |
17 |
18 | def test_invalid_params_k_not_int():
19 | algo_options['params'] = {'k': '0.1'}
20 | with pytest.raises((RuntimeError, ValueError)) as excinfo:
21 | _ = TSNE(algo_options)
22 | assert excinfo.match('Invalid value for k: must be an int')
23 |
24 |
25 | def test_invalid_params_k_not_valid():
26 | algo_options['params'] = {'k': '0'}
27 | with pytest.raises((RuntimeError, ValueError)) as excinfo:
28 | _ = TSNE(algo_options)
29 | assert excinfo.match('Invalid value for k: k must be greater than or equal to 1')
30 |
31 |
32 | def test_default_parameter_values():
33 | algo_options['params'] = {'k': '1'}
34 | TSNE_algo = TSNE(algo_options)
35 | assert TSNE_algo.estimator.n_iter == 200
36 | assert TSNE_algo.estimator.perplexity == 30.0
37 | assert TSNE_algo.estimator.early_exaggeration == 4.0
38 | assert TSNE_algo.estimator.learning_rate == 100
39 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # CONTRIBUTING
2 |
3 | By submitting a Contribution to this Work, You agree that Your Contribution is made subject to the primary LICENSE
4 | file applicable to this Work. In addition, You represent that: (i) You are the copyright owner of the Contribution
5 | or (ii) You have the requisite rights to make the Contribution.
6 |
7 | ## Definitions:
8 |
9 | “You” shall mean: (i) yourself if you are making a Contribution on your own behalf; or (ii) your company,
10 | if you are making a Contribution on behalf of your company. If you are making a Contribution on behalf of your
11 | company, you represent that you have the requisite authority to do so.
12 |
13 | "Contribution" shall mean any original work of authorship, including any modifications or additions to an existing
14 | work, that is intentionally submitted by You for inclusion in, or documentation of, this project/repository. For the
15 | purposes of this definition, "submitted" means any form of electronic, verbal, or written communication submitted for
16 | inclusion in this project/repository, including but not limited to communication on electronic mailing lists, source
17 | code control systems, and issue tracking systems that are managed by, or on behalf of, the maintainers of
18 | the project/repository.
19 |
20 | “Work” shall mean the collective software, content, and documentation in this project/repository.
21 |
22 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_svr.py:
--------------------------------------------------------------------------------
1 | from algos_contrib.SVR import SVR
2 | from test.contrib_util import AlgoTestUtils
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 |
8 | def test_algo_basic():
9 | input_df = pd.DataFrame({
10 | 'a': [1, 2, 3],
11 | 'b': [4, 5, 6],
12 | 'c': ['a', 'b', 'c'],
13 | })
14 | options = {
15 | 'target_variable': ['a'],
16 | 'feature_variables': ['b', 'c'],
17 | }
18 | required_methods = (
19 | '__init__',
20 | 'fit',
21 | 'partial_fit',
22 | 'apply',
23 | 'summary',
24 | 'register_codecs',
25 | )
26 | AlgoTestUtils.assert_algo_basic(SVR, required_methods, input_df, options)
27 |
28 |
29 | def test_prediction():
30 | training_df = pd.DataFrame({
31 | 'y': [1, 2, 3],
32 | 'x1': [4, 5, 6],
33 | 'x2': [7, 8, 9],
34 | })
35 | options = {
36 | 'target_variable': ['y'],
37 | 'feature_variables': ['x1', 'x2'],
38 | }
39 | test_df = pd.DataFrame({
40 | 'x1': [4],
41 | 'x2': [7],
42 | })
43 |
44 | svr = SVR(options)
45 | svr.feature_variables = options['feature_variables']
46 | svr.target_variable = options['target_variable'][0]
47 | svr.fit(training_df, options)
48 | output = svr.apply(test_df, options)
49 | np.testing.assert_approx_equal(output['predicted(y)'].values, np.array([1.1]))
50 |
51 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/AdaBoostRegressor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pandas import DataFrame
4 | from sklearn.ensemble import AdaBoostRegressor as _AdaBoostRegressor
5 |
6 | from base import RegressorMixin, BaseAlgo
7 | from util.param_util import convert_params
8 | from util.algo_util import handle_max_features
9 | from codec import codecs_manager
10 |
11 |
12 | class AdaBoostRegressor(RegressorMixin, BaseAlgo):
13 | def __init__(self, options):
14 | self.handle_options(options)
15 | params = options.get('params', {})
16 | out_params = convert_params(
17 | params,
18 |             strs=['loss'],
19 | floats=['learning_rate'],
20 | ints=['n_estimators'],
21 | )
22 |
23 | self.estimator = _AdaBoostRegressor(**out_params)
24 |
25 |
26 | @staticmethod
27 | def register_codecs():
28 | from codec.codecs import SimpleObjectCodec, TreeCodec
29 |
30 |         codecs_manager.add_codec('algos_contrib.AdaBoostRegressor', 'AdaBoostRegressor', SimpleObjectCodec)
31 |         codecs_manager.add_codec('sklearn.ensemble.weight_boosting', 'AdaBoostRegressor', SimpleObjectCodec)
32 |         codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeRegressor', SimpleObjectCodec)
33 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
34 | 
--------------------------------------------------------------------------------
/src/bin/algos_contrib/ExtraTreesRegressor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pandas import DataFrame
4 | from sklearn.ensemble import ExtraTreesRegressor as _ExtraTreesRegressor
5 |
6 | from base import RegressorMixin, BaseAlgo
7 | from util.param_util import convert_params
8 | from util.algo_util import handle_max_features
9 | from codec import codecs_manager
10 |
11 |
12 | class ExtraTreesRegressor(RegressorMixin, BaseAlgo):
13 | def __init__(self, options):
14 | self.handle_options(options)
15 | params = options.get('params', {})
16 | out_params = convert_params(
17 | params,
18 |             floats=['max_samples', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'min_impurity_split', 'min_impurity_decrease'],
19 |             bools=['bootstrap', 'oob_score', 'warm_start'],
20 |             ints=['n_estimators', 'max_depth', 'max_leaf_nodes'],
21 | strs=['criterion'],
22 | )
23 |
24 | self.estimator = _ExtraTreesRegressor(**out_params)
25 |
26 |
27 | @staticmethod
28 | def register_codecs():
29 | from codec.codecs import SimpleObjectCodec, TreeCodec
30 |
31 |         codecs_manager.add_codec('algos_contrib.ExtraTreesRegressor', 'ExtraTreesRegressor', SimpleObjectCodec)
32 | codecs_manager.add_codec('sklearn.ensemble.forest', 'ExtraTreesRegressor', SimpleObjectCodec)
33 | codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeRegressor', SimpleObjectCodec)
34 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
35 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/SavgolFilter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.signal import savgol_filter
3 |
4 | from base import BaseAlgo
5 | from util.param_util import convert_params
6 | from util import df_util
7 |
8 |
9 | class SavgolFilter(BaseAlgo):
10 |
11 | def __init__(self, options):
12 | # set parameters
13 | params = options.get('params', {})
14 | out_params = convert_params(
15 | params,
16 | ints=['window_length', 'polyorder', 'deriv']
17 | )
18 |
19 | # set defaults for parameters
20 | if 'window_length' in out_params:
21 | self.window_length = out_params['window_length']
22 | else:
23 | self.window_length = 5
24 |
25 | if 'polyorder' in out_params:
26 | self.polyorder = out_params['polyorder']
27 | else:
28 | self.polyorder = 2
29 |
30 | if 'deriv' in out_params:
31 | self.deriv = out_params['deriv']
32 | else:
33 | self.deriv = 0
34 |
35 | def fit(self, df, options):
36 | X = df.copy()
37 | X, nans, columns = df_util.prepare_features(X, self.feature_variables)
38 |
39 | def f(x):
40 | return savgol_filter(x, self.window_length, self.polyorder, self.deriv)
41 |
42 | y_hat = np.apply_along_axis(f, 0, X)
43 |
44 | names = ['SG_%s' % col for col in columns]
45 | output_df = df_util.create_output_dataframe(y_hat, nans, names)
46 | df = df_util.merge_predictions(df, output_df)
47 |
48 | return df
--------------------------------------------------------------------------------
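
For intuition, a standalone sketch of the underlying scipy call. Note that scipy requires window_length to be a positive odd integer greater than polyorder, a constraint the defaults above (5 and 2) satisfy:

    import numpy as np
    from scipy.signal import savgol_filter

    x = np.array([1.0, 2.0, 1.5, 3.0, 2.5, 4.0, 3.5])
    # Smooth with a 5-point window and a quadratic fit, matching the defaults above
    smoothed = savgol_filter(x, window_length=5, polyorder=2, deriv=0)
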
/src/bin/algos_contrib/BaggingRegressor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pandas import DataFrame
4 | from sklearn.ensemble import BaggingRegressor as _BaggingRegressor
5 |
6 | from base import RegressorMixin, BaseAlgo
7 | from util.param_util import convert_params
8 | from util.algo_util import handle_max_features
9 | from codec import codecs_manager
10 |
11 |
12 | class BaggingRegressor(RegressorMixin, BaseAlgo):
13 | def __init__(self, options):
14 | self.handle_options(options)
15 | params = options.get('params', {})
16 | out_params = convert_params(
17 | params,
18 | floats=['max_samples', 'max_features'],
19 | bools=['bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'],
20 | ints=['n_estimators'],
21 | )
22 |
23 | self.estimator = _BaggingRegressor(**out_params)
24 |
25 |
26 | @staticmethod
27 | def register_codecs():
28 | from codec.codecs import SimpleObjectCodec, TreeCodec
29 |
30 |         codecs_manager.add_codec('algos_contrib.BaggingRegressor', 'BaggingRegressor', SimpleObjectCodec)
31 |         codecs_manager.add_codec('sklearn.ensemble.bagging', 'BaggingRegressor', SimpleObjectCodec)
32 |         codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeRegressor', SimpleObjectCodec)
33 |         codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
34 | 
--------------------------------------------------------------------------------
/src/bin/algos_contrib/QuantileTransformer.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 |
4 | import pandas as pd
5 | from sklearn.preprocessing import QuantileTransformer as _QuantileTransformer
6 |
7 | from base import BaseAlgo, TransformerMixin
8 | from codec import codecs_manager
9 | from util.param_util import convert_params
10 | from util import df_util
11 |
12 |
13 | class QuantileTransformer(TransformerMixin, BaseAlgo):
14 |
15 | def __init__(self, options):
16 | self.handle_options(options)
17 |
18 | out_params = convert_params(
19 | options.get('params', {}),
20 | bools=['copy'],
21 | ints=['n_quantiles'],
22 | strs=['output_distribution']
23 | )
24 | self.estimator = _QuantileTransformer(**out_params)
25 | self.columns = None
26 |
27 | def rename_output(self, default_names, new_names=None):
28 | if new_names is None:
29 | new_names = 'QT'
30 | output_names = [new_names + '_' + feature for feature in self.columns]
31 | return output_names
32 |
33 | def summary(self, options):
34 | if len(options) != 2: # only model name and mlspl_limits
35 | raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
36 | return pd.DataFrame({'fields': self.columns})
37 |
38 | @staticmethod
39 | def register_codecs():
40 | from codec.codecs import SimpleObjectCodec
41 |         codecs_manager.add_codec('algos_contrib.QuantileTransformer', 'QuantileTransformer', SimpleObjectCodec)
42 | codecs_manager.add_codec('sklearn.preprocessing.data', 'QuantileTransformer', SimpleObjectCodec)
43 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/LatentDirichletAllocation.py:
--------------------------------------------------------------------------------
1 | '''
2 | Once a newer version of sklearn is used, the k alias will need to change from n_topics to n_components:
3 | https://stackoverflow.com/a/48121678
4 | '''
5 |
6 | from sklearn.decomposition import LatentDirichletAllocation as _LatentDirichletAllocation
7 | from base import BaseAlgo, TransformerMixin
8 | from codec import codecs_manager
9 | from util.param_util import convert_params
10 |
11 | class LatentDirichletAllocation(TransformerMixin, BaseAlgo):
12 |
13 | def __init__(self, options):
14 | self.handle_options(options)
15 | out_params = convert_params(
16 | options.get('params', {}),
17 | floats=['doc_topic_prior','learning_decay','learning_offset','perp_tol','mean_change_tol'],
18 | strs=['learning_method'],
19 | ints=['k','max_iter','batch_size','evaluate_every','total_samples','max_doc_update_iter','n_jobs','verbose','random_state'],
20 | aliases={'k': 'n_topics'}
21 | )
22 |
23 | self.estimator = _LatentDirichletAllocation(**out_params)
24 |
25 | def rename_output(self, default_names, new_names):
26 | if new_names is None:
27 | new_names = 'LDA'
28 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
29 | return output_names
30 |
31 | @staticmethod
32 | def register_codecs():
33 | from codec.codecs import SimpleObjectCodec
34 | codecs_manager.add_codec('algos_contrib.LatentDirichletAllocation', 'LatentDirichletAllocation', SimpleObjectCodec)
35 | codecs_manager.add_codec('sklearn.decomposition.online_lda', 'LatentDirichletAllocation', SimpleObjectCodec)
36 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/OrthogonalMatchingPursuit.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.linear_model import OrthogonalMatchingPursuit as _OrthogonalMatchingPursuit
3 | from base import RegressorMixin, BaseAlgo
4 | from util.param_util import convert_params
5 | from util import df_util
6 |
7 |
8 | class OrthogonalMatchingPursuit(RegressorMixin, BaseAlgo):
9 | def __init__(self, options):
10 | self.handle_options(options)
11 |
12 | params = options.get('params', {})
13 | out_params = convert_params(
14 | params,
15 | floats=['tol'],
16 |             ints=['n_nonzero_coefs'],
17 |             bools=['fit_intercept', 'normalize'],
18 |         )
19 | 
20 |         self.estimator = _OrthogonalMatchingPursuit(**out_params)
21 | 
22 |     def summary(self, options):
23 |         if len(options) != 2:  # only model name and mlspl_limits
24 |             raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
25 |         df = pd.DataFrame({'feature': self.columns,
26 |                            'coefficient': self.estimator.coef_.ravel()})
27 |         idf = pd.DataFrame({'feature': ['_intercept'],
28 |                             'coefficient': [self.estimator.intercept_]})
29 |         return pd.concat([df, idf])
30 | 
31 |     @staticmethod
32 |     def register_codecs():
33 |         from codec.codecs import SimpleObjectCodec
34 |         from codec import codecs_manager
35 |         codecs_manager.add_codec('algos_contrib.OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuit', SimpleObjectCodec)
36 |         codecs_manager.add_codec('sklearn.linear_model.omp', 'OrthogonalMatchingPursuit', SimpleObjectCodec)
37 | 
38 | 
--------------------------------------------------------------------------------
/src/bin/algos_contrib/ExtraTreesClassifier.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pandas import DataFrame
4 | from sklearn.ensemble import ExtraTreesClassifier as _ExtraTreesClassifier
5 |
6 | from base import ClassifierMixin, BaseAlgo
7 | from codec import codecs_manager
8 | from util.param_util import convert_params
9 | from util.algo_util import handle_max_features
10 |
11 |
12 | class ExtraTreesClassifier(ClassifierMixin, BaseAlgo):
13 |
14 | def __init__(self, options):
15 | self.handle_options(options)
16 |
17 | out_params = convert_params(
18 | options.get('params', {}),
19 | ints=['random_state', 'n_estimators', 'max_depth',
20 | 'min_samples_split', 'max_leaf_nodes'],
21 | strs=['max_features', 'criterion'],
22 | )
23 |
24 | if 'max_depth' not in out_params:
25 | out_params.setdefault('max_leaf_nodes', 2000)
26 |
27 | if 'max_features' in out_params:
28 | out_params['max_features'] = handle_max_features(out_params['max_features'])
29 |
30 | self.estimator = _ExtraTreesClassifier(class_weight='balanced',
31 | **out_params)
32 |
33 | def summary(self, options):
34 | if len(options) != 2: # only model name and mlspl_limits
35 | raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
36 | df = DataFrame({
37 | 'feature': self.columns,
38 | 'importance': self.estimator.feature_importances_.ravel()
39 | })
40 | return df
41 |
42 | @staticmethod
43 | def register_codecs():
44 | from codec.codecs import SimpleObjectCodec, TreeCodec
45 | codecs_manager.add_codec('algos_contrib.ExtraTreesClassifier',
46 | 'ExtraTreesClassifier', SimpleObjectCodec)
47 | codecs_manager.add_codec('sklearn.ensemble.forest',
48 | 'ExtraTreesClassifier', SimpleObjectCodec)
49 | codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeClassifier',
50 | SimpleObjectCodec)
51 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
52 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/CorrelationMatrix.py:
--------------------------------------------------------------------------------
1 | from base import BaseAlgo
2 |
3 |
4 | class CorrelationMatrix(BaseAlgo):
5 | """Compute and return a correlation matrix."""
6 |
7 | def __init__(self, options):
8 | """Check for valid correlation type, and save it to an attribute on self."""
9 |
10 | feature_variables = options.get('feature_variables', {})
11 | target_variable = options.get('target_variable', {})
12 |
13 | if len(feature_variables) == 0:
14 | raise RuntimeError('You must supply one or more fields')
15 |
16 | if len(target_variable) > 0:
17 | raise RuntimeError('CorrelationMatrix does not support the from clause')
18 |
19 | valid_methods = ['spearman', 'kendall', 'pearson']
20 |
21 |         # Check to see if parameters exist
22 |         params = options.get('params', {})
23 | 
24 |         # Pop the method out of the params (defaulting to pearson)
25 |         # and validate it against the whitelist
26 |         self.method = params.pop('method', 'pearson')
27 |         if self.method not in valid_methods:
28 |             error_msg = 'Invalid value for method: must be one of {}'.format(
29 |                 ', '.join(valid_methods))
30 |             raise RuntimeError(error_msg)
31 | 
32 |         # Any leftover parameters are invalid
33 |         if len(params) > 0:
34 |             raise RuntimeError('The only valid parameter is method.')
35 | 
36 |     def fit(self, df, options):
37 |         """Compute the correlations and return a DataFrame."""
38 | 
39 |         # df contains all the search results, including hidden fields,
40 |         # but the requested fields are saved as self.feature_variables
41 |         requested_columns = df[self.feature_variables]
42 | 
43 |         # Get correlations
44 |         correlations = requested_columns.corr(method=self.method)
45 | 
46 |         # Reset index so that all the data are in columns
47 |         # (the corr method returns the fields as the index)
48 |         output_df = correlations.reset_index()
49 | 
50 |         return output_df
51 | 
--------------------------------------------------------------------------------
/src/bin/algos_contrib/MinMaxScaler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import pandas as pd
4 | from sklearn.preprocessing import MinMaxScaler as _MinMaxScaler
5 |
6 | from base import BaseAlgo, TransformerMixin
7 | from codec import codecs_manager
8 | from util.param_util import convert_params
9 | from util import df_util
10 |
11 |
12 | class MinMaxScaler(TransformerMixin, BaseAlgo):
13 |
14 | def __init__(self, options):
15 | self.handle_options(options)
16 |
17 |         out_params = convert_params(
18 |             options.get('params', {}),
19 |             bools=['copy'],
20 |             strs=['feature_range']
21 |         )
22 | 
23 |         # feature_range arrives from the search as a string such as "(0, 1)";
24 |         # convert it into the tuple that sklearn expects
25 |         if 'feature_range' in out_params:
26 |             try:
27 |                 from ast import literal_eval
28 |                 out_params['feature_range'] = tuple(literal_eval(out_params['feature_range']))
29 |             except (ValueError, SyntaxError, TypeError):
30 |                 raise RuntimeError('Invalid value for option feature_range: "%s"' % out_params['feature_range'])
31 | 
32 |         self.estimator = _MinMaxScaler(**out_params)
33 |         self.columns = None
34 | 
35 |     def rename_output(self, default_names, new_names=None):
36 |         if new_names is None:
37 |             new_names = 'MMS'
38 |         output_names = [new_names + '_' + feature for feature in self.columns]
39 |         return output_names
40 | 
41 |     def partial_fit(self, df, options):
42 |         # Make a copy of data, to not alter original dataframe
43 |         X = df.copy()
44 | 
45 |         X, _, columns = df_util.prepare_features(
46 |             X=X,
47 |             variables=self.feature_variables,
48 |             mlspl_limits=options.get('mlspl_limits'),
49 |         )
50 |         if self.columns is not None:
51 |             df_util.handle_new_categorical_values(X, None, options, self.columns)
52 |             if X.empty:
53 |                 return
54 |         else:
55 |             self.columns = columns
56 |         self.estimator.partial_fit(X)
57 | 
58 |     def summary(self, options):
59 |         if len(options) != 2:  # only model name and mlspl_limits
60 |             raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__)
61 |         return pd.DataFrame({'fields': self.columns,
62 |                              'min': self.estimator.data_min_,
63 |                              'max': self.estimator.data_max_,
64 |                              'scale': self.estimator.scale_})
65 | 
66 |     @staticmethod
67 |     def register_codecs():
68 |         from codec.codecs import SimpleObjectCodec
69 |         codecs_manager.add_codec('algos_contrib.MinMaxScaler', 'MinMaxScaler', SimpleObjectCodec)
70 |         codecs_manager.add_codec('sklearn.preprocessing.data', 'MinMaxScaler', SimpleObjectCodec)
71 | 
--------------------------------------------------------------------------------
/src/bin/algos_contrib/MDS.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from sklearn.manifold import MDS as _MDS
4 |
5 | from base import BaseAlgo, TransformerMixin
6 | from codec import codecs_manager
7 | from util.param_util import convert_params
8 |
9 | from util import df_util
10 |
11 | class MDS(TransformerMixin, BaseAlgo):
12 |
13 | def __init__(self, options):
14 | self.handle_options(options)
15 | out_params = convert_params(
16 | options.get('params', {}),
17 | ints=['k', 'max_iter', 'n_init', 'n_jobs'],
18 | floats=['eps'],
19 | bools=['metric'],
20 | aliases={'k': 'n_components'}
21 | )
22 |
23 | if 'max_iter' not in out_params:
24 | out_params.setdefault('max_iter', 300)
25 |
26 | if 'n_init' not in out_params:
27 | out_params.setdefault('n_init', 4)
28 |
29 | if 'n_jobs' not in out_params:
30 | out_params.setdefault('n_jobs', 1)
31 |
32 | if 'eps' not in out_params:
33 | out_params.setdefault('eps', 0.001)
34 |
35 | if 'metric' not in out_params:
36 | out_params.setdefault('metric', True)
37 |
38 | self.estimator = _MDS(**out_params)
39 |
40 | def rename_output(self, default_names, new_names):
41 | if new_names is None:
42 | new_names = 'MDS'
43 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
44 | return output_names
45 |
46 | def apply(self, df, options):
47 | # Make a copy of data, to not alter original dataframe
48 | X = df.copy()
49 |
50 | # Prepare the features
51 | X, nans, _ = df_util.prepare_features(
52 | X=X,
53 | variables=self.feature_variables,
54 | final_columns=self.columns,
55 | )
56 |
57 | # Call the transform method
58 | y_hat = self.estimator.fit_transform(X.values)
59 |
60 | # Assign output_name
61 | output_name = options.get('output_name', None)
62 | default_names = self.make_output_names(
63 | output_name=output_name,
64 | n_names=y_hat.shape[1],
65 | )
66 | output_names = self.rename_output(default_names, output_name)
67 |
68 | # Create output dataframe
69 | output = df_util.create_output_dataframe(
70 | y_hat=y_hat,
71 | nans=nans,
72 | output_names=output_names,
73 | )
74 |
75 | # Merge with original dataframe
76 | output = df_util.merge_predictions(df, output)
77 | return output
78 |
79 | @staticmethod
80 | def register_codecs():
81 | from codec.codecs import SimpleObjectCodec
82 | codecs_manager.add_codec('algos_contrib.MDS', 'MDS', SimpleObjectCodec)
83 |         codecs_manager.add_codec('sklearn.manifold.mds_', 'MDS', SimpleObjectCodec)
84 |
--------------------------------------------------------------------------------
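
sklearn's MDS (like TSNE below) offers no transform method for unseen data, which is why apply() above calls fit_transform on whatever data it receives. A minimal standalone sketch of the same call:

    import numpy as np
    from sklearn.manifold import MDS

    X = np.random.RandomState(0).rand(10, 3)
    embedding = MDS(n_components=2, random_state=0).fit_transform(X)  # shape (10, 2)
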
/src/bin/algos_contrib/TSNE.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from sklearn.manifold import TSNE as _TSNE
4 |
5 | from base import BaseAlgo, TransformerMixin
6 | from codec import codecs_manager
7 | from util.param_util import convert_params
8 |
9 | from util import df_util
10 |
11 | class TSNE(TransformerMixin, BaseAlgo):
12 |
13 | def __init__(self, options):
14 | self.handle_options(options)
15 | out_params = convert_params(
16 | options.get('params', {}),
17 | ints=['k', 'n_iter'],
18 | floats=['perplexity', 'early_exaggeration', 'learning_rate'],
19 | aliases={'k': 'n_components'}
20 | )
21 |
22 |         if 'n_components' in out_params and out_params['n_components'] < 1:
23 | msg = 'Invalid value for k: k must be greater than or equal to 1, but found k="{}".'
24 | raise RuntimeError(msg.format(out_params['n_components']))
25 |
26 | if 'n_iter' not in out_params:
27 | out_params.setdefault('n_iter', 200)
28 |
29 | if 'perplexity' not in out_params:
30 | out_params.setdefault('perplexity', 30.0)
31 |
32 | if 'early_exaggeration' not in out_params:
33 | out_params.setdefault('early_exaggeration', 4.0)
34 |
35 | if 'learning_rate' not in out_params:
36 | out_params.setdefault('learning_rate', 100)
37 |
38 | self.estimator = _TSNE(**out_params)
39 |
40 | def rename_output(self, default_names, new_names):
41 | if new_names is None:
42 | new_names = 'TSNE'
43 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))]
44 | return output_names
45 |
46 | def apply(self, df, options):
47 | # Make a copy of data, to not alter original dataframe
48 | X = df.copy()
49 |
50 | # Prepare the features
51 | X, nans, _ = df_util.prepare_features(
52 | X=X,
53 | variables=self.feature_variables,
54 | final_columns=self.columns,
55 | )
56 |
57 | # Call the transform method
58 | y_hat = self.estimator.fit_transform(X.values)
59 |
60 | # Assign output_name
61 | output_name = options.get('output_name', None)
62 | default_names = self.make_output_names(
63 | output_name=output_name,
64 | n_names=y_hat.shape[1],
65 | )
66 | output_names = self.rename_output(default_names, output_name)
67 |
68 | # Create output dataframe
69 | output = df_util.create_output_dataframe(
70 | y_hat=y_hat,
71 | nans=nans,
72 | output_names=output_names,
73 | )
74 |
75 | # Merge with original dataframe
76 | output = df_util.merge_predictions(df, output)
77 | return output
78 |
79 | @staticmethod
80 | def register_codecs():
81 | from codec.codecs import SimpleObjectCodec
82 | codecs_manager.add_codec('algos_contrib.TSNE', 'TSNE', SimpleObjectCodec)
83 | codecs_manager.add_codec('sklearn.manifold.t_sne', 'TSNE', SimpleObjectCodec)
84 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/CustomDecisionTreeClassifier.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from sklearn.tree import DecisionTreeClassifier as _DecisionTreeClassifier
4 | from base import ClassifierMixin, BaseAlgo
5 | from codec import codecs_manager
6 | from util.param_util import convert_params
7 | from util.algo_util import tree_summary
8 |
9 | # This algorithm is an updated version of the MLTK's DecisionTreeClassifier, with a class_weight parameter added to it.
10 |
11 | class CustomDecisionTreeClassifier(ClassifierMixin, BaseAlgo):
12 | def __init__(self, options):
13 | self.handle_options(options)
14 |
15 | out_params = convert_params(
16 | options.get('params', {}),
17 | ints=['random_state', 'max_depth', 'min_samples_split', 'max_leaf_nodes'],
18 | strs=['criterion', 'splitter', 'max_features', 'class_weight'],
19 | )
20 |
21 | # whitelist valid values for criterion, as error raised by sklearn for invalid values is uninformative
22 | if 'criterion' in out_params:
23 | try:
24 | assert (out_params['criterion'] in ['gini', 'entropy'])
25 | except AssertionError:
26 | raise RuntimeError('Invalid value for option criterion: "%s"' % out_params['criterion'])
27 |
28 | # whitelist valid values for splitter, as error raised by sklearn for invalid values is uninformative
29 | if 'splitter' in out_params:
30 | try:
31 | assert (out_params['splitter'] in ['best', 'random'])
32 | except AssertionError:
33 | raise RuntimeError('Invalid value for option splitter: "%s"' % out_params['splitter'])
34 |
35 | if 'max_depth' not in out_params:
36 | out_params.setdefault('max_leaf_nodes', 2000)
37 |
38 | # EAFP... convert max_features to int or float if it is a number.
39 | try:
40 | out_params['max_features'] = float(out_params['max_features'])
41 | max_features_int = int(out_params['max_features'])
42 | if out_params['max_features'] == max_features_int:
43 | out_params['max_features'] = max_features_int
44 |         except (ValueError, KeyError):
45 | pass
46 |
47 | if 'class_weight' in out_params:
48 | try:
49 | from ast import literal_eval
50 | out_params['class_weight'] = literal_eval(out_params['class_weight'])
51 | except Exception:
52 | raise RuntimeError('Invalid value for option class_weight: "%s"' % out_params['class_weight'])
53 |
54 | self.estimator = _DecisionTreeClassifier(**out_params)
55 |
56 | def summary(self, options):
57 | if 'args' in options:
58 | raise RuntimeError('Summarization does not take values other than parameters')
59 | return tree_summary(self, options)
60 |
61 | @staticmethod
62 | def register_codecs():
63 | from codec.codecs import SimpleObjectCodec, TreeCodec
64 | codecs_manager.add_codec('algos_contrib.CustomDecisionTreeClassifier', 'CustomDecisionTreeClassifier', SimpleObjectCodec)
65 | codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeClassifier', SimpleObjectCodec)
66 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
67 |
--------------------------------------------------------------------------------
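
The class_weight handling above relies on ast.literal_eval to turn the string received from the search into a Python object sklearn can consume. A small sketch of that conversion (the example string is illustrative):

    from ast import literal_eval

    # A class_weight option arrives as text, e.g. class_weight="{0: 1, 1: 5}"
    parsed = literal_eval('{0: 1, 1: 5}')
    assert parsed == {0: 1, 1: 5}  # now a dict, usable as DecisionTreeClassifier(class_weight=parsed)
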
/src/bin/algos_contrib/AgglomerativeClustering.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.cluster import AgglomerativeClustering as AgClustering
3 | from sklearn.metrics import silhouette_samples
4 |
5 | from base import BaseAlgo
6 | from util.param_util import convert_params
7 | from util import df_util
8 |
9 |
10 | class AgglomerativeClustering(BaseAlgo):
11 | """Use scikit-learn's AgglomerativeClustering algorithm to cluster data."""
12 |
13 | def __init__(self, options):
14 |
15 | feature_variables = options.get('feature_variables', {})
16 | target_variable = options.get('target_variable', {})
17 |
18 | # Ensure fields are present
19 | if len(feature_variables) == 0:
20 | raise RuntimeError('You must supply one or more fields')
21 |
22 | # No from clause allowed
23 | if len(target_variable) > 0:
24 | raise RuntimeError('AgglomerativeClustering does not support the from clause')
25 |
26 | # Convert params & alias k to n_clusters
27 | params = options.get('params', {})
28 | out_params = convert_params(
29 | params,
30 | ints=['k'],
31 | strs=['linkage', 'affinity'],
32 | aliases={'k': 'n_clusters'}
33 | )
34 |
35 | # Check for valid linkage
36 | if 'linkage' in out_params:
37 | valid_linkage = ['ward', 'complete', 'average']
38 | if out_params['linkage'] not in valid_linkage:
39 | raise RuntimeError('linkage must be one of: {}'.format(', '.join(valid_linkage)))
40 |
41 | # Check for valid affinity
42 | if 'affinity' in out_params:
43 | valid_affinity = ['l1', 'l2', 'cosine', 'manhattan',
44 | 'precomputed', 'euclidean']
45 |
46 | if out_params['affinity'] not in valid_affinity:
47 | raise RuntimeError('affinity must be one of: {}'.format(', '.join(valid_affinity)))
48 |
49 | # Check for invalid affinity & linkage combination
50 | if 'linkage' in out_params and 'affinity' in out_params:
51 | if out_params['linkage'] == 'ward':
52 | if out_params['affinity'] != 'euclidean':
53 | raise RuntimeError('ward linkage (default) must use euclidean affinity (default)')
54 |
55 | # Initialize the estimator
56 | self.estimator = AgClustering(**out_params)
57 |
58 | def fit(self, df, options):
59 | """Do the clustering & merge labels with original data."""
60 | # Make a copy of the input data
61 | X = df.copy()
62 |
63 | # Use the df_util prepare_features method to
64 | # - drop null columns & rows
65 | # - convert categorical columns into dummy indicator columns
66 | # X is our cleaned data, nans is a mask of the null value locations
67 | X, nans, columns = df_util.prepare_features(X, self.feature_variables)
68 |
69 | # Do the actual clustering
70 | y_hat = self.estimator.fit_predict(X.values)
71 |
72 | # attach silhouette coefficient score for each row
73 | silhouettes = silhouette_samples(X, y_hat)
74 |
75 | # Combine the two arrays, and transpose them.
76 | y_hat = np.vstack([y_hat, silhouettes]).T
77 |
78 | # Assign default output names
79 | default_name = 'cluster'
80 |
81 | # Get the value from the as-clause if present
82 | output_name = options.get('output_name', default_name)
83 |
84 |         # There are two columns - one for the labels, one for the silhouette scores
85 | output_names = [output_name, 'silhouette_score']
86 |
87 | # Use the predictions & nans-mask to create a new dataframe
88 | output_df = df_util.create_output_dataframe(y_hat, nans, output_names)
89 |
90 | # Merge the dataframe with the original input data
91 | df = df_util.merge_predictions(df, output_df)
92 | return df
93 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/CollaborativeFilter.py:
--------------------------------------------------------------------------------
1 |
2 | from base import BaseAlgo
3 | import pandas as pd
4 | import numpy as np
5 |
6 | from sklearn.metrics.pairwise import pairwise_distances
7 | from cexc import get_logger
8 | from util import df_util
9 | from util.param_util import convert_params
10 |
11 | # Everyone's favorite in-memory collaborative filter; not a scalable solution for millions of users and millions of items.
12 | # https://en.wikipedia.org/wiki/Collaborative_filtering
13 | # For more scalable solutions, check out KNN or "Recommender Systems: The Textbook".
14 | # TODO add coldstart solution for nulls
15 | # TODO currently we assume a |fillnull value=0 is run in splunk prior to calling the algorithm
16 |
17 | # We ASSUME rows are users, columns are items.
18 | # TODO I seem to cause splunk memory issues with wide tables, so I should consider doing an XYSERIES like reshape
19 | # TODO and consider taking in a table of USERID, ITEM , RATING from splunk. Yucky.
20 |
21 | # TODO There are many many many other distance metrics that could be a good fit.
22 |
23 |
24 | class CollaborativeFilter(BaseAlgo):
25 | def __init__(self, options):
26 |
27 |
28 | # set parameters
29 | params = options.get('params', {})
30 | out_params = convert_params(
31 | params,
32 | strs=['user_field','rating_type','coldstart_field']
33 | )
34 |
35 | # set defaults for parameters
36 | if 'user_field' in out_params:
37 | self.user_field = out_params['user_field']
38 | else:
39 | self.user_field = "SME"
40 |
41 | self.rating_type="item"
42 | if 'rating_type' in out_params:
43 | if out_params['rating_type'] == "item":
44 | self.rating_type="item"
45 | elif out_params['rating_type'] == "user":
46 | self.rating_type="user"
47 |
48 |
49 | def fit(self, df, options):
50 |         # df contains all the search results, including hidden fields,
51 |         # but the requested fields are saved as self.feature_variables
52 | logger = get_logger('MyCustomLogging')
53 |
54 | X=df.copy()
55 |
56 |         # It is always best practice to prepare your data.
57 |         # Splunk has a number of hidden fields that are exposed as part of the search protocol, and we really only
58 |         # want the features that are valid field names.
59 |
60 |
61 | #Make sure to turn off get_dummies
62 | X, _, self.columns = df_util.prepare_features(
63 | X=X,
64 | variables=self.feature_variables,
65 | get_dummies=False,
66 | mlspl_limits=options.get('mlspl_limits'),
67 | )
68 |
69 | # test if user field is in the list
70 | logger.debug("The user field is %s",self.user_field )
71 |         try:
72 |             my_list_index = X[self.user_field].values
73 |         except KeyError:
74 |             raise RuntimeError('You must specify a user field that exists. You sent: %s' % self.user_field)
75 |
76 | X=X.drop([self.user_field],axis=1)
77 | my_list_header=(X.columns.values)
78 |
79 | #ratings as a matrix , clean that data up!
80 | X=X.replace([np.inf, -np.inf], "nan").replace("nan","0")
81 | matrix=X.values
82 | # force type for Numpy Math
83 | matrix=matrix.astype(np.float64)
84 |
85 | # should consider erroring out when you have super sparse user data
86 | # TODO add other methods via parameter
87 | user_sim = pairwise_distances(matrix, metric='cosine')
88 | item_sim = pairwise_distances(matrix.T, metric='cosine')
89 |
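        # Standard memory-based CF predictions:
        #   item-based: pred(u, i) = sum_j r(u, j) * w(i, j) / sum_j |w(i, j)|
        #   user-based: pred(u, i) = mean(r_u) + sum_v w(u, v) * (r(v, i) - mean(r_v)) / sum_v |w(u, v)|
        # Note: pairwise_distances with metric='cosine' returns cosine *distance*
        # (1 - similarity), which the code below uses directly as the weight w.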
90 | #item prediction
91 | item_sim= matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)])
92 |
93 | #user sim
94 | mean_user_rating = matrix.mean(axis=1)
95 | matrix_diff = (matrix - mean_user_rating[:, np.newaxis])
96 | user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T
97 |
98 | # add back into the matrix the header row
99 | if self.rating_type == "item":
100 | output_df=pd.DataFrame(item_sim,columns=my_list_header, index=my_list_index)
101 | if self.rating_type == "user":
102 | output_df=pd.DataFrame(user_sim,columns=my_list_header, index=my_list_index)
103 | output_df[self.user_field]=pd.Series(my_list_index).values
104 |
105 | return output_df
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/TFBinary.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | Copy of existing TFIDF algo but with 2 boolean options added and 3 options set
4 | so that binary output is achieved.
5 | '''
6 |
7 | from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizer
8 |
9 | from base import BaseAlgo
10 | from codec import codecs_manager
11 | from util import df_util
12 | from util.param_util import convert_params
13 |
14 |
15 | class TFBinary(BaseAlgo):
16 |
17 | def handle_options(self, options):
18 | if len(options.get('feature_variables', [])) != 1 or len(options.get('target_variable', [])) > 0:
19 | raise RuntimeError('Syntax error: You must specify exactly one field')
20 |
21 | def __init__(self, options):
22 | self.handle_options(options)
23 |
24 | out_params = convert_params(
25 | options.get('params', {}),
26 | ints=['max_features'],
27 | bools=['use_idf','binary'],
28 | strs=['max_df', 'min_df',
29 | 'ngram_range', 'stop_words',
30 | 'analyzer', 'norm', 'token_pattern'],
31 | )
32 |
33 | for doc_freq, default_val in [('max_df', 1.0), ('min_df', 1)]:
34 | if doc_freq in out_params:
35 | # EAFP... convert max_df/min_df to float/int if it is a number.
36 | try:
37 | float_val = float(out_params[doc_freq])
38 | int_val = int(float_val)
39 | except ValueError:
40 | raise RuntimeError('Syntax Error: {doc_freq} requires a numeric value, e.g. {doc_freq}=1.0'.format(doc_freq=doc_freq))
41 | if float_val == 1.0:
42 | out_params[doc_freq] = default_val
43 | elif float_val == int_val:
44 | out_params[doc_freq] = int_val
45 | else:
46 | out_params[doc_freq] = float_val
47 |
48 |         if 'ngram_range' in out_params:
49 |             try:
50 |                 out_params['ngram_range'] = tuple(int(i) for i in out_params['ngram_range'].split('-'))
51 |                 assert len(out_params['ngram_range']) == 2
52 |             except (ValueError, AssertionError):
53 |                 raise RuntimeError('Syntax Error: ngram_range requires a range, e.g. ngram_range=1-5')
54 |
55 | # TODO: Maybe let the user know that we make this change.
56 | out_params.setdefault('max_features', 100)
57 |
58 | # Binary defaults
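        # With use_idf=False, norm=None, and binary=True, TfidfVectorizer degenerates
        # into a term-presence encoder: each output cell is 1 if the term occurs in
        # the event and 0 otherwise, which is the binary output described above.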
59 | out_params.setdefault('use_idf', False)
60 | out_params.setdefault('norm', None)
61 | out_params.setdefault('binary', True)
62 |
63 | self.estimator = _TfidfVectorizer(**out_params)
64 |
65 | def fit(self, df, options):
66 | # Make a copy of data, to not alter original dataframe
67 | X = df.copy()
68 |
69 | # Make sure to turn off get_dummies
70 | X, _, self.columns = df_util.prepare_features(
71 | X=X,
72 | variables=self.feature_variables,
73 | get_dummies=False,
74 | mlspl_limits=options.get('mlspl_limits'),
75 | )
76 |
77 | X = X.values.ravel().astype('str')
78 | self.estimator.fit(X)
79 |
80 | def make_output_names(self, options):
81 | default_name = self.feature_variables[0] + '_tfbin'
82 | output_name = options.get('output_name', default_name)
83 | feature_names = self.estimator.get_feature_names()
84 | output_names = [output_name + '_' + str(index) + '_' + word
85 | for (index, word) in enumerate(feature_names)]
86 | return output_names
87 |
88 | def apply(self, df, options):
89 | # Make a copy of data, to not alter original dataframe
90 | X = df.copy()
91 |
92 | # Make sure to turn off get_dummies
93 | X, nans, _ = df_util.prepare_features(
94 | X=X,
95 | variables=self.feature_variables,
96 | final_columns=self.columns,
97 | get_dummies=False,
98 | mlspl_limits=options.get('mlspl_limits'),
99 | )
100 |
101 | X = X.values.ravel().astype('str')
102 | y_hat = self.estimator.transform(X)
103 |
104 | # Convert the returned sparse matrix into array
105 | y_hat = y_hat.toarray()
106 |
107 | output_names = self.make_output_names(options)
108 |
109 | output = df_util.create_output_dataframe(
110 | y_hat=y_hat,
111 | output_names=output_names,
112 | nans=nans,
113 | )
114 |
115 | df = df_util.merge_predictions(df, output)
116 | return df
117 |
118 | @staticmethod
119 | def register_codecs():
120 | from codec.codecs import SimpleObjectCodec
121 | codecs_manager.add_codec('algos_contrib.TFBinary', 'TFBinary', SimpleObjectCodec)
122 | codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfVectorizer', SimpleObjectCodec)
123 | codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfTransformer', SimpleObjectCodec)
124 | codecs_manager.add_codec('scipy.sparse.dia', 'dia_matrix', SimpleObjectCodec)
125 |
--------------------------------------------------------------------------------
/src/bin/test/test_contrib_util.py:
--------------------------------------------------------------------------------
1 | import mock
2 | import io
3 | import pandas as pd
4 | import pytest
5 | import sys
6 |
7 | from base import BaseAlgo
8 | from util.base_util import MLSPLNotImplementedError
9 |
10 | from contrib_util import AlgoTestUtils
11 |
12 |
13 | @pytest.fixture
14 | def min_algo_cls():
15 | class MinimalAlgo(BaseAlgo):
16 | pass
17 | return MinimalAlgo
18 |
19 |
20 | @pytest.fixture
21 | def serializable_algo_cls():
22 | class SerializableAlgo(BaseAlgo):
23 | def __init__(self, options):
24 | pass
25 |
26 | def fit(self, df, options):
27 | pass
28 |
29 | def apply(self, df, options):
30 | return df
31 |
32 | @classmethod
33 | def register_codecs(cls):
34 | from codec.codecs import SimpleObjectCodec
35 | from codec import codecs_manager
36 | codecs_manager.add_codec('test.test_contrib_util', 'SerializableAlgo', SimpleObjectCodec)
37 |
38 | # Add the class to this module so that encoder and decoder can access it.
39 | # This is only necessary for a fixture function. Normally, these classes will be defined within a module.
40 | setattr(sys.modules[__name__], 'SerializableAlgo', SerializableAlgo)
41 | return SerializableAlgo
42 |
43 |
44 | mock_algo_conf = """
45 | [MinimalAlgo]
46 | package=algos_contrib
47 | """
48 |
49 |
50 | mock_algo_conf_no_package = """
51 | [MinimalAlgo]
52 | """
53 |
54 |
55 | def test_method_signature(min_algo_cls):
56 | AlgoTestUtils.assert_method_signature(min_algo_cls, 'fit', ['self', 'df', 'options'])
57 |
58 |
59 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
60 | def test_registered(mock_get_algos_conf_fp, min_algo_cls):
61 | AlgoTestUtils.assert_registered(min_algo_cls)
62 |
63 |
64 | def test_serializable(serializable_algo_cls):
65 | AlgoTestUtils.assert_serializable(serializable_algo_cls, input_df=pd.DataFrame({}), options={})
66 |
67 |
68 | def test_base_algo_method_signatures_default_methods(min_algo_cls):
69 | AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls)
70 |
71 |
72 | def test_base_algo_method_signatures_all_methods(min_algo_cls):
73 | AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[
74 | '__init__',
75 | 'fit',
76 | 'partial_fit',
77 | 'apply',
78 | 'register_codecs',
79 | ])
80 |
81 |
82 | def test_base_algo_method_signatures_extra_methods(min_algo_cls):
83 | with pytest.raises(AssertionError) as e:
84 | extra_args = [
85 | 'extra1',
86 | 'extra2',
87 | ]
88 | AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[
89 | '__init__',
90 | 'fit',
91 | 'partial_fit',
92 | 'apply',
93 | 'register_codecs',
94 | ] + extra_args)
95 | assert e.match('{}.*not in BaseAlgo'.format(extra_args))
96 |
97 |
98 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
99 | def test_algo_basic(mock_get_algos_conf_fp, min_algo_cls):
100 | AlgoTestUtils.assert_algo_basic(min_algo_cls, serializable=False)
101 |
102 |
103 | def test_no_base_algo():
104 | class NoBaseAlgo(object):
105 | pass
106 |
107 | with pytest.raises(AssertionError) as e:
108 | AlgoTestUtils.assert_base_algo_method_signatures(NoBaseAlgo)
109 | assert e.match('must inherit from BaseAlgo')
110 |
111 |
112 | def test_method_signature_non_existent(min_algo_cls):
113 | bad_method = 'foot'
114 | with pytest.raises(AssertionError) as e:
115 | AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options'])
116 |     assert e.match("{}.*does not exist".format(bad_method))
117 |
118 |
119 | def test_method_signature_not_callable(min_algo_cls):
120 | bad_method = 'fit'
121 |
122 |     # Make fit a non-callable attribute (a plain string).
123 | min_algo_cls.fit = 'fit'
124 |
125 | with pytest.raises(AssertionError) as e:
126 | AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options'])
127 |     assert e.match("{}.*not callable".format(bad_method))
128 |
129 |
130 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf))
131 | def test_unregistered(mock_get_algos_conf_fp):
132 | class UnregisteredAlgo(BaseAlgo):
133 | pass
134 |
135 | with pytest.raises(AssertionError) as e:
136 | AlgoTestUtils.assert_registered(UnregisteredAlgo)
137 | assert e.match('{}.*not registered'.format(UnregisteredAlgo.__name__))
138 |
139 |
140 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf_no_package))
141 | def test_registered_with_missing_package_option(mock_get_algos_conf_fp, min_algo_cls):
142 | with pytest.raises(AssertionError) as e:
143 | AlgoTestUtils.assert_registered(min_algo_cls)
144 | assert e.match('{}.*must override.*package'.format(min_algo_cls.__name__))
145 |
146 |
147 | def test_not_serializable(min_algo_cls):
148 | with pytest.raises(MLSPLNotImplementedError) as e:
149 | AlgoTestUtils.assert_serializable(min_algo_cls, input_df=pd.DataFrame({}), options={})
150 | assert e.match('does not support saving')
151 |
152 |
153 |
--------------------------------------------------------------------------------
/src/bin/algos_contrib/IsolationForest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from sklearn.ensemble import IsolationForest as _IsolationForest
4 | import numpy as np
5 | import pandas as pd
6 |
7 | from base import ClustererMixin, BaseAlgo
8 | from codec import codecs_manager
9 | from codec.codecs import BaseCodec
10 | from codec.flatten import flatten, expand
11 | from util import df_util
12 | from util.param_util import convert_params
13 | from cexc import get_logger
14 |
15 | class IsolationForest(ClustererMixin, BaseAlgo):
16 | """
17 | This is the implementation wrapper around Isolation Forest from scikit-learn. It inherits methods from ClustererMixin and BaseAlgo.
18 | """
19 | def __init__(self,options):
20 | self.handle_options(options)
21 |         out_params = convert_params(
22 |             options.get('params', {}),
23 |             ints=['n_estimators', 'n_jobs', 'random_state', 'verbose'],
24 |             floats=['max_samples', 'contamination', 'max_features'],
25 |             bools=['bootstrap', 'anomaly_score'],
26 |         )
27 |         self.return_scores = out_params.pop('anomaly_score', True)  # accepted but not currently used by apply()
28 |
29 | # whitelist n_estimators > 0
30 | if 'n_estimators' in out_params and out_params['n_estimators']<=0:
31 | msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".'
32 | raise RuntimeError(msg.format(out_params['n_estimators']))
33 |
34 |         # whitelist max_samples in (0.0, 1.0]
35 |         if 'max_samples' in out_params and not (0.0 < out_params['max_samples'] <= 1.0):
36 |             msg = 'Invalid value error: max_samples must be a float in (0.0, 1.0], but found max_samples="{}".'
37 |             raise RuntimeError(msg.format(out_params['max_samples']))
38 |
39 | # whitelist contamination should be in (0.0, 0.5] as error raised by sklearn for values out of range
40 | if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5):
41 | msg = (
42 | 'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
43 | 'but found contamination="{}".'
44 | )
45 | raise RuntimeError(msg.format(out_params['contamination']))
46 |
47 |         # whitelist max_features in (0.0, 1.0]
48 |         if 'max_features' in out_params and not (0.0 < out_params['max_features'] <= 1.0):
49 |             msg = 'Invalid value error: max_features must be a float in (0.0, 1.0], but found max_features="{}".'
50 |             raise RuntimeError(msg.format(out_params['max_features']))
51 |
52 |
53 | self.estimator = _IsolationForest(**out_params)
54 |
55 |
56 | def apply(self, df, options):
57 | # Make a copy of data, to not alter original dataframe
58 | logger = get_logger('IsolationForest Logger')
59 | X = df.copy()
60 |
61 | X, nans, _ = df_util.prepare_features(
62 | X=X,
63 | variables=self.feature_variables,
64 | final_columns=self.columns,
65 | mlspl_limits=options.get('mlspl_limits'),
66 | )
67 |
68 |         # Multiply the result by -1 to represent outliers with 1 and inliers/normal points with -1.
69 |         y_hat = self.estimator.predict(X.values)*-1
70 |         # Log the percentage of points predicted as outliers
71 |         outlier_pct = "Outlier percentage: {}".format(str(round((list(y_hat).count(1)*100)/y_hat.shape[0], 2)))
72 |         logger.debug(outlier_pct)
73 |
74 | y_hat = y_hat.astype('str')
75 |
76 | #Assign output_name
77 | default_name = 'isOutlier'
78 | new_name = options.get('output_name', None)
79 | output_name = self.rename_output(default_names=default_name, new_names=new_name)
80 |
81 | # Create output dataframe
82 | output = df_util.create_output_dataframe(
83 | y_hat=y_hat, nans=nans, output_names=output_name
84 | )
85 | # Merge with original dataframe
86 | output = df_util.merge_predictions(df, output)
87 | return output
88 |
89 | def rename_output(self, default_names, new_names=None):
90 | """Utility hook to rename output.
91 |
92 | The default behavior is to take the default_names passed in and simply
93 | return them. If however a particular algo needs to rename the columns of
94 | the output, this method can be overridden.
95 | """
96 | return new_names if new_names is not None else default_names
97 |
98 |
99 | @staticmethod
100 | def register_codecs():
101 | from codec.codecs import SimpleObjectCodec, TreeCodec
102 |         codecs_manager.add_codec('algos_contrib.IsolationForest', 'IsolationForest', SimpleObjectCodec)
103 | codecs_manager.add_codec('sklearn.ensemble.iforest', 'IsolationForest', SimpleObjectCodec)
104 | codecs_manager.add_codec('sklearn.tree.tree','ExtraTreeRegressor', ExtraTreeRegressorCodec)
105 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec)
106 |
107 |
108 | class ExtraTreeRegressorCodec(BaseCodec):
109 | """
110 | This is an ExtraTreeRegressor Codec for saving the Isolation Forest base estimator to memory/file.
111 | """
112 | @classmethod
113 | def encode(cls, obj):
114 | import sklearn.tree
115 | assert type(obj) == sklearn.tree.tree.ExtraTreeRegressor
116 | state = obj.__getstate__()
117 | return {
118 | '__mlspl_type': [type(obj).__module__, type(obj).__name__],
119 | 'state': state
120 | }
121 |
122 | @classmethod
123 | def decode(cls,obj):
124 | from sklearn.tree.tree import ExtraTreeRegressor
125 | state = obj['state']
126 | t = ExtraTreeRegressor.__new__(ExtraTreeRegressor)
127 | t.__setstate__(state)
128 | return t
--------------------------------------------------------------------------------
/src/bin/test/contrib_util.py:
--------------------------------------------------------------------------------
1 | """ Utility methods for use in testing."""
2 | import ConfigParser
3 | import json
4 | import os
5 | from inspect import getargspec
6 |
7 | import pandas as pd
8 |
9 | from base import BaseAlgo
10 | from codec import MLSPLDecoder, MLSPLEncoder
11 |
12 |
13 | PACKAGE_NAME='algos_contrib'
14 |
15 |
16 | class AlgoTestUtils(object):
17 | """
18 | Helper methods for testing algorithm implementations
19 | """
20 | @staticmethod
21 | def assert_method_signature(algo_cls, method_name, args):
22 | """
23 | Assert the signature of the specified method
24 |
25 | Args:
26 | algo_cls (class): a custom algorithm class to check
27 | method_name (str): the name of the method
28 | args (list): expected arguments to the named method
29 |
30 | Returns:
31 |             None; an AssertionError is raised if the check fails.
32 |
33 | Raises:
34 | AssertionError
35 | """
36 | method = getattr(algo_cls, method_name, None)
37 | assert method, "Method '{}' does not exist".format(method_name)
38 | assert callable(method), "Method '{}' is not callable".format(method_name)
39 | found_args = getargspec(method).args
40 |         msg = 'Method {} has signature: {} - but should have {}'.format(method, found_args, args)
41 | assert found_args == args, msg
42 |
43 | @classmethod
44 | def assert_registered(cls, algo_cls):
45 | """
46 | Assert that the algorithm is registered in the algos.conf configuration file.
47 |
48 | Args:
49 | algo_cls (class): a custom algorithm class to check
50 |
51 | Returns:
52 |             None; an AssertionError is raised if the algorithm is not registered.
53 |
54 | Raises:
55 | AssertionError
56 | """
57 | config = ConfigParser.RawConfigParser()
58 | with cls.get_algos_conf_fp() as f:
59 | config.readfp(f)
60 | algo_name = algo_cls.__name__
61 | try:
62 | package_name = config.get(algo_name, 'package')
63 | except ConfigParser.NoSectionError:
64 | assert False, "'{}' not registered in algos.conf".format(algo_name)
65 | except ConfigParser.NoOptionError:
66 | assert False, "'{}' must override 'package' option in algos.conf".format(algo_name)
67 |
68 | assert package_name == PACKAGE_NAME, "The package name must be '{}'".format(PACKAGE_NAME)
69 |
70 | @staticmethod
71 | def assert_serializable(algo_cls, input_df, options):
72 | """
73 | Assert that the model created by the algorithm is serializable.
74 |
75 | Args:
76 | algo_cls (class): a custom algorithm class to check
77 | input_df (pandas Dataframe): input dataframe for the algorithm being tested
78 | options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm
79 |
80 | Returns:
81 |             None; an AssertionError is raised if the model cannot be serialized and restored.
82 |
83 | Raises:
84 | AssertionError
85 | """
86 | assert hasattr(algo_cls, 'register_codecs')
87 | algo_cls.register_codecs()
88 |
89 | algo_inst = algo_cls(options)
90 | algo_inst.feature_variables = ['b', 'c']
91 | algo_inst.target_variable = 'a'
92 | algo_inst.fit(input_df.copy(), options)
93 |
94 | encoded = json.dumps(algo_inst, cls=MLSPLEncoder)
95 | decoded = json.loads(encoded, cls=MLSPLDecoder)
96 |
97 | orig_y = algo_inst.apply(input_df.copy(), options)
98 | decoded_y = decoded.apply(input_df.copy(), options)
99 | pd.util.testing.assert_frame_equal(orig_y, decoded_y)
100 |
101 | @classmethod
102 | def assert_base_algo_method_signatures(cls, algo_cls, required_methods=None):
103 | """
104 | Assert that the signatures of algorithm's methods adhere to the API.
105 |
106 | Args:
107 | algo_cls (class): a custom algorithm class to check.
108 | required_methods (list): list of required method names.
109 | '__init__' and 'fit' are always required, so
110 | they do not need to be included.
111 |
112 |
113 | Returns:
114 |             None; an AssertionError is raised if any method does not adhere to the API.
115 |
116 | Raises:
117 | AssertionError
118 | """
119 | method_args_map = {
120 | '__init__': ['self', 'options'],
121 | 'fit': ['self', 'df', 'options'],
122 | 'partial_fit': ['self', 'df', 'options'],
123 | 'apply': ['self', 'df', 'options'],
124 | 'summary': ['self', 'options'],
125 | 'register_codecs': [],
126 | }
127 |
128 | if required_methods is None:
129 | required_methods = []
130 |
131 | assert issubclass(algo_cls, BaseAlgo), 'Algorithms must inherit from BaseAlgo.'
132 |
133 | required_method_set = set(required_methods)
134 | extra_methods = required_method_set - method_args_map.viewkeys()
135 | assert extra_methods == set(), "'{}' not in BaseAlgo".format(", ".join(extra_methods))
136 |
137 | # __init__ and fit are always required.
138 | required_method_set.add('__init__')
139 | required_method_set.add('fit')
140 |
141 | for required_method in required_method_set:
142 | cls.assert_method_signature(algo_cls, required_method, method_args_map[required_method])
143 |
144 | @classmethod
145 | def assert_algo_basic(cls, algo_cls, required_methods=None, input_df=None, options=None, serializable=True):
146 | """
147 | Assert signatures of methods, registration, and serialization
148 |
149 | Args:
150 |         algo_cls (class): a custom algorithm class to check.
151 |         required_methods (list): list of required method names, passed to assert_base_algo_method_signatures.
152 |         input_df (pandas Dataframe): input dataframe for the algorithm being tested
153 |         options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm
154 |         serializable (bool): whether to check serializability or not.
154 |
155 | Returns:
156 |             None; an AssertionError is raised if any check fails.
157 |
158 | Raises:
159 | AssertionError
160 | """
161 | cls.assert_base_algo_method_signatures(algo_cls, required_methods)
162 | cls.assert_registered(algo_cls)
163 | if serializable:
164 | # The input and options are required for serializability test.
165 | assert input_df is not None
166 | assert options is not None
167 | cls.assert_serializable(algo_cls, input_df, options)
168 |
169 | @staticmethod
170 | def get_algos_conf_fp():
171 | """
172 | Get a reference (pointer) to algos.conf file open for read
173 |
174 | This method mainly exists to aid testing.
175 |
176 | Returns:
177 | (File): algos.conf file pointer
178 | """
179 | algos_file_path = os.path.join(os.path.dirname(__file__), '..', '..', 'default', 'algos.conf')
180 | return open(algos_file_path)
181 |
182 |
183 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mltk-algo-contrib
2 |
3 | This repo contains custom algorithms for use with the [Splunk Machine Learning Toolkit](https://splunkbase.splunk.com/app/2890/). The repo itself is also a Splunk app.
4 | Custom algorithms can be added to the Splunk Machine Learning toolkit by adhering to the [ML-SPL API](http://docs.splunk.com/Documentation/MLApp/latest/API/Introduction).
5 | The API is a thin wrapper around machine learning estimators provided by libraries such as:
6 | * [scikit-learn](http://scikit-learn.org)
7 | * [statsmodels](http://www.statsmodels.org/)
8 | * [scipy](https://www.scipy.org)
9 |
10 | and custom algorithms.
11 |
12 | Note that this repo is a collection of custom *algorithms* only, and not any libraries. Any libraries required
13 | should only be added to live environments manually and not to this repo.
14 |
15 | A comprehensive guide to using the ML-SPL API can be found [here](http://docs.splunk.com/Documentation/MLApp/latest/API/Introduction).
16 |
17 | A very simple example:
18 |
19 | ```python
20 | from base import BaseAlgo
21 |
22 |
23 | class CustomAlgorithm(BaseAlgo):
24 | def __init__(self, options):
25 | # Option checking & initializations here
26 | pass
27 |
28 | def fit(self, df, options):
29 | # Fit an estimator to df, a pandas DataFrame of the search results
30 | pass
31 |
32 | def partial_fit(self, df, options):
33 | # Incrementally fit a model
34 | pass
35 |
36 | def apply(self, df, options):
37 | # Apply a saved model
38 | # Modify df, a pandas DataFrame of the search results
39 | return df
40 |
41 | @staticmethod
42 | def register_codecs():
43 | # Add codecs to the codec manager
44 | pass
45 |
46 | ```
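Once registered in `algos.conf`, an algorithm like this is invoked from SPL with
the MLTK `fit` and `apply` commands, e.g. (a sketch; the algorithm, field, and
model names are placeholders):

```
... | fit CustomAlgorithm feature_1 feature_2 into my_model
... | apply my_model
```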
47 |
48 | # Dependencies
49 |
50 | To use the custom algorithms contained in this app, you must also have installed:
51 |
52 | - [Splunk Machine Learning Toolkit](https://splunkbase.splunk.com/app/2890/)
53 | - Python for Scientific Computing Add-on
54 | - [Linux64](https://splunkbase.splunk.com/app/2882/)
55 | - [Linux32](https://splunkbase.splunk.com/app/2884/)
56 | - [Windows64](https://splunkbase.splunk.com/app/2883/)
57 | - [macOS](https://splunkbase.splunk.com/app/2881/)
58 |
59 | # Usage
60 | This repository contains public contributions. Splunk does not guarantee
61 | the correctness or validity of the algorithms, and is in no way responsible for vetting
62 | the contents of contributed algorithms.
63 |
64 | # Deploying
65 |
66 | To use the custom algorithms in this repository, you must deploy them as a Splunk app.
67 |
68 | There are two ways to do this.
69 |
70 | ### Manual copying
71 |
72 | You can simply copy the following directories under src:
73 | * bin
74 | * default
75 | * metadata
76 |
77 | to:
78 | * ${SPLUNK_HOME}/etc/apps/SA_mltk_contrib_app (you will need to create the directory first)
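For example (a minimal sketch; adjust `SPLUNK_HOME` for your installation):

```bash
mkdir -p ${SPLUNK_HOME}/etc/apps/SA_mltk_contrib_app
cp -R src/bin src/default src/metadata ${SPLUNK_HOME}/etc/apps/SA_mltk_contrib_app/
```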
79 |
80 | OR
81 |
82 | ### Build and install
83 |
84 | #### 1. Build the app:
85 |
86 | You will need to install tox. See [Test Prerequisites](#prereq)
87 |
88 | ```bash
89 | tox -e package-macos # if on Mac
90 | tox -e package-linux # if on Linux
91 | ```
92 |
93 | * The resulting gzipped tarball will be in the `target` directory (e.g. target/SA_mltk_contrib_app.tgz).
94 | * The location of the gzipped tarball can be overridden by `BUILD_DIR` environment variable.
95 | * The default app name will be `SA_mltk_contrib_app`, but this can be overridden by the `APP_NAME` environment variable.
96 |
97 | * **NOTE**: You can run `tox -e clean` to remove the `target` directory.
98 |
99 | #### 2. Install the tarball:
100 |
101 | * You can do one of the following with the tarball from step 1:
102 |     * Manually untar it in the `${SPLUNK_HOME}/etc/apps` directory (see the sketch below)
103 | * Install it using the GUI:
104 | * https://docs.splunk.com/Documentation/AddOns/released/Overview/Singleserverinstall
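For the manual option, something like the following (a sketch assuming the default
tarball location and app name from step 1):

```bash
tar -xzf target/SA_mltk_contrib_app.tgz -C ${SPLUNK_HOME}/etc/apps
${SPLUNK_HOME}/bin/splunk restart
```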
105 |
106 | # Contributing
107 |
108 | This repository was specifically made for your contributions! See [Contributing](https://github.com/splunk/mltk-algo-contrib/blob/master/CONTRIBUTING.md) for more details.
109 |
110 | ## Developing
111 |
112 | To start developing, you will need to have Splunk installed. If you don't, read more [here](http://docs.splunk.com/Documentation/Splunk/latest/Installation/InstallonLinux).
113 |
114 | 1. clone the repo and cd into the directory:
115 |
116 | ```bash
117 | git clone https://github.com/splunk/mltk-algo-contrib.git
118 | cd mltk-algo-contrib
119 | ```
120 |
121 | 2. symlink the `src` directory to the apps folder in Splunk and restart splunkd:
122 |
123 | ```bash
124 | ln -s "$(pwd)/src" $SPLUNK_HOME/etc/apps/SA_mltk_contrib_app
125 | $SPLUNK_HOME/bin/splunk restart
126 | ```
127 | * _This will eliminate the need to deploy the app to test changes._
128 |
129 | 3. Add your new algorithm(s) to `src/bin/algos_contrib`.
130 | (See SVR.py for an example.)
131 |
132 | 4. Add a new stanza to `src/default/algos.conf`
133 |
134 | ```bash
135 | [<AlgorithmClassName>]
136 | package=algos_contrib
137 | ```
138 |
139 | * **NOTE**: Due to the way configuration file layering works in Splunk,
140 | the package name must be overridden in each section, and not
141 | in the _default_ section.
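For example, a hypothetical algorithm class `MyAlgo` defined in
`src/bin/algos_contrib/MyAlgo.py` would get the stanza:

```bash
[MyAlgo]
package=algos_contrib
```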
142 |
143 | 5. Add your tests to `src/bin/algos_contrib/tests/test_<your_algo>.py`
144 |    (See test_svr.py for an example.)
145 |
146 | ## Running Tests
147 |
148 |
149 | ### Prerequisites
150 |
151 | 1. Install *tox*:
152 | * http://tox.readthedocs.io/en/latest/install.html
153 | ```bash
154 | pip install tox
155 | ```
156 |
157 | 2. Install *tox-pip-extensions*:
158 | * https://github.com/tox-dev/tox-pip-extensions
159 | ```bash
160 | pip install tox-pip-extensions
161 | ```
162 |    * **NOTE**: You only need this if you do not want to
163 |      recreate the virtualenv(s) manually with `tox -r`
164 |      every time you update a requirements*.txt file;
165 |      installing it is recommended for convenience.
166 |
167 | 3. You must also have the following environment variable set to your
168 | Splunk installation directory (e.g. /opt/splunk):
169 | * SPLUNK_HOME
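For example:

```bash
export SPLUNK_HOME=/opt/splunk
```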
170 |
171 | ### Using tox
172 |
173 | To run all tests, run the following command in the root source directory:
174 |
175 | ```bash
176 | tox
177 | ```
178 |
179 | To run a single test, you can provide the directory or a file as a parameter:
180 |
181 | ```bash
182 | tox src/bin/algos_contrib/tests/
183 | tox src/bin/algos_contrib/tests/test_example_algo.py
184 | ...
185 | ```
186 |
187 | Any arguments passed to *tox* are forwarded to the *pytest* command.
188 | To pass in options, use double dashes (--):
189 |
190 | ```bash
191 | tox -- -k "example" # Run tests that match the keyword 'example'
192 | tox -- -x # Stop after the first failure
193 | tox -- -s # Show stdout/stderr (i.e. disable capturing)
194 | ...
195 | ```
196 |
197 | ### Using Python REPL (Interactive Interpreter)
198 |
199 | ```python
200 | $ python # from src/bin directory
201 | >>> # Add the MLTK to our sys.path
202 | >>> from link_mltk import add_mltk
203 | >>> add_mltk()
204 | >>>
205 | >>> # Import our algorithm class
206 | >>> from algos_contrib.ExampleAlgo import ExampleAlgo
207 | ... (some warning from Splunk may show up)
208 | >>>
209 | >>> # Use utilities to catch common mistakes
210 | >>> from test.contrib_util import AlgoTestUtils
211 | >>> AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False)
212 | ```
213 |
214 | ### Package/File Naming
215 |
216 | Files and packages under the _test_ directory should avoid names
217 | that conflict with files or directories directly under:
218 | ```bash
219 | $SPLUNK_HOME/etc/apps/Splunk_ML_Toolkit/bin
220 | ```
221 |
222 | ## Pull requests
223 |
224 | Once you've finished what you're adding, make a pull request.
225 |
226 | ## Bugs? Issues?
227 |
228 | Please file issues with any information that might be needed to:
229 | - reproduce what you're experiencing
230 | - understand the problem fully
231 |
232 | # License
233 |
234 | The algorithms hosted, as well as the app itself, are licensed under the permissive Apache 2.0 license.
235 |
236 | **Any additions to this repository must be under one of these licenses:**
237 | - MIT
238 | - BSD
239 | - Apache 2.0
240 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------