├── src ├── bin │ ├── test │ │ ├── __init__.py │ │ ├── test_contrib_util.py │ │ └── contrib_util.py │ ├── algos_contrib │ │ ├── __init__.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_mds.py │ │ │ ├── test_tf_binary.py │ │ │ ├── test_savgol_filter.py │ │ │ ├── test_example_algo.py │ │ │ ├── test_correlation_matrix.py │ │ │ ├── test_collaborativefilter.py │ │ │ ├── test_agglomerative_clustering.py │ │ │ ├── test_nmf.py │ │ │ ├── test_min_max_scaler.py │ │ │ ├── test_truncated_svd.py │ │ │ ├── test_linear_svc.py │ │ │ ├── test_latent_dirichlet_allocation.py │ │ │ ├── test_CustomDecisionTreeClassifier.py │ │ │ ├── test_extra_trees_classifier.py │ │ │ ├── test_orthogonal_matching_pursuit.py │ │ │ ├── test_IsolationForest.py │ │ │ ├── test_tsne.py │ │ │ └── test_svr.py │ │ ├── ExampleAlgo.py │ │ ├── SVR.py │ │ ├── LinearSVC.py │ │ ├── NMF.py │ │ ├── TruncatedSVD.py │ │ ├── AdaBoostRegressor.py │ │ ├── ExtraTreesRegressor.py │ │ ├── SavgolFilter.py │ │ ├── BaggingRegressor.py │ │ ├── QuantileTransformer.py │ │ ├── LatentDirichletAllocation.py │ │ ├── OrthogonalMatchingPursuit.py │ │ ├── ExtraTreesClassifier.py │ │ ├── CorrelationMatrix.py │ │ ├── MinMaxScaler.py │ │ ├── MDS.py │ │ ├── TSNE.py │ │ ├── CustomDecisionTreeClassifier.py │ │ ├── AgglomerativeClustering.py │ │ ├── CollaborativeFilter.py │ │ ├── TFBinary.py │ │ └── IsolationForest.py │ ├── README.md │ ├── test.py │ └── link_mltk.py ├── default │ ├── data │ │ └── ui │ │ │ ├── views │ │ │ └── README.md │ │ │ └── nav │ │ │ └── default.xml │ ├── app.conf │ └── algos.conf └── metadata │ └── default.meta ├── .gitignore ├── requirements_1.2.txt ├── tox.ini ├── CONTRIBUTING.md ├── README.md └── LICENSE /src/bin/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bin/README.md: -------------------------------------------------------------------------------- 1 | This is where you put any scripts you want to add to this app. 
2 | -------------------------------------------------------------------------------- /src/default/data/ui/views/README.md: -------------------------------------------------------------------------------- 1 | Add all the views that your app needs in this directory 2 | -------------------------------------------------------------------------------- /src/bin/test.py: -------------------------------------------------------------------------------- 1 | from link_mltk import add_mltk 2 | add_mltk() 3 | 4 | from test.util import check_signatures 5 | 6 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_mds.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.MDS import MDS 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | 5 | def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(MDS, serializable=False) 7 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/ExampleAlgo.py: -------------------------------------------------------------------------------- 1 | from base import BaseAlgo 2 | 3 | 4 | class ExampleAlgo(BaseAlgo): 5 | def __init__(self, options): 6 | pass 7 | 8 | def fit(self, df, options): 9 | return df 10 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_tf_binary.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.TFBinary import TFBinary 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | 5 | def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(TFBinary, serializable=False) 7 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_savgol_filter.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.SavgolFilter import SavgolFilter 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | 5 | def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(SavgolFilter, serializable=False) 7 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_example_algo.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.ExampleAlgo import ExampleAlgo 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | 5 | def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False) 7 | 8 | -------------------------------------------------------------------------------- /src/default/data/ui/nav/default.xml: -------------------------------------------------------------------------------- 1 | 8 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_correlation_matrix.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.CorrelationMatrix import CorrelationMatrix 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | 5 | def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(CorrelationMatrix, serializable=False) 7 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_collaborativefilter.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.CollaborativeFilter import CollaborativeFilter 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | 5 | 
def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(CollaborativeFilter, serializable=False) 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | 4 | # macOS 5 | .DS_Store 6 | 7 | # Editors 8 | *.swp 9 | *.swo 10 | 11 | # Python stuff 12 | *.egg-info 13 | .tox 14 | **/.cache 15 | **/.pytest_cache 16 | **/*.pyc 17 | 18 | # IntelliJ 19 | **/.idea 20 | 21 | target 22 | -------------------------------------------------------------------------------- /requirements_1.2.txt: -------------------------------------------------------------------------------- 1 | attrs==17.4.0 2 | funcsigs==1.0.2 3 | mock==2.0.0 4 | more-itertools==4.1.0 5 | numpy==1.10.4 6 | pandas==0.17.1 7 | pluggy==0.6.0 8 | psutil==3.4.2 9 | py==1.5.3 10 | pytest==3.5.0 11 | scikit-learn==0.17 12 | scipy==0.17.0 13 | six==1.11.0 14 | statsmodels==0.6.1 15 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_agglomerative_clustering.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.AgglomerativeClustering import AgglomerativeClustering 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | 5 | def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(AgglomerativeClustering, serializable=False) 7 | -------------------------------------------------------------------------------- /src/default/app.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Splunk app configuration file 3 | # 4 | 5 | [install] 6 | is_configured = 1 7 | 8 | [ui] 9 | is_visible = false 10 | label = mltk-algo-contrib 11 | 12 | [launcher] 13 | author = github.com/splunk/mltk-algo-contrib 14 | description = 15 | version = 1.0 16 | 17 | -------------------------------------------------------------------------------- /src/metadata/default.meta: -------------------------------------------------------------------------------- 1 | 2 | # Application-level permissions 3 | 4 | [] 5 | access = read : [ * ], write : [ admin, power ] 6 | 7 | ### EVENT TYPES 8 | 9 | [eventtypes] 10 | export = system 11 | 12 | 13 | ### PROPS 14 | 15 | [props] 16 | export = system 17 | 18 | 19 | ### TRANSFORMS 20 | 21 | [transforms] 22 | export = system 23 | 24 | 25 | ### LOOKUPS 26 | 27 | [lookups] 28 | export = system 29 | 30 | 31 | ### VIEWSTATES: even normal users should be able to create shared viewstates 32 | 33 | [viewstates] 34 | access = read : [ * ], write : [ * ] 35 | export = system 36 | 37 | 38 | [algos] 39 | export = system 40 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_nmf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.NMF import NMF 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | 6 | def test_algo(): 7 | input_df = pd.DataFrame({ 8 | 'a': [1, 2, 3], 9 | 'b': [4, 5, 6], 10 | 'c': ['a', 'b', 'c'], 11 | }) 12 | options = { 13 | 'feature_variables': ['a', 'b', 'c'], 14 | } 15 | required_methods = ( 16 | '__init__', 17 | 'fit', 18 | 'partial_fit', 19 | 'apply', 20 | 'summary', 21 | 'register_codecs', 22 | ) 23 | AlgoTestUtils.assert_algo_basic(NMF, required_methods, input_df, options) 24 | -------------------------------------------------------------------------------- 
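All of the tests in this listing delegate to AlgoTestUtils.assert_algo_basic from src/bin/test/contrib_util.py, whose source is not included above. A rough sketch of the checks such a helper plausibly performs (hypothetical reconstruction, not the repo's actual implementation):

class AlgoTestUtils(object):
    """Hypothetical sketch; see src/bin/test/contrib_util.py for the real helper."""
    DEFAULT_METHODS = ('__init__', 'fit', 'register_codecs')

    @staticmethod
    def assert_algo_basic(algo_cls, required_methods=None, input_df=None,
                          options=None, serializable=True):
        # Every algorithm must expose the expected MLTK entry points.
        for name in (required_methods or AlgoTestUtils.DEFAULT_METHODS):
            assert callable(getattr(algo_cls, name, None)), \
                '{} is missing method: {}'.format(algo_cls.__name__, name)
        # When sample data is supplied, smoke-test the constructor and fit().
        if input_df is not None and options is not None:
            algo = algo_cls(options)
            algo.feature_variables = options.get('feature_variables', [])
            algo.fit(input_df.copy(), options)
        # Serializable algorithms must be able to register their codecs.
        if serializable:
            algo_cls.register_codecs()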
/src/bin/algos_contrib/tests/test_min_max_scaler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.MinMaxScaler import MinMaxScaler 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | 6 | def test_algo(): 7 | input_df = pd.DataFrame({ 8 | 'a': [1, 2, 3], 9 | 'b': [4, 5, 6], 10 | 'c': ['a', 'b', 'c'], 11 | }) 12 | options = { 13 | 'feature_variables': ['a', 'b', 'c'], 14 | } 15 | required_methods = ( 16 | '__init__', 17 | 'fit', 18 | 'partial_fit', 19 | 'apply', 20 | 'summary', 21 | 'register_codecs', 22 | ) 23 | AlgoTestUtils.assert_algo_basic(MinMaxScaler, required_methods, input_df, options) 24 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_truncated_svd.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.TruncatedSVD import TruncatedSVD 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | 6 | def test_algo(): 7 | input_df = pd.DataFrame({ 8 | 'a': [1, 2, 3], 9 | 'b': [4, 5, 6], 10 | 'c': ['a', 'b', 'c'], 11 | }) 12 | options = { 13 | 'feature_variables': ['a', 'b', 'c'], 14 | } 15 | required_methods = ( 16 | '__init__', 17 | 'fit', 18 | 'partial_fit', 19 | 'apply', 20 | 'summary', 21 | 'register_codecs', 22 | ) 23 | AlgoTestUtils.assert_algo_basic(TruncatedSVD, required_methods, input_df, options) 24 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_linear_svc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.LinearSVC import LinearSVC 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | 6 | 7 | 8 | def test_algo(): 9 | input_df = pd.DataFrame({ 10 | 'a': [1, 2, 3], 11 | 'b': [4, 5, 6], 12 | 'c': ['a', 'b', 'c'], 13 | }) 14 | options = { 15 | 'target_variable': ['a'], 16 | 'feature_variables': ['b', 'c'], 17 | } 18 | required_methods = ( 19 | '__init__', 20 | 'fit', 21 | 'partial_fit', 22 | 'apply', 23 | 'summary', 24 | 'register_codecs', 25 | ) 26 | AlgoTestUtils.assert_algo_basic(LinearSVC, required_methods , input_df, options) 27 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_latent_dirichlet_allocation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.LatentDirichletAllocation import LatentDirichletAllocation 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | 6 | def test_algo(): 7 | input_df = pd.DataFrame({ 8 | 'a': [1, 2, 3], 9 | 'b': [4, 5, 6], 10 | 'c': ['a', 'b', 'c'], 11 | }) 12 | options = { 13 | 'feature_variables': ['b', 'c'], 14 | } 15 | required_methods = ( 16 | '__init__', 17 | 'fit', 18 | 'partial_fit', 19 | 'apply', 20 | 'summary', 21 | 'register_codecs', 22 | ) 23 | AlgoTestUtils.assert_algo_basic(LatentDirichletAllocation, required_methods, input_df, options) 24 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_CustomDecisionTreeClassifier.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.CustomDecisionTreeClassifier import CustomDecisionTreeClassifier 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | def test_algo(): 6 | input_df = pd.DataFrame({ 7 | 'a': [1, 2, 3], 8 | 'b': 
[4, 5, 6], 9 | 'c': ['a', 'b', 'c'], 10 | }) 11 | options = { 12 | 'target_variable': ['a'], 13 | 'feature_variables': ['b', 'c'], 14 | } 15 | required_methods = ( 16 | '__init__', 17 | 'fit', 18 | 'apply', 19 | 'summary', 20 | 'register_codecs', 21 | ) 22 | AlgoTestUtils.assert_algo_basic(CustomDecisionTreeClassifier, required_methods, input_df, options) 23 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_extra_trees_classifier.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.ExtraTreesClassifier import ExtraTreesClassifier 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | 6 | 7 | 8 | def test_algo(): 9 | input_df = pd.DataFrame({ 10 | 'a': [1, 2, 3], 11 | 'b': [4, 5, 6], 12 | 'c': ['a', 'b', 'c'], 13 | }) 14 | options = { 15 | 'target_variable': ['a'], 16 | 'feature_variables': ['b', 'c'], 17 | } 18 | required_methods = ( 19 | '__init__', 20 | 'fit', 21 | 'partial_fit', 22 | 'apply', 23 | 'summary', 24 | 'register_codecs', 25 | ) 26 | AlgoTestUtils.assert_algo_basic(ExtraTreesClassifier, required_methods, input_df, options) --------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_orthogonal_matching_pursuit.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from algos_contrib.OrthogonalMatchingPursuit import OrthogonalMatchingPursuit 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | 6 | 7 | 8 | def test_algo(): 9 | input_df = pd.DataFrame({ 10 | 'a': [1, 2, 3], 11 | 'b': [4, 5, 6], 12 | 'c': ['a', 'b', 'c'], 13 | }) 14 | options = { 15 | 'target_variable': ['a'], 16 | 'feature_variables': ['b', 'c'], 17 | } 18 | required_methods = ( 19 | '__init__', 20 | 'fit', 21 | 'partial_fit', 22 | 'apply', 23 | 'summary', 24 | 'register_codecs', 25 | ) 26 | AlgoTestUtils.assert_algo_basic(OrthogonalMatchingPursuit, required_methods, input_df, options) --------------------------------------------------------------------------------
/src/bin/algos_contrib/SVR.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVR as _SVR 2 | 3 | from base import BaseAlgo, RegressorMixin 4 | from util.param_util import convert_params 5 | 6 | 7 | class SVR(RegressorMixin, BaseAlgo): 8 | 9 | def __init__(self, options): 10 | self.handle_options(options) 11 | 12 | params = options.get('params', {}) 13 | out_params = convert_params( 14 | params, 15 | floats=['C', 'gamma'], 16 | strs=['kernel'], 17 | ints=['degree'], 18 | ) 19 | 20 | self.estimator = _SVR(**out_params) 21 | 22 | @staticmethod 23 | def register_codecs(): 24 | from codec.codecs import SimpleObjectCodec 25 | from codec import codecs_manager 26 | codecs_manager.add_codec('algos_contrib.SVR', 'SVR', SimpleObjectCodec) 27 | codecs_manager.add_codec('sklearn.svm.classes', 'SVR', SimpleObjectCodec) 28 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_IsolationForest.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.IsolationForest import IsolationForest 2 | from test.contrib_util import AlgoTestUtils 3 | import pandas as pd 4 | 5 | def test_algo(): 6 | AlgoTestUtils.assert_algo_basic(IsolationForest, serializable=False) 7 | 8 | def test_algo_options(): 9 | input_df = pd.DataFrame({ 10 |
'a': [5.1, 4.9, 4.7, 4.6], 11 | 'b': [3.5, 3.0, 3.1, 3.2], 12 | 'c': [1.4, 1.4, 1.5, 1.6], 13 | 'd': [0.2, 0.2, 0.2, 0.4], 14 | 'e': ['Iris Setosa','Iris Setosa','Iris Versicolor','Iris Virginica'] 15 | }) 16 | options = { 17 | 'target_variable': [], 18 | 'feature_variables': ['a','b','c','d'], 19 | } 20 | required_methods = ( 21 | '__init__', 22 | 'fit', 23 | 'apply', 24 | 'register_codecs', 25 | ) 26 | AlgoTestUtils.assert_algo_basic(IsolationForest, required_methods=required_methods, input_df=input_df, options=options, serializable=False) --------------------------------------------------------------------------------
/src/bin/algos_contrib/LinearSVC.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn.svm import LinearSVC as _LinearSVC 4 | 5 | from codec import codecs_manager 6 | from base import BaseAlgo, ClassifierMixin 7 | from util.param_util import convert_params 8 | 9 | 10 | class LinearSVC(ClassifierMixin, BaseAlgo): 11 | 12 | def __init__(self, options): 13 | self.handle_options(options) 14 | 15 | out_params = convert_params( 16 | options.get('params', {}), 17 | floats=['C', 'tol', 'intercept_scaling'], 18 | ints=['random_state','max_iter'], 19 | strs=['penalty', 'loss', 'multi_class'], 20 | bools=['dual', 'fit_intercept'], 21 | ) 22 | 23 | self.estimator = _LinearSVC(**out_params) 24 | 25 | @staticmethod 26 | def register_codecs(): 27 | from codec.codecs import SimpleObjectCodec 28 | codecs_manager.add_codec('algos_contrib.LinearSVC', 'LinearSVC', SimpleObjectCodec) 29 | codecs_manager.add_codec('sklearn.svm.classes', 'LinearSVC', SimpleObjectCodec) 30 | --------------------------------------------------------------------------------
/src/bin/link_mltk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ Small utility to add the MLTK bin path to the system path. 3 | This makes it easy to import algorithms or utilities from the MLTK.""" 4 | import os 5 | import sys 6 | 7 | 8 | def check_splunk_home(splunk_home): 9 | """ Check SPLUNK_HOME and raise if not set.""" 10 | if not splunk_home: 11 | raise RuntimeError('No $SPLUNK_HOME provided.
Please set SPLUNK_HOME.') 12 | 13 | 14 | def get_mltk_bin_path(splunk_home): 15 | """ Create the path to the MLTK bin folder.""" 16 | check_splunk_home(splunk_home) 17 | mltk_path = os.path.join(splunk_home, 'etc', 'apps', 'Splunk_ML_Toolkit', 'bin') 18 | 19 | if not os.path.exists(mltk_path): 20 | raise RuntimeError('MLTK bin folder not found at {}: is MLTK installed?'.format(mltk_path)) 21 | 22 | return mltk_path 23 | 24 | 25 | def add_mltk(): 26 | """ Adds MLTK bin path to sys.path """ 27 | splunk_home = os.environ.get('SPLUNK_HOME', None) 28 | mltk_bin_path = get_mltk_bin_path(splunk_home) 29 | sys.path.insert(0, mltk_bin_path) 30 | --------------------------------------------------------------------------------
/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | skipsdist = True 4 | skip_install = True 5 | tox_pip_extensions_ext_venv_update = true 6 | 7 | [testenv] 8 | passenv = 9 | SPLUNK_HOME 10 | setenv = 11 | PYTHONPATH = {env:SPLUNK_HOME}/etc/apps/Splunk_ML_Toolkit/bin 12 | APP_NAME = {env:APP_NAME:SA_mltk_contrib_app} 13 | BUILD_DIR = {toxinidir}/target 14 | deps = -r{toxinidir}/requirements_1.2.txt 15 | commands = pytest {posargs} 16 | 17 | [testenv:package-macos] 18 | platform = darwin 19 | deps = 20 | changedir = {env:BUILD_DIR} 21 | whitelist_externals = /bin/bash 22 | commands = 23 | /bin/bash -c 'tar -C {toxinidir} -s ",^src/,{env:APP_NAME}/," -cvzf {env:APP_NAME}.tgz src/\{bin,default,metadata\}' 24 | 25 | [testenv:package-linux] 26 | platform = linux 27 | deps = 28 | changedir = {env:BUILD_DIR} 29 | whitelist_externals = /bin/bash 30 | commands = 31 | /bin/bash -c 'tar -C {toxinidir} --transform="s,^src/,{env:APP_NAME}/," -cvzf {env:APP_NAME}.tgz src/\{bin,default,metadata\}' 32 | 33 | [testenv:clean] 34 | deps = 35 | whitelist_externals = /bin/rm 36 | commands = 37 | /bin/rm -rf {env:BUILD_DIR} 38 | 39 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/NMF.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import NMF as _NMF 2 | from base import BaseAlgo, TransformerMixin 3 | from codec import codecs_manager 4 | from util.param_util import convert_params 5 | 6 | class NMF(TransformerMixin, BaseAlgo): 7 | 8 | def __init__(self, options): 9 | self.handle_options(options) 10 | out_params = convert_params( 11 | options.get('params', {}), 12 | floats=['beta_loss','tol','alpha','l1_ratio'], 13 | strs=['init','solver'], 14 | ints=['k','max_iter','random_state'], 15 | bools=['verbose','shuffle'], 16 | aliases={'k': 'n_components'} 17 | ) 18 | 19 | self.estimator = _NMF(**out_params) 20 | 21 | def rename_output(self, default_names, new_names): 22 | if new_names is None: 23 | new_names = 'NMF' 24 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] 25 | return output_names 26 | 27 | @staticmethod 28 | def register_codecs(): 29 | from codec.codecs import SimpleObjectCodec 30 | codecs_manager.add_codec('algos_contrib.NMF', 'NMF', SimpleObjectCodec) 31 | codecs_manager.add_codec('sklearn.decomposition.nmf', 'NMF', SimpleObjectCodec) 32 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/TruncatedSVD.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import TruncatedSVD as _TruncatedSVD 2 | from base import BaseAlgo, TransformerMixin 3 | from
codec import codecs_manager 4 | from util.param_util import convert_params 5 | 6 | class TruncatedSVD(TransformerMixin, BaseAlgo): 7 | 8 | def __init__(self, options): 9 | self.handle_options(options) 10 | out_params = convert_params( 11 | options.get('params', {}), 12 | floats=['tol'], 13 | strs=['algorithm'], 14 | ints=['k','n_iter','random_state'], 15 | aliases={'k': 'n_components'} 16 | ) 17 | 18 | self.estimator = _TruncatedSVD(**out_params) 19 | 20 | def rename_output(self, default_names, new_names): 21 | if new_names is None: 22 | new_names = 'SVD' 23 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] 24 | return output_names 25 | 26 | @staticmethod 27 | def register_codecs(): 28 | from codec.codecs import SimpleObjectCodec 29 | codecs_manager.add_codec('algos_contrib.TruncatedSVD', 'TruncatedSVD', SimpleObjectCodec) 30 | codecs_manager.add_codec('sklearn.decomposition.truncated_svd', 'TruncatedSVD', SimpleObjectCodec) 31 | -------------------------------------------------------------------------------- /src/default/algos.conf: -------------------------------------------------------------------------------- 1 | # Here is where algorithms are registered. 2 | [default] 3 | 4 | ######################################################################## 5 | # Due to the layering of configuration files in Splunk, we have to 6 | # override the package name in every section. 7 | ######################################################################## 8 | 9 | 10 | [AgglomerativeClustering] 11 | package=algos_contrib 12 | 13 | [CorrelationMatrix] 14 | package=algos_contrib 15 | 16 | [ExampleAlgo] 17 | package=algos_contrib 18 | 19 | [SVR] 20 | package=algos_contrib 21 | 22 | [SavgolFilter] 23 | package=algos_contrib 24 | 25 | [TSNE] 26 | package=algos_contrib 27 | 28 | [MDS] 29 | package=algos_contrib 30 | 31 | [OrthogonalMatchingPursuit] 32 | package=algos_contrib 33 | 34 | [TruncatedSVD] 35 | package=algos_contrib 36 | 37 | [LatentDirichletAllocation] 38 | package=algos_contrib 39 | 40 | [NMF] 41 | package=algos_contrib 42 | 43 | [CollaborativeFilter] 44 | package=algos_contrib 45 | 46 | [CustomDecisionTreeClassifier] 47 | package=algos_contrib 48 | 49 | [TFBinary] 50 | package = algos_contrib 51 | 52 | [MinMaxScaler] 53 | package = algos_contrib 54 | 55 | [LinearSVC] 56 | package = algos_contrib 57 | 58 | [ExtraTreesClassifier] 59 | package = algos_contrib 60 | 61 | [IsolationForest] 62 | package = algos_contrib -------------------------------------------------------------------------------- /src/bin/algos_contrib/tests/test_tsne.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from algos_contrib.TSNE import TSNE 3 | from test.contrib_util import AlgoTestUtils 4 | 5 | algo_options = {'feature_variables': ['Review']} 6 | 7 | 8 | def test_algo(): 9 | AlgoTestUtils.assert_algo_basic(TSNE, serializable=False) 10 | 11 | 12 | def test_valid_params(): 13 | algo_options['params'] = {'k': '1'} 14 | TSNE_algo = TSNE(algo_options) 15 | assert TSNE_algo.estimator.n_components == 1 16 | 17 | 18 | def test_invalid_params_k_not_int(): 19 | algo_options['params'] = {'k': '0.1'} 20 | with pytest.raises((RuntimeError, ValueError)) as excinfo: 21 | _ = TSNE(algo_options) 22 | assert excinfo.match('Invalid value for k: must be an int') 23 | 24 | 25 | def test_invalid_params_k_not_valid(): 26 | algo_options['params'] = {'k': '0'} 27 | with pytest.raises((RuntimeError, ValueError)) as excinfo: 28 | _ = TSNE(algo_options) 
29 | assert excinfo.match('Invalid value for k: k must be greater than or equal to 1') 30 | 31 | 32 | def test_default_parameter_values(): 33 | algo_options['params'] = {'k': '1'} 34 | TSNE_algo = TSNE(algo_options) 35 | assert TSNE_algo.estimator.n_iter == 200 36 | assert TSNE_algo.estimator.perplexity == 30.0 37 | assert TSNE_algo.estimator.early_exaggeration == 4.0 38 | assert TSNE_algo.estimator.learning_rate == 100 39 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # CONTRIBUTING 2 | 3 | By submitting a Contribution to this Work, You agree that Your Contribution is made subject to the primary LICENSE 4 | file applicable to this Work. In addition, You represent that: (i) You are the copyright owner of the Contribution 5 | or (ii) You have the requisite rights to make the Contribution. 6 | 7 | ## Definitions: 8 | 9 | “You” shall mean: (i) yourself if you are making a Contribution on your own behalf; or (ii) your company, 10 | if you are making a Contribution on behalf of your company. If you are making a Contribution on behalf of your 11 | company, you represent that you have the requisite authority to do so. 12 | 13 | "Contribution" shall mean any original work of authorship, including any modifications or additions to an existing 14 | work, that is intentionally submitted by You for inclusion in, or documentation of, this project/repository. For the 15 | purposes of this definition, "submitted" means any form of electronic, verbal, or written communication submitted for 16 | inclusion in this project/repository, including but not limited to communication on electronic mailing lists, source 17 | code control systems, and issue tracking systems that are managed by, or on behalf of, the maintainers of 18 | the project/repository. 19 | 20 | “Work” shall mean the collective software, content, and documentation in this project/repository. 
21 | 22 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/tests/test_svr.py: -------------------------------------------------------------------------------- 1 | from algos_contrib.SVR import SVR 2 | from test.contrib_util import AlgoTestUtils 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def test_algo_basic(): 9 | input_df = pd.DataFrame({ 10 | 'a': [1, 2, 3], 11 | 'b': [4, 5, 6], 12 | 'c': ['a', 'b', 'c'], 13 | }) 14 | options = { 15 | 'target_variable': ['a'], 16 | 'feature_variables': ['b', 'c'], 17 | } 18 | required_methods = ( 19 | '__init__', 20 | 'fit', 21 | 'partial_fit', 22 | 'apply', 23 | 'summary', 24 | 'register_codecs', 25 | ) 26 | AlgoTestUtils.assert_algo_basic(SVR, required_methods, input_df, options) 27 | 28 | 29 | def test_prediction(): 30 | training_df = pd.DataFrame({ 31 | 'y': [1, 2, 3], 32 | 'x1': [4, 5, 6], 33 | 'x2': [7, 8, 9], 34 | }) 35 | options = { 36 | 'target_variable': ['y'], 37 | 'feature_variables': ['x1', 'x2'], 38 | } 39 | test_df = pd.DataFrame({ 40 | 'x1': [4], 41 | 'x2': [7], 42 | }) 43 | 44 | svr = SVR(options) 45 | svr.feature_variables = options['feature_variables'] 46 | svr.target_variable = options['target_variable'][0] 47 | svr.fit(training_df, options) 48 | output = svr.apply(test_df, options) 49 | np.testing.assert_approx_equal(output['predicted(y)'].values, np.array([1.1])) 50 | 51 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/AdaBoostRegressor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pandas import DataFrame 4 | from sklearn.ensemble import AdaBoostRegressor as _AdaBoostRegressor 5 | 6 | from base import RegressorMixin, BaseAlgo 7 | from util.param_util import convert_params 8 | from util.algo_util import handle_max_features 9 | from codec import codecs_manager 10 | 11 | 12 | class AdaBoostRegressor(RegressorMixin, BaseAlgo): 13 | def __init__(self, options): 14 | self.handle_options(options) 15 | params = options.get('params', {}) 16 | out_params = convert_params( 17 | params, 18 | strs=['loss', 'max_features'], 19 | floats=['learning_rate'], 20 | ints=['n_estimators'], 21 | ) 22 | 23 | self.estimator = _AdaBoostRegressor(**out_params) 24 | 25 | 26 | @staticmethod 27 | def register_codecs(): 28 | from codec.codecs import SimpleObjectCodec, TreeCodec 29 | 30 | codecs_manager.add_codec('algos_contrib.AdaBoostRegressor', 'AdaBoostRegressor', SimpleObjectCodec) 31 | codecs_manager.add_codec('sklearn.ensemble.classes', 'AdaBoostRegressor', SimpleObjectCodec) 32 | codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeRegressor', SimpleObjectCodec) 33 | codecs_manager.add_codec('sklearn.ensemble.weight_boosting', 'AdaBoostRegressor', SimpleObjectCodec) 34 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) 35 | --------------------------------------------------------------------------------
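Unlike most algorithms in this listing, AdaBoostRegressor ships without a companion test under tests/. Following the repo's own pattern (see test_svr.py above), a minimal one could look like this (hypothetical file, e.g. tests/test_ada_boost_regressor.py):

import pandas as pd
from algos_contrib.AdaBoostRegressor import AdaBoostRegressor
from test.contrib_util import AlgoTestUtils


def test_algo():
    input_df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [4, 5, 6],
        'c': ['a', 'b', 'c'],
    })
    options = {
        'target_variable': ['a'],
        'feature_variables': ['b', 'c'],
    }
    AlgoTestUtils.assert_algo_basic(AdaBoostRegressor, input_df=input_df, options=options)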
/src/bin/algos_contrib/ExtraTreesRegressor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pandas import DataFrame 4 | from sklearn.ensemble import ExtraTreesRegressor as _ExtraTreesRegressor 5 | 6 | from base import RegressorMixin, BaseAlgo 7 | from util.param_util import convert_params 8 | from util.algo_util import handle_max_features 9 | from codec import codecs_manager 10 | 11 | 12 | class ExtraTreesRegressor(RegressorMixin, BaseAlgo): 13 | def __init__(self, options): 14 | self.handle_options(options) 15 | params = options.get('params', {}) 16 | out_params = convert_params( 17 | params, 18 | floats=['max_samples', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'min_impurity_split', 'min_impurity_decrease'], 19 | bools=['bootstrap', 'oob_score', 'warm_start'], 20 | ints=['n_estimators', 'max_depth', 'max_leaf_nodes'], 21 | strs=['criterion'], 22 | ) 23 | 24 | self.estimator = _ExtraTreesRegressor(**out_params) 25 | 26 | 27 | @staticmethod 28 | def register_codecs(): 29 | from codec.codecs import SimpleObjectCodec, TreeCodec 30 | 31 | codecs_manager.add_codec('algos_contrib.ExtraTreesRegressor', 'ExtraTreesRegressor', SimpleObjectCodec) 32 | codecs_manager.add_codec('sklearn.ensemble.forest', 'ExtraTreesRegressor', SimpleObjectCodec) 33 | codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeRegressor', SimpleObjectCodec) 34 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) 35 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/SavgolFilter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import savgol_filter 3 | 4 | from base import BaseAlgo 5 | from util.param_util import convert_params 6 | from util import df_util 7 | 8 | 9 | class SavgolFilter(BaseAlgo): 10 | 11 | def __init__(self, options): 12 | # set parameters 13 | params = options.get('params', {}) 14 | out_params = convert_params( 15 | params, 16 | ints=['window_length', 'polyorder', 'deriv'] 17 | ) 18 | 19 | # set defaults for parameters 20 | if 'window_length' in out_params: 21 | self.window_length = out_params['window_length'] 22 | else: 23 | self.window_length = 5 24 | 25 | if 'polyorder' in out_params: 26 | self.polyorder = out_params['polyorder'] 27 | else: 28 | self.polyorder = 2 29 | 30 | if 'deriv' in out_params: 31 | self.deriv = out_params['deriv'] 32 | else: 33 | self.deriv = 0 34 | 35 | def fit(self, df, options): 36 | X = df.copy() 37 | X, nans, columns = df_util.prepare_features(X, self.feature_variables) 38 | 39 | def f(x): 40 | return savgol_filter(x, self.window_length, self.polyorder, self.deriv) 41 | 42 | y_hat = np.apply_along_axis(f, 0, X) 43 | 44 | names = ['SG_%s' % col for col in columns] 45 | output_df = df_util.create_output_dataframe(y_hat, nans, names) 46 | df = df_util.merge_predictions(df, output_df) 47 | 48 | return df --------------------------------------------------------------------------------
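SavgolFilter.fit() smooths every feature column independently via scipy. A self-contained sketch of the same column-wise call, outside the MLTK plumbing:

import numpy as np
from scipy.signal import savgol_filter

X = np.array([[1.0, 10.0],
              [2.0, 12.0],
              [4.0, 11.0],
              [3.0, 15.0],
              [5.0, 14.0]])

# Same call as fit() above: length-5 window, quadratic fit, applied per column (axis 0).
smoothed = np.apply_along_axis(lambda col: savgol_filter(col, 5, 2, 0), 0, X)
print(smoothed.shape)  # (5, 2) -- one smoothed series per input column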
/src/bin/algos_contrib/BaggingRegressor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pandas import DataFrame 4 | from sklearn.ensemble import BaggingRegressor as _BaggingRegressor 5 | 6 | from base import RegressorMixin, BaseAlgo 7 | from util.param_util import convert_params 8 | from util.algo_util import handle_max_features 9 | from codec import codecs_manager 10 | 11 | 12 | class BaggingRegressor(RegressorMixin, BaseAlgo): 13 | def __init__(self, options): 14 | self.handle_options(options) 15 | params = options.get('params', {}) 16 | out_params = convert_params( 17 | params, 18 | floats=['max_samples', 'max_features'], 19 | bools=['bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'], 20 | ints=['n_estimators'], 21 | ) 22 | 23 | self.estimator = _BaggingRegressor(**out_params) 24 | 25 | 26 | @staticmethod 27 | def register_codecs(): 28 | from codec.codecs import SimpleObjectCodec, TreeCodec 29 | 30 | codecs_manager.add_codec('algos_contrib.BaggingRegressor', 'BaggingRegressor', SimpleObjectCodec) 31 | codecs_manager.add_codec('sklearn.ensemble.classes', 'BaggingRegressor', SimpleObjectCodec) 32 | codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeRegressor', SimpleObjectCodec) 33 | codecs_manager.add_codec('sklearn.ensemble.weight_boosting', 'BaggingRegressor', SimpleObjectCodec) 34 | codecs_manager.add_codec('sklearn.ensemble.bagging', 'BaggingRegressor', SimpleObjectCodec) 35 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) 36 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/QuantileTransformer.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | 4 | import pandas as pd 5 | from sklearn.preprocessing import QuantileTransformer as _QuantileTransformer 6 | 7 | from base import BaseAlgo, TransformerMixin 8 | from codec import codecs_manager 9 | from util.param_util import convert_params 10 | from util import df_util 11 | 12 | 13 | class QuantileTransformer(TransformerMixin, BaseAlgo): 14 | 15 | def __init__(self, options): 16 | self.handle_options(options) 17 | 18 | out_params = convert_params( 19 | options.get('params', {}), 20 | bools=['copy'], 21 | ints=['n_quantiles'], 22 | strs=['output_distribution'] 23 | ) 24 | self.estimator = _QuantileTransformer(**out_params) 25 | self.columns = None 26 | 27 | def rename_output(self, default_names, new_names=None): 28 | if new_names is None: 29 | new_names = 'QT' 30 | output_names = [new_names + '_' + feature for feature in self.columns] 31 | return output_names 32 | 33 | def summary(self, options): 34 | if len(options) != 2: # only model name and mlspl_limits 35 | raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__) 36 | return pd.DataFrame({'fields': self.columns}) 37 | 38 | @staticmethod 39 | def register_codecs(): 40 | from codec.codecs import SimpleObjectCodec 41 | codecs_manager.add_codec('algos_contrib.QuantileTransformer', 'QuantileTransformer', SimpleObjectCodec) 42 | codecs_manager.add_codec('sklearn.preprocessing.data', 'QuantileTransformer', SimpleObjectCodec) 43 | --------------------------------------------------------------------------------
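For intuition about the wrapped estimator: QuantileTransformer maps each feature through its empirical CDF, so skewed inputs land on the requested output distribution. A standalone sketch (plain scikit-learn, no MLTK plumbing):

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.lognormal(size=(100, 1))               # heavily right-skewed input
qt = QuantileTransformer(n_quantiles=100, output_distribution='uniform')
Xt = qt.fit_transform(X)
print(Xt.min(), Xt.max())                      # spans ~0.0 to ~1.0 after the mapping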
/src/bin/algos_contrib/LatentDirichletAllocation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Once a newer version of sklearn is used, the k alias will need to change from n_topics to n_components 3 | https://stackoverflow.com/a/48121678 4 | ''' 5 | 6 | from sklearn.decomposition import LatentDirichletAllocation as _LatentDirichletAllocation 7 | from base import BaseAlgo, TransformerMixin 8 | from codec import codecs_manager 9 | from util.param_util import convert_params 10 | 11 | class LatentDirichletAllocation(TransformerMixin, BaseAlgo): 12 | 13 | def __init__(self, options): 14 | self.handle_options(options) 15 | out_params = convert_params( 16 | options.get('params', {}), 17 | floats=['doc_topic_prior','learning_decay','learning_offset','perp_tol','mean_change_tol'], 18 | strs=['learning_method'], 19 | ints=['k','max_iter','batch_size','evaluate_every','total_samples','max_doc_update_iter','n_jobs','verbose','random_state'], 20 | aliases={'k': 'n_topics'} 21 | ) 22 | 23 | self.estimator = _LatentDirichletAllocation(**out_params) 24 | 25 | def rename_output(self, default_names, new_names): 26 | if new_names is None: 27 | new_names = 'LDA' 28 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] 29 | return output_names 30 | 31 | @staticmethod 32 | def register_codecs(): 33 | from codec.codecs import SimpleObjectCodec 34 | codecs_manager.add_codec('algos_contrib.LatentDirichletAllocation', 'LatentDirichletAllocation', SimpleObjectCodec) 35 | codecs_manager.add_codec('sklearn.decomposition.online_lda', 'LatentDirichletAllocation', SimpleObjectCodec) 36 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/OrthogonalMatchingPursuit.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.linear_model import OrthogonalMatchingPursuit as _OrthogonalMatchingPursuit 3 | from base import RegressorMixin, BaseAlgo 4 | from util.param_util import convert_params 5 | from util import df_util 6 | 7 | 8 | class OrthogonalMatchingPursuit(RegressorMixin, BaseAlgo): 9 | def __init__(self, options): 10 | self.handle_options(options) 11 | 12 | params = options.get('params', {}) 13 | out_params = convert_params( 14 | params, 15 | floats=['tol'], 16 | ints=['n_nonzero_coefs'], 17 | bools=['fit_intercept', 'normalize'], 18 | ) 19 | 20 | self.estimator = _OrthogonalMatchingPursuit(**out_params) 21 | 22 | def summary(self, options): 23 | if len(options) != 2: # only model name and mlspl_limits 24 | raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__) 25 | df = pd.DataFrame({'feature': self.columns, 26 | 'coefficient': self.estimator.coef_.ravel()}) 27 | idf = pd.DataFrame({'feature': ['_intercept'], 28 | 'coefficient': [self.estimator.intercept_]}) 29 | return pd.concat([df, idf]) 30 | 31 | @staticmethod 32 | def register_codecs(): 33 | from codec.codecs import SimpleObjectCodec 34 | from codec import codecs_manager 35 | codecs_manager.add_codec('algos_contrib.OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuit', SimpleObjectCodec) 36 | codecs_manager.add_codec('sklearn.linear_model.omp', 'OrthogonalMatchingPursuit', SimpleObjectCodec) 37 | 38 | --------------------------------------------------------------------------------
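OrthogonalMatchingPursuit's summary() above reads coef_ and intercept_ off the fitted estimator; the coefficients are sparse by construction, since at most n_nonzero_coefs of them are non-zero. A standalone sketch:

import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit

X = np.array([[0.0, 1.0], [1.0, 0.0], [2.0, 1.0], [3.0, 0.0]])
y = np.array([0.0, 2.0, 4.0, 6.0])             # depends only on the first column
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=1).fit(X, y)
print(omp.coef_)                               # one non-zero coefficient (first feature)
print(omp.intercept_)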
/src/bin/algos_contrib/ExtraTreesClassifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pandas import DataFrame 4 | from sklearn.ensemble import ExtraTreesClassifier as _ExtraTreesClassifier 5 | 6 | from base import ClassifierMixin, BaseAlgo 7 | from codec import codecs_manager 8 | from util.param_util import convert_params 9 | from util.algo_util import handle_max_features 10 | 11 | 12 | class ExtraTreesClassifier(ClassifierMixin, BaseAlgo): 13 | 14 | def __init__(self, options): 15 | self.handle_options(options) 16 | 17 | out_params = convert_params( 18 | options.get('params', {}), 19 | ints=['random_state', 'n_estimators', 'max_depth', 20 | 'min_samples_split', 'max_leaf_nodes'], 21 | strs=['max_features', 'criterion'], 22 | ) 23 | 24 | if 'max_depth' not in out_params: 25 | out_params.setdefault('max_leaf_nodes', 2000) 26 | 27 | if 'max_features' in out_params: 28 | out_params['max_features'] = handle_max_features(out_params['max_features']) 29 | 30 | self.estimator = _ExtraTreesClassifier(class_weight='balanced', 31 | **out_params) 32 | 33 | def summary(self, options): 34 | if len(options) != 2: # only model name and mlspl_limits 35 | raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__) 36 | df = DataFrame({ 37 | 'feature': self.columns, 38 | 'importance': self.estimator.feature_importances_.ravel() 39 | }) 40 | return df 41 | 42 | @staticmethod 43 | def register_codecs(): 44 | from codec.codecs import SimpleObjectCodec, TreeCodec 45 | codecs_manager.add_codec('algos_contrib.ExtraTreesClassifier', 46 | 'ExtraTreesClassifier', SimpleObjectCodec) 47 | codecs_manager.add_codec('sklearn.ensemble.forest', 48 | 'ExtraTreesClassifier', SimpleObjectCodec) 49 | codecs_manager.add_codec('sklearn.tree.tree', 'ExtraTreeClassifier', 50 | SimpleObjectCodec) 51 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) 52 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/CorrelationMatrix.py: -------------------------------------------------------------------------------- 1 | from base import BaseAlgo 2 | 3 | 4 | class CorrelationMatrix(BaseAlgo): 5 | """Compute and return a correlation matrix.""" 6 | 7 | def __init__(self, options): 8 | """Check for valid correlation type, and save it to an attribute on self.""" 9 | 10 | feature_variables = options.get('feature_variables', {}) 11 | target_variable = options.get('target_variable', {}) 12 | 13 | if len(feature_variables) == 0: 14 | raise RuntimeError('You must supply one or more fields') 15 | 16 | if len(target_variable) > 0: 17 | raise RuntimeError('CorrelationMatrix does not support the from clause') 18 | 19 | valid_methods = ['spearman', 'kendall', 'pearson'] 20 | 21 | # Check to see if parameters exist 22 | params = options.get('params', {}) 23 | 24 | # Check if method is in parameters in search 25 | if 'method' in params: 26 | if params['method'] not in valid_methods: 27 | error_msg = 'Invalid value for method: must be one of {}'.format( 28 | ', '.join(valid_methods)) 29 | raise RuntimeError(error_msg) 30 | 31 | # Assign method to self for later usage 32 | self.method = params['method'] 33 | 34 | # Assign default method and ensure no other parameters are present 35 | else: 36 | # Default method for correlation 37 | self.method = 'pearson' 38 | 39 | # Check for bad parameters 40 | if len(params) > 0: 41 | raise RuntimeError('The only valid parameter is method.') 42 | 43 | def fit(self, df, options): 44 | """Compute the correlations and return a DataFrame.""" 45 | 46 | # df contains all the search results, including hidden fields 47 | # but the requested fields are saved as self.feature_variables 48 | requested_columns = df[self.feature_variables] 49 | 50 | # Get correlations 51 | correlations = requested_columns.corr(method=self.method) 52 | 53 | # Reset index so that all the data are in columns 54 | # (this is necessary for the corr method) 55 | output_df = correlations.reset_index() 56 | 57 | return output_df 58 | --------------------------------------------------------------------------------
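The heavy lifting in CorrelationMatrix.fit() is pandas' DataFrame.corr(); reset_index() then turns the field names into a regular column so Splunk can render them. A standalone sketch:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [2, 4, 6, 8], 'z': [4, 3, 2, 1]})
print(df.corr(method='spearman').reset_index())
# The 'index' column carries the field names, one row per field, as in fit() above.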
/src/bin/algos_contrib/MinMaxScaler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | from sklearn.preprocessing import MinMaxScaler as _MinMaxScaler 5 | 6 | from base import BaseAlgo, TransformerMixin 7 | from codec import codecs_manager 8 | from util.param_util import convert_params 9 | from util import df_util 10 | 11 | 12 | class MinMaxScaler(TransformerMixin, BaseAlgo): 13 | 14 | def __init__(self, options): 15 | self.handle_options(options) 16 | 17 | out_params = convert_params( 18 | options.get('params', {}), 19 | bools=['copy'], 20 | strs=['feature_range'] 21 | ) 22 | # feature_range arrives as a string, e.g. feature_range=0-1; sklearn expects a tuple 23 | if 'feature_range' in out_params: 24 | out_params['feature_range'] = tuple(int(i) for i in out_params['feature_range'].split('-')) 25 | self.estimator = _MinMaxScaler(**out_params) 26 | self.columns = None 27 | 28 | def rename_output(self, default_names, new_names=None): 29 | if new_names is None: 30 | new_names = 'MMS' 31 | output_names = [new_names + '_' + feature for feature in self.columns] 32 | return output_names 33 | 34 | def partial_fit(self, df, options): 35 | # Make a copy of data, to not alter original dataframe 36 | X = df.copy() 37 | 38 | X, _, columns = df_util.prepare_features( 39 | X=X, 40 | variables=self.feature_variables, 41 | mlspl_limits=options.get('mlspl_limits'), 42 | ) 43 | if self.columns is not None: 44 | df_util.handle_new_categorical_values(X, None, options, self.columns) 45 | if X.empty: 46 | return 47 | else: 48 | self.columns = columns 49 | self.estimator.partial_fit(X) 50 | 51 | def summary(self, options): 52 | if len(options) != 2: # only model name and mlspl_limits 53 | raise RuntimeError('"%s" models do not take options for summarization' % self.__class__.__name__) 54 | return pd.DataFrame({'fields': self.columns, 55 | 'min': self.estimator.data_min_, 56 | 'max': self.estimator.data_max_, 57 | 'scale': self.estimator.scale_}) 58 | 59 | @staticmethod 60 | def register_codecs(): 61 | from codec.codecs import SimpleObjectCodec 62 | codecs_manager.add_codec('algos_contrib.MinMaxScaler', 'MinMaxScaler', SimpleObjectCodec) 63 | codecs_manager.add_codec('sklearn.preprocessing.data', 'MinMaxScaler', SimpleObjectCodec) 64 | --------------------------------------------------------------------------------
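The reason this wrapper implements partial_fit(): sklearn's MinMaxScaler can update its running data_min_/data_max_ from successive chunks, which matches how MLTK streams search results into an algorithm. A standalone sketch:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
for batch in (np.array([[0.0], [5.0]]), np.array([[10.0]])):
    scaler.partial_fit(batch)                  # updates running data_min_/data_max_
print(scaler.data_min_, scaler.data_max_)      # [ 0.] [ 10.]
print(scaler.transform(np.array([[5.0]])))     # [[ 0.5]]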
/src/bin/algos_contrib/MDS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn.manifold import MDS as _MDS 4 | 5 | from base import BaseAlgo, TransformerMixin 6 | from codec import codecs_manager 7 | from util.param_util import convert_params 8 | 9 | from util import df_util 10 | 11 | class MDS(TransformerMixin, BaseAlgo): 12 | 13 | def __init__(self, options): 14 | self.handle_options(options) 15 | out_params = convert_params( 16 | options.get('params', {}), 17 | ints=['k', 'max_iter', 'n_init', 'n_jobs'], 18 | floats=['eps'], 19 | bools=['metric'], 20 | aliases={'k': 'n_components'} 21 | ) 22 | 23 | if 'max_iter' not in out_params: 24 | out_params.setdefault('max_iter', 300) 25 | 26 | if 'n_init' not in out_params: 27 | out_params.setdefault('n_init', 4) 28 | 29 | if 'n_jobs' not in out_params: 30 | out_params.setdefault('n_jobs', 1) 31 | 32 | if 'eps' not in out_params: 33 | out_params.setdefault('eps', 0.001) 34 | 35 | if 'metric' not in out_params: 36 | out_params.setdefault('metric', True) 37 | 38 | self.estimator = _MDS(**out_params) 39 | 40 | def rename_output(self, default_names, new_names): 41 | if new_names is None: 42 | new_names = 'MDS' 43 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] 44 | return output_names 45 | 46 | def apply(self, df, options): 47 | # Make a copy of data, to not alter original dataframe 48 | X = df.copy() 49 | 50 | # Prepare the features 51 | X, nans, _ = df_util.prepare_features( 52 | X=X, 53 | variables=self.feature_variables, 54 | final_columns=self.columns, 55 | ) 56 | 57 | # Call the transform method 58 | y_hat = self.estimator.fit_transform(X.values) 59 | 60 | # Assign output_name 61 | output_name = options.get('output_name', None) 62 | default_names = self.make_output_names( 63 | output_name=output_name, 64 | n_names=y_hat.shape[1], 65 | ) 66 | output_names = self.rename_output(default_names, output_name) 67 | 68 | # Create output dataframe 69 | output = df_util.create_output_dataframe( 70 | y_hat=y_hat, 71 | nans=nans, 72 | output_names=output_names, 73 | ) 74 | 75 | # Merge with original dataframe 76 | output = df_util.merge_predictions(df, output) 77 | return output 78 | 79 | @staticmethod 80 | def register_codecs(): 81 | from codec.codecs import SimpleObjectCodec 82 | codecs_manager.add_codec('algos_contrib.MDS', 'MDS', SimpleObjectCodec) 83 | codecs_manager.add_codec('sklearn.manifold.mds', 'MDS', SimpleObjectCodec) 84 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/TSNE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn.manifold import TSNE as _TSNE 4 | 5 | from base import BaseAlgo, TransformerMixin 6 | from codec import codecs_manager 7 | from util.param_util import convert_params 8 | 9 | from util import df_util 10 | 11 | class TSNE(TransformerMixin, BaseAlgo): 12 | 13 | def __init__(self, options): 14 | self.handle_options(options) 15 | out_params = convert_params( 16 | options.get('params', {}), 17 | ints=['k', 'n_iter'], 18 | floats=['perplexity', 'early_exaggeration', 'learning_rate'], 19 | aliases={'k': 'n_components'} 20 | ) 21 | 22 | if 'n_components' in out_params and out_params['n_components'] < 1: 23 | msg = 'Invalid value for k: k must be greater than or equal to 1, but found k="{}".' 24 | raise RuntimeError(msg.format(out_params['n_components'])) 25 | 26 | if 'n_iter' not in out_params: 27 | out_params.setdefault('n_iter', 200) 28 | 29 | if 'perplexity' not in out_params: 30 | out_params.setdefault('perplexity', 30.0) 31 | 32 | if 'early_exaggeration' not in out_params: 33 | out_params.setdefault('early_exaggeration', 4.0) 34 | 35 | if 'learning_rate' not in out_params: 36 | out_params.setdefault('learning_rate', 100) 37 | 38 | self.estimator = _TSNE(**out_params) 39 | 40 | def rename_output(self, default_names, new_names): 41 | if new_names is None: 42 | new_names = 'TSNE' 43 | output_names = ['{}_{}'.format(new_names, i+1) for i in xrange(len(default_names))] 44 | return output_names 45 | 46 | def apply(self, df, options): 47 | # Make a copy of data, to not alter original dataframe 48 | X = df.copy() 49 | 50 | # Prepare the features 51 | X, nans, _ = df_util.prepare_features( 52 | X=X, 53 | variables=self.feature_variables, 54 | final_columns=self.columns, 55 | ) 56 | 57 | # Call the transform method 58 | y_hat = self.estimator.fit_transform(X.values) 59 | 60 | # Assign output_name 61 | output_name = options.get('output_name', None) 62 | default_names = self.make_output_names( 63 | output_name=output_name, 64 | n_names=y_hat.shape[1], 65 | ) 66 | output_names = self.rename_output(default_names, output_name) 67 | 68 | # Create output dataframe 69 | output = df_util.create_output_dataframe( 70 | y_hat=y_hat, 71 | nans=nans, 72 | output_names=output_names, 73 | ) 74 | 75 | # Merge with original dataframe 76 | output = df_util.merge_predictions(df, output) 77 | return output 78 | 79 | @staticmethod 80 | def register_codecs(): 81 | from codec.codecs import SimpleObjectCodec 82 | codecs_manager.add_codec('algos_contrib.TSNE', 'TSNE', SimpleObjectCodec) 83 | codecs_manager.add_codec('sklearn.manifold.t_sne', 'TSNE', SimpleObjectCodec) 84 | --------------------------------------------------------------------------------
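Note that MDS and TSNE override apply() to call fit_transform(): sklearn's manifold learners embed exactly the data they are given and expose no separate transform() for new rows, which is also why their tests pass serializable=False. A standalone sketch:

import numpy as np
from sklearn.manifold import TSNE

X = np.random.RandomState(0).rand(50, 4)
embedding = TSNE(n_components=2, random_state=0).fit_transform(X)
print(embedding.shape)  # (50, 2); there is no transform() to apply to unseen rows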
/src/bin/algos_contrib/CustomDecisionTreeClassifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn.tree import DecisionTreeClassifier as _DecisionTreeClassifier 4 | from base import ClassifierMixin, BaseAlgo 5 | from codec import codecs_manager 6 | from util.param_util import convert_params 7 | from util.algo_util import tree_summary 8 | 9 | # This algorithm is an updated version of DecisionTreeClassifier from MLTK, with a class_weight parameter added to it 10 | 11 | class CustomDecisionTreeClassifier(ClassifierMixin, BaseAlgo): 12 | def __init__(self, options): 13 | self.handle_options(options) 14 | 15 | out_params = convert_params( 16 | options.get('params', {}), 17 | ints=['random_state', 'max_depth', 'min_samples_split', 'max_leaf_nodes'], 18 | strs=['criterion', 'splitter', 'max_features', 'class_weight'], 19 | ) 20 | 21 | # whitelist valid values for criterion, as error raised by sklearn for invalid values is uninformative 22 | if 'criterion' in out_params: 23 | try: 24 | assert (out_params['criterion'] in ['gini', 'entropy']) 25 | except AssertionError: 26 | raise RuntimeError('Invalid value for option criterion: "%s"' % out_params['criterion']) 27 | 28 | # whitelist valid values for splitter, as error raised by sklearn for invalid values is uninformative 29 | if 'splitter' in out_params: 30 | try: 31 | assert (out_params['splitter'] in ['best', 'random']) 32 | except AssertionError: 33 | raise RuntimeError('Invalid value for option splitter: "%s"' % out_params['splitter']) 34 | 35 | if 'max_depth' not in out_params: 36 | out_params.setdefault('max_leaf_nodes', 2000) 37 | 38 | # EAFP... convert max_features to int or float if it is a number. 39 | try: 40 | out_params['max_features'] = float(out_params['max_features']) 41 | max_features_int = int(out_params['max_features']) 42 | if out_params['max_features'] == max_features_int: 43 | out_params['max_features'] = max_features_int 44 | except: 45 | pass 46 | 47 | if 'class_weight' in out_params: 48 | try: 49 | from ast import literal_eval 50 | out_params['class_weight'] = literal_eval(out_params['class_weight']) 51 | except Exception: 52 | raise RuntimeError('Invalid value for option class_weight: "%s"' % out_params['class_weight']) 53 | 54 | self.estimator = _DecisionTreeClassifier(**out_params) 55 | 56 | def summary(self, options): 57 | if 'args' in options: 58 | raise RuntimeError('Summarization does not take values other than parameters') 59 | return tree_summary(self, options) 60 | 61 | @staticmethod 62 | def register_codecs(): 63 | from codec.codecs import SimpleObjectCodec, TreeCodec 64 | codecs_manager.add_codec('algos_contrib.CustomDecisionTreeClassifier', 'CustomDecisionTreeClassifier', SimpleObjectCodec) 65 | codecs_manager.add_codec('sklearn.tree.tree', 'DecisionTreeClassifier', SimpleObjectCodec) 66 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) 67 | --------------------------------------------------------------------------------
/src/bin/algos_contrib/AgglomerativeClustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import AgglomerativeClustering as AgClustering 3 | from sklearn.metrics import silhouette_samples 4 | 5 | from base import BaseAlgo 6 | from util.param_util import convert_params 7 | from util import df_util 8 | 9 | 10 | class AgglomerativeClustering(BaseAlgo): 11 | """Use scikit-learn's AgglomerativeClustering algorithm to cluster data.""" 12 | 13 | def __init__(self, options): 14 | 15 | feature_variables = options.get('feature_variables', {}) 16 | target_variable =
options.get('target_variable', {}) 17 | 18 | # Ensure fields are present 19 | if len(feature_variables) == 0: 20 | raise RuntimeError('You must supply one or more fields') 21 | 22 | # No from clause allowed 23 | if len(target_variable) > 0: 24 | raise RuntimeError('AgglomerativeClustering does not support the from clause') 25 | 26 | # Convert params & alias k to n_clusters 27 | params = options.get('params', {}) 28 | out_params = convert_params( 29 | params, 30 | ints=['k'], 31 | strs=['linkage', 'affinity'], 32 | aliases={'k': 'n_clusters'} 33 | ) 34 | 35 | # Check for valid linkage 36 | if 'linkage' in out_params: 37 | valid_linkage = ['ward', 'complete', 'average'] 38 | if out_params['linkage'] not in valid_linkage: 39 | raise RuntimeError('linkage must be one of: {}'.format(', '.join(valid_linkage))) 40 | 41 | # Check for valid affinity 42 | if 'affinity' in out_params: 43 | valid_affinity = ['l1', 'l2', 'cosine', 'manhattan', 44 | 'precomputed', 'euclidean'] 45 | 46 | if out_params['affinity'] not in valid_affinity: 47 | raise RuntimeError('affinity must be one of: {}'.format(', '.join(valid_affinity))) 48 | 49 | # Check for invalid affinity & linkage combination 50 | if 'linkage' in out_params and 'affinity' in out_params: 51 | if out_params['linkage'] == 'ward': 52 | if out_params['affinity'] != 'euclidean': 53 | raise RuntimeError('ward linkage (default) must use euclidean affinity (default)') 54 | 55 | # Initialize the estimator 56 | self.estimator = AgClustering(**out_params) 57 | 58 | def fit(self, df, options): 59 | """Do the clustering & merge labels with original data.""" 60 | # Make a copy of the input data 61 | X = df.copy() 62 | 63 | # Use the df_util prepare_features method to 64 | # - drop null columns & rows 65 | # - convert categorical columns into dummy indicator columns 66 | # X is our cleaned data, nans is a mask of the null value locations 67 | X, nans, columns = df_util.prepare_features(X, self.feature_variables) 68 | 69 | # Do the actual clustering 70 | y_hat = self.estimator.fit_predict(X.values) 71 | 72 | # attach silhouette coefficient score for each row 73 | silhouettes = silhouette_samples(X, y_hat) 74 | 75 | # Combine the two arrays, and transpose them. 
76 | y_hat = np.vstack([y_hat, silhouettes]).T 77 | 78 | # Assign default output names 79 | default_name = 'cluster' 80 | 81 | # Get the value from the as-clause if present 82 | output_name = options.get('output_name', default_name) 83 | 84 | # There are two columns - one for the labels, one for the silhouette scores 85 | output_names = [output_name, 'silhouette_score'] 86 | 87 | # Use the predictions & nans-mask to create a new dataframe 88 | output_df = df_util.create_output_dataframe(y_hat, nans, output_names) 89 | 90 | # Merge the dataframe with the original input data 91 | df = df_util.merge_predictions(df, output_df) 92 | return df 93 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/CollaborativeFilter.py: -------------------------------------------------------------------------------- 1 | 2 | from base import BaseAlgo 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from sklearn.metrics.pairwise import pairwise_distances 7 | from cexc import get_logger 8 | from util import df_util 9 | from util.param_util import convert_params 10 | 11 | # Everyone's favorite in-memory collaborative filter; not a scalable solution for millions of users and millions of items 12 | # https://en.wikipedia.org/wiki/Collaborative_filtering 13 | # please check out more scalable solutions in KNN or "Recommender Systems: The Textbook" 14 | # TODO add coldstart solution for nulls 15 | # TODO currently we assume a |fillnull value=0 is run in splunk prior to calling the algorithm 16 | 17 | # We ASSUME rows are users, columns are items. 18 | # TODO I seem to cause splunk memory issues with wide tables, so I should consider doing an XYSERIES like reshape 19 | # TODO and consider taking in a table of USERID, ITEM , RATING from splunk. Yucky. 20 | 21 | # TODO There are many many many other distance metrics that could be a good fit. 22 | 23 | 24 | class CollaborativeFilter(BaseAlgo): 25 | def __init__(self, options): 26 | 27 | 28 | # set parameters 29 | params = options.get('params', {}) 30 | out_params = convert_params( 31 | params, 32 | strs=['user_field','rating_type','coldstart_field'] 33 | ) 34 | 35 | # set defaults for parameters 36 | if 'user_field' in out_params: 37 | self.user_field = out_params['user_field'] 38 | else: 39 | self.user_field = "SME" 40 | 41 | self.rating_type="item" 42 | if 'rating_type' in out_params: 43 | if out_params['rating_type'] == "item": 44 | self.rating_type="item" 45 | elif out_params['rating_type'] == "user": 46 | self.rating_type="user" 47 | 48 | 49 | def fit(self, df, options): 50 | # df contains all the search results, including hidden fields 51 | # but the requested fields are saved as self.feature_variables 52 | logger = get_logger('MyCustomLogging') 53 | 54 | X=df.copy() 55 | 56 | # it is always best practice to prepare your data. 57 | # Splunk has a number of hidden fields that are exposed as part of the search protocol, and we really only 58 | # want the features that are valid field names. 59 | 60 | 61 | #Make sure to turn off get_dummies 62 | X, _, self.columns = df_util.prepare_features( 63 | X=X, 64 | variables=self.feature_variables, 65 | get_dummies=False, 66 | mlspl_limits=options.get('mlspl_limits'), 67 | ) 68 | 69 | # test if user field is in the list 70 | logger.debug("The user field is %s",self.user_field ) 71 | try: 72 | my_list_index=(X[self.user_field].values) 73 | except KeyError: 74 | raise RuntimeError('You must specify a user field that exists. 
You sent %s' % self.user_field) 75 | 76 | X=X.drop([self.user_field],axis=1) 77 | my_list_header=(X.columns.values) 78 | 79 | #ratings as a matrix , clean that data up! 80 | X=X.replace([np.inf, -np.inf], "nan").replace("nan","0") 81 | matrix=X.values 82 | # force type for Numpy Math 83 | matrix=matrix.astype(np.float64) 84 | 85 | # should consider erroring out when you have super sparse user data 86 | # TODO add other methods via parameter 87 | user_sim = pairwise_distances(matrix, metric='cosine') 88 | item_sim = pairwise_distances(matrix.T, metric='cosine') 89 | 90 | #item prediction 91 | item_sim= matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)]) 92 | 93 | #user sim 94 | mean_user_rating = matrix.mean(axis=1) 95 | matrix_diff = (matrix - mean_user_rating[:, np.newaxis]) 96 | user_sim = mean_user_rating[:, np.newaxis] + user_sim.dot(matrix_diff) / np.array([np.abs(user_sim).sum(axis=1)]).T 97 | 98 | # add back into the matrix the header row 99 | if self.rating_type == "item": 100 | output_df=pd.DataFrame(item_sim,columns=my_list_header, index=my_list_index) 101 | if self.rating_type == "user": 102 | output_df=pd.DataFrame(user_sim,columns=my_list_header, index=my_list_index) 103 | output_df[self.user_field]=pd.Series(my_list_index).values 104 | 105 | return output_df 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/TFBinary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Copy of the existing TFIDF algo, with two boolean options added and three options set 4 | so that binary output is achieved. 5 | ''' 6 | 7 | from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizer 8 | 9 | from base import BaseAlgo 10 | from codec import codecs_manager 11 | from util import df_util 12 | from util.param_util import convert_params 13 | 14 | 15 | class TFBinary(BaseAlgo): 16 | 17 | def handle_options(self, options): 18 | if len(options.get('feature_variables', [])) != 1 or len(options.get('target_variable', [])) > 0: 19 | raise RuntimeError('Syntax error: You must specify exactly one field') 20 | 21 | def __init__(self, options): 22 | self.handle_options(options) 23 | 24 | out_params = convert_params( 25 | options.get('params', {}), 26 | ints=['max_features'], 27 | bools=['use_idf','binary'], 28 | strs=['max_df', 'min_df', 29 | 'ngram_range', 'stop_words', 30 | 'analyzer', 'norm', 'token_pattern'], 31 | ) 32 | 33 | for doc_freq, default_val in [('max_df', 1.0), ('min_df', 1)]: 34 | if doc_freq in out_params: 35 | # EAFP... convert max_df/min_df to float/int if it is a number. 36 | try: 37 | float_val = float(out_params[doc_freq]) 38 | int_val = int(float_val) 39 | except ValueError: 40 | raise RuntimeError('Syntax Error: {doc_freq} requires a numeric value, e.g. {doc_freq}=1.0'.format(doc_freq=doc_freq)) 41 | if float_val == 1.0: 42 | out_params[doc_freq] = default_val 43 | elif float_val == int_val: 44 | out_params[doc_freq] = int_val 45 | else: 46 | out_params[doc_freq] = float_val 47 | 48 | if 'ngram_range' in out_params: 49 | try: 50 | out_params['ngram_range'] = tuple(int(i) for i in out_params['ngram_range'].split('-')) 51 | assert len(out_params['ngram_range']) == 2 52 | except (ValueError, AssertionError): 53 | raise RuntimeError('Syntax Error: ngram_range requires a range, e.g. ngram_range=1-5') 54 | 55 | # TODO: Maybe let the user know that we make this change. 
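# Note: max_features=100 keeps only the 100 most frequent terms, which bounds the
# number of output columns. Together with the defaults below (use_idf=False,
# norm=None, binary=True), TfidfVectorizer reduces to a plain 0/1 term-presence
# encoder: each output column is 1 when the term occurs in the event, else 0.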
56 | out_params.setdefault('max_features', 100) 57 | 58 | # Binary defaults 59 | out_params.setdefault('use_idf', False) 60 | out_params.setdefault('norm', None) 61 | out_params.setdefault('binary', True) 62 | 63 | self.estimator = _TfidfVectorizer(**out_params) 64 | 65 | def fit(self, df, options): 66 | # Make a copy of data, to not alter original dataframe 67 | X = df.copy() 68 | 69 | # Make sure to turn off get_dummies 70 | X, _, self.columns = df_util.prepare_features( 71 | X=X, 72 | variables=self.feature_variables, 73 | get_dummies=False, 74 | mlspl_limits=options.get('mlspl_limits'), 75 | ) 76 | 77 | X = X.values.ravel().astype('str') 78 | self.estimator.fit(X) 79 | 80 | def make_output_names(self, options): 81 | default_name = self.feature_variables[0] + '_tfbin' 82 | output_name = options.get('output_name', default_name) 83 | feature_names = self.estimator.get_feature_names() 84 | output_names = [output_name + '_' + str(index) + '_' + word 85 | for (index, word) in enumerate(feature_names)] 86 | return output_names 87 | 88 | def apply(self, df, options): 89 | # Make a copy of data, to not alter original dataframe 90 | X = df.copy() 91 | 92 | # Make sure to turn off get_dummies 93 | X, nans, _ = df_util.prepare_features( 94 | X=X, 95 | variables=self.feature_variables, 96 | final_columns=self.columns, 97 | get_dummies=False, 98 | mlspl_limits=options.get('mlspl_limits'), 99 | ) 100 | 101 | X = X.values.ravel().astype('str') 102 | y_hat = self.estimator.transform(X) 103 | 104 | # Convert the returned sparse matrix into array 105 | y_hat = y_hat.toarray() 106 | 107 | output_names = self.make_output_names(options) 108 | 109 | output = df_util.create_output_dataframe( 110 | y_hat=y_hat, 111 | output_names=output_names, 112 | nans=nans, 113 | ) 114 | 115 | df = df_util.merge_predictions(df, output) 116 | return df 117 | 118 | @staticmethod 119 | def register_codecs(): 120 | from codec.codecs import SimpleObjectCodec 121 | codecs_manager.add_codec('algos_contrib.TFBinary', 'TFBinary', SimpleObjectCodec) 122 | codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfVectorizer', SimpleObjectCodec) 123 | codecs_manager.add_codec('sklearn.feature_extraction.text', 'TfidfTransformer', SimpleObjectCodec) 124 | codecs_manager.add_codec('scipy.sparse.dia', 'dia_matrix', SimpleObjectCodec) 125 | -------------------------------------------------------------------------------- /src/bin/test/test_contrib_util.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import io 3 | import pandas as pd 4 | import pytest 5 | import sys 6 | 7 | from base import BaseAlgo 8 | from util.base_util import MLSPLNotImplementedError 9 | 10 | from contrib_util import AlgoTestUtils 11 | 12 | 13 | @pytest.fixture 14 | def min_algo_cls(): 15 | class MinimalAlgo(BaseAlgo): 16 | pass 17 | return MinimalAlgo 18 | 19 | 20 | @pytest.fixture 21 | def serializable_algo_cls(): 22 | class SerializableAlgo(BaseAlgo): 23 | def __init__(self, options): 24 | pass 25 | 26 | def fit(self, df, options): 27 | pass 28 | 29 | def apply(self, df, options): 30 | return df 31 | 32 | @classmethod 33 | def register_codecs(cls): 34 | from codec.codecs import SimpleObjectCodec 35 | from codec import codecs_manager 36 | codecs_manager.add_codec('test.test_contrib_util', 'SerializableAlgo', SimpleObjectCodec) 37 | 38 | # Add the class to this module so that encoder and decoder can access it. 39 | # This is only necessary for a fixture function. 
Normally, these classes will be defined within a module. 40 | setattr(sys.modules[__name__], 'SerializableAlgo', SerializableAlgo) 41 | return SerializableAlgo 42 | 43 | 44 | mock_algo_conf = """ 45 | [MinimalAlgo] 46 | package=algos_contrib 47 | """ 48 | 49 | 50 | mock_algo_conf_no_package = """ 51 | [MinimalAlgo] 52 | """ 53 | 54 | 55 | def test_method_signature(min_algo_cls): 56 | AlgoTestUtils.assert_method_signature(min_algo_cls, 'fit', ['self', 'df', 'options']) 57 | 58 | 59 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf)) 60 | def test_registered(mock_get_algos_conf_fp, min_algo_cls): 61 | AlgoTestUtils.assert_registered(min_algo_cls) 62 | 63 | 64 | def test_serializable(serializable_algo_cls): 65 | AlgoTestUtils.assert_serializable(serializable_algo_cls, input_df=pd.DataFrame({}), options={}) 66 | 67 | 68 | def test_base_algo_method_signatures_default_methods(min_algo_cls): 69 | AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls) 70 | 71 | 72 | def test_base_algo_method_signatures_all_methods(min_algo_cls): 73 | AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[ 74 | '__init__', 75 | 'fit', 76 | 'partial_fit', 77 | 'apply', 78 | 'register_codecs', 79 | ]) 80 | 81 | 82 | def test_base_algo_method_signatures_extra_methods(min_algo_cls): 83 | with pytest.raises(AssertionError) as e: 84 | extra_args = [ 85 | 'extra1', 86 | 'extra2', 87 | ] 88 | AlgoTestUtils.assert_base_algo_method_signatures(min_algo_cls, required_methods=[ 89 | '__init__', 90 | 'fit', 91 | 'partial_fit', 92 | 'apply', 93 | 'register_codecs', 94 | ] + extra_args) 95 | assert e.match('{}.*not in BaseAlgo'.format(extra_args)) 96 | 97 | 98 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf)) 99 | def test_algo_basic(mock_get_algos_conf_fp, min_algo_cls): 100 | AlgoTestUtils.assert_algo_basic(min_algo_cls, serializable=False) 101 | 102 | 103 | def test_no_base_algo(): 104 | class NoBaseAlgo(object): 105 | pass 106 | 107 | with pytest.raises(AssertionError) as e: 108 | AlgoTestUtils.assert_base_algo_method_signatures(NoBaseAlgo) 109 | assert e.match('must inherit from BaseAlgo') 110 | 111 | 112 | def test_method_signature_non_existent(min_algo_cls): 113 | bad_method = 'foot' 114 | with pytest.raises(AssertionError) as e: 115 | AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options']) 116 | assert e.match("{}.*does not exist".format(bad_method)) 117 | 118 | 119 | def test_method_signature_not_callable(min_algo_cls): 120 | bad_method = 'fit' 121 | 122 | # Replace fit with a non-callable attribute. 
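# Note: this mutates the class object returned by the fixture, but min_algo_cls is a
# function-scoped pytest fixture, so each test receives a freshly defined MinimalAlgo
# class and the mutation cannot leak into other tests.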
123 | min_algo_cls.fit = 'fit' 124 | 125 | with pytest.raises(AssertionError) as e: 126 | AlgoTestUtils.assert_method_signature(min_algo_cls, bad_method, ['self', 'df', 'options']) 127 | assert e.match("{}.*not callable".format(bad_method)) 128 | 129 | 130 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf)) 131 | def test_unregistered(mock_get_algos_conf_fp): 132 | class UnregisteredAlgo(BaseAlgo): 133 | pass 134 | 135 | with pytest.raises(AssertionError) as e: 136 | AlgoTestUtils.assert_registered(UnregisteredAlgo) 137 | assert e.match('{}.*not registered'.format(UnregisteredAlgo.__name__)) 138 | 139 | 140 | @mock.patch.object(AlgoTestUtils, 'get_algos_conf_fp', return_value=io.BytesIO(mock_algo_conf_no_package)) 141 | def test_registered_with_missing_package_option(mock_get_algos_conf_fp, min_algo_cls): 142 | with pytest.raises(AssertionError) as e: 143 | AlgoTestUtils.assert_registered(min_algo_cls) 144 | assert e.match('{}.*must override.*package'.format(min_algo_cls.__name__)) 145 | 146 | 147 | def test_not_serializable(min_algo_cls): 148 | with pytest.raises(MLSPLNotImplementedError) as e: 149 | AlgoTestUtils.assert_serializable(min_algo_cls, input_df=pd.DataFrame({}), options={}) 150 | assert e.match('does not support saving') 151 | 152 | 153 | -------------------------------------------------------------------------------- /src/bin/algos_contrib/IsolationForest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from sklearn.ensemble import IsolationForest as _IsolationForest 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from base import ClustererMixin, BaseAlgo 8 | from codec import codecs_manager 9 | from codec.codecs import BaseCodec 10 | from codec.flatten import flatten, expand 11 | from util import df_util 12 | from util.param_util import convert_params 13 | from cexc import get_messages_logger,get_logger 14 | 15 | class IsolationForest(ClustererMixin, BaseAlgo): 16 | """ 17 | This is the implementation wrapper around Isolation Forest from scikit-learn. It inherits methods from ClustererMixin and BaseAlgo. 18 | """ 19 | def __init__(self, options): 20 | self.handle_options(options) 21 | out_params = convert_params( 22 | options.get('params',{}), 23 | ints = ['n_estimators','n_jobs','random_state','verbose'], 24 | floats = ['max_samples','contamination','max_features'], 25 | bools = ['bootstrap'] 26 | ) 27 | self.return_scores = out_params.pop('anomaly_score', True)  # NOTE: 'anomaly_score' is not declared in convert_params above, so this always falls back to the default True 28 | 29 | # whitelist n_estimators > 0 30 | if 'n_estimators' in out_params and out_params['n_estimators']<=0: 31 | msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".' 32 | raise RuntimeError(msg.format(out_params['n_estimators'])) 33 | 34 | # whitelist max_samples in (0.0, 1.0] 35 | if 'max_samples' in out_params and not (0.0 < out_params['max_samples'] <= 1.0): 36 | msg = 'Invalid value error: max_samples must be a float in (0.0, 1.0], but found max_samples="{}".' 37 | raise RuntimeError(msg.format(out_params['max_samples'])) 38 | 39 | # whitelist contamination should be in (0.0, 0.5] as error raised by sklearn for values out of range 40 | if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5): 41 | msg = ( 42 | 'Invalid value error: Valid values for contamination are in (0.0, 0.5], ' 43 | 'but found contamination="{}".' 
44 | ) 45 | raise RuntimeError(msg.format(out_params['contamination'])) 46 | 47 | # whitelist max_features in (0.0, 1.0] 48 | if 'max_features' in out_params and not (0.0 < out_params['max_features'] <= 1.0): 49 | msg = 'Invalid value error: max_features must be a float in (0.0, 1.0], but found max_features="{}".' 50 | raise RuntimeError(msg.format(out_params['max_features'])) 51 | 52 | 53 | self.estimator = _IsolationForest(**out_params) 54 | 55 | 56 | def apply(self, df, options): 57 | # Make a copy of data, to not alter original dataframe 58 | logger = get_logger('IsolationForest Logger') 59 | X = df.copy() 60 | 61 | X, nans, _ = df_util.prepare_features( 62 | X=X, 63 | variables=self.feature_variables, 64 | final_columns=self.columns, 65 | mlspl_limits=options.get('mlspl_limits'), 66 | ) 67 | 68 | # Multiplying the result by -1 to represent Outliers with 1 and Inliers/Normal points with -1. 69 | y_hat = self.estimator.predict(X.values)*-1 70 | # Log the percentage of points flagged as outliers 71 | outlier_pct = "Outlier percentage: {}".format(str(round((list(y_hat).count(1)*100)/y_hat.shape[0], 2))) 72 | logger.debug(outlier_pct) 73 | 74 | y_hat = y_hat.astype('str') 75 | 76 | #Assign output_name 77 | default_name = 'isOutlier' 78 | new_name = options.get('output_name', None) 79 | output_name = self.rename_output(default_names=default_name, new_names=new_name) 80 | 81 | # Create output dataframe 82 | output = df_util.create_output_dataframe( 83 | y_hat=y_hat, nans=nans, output_names=output_name 84 | ) 85 | # Merge with original dataframe 86 | output = df_util.merge_predictions(df, output) 87 | return output 88 | 89 | def rename_output(self, default_names, new_names=None): 90 | """Utility hook to rename output. 91 | 92 | The default behavior is to take the default_names passed in and simply 93 | return them. If however a particular algo needs to rename the columns of 94 | the output, this method can be overridden. 95 | """ 96 | return new_names if new_names is not None else default_names 97 | 98 | 99 | @staticmethod 100 | def register_codecs(): 101 | from codec.codecs import SimpleObjectCodec, TreeCodec 102 | codecs_manager.add_codec('algos_contrib.IsolationForest', 'IsolationForest', SimpleObjectCodec) 103 | codecs_manager.add_codec('sklearn.ensemble.iforest', 'IsolationForest', SimpleObjectCodec) 104 | codecs_manager.add_codec('sklearn.tree.tree','ExtraTreeRegressor', ExtraTreeRegressorCodec) 105 | codecs_manager.add_codec('sklearn.tree._tree', 'Tree', TreeCodec) 106 | 107 | 108 | class ExtraTreeRegressorCodec(BaseCodec): 109 | """ 110 | This is an ExtraTreeRegressor Codec for saving the Isolation Forest base estimator to memory/file. 
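encode() captures the fitted estimator's state via __getstate__(), and decode() rebuilds the object with ExtraTreeRegressor.__new__() followed by __setstate__(), deliberately bypassing __init__.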
111 | """ 112 | @classmethod 113 | def encode(cls, obj): 114 | import sklearn.tree 115 | assert type(obj) == sklearn.tree.tree.ExtraTreeRegressor 116 | state = obj.__getstate__() 117 | return { 118 | '__mlspl_type': [type(obj).__module__, type(obj).__name__], 119 | 'state': state 120 | } 121 | 122 | @classmethod 123 | def decode(cls,obj): 124 | from sklearn.tree.tree import ExtraTreeRegressor 125 | state = obj['state'] 126 | t = ExtraTreeRegressor.__new__(ExtraTreeRegressor) 127 | t.__setstate__(state) 128 | return t -------------------------------------------------------------------------------- /src/bin/test/contrib_util.py: -------------------------------------------------------------------------------- 1 | """ Utility methods for use in testing.""" 2 | import ConfigParser 3 | import json 4 | import os 5 | from inspect import getargspec 6 | 7 | import pandas as pd 8 | 9 | from base import BaseAlgo 10 | from codec import MLSPLDecoder, MLSPLEncoder 11 | 12 | 13 | PACKAGE_NAME='algos_contrib' 14 | 15 | 16 | class AlgoTestUtils(object): 17 | """ 18 | Helper methods for testing algorithm implementations 19 | """ 20 | @staticmethod 21 | def assert_method_signature(algo_cls, method_name, args): 22 | """ 23 | Assert the signature of the specified method 24 | 25 | Args: 26 | algo_cls (class): a custom algorithm class to check 27 | method_name (str): the name of the method 28 | args (list): expected arguments to the named method 29 | 30 | Returns: 31 | (bool): True if the method is callable and has the specified arguments, False otherwise. 32 | 33 | Raises: 34 | AssertionError 35 | """ 36 | method = getattr(algo_cls, method_name, None) 37 | assert method, "Method '{}' does not exist".format(method_name) 38 | assert callable(method), "Method '{}' is not callable".format(method_name) 39 | found_args = getargspec(method).args 40 | msg = 'Method {} has signature: {} - but should have {}'.format(method, args, found_args) 41 | assert found_args == args, msg 42 | 43 | @classmethod 44 | def assert_registered(cls, algo_cls): 45 | """ 46 | Assert that the algorithm is registered in the algos.conf configuration file. 47 | 48 | Args: 49 | algo_cls (class): a custom algorithm class to check 50 | 51 | Returns: 52 | (bool): True if the method is registered in algos.conf file. 53 | 54 | Raises: 55 | AssertionError 56 | """ 57 | config = ConfigParser.RawConfigParser() 58 | with cls.get_algos_conf_fp() as f: 59 | config.readfp(f) 60 | algo_name = algo_cls.__name__ 61 | try: 62 | package_name = config.get(algo_name, 'package') 63 | except ConfigParser.NoSectionError: 64 | assert False, "'{}' not registered in algos.conf".format(algo_name) 65 | except ConfigParser.NoOptionError: 66 | assert False, "'{}' must override 'package' option in algos.conf".format(algo_name) 67 | 68 | assert package_name == PACKAGE_NAME, "The package name must be '{}'".format(PACKAGE_NAME) 69 | 70 | @staticmethod 71 | def assert_serializable(algo_cls, input_df, options): 72 | """ 73 | Assert that the model created by the algorithm is serializable. 74 | 75 | Args: 76 | algo_cls (class): a custom algorithm class to check 77 | input_df (pandas Dataframe): input dataframe for the algorithm being tested 78 | options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm 79 | 80 | Returns: 81 | (bool): True if the the model is serializable, False otherwise. 
82 | 83 | Raises: 84 | AssertionError 85 | """ 86 | assert hasattr(algo_cls, 'register_codecs') 87 | algo_cls.register_codecs() 88 | 89 | algo_inst = algo_cls(options) 90 | algo_inst.feature_variables = ['b', 'c'] 91 | algo_inst.target_variable = 'a' 92 | algo_inst.fit(input_df.copy(), options) 93 | 94 | encoded = json.dumps(algo_inst, cls=MLSPLEncoder) 95 | decoded = json.loads(encoded, cls=MLSPLDecoder) 96 | 97 | orig_y = algo_inst.apply(input_df.copy(), options) 98 | decoded_y = decoded.apply(input_df.copy(), options) 99 | pd.util.testing.assert_frame_equal(orig_y, decoded_y) 100 | 101 | @classmethod 102 | def assert_base_algo_method_signatures(cls, algo_cls, required_methods=None): 103 | """ 104 | Assert that the signatures of the algorithm's methods adhere to the API. 105 | 106 | Args: 107 | algo_cls (class): a custom algorithm class to check. 108 | required_methods (list): list of required method names. 109 | '__init__' and 'fit' are always required, so 110 | they do not need to be included. 111 | 112 | 113 | Returns: 114 | None. An AssertionError is raised if the methods do not adhere to the API. 115 | 116 | Raises: 117 | AssertionError 118 | """ 119 | method_args_map = { 120 | '__init__': ['self', 'options'], 121 | 'fit': ['self', 'df', 'options'], 122 | 'partial_fit': ['self', 'df', 'options'], 123 | 'apply': ['self', 'df', 'options'], 124 | 'summary': ['self', 'options'], 125 | 'register_codecs': [], 126 | } 127 | 128 | if required_methods is None: 129 | required_methods = [] 130 | 131 | assert issubclass(algo_cls, BaseAlgo), 'Algorithms must inherit from BaseAlgo.' 132 | 133 | required_method_set = set(required_methods) 134 | extra_methods = required_method_set - method_args_map.viewkeys() 135 | assert extra_methods == set(), "'{}' not in BaseAlgo".format(", ".join(extra_methods)) 136 | 137 | # __init__ and fit are always required. 138 | required_method_set.add('__init__') 139 | required_method_set.add('fit') 140 | 141 | for required_method in required_method_set: 142 | cls.assert_method_signature(algo_cls, required_method, method_args_map[required_method]) 143 | 144 | @classmethod 145 | def assert_algo_basic(cls, algo_cls, required_methods=None, input_df=None, options=None, serializable=True): 146 | """ 147 | Assert signatures of methods, registration, and serialization. 148 | 149 | Args: 150 | algo_cls (class): a custom algorithm class to check. 151 | input_df (pandas Dataframe): input dataframe for the algorithm being tested 152 | options (dict): options for the fit() (and apply(), if applicable) methods of the algorithm 153 | serializable (bool): whether to check serializability or not. 154 | 155 | Returns: 156 | None. An AssertionError is raised if any of the checks fail. 157 | 158 | Raises: 159 | AssertionError 160 | """ 161 | cls.assert_base_algo_method_signatures(algo_cls, required_methods) 162 | cls.assert_registered(algo_cls) 163 | if serializable: 164 | # The input and options are required for serializability test. 165 | assert input_df is not None 166 | assert options is not None 167 | cls.assert_serializable(algo_cls, input_df, options) 168 | 169 | @staticmethod 170 | def get_algos_conf_fp(): 171 | """ 172 | Get a reference (pointer) to the algos.conf file, open for reading 173 | 174 | This method mainly exists to aid testing. 
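Tests patch this method (see test_contrib_util.py) to substitute a mock algos.conf for the real file.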
175 | 176 | Returns: 177 | (File): algos.conf file pointer 178 | """ 179 | algos_file_path = os.path.join(os.path.dirname(__file__), '..', '..', 'default', 'algos.conf') 180 | return open(algos_file_path) 181 | 182 | 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mltk-algo-contrib 2 | 3 | This repo contains custom algorithms for use with the [Splunk Machine Learning Toolkit](https://splunkbase.splunk.com/app/2890/). The repo itself is also a Splunk app. 4 | Custom algorithms can be added to the Splunk Machine Learning Toolkit by adhering to the [ML-SPL API](http://docs.splunk.com/Documentation/MLApp/latest/API/Introduction). 5 | The API is a thin wrapper around machine learning estimators provided by libraries such as: 6 | * [scikit-learn](http://scikit-learn.org) 7 | * [statsmodels](http://www.statsmodels.org/) 8 | * [scipy](https://www.scipy.org) 9 | 10 | and custom algorithms. 11 | 12 | Note that this repo is a collection of custom *algorithms* only, and not any libraries. Any libraries required 13 | should only be added to live environments manually and not to this repo. 14 | 15 | A comprehensive guide to using the ML-SPL API can be found [here](http://docs.splunk.com/Documentation/MLApp/latest/API/Introduction). 16 | 17 | A very simple example: 18 | 19 | ```python 20 | from base import BaseAlgo 21 | 22 | 23 | class CustomAlgorithm(BaseAlgo): 24 | def __init__(self, options): 25 | # Option checking & initializations here 26 | pass 27 | 28 | def fit(self, df, options): 29 | # Fit an estimator to df, a pandas DataFrame of the search results 30 | pass 31 | 32 | def partial_fit(self, df, options): 33 | # Incrementally fit a model 34 | pass 35 | 36 | def apply(self, df, options): 37 | # Apply a saved model 38 | # Modify df, a pandas DataFrame of the search results 39 | return df 40 | 41 | @staticmethod 42 | def register_codecs(): 43 | # Add codecs to the codec manager 44 | pass 45 | 46 | ``` 47 | 48 | # Dependencies 49 | 50 | To use the custom algorithms contained in this app, you must also have installed: 51 | 52 | - [Splunk Machine Learning Toolkit](https://splunkbase.splunk.com/app/2890/) 53 | - Python for Scientific Computing Add-on 54 | - [Linux64](https://splunkbase.splunk.com/app/2882/) 55 | - [Linux32](https://splunkbase.splunk.com/app/2884/) 56 | - [Windows64](https://splunkbase.splunk.com/app/2883/) 57 | - [macOS](https://splunkbase.splunk.com/app/2881/) 58 | 59 | # Usage 60 | This repository contains public contributions. Splunk does not guarantee 61 | the correctness or validity of the algorithms and is in no way responsible 62 | for vetting the contents of contributed algorithms. 63 | 64 | # Deploying 65 | 66 | To use the custom algorithms in this repository, you must deploy them as a Splunk app. 67 | 68 | There are two ways to do this. 69 | 70 | ### Manual copying 71 | 72 | You can simply copy the following directories under src: 73 | * bin 74 | * default 75 | * metadata 76 | 77 | to: 78 | * ${SPLUNK_HOME}/etc/apps/SA_mltk_contrib_app (you will need to create the directory first) 79 | 80 | OR 81 | 82 | ### Build and install 83 | 84 | #### 1. Build the app: 85 | 86 | You will need to install tox. See [Test Prerequisites](#prerequisites) 87 | 88 | ```bash 89 | tox -e package-macos # if on Mac 90 | tox -e package-linux # if on Linux 91 | ``` 92 | 93 | * The resulting gzipped tarball will be in the `target` directory (e.g. 
target/SA_mltk_contrib_app.tgz). 94 | * The location of the gzipped tarball can be overridden by the `BUILD_DIR` environment variable. 95 | * The default app name will be `SA_mltk_contrib_app`, but this can be overridden by the `APP_NAME` environment variable. 96 | 97 | * **NOTE**: You can run `tox -e clean` to remove the `target` directory. 98 | 99 | #### 2. Install the tarball: 100 | 101 | * You can do one of the following with the tarball from step 1: 102 | * Manually untar it in `${SPLUNK_HOME}/etc/apps` directory 103 | * Install it using the GUI: 104 | * https://docs.splunk.com/Documentation/AddOns/released/Overview/Singleserverinstall 105 | 106 | # Contributing 107 | 108 | This repository was specifically made for your contributions! See [Contributing](https://github.com/splunk/mltk-algo-contrib/blob/master/CONTRIBUTING.md) for more details. 109 | 110 | ## Developing 111 | 112 | To start developing, you will need to have Splunk installed. If you don't, read more [here](http://docs.splunk.com/Documentation/Splunk/latest/Installation/InstallonLinux). 113 | 114 | 1. Clone the repo and cd into the directory: 115 | 116 | ```bash 117 | git clone https://github.com/splunk/mltk-algo-contrib.git 118 | cd mltk-algo-contrib 119 | ``` 120 | 121 | 2. Symlink the `src` directory to the apps folder in Splunk and restart splunkd: 122 | 123 | ```bash 124 | ln -s "$(pwd)/src" $SPLUNK_HOME/etc/apps/SA_mltk_contrib_app 125 | $SPLUNK_HOME/bin/splunk restart 126 | ``` 127 | * _This will eliminate the need to deploy the app to test changes._ 128 | 129 | 3. Add your new algorithm(s) to `src/bin/algos_contrib`. 130 | (See SVR.py for an example.) 131 | 132 | 4. Add a new stanza to `src/default/algos.conf` 133 | 134 | ```bash 135 | [<algo_name>] 136 | package=algos_contrib 137 | ``` 138 | 139 | * **NOTE**: Due to the way configuration file layering works in Splunk, 140 | the package name must be overridden in each section, and not 141 | in the _default_ section. 142 | 143 | 5. Add your tests to `src/bin/algos_contrib/tests/test_<algo_name>.py` 144 | (See test_svr.py for an example.) 145 | 146 | ## Running Tests 147 | 148 | 149 | ### Prerequisites 150 | 151 | 1. Install *tox*: 152 | * http://tox.readthedocs.io/en/latest/install.html 153 | ```bash 154 | pip install tox 155 | ``` 156 | 157 | 2. Install *tox-pip-extensions*: 158 | * https://github.com/tox-dev/tox-pip-extensions 159 | ```bash 160 | pip install tox-pip-extensions 161 | ``` 162 | * **NOTE**: You only need this if you do not want to 163 | recreate the virtualenv(s) manually with `tox -r` 164 | every time you update the requirements*.txt file, but 165 | installing it is recommended for convenience. 166 | 167 | 3. You must also have the following environment variable set to your 168 | Splunk installation directory (e.g. /opt/splunk): 169 | * SPLUNK_HOME 170 | 171 | ### Using tox 172 | 173 | To run all tests, run the following command in the root source directory: 174 | 175 | ```bash 176 | tox 177 | ``` 178 | 179 | To run a single test, you can provide the directory or a file as a parameter: 180 | 181 | ```bash 182 | tox src/bin/algos_contrib/tests/ 183 | tox src/bin/algos_contrib/tests/test_example_algo.py 184 | ... 185 | ``` 186 | 187 | Basically, any arguments passed to *tox* will be passed as arguments to the *pytest* command. 188 | To pass in options, use double dashes (--): 189 | 190 | ```bash 191 | tox -- -k "example" # Run tests that have the keyword 'example' 192 | tox -- -x # Stop after the first failure 193 | tox -- -s # Show stdout/stderr (i.e. disable capturing) 194 | ... 
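# A few more standard pytest flags pass through the same way:
tox -- -v              # Verbose output: show each test name and outcome
tox -- --collect-only  # List the tests that would run without executing them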
195 | ``` 196 | 197 | ### Using Python REPL (Interactive Interpreter) 198 | 199 | ```python 200 | $ python # from src/bin directory 201 | >>> # Add the MLTK to our sys.path 202 | >>> from link_mltk import add_mltk 203 | >>> add_mltk() 204 | >>> 205 | >>> # Import our algorithm class 206 | >>> from algos_contrib.ExampleAlgo import ExampleAlgo 207 | ... (some warning from Splunk may show up) 208 | >>> 209 | >>> # Use utilities to catch common mistakes 210 | >>> from test.contrib_util import AlgoTestUtils 211 | >>> AlgoTestUtils.assert_algo_basic(ExampleAlgo, serializable=False) 212 | ``` 213 | 214 | ### Package/File Naming 215 | 216 | Files and packages under the _test_ directory should avoid having names 217 | that conflict with files or directories directly under: 218 | ```bash 219 | $SPLUNK_HOME/etc/apps/Splunk_ML_Toolkit/bin 220 | ``` 221 | 222 | ## Pull requests 223 | 224 | Once you've finished what you're adding, make a pull request. 225 | 226 | ## Bugs? Issues? 227 | 228 | Please file issues with any information that might be needed to: 229 | - reproduce what you're experiencing 230 | - understand the problem fully 231 | 232 | # License 233 | 234 | The algorithms hosted, as well as the app itself, are licensed under the permissive Apache 2.0 license. 235 | 236 | **Any additions to this repository must be under one of these licenses:** 237 | - MIT 238 | - BSD 239 | - Apache 2.0 240 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------