├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── COPYRIGHT ├── LICENSE ├── README.md ├── docker ├── osas-elastic-jupyterlab │ └── Dockerfile └── osas-elastic │ └── Dockerfile ├── docs ├── PIPELINE_CONFIGURATION.md └── RULES.md ├── pyproject.toml ├── requirements.txt ├── resources └── .KEEP ├── scripts ├── config │ ├── elasticsearch.yml │ └── kibana.yml ├── run_services.sh └── tmp_data │ ├── data.tar.gz │ ├── json_uploader.py │ └── result_with_score.json ├── setup.py └── src ├── __init__.py └── osas ├── __init__.py ├── api.py ├── cli.py ├── core ├── __init__.py ├── anomaly.py ├── interfaces.py ├── label_generators.py └── utils.py ├── etc ├── README.md ├── ad_config.conf ├── config.conf ├── data_config.conf └── label_config.conf ├── io_utils ├── __init__.py ├── config.py └── formatter.py ├── main ├── README.md ├── __init__.py ├── apply_rules.py ├── autoconfig.py ├── run_pipeline.py └── train_pipeline.py ├── models ├── __init__.py └── pipeline.py ├── pipeline ├── README.md ├── __init__.py ├── detect_anomalies.py ├── fetch_data.py ├── groom_data.py └── pipeline.py ├── templates ├── config_manual_update.html ├── config_static.txt ├── config_text_edit.html ├── console.html ├── generate_config.html ├── run_full_process.html ├── run_pipeline.html └── train_pipeline.html └── webserver.py /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | build/ 3 | tests/ 4 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 5 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 6 | corpus/ 7 | # User-specific stuff 8 | data 9 | .idea 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # Generated files 17 | .idea/**/contentModel.xml 18 | 19 | # Sensitive or high-churn files 20 | .idea/**/dataSources/ 21 | .idea/**/dataSources.ids 22 | .idea/**/dataSources.local.xml 23 | .idea/**/sqlDataSources.xml 24 | .idea/**/dynamic.xml 25 | .idea/**/uiDesigner.xml 26 | .idea/**/dbnavigator.xml 27 | 28 | # Gradle 29 | .idea/**/gradle.xml 30 | .idea/**/libraries 31 | 32 | # Gradle and Maven with auto-import 33 | # When using Gradle or Maven with auto-import, you should exclude module files, 34 | # since they will be recreated, and may cause churn. Uncomment if using 35 | # auto-import. 36 | # .idea/artifacts 37 | # .idea/compiler.xml 38 | # .idea/jarRepositories.xml 39 | # .idea/modules.xml 40 | # .idea/*.iml 41 | # .idea/modules 42 | # *.iml 43 | # *.ipr 44 | 45 | # CMake 46 | cmake-build-*/ 47 | 48 | # Mongo Explorer plugin 49 | .idea/**/mongoSettings.xml 50 | 51 | # File-based project format 52 | *.iws 53 | 54 | # IntelliJ 55 | out/ 56 | 57 | # mpeltonen/sbt-idea plugin 58 | .idea_modules/ 59 | 60 | # JIRA plugin 61 | atlassian-ide-plugin.xml 62 | 63 | # Cursive Clojure plugin 64 | .idea/replstate.xml 65 | 66 | # Crashlytics plugin (for Android Studio and IntelliJ) 67 | com_crashlytics_export_strings.xml 68 | crashlytics.properties 69 | crashlytics-build.properties 70 | fabric.properties 71 | 72 | # Editor-based Rest Client 73 | .idea/httpRequests 74 | 75 | # Android studio 3.1+ serialized cache file 76 | .idea/caches/build_file_checksums.ser 77 | *.pyc 78 | 79 | *.DS_Store 80 | .DS_Store 81 | 82 | dist/ 83 | osas.egg-info/ 84 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Adobe Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at Grp-opensourceoffice@adobe.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thanks for choosing to contribute! 4 | 5 | The following are a set of guidelines to follow when contributing to this project. 6 | 7 | ## Code Of Conduct 8 | 9 | This project adheres to the Adobe [code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. Please report unacceptable behavior to Grp-opensourceoffice@adobe.com. 10 | 11 | ## Contributor License Agreement 12 | 13 | All third-party contributions to this project must be accompanied by a signed contributor license agreement. This gives Adobe permission to redistribute your contributions as part of the project. [Sign our CLA](http://opensource.adobe.com/cla.html). You only need to submit an Adobe CLA one time, so if you have submitted one previously, you are good to go! 14 | 15 | ## Code Reviews 16 | 17 | All submissions should come in the form of pull requests and need to be reviewed by project committers. Read [GitHub's pull request documentation](https://help.github.com/articles/about-pull-requests/) for more information on sending pull requests. 18 | 19 | Lastly, please follow the [pull request template](.github/PULL_REQUEST_TEMPLATE.md) when submitting a pull request! 20 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | The following copyright message should appear at the top of all 2 | source files. This file can be removed from your repository. 3 | 4 | Copyright (c) 2021 Adobe Systems Incorporated. All rights reserved. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # One Stop Anomaly Shop (OSAS) 2 | 3 | This repository implements the models, methods and techniques presented in our paper: [A Principled Approach to Enriching Security-related Data for Running Processes through Statistics and Natural Language Processing](https://www.scitepress.org/Papers/2021/103814/103814.pdf). 4 | 5 | ## Introduction video (follows quick start guide) 6 | 7 | This video is a recording of our Hack In The Box (HITB) Security Conference 2021 Amsterdam presentation. 8 | 9 | [![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/Wi5NXGzsFC4/0.jpg)](https://www.youtube.com/watch?v=Wi5NXGzsFC4)] 10 | 11 | ## Quick start guide 12 | 13 | **Step 1:** Get/build the docker image 14 | 15 | ***Option 1:*** Use precompiled image (might not reflect latest changes): 16 | 17 | ```shell 18 | docker pull tiberiu44/osas:latest 19 | docker image tag tiberiu44/osas:latest osas:latest 20 | ``` 21 | 22 | ***Option 2:*** Build the image locally 23 | 24 | ```shell 25 | git clone https://github.com/adobe/OSAS.git 26 | cd OSAS 27 | docker build . -f docker/osas-elastic/Dockerfile -t osas:latest 28 | ``` 29 | 30 | **Step 2:** After building the docker image you can start OSAS by typing: 31 | 32 | ```shell 33 | docker run -p 8888:8888/tcp -p 5601:5601/tcp -v :/app osas 34 | ``` 35 | 36 | **IMPORTANT NOTE:** Please modify the above command by adding the absolute path to your datafolder in the appropiate location 37 | 38 | After OSAS has started (it might take 1-2 minutes) you can use your browser to access some standard endpoints: 39 | * [http://localhost:5601/app/home#/](http://localhost:5601/app/home#/) - access to Kibana frontend (this is where you will see your data) 40 | * [http://localhost:8888/?token=osas](http://localhost:8888/?token=osas) - access to Jupyter Lab (open Terminal or create a Notebook) 41 | 42 | For Debug (in case you need to): 43 | 44 | ```shell 45 | docker run -p 8888:8888/tcp -p 5601:5601/tcp -v :/app -ti osas /bin/bash 46 | ``` 47 | 48 | ## Building the test pipeline 49 | 50 | This guide will take you through all the necessary steps to configure, train and run your own pipeline on your own dataset. 51 | 52 | **Prerequisite**: Add you own CSV dataset into your data-folder (the one provided in the `docker run` command) 53 | 54 | Once you started your docker image, use the [OSAS console](http://localhost:8888/osas/console) to gain CLI access to all the tools. 55 | 56 | In what follows, we assume that your dataset is called `dataset.csv`. Please update the commands as necessary in case you use a different name/location. 57 | 58 | **Be sure you are running scripts in the root folder of OSAS:** 59 | 60 | ```bash 61 | cd /osas 62 | ``` 63 | **Step 1:** Build a custom pipeline configuration file - this can be done fully manually on by bootstraping using our conf autogenerator script: 64 | ```bash 65 | python3 osas/main/autoconfig.py --input-file=/app/dataset.csv --output-file=/app/dataset.conf 66 | ``` 67 | 68 | The above command will generate a custom configuration file for your dataset. It will try guess field types and optimal combinations between fields. You can edit the generated file (which should be available in the shared data-folder), using your favourite editor. 69 | 70 | Standard templates for label generator types are: 71 | 72 | ```editorconfig 73 | [LG_MULTINOMIAL] 74 | generator_type = MultinomialField 75 | field_name = 76 | absolute_threshold = 10 77 | relative_threshold = 0.1 78 | group_by = None # this is an optional field - it can be a single attribute name or a list of names 79 | 80 | [LG_TEXT] 81 | generator_type = TextField 82 | field_name = 83 | lm_mode = char 84 | ngram_range = (3, 5) 85 | 86 | [LG_NUMERIC] 87 | generator_type = NumericField 88 | field_name = 89 | group_by = None # this is an optional field - it can be a single attribute name or a list of names 90 | 91 | [LG_MUTLINOMIAL_COMBINER] 92 | generator_type = MultinomialFieldCombiner 93 | field_names = ['', '', ...] 94 | absolute_threshold = 10 95 | relative_threshold = 0.1 96 | group_by = None # this is an optional field - it can be a single attribute name or a list of names 97 | 98 | [LG_KEYWORD] 99 | generator_type = KeywordBased 100 | field_name = 101 | keyword_list = ['', '', '', ...] 102 | 103 | [LG_REGEX] 104 | generator_type = KnowledgeBased 105 | field_name = 106 | rules_and_labels_tuple_list = [('',''), ('',''), ...] 107 | ``` 108 | 109 | You can use the above templates to add as many label generators you want. Just make sure that the header IDs are unique in the configuration file. 110 | 111 | **Step 2:** Train the pipeline 112 | 113 | ```bash 114 | python3 osas/main/train_pipeline.py --conf-file=/app/dataset.conf --input-file=/app/dataset.csv --model-file=/app/dataset.json 115 | ``` 116 | 117 | The above command will generate a pretrained pipeline using the previously created configuration file and the dataset 118 | 119 | **Step 3:** Run the pipeline on a dataset 120 | 121 | ```bash 122 | python3 osas/main/run_pipeline.py --conf-file=/app/dataset.conf --model-file=/app/dataset.json --input-file=/app/dataset.csv --output-file=/app/dataset-out.csv 123 | ``` 124 | 125 | The above command will run the pretrained pipeline on any compatible dataset. In the example we run the pipeline on the training data, but you can use previously unseen data. It will generate an output file with labels and anomaly scores and it will also import your data into Elasticsearch/Kibana. To view the result just use the the [web interface](http://localhost:5601/app/dashboards). 126 | 127 | # Developing models 128 | 129 | Now that everything is up and running, we prepared a set of development guidelines that will help you apply OSAS on your own dataset: 130 | 131 | 1. [Pipeline configuration](docs/PIPELINE_CONFIGURATION.md): This will help you understand how the label generators and anomaly scoring works in OSAS; 132 | 2. [Rule-based score modifiers and labeling](docs/RULES.md): Once you have a working OSAS pipeline, you can furhter refine your results by adding new labels and modifying the anomaly scoring based on static rules. 133 | 134 | # Citing and attribution 135 | 136 | **Full-text-paper: [A Principled Approach to Enriching Security-related Data for Running Processes through Statistics and Natural Language Processing](https://www.scitepress.org/Papers/2021/103814/103814.pdf).** 137 | 138 | If you want to use this repository in any academic work, please cite the following work: 139 | 140 | **MLA** 141 | * Boros, Tiberiu, et al. ‘A Principled Approach to Enriching Security-Related Data for Running Processes through Statistics and Natural Language Processing’. IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security, 2021. 142 | 143 | **APA** 144 | * Boros, T., Cotaie, A., Vikramjeet, K., Malik, V., Park, L., & Pachis, N. (2021). A principled approach to enriching security-related data for running processes through statistics and natural language processing. IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security. 145 | 146 | **Chicago** 147 | * Boros, Tiberiu, Andrei Cotaie, Kumar Vikramjeet, Vivek Malik, Lauren Park, and Nick Pachis. ‘A Principled Approach to Enriching Security-Related Data for Running Processes through Statistics and Natural Language Processing’. In IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security, 2021. 148 | 149 | **BibTeX** 150 | 151 | ```text 152 | @article{boros2021principled, 153 | title={A Principled Approach to Enriching Security-related Data for Running Processes through Statistics and Natural Language Processing}, 154 | author={Boros, Tiberiu and Cotaie, Andrei and Vikramjeet, Kumar and Malik, Vivek and Park, Lauren and Pachis, Nick}, 155 | year={2021}, 156 | booktitle={IoTBDS 2021 - 6th International Conference on Internet of Things, Big Data and Security} 157 | } 158 | ``` -------------------------------------------------------------------------------- /docker/osas-elastic-jupyterlab/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian 2 | ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 3 | RUN apt update && apt install -y gnupg2 curl procps openjdk-11-jdk unzip wget dbus sudo 4 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y locales python3 python3-pip mc nano htop git 5 | 6 | RUN wget -qO - https://d3g5vo6xdbdb9a.cloudfront.net/GPG-KEY-opendistroforelasticsearch | apt-key add - 7 | RUN echo "deb https://d3g5vo6xdbdb9a.cloudfront.net/apt stable main" | tee -a /etc/apt/sources.list.d/opendistroforelasticsearch.list 8 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.10.2-amd64.deb && \ 9 | dpkg -i elasticsearch-oss-7.10.2-amd64.deb && \ 10 | rm elasticsearch-oss-7.10.2-amd64.deb 11 | 12 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistro-elasticsearch/opendistroforelasticsearch-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \ 13 | tar -zxf opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \ 14 | rm opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \ 15 | mv opendistroforelasticsearch-1.13.0 /elasticsearch && \ 16 | chown elasticsearch:elasticsearch elasticsearch -R && \ 17 | cd /elasticsearch && \ 18 | sudo -H -u elasticsearch bash -c './opendistro-tar-install.sh &' 19 | 20 | RUN curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add - 21 | RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-7.x.list 22 | RUN apt update 23 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistroforelasticsearch-kibana/opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \ 24 | tar -xf opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \ 25 | rm opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz 26 | 27 | # Prepare environment UTF-8 28 | RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ 29 | locale-gen 30 | ENV LANG en_US.UTF-8 31 | ENV LANGUAGE en_US:en 32 | ENV LC_ALL en_US.UTF-8 33 | 34 | #RUN echo "Cloning OSAS" && \ 35 | # cd / && \ 36 | # git clone https://github.com/adobe/OSAS.git && \ 37 | # mv OSAS osas 38 | ADD ./osas /osas/osas 39 | ADD ./docs /osas/docs 40 | ADD ./scripts /osas/scripts 41 | ADD ./resources /osas/resources 42 | RUN mkdir osas/corpus 43 | RUN mkdir osas/data 44 | COPY ./requirements.txt /osas/ 45 | 46 | RUN cd /osas/ && \ 47 | cat requirements.txt 48 | 49 | RUN cd /osas/ && \ 50 | cat requirements.txt && \ 51 | pip3 install -U pip && \ 52 | pip3 install --no-cache-dir -r requirements.txt && \ 53 | pip3 install jupyterlab 54 | 55 | ENV SHELL=/bin/bash 56 | 57 | CMD /osas/scripts/run_services.sh & jupyter lab --ip=0.0.0.0 --allow-root --ServerApp.token=osas # & cd /osas && python3 osas/webserver.py 58 | 59 | -------------------------------------------------------------------------------- /docker/osas-elastic/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian 2 | ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 3 | RUN apt update && apt install -y gnupg2 curl procps openjdk-11-jdk unzip wget dbus sudo 4 | 5 | RUN wget -qO - https://d3g5vo6xdbdb9a.cloudfront.net/GPG-KEY-opendistroforelasticsearch | apt-key add - 6 | RUN echo "deb https://d3g5vo6xdbdb9a.cloudfront.net/apt stable main" | tee -a /etc/apt/sources.list.d/opendistroforelasticsearch.list 7 | RUN wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.10.2-amd64.deb && \ 8 | dpkg -i elasticsearch-oss-7.10.2-amd64.deb && \ 9 | rm elasticsearch-oss-7.10.2-amd64.deb 10 | 11 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistro-elasticsearch/opendistroforelasticsearch-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \ 12 | tar -zxf opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \ 13 | rm opendistroforelasticsearch-1.13.0-linux-x64.tar.gz && \ 14 | mv opendistroforelasticsearch-1.13.0 /elasticsearch && \ 15 | chown elasticsearch:elasticsearch elasticsearch -R && \ 16 | cd /elasticsearch && \ 17 | sudo -H -u elasticsearch bash -c './opendistro-tar-install.sh &' 18 | 19 | RUN curl -fsSL https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add - 20 | RUN echo "deb https://artifacts.elastic.co/packages/7.x/apt stable main" | tee -a /etc/apt/sources.list.d/elastic-7.x.list 21 | RUN apt update 22 | RUN curl https://d3g5vo6xdbdb9a.cloudfront.net/tarball/opendistroforelasticsearch-kibana/opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz -o opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \ 23 | tar -xf opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz && \ 24 | rm opendistroforelasticsearch-kibana-1.13.0-linux-x64.tar.gz 25 | 26 | # Prepare environment UTF-8 27 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales python3 python3-pip mc nano htop git 28 | RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ 29 | locale-gen 30 | ENV LANG en_US.UTF-8 31 | ENV LANGUAGE en_US:en 32 | ENV LC_ALL en_US.UTF-8 33 | 34 | RUN echo "Cloning OSAS" && \ 35 | cd / && \ 36 | git clone https://github.com/adobe/OSAS.git && \ 37 | mv OSAS osas 38 | 39 | RUN cd /osas/ && \ 40 | pip3 install --no-cache-dir -r requirements.txt 41 | 42 | 43 | CMD /osas/scripts/run_services.sh & cd /osas && python3 osas/webserver.py 44 | 45 | -------------------------------------------------------------------------------- /docs/PIPELINE_CONFIGURATION.md: -------------------------------------------------------------------------------- 1 | # Pipeline explained 2 | 3 | The pipeline sequentially applies all label generators on the raw data, collects the labels and uses an anomaly scoring algorithm to generate anomaly scores. 4 | There are two main component classes: LabelGenerator and ScoringAlgorithm. 5 | 6 | ## Label generators 7 | 8 | **NumericField** 9 | 10 | * This type of LabelGenerator handles numerical fields. It can compute in two different ways: (1) the mean and standard deviation and generates labels 11 | according to the distance between the current value and the mean value (value<=sigma NORMAL, sigma=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.1.8 2 | elasticsearch==9.0.1 3 | Flask==3.1.0 4 | ipdb==0.13.13 5 | numpy==2.2.5 6 | obfuscation_detection==0.7.2 7 | pandas==2.2.3 8 | python-decouple==3.8 9 | PyYAML==6.0.2 10 | scikit_learn==1.6.1 11 | setuptools==65.5.1 12 | tqdm==4.67.1 13 | -------------------------------------------------------------------------------- /resources/.KEEP: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/resources/.KEEP -------------------------------------------------------------------------------- /scripts/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | opendistro_security.ssl.transport.pemcert_filepath: esnode.pem 2 | opendistro_security.ssl.transport.pemkey_filepath: esnode-key.pem 3 | opendistro_security.ssl.transport.pemtrustedcas_filepath: root-ca.pem 4 | opendistro_security.ssl.transport.enforce_hostname_verification: false 5 | opendistro_security.ssl.http.pemcert_filepath: esnode.pem 6 | opendistro_security.ssl.http.pemkey_filepath: esnode-key.pem 7 | opendistro_security.ssl.http.pemtrustedcas_filepath: root-ca.pem 8 | opendistro_security.allow_unsafe_democertificates: true 9 | opendistro_security.allow_default_init_securityindex: true 10 | opendistro_security.authcz.admin_dn: 11 | - CN=kirk,OU=client,O=client,L=test, C=de 12 | 13 | opendistro_security.audit.type: internal_elasticsearch 14 | opendistro_security.enable_snapshot_restore_privilege: true 15 | opendistro_security.check_snapshot_restore_write_privileges: true 16 | opendistro_security.restapi.roles_enabled: ["all_access", "security_rest_api_access"] 17 | opendistro_security.system_indices.enabled: true 18 | opendistro_security.system_indices.indices: [".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opendistro-notifications-*", ".opendistro-notebooks", ".opendistro-asynchronous-search-response*"] 19 | cluster.routing.allocation.disk.threshold_enabled: false 20 | node.max_local_storage_nodes: 3 21 | path: 22 | data: /data/elastic/data 23 | logs: /data/elastic/logs 24 | 25 | opendistro_security.ssl.http.enabled: false -------------------------------------------------------------------------------- /scripts/config/kibana.yml: -------------------------------------------------------------------------------- 1 | elasticsearch.hosts: http://localhost:9200 2 | elasticsearch.ssl.verificationMode: none 3 | elasticsearch.username: kibanaserver 4 | elasticsearch.password: kibanaserver 5 | elasticsearch.requestHeadersWhitelist: ["securitytenant","Authorization"] 6 | 7 | opendistro_security.multitenancy.enabled: true 8 | opendistro_security.multitenancy.tenants.preferred: ["Private", "Global"] 9 | opendistro_security.readonly_mode.roles: ["kibana_read_only"] 10 | 11 | #pendistro_security.cookie.secure: false 12 | 13 | newsfeed.enabled: false 14 | telemetry.optIn: false 15 | telemetry.enabled: false 16 | security.showInsecureClusterWarning: false 17 | server.host: "0.0.0.0" 18 | server.xsrf.disableProtection: true 19 | -------------------------------------------------------------------------------- /scripts/run_services.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | mkdir -p /data/elastic/logs 3 | mkdir -p /data/elastic/data 4 | mkdir -p /data/kibana 5 | #/etc/init.d/elasticsearch restart 6 | cd /elasticsearch/ 7 | ./opendistro-tar-install.sh 8 | 9 | 10 | cd /osas/scripts/tmp_data 11 | 12 | 13 | 14 | echo "copying data to data" 15 | tar -xvf data.tar.gz -C / 16 | 17 | 18 | 19 | cp /osas/scripts/config/elasticsearch.yml /elasticsearch/config/ 20 | cp /osas/scripts/config/kibana.yml /opendistroforelasticsearch-kibana/config/ 21 | 22 | 23 | 24 | 25 | chown elasticsearch:elasticsearch /data/elastic -R 26 | chown elasticsearch:elasticsearch /elasticsearch -R 27 | 28 | 29 | 30 | 31 | sudo -H -u elasticsearch bash -c 'ES_PATH_CONF=/elasticsearch/config /elasticsearch/bin/elasticsearch &' 32 | DATA_PATH=/data/kibana /opendistroforelasticsearch-kibana/bin/kibana -c /opendistroforelasticsearch-kibana/config/kibana.yml --allow-root & 33 | 34 | cd /osas/ 35 | export TERM=xterm 36 | #python3 osas/webserver.py 37 | 38 | 39 | 40 | 41 | #########in prod this should be taken out 42 | #echo "sleep before data push" 43 | #sleep 60 44 | #cd /osas/scripts/tmp_data 45 | # 46 | #python3 json_uploader.py -------------------------------------------------------------------------------- /scripts/tmp_data/data.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/scripts/tmp_data/data.tar.gz -------------------------------------------------------------------------------- /scripts/tmp_data/json_uploader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from elasticsearch import helpers, Elasticsearch 4 | 5 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}],http_auth=('admin', 'admin')) 6 | 7 | 8 | 9 | 10 | 11 | data=json.loads(open('result_with_score.json', 'r').read()) 12 | 13 | helpers.bulk(es, data, index="anomalies", doc_type="type") -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | import os 3 | 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | def parse_requirements(filename): 8 | """ load requirements from a pip requirements file """ 9 | lineiter = (line.strip() for line in open(filename)) 10 | return [line for line in lineiter if line and not line.startswith("#")] 11 | 12 | 13 | setuptools.setup( 14 | name="osas", 15 | version="0.9.3", 16 | author="Multiple Authors", 17 | author_email="boros@adobe.com", 18 | description="One Stop Anomaly Shop", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | url="https://github.com/adobe/OSAS/", 22 | project_urls={ 23 | "Source Code": "https://github.com/adobe/OSAS/", 24 | "Bug Tracker": "https://github.com/adobe/OSAS/issues", 25 | "Documentation": "https://github.com/adobe/OSAS/docs/" 26 | }, 27 | classifiers=[ 28 | "Programming Language :: Python :: 3.0", 29 | "License :: OSI Approved :: Apache Software License", 30 | "Operating System :: OS Independent", 31 | ], 32 | packages=setuptools.find_packages("src"), 33 | python_requires=">=3.10", 34 | include_package_data=True, 35 | install_requires=parse_requirements("requirements.txt"), 36 | package_dir={"": "src"}, 37 | entry_points = { 38 | "console_scripts": [ 39 | "osas = osas.cli:main" 40 | ] 41 | } 42 | ) 43 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/src/__init__.py -------------------------------------------------------------------------------- /src/osas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/src/osas/__init__.py -------------------------------------------------------------------------------- /src/osas/api.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import sys 3 | import hashlib 4 | import io 5 | import json 6 | import time 7 | 8 | sys.path.append('') 9 | 10 | from src.osas.pipeline import Pipeline 11 | from src.osas.pipeline import DetectAnomalies 12 | from src.osas.pipeline import GroomData 13 | 14 | 15 | class OSASConfig: 16 | def __init__(self, configparser: configparser.ConfigParser): 17 | ''' 18 | Create a new instance of OSAS configuration. If you don't want to manually use configparser to parse the input, use one of the helper methods: from_file or from_string 19 | @param configparser - instance of type RawConfigParser 20 | ''' 21 | self._config = configparser 22 | # compute md5 of conf file 23 | bw = io.StringIO() 24 | configparser.write(bw) 25 | bw.flush() 26 | bw.seek(0) 27 | bb = bw.read().encode('utf-8') 28 | self._md5 = hashlib.md5(bb).hexdigest() 29 | 30 | @staticmethod 31 | def from_file(filename: str): 32 | ''' 33 | Create a new config instance using the specified filename 34 | 35 | @param filename: path to file 36 | ''' 37 | 38 | cfg = configparser.ConfigParser() 39 | with open(filename, 'r') as f: 40 | cfg.read_file(f) 41 | 42 | oc = OSASConfig(cfg) 43 | return oc 44 | 45 | @staticmethod 46 | def from_string(string: str): 47 | ''' 48 | Create a new config instance using the specified configuration string 49 | 50 | @param string: configuration string 51 | ''' 52 | cfg = configparser.RawConfigParser() 53 | cfg.read_string(string) 54 | oc = OSASConfig(cfg) 55 | return oc 56 | 57 | def md5(self): 58 | return self._md5 59 | 60 | @property 61 | def config(self): 62 | return self._config 63 | 64 | 65 | class OSASPretrainedModel: 66 | def __init__(self, string: str): 67 | self._json = json.loads(string) 68 | self._md5 = hashlib.md5(string.encode('utf-8')).hexdigest() 69 | 70 | @staticmethod 71 | def from_file(filename: str): 72 | return OSASPretrainedModel(open(filename).read()) 73 | 74 | @staticmethod 75 | def from_string(string: str): 76 | return OSASPretrainedModel(string) 77 | 78 | def md5(self): 79 | return self._md5 80 | 81 | @property 82 | def json(self): 83 | return self._json 84 | 85 | 86 | osas_instances = {} 87 | 88 | 89 | class OSAS: 90 | def __init__(self, conf: OSASConfig, model: OSASPretrainedModel): 91 | self._pipeline = [] 92 | gd = GroomData() 93 | scoring_model_name = conf.config['AnomalyScoring']['scoring_algorithm'] 94 | for sect in conf.config: 95 | if 'generator_type' in conf.config[sect]: 96 | self._pipeline.append(gd.from_pretrained(conf.config[sect]['generator_type'], 97 | model.json['model'][sect])) 98 | da = DetectAnomalies() 99 | self._detect_anomalies = da.get_pretrained_model(scoring_model_name, json.dumps(model.json['scoring'])) 100 | 101 | @staticmethod 102 | def get_instance(conf: OSASConfig, model: OSASPretrainedModel): 103 | total_hash = '{0}_{1}'.format(conf.md5(), model.md5()) 104 | if total_hash not in osas_instances: 105 | osas_instance = OSAS(conf, model) 106 | osas_instances[total_hash] = osas_instance 107 | return osas_instance 108 | else: 109 | return osas_instances[total_hash] 110 | 111 | def __call__(self, row_dict: dict): 112 | label_list = [] 113 | for lg in self._pipeline: 114 | llist = lg(row_dict) 115 | for label in llist: 116 | label_list.append(label) 117 | # create a dummy entry 118 | 119 | dummy_ds = [{'_labels': label_list}] 120 | score = self._detect_anomalies(dummy_ds, verbose=False) 121 | return { 122 | 'labels': label_list, 123 | 'score': score 124 | } 125 | 126 | 127 | if __name__ == '__main__': 128 | cfg = OSASConfig.from_file('tests/model.conf') 129 | print(cfg.md5()) 130 | mdl = OSASPretrainedModel.from_file('tests/model.json') 131 | print(mdl.md5()) 132 | time_start = time.time() 133 | osas = OSAS.get_instance(cfg, mdl) 134 | time_first_call = time.time() 135 | osas = OSAS.get_instance(cfg, mdl) 136 | time_second_call = time.time() 137 | t1 = time_first_call - time_start 138 | t2 = time_second_call - time_first_call 139 | print("Initial instance creation took {0:.8f} seconds".format(t1)) 140 | print("Second call took {0:.8f} seconds".format(t2)) 141 | print("Speedup was {0:.3f}".format(t1 / t2)) 142 | print(osas({ 143 | 'countries': 'Somalia', 144 | })) 145 | -------------------------------------------------------------------------------- /src/osas/cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Adobe. All rights reserved. 2 | # This file is licensed to you under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. You may obtain a copy 4 | # of the License at http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Unless required by applicable law or agreed to in writing, software distributed under 7 | # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS 8 | # OF ANY KIND, either express or implied. See the License for the specific language 9 | # governing permissions and limitations under the License. 10 | 11 | import os 12 | import os.path 13 | import sys 14 | import click 15 | from decouple import config 16 | import warnings 17 | 18 | warnings.filterwarnings("ignore", category=UserWarning) 19 | warnings.filterwarnings("ignore", category=DeprecationWarning) 20 | warnings.filterwarnings("ignore", category=Warning) 21 | 22 | 23 | def app_version(ctx, param, value): 24 | if not value or ctx.resilient_parsing: 25 | return 26 | 27 | from importlib.metadata import version 28 | 29 | osas_version = version("osas") 30 | 31 | click.echo(f"OSAS {osas_version}") 32 | ctx.exit() 33 | 34 | 35 | @click.group() 36 | @click.option( 37 | "--version", 38 | is_flag=True, 39 | callback=app_version, 40 | expose_value=False, 41 | is_eager=True, 42 | help="Show the version and exit.", 43 | ) 44 | def main(): 45 | pass 46 | 47 | 48 | @click.group() 49 | def ingest(): 50 | pass 51 | 52 | if __name__ == "__main__": 53 | # disable all TQDM output 54 | main() -------------------------------------------------------------------------------- /src/osas/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/src/osas/core/__init__.py -------------------------------------------------------------------------------- /src/osas/core/anomaly.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import sys 20 | import ast 21 | import numpy as np 22 | import tqdm 23 | from sklearn.preprocessing import MultiLabelBinarizer 24 | from sklearn.decomposition import TruncatedSVD 25 | from sklearn.neighbors import LocalOutlierFactor 26 | from sklearn.ensemble import IsolationForest 27 | import json 28 | import pickle 29 | import base64 30 | import importlib 31 | 32 | sys.path.append('') 33 | from osas.core.interfaces import AnomalyDetection, Datasource 34 | 35 | 36 | class IFAnomaly(AnomalyDetection): 37 | """ 38 | Uses LOF to detect anomalies 39 | """ 40 | 41 | def __init__(self): 42 | super().__init__() 43 | self._model = None 44 | self._data_encoder = None 45 | self._decompose = None 46 | 47 | def build_model(self, dataset: Datasource, incremental=False) -> dict: 48 | data_encoder = MultiLabelBinarizer() 49 | labels = [] 50 | for item in dataset: 51 | labels.append(item['_labels']) 52 | data_encoded = data_encoder.fit_transform(labels) 53 | self._data_encoder = data_encoder 54 | 55 | decompose = TruncatedSVD(n_components=4, n_iter=7, random_state=42) 56 | data_decomposed = decompose.fit_transform(data_encoded) 57 | self._decompose = decompose 58 | 59 | iso_forest = IsolationForest(random_state=0, n_jobs=4) 60 | iso_forest.fit(data_decomposed) 61 | 62 | self._model = iso_forest 63 | 64 | model = {'encoder': self._data_encoder, 65 | 'SVD': self._decompose, 66 | 'iso_forest': self._model 67 | } 68 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii') 69 | model = {'model': out_model} 70 | return model 71 | 72 | def __call__(self, dataset: Datasource, verbose=True) -> [float]: 73 | 74 | labels = [] 75 | for item in dataset: 76 | labels.append(item['_labels']) 77 | data_encoded = self._data_encoder.transform(labels) 78 | data_decomposed = self._decompose.transform(data_encoded) 79 | scores = self._model.score_samples(data_decomposed) 80 | 81 | return -scores 82 | 83 | @staticmethod 84 | def from_pretrained(pretrained: str) -> AnomalyDetection: 85 | tmp = json.loads(pretrained) 86 | pre_model = pickle.loads(base64.b64decode(tmp['model'])) 87 | model = IFAnomaly() 88 | model._data_encoder = pre_model['encoder'] 89 | model._decompose = pre_model['SVD'] 90 | model._model = pre_model['iso_forest'] 91 | 92 | return model 93 | 94 | 95 | class LOFAnomaly(AnomalyDetection): 96 | """ 97 | Uses LOF to detect anomalies 98 | """ 99 | 100 | def __init__(self): 101 | super().__init__() 102 | self._model = None 103 | self._data_encoder = None 104 | self._decompose = None 105 | 106 | def build_model(self, dataset: Datasource, incremental=False) -> dict: 107 | data_encoder = MultiLabelBinarizer() 108 | labels = [] 109 | for item in dataset: 110 | labels.append(item['_labels']) 111 | data_encoded = data_encoder.fit_transform(labels) 112 | self._data_encoder = data_encoder 113 | 114 | decompose = TruncatedSVD(n_components=4, n_iter=7, random_state=42) 115 | data_decomposed = decompose.fit_transform(data_encoded) 116 | self._decompose = decompose 117 | 118 | lof = LocalOutlierFactor(n_neighbors=10, n_jobs=4, novelty=True) 119 | lof.fit(data_decomposed) 120 | 121 | self._model = lof 122 | 123 | model = {'encoder': self._data_encoder, 124 | 'SVD': self._decompose, 125 | 'LOF': self._model 126 | } 127 | 128 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii') 129 | model = {'model': out_model} 130 | return model 131 | 132 | def __call__(self, dataset: Datasource, verbose=True) -> [float]: 133 | 134 | labels = [] 135 | for item in dataset: 136 | labels.append(item['_labels']) 137 | data_encoded = self._data_encoder.transform(labels) 138 | data_decomposed = self._decompose.transform(data_encoded) 139 | scores = self._model.score_samples(data_decomposed) 140 | 141 | return -scores 142 | 143 | @staticmethod 144 | def from_pretrained(pretrained: str) -> AnomalyDetection: 145 | tmp = json.loads(pretrained) 146 | pre_model = pickle.loads(base64.b64decode(tmp['model'])) 147 | model = LOFAnomaly() 148 | model._data_encoder = pre_model['encoder'] 149 | model._decompose = pre_model['SVD'] 150 | model._model = pre_model['LOF'] 151 | 152 | return model 153 | 154 | 155 | class SVDAnomaly(AnomalyDetection): 156 | """ 157 | Uses an autoencoder to compute anomaly score 158 | """ 159 | 160 | def __init__(self): 161 | super().__init__() 162 | self._data_encoder = None 163 | self._model = None 164 | 165 | def build_model(self, dataset: Datasource, incremental=False) -> dict: 166 | 167 | labels = [] 168 | for item in dataset: 169 | tmp = [] 170 | for label in item['_labels']: 171 | if isinstance(label, str): 172 | tmp.append(label) 173 | labels.append(tmp) 174 | 175 | if not incremental: 176 | data_encoder = MultiLabelBinarizer() 177 | data_encoded = data_encoder.fit_transform(labels) 178 | else: 179 | data_encoder = self._data_encoder 180 | data_encoded = data_encoder.transform(labels) 181 | self._data_encoder = data_encoder 182 | if not incremental: 183 | decompose = TruncatedSVD(n_components=4, n_iter=50, random_state=42) 184 | decompose.fit(data_encoded) 185 | else: 186 | decompose = self._model 187 | decompose.partial_fit(data_encoded) 188 | 189 | self._model = decompose 190 | 191 | model = {'encoder': self._data_encoder, 192 | 'SVD': self._model} 193 | 194 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii') 195 | model = {'model': out_model} 196 | return model 197 | 198 | def __call__(self, dataset: Datasource, verbose=True) -> [float]: 199 | 200 | labels = [] 201 | for item in dataset: 202 | labels.append(item['_labels']) 203 | data_encoded = self._data_encoder.transform(labels) 204 | data_decomposed = self._model.transform(data_encoded) 205 | data_reconstruct = self._model.inverse_transform(data_decomposed) 206 | 207 | difference = data_encoded - data_reconstruct 208 | power = np.sum(difference ** 2, axis=1) 209 | error = np.sqrt(power) 210 | 211 | return error 212 | 213 | @staticmethod 214 | def from_pretrained(pretrained: str) -> AnomalyDetection: 215 | tmp = json.loads(pretrained) 216 | pre_model = pickle.loads(base64.b64decode(tmp['model'])) 217 | model = SVDAnomaly() 218 | model._data_encoder = pre_model['encoder'] 219 | model._model = pre_model['SVD'] 220 | 221 | return model 222 | 223 | 224 | class StatisticalNGramAnomaly(AnomalyDetection): 225 | """ 226 | Uses an autoencoder to compute anomaly score 227 | """ 228 | 229 | def __init__(self): 230 | super().__init__() 231 | self._model = None 232 | 233 | def build_model(self, dataset: Datasource, incremental=False) -> dict: 234 | if not incremental: 235 | model = { 236 | '1': {'TOTAL': 0}, 237 | '2': {'TOTAL': 0}, 238 | '3': {'TOTAL': 0} 239 | } 240 | else: 241 | model = self._model 242 | # for clarity, this code is written explicitly 243 | for item in tqdm.tqdm(dataset, ncols=100, desc="\tbuilding model"): 244 | tags = item['_labels'] 245 | string_tags = [] 246 | for tag in tags: 247 | if isinstance(tag, str): 248 | string_tags.append(tag) 249 | tags = string_tags 250 | tags = list(sorted(tags)) 251 | # unigrams 252 | grams = model['1'] 253 | for ii in range(len(tags)): 254 | key = '(' + str(tags[ii]) + ')' 255 | if key in grams: 256 | grams[key]['COUNT'] += 1 257 | else: 258 | grams[key] = {'COUNT': 1} 259 | grams['TOTAL'] += 1 260 | 261 | # bigrams 262 | grams = model['2'] 263 | 264 | for ii in range(len(tags) - 1): 265 | for jj in range(ii + 1, len(tags)): 266 | key = '(' + str(tags[ii]) + ',' + str(tags[jj]) + ')' 267 | if key in grams: 268 | grams[key]['COUNT'] += 1 269 | else: 270 | grams[key] = {'COUNT': 1} 271 | grams['TOTAL'] += 1 272 | 273 | # trigrams 274 | grams = model['3'] 275 | 276 | for ii in range(len(tags) - 2): 277 | for jj in range(ii + 1, len(tags) - 1): 278 | for kk in range(jj + 1, len(tags)): 279 | key = '(' + str(tags[ii]) + ',' + str(tags[jj]) + ',' + str(tags[kk]) + ')' 280 | if key in grams: 281 | grams[key]['COUNT'] += 1 282 | else: 283 | grams[key] = {'COUNT': 1} 284 | grams['TOTAL'] += 1 285 | 286 | # convert to probs and log-probs 287 | for g in ['1', '2', '3']: 288 | grams = model[g] 289 | total = grams['TOTAL'] 290 | for key in grams: 291 | if key != 'TOTAL': 292 | grams[key]['PROB'] = grams[key]['COUNT'] / total 293 | grams[key]['NEG_LOG_PROB'] = -np.log(grams[key]['PROB']) 294 | self._model = model 295 | 296 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii') 297 | model = {'model': out_model} 298 | return model 299 | 300 | def __call__(self, dataset: Datasource, verbose=True) -> [float]: 301 | 302 | def _build_feats(tags): 303 | feats = [] 304 | string_tags = [] 305 | perp_score = 0 306 | for tag in tags: 307 | if isinstance(tag, str): 308 | string_tags.append(tag) 309 | else: 310 | perp_score += tag 311 | tags = string_tags 312 | tags = list(sorted(tags)) 313 | 314 | for ii in range(len(tags)): 315 | feats.append([tags[ii]]) 316 | for ii in range(len(tags) - 1): 317 | for jj in range(ii + 1, len(tags)): 318 | feats.append([tags[ii], tags[jj]]) 319 | 320 | for ii in range(len(tags) - 2): 321 | for jj in range(ii + 1, len(tags) - 1): 322 | for kk in range(jj + 1, len(tags)): 323 | feats.append([tags[ii], tags[jj], tags[kk]]) 324 | new_feats = [] 325 | for feat in feats: 326 | mid = "(" + ",".join(feat) + ")" 327 | new_feats.append(mid) 328 | return new_feats, perp_score 329 | 330 | def _compute_score(ngram2score, tags, handle_unseen=True): 331 | feats, perp_score = _build_feats(tags) 332 | 333 | score = 0 334 | for feat in feats: 335 | found = False 336 | if feat in ngram2score['1']: 337 | score += ngram2score['1'][feat]['NEG_LOG_PROB'] 338 | found = True 339 | elif feat in ngram2score['2']: 340 | score += ngram2score['2'][feat]['NEG_LOG_PROB'] 341 | found = True 342 | elif feat in ngram2score['3']: 343 | score += ngram2score['3'][feat]['NEG_LOG_PROB'] 344 | found = True 345 | if not found: 346 | if handle_unseen: 347 | import math 348 | score += -math.log(1e-8) 349 | return score + perp_score 350 | 351 | scores = [] 352 | if verbose: 353 | pgb = tqdm.tqdm(dataset, ncols=100, desc="\tscoring data") 354 | else: 355 | pgb = dataset 356 | for item in pgb: 357 | scores.append(_compute_score(self._model, item['_labels'])) 358 | 359 | return scores 360 | 361 | @staticmethod 362 | def from_pretrained(pretrained: str) -> AnomalyDetection: 363 | tmp = json.loads(pretrained) 364 | pre_model = pickle.loads(base64.b64decode(tmp['model'])) 365 | model = StatisticalNGramAnomaly() 366 | model._model = pre_model 367 | 368 | return model 369 | 370 | 371 | class SupervisedClassifierAnomaly(AnomalyDetection): 372 | def __init__(self): 373 | super().__init__() 374 | self.BINARY_GROUND_TRUTHS1 = {'clean', 'bad'} 375 | self.BINARY_GROUND_TRUTHS2 = {0, 1} 376 | self.BINARY_IND_TO_GROUND_TRUTH1 = ['clean', 'bad'] 377 | self.BINARY_IND_TO_GROUND_TRUTH2 = [0, 1] 378 | 379 | self._model = None 380 | self._encoder = None 381 | self._is_binary_preds = False 382 | self._ind_to_ground_truth = None 383 | 384 | def build_model(self, dataset: Datasource, ground_truth_column: str, classifier: str, init_args: dict, 385 | incremental=False) -> dict: 386 | labels = [] 387 | ground_truth_values = set() 388 | for item in dataset: 389 | labels.append(item['_labels']) 390 | ground_truth_values.add(item[ground_truth_column]) 391 | if not incremental: 392 | encoder = MultiLabelBinarizer() 393 | labels_enc = encoder.fit_transform(labels) 394 | else: 395 | encoder = self._encoder 396 | labels_enc = encoder.transform(labels) 397 | 398 | # set binary preds 399 | if ground_truth_values == self.BINARY_GROUND_TRUTHS1: 400 | # all grouth truth labels either clean or bad 401 | self._is_binary_preds = True 402 | ind_to_ground_truth = self.BINARY_IND_TO_GROUND_TRUTH1 # set bad to index 1 403 | elif ground_truth_values == self.BINARY_GROUND_TRUTHS2: 404 | # all grouth truth labels either 0 or 1 405 | self._is_binary_preds = True 406 | ind_to_ground_truth = self.BINARY_IND_TO_GROUND_TRUTH2 # set 1 to index 1 407 | else: 408 | # ground truth labels can be anything 409 | self._is_binary_preds = False 410 | ind_to_ground_truth = list(ground_truth_values) 411 | 412 | # convert ground truth values to indices 413 | ground_truth_to_ind = dict() 414 | for i in range(len(ind_to_ground_truth)): 415 | ground_truth_to_ind[ind_to_ground_truth[i]] = i 416 | model_ground_truths = [] 417 | for item in dataset: 418 | gt = item[ground_truth_column] 419 | model_ground_truths.append(ground_truth_to_ind[gt]) 420 | 421 | # get the classifier 422 | if not incremental: 423 | try: 424 | clf_parts = classifier.split('.') 425 | assert clf_parts[0] == 'sklearn' 426 | sk_pkg = importlib.import_module('{:s}.{:s}'.format(clf_parts[0], clf_parts[1])) 427 | clf_class = getattr(sys.modules[sk_pkg.__name__], clf_parts[2]) 428 | except: 429 | raise Exception( 430 | 'expected classifier to be in sklearn package format: sklearn.. (ex. sklearn.linear_model.LogisiticRegression)') 431 | clf = clf_class(**init_args) # dict unpacking for init args 432 | clf.fit(labels_enc, model_ground_truths) 433 | else: 434 | clf = self._model 435 | clf.partial_fit(labels_enc, model_ground_truths) 436 | 437 | # return model 438 | self._encoder = encoder 439 | self._ind_to_ground_truth = ind_to_ground_truth 440 | self._model = clf 441 | model = { 442 | 'encoder': self._encoder, 443 | 'ind_to_ground_truth': ind_to_ground_truth, 444 | 'is_binary_preds': self._is_binary_preds, 445 | 'classifier': self._model 446 | } 447 | out_model = base64.b64encode(pickle.dumps(model)).decode('ascii') 448 | model = {'model': out_model} 449 | return model 450 | 451 | def __call__(self, dataset: Datasource, verbose=True) -> [float]: 452 | labels = [] 453 | for item in dataset: 454 | labels.append(item['_labels']) 455 | labels_enc = self._encoder.transform(labels) 456 | 457 | preds = self._model.predict_proba(labels_enc) 458 | if self._is_binary_preds: 459 | # return the "bad" prob 460 | preds = [pred[1] for pred in preds] 461 | else: 462 | # return the class with most prob 463 | preds = [self._ind_to_ground_truth[np.argmax(pred)] for pred in preds] 464 | return preds 465 | 466 | @staticmethod 467 | def from_pretrained(pretrained: str) -> AnomalyDetection: 468 | tmp = json.loads(pretrained) 469 | pre_model = pickle.loads(base64.b64decode(tmp['model'])) 470 | model = SupervisedClassifierAnomaly() 471 | model._encoder = pre_model['encoder'] 472 | model._ind_to_ground_truth = pre_model['ind_to_ground_truth'] 473 | model._is_binary_preds = pre_model['is_binary_preds'] 474 | model._model = pre_model['classifier'] 475 | 476 | return model 477 | 478 | 479 | if __name__ == "__main__": 480 | from osas.data.datasources import CSVDataSource 481 | 482 | data_source = CSVDataSource('corpus/hubble_test_tags.csv') 483 | 484 | 485 | def coverter(x): 486 | return ast.literal_eval(x) 487 | 488 | 489 | data_source._data['_labels'] = data_source._data['_labels'].apply(lambda x: coverter(x)) 490 | 491 | model = StatisticalNGramAnomaly() 492 | tmp = model.build_model(data_source) 493 | tmp = json.dumps(tmp) 494 | model2 = StatisticalNGramAnomaly.from_pretrained(tmp) 495 | scores = model(data_source) 496 | 497 | scores2 = model2(data_source) 498 | import operator 499 | 500 | dd = {} 501 | from ipdb import set_trace 502 | 503 | for ex, score in zip(data_source, scores): 504 | dd[",".join(ex['_labels'])] = score 505 | sorted_x = sorted(dd.items(), key=operator.itemgetter(1)) 506 | 507 | set_trace() 508 | -------------------------------------------------------------------------------- /src/osas/core/interfaces.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from typing import Union, Any 20 | from abc import abstractmethod 21 | 22 | 23 | class DatasourceIterator: 24 | def __init__(self, datasource): 25 | self._ds = datasource 26 | self._index = 0 27 | 28 | def __next__(self): 29 | if self._index < len(self._ds): 30 | rez = self._ds[self._index] 31 | self._index += 1 32 | return rez 33 | else: 34 | raise StopIteration 35 | 36 | 37 | class DataColumn: 38 | def __init__(self): 39 | pass 40 | 41 | @abstractmethod 42 | def mean(self) -> float: 43 | """Computes mean for numerical columns""" 44 | pass 45 | 46 | @abstractmethod 47 | def std(self) -> float: 48 | """Computes standard deviation for numerical columns""" 49 | pass 50 | 51 | @abstractmethod 52 | def min(self) -> any: 53 | """Computes minumum value for numerical columns""" 54 | pass 55 | 56 | @abstractmethod 57 | def max(self) -> any: 58 | """Computes minumum value for numerical columns""" 59 | pass 60 | 61 | @abstractmethod 62 | def unique(self) -> list: 63 | """Computes unique values for columns""" 64 | pass 65 | 66 | @abstractmethod 67 | def value_counts(self) -> dict: 68 | """Computes histogram values for columns""" 69 | pass 70 | 71 | @abstractmethod 72 | def tolist(self) -> list: 73 | """Computes """ 74 | pass 75 | 76 | @abstractmethod 77 | def apply(self, func) -> int: 78 | """ 79 | Apply lambda function 80 | :param func: function to apply 81 | :return: 82 | """ 83 | pass 84 | 85 | @abstractmethod 86 | def __len__(self) -> int: 87 | """Returns the number of items in the collection""" 88 | pass 89 | 90 | @abstractmethod 91 | def __getitem__(self, index: int) -> dict: 92 | """Returns an item as a dictionary 93 | :param index - the index of the element 94 | """ 95 | pass 96 | 97 | @abstractmethod 98 | def __setitem__(self, index: int, value: Any) -> dict: 99 | """Sets the value for an item 100 | :param index - the index of the element 101 | """ 102 | pass 103 | 104 | def __iter__(self): 105 | return DatasourceIterator(self) 106 | 107 | 108 | class Datasource: 109 | def __init__(self): 110 | pass 111 | 112 | @abstractmethod 113 | def __len__(self) -> int: 114 | """Returns the number of items in the collection""" 115 | pass 116 | 117 | @abstractmethod 118 | def __getitem__(self, index: int) -> dict: 119 | """Returns an item as a dictionary 120 | :param index - the index of the element 121 | """ 122 | pass 123 | 124 | @abstractmethod 125 | def __setitem__(self, key: str, value: any): 126 | """ 127 | Create or set a column 128 | :param key: column name 129 | :param value: values 130 | :return: 131 | """ 132 | pass 133 | 134 | def __iter__(self): 135 | return DatasourceIterator(self) 136 | 137 | @abstractmethod 138 | def apply(self, func, axis: int = 0) -> int: 139 | """ 140 | Apply lambda function 141 | :param func: function to apply 142 | :param axis: 0-column, 1-row; default=0 143 | :return: 144 | """ 145 | pass 146 | 147 | @abstractmethod 148 | def save(self, file_handle) -> None: 149 | """ 150 | Save the data into csv format 151 | :param file_handle: open file handle for writing 152 | :return: None 153 | """ 154 | 155 | 156 | class LabelGenerator: 157 | def __init__(self): 158 | pass 159 | 160 | @abstractmethod 161 | def __call__(self, input_object: dict) -> [str]: 162 | """ 163 | Generate specific labels for the dataset entry 164 | :param input_object: an entry in the dataset 165 | :return: list of labels generated for this input object 166 | """ 167 | pass 168 | 169 | @abstractmethod 170 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 171 | """ 172 | This model should generate a model on the input 173 | :param dataset: the dataset used to generate the model 174 | :param count_column: use this column for clustered data. If not set, event count will be 1 175 | :return: This should be a json serializable object 176 | """ 177 | pass 178 | 179 | @staticmethod 180 | def from_pretrained(pretrained: str) -> object: 181 | """ 182 | :param pretrained: dictionary holding pretrained model 183 | :return: New instance 184 | """ 185 | pass 186 | 187 | 188 | class AnomalyDetection: 189 | def __init__(self): 190 | pass 191 | 192 | @abstractmethod 193 | def build_model(self, dataset: Datasource, incremental: bool = False) -> dict: 194 | """ 195 | This model should generate a model on the input 196 | :param dataset: the dataset used to generate the model 197 | :param incremental: perform incremental update 198 | :return: This should be a json serializable object 199 | """ 200 | pass 201 | 202 | @abstractmethod 203 | def __call__(self, dataset: Datasource, verbose=True) -> [float]: 204 | """ 205 | Scores a dataset with anomaly scores 206 | :param dataset: the dataset to score 207 | :return: an anomaly score for each example in the dataset 208 | """ 209 | pass 210 | -------------------------------------------------------------------------------- /src/osas/core/label_generators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from cProfile import label 20 | import sys 21 | import pandas as pd 22 | import numpy as np 23 | import re 24 | import math 25 | 26 | sys.path.append('') 27 | import json 28 | from osas.core.interfaces import LabelGenerator, Datasource 29 | from osas.core.utils import Tokenizer 30 | from enum import Enum 31 | 32 | from lol.api import LOLC 33 | from lol.api import PlatformType 34 | 35 | import obfuscation_detection as od 36 | 37 | 38 | class ObfuscationFieldPlatform(Enum): 39 | LINUX = od.PlatformType.LINUX 40 | WINDOWS = od.PlatformType.WINDOWS 41 | ALL = od.PlatformType.ALL 42 | 43 | 44 | class ObfuscationField(LabelGenerator): 45 | """ 46 | This type of Label generator handles fields that contain Linux/Windows commands. It uses machine learning 47 | to predict if a command is obfuscated or not. 48 | """ 49 | 50 | def __init__(self, field_name: str = '', platform: ObfuscationFieldPlatform = ObfuscationFieldPlatform.ALL, 51 | gpu: bool = False): 52 | if platform == ObfuscationFieldPlatform.LINUX: 53 | platform = od.PlatformType.LINUX 54 | elif platform == ObfuscationFieldPlatform.WINDOWS: 55 | platform = od.PlatformType.WINDOWS 56 | else: 57 | platform = od.PlatformType.ALL 58 | platform_str = str(platform) 59 | self._model = { 60 | 'field_name': field_name, 61 | 'platform': platform_str, 62 | 'gpu': gpu 63 | } 64 | self._classifier = od.ObfuscationClassifier(platform=platform, gpu=gpu) 65 | 66 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 67 | return self._model 68 | 69 | @staticmethod 70 | def from_pretrained(pretrained: str) -> object: 71 | lg = ObfuscationField() 72 | lg._model = json.loads(pretrained) 73 | platform = od.PlatformType.ALL 74 | if lg._model['platform'] == 'od.PlatformType.LINUX': 75 | platform = od.PlatformType.LINUX 76 | elif lg._model['platform'] == 'od.PlatformType.WINDOWS': 77 | platform = od.PlatformType.WINDOWS 78 | lg._classifier = od.ObfuscationClassifier(platform=platform, gpu=bool(lg._model['gpu'])) 79 | return lg 80 | 81 | def __call__(self, object: dict) -> [str]: 82 | command = object[self._model['field_name']] 83 | classification = self._classifier([command])[0] 84 | if classification == 1: 85 | ret = 'OBFUSCATED' 86 | else: 87 | ret = 'NOT OBFUSCATED' 88 | return [ret] 89 | 90 | 91 | class LOLFieldPlatform(Enum): 92 | LINUX = PlatformType.LINUX 93 | WINDOWS = PlatformType.WINDOWS 94 | 95 | 96 | class LOLField(LabelGenerator): 97 | """ 98 | This type of LabelGenerator handles fields that contain Linux/Windows commands. It uses MachineLearning to 99 | predict if a command is part of a Living of the Land attack 100 | """ 101 | 102 | def __init__(self, field_name: str = '', platform: LOLFieldPlatform = LOLFieldPlatform.LINUX, return_labels=False): 103 | """ 104 | Constructor 105 | :param field_name: what field to look for in the data object 106 | :param platform: chose what model to use Windows/Linux 107 | :param return_labels: return all generated labels or just the status (BAD, GOOD, NEUTRAL) 108 | """ 109 | if platform == 'linux': 110 | platform = PlatformType.LINUX 111 | elif platform == 'windows': 112 | platform = PlatformType.WINDOWS 113 | platform_str = str(platform) 114 | self._model = { 115 | 'field_name': field_name, 116 | 'platform': platform_str, 117 | 'return_labels': return_labels 118 | } 119 | self._classifier = LOLC(platform=platform) 120 | 121 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 122 | return self._model 123 | 124 | @staticmethod 125 | def from_pretrained(pretrained: str) -> object: 126 | lg = LOLField() 127 | lg._model = json.loads(pretrained) 128 | platform = PlatformType.LINUX 129 | if lg._model['platform'] == 'PlatformType.WINDOWS': 130 | platform = PlatformType.WINDOWS 131 | lg._classifier = LOLC(platform=platform) 132 | return lg 133 | 134 | def __call__(self, object: dict): 135 | command = object[self._model['field_name']] 136 | status, labels = self._classifier(command) 137 | ret_labels = [status] 138 | if self._model['return_labels']: 139 | for label in labels: 140 | ret_labels.append(label) 141 | return ret_labels 142 | 143 | 144 | class NumericField(LabelGenerator): 145 | """ 146 | This type of LabelGenerator handles numerical fields. It computes the mean and standard deviation and generates 147 | labels according to the distance between the current value and the mean value 148 | (value<=sigma NORMAL, sigma dict: 196 | incremental = False 197 | if self._model['mean'] is not None: 198 | ex_mean = self._model['mean'] 199 | ex_stdev = self._model['std_dev'] 200 | ex_count = self._model['count'] 201 | incremental = True 202 | group_by = self._model['group_by'] 203 | if group_by is None: 204 | mean = 0 205 | stdev = 0 206 | count = 0 207 | else: 208 | mean = {} 209 | stdev = {} 210 | count = {} 211 | # mean 212 | for item in dataset: 213 | cc = 1 214 | if count_column is not None: 215 | cc = int(item[count_column]) 216 | if group_by is None: 217 | mean += item[self._model['field_name']] * cc 218 | count += cc 219 | else: 220 | key = self._get_group_by_value(item, group_by) 221 | if key not in mean: 222 | mean[key] = 0 223 | stdev[key] = 0 224 | count[key] = 0 225 | mean[key] += item[self._model['field_name']] * cc 226 | count[key] += cc 227 | 228 | if group_by is None: 229 | mean /= count 230 | else: 231 | for key in mean: 232 | mean[key] /= count[key] 233 | # stdev 234 | for item in dataset: 235 | cc = 1 236 | if count_column is not None: 237 | cc = int(item[count_column]) 238 | if group_by is None: 239 | stdev += ((item[self._model['field_name']] - mean) ** 2) * cc 240 | else: 241 | key = self._get_group_by_value(item, group_by) 242 | stdev[key] += ((item[self._model['field_name']] - mean[key]) ** 2) * cc 243 | 244 | if group_by is None: 245 | stdev /= count 246 | stdev = math.sqrt(stdev) 247 | else: 248 | for key in stdev: 249 | stdev[key] /= count[key] 250 | stdev[key] = math.sqrt(stdev[key]) 251 | 252 | # update if incremental 253 | if incremental: 254 | if group_by is None: 255 | new_mean = (ex_mean * ex_count + mean * count) / (ex_count + count) 256 | new_stdev = (((ex_stdev ** 2) * ex_count) + ((stdev ** 2) * count)) / (ex_count + count) 257 | new_count = ex_count + count 258 | else: 259 | new_mean = {} 260 | new_stdev = {} 261 | new_count = {} 262 | for key in mean: 263 | if key in ex_mean: 264 | new_mean[key] = (ex_mean[key] * ex_count[key] + mean[key] * count[key]) / ( 265 | ex_count[key] + count[key]) 266 | new_stdev[key] = (((ex_stdev[key] ** 2) * ex_count[key]) + ((stdev[key] ** 2) * count[key])) / ( 267 | ex_count[key] + count[key]) 268 | new_count[key] = ex_count[key] + count[key] 269 | else: 270 | new_mean[key] = mean[key] 271 | new_stdev[key] = stdev[key] 272 | new_count[key] = count[key] 273 | # transfer ex-values 274 | for key in ex_mean: 275 | if key not in mean: 276 | new_mean[key] = ex_mean[key] 277 | new_stdev[key] = ex_stdev[key] 278 | new_count[key] = ex_count[key] 279 | 280 | mean = new_mean 281 | stdev = new_stdev 282 | count = new_count 283 | # store 284 | self._model['mean'] = mean 285 | self._model['std_dev'] = stdev 286 | self._model['count'] = count 287 | # check sanity and warn user 288 | font_style = '\033[93m' 289 | mean_is_zero = False 290 | stdev_is_zero = False 291 | if self._model['group_by'] is None: 292 | if self._model['mean'] == 0: 293 | mean_is_zero = True 294 | if self._model['std_dev'] == 0: 295 | stdev_is_zero = True 296 | else: 297 | for key in self._model['mean']: 298 | if self._model['mean'][key] == 0: 299 | mean_is_zero = True 300 | if self._model['std_dev'][key] == 0: 301 | stdev_is_zero = True 302 | if mean_is_zero and self._model['stdev'] == False: 303 | sys.stdout.write('\t{0}::WARNING:You have a mean of 0. Any deviation will be flagged\n'.format(font_style)) 304 | if stdev_is_zero and self._model['stdev'] == True: 305 | sys.stdout.write( 306 | '\t{0}::WARNING:You have a standard deviation of 0. Any deviation will be flagged\n'.format(font_style)) 307 | 308 | return self._model 309 | 310 | # def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 311 | # from osas.data.datasources import CSVDataColumn 312 | # incremental = False 313 | # if self._model['mean'] is not None: 314 | # ex_mean = self._model['mean'] 315 | # ex_stdev = self._model['std_dev'] 316 | # ex_count = self._model['count'] 317 | # incremental = True 318 | # if count_column is None: 319 | # mean = CSVDataColumn(dataset[self._model['field_name']]).mean() 320 | # stdev = CSVDataColumn(dataset[self._model['field_name']]).std() 321 | # count = len(dataset[self._model['field_name']]) 322 | # self._model['mean'] = mean 323 | # self._model['std_dev'] = stdev 324 | # self._model['count'] = count 325 | # else: 326 | # mean = CSVDataColumn(dataset[self._model['field_name']] * dataset[count_column]).sum() 327 | # stdev = ((CSVDataColumn(dataset[self._model['field_name']] * dataset[count_column]) - mean) ** 2).sum() 328 | # count = dataset[count_column].sum() 329 | # mean = mean / count 330 | # stdev = math.sqrt(stdev / count) 331 | # 332 | # self._model['mean'] = mean 333 | # self._model['std_dev'] = stdev 334 | # self._model['count'] = count 335 | # 336 | # if incremental: 337 | # new_count = ex_count + count 338 | # new_mean = (mean * count + ex_mean * ex_count) / new_count 339 | # new_stdev = math.sqrt(((ex_stdev ** 2) * ex_count + (stdev ** 2) * count) / new_count) 340 | # self._model['mean'] = new_mean 341 | # self._model['std_dev'] = new_stdev 342 | # self._model['count'] = new_count 343 | # 344 | # return self._model 345 | 346 | def _get_labels(self, cur_value, mean_val, std_val, stdev, stdev_borderline_threshold, 347 | stdev_outlier_threshold, spike, spike_inverse, spike_borderline_threshold, 348 | spike_outlier_threshold, label_for_normal): 349 | labels = [] 350 | if stdev: 351 | if std_val == 0: 352 | std_val = 0.01 353 | stdev_ratio = abs(cur_value - mean_val) / std_val 354 | 355 | # if using both stdev and spike, calculate a spike from the stdev 356 | if stdev and spike != 'none': 357 | if not spike_inverse: 358 | mean_val = mean_val + std_val 359 | else: 360 | mean_val = mean_val - std_val 361 | 362 | if spike == 'ratio': 363 | if not spike_inverse: 364 | if mean_val == 0: 365 | mean_val = 0.01 366 | spike_ratio = cur_value / mean_val 367 | else: 368 | if cur_value == 0: 369 | cur_value = 0.01 370 | spike_ratio = mean_val / cur_value 371 | elif spike == 'fixed': 372 | if not spike_inverse: 373 | spike_ratio = cur_value - mean_val 374 | else: 375 | spike_ratio = mean_val - cur_value 376 | 377 | field_name = self._model['field_name'].upper() 378 | 379 | if stdev and spike != 'none' and stdev_ratio < stdev_outlier_threshold: 380 | # if both are activated, and event is within stdev outlier threshold 381 | if label_for_normal: 382 | labels.append('{0}_NORMAL'.format(field_name)) 383 | else: 384 | if stdev and spike == 'none': 385 | # only stdev is activated 386 | ratio = stdev_ratio 387 | borderline_threshold = stdev_borderline_threshold 388 | outlier_threshold = stdev_outlier_threshold 389 | else: 390 | # if only spike is activated or both are activated, use spike ratio 391 | ratio = spike_ratio 392 | borderline_threshold = spike_borderline_threshold 393 | outlier_threshold = spike_outlier_threshold 394 | 395 | if label_for_normal and ratio < borderline_threshold: 396 | labels.append('{0}_NORMAL'.format(field_name)) 397 | elif borderline_threshold < ratio < outlier_threshold: 398 | labels.append('{0}_BORDERLINE'.format(field_name)) 399 | elif ratio >= outlier_threshold: 400 | labels.append('{0}_OUTLIER'.format(field_name)) 401 | 402 | return labels 403 | 404 | def __call__(self, input_object: dict) -> [str]: 405 | labels = [] 406 | mean_val = self._model['mean'] 407 | std_val = self._model['std_dev'] 408 | count_val = self._model['count'] 409 | field_name = self._model['field_name'].upper() 410 | label_for_normal = True 411 | if 'label_for_normal' in self._model: 412 | label_for_normal = self._model['label_for_normal'] 413 | 414 | stdev = True 415 | if 'stdev' in self._model: 416 | stdev = bool(self._model['stdev']) 417 | 418 | stdev_borderline_threshold = 1 419 | if 'stdev_borderline_threshold' in self._model: 420 | stdev_borderline_threshold = self._model['stdev_borderline_threshold'] 421 | 422 | stdev_outlier_threshold = 2 423 | if 'stdev_outlier_threshold' in self._model: 424 | stdev_outlier_threshold = self._model['stdev_outlier_threshold'] 425 | 426 | spike = 'none' 427 | if 'spike' in self._model: 428 | spike = self._model['spike'] 429 | 430 | spike_inverse = False 431 | if 'spike_inverse' in self._model: 432 | spike_inverse = bool(self._model['spike_inverse']) 433 | 434 | spike_borderline_threshold = 10 435 | if 'spike_borderline_threshold' in self._model: 436 | spike_borderline_threshold = self._model['spike_borderline_threshold'] 437 | 438 | spike_outlier_threshold = 20 439 | if 'spike_outlier_threshold' in self._model: 440 | spike_outlier_threshold = self._model['spike_outlier_threshold'] 441 | 442 | try: 443 | cur_value = float(input_object[self._model['field_name']]) 444 | except: 445 | return ['{0}_BAD_VALUE'.format(field_name)] 446 | group_by = self._model['group_by'] 447 | if group_by is None: 448 | new_labels = self._get_labels(cur_value, 449 | mean_val, 450 | std_val, 451 | stdev, 452 | stdev_borderline_threshold, 453 | stdev_outlier_threshold, 454 | spike, 455 | spike_inverse, 456 | spike_borderline_threshold, 457 | spike_outlier_threshold, 458 | label_for_normal) 459 | for label in new_labels: 460 | labels.append(label) 461 | # distance = abs((cur_value) - mean_val) 462 | # if label_for_normal and distance <= std_val: 463 | # labels.append(field_name + '_NORMAL') 464 | # elif std_val < distance <= (2 * std_val): 465 | # labels.append(field_name + '_BORDERLINE') 466 | # elif (2 * std_val) < distance: 467 | # labels.append(field_name + '_OUTLIER') 468 | else: 469 | key = self._get_group_by_value(input_object, group_by) 470 | if key in mean_val: 471 | count = count_val[key] 472 | if count > 5: 473 | new_labels = self._get_labels(cur_value, 474 | mean_val[key], 475 | std_val[key], 476 | stdev, 477 | stdev_borderline_threshold, 478 | stdev_outlier_threshold, 479 | spike, 480 | spike_inverse, 481 | spike_borderline_threshold, 482 | spike_outlier_threshold, 483 | label_for_normal) 484 | for label in new_labels: 485 | labels.append(label) 486 | 487 | # distance = abs((cur_value) - mean_val[key]) 488 | # 489 | # if distance <= std_val[key]: 490 | # labels.append(field_name + '_NORMAL') 491 | # elif std_val[key] < distance <= (2 * std_val[key]): 492 | # labels.append(field_name + '_BORDERLINE') 493 | # elif (2 * std_val[key]) < distance: 494 | # labels.append(field_name + '_OUTLIER') 495 | else: 496 | labels.append('RARE_KEY_FOR_{0}'.format(field_name)) 497 | else: 498 | labels.append('UNSEEN_KEY_FOR_{0}'.format(field_name)) 499 | 500 | return labels 501 | 502 | @staticmethod 503 | def from_pretrained(pretrained: str) -> LabelGenerator: 504 | lg = NumericField() 505 | lg._model = json.loads(pretrained) 506 | return lg 507 | 508 | 509 | class TextField(LabelGenerator): 510 | """ 511 | This type of LabelGenerator handles text fields. It builds a n-gram based language model and computes the 512 | perplexity of newly observed data. It also holds statistics over the training data (mean and stdev). 513 | (perplexity<=sigma NORMAL, sigma dict: 534 | unigram2count = {} 535 | for item in dataset: 536 | text = item[self._field_name] 537 | unigrams = self._get_ngrams(text, unigrams_only=True) 538 | occ_number = 1 539 | if count_column is not None: 540 | occ_number = item[count_column] 541 | for unigram in unigrams: 542 | if unigram not in unigram2count: 543 | unigram2count[unigram] = occ_number 544 | else: 545 | unigram2count[unigram] += occ_number 546 | for unigram in unigram2count: 547 | if unigram2count[unigram] > 2: 548 | self._accepted_unigrams[unigram] = 1 549 | 550 | for item in dataset: 551 | text = item[self._field_name] 552 | ngrams = self._get_ngrams(text) 553 | occ_number = 1 554 | if count_column is not None: 555 | occ_number = item[count_column] 556 | for ngram in ngrams: 557 | if len(ngram) == self._ngram_range[0]: 558 | self._total_inf += occ_number 559 | if ngram in self._model: 560 | self._model[ngram] += occ_number 561 | else: 562 | self._model[ngram] = occ_number 563 | # for ngram in self._model: 564 | # self._model[ngram] = 565 | ser_model = [self._field_name, self._lm_mode, self._ngram_range[0], self._ngram_range[1], self._mean_perplex, 566 | self._std_perplex, self._total_inf] 567 | 568 | all_perplex = np.zeros((len(dataset)), dtype=np.float) 569 | for ii in range(len(dataset)): 570 | text = item[self._field_name] 571 | all_perplex[ii] = self._compute_perplexity(text) 572 | 573 | self._mean_perplex = np.mean(all_perplex) 574 | self._std_perplex = np.std(all_perplex) 575 | ser_model[4] = self._mean_perplex 576 | ser_model[5] = self._std_perplex 577 | ser_model.append(self._accepted_unigrams) 578 | for item in self._model: 579 | ser_model.append(item) 580 | ser_model.append(self._model[item]) 581 | 582 | return ser_model 583 | 584 | def _compute_perplexity(self, text): 585 | total = 0 586 | ngrams = self._get_ngrams(text) 587 | 588 | for ngram in ngrams: 589 | if ngram in self._model: 590 | sup_count = math.log(self._model[ngram]) + 1 591 | total += 1 / sup_count 592 | # if ngram[:-1] in self._model: 593 | # inf_count = self._model[ngram[:-1]] 594 | # else: 595 | # inf_count = self._total_inf 596 | # total += math.log(sup_count / inf_count) 597 | else: 598 | total += -math.log(1e-8) # small prob for unseen events 599 | return total / len(ngrams) 600 | 601 | def __call__(self, input_object: dict) -> [str]: 602 | perplexity = self._compute_perplexity(input_object[self._field_name]) 603 | if perplexity - self._mean_perplex < 2 * self._std_perplex: 604 | return [perplexity * 10] 605 | elif perplexity - self._mean_perplex < 4 * self._std_perplex: 606 | return ['{0}_HIGH_PERPLEXITY'.format(self._field_name.upper()), perplexity * 10] 607 | else: 608 | return ['{0}_EXTREEME_PERPLEXITY'.format(self._field_name.upper()), perplexity * 10] 609 | 610 | @staticmethod 611 | def from_pretrained(pretrained: str) -> LabelGenerator: 612 | json_obj = json.loads(pretrained) 613 | field_name = json_obj[0] 614 | lm_mode = json_obj[1] 615 | ngram_range = (json_obj[2], json_obj[3]) 616 | new_instance = TextField(field_name, lm_mode, ngram_range) 617 | new_instance._mean_perplex = json_obj[4] 618 | new_instance._std_perplex = json_obj[5] 619 | new_instance._total_inf = json_obj[6] 620 | new_instance._accepted_unigrams = json_obj[7] 621 | for ii in range((len(json_obj) - 8) // 2): 622 | ngram = tuple(json_obj[ii * 2 + 8]) 623 | count = json_obj[ii * 2 + 8 + 1] 624 | new_instance._model[ngram] = count 625 | return new_instance 626 | 627 | def _get_ngrams(self, text, unigrams_only=False): 628 | text = str(text) 629 | use_chars = self._lm_mode == 'char' 630 | toks = Tokenizer.tokenize(text, use_chars=use_chars) 631 | if unigrams_only: 632 | return toks 633 | new_toks = [] 634 | for tok in toks: 635 | if tok in self._accepted_unigrams: 636 | new_toks.append(tok) 637 | else: 638 | new_toks.append('') 639 | toks = new_toks 640 | 641 | # prepend and append 642 | c_append = self._ngram_range[0] - 1 643 | start = ['' for _ in range(c_append)] 644 | stop = ['' for _ in range(c_append)] 645 | toks = start + toks + stop 646 | ngrams = [] 647 | for ngram_order in range(self._ngram_range[0], self._ngram_range[1] + 1): 648 | for ii in range(len(toks) - ngram_order): 649 | ngram = tuple(toks[ii:ii + ngram_order]) 650 | ngrams.append(ngram) 651 | return ngrams 652 | 653 | 654 | class MultinomialField(LabelGenerator): 655 | def __init__(self, field_name: str = '', absolute_threshold: int = 10, relative_threshold: float = 0.1, 656 | group_by: str = None): 657 | """ 658 | Constructor 659 | :param field_name: What field to use 660 | :param absolute_threshold: Minimum absolute value for occurrences to trigger alert for 661 | :param relative_threshold: Minimum relative value for occurrences to trigger alert for 662 | """ 663 | self._mfc = MultinomialFieldCombiner([field_name], absolute_threshold, relative_threshold, group_by=group_by) 664 | 665 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 666 | return self._mfc.build_model(dataset, count_column=count_column) 667 | 668 | def __call__(self, item: dict) -> [str]: 669 | lbls = self._mfc(item) 670 | lbls = [l.replace('_PAIR', '') for l in lbls] 671 | return lbls 672 | 673 | @staticmethod 674 | def from_pretrained(pretrained: str) -> LabelGenerator: 675 | lg = MultinomialFieldCombiner() 676 | lg._model = json.loads(pretrained) 677 | mf = MultinomialField() 678 | mf._mfc = lg 679 | return mf 680 | 681 | 682 | class MultinomialFieldCombiner(LabelGenerator): 683 | def __init__(self, field_names: [str] = [], absolute_threshold: int = 10, relative_threshold: float = 0.1, 684 | group_by: str = None): 685 | """ 686 | Constructor 687 | :param field_names: What fields to combine 688 | :param absolute_threshold: Minimum absolute value for occurrences to trigger alert for 689 | :param relative_threshold: Minimum relative value for occurrences to trigger alert for 690 | """ 691 | self._model = {'pair2count': {}, 692 | 'pair2prob': {}, 693 | 'absolute_threshold': absolute_threshold, 694 | 'relative_threshold': relative_threshold, 695 | 'field_names': field_names, 696 | 'group_by': group_by 697 | } 698 | 699 | def _get_group_by_value(self, item, group_by): 700 | if isinstance(group_by, str): 701 | return str(item[group_by]) 702 | else: 703 | return "({0})".format(','.join([str(item[k]) for k in group_by])) 704 | 705 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 706 | pair2count = self._model['pair2count'] # this is used for incremental updates 707 | group_by_field = self._model['group_by'] 708 | total = 0 709 | for item in dataset: 710 | if group_by_field is not None: 711 | gbv = self._get_group_by_value(item, group_by_field) # str(item[group_by_field]) 712 | if gbv not in self._model['pair2count']: 713 | self._model['pair2count'][gbv] = {'TOTAL': 0} 714 | pair2count = self._model['pair2count'][gbv] 715 | combined = [str(item[field]) for field in self._model['field_names']] 716 | combined = '(' + ','.join(combined) + ')' 717 | occ_number = 1 718 | if count_column is not None: 719 | occ_number = int(item[count_column]) 720 | total += occ_number 721 | if group_by_field is not None: 722 | self._model['pair2count'][gbv]['TOTAL'] += occ_number 723 | if combined not in pair2count: 724 | pair2count[combined] = occ_number 725 | else: 726 | pair2count[combined] += occ_number 727 | 728 | pair2prob = {} 729 | if group_by_field is None: 730 | for key in pair2count: 731 | pair2prob[key] = pair2count[key] / total 732 | else: 733 | pair2count = self._model['pair2count'] 734 | for k1 in pair2count: 735 | pair2prob[k1] = {} 736 | total = int(pair2count[k1]['TOTAL']) 737 | for key in pair2count[k1]: 738 | pair2prob[k1][key] = pair2count[k1][key] / total 739 | 740 | self._model['pair2count'] = pair2count 741 | self._model['pair2prob'] = pair2prob 742 | 743 | return self._model 744 | 745 | def __call__(self, item: dict) -> [str]: 746 | fname = ('_'.join(self._model['field_names'])).upper() + '_PAIR' 747 | gname = '' 748 | if self._model['group_by'] is not None: 749 | gby = self._model['group_by'] 750 | if not isinstance(self._model['group_by'], list): 751 | gby = [gby] 752 | gname = '_BASED_ON_{0}'.format('_'.join([str(k).upper() for k in gby])) 753 | combined = [str(item[field]) for field in self._model['field_names']] 754 | combined = '(' + ','.join(combined) + ')' 755 | 756 | pair2prob = self._model['pair2prob'] 757 | pair2count = self._model['pair2count'] 758 | group_by = self._model['group_by'] 759 | if group_by is not None: 760 | gbv = self._get_group_by_value(item, group_by) 761 | if gbv not in pair2prob: 762 | return [] 763 | pair2prob = self._model['pair2prob'][gbv] 764 | pair2count = self._model['pair2count'][gbv] 765 | 766 | if combined not in pair2prob: 767 | return ['UNSEEN_{0}{1}'.format(fname, gname)] 768 | else: 769 | labels = [] 770 | 771 | prob = pair2prob[combined] 772 | cnt = pair2count[combined] 773 | 774 | if cnt < self._model['absolute_threshold']: 775 | labels.append('LOW_OBS_COUNT_FOR_{0}{1}'.format(fname, gname)) 776 | if prob < self._model['relative_threshold']: 777 | labels.append('LOW_OBS_PROB_FOR_{0}{1}'.format(fname, gname)) 778 | return labels 779 | 780 | @staticmethod 781 | def from_pretrained(pretrained: str) -> LabelGenerator: 782 | lg = MultinomialFieldCombiner() 783 | lg._model = json.loads(pretrained) 784 | return lg 785 | 786 | 787 | class NumericalFieldCombiner(LabelGenerator): 788 | def __init__(self, field_names: [str], normalize=True): 789 | """ 790 | 791 | :param field_names: What fields to combine 792 | :param normalize: Normalize each field using standard deviation before processing 793 | """ 794 | self._field_names = field_names 795 | self._normalize = normalize 796 | 797 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 798 | pass 799 | 800 | def __call__(self, input_object: dict) -> [str]: 801 | pass 802 | 803 | @staticmethod 804 | def from_pretrained(pretrained: str) -> LabelGenerator: 805 | pass 806 | 807 | 808 | class KeywordBased(LabelGenerator): 809 | def __init__(self, keyword_list: list, field_name: str): 810 | if isinstance(keyword_list, str): 811 | keyword_list = re.sub('[^0-9a-zA-Z]+', ' ', keyword_list) 812 | keyword_list = keyword_list.split(' ') 813 | self._label_list = [item for item in keyword_list] 814 | self._field_name = field_name 815 | 816 | def __call__(self, input_object: dict): 817 | label_list = [] 818 | text = str(input_object[self._field_name]) 819 | text = re.sub('[^0-9a-zA-Z]+', ' ', text) 820 | word_list = text.split(' ') 821 | for ii in range(len(self._label_list)): 822 | if self._label_list[ii] in word_list: 823 | label_list.append("{0}_KEYWORD_{1}".format(self._field_name.upper(), self._label_list[ii].upper())) 824 | return label_list 825 | 826 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 827 | return {'field_name': self._field_name, 828 | 'keyword_list': self._label_list} 829 | 830 | @staticmethod 831 | def from_pretrained(pretrained: str) -> object: 832 | obj = json.loads(pretrained) 833 | keyword_list = obj['keyword_list'] 834 | field_name = obj['field_name'] 835 | klg = KeywordBased(keyword_list, field_name) 836 | return klg 837 | 838 | 839 | class KnowledgeBased(LabelGenerator): 840 | def __init__(self, rules_and_labels_tuple_list: list, field_name: str): 841 | if isinstance(rules_and_labels_tuple_list, str): 842 | # we need to parse this 843 | rules_and_labels_tuple_list = eval(rules_and_labels_tuple_list) 844 | self._regex_list = [re.compile(item[0]) for item in rules_and_labels_tuple_list] 845 | self._regex_list_str = [item[0] for item in rules_and_labels_tuple_list] 846 | self._label_list = [item[1] for item in rules_and_labels_tuple_list] 847 | self._field_name = field_name 848 | 849 | def __call__(self, input_object: dict) -> [str]: 850 | label_list = [] 851 | text = str(input_object[self._field_name]) 852 | for ii in range(len(self._label_list)): 853 | if self._regex_list[ii].search(text): 854 | label_list.append(self._label_list[ii]) 855 | return label_list 856 | 857 | def build_model(self, dataset: Datasource, count_column: str = None) -> dict: 858 | return { 859 | 'field_name': self._field_name, 860 | 'label_list': self._label_list, 861 | 'regex_list': self._regex_list_str 862 | } 863 | 864 | @staticmethod 865 | def from_pretrained(pretrained: str) -> object: 866 | obj = json.loads(pretrained) 867 | label_list = obj['label_list'] 868 | regex_list = obj['regex_list'] 869 | field_name = obj['field_name'] 870 | reg_lab = [(regex, label) for regex, label in zip(regex_list, label_list)] 871 | kblg = KnowledgeBased(reg_lab, field_name) 872 | return kblg 873 | 874 | 875 | if __name__ == '__main__': 876 | mfc = MultinomialFieldCombiner(['user', 'parent_process'], absolute_threshold=500, relative_threshold=0.005) 877 | nfc = NumericField('count') 878 | tf = TextField('command', lm_mode='token', ngram_range=(3, 5)) 879 | klg = KeywordBased(keyword_list=['bash', 'java', 'netcat', 'sudo', 'apache2'], field_name='command') 880 | from osas.data.datasources import CSVDataSource 881 | 882 | dataset = CSVDataSource('corpus/test.csv') 883 | print("Building model") 884 | klg.build_model(dataset) 885 | print("Done") 886 | 887 | # rez = mfc.build_model(dataset) 888 | for item in dataset[:20]: 889 | print("\n\n") 890 | print(item) 891 | print("") 892 | print(klg(item)) 893 | print("\n\n") 894 | print("=" * 20) 895 | -------------------------------------------------------------------------------- /src/osas/core/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | class Tokenizer: 20 | @staticmethod 21 | def tokenize(text, use_chars=False): 22 | if use_chars: 23 | return [ch for ch in text] 24 | else: 25 | toks = [] 26 | tok = '' 27 | for ch in text: 28 | if not ch.isalnum() or ch == ' ': 29 | tok = tok.strip() 30 | if len(tok) != 0: 31 | toks.append(tok) 32 | tok = '' 33 | if ch != ' ': 34 | toks.append(ch) 35 | else: 36 | tok += ch 37 | if tok.strip() != '': 38 | toks.append(tok) 39 | 40 | return toks 41 | -------------------------------------------------------------------------------- /src/osas/etc/README.md: -------------------------------------------------------------------------------- 1 | # Main config folder 2 | -------------------------------------------------------------------------------- /src/osas/etc/ad_config.conf: -------------------------------------------------------------------------------- 1 | [NumericField] 2 | field_name='' 3 | 4 | [TextField] 5 | field_name='' 6 | lm_mode='char' 7 | ngram_range=(3, 5) 8 | 9 | [MultinomialFieldCombiner] 10 | field_names=['user', 'parent_process'] 11 | absolute_threshold=500 12 | relative_threshold=0.005 13 | 14 | [NumericalFieldCombiner] 15 | field_names=[] 16 | normalize=True 17 | -------------------------------------------------------------------------------- /src/osas/etc/config.conf: -------------------------------------------------------------------------------- 1 | [CSVDataSource] 2 | filename=corpus/hubble_test_tags.csv 3 | -------------------------------------------------------------------------------- /src/osas/etc/data_config.conf: -------------------------------------------------------------------------------- 1 | [dataX] 2 | dataurl=dataurl.com/data.csv 3 | apikey=secetkey 4 | -------------------------------------------------------------------------------- /src/osas/etc/label_config.conf: -------------------------------------------------------------------------------- 1 | [CSVDataSource] 2 | filename = corpus/test.csv 3 | 4 | [MultinomialFieldCombiner] 5 | field_names = ['user', 'parent_process'] 6 | absolute_threshold = 500 7 | relative_threshold = 0.005 8 | 9 | [NumericField] 10 | field_name = 'count' 11 | 12 | [TextField] 13 | field_name = 'command' 14 | lm_mode = 'token' 15 | ngram_range = (3, 5) 16 | 17 | [KeywordLabelGenerator] 18 | keyword_list = ['bash', 'java', 'netcat', 'sudo', 'apache'] 19 | field_name = 'command' 20 | -------------------------------------------------------------------------------- /src/osas/io_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/src/osas/io_utils/__init__.py -------------------------------------------------------------------------------- /src/osas/io_utils/config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import sys 20 | import ast 21 | from builtins import object, super 22 | import collections 23 | import configparser 24 | import pandas as pd 25 | from dataclasses import dataclass, field 26 | 27 | 28 | @dataclass 29 | class Config(object): 30 | '''Generic base class to load/save config''' 31 | 32 | def _eval_str(self, s): 33 | '''convert type to actual type''' 34 | try: 35 | return ast.literal_eval(s) 36 | except: 37 | return s 38 | 39 | def save(self, filename): 40 | """Save configuration to file.""" 41 | self.__config__ = self.__class__.__name__ 42 | sorted_dict = collections.OrderedDict(sorted(self.__dict__.items())) 43 | # sort dictionary 44 | config = configparser.ConfigParser() 45 | config.add_section(self.__config__) # write header 46 | for k, v in sorted_dict.items(): # for python3 use .items() 47 | if not k.startswith("_"): # write only non-private properties 48 | if isinstance(v, float): # if we are dealing with a float 49 | str_v = str(v) 50 | if "e" not in str_v and "." not in str_v: 51 | # stopconfusion with an int by appending a ".0" 52 | v = str_v + ".0" 53 | v = str(v) 54 | config.set(self.__config__, k, v) 55 | with fopen(filename, 'w') as cfgfile: 56 | config.write(cfgfile) 57 | 58 | def load(self, filename): 59 | '''Load configuration from file''' 60 | __config__ = self.__class__.__name__ 61 | config = configparser.ConfigParser() 62 | config.read(filename) 63 | # check to see if the config file has the appropriate section 64 | if not config.has_section(__config__): 65 | sys.stderr.write("ERROR: File:{} is not a valid configuration file" 66 | " for the selected task: Missing section:[{}]\n" 67 | .format(filename, __config__)) 68 | sys.exit(1) 69 | for k, v in config.items(__config__): 70 | self.__dict__[k] = self._eval_str(v) 71 | 72 | 73 | # ****Beware**** 74 | # Don't save secrets as default config 75 | # Use local config file (not git synced) to save secrets 76 | 77 | 78 | # ML data dataclasses 79 | @dataclass 80 | class CSVDataSource(Config): 81 | filename: str = field(default='corpus/test.csv') 82 | 83 | 84 | @dataclass 85 | class CSVDataColumn(Config): 86 | data: pd.DataFrame = field(default=pd.DataFrame()) 87 | 88 | 89 | # Label Generator dataclasses 90 | @dataclass 91 | class ObfuscationField(Config): 92 | field_name: str = field(default='command') 93 | gpu: bool = field(default=False) 94 | 95 | 96 | @dataclass 97 | class NumericField(Config): 98 | field_name: str = field(default='count') 99 | group_by: str = field(default=None) 100 | mode: str = field(default='stdev') 101 | borderline_threshold: float = field(default=1) 102 | outlier_threshold: float = field(default=2) 103 | label_for_normal: bool = field(default=True) 104 | 105 | 106 | @dataclass 107 | class TextField(Config): 108 | field_name: str = field(default='command') 109 | lm_mode: str = field(default='char') 110 | ngram_range: tuple = field(default=(3, 5)) 111 | 112 | 113 | @dataclass 114 | class MultinomialField(Config): 115 | field_name: str = field(default='user') 116 | absolute_threshold: int = field(default=10) 117 | relative_threshold: float = field(default=0.1) 118 | group_by: str = field(default=None) 119 | 120 | 121 | @dataclass 122 | class LOLField(Config): 123 | field_name: str = field(default='command') 124 | platform: str = field(default='linux') 125 | 126 | 127 | @dataclass 128 | class NumericalFieldCombiner(Config): 129 | field_names: list = field(default_factory=lambda: []) 130 | normalize: bool = field(default=True) 131 | 132 | 133 | @dataclass 134 | class MultinomialFieldCombiner(Config): 135 | field_names: list = field(default_factory=lambda: []) 136 | absolute_threshold: float = field(default=500) 137 | relative_threshold: float = field(default=0.005) 138 | group_by: str = field(default=None) 139 | 140 | 141 | @dataclass 142 | class KeywordBased(Config): 143 | keyword_list: list = field(default_factory=lambda: []) 144 | field_name: str = field(default='count') 145 | 146 | 147 | @dataclass 148 | class KnowledgeBased(Config): 149 | rules_and_labels_tuple_list: list = field(default_factory=lambda: [()]) 150 | field_name: str = field(default='') 151 | 152 | # mfc = MultinomialFieldCombiner() 153 | # mfc.load('osas/etc/ad_config.conf') 154 | # print(vars(mfc)) 155 | -------------------------------------------------------------------------------- /src/osas/io_utils/formatter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # object type conversion/formatting utility functions 20 | import ast 21 | import json 22 | import sys 23 | 24 | 25 | def eval_str(x): 26 | try: 27 | return ast.literal_eval(x) 28 | except Exception as e: 29 | fstr = 'osas/io_utils/formatter.py:eval_str()' 30 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr) 31 | 32 | 33 | def dict_to_str(d): 34 | try: 35 | return json.dumps(d) 36 | except Exception as e: 37 | fstr = 'osas/io_utils/formatter.py:dict_to_str()' 38 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr) 39 | 40 | 41 | def str_to_dict(s): 42 | try: 43 | return json.loads(s) 44 | except Exception as e: 45 | fstr = 'osas/io_utils/formatter.py:str_to_dict()' 46 | print("[{}]Error--{}".format(fstr, e), file=sys.stderr) 47 | -------------------------------------------------------------------------------- /src/osas/main/README.md: -------------------------------------------------------------------------------- 1 | # Main execution folder 2 | -------------------------------------------------------------------------------- /src/osas/main/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/src/osas/main/__init__.py -------------------------------------------------------------------------------- /src/osas/main/apply_rules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2022 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import optparse 20 | import re 21 | import sys 22 | 23 | import tqdm 24 | from elasticsearch import helpers, Elasticsearch 25 | 26 | sys.path.append('') 27 | 28 | from osas.data.datasources import CSVDataSource, Datasource 29 | import yaml 30 | import os 31 | 32 | 33 | def is_numeric(obj): 34 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__'] 35 | return all(hasattr(obj, attr) for attr in attrs) 36 | 37 | 38 | def _get_all_yaml_files(root: str): 39 | all_files = [] 40 | for path, subdirs, files in os.walk(root): 41 | for name in files: 42 | if name.endswith('.yaml'): 43 | all_files.append(os.path.join(path, name)) 44 | return all_files 45 | 46 | 47 | def _load_rules(rules_folder: str) -> dict: 48 | all_rule_files = _get_all_yaml_files(rules_folder) 49 | all_rules = [] 50 | for file in all_rule_files: 51 | with open(file, 'r') as f: 52 | rules_pack = yaml.safe_load(f) 53 | if rules_pack is None: 54 | continue 55 | if 'rule name' not in rules_pack: 56 | sys.stdout.write('Invalid rule file {0}. Missing rule name\n'.format(file)) 57 | sys.exit(0) 58 | if 'rule label' not in rules_pack: 59 | sys.stdout.write('Invalid rule file {0}. Missing rule label\n'.format(file)) 60 | sys.exit(0) 61 | if 'rule score' not in rules_pack: 62 | sys.stdout.write('Invalid rule file {0}. Missing rule score\n'.format(file)) 63 | sys.exit(0) 64 | all_rules.append(rules_pack) 65 | return all_rules 66 | 67 | 68 | def _apply_rules(datasource: Datasource, rules: dict): 69 | scores = datasource['score'] 70 | labels = datasource['labels'] 71 | index = 0 72 | regex_cache = {} 73 | for item in tqdm.tqdm(datasource): 74 | for rule in rules: 75 | rule_name = rule['rule name'] 76 | rule_score = float(rule['rule score']) 77 | rule_label = rule['rule label'] 78 | cases = rule['conditions'] 79 | for case in cases: 80 | valid = True 81 | for attribute_name in cases[case]: 82 | attribute_values = cases[case][attribute_name] 83 | if not isinstance(attribute_values, list): 84 | attribute_values = [attribute_values] 85 | if attribute_name not in item: 86 | sys.stdout.write('Your dataset does not contain "{0}"\n'.format(attribute_name)) 87 | sys.exit(0) 88 | found = False 89 | for attribute_value in attribute_values: 90 | if attribute_value not in regex_cache: 91 | regex_cache[attribute_value] = re.compile(attribute_value) 92 | compiled_regex=regex_cache[attribute_value] 93 | if compiled_regex.match(item[attribute_name]): 94 | found = True 95 | break 96 | if not found: 97 | valid = False 98 | break 99 | if valid: 100 | scores[index] += rule_score 101 | if len(labels[index]) > 3: 102 | labels[index] = labels[index][:-1] + ', \'' + rule_label + '\']' 103 | else: 104 | labels[index] = '[\'{0}\']'.format(rule_label) 105 | index += 1 106 | 107 | datasource['_labels'] = labels 108 | 109 | 110 | def process(params): 111 | # load and run pipeline 112 | rules_pack = _load_rules(params.rules_folder) 113 | datasource = CSVDataSource(params.input_file) 114 | _apply_rules(datasource, rules_pack) 115 | 116 | # save, if necessary 117 | if params.output_file: 118 | datasource.save(open(params.output_file, 'w')) 119 | # push to elasticsearch 120 | if not params.no_elastic: 121 | try: 122 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}], http_auth=('admin', 'admin')) 123 | data = [item for item in datasource] 124 | helpers.bulk(es, data, index="anomalies", doc_type="type") 125 | except Exception as e: 126 | sys.stdout.write('Unable to push data to ElasticSearch: {0}\n'.format(str(e))) 127 | 128 | 129 | if __name__ == '__main__': 130 | parser = optparse.OptionParser() 131 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file') 132 | parser.add_option('--rules-folder', action='store', dest='rules_folder', help='location of rules') 133 | parser.add_option('--output-file', action='store', dest='output_file', help='output-file (optional)') 134 | parser.add_option('--no-elastic', action='store_true', dest='no_elastic', help='don\'t push data to Elastic') 135 | (params, _) = parser.parse_args(sys.argv) 136 | 137 | if params.input_file and params.rules_folder: 138 | if params.no_elastic and not params.output_file: 139 | sys.stdout.write("This run will not produce any results. You need to either specify --output-file or " 140 | "remove --no-elastic\n") 141 | else: 142 | process(params) 143 | else: 144 | parser.print_help() 145 | -------------------------------------------------------------------------------- /src/osas/main/autoconfig.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import optparse 20 | import sys 21 | import inspect 22 | 23 | sys.path.append('') 24 | from osas.data.datasources import CSVDataSource 25 | from osas.core import label_generators 26 | 27 | 28 | def _get_type(val): 29 | try: 30 | x = int(val) 31 | return 'int' 32 | except: 33 | try: 34 | x = float(val) 35 | return 'float' 36 | except: 37 | if val is None: 38 | return 'none' 39 | else: 40 | return 'str' 41 | 42 | 43 | def _detect_field_type(datasource, count_column=None): 44 | item = datasource[0] 45 | field_type = {key: 'int' for key in item} 46 | sys.stdout.write('\n') 47 | sys.stdout.flush() 48 | 49 | if count_column is None: 50 | count = len(datasource) 51 | else: 52 | count = 0 53 | 54 | for item in datasource: 55 | if count_column is not None: 56 | count += item[count_column] 57 | for key in item: 58 | t = _get_type(item[key]) 59 | if t == 'float': 60 | if field_type[key] == 'int': 61 | field_type[key] = t 62 | elif t == 'str': 63 | field_type[key] = t 64 | 65 | field2val = {} 66 | for item in datasource: 67 | for key in field_type: 68 | if field_type[key] == 'str' or field_type[key] == 'int' or field_type[key] == 'float': 69 | value = item[key] 70 | if key not in field2val: 71 | field2val[key] = {} 72 | if (len(field2val[key]) - 1) / count < 0.1: 73 | if value not in field2val[key]: 74 | field2val[key][value] = '1' 75 | for key in field2val: 76 | if len(field2val[key]) / count < 0.1: 77 | field_type[key] = 'multinomial' 78 | elif field_type[key] == 'str': 79 | field_type[key] = 'text' 80 | 81 | return field_type 82 | 83 | 84 | def _get_generators(datasource: CSVDataSource, field_types: dict): 85 | generator_list = [] 86 | for key in field_types: 87 | if field_types[key] == 'int' or field_types[key] == 'float': 88 | generator_list.append(['NumericField', [key]]) 89 | if field_types[key] == 'multinomial': 90 | generator_list.append(['MultinomialField', [key]]) 91 | if field_types[key] == 'text': 92 | generator_list.append(['TextField', [key]]) 93 | assigned = {} 94 | for key1 in field_types: 95 | for key2 in field_types: 96 | if field_types[key1] == 'multinomial' and field_types[key2] == 'multinomial' and \ 97 | (key2, key1) not in assigned and key1 != key2: 98 | generator_list.append(['MultinomialFieldCombiner', [key1, key2]]) 99 | assigned[(key1, key2)] = '1' 100 | 101 | generator_list = list(sorted(generator_list, key=lambda x: x[0])) 102 | 103 | return generator_list 104 | 105 | 106 | HEADER = """; OSAS autogenerated configuration file 107 | ; 108 | ; Below we provide a list of standard label generator templates - feel free to copy-paste and edit them 109 | ; in order to cope with your own dataset 110 | ; 111 | 112 | ; [LG_MULTINOMIAL] 113 | ; generator_type = MultinomialField 114 | ; field_name = 115 | ; absolute_threshold = 10 116 | ; relative_threshold = 0.1 117 | 118 | ; [LG_TEXT] 119 | ; generator_type = TextField 120 | ; field_name = 121 | ; lm_mode = char 122 | ; ngram_range = (3, 5) 123 | 124 | ; [LG_NUMERIC] 125 | ; generator_type = NumericField 126 | ; field_name = 127 | ; label_for_normal = False 128 | ; stdev = True 129 | ; stdev_borderline_threshold = 1 130 | ; stdev_outlier_threshold = 2 131 | ; spike = none # one of 'none', 'ratio', or 'fixed' 132 | ; spike_borderline_threshold = 10 133 | ; spike_outlier_threshold = 20 134 | 135 | ; [LG_MUTLINOMIAL_COMBINER] 136 | ; generator_type = MultinomialFieldCombiner 137 | ; field_names = ['', '', ...] 138 | ; absolute_threshold = 10 139 | ; relative_threshold = 0.1 140 | 141 | ; [LG_KEYWORD] 142 | ; generator_type = KeywordBased 143 | ; field_name = 144 | ; keyword_list = ['', '', '', ...] 145 | 146 | ; [LG_REGEX] 147 | ; generator_type = KnowledgeBased 148 | ; field_name = 149 | ; rules_and_labels_tuple_list = [('',''), ('',''), ...]""" 150 | 151 | 152 | def _write_conf(generators, filename, count_column=None): 153 | f = open(filename, 'w') 154 | f.write(HEADER) 155 | f.write('\n\n') 156 | if params.count_column: 157 | f.write('[GENERAL]\n') 158 | f.write('count_column={0}\n\n'.format(count_column)) 159 | count = 0 160 | for generator in generators: 161 | count += 1 162 | f.write('[LG_{0}]\n'.format(count)) 163 | f.write('generator_type = {0}\n'.format(generator[0])) 164 | dyn_class = getattr(sys.modules[label_generators.__name__], generator[0]) 165 | 166 | signature = inspect.signature(dyn_class.__init__) 167 | for param in signature.parameters.items(): 168 | param_name = param[1].name 169 | param_value = param[1].default 170 | if param_name == 'self': 171 | continue 172 | if param_name == 'field_name' or param_name == 'field_names': 173 | if len(generator[1]) == 1: 174 | param_value = generator[1][0] 175 | else: 176 | param_value = generator[1] 177 | f.write('{0} = {1}\n'.format(param_name, param_value)) 178 | f.write('\n') 179 | f.write('[AnomalyScoring]\nscoring_algorithm = StatisticalNGramAnomaly\n') 180 | f.close() 181 | 182 | 183 | def process(params): 184 | datasource = CSVDataSource(params.input_file) 185 | sys.stdout.write('Preprocessing') 186 | if params.count_column: 187 | cc = params.count_column 188 | else: 189 | cc = None 190 | field_type = _detect_field_type(datasource, count_column=cc) 191 | sys.stdout.write('\t::Detected field types:\n') 192 | for key in field_type: 193 | sys.stdout.write('\t\t"{0}": {1}\n'.format(key, field_type[key])) 194 | 195 | generators = _get_generators(datasource, field_type) 196 | sys.stdout.write('\t::Suggested generators:\n') 197 | for item in generators: 198 | sys.stdout.write('\t\t{0}: {1}\n'.format(item[0], item[1])) 199 | 200 | _write_conf(generators, params.output_file, count_column=params.count_column) 201 | 202 | 203 | if __name__ == '__main__': 204 | parser = optparse.OptionParser() 205 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file') 206 | parser.add_option('--output-file', action='store', dest='output_file', help='location of the output file') 207 | parser.add_option('--count-column', action='store', dest='count_column', 208 | help='if this value is set, OSAS will consider the data clustered and this column will indicate' 209 | 'the number of occurrences of the event. Otherwise, this number is considered equal to 1') 210 | (params, _) = parser.parse_args(sys.argv) 211 | 212 | if params.input_file and params.output_file: 213 | process(params) 214 | else: 215 | parser.print_help() 216 | -------------------------------------------------------------------------------- /src/osas/main/run_pipeline.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import optparse 20 | import sys 21 | import json 22 | from elasticsearch import helpers, Elasticsearch 23 | 24 | sys.path.append('') 25 | 26 | from src.osas.pipeline import Pipeline 27 | from osas.data.datasources import CSVDataSource, Datasource 28 | import numpy as np 29 | 30 | 31 | def is_numeric(obj): 32 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__'] 33 | return all(hasattr(obj, attr) for attr in attrs) 34 | 35 | 36 | def process(params): 37 | # load and run pipeline 38 | datasource = CSVDataSource(params.input_file) 39 | p = Pipeline('DEV') 40 | p.load_config(params.conf_file) 41 | p.load_model(params.model_file) 42 | p(datasource) 43 | # save, if necessary 44 | if params.output_file: 45 | datasource.save(open(params.output_file, 'w')) 46 | # push to elasticsearch 47 | if not params.no_elastic: 48 | try: 49 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}], http_auth=('admin', 'admin')) 50 | data = [item for item in datasource] 51 | for item in data: 52 | item['model'] = p._scoring_model_name 53 | item['raw'] = str(item['labels']) 54 | for key in item: 55 | if item[key] == 'NaN' or (is_numeric(item[key]) and np.isnan(item[key])): 56 | item[key] = None 57 | helpers.bulk(es, data, index="anomalies", doc_type="type") 58 | except Exception as e: 59 | sys.stdout.write('Unable to push data to ElasticSearch: {0}\n'.format(str(e))) 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = optparse.OptionParser() 64 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file') 65 | parser.add_option('--conf-file', action='store', dest='conf_file', help='location of pipeline configuration file') 66 | parser.add_option('--model-file', action='store', dest='model_file', help='location of pretrained pipeline file') 67 | parser.add_option('--output-file', action='store', dest='output_file', help='output-file (optional)') 68 | parser.add_option('--no-elastic', action='store_true', dest='no_elastic', help='don\'t push data to Elastic') 69 | (params, _) = parser.parse_args(sys.argv) 70 | 71 | if params.input_file and params.conf_file and params.model_file: 72 | if params.no_elastic and not params.output_file: 73 | sys.stdout.write("This run will not produce any results. You need to either specify --output-file or " 74 | "remove --no-elastic\n") 75 | else: 76 | process(params) 77 | else: 78 | parser.print_help() 79 | -------------------------------------------------------------------------------- /src/osas/main/train_pipeline.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import optparse 20 | import sys 21 | import json 22 | 23 | sys.path.append('') 24 | 25 | from src.osas.pipeline import Pipeline 26 | from osas.data.datasources import CSVDataSource, Datasource 27 | 28 | 29 | def is_numeric(obj): 30 | attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__'] 31 | return all(hasattr(obj, attr) for attr in attrs) 32 | 33 | 34 | def process(params): 35 | # load and run pipeline 36 | datasource = CSVDataSource(params.input_file) 37 | p = Pipeline('DEV') 38 | p.load_config(params.conf_file) 39 | if params.incremental: 40 | p.load_model(params.orig_model_file) 41 | model = p.build_pipeline(datasource, incremental=params.incremental) 42 | json.dump(model, open(params.model_file, 'w'), indent=4) 43 | 44 | 45 | if __name__ == '__main__': 46 | parser = optparse.OptionParser() 47 | parser.add_option('--input-file', action='store', dest='input_file', help='location of the input file') 48 | parser.add_option('--conf-file', action='store', dest='conf_file', help='location of pipeline configuration file') 49 | parser.add_option('--model-file', action='store', dest='model_file', 50 | help='location where to store the pretrained pipeline file') 51 | parser.add_option('--orig-model-file', action='store', dest='orig_model_file', 52 | help='location where to store the pretrained pipeline file') 53 | parser.add_option('--incremental', action='store_true', help='perform incremental update on the model (will load ' 54 | '--orig-model-file and save at location specified by ' 55 | '--model-file)') 56 | 57 | (params, _) = parser.parse_args(sys.argv) 58 | 59 | if params.input_file and params.conf_file and params.model_file: 60 | if params.incremental and params.orig_model_file: 61 | process(params) 62 | else: 63 | if params.incremental: 64 | print("Must specify --orig-model-file") 65 | elif params.orig_model_file: 66 | print("--orig-model-file must be used with --incremental") 67 | else: 68 | process(params) 69 | else: 70 | parser.print_help() 71 | -------------------------------------------------------------------------------- /src/osas/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/src/osas/models/__init__.py -------------------------------------------------------------------------------- /src/osas/models/pipeline.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import sys 20 | 21 | sys.path.append('') 22 | from osas.core.interfaces import Datasource 23 | 24 | 25 | class CustomizablePipeline: 26 | def __init__(self): 27 | pass 28 | 29 | def __call__(self, dataset: Datasource) -> None: 30 | pass 31 | -------------------------------------------------------------------------------- /src/osas/pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Main pipeline folder 2 | -------------------------------------------------------------------------------- /src/osas/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adobe/OSAS/5d8693b6c483f99f4339e7ceb8c8d041d778c61a/src/osas/pipeline/__init__.py -------------------------------------------------------------------------------- /src/osas/pipeline/detect_anomalies.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import os 20 | import sys 21 | 22 | sys.path.append('') 23 | 24 | from osas.core import anomaly 25 | from osas.io_utils import config 26 | from osas.core.interfaces import AnomalyDetection, Datasource 27 | 28 | 29 | class DetectAnomalies(): 30 | ''' class for anomalies detection wrapper methods ''' 31 | 32 | def __init__(self, env: str = 'DEV'): 33 | os.environ["OSAS_ENV"] = env 34 | 35 | def detection_model(self, name: str, load_config: bool = False): 36 | '''get model specified by name''' 37 | # get anomaly detection type by name 38 | dmClass = getattr(sys.modules[anomaly.__name__], name) 39 | # get label gen obj 40 | dm = dmClass() 41 | return dm 42 | 43 | def build_model(self, model: AnomalyDetection, dataset: Datasource) -> dict: 44 | return model.build_model(dataset) 45 | 46 | def get_scores(self, model: AnomalyDetection, dataset: Datasource) -> [float]: 47 | return model.__call__(dataset) 48 | 49 | def get_pretrained_model(self, modelName: str, pretrained_data: str) -> AnomalyDetection: 50 | dmClass = getattr(sys.modules[anomaly.__name__], modelName) 51 | return dmClass.from_pretrained(pretrained_data) 52 | -------------------------------------------------------------------------------- /src/osas/pipeline/fetch_data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import os 20 | import sys 21 | 22 | sys.path.append('') 23 | 24 | from src.osas.pipeline import Pipeline 25 | from osas.data import datasources 26 | from osas.core.interfaces import Datasource 27 | from osas.io_utils import config 28 | 29 | 30 | class FetchData(Pipeline): 31 | ''' class for data fetching ''' 32 | 33 | def __init__(self, env: str): 34 | Pipeline.__init__(self, env) 35 | os.environ["UBA_ENV"] = env 36 | 37 | def datasource(self, name: str, load_config: str=None) -> Datasource: 38 | '''datasource generic method''' 39 | dsClass = getattr(sys.modules[datasources.__name__], name) 40 | # get args for datasource 41 | cfg = getattr(sys.modules[config.__name__], name)() 42 | if load_config: 43 | cfg.load(load_config) 44 | ds = dsClass(**(vars(cfg))) # convert obj to dict to kwargs 45 | return ds 46 | -------------------------------------------------------------------------------- /src/osas/pipeline/groom_data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import os 20 | import sys 21 | import json 22 | 23 | sys.path.append('') 24 | 25 | # from osas.pipeline.pipeline import Pipeline 26 | from osas.core import label_generators 27 | from osas.io_utils import config 28 | from osas.core.interfaces import LabelGenerator, Datasource 29 | import configparser 30 | 31 | 32 | class GroomData(): 33 | ''' class for data grooming wrapper methods ''' 34 | 35 | def __init__(self, env: str = 'DEV'): 36 | # Pipeline.__init__(self, env) 37 | os.environ["OSAS_ENV"] = env 38 | 39 | def label_generator(self, name: str, 40 | load_config: str = None) -> LabelGenerator: 41 | '''generate label specified by name''' 42 | # get label generator class from name 43 | lgClass = getattr(sys.modules[label_generators.__name__], name) 44 | # get args for the label generator 45 | cfg = getattr(sys.modules[config.__name__], name)() 46 | if load_config: 47 | if isinstance(load_config, configparser.SectionProxy): 48 | cfg = load_config 49 | else: 50 | cfg.load(load_config) 51 | # get label gen obj 52 | # di = {key: eval(cfg[key]) for key in cfg} 53 | di = {} 54 | for key in cfg: 55 | try: 56 | val = eval(cfg[key]) 57 | except: 58 | val = cfg[key] 59 | di[key] = val 60 | del di['generator_type'] 61 | lg = lgClass(**di) # convert obj to dict to kwargs 62 | return lg 63 | 64 | def from_pretrained(self, name: str, 65 | pretrained: dict) -> LabelGenerator: 66 | '''generate label specified by name''' 67 | # get label generator class from name 68 | lgClass = getattr(sys.modules[label_generators.__name__], name) 69 | # get args for the label generator 70 | cfg = getattr(sys.modules[config.__name__], name)() 71 | return lgClass.from_pretrained(json.dumps(pretrained)) 72 | # if load_config: 73 | # if isinstance(load_config, configparser.SectionProxy): 74 | # cfg = load_config 75 | # else: 76 | # cfg.load(load_config) 77 | # # get label gen obj 78 | # # di = {key: eval(cfg[key]) for key in cfg} 79 | # di = {} 80 | # for key in cfg: 81 | # try: 82 | # val = eval(cfg[key]) 83 | # except: 84 | # val = cfg[key] 85 | # di[key] = val 86 | # del di['generator_type'] 87 | # lg = lgClass(**di) # convert obj to dict to kwargs 88 | # return lg 89 | 90 | def build_model(self, model: LabelGenerator, 91 | dataset: Datasource, count_column: str) -> dict: 92 | return model.build_model(dataset, count_column) 93 | 94 | def get_labels(self, model: LabelGenerator, 95 | input_object: dict) -> [str]: 96 | return model.__call__(input_object) 97 | 98 | def get_pretrained_model(self, modelName: str, 99 | pretrained_data: str) -> LabelGenerator: 100 | lgClass = getattr(sys.modules[label_generators.__name__], 101 | modelName) 102 | return lgClass.from_pretrained(pretrained_data) 103 | -------------------------------------------------------------------------------- /src/osas/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | import configparser 20 | import os 21 | import sys 22 | from ast import literal_eval 23 | 24 | sys.path.append('') 25 | from src.osas.pipeline import GroomData 26 | from osas.data.datasources import CSVDataSource, Datasource 27 | from src.osas.pipeline import DetectAnomalies 28 | import json 29 | 30 | 31 | class Pipeline: 32 | ''' base class contains all template methods ''' 33 | env = None 34 | root_dir = None 35 | config = None 36 | 37 | def __init__(self, env): 38 | ''' 39 | init args 40 | - obj 41 | - env var 42 | ''' 43 | # global vars set as env vars 44 | Pipeline.env = env 45 | os.environ["OSAS_ENV"] = env # PROD/STAGE/DEV 46 | curr_dir = os.path.dirname(os.path.realpath(__file__)) 47 | Pipeline.root_dir = os.path.realpath(os.path.join(curr_dir, "../")) 48 | self._pipeline = [] 49 | self._detect_anomalies = None 50 | self._count_column = None 51 | 52 | def load_config(self, config_file, env='DEV'): 53 | ''' 54 | load configs 55 | args: 56 | - obj 57 | - configfile path 58 | - env 59 | ''' 60 | with open(config_file, "r") as f: 61 | cfg = configparser.RawConfigParser() 62 | cfg.read_file(f) 63 | self.config = cfg 64 | 65 | self._scoring_model_name = self.config['AnomalyScoring']['scoring_algorithm'] 66 | 67 | if 'GENERAL' in self.config: 68 | if 'count_column' in self.config['GENERAL']: 69 | self._count_column = self.config['GENERAL']['count_column'] 70 | 71 | def load_model(self, model_file, env='DEV'): 72 | ''' 73 | Loads a pretrained model for the current configuration 74 | :param model_file: json file where pretrained model was stored 75 | :param env: environment type 76 | :return: None 77 | ''' 78 | pretrained = json.load(open(model_file)) 79 | gd = GroomData() 80 | self._pipeline = [] 81 | for sect in self.config: 82 | print('\t::{0}'.format(sect)) 83 | if 'generator_type' in self.config[sect]: 84 | self._pipeline.append(gd.from_pretrained(self.config[sect]['generator_type'], 85 | pretrained['model'][sect])) 86 | da = DetectAnomalies() 87 | self._detect_anomalies = da.get_pretrained_model(self._scoring_model_name, json.dumps(pretrained['scoring'])) 88 | 89 | def build_pipeline(self, dataset: Datasource, incremental=False) -> dict: 90 | ''' 91 | Generates a JSON serializable object that contains data for all pretrained label generators 92 | :param dataset: dataset to train the model on 93 | :return: serializable dict object 94 | ''' 95 | gd = GroomData() 96 | ex_pipeline = self._pipeline 97 | self._pipeline = [] 98 | final_model = {'model': {}} 99 | index = 0 100 | for sect in self.config: 101 | print('\t::{0}'.format(sect)) 102 | if 'generator_type' in self.config[sect]: 103 | for key in self.config[sect]: 104 | print("\t\t::{0} = {1}".format(key, self.config[sect][key])) 105 | if incremental: 106 | lg = ex_pipeline[index] 107 | else: 108 | lg = gd.label_generator(self.config[sect]['generator_type'], self.config[sect]) 109 | index += 1 110 | print("\t\t::OBJECT: {0}".format(lg)) 111 | sys.stdout.write('\t\t::BUILDING MODEL...') 112 | sys.stdout.flush() 113 | lg_model = gd.build_model(lg, dataset, count_column=self._count_column) 114 | final_model['model'][sect] = lg_model 115 | sys.stdout.write('done\n') 116 | self._pipeline.append(lg) 117 | # remove anomaly detection update (not all models support incremental because of sklearn dependencies) 118 | # if incremental: 119 | # final_model['scoring'] = self._detect_anomalies 120 | # return final_model 121 | 122 | self(dataset, dest_field_labels='_labels') 123 | da = DetectAnomalies() 124 | if not incremental: 125 | self._detect_anomalies = da.detection_model(self.config['AnomalyScoring']['scoring_algorithm'], 126 | load_config=False) 127 | # check for classifier scoring and if so, add grouth truth column and classifier as param 128 | if self.config['AnomalyScoring']['scoring_algorithm'] == 'SupervisedClassifierAnomaly': 129 | ground_truth_column = self.config['AnomalyScoring']['ground_truth_column'] 130 | classifier = self.config['AnomalyScoring']['classifier'] 131 | # grab function args for model init from rest of conf variables 132 | init_args = dict(self.config['AnomalyScoring']) 133 | del init_args['scoring_algorithm'] 134 | del init_args['ground_truth_column'] 135 | del init_args['classifier'] 136 | # convert config values to inferred types, safely 137 | for k in init_args: 138 | try: 139 | init_args[k] = literal_eval(init_args[k]) 140 | except: 141 | # it will be a string otherwise 142 | pass 143 | # build model 144 | scoring_model = self._detect_anomalies.build_model(dataset, 145 | ground_truth_column, 146 | classifier, 147 | init_args, 148 | incremental=incremental) 149 | else: 150 | scoring_model = self._detect_anomalies.build_model(dataset, incremental=incremental) 151 | final_model['scoring'] = scoring_model 152 | return final_model 153 | 154 | def __call__(self, dataset: Datasource, dest_field_labels='labels', dest_field_score='score'): 155 | all_labels = [] 156 | for item in dataset: 157 | label_list = [] 158 | for lg in self._pipeline: 159 | llist = lg(item) 160 | for label in llist: 161 | label_list.append(label) 162 | all_labels.append(label_list) 163 | dataset[dest_field_labels] = all_labels 164 | dataset['_labels'] = all_labels 165 | if self._detect_anomalies is not None: 166 | scores = self._detect_anomalies(dataset) 167 | dataset[dest_field_score] = scores 168 | 169 | 170 | if __name__ == '__main__': 171 | p = Pipeline('DEV') 172 | p.load_config('tests/pipeline_test.conf') 173 | import time 174 | 175 | ts1 = time.time() 176 | datasource = CSVDataSource('tests/test_small.csv') 177 | ts2 = time.time() 178 | pipeline_model = p.build_pipeline(datasource) 179 | ts3 = time.time() 180 | p(datasource) 181 | ts4 = time.time() 182 | json.dump(pipeline_model, open('tests/pipeline.json', 'w'), indent=4) 183 | for item in datasource[:10]: 184 | print(item) 185 | print() 186 | print() 187 | 188 | print( 189 | "Timing:\n\tLoad dataset: {0}\n\tBuild pipeline: {1}\n\tApply models:{2}\n\tDataset size: {3} entries\n".format( 190 | ts2 - ts1, ts3 - ts2, ts4 - ts3, len(datasource))) 191 | 192 | # load 193 | p = Pipeline('DEV') 194 | p.load_config('tests/pipeline_test.conf') 195 | p.load_model('tests/pipeline.json') 196 | p(datasource) 197 | 198 | for item in datasource[:10]: 199 | print(item) 200 | print() 201 | print() 202 | -------------------------------------------------------------------------------- /src/osas/templates/config_manual_update.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 15 | 16 | 17 | 18 | {% if not input %} 19 |

Available config files in tests folder:

20 | {% endif %} 21 | 22 | {% if input %} 23 |

Selected config:

24 | {% endif %} 25 | 26 | {% if not input %} 27 | {%for i in range(0, len)%} 28 | 29 |

{{files[i]}}

30 | {%endfor%} 31 | {% endif %} 32 | {% if not input %} 33 |

Please input a valid config file from the list above

34 | {% endif %} 35 |
36 |
37 | 38 | 39 | 40 | 45 |

46 | 47 | 48 | {% if input %} 49 | 50 | 51 | .conf

52 |
    53 | 54 | 55 | 56 | 61 | 62 | 73 | 74 |

    75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 |

    86 | {%for i in range(0, len_config)%} 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | {%endfor%} 95 | 96 | 97 |
    CheckIDGenerator TypeFields

    {{config_obj[i][0]}}

    {{config_obj[i][1]}}

    {{config_obj[i][2]}}

    98 | 99 | 100 | {% endif %} 101 | 102 |

    103 | 104 |
105 |
106 | 107 | 108 |

109 | 110 |
111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /src/osas/templates/config_static.txt: -------------------------------------------------------------------------------- 1 | ; Other supported LG types: 2 | ; [KB_LG] 3 | ; generator_type = KeywordBased 4 | ; field_name = process 5 | ; keyword_list = ['java', 'bash', 'test'] 6 | ; 7 | ; [KBB_LG] 8 | ; generator_type = KnowledgeBased 9 | ; field_name = process 10 | ; rules_and_labels_tuple_list = [('*java*','java'), ('*/dev/tcp/*','tcp')] -------------------------------------------------------------------------------- /src/osas/templates/config_text_edit.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

Confirm/manual edit config file

9 | 10 |
    11 |
    12 | 13 | 18 |

    19 | 20 |

    21 |
    22 | 23 |

    24 | 25 |
    26 | 27 |
    28 |
29 | 30 | 31 | -------------------------------------------------------------------------------- /src/osas/templates/console.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 136 | 137 | -------------------------------------------------------------------------------- /src/osas/templates/generate_config.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

Available files in tests folder:

9 | {%for i in range(0, len)%} 10 | 11 |

{{files[i]}}

12 | {%endfor%} 13 |

Please input a valid dataset from the list above

14 |
15 |
16 | 17 | 22 |

23 | 24 | .conf

25 |
26 | 27 |
28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/osas/templates/run_full_process.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

Available files in tests folder:

9 | {%for i in range(0, len)%} 10 | 11 |

{{files[i]}}

12 | {%endfor%} 13 |

Please input a valid dataset from the list above

14 |
15 |
16 | 17 | 22 |

23 | 24 | .csv

25 |
26 | 27 |
28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/osas/templates/run_pipeline.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |

Available datasets in tests folder:

11 | {%for i in range(0, len_dataset)%} 12 | 13 |

{{dataset[i]}}

14 | {%endfor%} 15 |

Available config in tests folder:

16 | {%for i in range(0, len)%} 17 |

{{files[i]}}

18 | {%endfor%} 19 |

Available pipelines in tests folder:

20 | {%for i in range(0, len_pipeline)%} 21 | 22 |

{{pipeline[i]}}

23 | {%endfor%} 24 |

Please input a valid config file from the list above

25 |
26 |
27 | 28 | 33 |

34 | 35 | 36 | 41 |

42 | 43 | 44 | 45 | 50 |

51 | 52 | 53 | .csv

54 |
55 | 56 |
57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /src/osas/templates/train_pipeline.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |

Available datasets in tests folder:

11 | {%for i in range(0, len_dataset)%} 12 | 13 |

{{dataset[i]}}

14 | {%endfor%} 15 | 16 |

Available config in tests folder:

17 | {%for i in range(0, len)%} 18 | 19 |

{{files[i]}}

20 | {%endfor%} 21 | 22 |

Please input a valid config file from the list above

23 |
24 |
25 | 26 | 31 |

32 | 33 | 34 | 39 |

40 | 41 | 42 | .model

43 |
44 | 45 |
46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/osas/webserver.py: -------------------------------------------------------------------------------- 1 | # 2 | # Authors: Security Intelligence Team within the Security Coordination Center 3 | # 4 | # Copyright (c) 2018 Adobe Systems Incorporated. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | from flask import Flask 20 | from flask import Response 21 | from flask import request 22 | from flask import render_template, send_from_directory, send_file 23 | from os import listdir 24 | from os.path import isfile, join 25 | import subprocess 26 | import configparser 27 | import pty 28 | import os 29 | import threading 30 | import shlex 31 | import select 32 | import struct 33 | import termios 34 | import fcntl 35 | 36 | if os.path.isdir('/app'): 37 | data_path='/app/' 38 | else: 39 | data_path = 'tests/' 40 | 41 | app = Flask(__name__) 42 | pty_buffer = [] 43 | 44 | 45 | @app.route('/', defaults={'path': ''}) 46 | @app.route('/osas') 47 | def index(): 48 | text = '''
OSAS server is running
49 |
For console interaction, go to http://127.0.0.1:8888/osas/console and follow the steps
50 |
For automated pipeline, go to http://127.0.0.1:8888/osas/run_full_process
51 |
For custom pipeline, go to http://127.0.0.1:8888/osas/generate_config and follow the steps
52 | ''' 53 | return text 54 | 55 | 56 | @app.route('/osas/static/') 57 | def assets(filename): 58 | # Add custom handling here. 59 | # Send a file download response. 60 | # print(path) 61 | print(filename) 62 | return send_file('templates/static/{0}'.format(filename)) 63 | 64 | 65 | @app.route('/osas/console', methods=['GET', 'POST']) 66 | def console_print(): 67 | return render_template("console.html") 68 | 69 | 70 | @app.route('/osas/console/read', methods=['GET', 'POST']) 71 | def console_read(): 72 | global pty_buffer 73 | tmp = pty_buffer 74 | pty_buffer = [] 75 | 76 | return ''.join([chr(c) for c in tmp]) 77 | 78 | 79 | @app.route('/osas/console/size', methods=['GET', 'POST']) 80 | def console_size(): 81 | xpix = 0 82 | ypix = 0 83 | 84 | global pty_fd 85 | data = request.json 86 | print(data) 87 | winsize = struct.pack("HHHH", data['row'], data['col'], xpix, ypix) 88 | fcntl.ioctl(pty_fd, termios.TIOCSWINSZ, winsize) 89 | return '' 90 | 91 | 92 | @app.route('/osas/console/write', methods=['GET', 'POST']) 93 | def console_write(): 94 | data = request.json 95 | # print(data) 96 | global pty_fd 97 | data = data['asciiKey'].encode() 98 | # print(data) 99 | os.write(pty_fd, data) 100 | 101 | global pty_buffer 102 | tmp = pty_buffer 103 | pty_buffer = [] 104 | # print("returning {0}".format(tmp)) 105 | return ''.join([chr(c) for c in tmp]) 106 | 107 | 108 | pty_fd = None 109 | 110 | 111 | def pty_read(f): 112 | global pty_fd 113 | pty_fd = f 114 | 115 | def rthread(fd): 116 | while (True): 117 | import time 118 | time.sleep(0.02) 119 | (data_ready, _, _) = select.select([fd], [], [], 0) 120 | if data_ready: 121 | global pty_buffer 122 | data = os.read(fd, 1024 * 1024) 123 | # print(str(data)) 124 | pty_buffer += data # data.decode("utf-8") 125 | 126 | x = threading.Thread(target=rthread, args=(f,), daemon=True) 127 | x.start() 128 | 129 | 130 | # def pty_start(): 131 | # pty.spawn("bash", pty_read) 132 | # 133 | # 134 | # x = threading.Thread(target=pty_start, args=(), daemon=True) 135 | # x.start() 136 | 137 | (child_pid, fd) = pty.fork() 138 | if child_pid == 0: 139 | # this is the child process fork. 140 | # anything printed here will show up in the pty, including the output 141 | # of this subprocess 142 | subprocess.run("bash") 143 | else: 144 | # this is the parent process fork. 145 | # store child fd and pid 146 | # app.config["fd"] = fd 147 | # app.config["child_pid"] = child_pid 148 | # set_winsize(fd, 50, 50) 149 | pty_fd = fd 150 | os.write(pty_fd, 'export TERM=xterm\n'.encode()) 151 | cmd = " ".join(shlex.quote(c) for c in "bash") 152 | print("child pid is", child_pid) 153 | print( 154 | f"starting background task with command `{cmd}` to continously read " 155 | "and forward pty output to client" 156 | ) 157 | # socketio.start_background_task(target=read_and_forward_pty_output) 158 | print("task started") 159 | print(pty_fd) 160 | pty_read(pty_fd) 161 | 162 | 163 | @app.route('/osas/generate_config', methods=['GET', 'POST']) 164 | def generate_config(): 165 | print(request.method) 166 | if request.method == 'GET': 167 | onlyfiles = [f for f in listdir(data_path) if 168 | isfile(join(data_path, f)) and '.conf' not in f and 'pipeline' not in f and '.model' not in f] 169 | files = onlyfiles 170 | 171 | return render_template("generate_config.html", files=files, len=len(files)) 172 | 173 | if request.method == 'POST': 174 | data = request.form.to_dict() 175 | # print(data) 176 | input = data['input'] 177 | output = data['output'] 178 | print(input) 179 | print(output) 180 | if '.conf' not in output: 181 | output += '.conf' 182 | 183 | def inner(): 184 | proc = subprocess.Popen(['python3 osas/main/autoconfig.py --input-file={} --output-file={} 2>&1'.format( 185 | data_path + input, data_path + output)], shell=True, stdout=subprocess.PIPE) 186 | 187 | for line in iter(proc.stdout.readline, ''): 188 | try: 189 | yield line.rstrip().decode('ascii') + '
\n' 190 | except: 191 | a = None 192 | poll = proc.poll() 193 | if poll is not None: 194 | yield 'DONE!
\n' 195 | full_text = """go to http://127.0.0.1:8888/osas/confirm_config 196 | """ 201 | # yield 'go to http://127.0.0.1:8888/osas/confirm_config' 202 | yield full_text 203 | break 204 | 205 | # 206 | return Response(inner(), mimetype='text/html') 207 | # return request.data 208 | 209 | 210 | @app.route('/osas/confirm_config', methods=['GET', 'POST']) 211 | def confirm_config(): 212 | config = configparser.ConfigParser() 213 | print(request.method) 214 | 215 | if request.method == 'GET': 216 | onlyfiles = [f for f in listdir(data_path) if 217 | isfile(join(data_path, f)) and '.conf' in f and 'pipeline' not in f] 218 | files = onlyfiles 219 | return render_template("config_manual_update.html", files=files, len=len(files)) 220 | 221 | if request.method == 'POST': 222 | print(request.form) 223 | print('here') 224 | input = request.form['input'] 225 | try: 226 | output = request.form['output'] 227 | except: 228 | output = None 229 | try: 230 | text_box = request.form['text_box'] 231 | except: 232 | text_box = None 233 | 234 | if output == None and text_box == None: 235 | files = [str(input)] 236 | config_data = 'data' 237 | config.read(data_path + input) 238 | # print(config.sections()) 239 | config_obj = [] 240 | for section in config.sections(): 241 | elem = [] 242 | if section == 'AnomalyScoring': 243 | a = 1 244 | else: 245 | 246 | elem.append(section) 247 | elem.append(config[section]['generator_type']) 248 | try: 249 | elem.append(config[section]['field_name']) 250 | except: 251 | elem.append(config[section]['field_names']) 252 | 253 | config_obj.append(elem) 254 | 255 | # print(config_obj) 256 | output = "tailored_" + input.replace('.conf', '') 257 | Anomaly_list = ['StatisticalNGramAnomaly', 'SVDAnomaly', 'LOFAnomaly', 'IFAnomaly', 'SupervisedClassifierAnomaly'] 258 | return render_template("config_manual_update.html", files=files, len=len(files), config=config_data, 259 | input=input, config_obj=config_obj, len_config=len(config_obj), 260 | anomaly_alg=Anomaly_list, output=output) 261 | 262 | elif output != None: 263 | data = request.form.to_dict() 264 | output = data['output'] + '.conf' 265 | data.pop('output') 266 | input = data['input'] 267 | data.pop('input') 268 | Anomaly = data['Anomaly'] 269 | data.pop('Anomaly') 270 | ground_truth_column = data['ground-truth-column'] 271 | data.pop('ground-truth-column') 272 | classifier = data['classifier'] 273 | data.pop('classifier') 274 | model_args = data['model-args'] 275 | data.pop('model-args') 276 | labels = list(data.keys()) 277 | print(labels) 278 | 279 | config.read(data_path + input) 280 | new_config = configparser.ConfigParser() 281 | for label in labels: 282 | print(config[label]) 283 | new_config[label] = config[label] 284 | new_config['AnomalyScoring'] = config['AnomalyScoring'] 285 | new_config['AnomalyScoring']['scoring_algorithm'] = Anomaly 286 | if Anomaly == 'SupervisedClassifierAnomaly': 287 | new_config['AnomalyScoring']['ground_truth_column'] = ground_truth_column 288 | new_config['AnomalyScoring']['classifier'] = classifier 289 | model_args = model_args.split('\n') 290 | for model_arg in model_args: 291 | model_arg = model_arg.split('=') 292 | new_config['AnomalyScoring'][model_arg[0].strip()] = model_arg[1].strip() 293 | with open(data_path + output, 'w') as configfile: 294 | new_config.write(configfile) 295 | input_data = open('osas/templates/config_static.txt', 'r').read() + "\n\n" + open(data_path + output, 296 | 'r').read() 297 | print(output) 298 | # print(input_data) 299 | return render_template("config_text_edit.html", input=[output], input_data=input_data) 300 | 301 | elif output == None and text_box != None: 302 | data = request.form.to_dict() 303 | input = data['input'] 304 | text_box = data['text_box'] 305 | 306 | with open(data_path + input, 'w') as configfile: 307 | configfile.write(text_box) 308 | return '' 309 | 310 | 311 | @app.route('/osas/train_pipeline', methods=['GET', 'POST']) 312 | def train_pipeline(): 313 | print(request.method) 314 | if request.method == 'GET': 315 | onlyfiles = [f for f in listdir(data_path) if isfile(join(data_path, f)) and '.conf' in f and '.model' not in f] 316 | files = onlyfiles 317 | 318 | onlyfiles_dataset = [f for f in listdir(data_path) if 319 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f] 320 | dataset = onlyfiles_dataset 321 | 322 | return render_template("train_pipeline.html", files=files, len=len(files), dataset=dataset, 323 | len_dataset=len(dataset)) 324 | 325 | if request.method == 'POST': 326 | input = request.form['input'] 327 | input_conf = request.form['input_conf'] 328 | 329 | output = request.form['output'] 330 | print(input) 331 | print(output) 332 | if '.model' not in output: 333 | output += '.model' 334 | 335 | def inner(): 336 | proc = subprocess.Popen([ 337 | 'python3 osas/main/train_pipeline.py --input-file={} --conf-file={} --model-file={} 2>&1'.format( 338 | data_path + input, data_path + input_conf, data_path + output)], shell=True, 339 | stdout=subprocess.PIPE) 340 | 341 | for line in iter(proc.stdout.readline, ''): 342 | try: 343 | yield line.rstrip().decode('ascii') + '
\n' 344 | except: 345 | a = None 346 | poll = proc.poll() 347 | if poll is not None: 348 | yield 'DONE!
\n' 349 | # yield 'go to http://127.0.0.1:8888/osas/run_pipeline' 350 | full_text = """go to http://127.0.0.1:8888/osas/run_pipeline 351 | """ 356 | yield full_text 357 | break 358 | 359 | # 360 | return Response(inner(), mimetype='text/html') 361 | 362 | 363 | @app.route('/osas/run_pipeline', methods=['GET', 'POST']) 364 | def run_pipeline(): 365 | print(request.method) 366 | if request.method == 'GET': 367 | onlyfiles = [f for f in listdir(data_path) if 368 | isfile(join(data_path, f)) and '.conf' in f and 'pipeline' not in f] 369 | files = onlyfiles 370 | 371 | onlyfiles_dataset = [f for f in listdir(data_path) if 372 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f] 373 | dataset = onlyfiles_dataset 374 | 375 | onlyfiles_dataset = [f for f in listdir(data_path) if isfile(join(data_path, f)) and '.model' in f] 376 | pipeline = onlyfiles_dataset 377 | 378 | return render_template("run_pipeline.html", files=files, len=len(files), dataset=dataset, 379 | len_dataset=len(dataset), pipeline=pipeline, len_pipeline=len(pipeline)) 380 | 381 | if request.method == 'POST': 382 | input = request.form['input'] 383 | input_conf = request.form['input_conf'] 384 | model_conf = request.form['model_conf'] 385 | 386 | output = request.form['output'] 387 | print(input) 388 | print(output) 389 | if '.csv' not in output: 390 | output += '.csv' 391 | 392 | def inner(): 393 | proc = subprocess.Popen([ 394 | 'python3 osas/main/run_pipeline.py --input-file={} --conf-file={} --model-file={} --output-file={} 2>&1'.format( 395 | data_path + input, data_path + input_conf, data_path + model_conf, 396 | data_path + output)], shell=True, stdout=subprocess.PIPE) 397 | 398 | for line in iter(proc.stdout.readline, ''): 399 | try: 400 | yield line.rstrip().decode('ascii') + '
\n' 401 | except: 402 | a = None 403 | poll = proc.poll() 404 | if poll is not None: 405 | yield 'DONE!
\n' 406 | # yield 'go to kibana http://127.0.0.1:5601' 407 | full_text = """go to http://127.0.0.1:5601 408 | """ 413 | yield full_text 414 | 415 | break 416 | 417 | # 418 | return Response(inner(), mimetype='text/html') 419 | 420 | 421 | @app.route('/osas/run_full_process', methods=['GET', 'POST']) 422 | def run_full_process(): 423 | print(request.method) 424 | if request.method == 'GET': 425 | onlyfiles = [f for f in listdir(data_path) if 426 | isfile(join(data_path, f)) and '.conf' not in f and '.model' not in f] 427 | files = onlyfiles 428 | 429 | return render_template("run_full_process.html", files=files, len=len(files)) 430 | 431 | if request.method == 'POST': 432 | input = request.form['input'] 433 | output = request.form['output'] 434 | print(input) 435 | print(output) 436 | if '.csv' not in output: 437 | output += '.csv' 438 | 439 | def inner(): 440 | import datetime 441 | stamp = str(datetime.datetime.now())[0:19].replace(' ', '_').replace(':', '_') 442 | key = input.split('.')[0] + "_" + stamp 443 | commands = [] 444 | commands.append( 445 | 'python3 osas/main/autoconfig.py --input-file={} --output-file={}.conf 2>&1'.format(data_path + input, 446 | data_path + key)) 447 | commands.append( 448 | 'python3 osas/main/train_pipeline.py --input-file={} --conf-file={}.conf --model-file={}.model 2>&1'.format( 449 | data_path + input, data_path + key, data_path + key)) 450 | commands.append( 451 | 'python3 osas/main/run_pipeline.py --input-file={} --conf-file={}.conf --model-file={}.model --output-file={} 2>&1'.format( 452 | data_path + input, data_path + key, data_path + key, data_path + output)) 453 | 454 | for command in commands: 455 | yield command + '
\n' + '
\n' 456 | proc = subprocess.Popen([command], shell=True, stdout=subprocess.PIPE) 457 | 458 | for line in iter(proc.stdout.readline, ''): 459 | 460 | try: 461 | yield line.rstrip().decode('ascii') + '
\n' 462 | except: 463 | a = None 464 | poll = proc.poll() 465 | if poll is not None: 466 | yield 'DONE!
\n' 467 | yield 'NEXT:
\n' 468 | break 469 | yield 'go to kibana http://127.0.0.1:5601' 470 | 471 | # 472 | return Response(inner(), mimetype='text/html') 473 | 474 | 475 | app.run(port=8888, host='0.0.0.0', debug=True) 476 | --------------------------------------------------------------------------------