├── .coverage ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Readme.md ├── analysis └── luda_analysis.ipynb ├── conf.py ├── docker-compose.yml ├── main.py ├── requirements.txt ├── src ├── __init__.py ├── clustering │ ├── __init__.py │ ├── distance_matrix.py │ ├── metrics.py │ └── swalign.py ├── feeder │ ├── __init__.py │ ├── alexa_feed_downloader.py │ ├── crawler │ │ ├── __init__.py │ │ ├── crawler.py │ │ └── endrecursive.py │ ├── feed_downloader.py │ ├── iscx_feed_downloader.py │ ├── majestic_feed_downloader.py │ ├── openfish_feed_downloader.py │ ├── umbrella_feed_downloader.py │ ├── urlhaus_feed_downloader.py │ └── vt_feed_downloader.py ├── logger_code.py ├── preprocessor │ ├── __init__.py │ ├── preprocessor.py │ └── preprocessor_basic.py ├── regex │ ├── ConsoleRegexTurtle.jar │ ├── ConsoleRegexTurtle │ │ ├── build.xml │ │ ├── build │ │ │ ├── built-jar.properties │ │ │ └── classes │ │ │ │ ├── gpl.txt │ │ │ │ └── it │ │ │ │ └── units │ │ │ │ └── inginf │ │ │ │ └── male │ │ │ │ ├── console │ │ │ │ └── ConsoleRegexTurtle.class │ │ │ │ └── dto │ │ │ │ └── SimpleConfig.class │ │ ├── dist │ │ │ ├── ConsoleRegexTurtle.jar │ │ │ ├── README.TXT │ │ │ ├── lib │ │ │ │ ├── MaleRegexTree.jar │ │ │ │ ├── Random_Regex_Turtle.jar │ │ │ │ └── gson-2.2.4.jar │ │ │ └── regexturtle.sh │ │ ├── lib │ │ │ ├── CopyLibs │ │ │ │ └── org-netbeans-modules-java-j2seproject-copylibstask.jar │ │ │ ├── Gson │ │ │ │ ├── gson-2.2.4-javadoc.jar │ │ │ │ ├── gson-2.2.4-sources.jar │ │ │ │ └── gson-2.2.4.jar │ │ │ └── nblibraries.properties │ │ ├── manifest.mf │ │ ├── nbproject │ │ │ ├── build-impl.xml │ │ │ ├── genfiles.properties │ │ │ ├── private │ │ │ │ ├── config.properties │ │ │ │ ├── private.properties │ │ │ │ └── private.xml │ │ │ ├── project.properties │ │ │ └── project.xml │ │ ├── regexturtle.sh │ │ └── src │ │ │ ├── gpl.txt │ │ │ └── it │ │ │ └── units │ │ │ └── inginf │ │ │ └── male │ │ │ ├── console │ │ │ └── ConsoleRegexTurtle.java │ │ │ └── dto │ │ │ └── SimpleConfig.java │ ├── RegexRunner.jar │ ├── __init__.py │ ├── regex.py │ ├── regexturtle.sh │ └── src_regexrunner │ │ └── regexrunner │ │ ├── JsonOperation.java │ │ ├── Main.java │ │ └── Regex.java ├── use_case │ ├── use_case_clustering.py │ ├── use_case_data.py │ ├── use_case_feeder.py │ ├── use_case_preprocessor.py │ └── use_case_regex_generation.py └── utils.py └── test ├── __init__.py ├── clustering ├── data │ └── save_test │ │ ├── index.pkl │ │ └── matrix.pkl ├── test_distance_matrix.py ├── test_metrics.py └── test_swalign.py ├── config.json ├── coverage.svg ├── data_demo.csv ├── feeder ├── crawler │ └── test_crawler.py ├── data │ └── vt_key.txt ├── test_alexa_feed_downloader.py ├── test_feed_downloader.py ├── test_iscx_feed_downloader.py └── test_vt_feed_downloader.py ├── preprocessor ├── data │ └── data_preprocessing_test.csv ├── test_preprocessor.py └── test_preprocessor_basic.py ├── regex ├── data │ ├── input_cluster_2_1.json │ ├── input_correct.json │ ├── input_regex_runner.json │ ├── json_for_test.json │ ├── output_regex_runner.json │ └── results_cluster_2_1.json └── test_regex.py └── use_case ├── test_use_case_clustering.py └── test_use_case_data.py /.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/.coverage -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | test 2 | *.log 
3 | log
4 | *.md
5 | !README*.md
6 | README-secret.md
7 | my_data
8 | luda_output
9 | automation
10 | data
11 | *.csv
12 | __pycache__
13 | test
14 | personal
15 | 
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | *.csv
2 | *.json
3 | *.log
4 | *.out
5 | *DS_Store
6 | *.pickle
7 | *.pkl
8 | .idea
9 | .pytest_cache
10 | .ipynb_checkpoints
11 | luda_output
12 | __pycache__
13 | personal
14 | vt_key.txt
15 | luda.md
16 | 
17 | !/test/data_demo.csv
18 | !/test/config.json
19 | 
20 | !/test/clustering/data/save_test/*
21 | !/test/feeder/data/vt_key.txt
22 | !/test/preprocessor/data/data_preprocessing_test.csv
23 | !/test/regex/data/*
24 | 
-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM python:3.8-slim-buster
2 | 
3 | WORKDIR /code
4 | RUN yes | apt-get update
5 | RUN yes | apt install build-essential
6 | RUN yes | apt-get install manpages-dev
7 | RUN yes | pip3 install Cython
8 | RUN pip3 install notebook
9 | RUN mkdir -p /usr/share/man/man1
10 | RUN yes | apt-get install default-jdk
11 | RUN yes | apt-get install vim
12 | RUN yes | apt-get install screen
13 | RUN apt-get install htop
14 | 
15 | 
16 | COPY requirements.txt requirements.txt
17 | 
18 | RUN pip3 install -r requirements.txt
19 | 
20 | COPY . .
21 | 
22 | CMD [ "/bin/bash" ]
23 | 
-------------------------------------------------------------------------------- /Readme.md: --------------------------------------------------------------------------------
1 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/Naereen/StrapDown.js/graphs/commit-activity)
2 | [![GPLv3 license](https://img.shields.io/badge/License-GPLv3-blue.svg)](http://perso.crans.org/besson/LICENSE.html)
3 | ![Coverage](test/coverage.svg "Coverage")
4 | 
5 | 
6 | # LUDA: Large URLs Dataset Analyzer for security
7 | 
8 | 
9 | _Presented at [BlackHat USA 2021 Arsenal](https://www.blackhat.com/us-21/arsenal/schedule/index.html#luda--large-urls-dataset-analyzer-for-security-23851
10 | )_
11 | 
12 | # Table of Contents
13 | 1. [Download and getting started](#Download-and-getting-started)
14 | 2. [The 5 modules](#The-5-modules)
15 |    1. [Data](#Data)
16 |    2. [Feeders](#Feeders)
17 |    3. [Preprocessing](#Preprocessing)
18 |    4. [Clustering](#Clustering)
19 |    5. [Regex generation](#Regex-generation)
20 | 3. [Deployment with docker to a remote machine](#Deployment-with-docker-to-a-remote-machine)
21 | 4. [Support and contributing to Luda](#Support-and-contributing-to-Luda)
22 | 
23 | 
24 | Malicious actors often reuse code to deploy their malware, phishing websites or C&C servers. As a result, similarities can
25 | be found in URL paths by inspecting internet traffic. Moreover, deep learning models, or even regular ML models, do not
26 | fit inline deployment in terms of runtime performance. However, regexes (or YARA rules) can be deployed on a proxy
27 | and work in real time on all the traffic. LUDA can take a set of malicious and benign URLs and return a list of regexes
28 | ready to be deployed inline!
29 | 
30 | # Download and getting started
31 | 
32 | First of all, clone the repo :)
33 | 
34 | Now copy test/config.json to the main directory.
35 | 
36 | To make sure it works for everyone, we will run everything inside a Docker container.
Assuming you have Docker and docker-compose on your machine, just run from the project directory
39 | 
40 | ```bash
41 | docker-compose up # building the image for the first time can take a few minutes.
42 | ```
43 | 
44 | It will create a container named luda and run a Jupyter notebook that you can access on localhost:5555 (token: luda).
45 | You will notice that it also created a folder "data" at project level that is mapped to the same folder inside the container.
46 | 
47 | Now copy (on the host) test/data_demo.csv to data/data_demo.csv. The file config.json is already set the way we need.
48 | 
49 | Now go into your container with
50 | ```bash
51 | docker exec -it luda bash
52 | ```
53 | and run
54 | ```bash
55 | python main.py # should take less than 1 min with 8 CPUs and 16GB RAM
56 | ```
57 | 
58 | It will preprocess the data and cluster the URLs. Now let's look at the clusters!
59 | Go to localhost:5555 to access the Jupyter notebook hosted on the container and open analysis/luda_analysis.ipynb
60 | 
61 | You can run all cells and then go to the last part, "Cluster analysis". The last output cells should show you the clusters.
62 | You should see something like this
63 | 
64 | ```text
65 | Name: cluster, dtype: int64
66 | #####Cluster 0 - 27 samples: ####
67 | 
68 | ['/neat/serverphp/config.bin',
69 | '/serverphp/config.bin',
70 | ...
71 | '/pus1/serverphp/config.bin',
72 | '/lg/server.php/config.bin',
73 | '/ekene/Severphp/config.bin',
74 | '/server[php]/config.bin',
75 | '/versy/serverphp/config.bin']
76 | 
77 | 
78 | #####Cluster 4 - 17 samples: ####
79 | 
80 | ['/mupanel/post.php',
81 | '/jiz/kbpanel/post.php',
82 | ...
83 | '/low/kbpanel/post.php',
84 | '/1/kbpanel/post.php',
85 | '/new/kbpanel/post.php']
86 | ```
87 | 
88 | Here you can choose the clusters on which you would like to run the regex generation. This last part is CPU and RAM expensive and you should run
89 | it only on the clusters that look "good". Here you can also identify paths that could generate FPs (like "/index.php" for example. Check use_case_clustering.py to see how you can fix FPs at this step).
90 | Let's say you choose only those two clusters (0 and 4). Change config.json (on the container, you can access it directly via the notebook)
91 | to be
92 | 
93 | ```json
94 | {
95 |   "main_file": "data_demo.csv",
96 |   "data": {
97 |     "run": false,
98 |     "additional_files": [
99 |       {
100 |         "path": "my_data/benign_data.csv",
101 |         "label": "benign"
102 |       },
103 |       {
104 |         "path": "my_data/malicious_traffic.csv",
105 |         "label": "malicious"}
106 |     ]
107 |   },
108 |   "feeder": {
109 |     "run": false,
110 |     "sources": [
111 |       "urlhaus",
112 |       "openfish",
113 |       "alexa"
114 |     ]
115 |   },
116 |   "preprocessing": {
117 |     "run": false,
118 |     "name": "basic"
119 |   },
120 |   "clustering": {
121 |     "run": false,
122 |     "preprocessed_file": null,
123 |     "skip_distance_computation": false,
124 |     "clusterer": {
125 |       "dbscan": {
126 |         "eps": 20,
127 |         "min_samples": 8
128 |       }
129 |     },
130 |     "metric": "sw",
131 |     "features_folder": "luda_output/mymatrix",
132 |     "filter_similarity": 30,
133 |     "phishing_mode": false
134 |   },
135 |   "regex": {
136 |     "run": true,
137 |     "benign_for_retrain": 30,
138 |     "round_max": 10,
139 |     "regex_folder": "myregexes",
140 |     "take_existing_result": true,
141 |     "min_path_for_run": 200,
142 |     "cluster_list": [0,4]
143 |   }
144 | }
145 | ```
146 | 
147 | We just turned off all the steps except the regex generation step that we want to run. We also specified that we want to run on clusters
148 | 0 and 4 only.
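
Before launching this long step, you can optionally sanity-check the edited file from inside the container. This is just a convenience snippet (it is not part of LUDA itself) and assumes you kept the structure shown above:

```python
# Optional sanity check of config.json before a long run (run from /code inside the container).
import json

with open("config.json") as f:
    cfg = json.load(f)

# Only the regex step should be enabled for this run, on clusters 0 and 4.
assert cfg["regex"]["run"] is True
assert cfg["regex"]["cluster_list"] == [0, 4]
print({name: section.get("run") for name, section in cfg.items() if isinstance(section, dict)})
```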
149 | 
150 | Now run again (from the container)
151 | 
152 | /!\ This step can take a few hours (~2h on a 48-CPU, 378GB-RAM machine, without using all of its resources)
153 | 
154 | ```bash
155 | python main.py
156 | ```
157 | 
158 | Check the log at luda_output/logs/luda.log: at the end you can see a small report (where you see how each signature evolved at each round)
159 | 
160 | ```txt
161 | N cluster : 2
162 | N paths: 44
163 | N benign in final test: 9486
164 | Benign number for retraining : 30
165 | N round: 10
166 | 
167 | Cluster sig paths:
168 | 
169 | cluster_27_0 : (\.*+[^_])++ ---> [^bin]*+[^\.]*+\.bin
170 | cluster_17_4 : ([^_]\w++)++ ---> [^\.]++\.php ---> (\w*+/)++post\.php ---> [^php]++\w\w\w/?+\w++/post\.php
171 | 
172 | 
173 | After final testing:
174 | Cluster with 0 FP: {'cluster_17_4', 'cluster_27_0'}
175 | Number of paths covered with 0 FP: 44
176 | Percentage of paths covered with 0 FP: 100.0 %
177 | 
178 | ### FP Report ###
179 | 
180 | With FP :
181 | 
182 | 
183 | 
184 | Without:
185 | 
186 | ['cluster_27_0', 'cluster_17_4']
187 | ```
188 | 
189 | You also get a report showing basic info on the run. It's a CSV stored in the "regex_folder" (following the above config, it is luda_output/myregexes/report_myregexes.csv)
190 | 
191 | |id|name|regex_js|regex_java|malicious|benign|round|example_malicious|results_file|input_file|
192 | |------|------------|----------------------------------------------------|-------------------------------|---------|------|-----|---------------------------|-------------------------|-----------------------|
193 | |0|cluster_17_4|(?=([^php]+))\1\w\w\w(?=(/?))\2(?=(\w+))\3/post\.php|[^php]++\w\w\w/?+\w++/post\.php|17|61|3|/mupanel/post.php.|results_cluster_17_4.json|input_cluster_17_4.json|
194 | |1|cluster_27_0|(?=([^bin]*))\1(?=([^\.]*))\2\.bin|[^bin]*+[^\.]*+\.bin|27|30|1|/neat/serverphp/config.bin.|results_cluster_27_0.json|input_cluster_27_0.json|
195 | 
196 | 
197 | Congrats on your first LUDA run. You now have 2 regexes (Java or JS) that can be used to catch malicious URLs belonging to the clusters you found :)
198 | 
199 | In the next part, we will dive into the LUDA architecture to understand each of its components, see what else you can do, and maybe
200 | convince you to contribute to the project!
201 | 
202 | 
203 | 
204 | 
205 | LUDA is composed of **5 modules**: data, feeder, preprocessing, clustering and regex generation.
206 | 
207 | To run LUDA, we first need to configure _config.json_.
208 | 
209 | # The 5 modules
210 | 
211 | Every part is independent and can be run separately with the config file.
212 | 
213 | ## Data
214 | 
215 | 
216 | To provide LUDA with some URLs, you can pass it files. The **only condition** is that they have a column named "url".
217 | However, if you provide the main file (here data_demo.csv), it should have url, source, label, family as columns.
218 | So the easiest way to add your own files is to list them in the additional_files array.
219 | 
220 | LUDA will then load them and store them in its own format, joined with the data coming from the feeders. By default, it will look for the files
221 | in the data folder. Otherwise you can write an absolute path.
222 | The main file does not have to exist. You can add your own files in additional_files and LUDA will combine them.
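
For example, an additional file can be as simple as a CSV with a single url column. The sketch below is one hypothetical way to produce such a file (the my_data/malicious_traffic.csv path matches the example config, and the URLs are placeholders, not a real feed):

```python
# Build a minimal additional file: the only required column is "url".
# The label ("malicious"/"benign") is given in config.json, not inside the file itself.
from pathlib import Path

import pandas as pd

Path("my_data").mkdir(exist_ok=True)

urls = [
    "http://198.51.100.10/kbpanel/post.php",       # placeholder URLs
    "http://198.51.100.11/serverphp/config.bin",
]
pd.DataFrame({"url": urls}).to_csv("my_data/malicious_traffic.csv", index=False)
```

The data section of config.json then simply points to that file: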
224 | ```json
225 |   "main_file": "data_demo.csv",
226 |   "data": {
227 |     "run": false,
228 |     "additional_files": [
229 |       {
230 |         "path": "my_data/benign_data.csv",
231 |         "label": "benign"
232 |       },
233 |       {
234 |         "path": "my_data/malicious_traffic.csv",
235 |         "label": "malicious"}
236 | 
237 |     ]
238 |   },
239 | ```
240 | 
241 | ## Feeders
242 | 
243 | We implemented several feeders for malicious URL sources that bring you the most recent data. Among them are feeders for URLhaus,
244 | OpenPhish, Alexa, Majestic, VT, etc. If your feeder brings domains (not URLs), a crawler is available that can convert your domains
245 | into URLs. We invite you to create your own feeder and share it with this project.
246 | 
247 | ```json
248 |   "feeder": {
249 |     "run": false,
250 |     "sources": [
251 |       "urlhaus",
252 |       "openfish",
253 |       "alexa"
254 |     ]
255 |   }
256 | 
257 | ```
258 | 
259 | ## Preprocessing
260 | 
261 | To get better results and save computation, it is *mandatory* to preprocess the data. You need to filter your
262 | URLs smartly to keep only the ones that "have a chance to create a cluster".
263 | 
264 | We provide a class that implements the "basic" preprocessing techniques that we are currently using.
265 | 
266 | ```json
267 |   "preprocessing": {
268 |     "run": false,
269 |     "name": "basic"
270 |   }
271 | ```
272 | 
273 | ## Clustering
274 | 
275 | ```json
276 |   "clustering": {
277 |     "run": false,
278 |     "preprocessed_file": null,
279 |     "skip_distance_computation": false,
280 |     "clusterer": {
281 |       "dbscan": {
282 |         "eps": 20,
283 |         "min_samples": 8
284 |       }
285 |     },
286 |     "metric": "sw",
287 |     "features_folder": "luda_output/mymatrix",
288 |     "filter_similarity": 30,
289 |     "phishing_mode": false
290 |   }
291 | ```
292 | 
293 | ### Distance matrix computation
294 | 
295 | 
296 | This is a CPU and RAM expensive step. It will use (by default) all your CPUs and can take 300GB of RAM for a list of URLs
297 | longer than 35k... That's why the preprocessing step is very important. At the end of the task, it will save the results
298 | in a folder (specified in the config file) that you can reuse several times to test different parameters of the clustering.
299 | 
300 | If you already have a csv file with your data, you need to write its **absolute** path in config.json in "preprocessed_file".
301 | 
302 | 
303 | ### Clustering algorithm
304 | 
305 | We are currently using DBSCAN since we want to control MinPoints (the minimum number of points in a cluster). Moreover, since we built
306 | the metric ourselves, we understand what Epsilon means! Setting Epsilon to 20, for example, is equivalent
307 | to saying "Group together URLs that are 80% similar".
308 | 
309 | 
310 | This **step is quick**, so you can run it several times to test different parameters.
311 | 
312 | ## Regex generation
313 | 
314 | For this step, we use existing research. The original code can be found here: https://github.com/MaLeLabTs/RegexGenerator
315 | 
316 | We already added to LUDA the dependencies that we need from this project. More details on how to optimize the regex generation process can be found in that repo.
317 | 
318 | We strongly advise you to first look at the clusters that you got from the clustering part before running it on all your clusters.
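
If you prefer to glance at the clusters without opening the notebook, the few lines below do the same thing as the load_result helper in analysis/luda_analysis.ipynb. This is a minimal sketch that assumes the demo paths used in this walkthrough (adapt them if you changed features_folder or the main file):

```python
# Quick look at the clusters outside the notebook (mirrors load_result in analysis/luda_analysis.ipynb).
import os
import pickle

import pandas as pd

FEATURES_FOLDER = "luda_output/mymatrix"
PREPROCESSED = "data/data_demo_preprocessed.csv"

df = pd.read_csv(PREPROCESSED)
with open(os.path.join(FEATURES_FOLDER, "index_to_keep.pkl"), "rb") as f:
    index_to_keep = pickle.load(f)
with open(os.path.join(FEATURES_FOLDER, "labels.pkl"), "rb") as f:
    labels = pickle.load(f)

df = df.iloc[index_to_keep, :]
df["cluster"] = labels
print(df["cluster"].value_counts())           # -1 means "not clustered" (DBSCAN noise)
print(df[df["cluster"] == 0]["path"].head())  # paths of one candidate cluster
```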
319 | Once you chose your clusters, add their ids to "cluster_list"
320 | ```json
321 |   "regex": {
322 |     "run": true,
323 |     "benign_for_retrain": 30,
324 |     "round_max": 10,
325 |     "regex_folder": "myregexes",
326 |     "take_existing_result": false,
327 |     "min_path_for_run": 200,
328 |     "cluster_list": [0,4]
329 |   }
330 | 
331 | ```
332 | # Deployment with docker to a remote machine
333 | 
334 | Getting an environment ready can be achieved with
335 | 
336 | ```bash
337 | docker-compose up
338 | ```
339 | 
340 | By default this creates a container named **luda** and builds an image called **luda_image**. The container also runs
341 | a Jupyter Notebook that you can access on port 5555 (5555 is mapped to 8888 in this version).
342 | 
343 | One of the most efficient ways to run LUDA on another machine is to push **luda_image** to a Docker registry and pull it directly
344 | on the target machine.
345 | 
346 | Locally you can run
347 | 
348 | ```bash
349 | docker-compose build # to create the image
350 | docker tag luda_image:latest your_docker_user/luda_image
351 | docker push your_docker_user/luda_image
352 | ```
353 | 
354 | and on the remote machine you can either run it with docker-compose (you need to copy it there) or run
355 | 
356 | ```bash
357 | docker rm -f luda; sudo docker run -it -v /home/data/:/code/data -p 5555:8888 --name luda your_docker_user/luda_image bash
358 | 
359 | # We first delete luda in case the container already exists. This deletes the whole container, including your notebook.
360 | # Inside the new container you can launch the Jupyter notebook if you want (inside a screen so it stays alive if you close the tab)
361 | 
362 | screen -d -m -S jupyter jupyter notebook --allow-root --no-browser --ip 0.0.0.0 --NotebookApp.token='luda'
363 | ```
364 | Then you just need to send your data
365 | 
366 | ```bash
367 | scp -i yourkey.pem data_preprocessed.csv user@your_powerfull_machine:/home/data # remember we map /home/data to /code/data
368 | ```
369 | An advantage of sending your data separately is that you do not get a big Docker image, and you can also update your code if needed
370 | and still test with your data, since the Docker volume is mapped to a persistent folder on the host machine.
371 | /!\ If you add your data to your Docker image, after several tries your disk might be full.
372 | You can delete all images by running
373 | 
374 | ```bash
375 | docker rmi -f $(docker images -a -q)
376 | ```
377 | 
378 | 
379 | If you always need "sudo" to run docker commands, you can just add your user to the docker group by running
380 | 
381 | ```bash
382 | sudo usermod -a -G docker [user]
383 | newgrp docker
384 | ```
385 | ## Access the remote Jupyter Notebook
386 | 
387 | Once your container is running, you can either access the Jupyter notebook via your browser on port 5555 of your server,
388 | 
389 | OR you can do SSH tunneling (if your machine does not have an open port for inbound connections)
390 | ```bash
391 | ssh -N -f -L 5555:localhost:5555 -i yourkey.pem user@your_powerfull_machine
392 | ```
393 | 
394 | 
395 | # Support and contributing to Luda
396 | 
397 | This code is maintained. You are welcome to ask any questions directly on GitHub. We will try to answer as quickly as possible.
398 | 
399 | We also invite you to contribute to this open source project. Add your feeders, preprocessing techniques, clustering algorithms,
400 | or fix bugs.
401 | It can be done via pull request. More details on how to make a pull request [here](https://www.dataschool.io/how-to-contribute-on-github/
402 | ). Please provide basic tests with your code.
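
If you want to contribute a feeder, its rough shape is sketched below. This is only a hypothetical outline to illustrate the idea (the class and method names here are made up); the real interface to follow is the base class in src/feeder/feed_downloader.py and the existing implementations such as src/feeder/urlhaus_feed_downloader.py:

```python
# Hypothetical outline of a new feeder; align the class and method names with
# src/feeder/feed_downloader.py before opening a pull request.
import pandas as pd
import requests


class MyFeedDownloader:
    """Fetches a public URL feed and exposes it as a DataFrame with a 'url' column."""

    FEED_URL = "https://example.com/my_feed.txt"  # placeholder feed location

    def fetch(self) -> pd.DataFrame:
        response = requests.get(self.FEED_URL, timeout=30)
        response.raise_for_status()
        urls = [line.strip() for line in response.text.splitlines() if line.strip()]
        return pd.DataFrame({"url": urls, "label": "malicious", "source": "my_feed"})
```

Whatever the source, the output should end up with at least the url column so the rest of the pipeline (preprocessing, clustering, regex generation) can consume it.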
403 | 404 | ## Running the tests 405 | 406 | Adding test protect your code but also explain them to others. 407 | Make sure the project as at least 70% coverage. 408 | To check the coverage, pip install those 2 packages 409 | 410 | ```bash 411 | pip install coverage 412 | pip install coverage-badge 413 | ``` 414 | and run from the main luda directory 415 | 416 | ```bash 417 | coverage run -m pytest 418 | coverage report -m --omit="*/test*" # optional - to see the coverage without including tests 419 | coverage-badge -o test/coverage.svg -f # this will create the coverage badge loaded in the Readme 420 | ``` 421 | # Authors 422 | 423 | **Code**: [Jordan Garzon] 424 | **Algorithm**: [Jordan Garzon] and [Asaf Nadler] 425 | 426 | from [Akamai Technologies](https://www.akamai.com) 427 | 428 | 429 | [Jordan Garzon]: https://twitter.com/JordGarzon 430 | [Asaf Nadler]: https://twitter.com/AsafNadler 431 | 432 | ```text 433 | ||| ||| 434 | | | __ | | 435 | |-|_____-----/ |_| |_| \-----_____|-| 436 | |_|_________{ }| (^) |{ }__________|_| 437 | || |_| | ^ | |_| || 438 | | \| /\ |/ | 439 | | \ |--| / | 440 | = \ |__| / = 441 | + \ / + ENJOY ! 442 | \ / 443 | \ / 444 | \ / 445 | \ / 446 | \ / 447 | \ / 448 | \ / 449 | \ / 450 | \/ 451 | 452 | ``` -------------------------------------------------------------------------------- /analysis/luda_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "ExecuteTime": { 7 | "end_time": "2021-07-14T11:35:57.868516Z", 8 | "start_time": "2021-07-14T11:35:57.863398Z" 9 | } 10 | }, 11 | "source": [ 12 | "
\n", 13 | " LUDA ANALYSIS NOTEBOOK
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "ExecuteTime": { 21 | "end_time": "2021-08-02T09:08:44.346388Z", 22 | "start_time": "2021-08-02T09:08:44.311578Z" 23 | } 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import os\n", 28 | "import pickle\n", 29 | "import pandas as pd\n", 30 | "from pprint import pprint" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "ExecuteTime": { 38 | "end_time": "2021-08-02T09:08:44.490307Z", 39 | "start_time": "2021-08-02T09:08:44.477180Z" 40 | } 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "DATA = '../data/data_demo.csv'\n", 45 | "PREPROCESSED_DATA = '../data/data_demo_preprocessed.csv'\n", 46 | "MATRIX_OUTPUT = '../luda_output/mymatrix/'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "ExecuteTime": { 54 | "end_time": "2021-08-02T09:08:44.642951Z", 55 | "start_time": "2021-08-02T09:08:44.623894Z" 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "def value_counts(df_col, limit=None):\n", 61 | " normalized = df_col.value_counts(normalize=True)[:limit]\n", 62 | " normal = df_col.value_counts()[:limit]\n", 63 | " normalized.name, normal.name = 'normalized', 'count'\n", 64 | " return pd.concat([normal, normalized], axis=1)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Explore your data" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "ExecuteTime": { 78 | "end_time": "2021-07-15T10:58:46.241896Z", 79 | "start_time": "2021-07-15T10:58:46.239058Z" 80 | } 81 | }, 82 | "source": [ 83 | "## Data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": { 90 | "ExecuteTime": { 91 | "end_time": "2021-08-02T09:08:45.756817Z", 92 | "start_time": "2021-08-02T09:08:45.317621Z" 93 | } 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/html": [ 99 | "
\n", 100 | "\n", 113 | "\n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | "
urllabel
0http://173.243.112.132/serve/config.binmalicious
1http://194.15.112.29/2ja/panel/config.binmalicious
2http://216.170.125.134/neat/serverphp/config.binmalicious
3http://58.22.101.109/xz/cfg.binmalicious
4http://83.149.95.197/1/cfg.binmalicious
.........
10195http://fhs.mcmaster.ca/main/benefactors/braley...benign
10196http://youtube.com/watch?v=_WQSaqs-fOsbenign
10197http://randomdomain34623.com/B5iioj3SFI5gE_JbH...benign
10198http://randomdomain42219.com/-xwiPbFONIb8/AAAA...benign
10199http://randomdomain39328.com/a/AATXAJwSYig3P9W...benign
\n", 179 | "

10200 rows × 2 columns

\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " url label\n", 184 | "0 http://173.243.112.132/serve/config.bin malicious\n", 185 | "1 http://194.15.112.29/2ja/panel/config.bin malicious\n", 186 | "2 http://216.170.125.134/neat/serverphp/config.bin malicious\n", 187 | "3 http://58.22.101.109/xz/cfg.bin malicious\n", 188 | "4 http://83.149.95.197/1/cfg.bin malicious\n", 189 | "... ... ...\n", 190 | "10195 http://fhs.mcmaster.ca/main/benefactors/braley... benign\n", 191 | "10196 http://youtube.com/watch?v=_WQSaqs-fOs benign\n", 192 | "10197 http://randomdomain34623.com/B5iioj3SFI5gE_JbH... benign\n", 193 | "10198 http://randomdomain42219.com/-xwiPbFONIb8/AAAA... benign\n", 194 | "10199 http://randomdomain39328.com/a/AATXAJwSYig3P9W... benign\n", 195 | "\n", 196 | "[10200 rows x 2 columns]" 197 | ] 198 | }, 199 | "execution_count": 4, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "df = pd.read_csv(DATA)\n", 206 | "df" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 5, 212 | "metadata": { 213 | "ExecuteTime": { 214 | "end_time": "2021-08-02T09:08:46.405848Z", 215 | "start_time": "2021-08-02T09:08:46.190986Z" 216 | } 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/html": [ 222 | "
\n", 223 | "\n", 236 | "\n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | "
countnormalized
benign100000.980392
malicious2000.019608
\n", 257 | "
" 258 | ], 259 | "text/plain": [ 260 | " count normalized\n", 261 | "benign 10000 0.980392\n", 262 | "malicious 200 0.019608" 263 | ] 264 | }, 265 | "execution_count": 5, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "value_counts(df['label'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 6, 277 | "metadata": { 278 | "ExecuteTime": { 279 | "end_time": "2021-08-02T09:08:46.483380Z", 280 | "start_time": "2021-08-02T09:08:46.428743Z" 281 | } 282 | }, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "200 unique malicious URLs\n", 289 | "9978 unique benign URLs\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "for label in ['malicious', 'benign']:\n", 295 | " _ = df[df['label'] == label]\n", 296 | " print(f\"{_['url'].nunique()} unique {label} URLs\")\n", 297 | "\n" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Preprocessed" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 7, 310 | "metadata": { 311 | "ExecuteTime": { 312 | "end_time": "2021-08-02T09:08:47.142816Z", 313 | "start_time": "2021-08-02T09:08:47.051959Z" 314 | } 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "df_preprocessed = pd.read_csv(PREPROCESSED_DATA)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 8, 324 | "metadata": { 325 | "ExecuteTime": { 326 | "end_time": "2021-08-02T09:08:47.379022Z", 327 | "start_time": "2021-08-02T09:08:47.369022Z" 328 | } 329 | }, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "benign 10000\n", 335 | "malicious 200\n", 336 | "Name: label, dtype: int64" 337 | ] 338 | }, 339 | "execution_count": 8, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "df_preprocessed['label'].value_counts()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "# Cluster analysis" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "Explore your cluster before running the automatic regex generation" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 9, 365 | "metadata": { 366 | "ExecuteTime": { 367 | "end_time": "2021-08-02T09:08:48.366378Z", 368 | "start_time": "2021-08-02T09:08:48.354717Z" 369 | } 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "def load_result(data_path, folder):\n", 374 | " df = pd.read_csv(data_path)\n", 375 | " with open(os.path.join(folder, 'index_to_keep.pkl'), 'rb') as f:\n", 376 | " index_to_keep = pickle.load(f)\n", 377 | " with open(os.path.join(folder, 'labels.pkl'), 'rb') as f:\n", 378 | " labels = pickle.load(f)\n", 379 | " df = df.iloc[index_to_keep, :]\n", 380 | " df['cluster'] = labels\n", 381 | " series_cluster_count = df['cluster'].value_counts()\n", 382 | " print('Clusters : ')\n", 383 | " print(df['cluster'].value_counts())\n", 384 | " for cluster in series_cluster_count.index:\n", 385 | " if cluster == -1:\n", 386 | " continue\n", 387 | " print('#####Cluster {} - {} samples: #### \\n'.format(cluster, series_cluster_count[cluster]))\n", 388 | " pprint(df[(df['cluster']==cluster)]['path'].to_list())\n", 389 | " print('\\n')\n", 390 | " return df\n", 391 | "\n", 392 | "\n", 393 | "def get_stat_cluster(df_features):\n", 394 | " df_features_cluster = pd.DataFrame(df_features.groupby('cluster').agg({'domain': ['nunique'], 
'path': 'count'}).to_records())\n", 395 | " df_features_cluster.columns = ['cluster', 'domain', 'path']\n", 396 | " df_features_cluster = df_features_cluster[df_features_cluster['cluster'] !=-1]\n", 397 | " n_path = df_features_cluster['path'].sum()\n", 398 | " print('{} paths ({} %) clustered from {} domains !'.format(n_path, round(100*n_path/df_features['path'].nunique(), 2), df_features_cluster['domain'].sum()))\n", 399 | " print('Cluster number: {}'.format(df_features_cluster['cluster'].nunique()))\n", 400 | " return df_features_cluster.sort_values('path', ascending=False)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 10, 406 | "metadata": { 407 | "ExecuteTime": { 408 | "end_time": "2021-08-02T09:08:48.687947Z", 409 | "start_time": "2021-08-02T09:08:48.540847Z" 410 | } 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "Clusters : \n", 418 | " 0 27\n", 419 | " 4 17\n", 420 | " 6 16\n", 421 | " 1 16\n", 422 | " 10 15\n", 423 | " 15 12\n", 424 | " 11 12\n", 425 | " 3 10\n", 426 | " 8 9\n", 427 | " 16 8\n", 428 | " 7 8\n", 429 | " 9 8\n", 430 | " 5 8\n", 431 | " 12 8\n", 432 | " 13 8\n", 433 | " 2 8\n", 434 | " 14 8\n", 435 | "-1 2\n", 436 | "Name: cluster, dtype: int64\n", 437 | "#####Cluster 0 - 27 samples: #### \n", 438 | "\n", 439 | "['/neat/serverphp/config.bin',\n", 440 | " '/serverphp/config.bin',\n", 441 | " '/Zeus/serverphp/config.bin',\n", 442 | " '/files/serverphp/config.bin',\n", 443 | " '/high/serverphp/config.bin',\n", 444 | " '/work/server.php/config.bin',\n", 445 | " '/nice/serverphp/config.bin',\n", 446 | " '/online/serverphp/config.bin',\n", 447 | " '/adm/serverphp/config.bin',\n", 448 | " '/plain/serverphp/config.bin',\n", 449 | " '/dbb/serverphp/config.bin',\n", 450 | " '/figo/serverphp/config.bin',\n", 451 | " '/fine/serverphp/config.bin',\n", 452 | " '/sys/serverphp/config.bin',\n", 453 | " '/dbd/serverphp/config.bin',\n", 454 | " '/nku/serverphp/config.bin',\n", 455 | " '/lg/server-php/config.bin',\n", 456 | " '/crome/serverphp/config.bin',\n", 457 | " '/db/serverphp/config.bin',\n", 458 | " '/good/serverphp/config.bin',\n", 459 | " '/serverp/config.bin',\n", 460 | " '/dolls/serverphp/config.bin',\n", 461 | " '/pus1/serverphp/config.bin',\n", 462 | " '/lg/server.php/config.bin',\n", 463 | " '/ekene/Severphp/config.bin',\n", 464 | " '/server[php]/config.bin',\n", 465 | " '/versy/serverphp/config.bin']\n", 466 | "\n", 467 | "\n", 468 | "#####Cluster 4 - 17 samples: #### \n", 469 | "\n", 470 | "['/mupanel/post.php',\n", 471 | " '/jiz/kbpanel/post.php',\n", 472 | " '/sync/kbpanel/post.php',\n", 473 | " '/doc/kbpanel/post.php',\n", 474 | " '/K/kbpanel/post.php',\n", 475 | " '/panel/post.php',\n", 476 | " '/kbpanel/post.php',\n", 477 | " '/_cpanel/post.php',\n", 478 | " '/KB/kbpanel/post.php',\n", 479 | " '/led/kbpanel/post.php',\n", 480 | " '/laww/kbpanel/post.php',\n", 481 | " '/php/kbpanel/post.php',\n", 482 | " '/tru/kbpanel/post.php',\n", 483 | " '/edu/kbpanel/post.php',\n", 484 | " '/low/kbpanel/post.php',\n", 485 | " '/1/kbpanel/post.php',\n", 486 | " '/new/kbpanel/post.php']\n", 487 | "\n", 488 | "\n", 489 | "#####Cluster 6 - 16 samples: #### \n", 490 | "\n", 491 | "['/wp-content/themes/twentyeleven/img5.php',\n", 492 | " '/site/wp-content/themes/twentyeleven/e.php',\n", 493 | " '/wp-content/themes/twentyeleven/get.php',\n", 494 | " '/wp-content/themes/twentytwelve/r.php',\n", 495 | " '/wp-content/themes/twentyeleven/3.php',\n", 496 | " 
'/wp-content/themes/twentyeleven/post.php',\n", 497 | " '/wp-content/themes/twentyeleven/js',\n", 498 | " '/wp-content/themes/twentytwelve/img3.php',\n", 499 | " '/wp-content/themes/twentytwelve/c.php',\n", 500 | " '/wp-content/themes/twentyeleven/ccccc.php',\n", 501 | " '/wp-content/themes/twentytwelve/cccc.php',\n", 502 | " '/wp-content/themes/twentytwelve/rr.php',\n", 503 | " '/wp-content/themes/twentytwelve/g1.php',\n", 504 | " '/wp-content/themes/twentytwelve/st1.exe',\n", 505 | " '/wp-content/themes/twentyeleven/a.php',\n", 506 | " '/wp-content/themes/twentyeleven/cc.php']\n", 507 | "\n", 508 | "\n", 509 | "#####Cluster 1 - 16 samples: #### \n", 510 | "\n", 511 | "['/xz/cfg.bin',\n", 512 | " '/1/cfg.bin',\n", 513 | " '/zs/cfg.bin',\n", 514 | " '/me/cfg.bin',\n", 515 | " '/zsb/cfg.bin',\n", 516 | " '/zex/cfg.bin',\n", 517 | " '/izu/cfg.bin',\n", 518 | " '/zus/cfg.bin',\n", 519 | " '/z/cfg.bin',\n", 520 | " '/2/cfg.bin',\n", 521 | " '/te/cfg.bin',\n", 522 | " '/ZUS/cfg.bin',\n", 523 | " '/ze/cfg.bin',\n", 524 | " '/zv/cfg.bin',\n", 525 | " '/zu/cfg.bin',\n", 526 | " '/c/cfg.bin']\n", 527 | "\n", 528 | "\n", 529 | "#####Cluster 10 - 15 samples: #### \n", 530 | "\n", 531 | "['/wp-content/plugins/wp-db-backup-made/test.php',\n", 532 | " '/wp-content/plugins/wp-db-backup-made/BYA4Ks.php',\n", 533 | " '/RRYZZ/wp-content/plugins/wp-db-backup-made/test.php',\n", 534 | " '/wp-content/plugins/wp-db-backup-made/g1.php',\n", 535 | " '/wp-content/plugins/wp-db-backup-made/ap1.php',\n", 536 | " '/wp-content/plugins/wp-db-backup-made/das.db',\n", 537 | " '/wp-content/plugins/wp-db-backup-made/ap2.php',\n", 538 | " '/wp-content/plugins/wp-db-backup-made/3.php',\n", 539 | " '/wp-content/plugins/wp-db-backup-made/ap5.php',\n", 540 | " '/wp-content/plugins/wp-db-backup-made/ap4.php',\n", 541 | " '/wp-content/plugins/wp-db-backup-made/mein.hlp',\n", 542 | " '/wp-content/plugins/wp-db-backup-made/Xoeyqs.php',\n", 543 | " '/wp-content/plugins/wp-db-backup-made/3ILBop.php',\n", 544 | " '/wp-content/plugins/wp-db-backup-made/c.php',\n", 545 | " '/blog/wp-content/plugins/wp-db-backup-made/d.php']\n", 546 | "\n", 547 | "\n", 548 | "#####Cluster 15 - 12 samples: #### \n", 549 | "\n", 550 | "['/bla31/gate.php',\n", 551 | " '/bla09/gate.php',\n", 552 | " '/bla06/gate.php',\n", 553 | " '/bla10/gate.php',\n", 554 | " '/bla11/gate.php',\n", 555 | " '/bla12/gate.php',\n", 556 | " '/bla07/gate.php',\n", 557 | " '/bla30/gate.php',\n", 558 | " '/bla08/gate.php',\n", 559 | " '/bla19/gate.php',\n", 560 | " '/bla25/gate.php',\n", 561 | " '/bla05/gate.php']\n", 562 | "\n", 563 | "\n", 564 | "#####Cluster 11 - 12 samples: #### \n", 565 | "\n", 566 | "['/serve/config.bin',\n", 567 | " '/.tmp/server/config.bin',\n", 568 | " '/loja/.db/server/config.bin',\n", 569 | " '/zcp/server/config.bin',\n", 570 | " '/new/server/config.bin',\n", 571 | " '/.db/server/config.bin',\n", 572 | " '/server/config.bin',\n", 573 | " '/web/server/config.bin',\n", 574 | " '/go/server/config.bin',\n", 575 | " '/123/server/config.bin',\n", 576 | " '/servero/config.bin',\n", 577 | " '/wpi/server/config.bin']\n", 578 | "\n", 579 | "\n", 580 | "#####Cluster 3 - 10 samples: #### \n", 581 | "\n", 582 | "['/imgs/keybase/post.php',\n", 583 | " '/pcss/keybase/post.php',\n", 584 | " '/grey/keybase/post.php',\n", 585 | " '/key/keybase/post.php',\n", 586 | " '/app/keybase/post.php',\n", 587 | " '/img/keybase/post.php',\n", 588 | " '/k/keybase/post.php',\n", 589 | " '/css/keybase/post.php',\n", 590 | " '/.key/keybase/post.php',\n", 591 | " 
'/old/keybase/post.php']\n", 592 | "\n", 593 | "\n", 594 | "#####Cluster 8 - 9 samples: #### \n", 595 | "\n", 596 | "['/images/config.bin',\n", 597 | " '/images/1/config.bin',\n", 598 | " '/do/images/config.bin',\n", 599 | " '/image3/config.bin',\n", 600 | " '/t1/images/config.bin',\n", 601 | " '/image/config.bin',\n", 602 | " '/css/images/config.bin',\n", 603 | " '/.images/config.bin',\n", 604 | " '/wp-images/config.bin']\n", 605 | "\n", 606 | "\n", 607 | "#####Cluster 16 - 8 samples: #### \n", 608 | "\n", 609 | "['/ade/PHP/index.php',\n", 610 | " '/one/PHP/index.php',\n", 611 | " '/goe/PHP/index.php',\n", 612 | " '/gg/PHP/index.php',\n", 613 | " '/joe/PHP/index.php',\n", 614 | " '/nze/PHP/index.php',\n", 615 | " '/kg/PHP/index.php',\n", 616 | " '/ME/PHP/index.php']\n", 617 | "\n", 618 | "\n", 619 | "#####Cluster 7 - 8 samples: #### \n", 620 | "\n", 621 | "['/lol/web/config/index.php',\n", 622 | " '/web/config/index.php',\n", 623 | " '/pfd/config/index.php',\n", 624 | " '/cach/web/config/index.php',\n", 625 | " '/web/web/config/index.php',\n", 626 | " '/config/index.php',\n", 627 | " '/css/config/index.php',\n", 628 | " '/Fish/web/config/index.php']\n", 629 | "\n", 630 | "\n", 631 | "#####Cluster 9 - 8 samples: #### \n", 632 | "\n", 633 | "['/bm_a/controller.php',\n", 634 | " '/adm/controller.php',\n", 635 | " '/bm/controller.php',\n", 636 | " '/bm_b/controller.php',\n", 637 | " '/4n/controller.php',\n", 638 | " '/J/controller.php',\n", 639 | " '/3/controller.php',\n", 640 | " '/br/controller.php']\n", 641 | "\n", 642 | "\n", 643 | "#####Cluster 5 - 8 samples: #### \n", 644 | "\n", 645 | "['/imagens/logo.gif',\n", 646 | " '/images/logof.gif',\n", 647 | " '/Images/logos.gif',\n", 648 | " '/image/logos.gif',\n", 649 | " '/images/logos1.gif',\n", 650 | " '/images/flogo.gif',\n", 651 | " '/imagens/logos.gif',\n", 652 | " '/images/logo2.gif']\n", 653 | "\n", 654 | "\n", 655 | "#####Cluster 12 - 8 samples: #### \n", 656 | "\n", 657 | "['/TJ/Count.asp',\n", 658 | " '/TJ/count.asp',\n", 659 | " '/1/count.asp',\n", 660 | " '/t/Count.asp',\n", 661 | " '/f/count.asp',\n", 662 | " '/a/Count.asp',\n", 663 | " '/2/count.asp',\n", 664 | " '/jt/count.asp']\n", 665 | "\n", 666 | "\n", 667 | "#####Cluster 13 - 8 samples: #### \n", 668 | "\n", 669 | "['/wp-content/uploads/2018/1Ih',\n", 670 | " '/wp-content/uploads/2017/NVa',\n", 671 | " '/wp-content/uploads/2019/6AP0',\n", 672 | " '/wp-content/uploads/2016/04/',\n", 673 | " '/wp-content/uploads/2018/Cc',\n", 674 | " '/wp-content/uploads/2019/41',\n", 675 | " '/wp-content/uploads/2019/12/app',\n", 676 | " '/wp-content/uploads/2015/KD']\n", 677 | "\n", 678 | "\n", 679 | "#####Cluster 2 - 8 samples: #### \n", 680 | "\n", 681 | "['/office/invoice_22114.doc',\n", 682 | " '/office/invoice_11154.doc',\n", 683 | " '/office/invoice_11148.doc',\n", 684 | " '/office/invoice_221214.doc',\n", 685 | " '/office/invoice_22112.doc',\n", 686 | " '/office/invoice_22113.doc',\n", 687 | " '/office/invoice_22121.doc',\n", 688 | " '/office/invoice_21441.doc']\n", 689 | "\n", 690 | "\n", 691 | "#####Cluster 14 - 8 samples: #### \n", 692 | "\n", 693 | "['/2ja/panel/config.bin',\n", 694 | " '/ghpanel/config.bin',\n", 695 | " '/cmm/panel/config.bin',\n", 696 | " '/ceepanel/config.bin',\n", 697 | " '/idk/panel/config.bin',\n", 698 | " '/cpanel/config.bin',\n", 699 | " '/Panel/config.bin',\n", 700 | " '/ash/panel/config.bin']\n", 701 | "\n", 702 | "\n" 703 | ] 704 | } 705 | ], 706 | "source": [ 707 | "df_features = load_result(PREPROCESSED_DATA, MATRIX_OUTPUT)" 708 | ] 709 | }, 710 | 
{ 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [] 716 | } 717 | ], 718 | "metadata": { 719 | "kernelspec": { 720 | "display_name": "Python 3", 721 | "language": "python", 722 | "name": "python3" 723 | }, 724 | "language_info": { 725 | "codemirror_mode": { 726 | "name": "ipython", 727 | "version": 3 728 | }, 729 | "file_extension": ".py", 730 | "mimetype": "text/x-python", 731 | "name": "python", 732 | "nbconvert_exporter": "python", 733 | "pygments_lexer": "ipython3", 734 | "version": "3.7.4" 735 | }, 736 | "toc": { 737 | "base_numbering": 1, 738 | "nav_menu": {}, 739 | "number_sections": true, 740 | "sideBar": true, 741 | "skip_h1_title": false, 742 | "title_cell": "Table of Contents", 743 | "title_sidebar": "Contents", 744 | "toc_cell": false, 745 | "toc_position": { 746 | "height": "calc(100% - 180px)", 747 | "left": "10px", 748 | "top": "150px", 749 | "width": "282.344px" 750 | }, 751 | "toc_section_display": true, 752 | "toc_window_display": true 753 | } 754 | }, 755 | "nbformat": 4, 756 | "nbformat_minor": 2 757 | } 758 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | conf.py 3 | 4 | We store here alsmot all internal global variables for this project. To configure your project, in general 5 | you'll need to use config.json 6 | """ 7 | 8 | import os 9 | 10 | WORKING_DIR = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | SRC_DIR = os.path.join(WORKING_DIR, 'src') 13 | 14 | LUDA_OUTPUT = os.path.join(WORKING_DIR, 'luda_output') 15 | 16 | CONFIG_FILE = os.path.join(WORKING_DIR, 'config.json') 17 | 18 | # Data 19 | 20 | DATA = os.path.join(WORKING_DIR, 'data') 21 | 22 | PREPROCESSED_SUFFIX = '_preprocessed.csv' 23 | 24 | DATA_LABELS = ['malicious', 'benign'] 25 | 26 | VT_KEY = os.path.join(WORKING_DIR, 'vt_key.txt') 27 | 28 | # Clustering 29 | 30 | MATRIX_FOLDER = os.path.join(LUDA_OUTPUT, 'matrix_output') 31 | 32 | MATRIX_STATS_FOLDER = os.path.join(LUDA_OUTPUT, 'matrix_stats') 33 | 34 | DISTANCE_MATRIX = 'matrix.pkl' 35 | 36 | INDEX = 'index.pkl' 37 | 38 | MATRIX_STATS = 'matrix_stats.pkl' 39 | 40 | SIMILARITY_MAX = 100 41 | 42 | INDEX_TO_KEEP = 'index_to_keep.pkl' 43 | 44 | LABELS = 'labels.pkl' 45 | 46 | # Logs 47 | 48 | LOGGER_NAME = 'luda' 49 | 50 | LOG_FOLDER = os.path.join(LUDA_OUTPUT, 'logs') 51 | 52 | LOGGER_FILE = os.path.join(LOG_FOLDER, 'luda.log') 53 | 54 | LOG_FILE_SIZE = 10 * 1000000 # 10 MB 55 | 56 | LOG_FILE_NUMBER = 5 57 | 58 | # Regex 59 | 60 | REGEX_FOLDER_OUTPUT = os.path.join(LUDA_OUTPUT, 'regex_output') 61 | 62 | REGEX_FOLDER = os.path.join(SRC_DIR, 'regex') 63 | 64 | REGEX_SH = os.path.join(REGEX_FOLDER, "ConsoleRegexTurtle", "dist", 'regexturtle.sh') 65 | 66 | REGEX_JAVA = os.path.join(REGEX_FOLDER, "ConsoleRegexTurtle", "dist", 'ConsoleRegexTurtle.jar') 67 | 68 | REGEX_TMP = os.path.join(REGEX_FOLDER_OUTPUT, 'tmp') 69 | 70 | BENIGN_FOR_RETRAIN = 20 71 | 72 | TEST_BATCH_SIZE = 50000 73 | 74 | REGEX_RUNNER = os.path.join(REGEX_FOLDER, 'RegexRunner.jar') 75 | 76 | INPUT_REGEX_RUNNER = os.path.join(REGEX_FOLDER_OUTPUT, 'input_regex_runner.json') 77 | 78 | OUTPUT_REGEX_RUNNER = os.path.join(REGEX_FOLDER_OUTPUT, 'output_regex_runner.json') 79 | 80 | LAST_REGEX_LIST = os.path.join(DATA, 'regex_list.json') 81 | 82 | # Coverage 83 | 84 | COVERAGE_FOLDER = os.path.join(LUDA_OUTPUT, 'coverage') 85 | 86 | # Crawler 87 | 88 | MAX_LEN_URL = 100 89 | 90 | 
TIMEOUT_CRAWL = 10 91 | 92 | DEPTH_MAX = 10 93 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | luda: 4 | build: . 5 | volumes: 6 | - "./data/:/code/data" 7 | ports: 8 | - "5555:8888" 9 | container_name: luda 10 | image: luda_image 11 | command: "jupyter notebook --allow-root --no-browser --ip 0.0.0.0 --NotebookApp.token='luda' " 12 | 13 | #if you don't want to run jupyter notebook, you can run this below command to keep the container alive 14 | 15 | #command: "tail -F anything" # just to keep it running 16 | 17 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from src.logger_code import init_logger 4 | from src.regex.regex import Regex 5 | from src.use_case.use_case_clustering import UseCaseClustering 6 | from src.use_case.use_case_regex_generation import UseCaseRegexGeneration 7 | from src.use_case.use_case_feeder import UseCaseFeeder 8 | from src.use_case.use_case_preprocessor import UseCasePreprocessor 9 | from src.use_case.use_case_data import UseCaseData 10 | from src.utils import process_file_name 11 | from src.utils import process_preprocessed_file_name 12 | import conf 13 | 14 | __author__ = "Jordan Garzon" 15 | __email__ = "jgarzon@akamai.com" 16 | 17 | with open(conf.CONFIG_FILE) as json_file: 18 | config = json.load(json_file) 19 | 20 | 21 | def main(): 22 | logger = init_logger() 23 | main_file = process_file_name(config['main_file']) 24 | preprocessed_file = process_preprocessed_file_name(main_file, config['clustering']['preprocessed_file']) 25 | if config['data']['run']: 26 | UseCaseData().run(main_file, config['data']['additional_files']) 27 | if config['feeder']['run']: 28 | logger.info('Running the feeders') 29 | UseCaseFeeder().fetch_and_save(config['feeder']['sources'], main_file) 30 | 31 | if config['preprocessing']['run']: 32 | logger.info('Running the preprocessing') 33 | UseCasePreprocessor().run(config['preprocessing']['name'], main_file) 34 | 35 | if config['clustering']['run']: 36 | logger.info('Running the clustering') 37 | 38 | use_case_clustering = UseCaseClustering() 39 | use_case_clustering.run(file_path=preprocessed_file, 40 | skip_compute_distance=config['clustering']['skip_distance_computation'], 41 | save_folder=config['clustering']['features_folder'], 42 | clusterer=config['clustering']['clusterer'], 43 | filter_th=config['clustering']['filter_similarity']) 44 | 45 | # Regex Step 46 | if config['regex']['run']: 47 | logger.info('Running the regexes') 48 | regex_object = Regex(project_name=config['regex']['regex_folder']) 49 | use_case_regex = UseCaseRegexGeneration(regex_object) 50 | 51 | use_case_regex.run(main_file=preprocessed_file, 52 | cluster_list=config['regex']['cluster_list'], 53 | features_folder=config['clustering']['features_folder'], 54 | benign_for_retrain=config['regex']['benign_for_retrain'], 55 | take_existing_result=config['regex']['take_existing_result'], 56 | round_max=config['regex']['round_max'], 57 | min_path_for_run=config['regex']['min_path_for_run']) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | js_regex 2 | Cython 3 | 
sklearn 4 | hdbscan 5 | numpy 6 | python-Levenshtein 7 | pandas 8 | tqdm 9 | psutil 10 | tldextract 11 | beautifulsoup4 12 | urllib3 13 | requests 14 | pytest -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/__init__.py -------------------------------------------------------------------------------- /src/clustering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/clustering/__init__.py -------------------------------------------------------------------------------- /src/clustering/distance_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import shutil 3 | import pickle 4 | import time 5 | import os 6 | import logging 7 | import multiprocessing as mp 8 | from collections import defaultdict 9 | from tqdm import tqdm 10 | 11 | from src.utils import create_folder 12 | from src.clustering.metrics import DISTANCE_FUNC 13 | import conf 14 | 15 | logger = logging.getLogger(conf.LOGGER_NAME) 16 | 17 | 18 | class DistanceMatrix(object): 19 | def __init__(self, url_list, matrix=None, distance_func=None, folder=None): 20 | """ 21 | Compute distance matrix from list of strings 22 | :param url_list: list of urls or paths 23 | :param matrix: ndarray 24 | :param distance_func: Example distance_func=lev.distance or DISTANCE_FUNC['sw'] 25 | :param folder: folder to save the results 26 | """ 27 | self.distance_func = distance_func 28 | if not distance_func: 29 | self.distance_func = DISTANCE_FUNC['sw'] 30 | self.url_list = url_list 31 | self.matrix = matrix 32 | self.folder = folder 33 | self.stats = defaultdict(int) 34 | 35 | def run(self, ncores=mp.cpu_count(), skip_calc=False): 36 | """ 37 | Compute the matrix distances with multiprocessing 38 | :param ncores: number of cores to use 39 | :param skip_calc: bool. If True, skip the computation to the loading phase only 40 | :return: 41 | """ 42 | if not skip_calc: 43 | create_folder(conf.MATRIX_FOLDER) 44 | self.__delete_matrix_stat_folder() 45 | _input_distance = self.__get_argument_create_matrix(ncores) 46 | processes = [mp.Process(target=self.__create_matrix_distance, args=x) for x in _input_distance] 47 | 48 | for p in processes: 49 | p.start() 50 | for p in processes: 51 | p.join() 52 | 53 | matrix = self.__get_big_matrix() # to allocate as much as you want of memory in the kernel echo 1 > /proc/sys/vm/overcommit_memory 54 | self.matrix = matrix 55 | self.stats = self.__get_big_stats() 56 | 57 | self.__save() 58 | self.__delete_matrix_stat_folder(delete_folder=True) 59 | 60 | return matrix 61 | 62 | @classmethod 63 | def load(cls, folder_save): 64 | """ 65 | Load a save folder for a future use, clustering for example. 
66 | :param folder_save: path of the save folder 67 | :return: void 68 | """ 69 | with open(os.path.join(folder_save, conf.DISTANCE_MATRIX), 'rb') as pickle_file: 70 | distance_matrix = pickle.load(pickle_file) 71 | with open(os.path.join(folder_save, conf.INDEX), 'rb') as pickle_file: 72 | index = pickle.load(pickle_file) 73 | return cls(index, distance_matrix, folder=folder_save) 74 | 75 | def __get_argument_create_matrix(self, ncores): 76 | """ 77 | This function will map the computation and will give the arguments to be passed to the function 78 | create_matrix_distance 79 | :param ncores: number of cores to use 80 | :return: list of tuple 81 | """ 82 | nsamples = len(self.url_list) 83 | 84 | distance_number = nsamples * (nsamples + 1) / 2 # we compute only half of the matrix 85 | computation_per_core = round(distance_number / ncores) 86 | computation_tuple_list = [] 87 | a = nsamples 88 | b = nsamples # in case ncores = 1 89 | for i in range(ncores - 1): 90 | b = self.__get_a(a, computation_per_core) 91 | computation_tuple_list.append((a, b)) 92 | a = b 93 | computation_tuple_list.append((b, 0)) 94 | 95 | return computation_tuple_list 96 | 97 | def __create_matrix_distance(self, b, a): 98 | """ 99 | Fill the matrix between line a and b. 100 | It dumps then only the lines filled ( to save space). 101 | :param b: int 102 | :param a: int 103 | :return: void 104 | """ 105 | logger.info('Running Process {}'.format(os.getpid())) 106 | before = time.time() 107 | n = len(self.url_list) 108 | distance_matrix = np.zeros(shape=(b - a, n), 109 | dtype=np.int32) # we need double for HDBScan. We create a rectangle to save memory 110 | for i in tqdm(range(b - a)): 111 | for j in range(a + i): 112 | try: 113 | distance_score = self.distance_func(self.url_list[a + i], self.url_list[j]) 114 | distance_matrix[i, j] = int(round(100 * distance_score / max(len(self.url_list[a + i]), len( 115 | self.url_list[j])))) # we want a unique scale for short and 116 | # long string. Scale 0: 100 117 | except Exception as e: 118 | logger.error( 119 | "Error {} when computing the distance between line {} and column {}".format(e, a + i, j)) 120 | with open(os.path.join(conf.MATRIX_FOLDER, "{}_distance_matrix.pkl".format(a)), 'wb') as f: 121 | 122 | pickle.dump(distance_matrix, f, protocol=4) # protocol=4 to dump matrices bigger than 4GB 123 | 124 | logger.info('Process {} done in {} s'.format(os.getpid(), time.time() - before)) 125 | if self.stats: 126 | logger.info('Dumping stats') 127 | create_folder(conf.MATRIX_STATS_FOLDER) 128 | with open(os.path.join(conf.MATRIX_STATS_FOLDER, '{}_stats.pkl'.format(a)), 'wb') as f: 129 | pickle.dump(dict(self.stats), f) 130 | return distance_matrix 131 | 132 | def __save(self): 133 | """ 134 | Save the final results into a folder. 
This folder can be then used for a clustering for example 135 | :return: void 136 | """ 137 | if not self.folder: 138 | _time = int(time.time() * 1000) 139 | folder = 'save_{}'.format(_time) 140 | logger.info(f'No folder specified, we save the results in {folder}') 141 | os.mkdir(folder) 142 | self.folder = folder 143 | elif not os.path.isdir(self.folder): 144 | os.mkdir(self.folder) 145 | logger.info('Dumping matrix') 146 | with open(os.path.join(self.folder, conf.DISTANCE_MATRIX), 'wb') as f: 147 | pickle.dump(self.matrix, f, protocol=4) 148 | logger.info('Dumping index') 149 | with open(os.path.join(self.folder, conf.INDEX), 'wb') as f: 150 | pickle.dump(self.url_list, f) 151 | if self.stats: 152 | logger.info('Dumping stats') 153 | with open(os.path.join(self.folder, conf.MATRIX_STATS), 'wb') as f: 154 | pickle.dump(dict(self.stats), f) 155 | 156 | @staticmethod 157 | def __get_a(b, s): 158 | """ 159 | In a triangular matrix, the number of cells to compute between line a and line b is 160 | (b-a +1)*(a + b) /2 161 | We solved the equation to be able to get a given b and s. 162 | 163 | The idea is that s should be the same for all the processes 164 | :param b: line b - int 165 | :param s: int 166 | :return: a - int 167 | """ 168 | return int((-1 - np.sqrt(4 * (-2 * s + b ** 2 + b))) / (-2)) 169 | 170 | @staticmethod 171 | def __symmetrize(a): 172 | """ 173 | Return a symmetrized version of a 174 | """ 175 | return a + a.T - np.diag(a.diagonal()) 176 | 177 | def __get_big_matrix(self, complete_with_zero=False): 178 | """ 179 | Load all the matrices dumped by the function create_matrix_distance and symmetrize them 180 | :return: ndarray 181 | """ 182 | matrix_list = [] 183 | for file in sorted(os.listdir(conf.MATRIX_FOLDER), key=lambda x: int(x.split('_')[0])): 184 | logger.info('Loading {}'.format(file)) 185 | with open(os.path.join(conf.MATRIX_FOLDER, file), 'rb') as f: 186 | matrix_list.append(pickle.load(f)) 187 | concatenated_matrix = np.concatenate(matrix_list) 188 | if complete_with_zero: # useful when we add new urls on a computed matrix 189 | concatenated_matrix = np.concatenate( 190 | (np.zeros( 191 | shape=(concatenated_matrix.shape[1] - concatenated_matrix.shape[0], concatenated_matrix.shape[1]), 192 | dtype=np.int32), concatenated_matrix), 193 | axis=0) 194 | full_matrix = self.__symmetrize(concatenated_matrix) 195 | np.fill_diagonal(full_matrix, conf.SIMILARITY_MAX) 196 | return full_matrix 197 | 198 | @staticmethod 199 | def __get_big_stats(): 200 | """ 201 | Load all the stats dumped by the processes and combine them 202 | :return: dict 203 | """ 204 | stats = {} 205 | for file in sorted(os.listdir(conf.MATRIX_STATS_FOLDER)): 206 | logger.info('Loading {}'.format(file)) 207 | with open(os.path.join(conf.MATRIX_STATS_FOLDER, file), 'rb') as f: 208 | stats.update(pickle.load(f)) 209 | return stats 210 | 211 | @staticmethod 212 | def __delete_matrix_stat_folder(delete_folder=False): 213 | """ 214 | Delete all the temp matrices dumped 215 | :param delete_folder: bool. If True delete the folder 216 | :return: void 217 | """ 218 | for folder in [conf.MATRIX_FOLDER, conf.MATRIX_STATS_FOLDER]: 219 | shutil.rmtree(folder, ignore_errors=True) 220 | if not delete_folder: # We clean the old matrix 221 | create_folder([conf.MATRIX_FOLDER, conf.MATRIX_STATS_FOLDER]) 222 | logger.info('Old matrices deleted.') 223 | 224 | def add_url_list(self, url_list_to_add): 225 | """ 226 | Add more samples to a precomputed matrices. Done in single process only. 
227 | :param url_list_to_add: url list to add 228 | :return: void 229 | """ 230 | n = len(self.url_list) 231 | base_matrix = self.matrix 232 | self.url_list += url_list_to_add 233 | self.__delete_matrix_stat_folder() 234 | self.__create_matrix_distance(len(self.url_list), n) 235 | self.matrix = sum([self.__reshape_base_matrix(base_matrix, len(url_list_to_add)), 236 | self.__get_big_matrix(complete_with_zero=True)]) 237 | self.__save() 238 | 239 | @staticmethod 240 | def __reshape_base_matrix(base_matrix, n_to_add): 241 | """ 242 | Use to add new urls to a precomputed distance matrix. It creates cells filled with 0 to the new value to be 243 | computed 244 | :param base_matrix: matrix distance computed 245 | :param n_to_add: number of samples to add 246 | :return: new matrix with the new shape 247 | """ 248 | result = np.concatenate((base_matrix, np.zeros(shape=(n_to_add, base_matrix.shape[0]), dtype=np.int32)), 249 | axis=0) 250 | result = np.concatenate((result, np.zeros(shape=(result.shape[0], n_to_add), dtype=np.int32)), axis=1) 251 | return result 252 | -------------------------------------------------------------------------------- /src/clustering/metrics.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import Levenshtein as lev 4 | from src.clustering import swalign 5 | 6 | STATS = defaultdict(int) 7 | 8 | 9 | def get_sw_distance(match, mismatch, gap_penalty): 10 | scoring = swalign.NucleotideScoringMatrix(match, mismatch) 11 | sw = swalign.LocalAlignment(scoring, gap_penalty=gap_penalty) 12 | return sw.align 13 | 14 | 15 | def longest_sub(str_a, str_b, th=10): 16 | """ 17 | Return the longest common substring between two strings. 18 | It also saves the result into a dictionary to make statistics 19 | :param str_a: str 20 | :param str_b: str 21 | :param th: size of the common string from which we can store it into the stat dict 22 | :return: 0 or 1 23 | """ 24 | global STATS 25 | m = len(str_a) 26 | n = len(str_b) 27 | counter = [[0] * (n + 1) for x in range(m + 1)] 28 | longest = 0 29 | lcs_set = set() 30 | for i in range(m): 31 | for j in range(n): 32 | if str_a[i] == str_b[j]: 33 | c = counter[i][j] + 1 34 | counter[i + 1][j + 1] = c 35 | if c > longest: 36 | lcs_set = set() 37 | longest = c 38 | lcs_set.add(str_a[i - c + 1:i + 1]) 39 | elif c == longest: 40 | lcs_set.add(str_a[i - c + 1:i + 1]) 41 | if len(lcs_set) >= 1: 42 | if len(list(lcs_set)[0]) >= th: 43 | STATS[list(lcs_set)[0]] += 1 44 | return 1 45 | return 0 46 | 47 | 48 | DISTANCE_FUNC = {'sw': get_sw_distance(match=1, mismatch=-1, gap_penalty=-1), 49 | 'lev': lev.distance, 50 | 'longest': longest_sub} 51 | -------------------------------------------------------------------------------- /src/clustering/swalign.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | I took this code from the module swalign and I adapted it to Python 3. I also removed the part not useful 4 | in our use case. 
You can find the original code here https://pypi.org/project/swalign/ 5 | ''' 6 | try: 7 | from StringIO import StringIO 8 | except ImportError: 9 | from io import StringIO 10 | 11 | 12 | class NucleotideScoringMatrix(object): 13 | def __init__(self, match=1, mismatch=-1): 14 | self.match = match 15 | self.mismatch = mismatch 16 | 17 | def score(self, one, two, wildcard=None): 18 | if wildcard and (one in wildcard or two in wildcard): 19 | return self.match 20 | 21 | if one == two: 22 | return self.match 23 | return self.mismatch 24 | 25 | 26 | class Matrix(object): 27 | def __init__(self, rows, cols, init=None): 28 | self.rows = rows 29 | self.cols = cols 30 | self.values = [init, ] * rows * cols 31 | 32 | def get(self, row, col): 33 | return self.values[(row * self.cols) + col] 34 | 35 | def set(self, row, col, val): 36 | self.values[(row * self.cols) + col] = val 37 | 38 | 39 | class LocalAlignment(object): 40 | def __init__(self, scoring_matrix, gap_penalty=-1, gap_extension_penalty=-1, gap_extension_decay=0.0, 41 | prefer_gap_runs=True, verbose=False, globalalign=False, wildcard=None, full_query=False): 42 | self.scoring_matrix = scoring_matrix 43 | self.gap_penalty = gap_penalty 44 | self.gap_extension_penalty = gap_extension_penalty 45 | self.gap_extension_decay = gap_extension_decay 46 | self.verbose = verbose 47 | self.prefer_gap_runs = prefer_gap_runs 48 | self.globalalign = globalalign 49 | self.wildcard = wildcard 50 | self.full_query = full_query 51 | 52 | def align(self, ref, query, ref_name='', query_name='', rc=False): 53 | orig_ref = ref 54 | orig_query = query 55 | 56 | ref = ref.upper() 57 | query = query.upper() 58 | 59 | matrix = Matrix(len(query) + 1, len(ref) + 1, (0, ' ', 0)) 60 | for row in range(1, matrix.rows): 61 | matrix.set(row, 0, (0, 'i', 0)) 62 | 63 | for col in range(1, matrix.cols): 64 | matrix.set(0, col, (0, 'd', 0)) 65 | 66 | max_val = 0 67 | max_row = 0 68 | max_col = 0 69 | 70 | # calculate matrix 71 | for row in range(1, matrix.rows): 72 | for col in range(1, matrix.cols): 73 | mm_val = matrix.get(row - 1, col - 1)[0] + self.scoring_matrix.score(query[row - 1], ref[col - 1], 74 | self.wildcard) 75 | 76 | ins_run = 0 77 | del_run = 0 78 | 79 | if matrix.get(row - 1, col)[1] == 'i': 80 | ins_run = matrix.get(row - 1, col)[2] 81 | if matrix.get(row - 1, col)[0] == 0: 82 | # no penalty to start the alignment 83 | ins_val = 0 84 | else: 85 | if not self.gap_extension_decay: 86 | ins_val = matrix.get(row - 1, col)[0] + self.gap_extension_penalty 87 | else: 88 | ins_val = matrix.get(row - 1, col)[0] + min(0, 89 | self.gap_extension_penalty + ins_run * self.gap_extension_decay) 90 | else: 91 | ins_val = matrix.get(row - 1, col)[0] + self.gap_penalty 92 | 93 | if matrix.get(row, col - 1)[1] == 'd': 94 | del_run = matrix.get(row, col - 1)[2] 95 | if matrix.get(row, col - 1)[0] == 0: 96 | # no penalty to start the alignment 97 | del_val = 0 98 | else: 99 | if not self.gap_extension_decay: 100 | del_val = matrix.get(row, col - 1)[0] + self.gap_extension_penalty 101 | else: 102 | del_val = matrix.get(row, col - 1)[0] + min(0, 103 | self.gap_extension_penalty + del_run * self.gap_extension_decay) 104 | 105 | else: 106 | del_val = matrix.get(row, col - 1)[0] + self.gap_penalty 107 | 108 | if self.globalalign or self.full_query: 109 | cell_val = max(mm_val, del_val, ins_val) 110 | else: 111 | cell_val = max(mm_val, del_val, ins_val, 0) 112 | 113 | if not self.prefer_gap_runs: 114 | ins_run = 0 115 | del_run = 0 116 | 117 | if del_run and cell_val == del_val: 118 | 
val = (cell_val, 'd', del_run + 1) 119 | elif ins_run and cell_val == ins_val: 120 | val = (cell_val, 'i', ins_run + 1) 121 | elif cell_val == mm_val: 122 | val = (cell_val, 'm', 0) 123 | elif cell_val == del_val: 124 | val = (cell_val, 'd', 1) 125 | elif cell_val == ins_val: 126 | val = (cell_val, 'i', 1) 127 | else: 128 | val = (0, 'x', 0) 129 | 130 | if val[0] >= max_val: 131 | max_val = val[0] 132 | max_row = row 133 | max_col = col 134 | 135 | matrix.set(row, col, val) 136 | 137 | # backtrack 138 | if self.globalalign: 139 | # backtrack from last cell 140 | row = matrix.rows - 1 141 | col = matrix.cols - 1 142 | val = matrix.get(row, col)[0] 143 | elif self.full_query: 144 | # backtrack from max in last row 145 | row = matrix.rows - 1 146 | max_val = 0 147 | col = 0 148 | for c in range(1, matrix.cols): 149 | if matrix.get(row, c)[0] > max_val: 150 | col = c 151 | max_val = matrix.get(row, c)[0] 152 | col = matrix.cols - 1 153 | val = matrix.get(row, col)[0] 154 | else: 155 | # backtrack from max 156 | row = max_row 157 | col = max_col 158 | val = max_val 159 | 160 | op = '' 161 | aln = [] 162 | 163 | path = [] 164 | while True: 165 | val, op, runlen = matrix.get(row, col) 166 | 167 | if self.globalalign: 168 | if row == 0 and col == 0: 169 | break 170 | elif self.full_query: 171 | if row == 0: 172 | break 173 | else: 174 | if val <= 0: 175 | break 176 | 177 | path.append((row, col)) 178 | aln.append(op) 179 | 180 | if op == 'm': 181 | row -= 1 182 | col -= 1 183 | elif op == 'i': 184 | row -= 1 185 | elif op == 'd': 186 | col -= 1 187 | else: 188 | break 189 | 190 | aln.reverse() 191 | 192 | if self.verbose: 193 | self.dump_matrix(ref, query, matrix, path) 194 | print(aln) 195 | print((max_row, max_col), max_val) 196 | 197 | cigar = _reduce_cigar(aln) 198 | # return Alignment(orig_query, orig_ref, row, col, cigar, max_val, ref_name, query_name, rc, self.globalalign, self.wildcard) 199 | return max_val 200 | 201 | 202 | def _reduce_cigar(operations): 203 | count = 1 204 | last = None 205 | ret = [] 206 | for op in operations: 207 | if last and op == last: 208 | count += 1 209 | elif last: 210 | ret.append((count, last.upper())) 211 | count = 1 212 | last = op 213 | 214 | if last: 215 | ret.append((count, last.upper())) 216 | return ret 217 | -------------------------------------------------------------------------------- /src/feeder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/feeder/__init__.py -------------------------------------------------------------------------------- /src/feeder/alexa_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | from typing import List 4 | 5 | 6 | from src.feeder.feed_downloader import FeedDownloader 7 | from src.feeder.feed_downloader import Url 8 | import conf 9 | 10 | logger = logging.getLogger(conf.LOGGER_NAME) 11 | 12 | 13 | class AlexaFeedDownloader(FeedDownloader): 14 | MAX_DOMAIN = 5 15 | 16 | def fetch(self) -> List[Url]: 17 | alexa_top_1m = pd.read_csv("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip", names=['rank', 'domain']) 18 | urls = self.domains_to_urls(alexa_top_1m['domain'].head(self.MAX_DOMAIN)) 19 | return [Url(url, 'Alexa', 'Benign') for url in urls] 20 | -------------------------------------------------------------------------------- /src/feeder/crawler/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/feeder/crawler/__init__.py -------------------------------------------------------------------------------- /src/feeder/crawler/crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | import urllib.request 4 | import urllib3 5 | import ssl 6 | import tldextract 7 | from bs4 import BeautifulSoup 8 | 9 | from .endrecursive import EndRecursive 10 | import conf 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 14 | ssl._create_default_https_context = ssl._create_unverified_context 15 | 16 | 17 | class Crawler(object): 18 | """ 19 | This class contains all the methods related to url crawling 20 | """ 21 | HEADERS = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8", 22 | "Accept-Encoding": "gzip, deflate", "Accept-Language": "*", "Connection": "keep-alive", 23 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 24 | "Chrome/86.0.42400.198 Safari/537.36"} 25 | 26 | URLS = 'urls' 27 | opener = urllib.request.build_opener() 28 | opener.addheaders = [(v, k) for k, v in HEADERS.items()] 29 | urllib.request.install_opener(opener) 30 | 31 | def __init__(self, _url, lock=None, depth=conf.DEPTH_MAX): 32 | self.lock = lock 33 | self.main_domain = self.__get_primary_domain(_url) 34 | self.main_page = 'http://' + self.main_domain 35 | self.url = _url 36 | self.url_set = set() 37 | self.domain_redirected = self.main_domain # By default it's the same value 38 | self.depth = depth 39 | 40 | def run(self): 41 | """ 42 | Main method to run the crawler 43 | :return: 44 | """ 45 | url_fixed = self.fix_url(self.url) 46 | try: 47 | self.recursive_crawl(url_fixed) 48 | except EndRecursive: 49 | return self.url_set 50 | 51 | def recursive_crawl(self, _url): 52 | """ 53 | Recursive crawling on a website. It crawls all the urls found. 54 | :param _url: url to crawl 55 | :return: void 56 | """ 57 | if not _url: 58 | return None 59 | self.end_recursive_check() 60 | request = self.__request(_url) 61 | if not request: # request = None 62 | return 63 | if len(request.content) < 5: 64 | logger.info(f'We skip {_url}. 
EMPTY CONTENT') 65 | return 66 | soup = BeautifulSoup(request.content, 'html.parser') 67 | if len(self.url_set) == 0: # The first one, we always process 68 | self.__check_for_redirection(request) 69 | self.url_set.add(_url) 70 | self.end_recursive_check() 71 | self.__parse(_url, soup) 72 | 73 | def __parse(self, _url, soup): 74 | """ 75 | Parse the soup of the URL 76 | :param _url: url 77 | :param soup: bs4 object 78 | :return: void 79 | """ 80 | for i in soup.find_all("a"): 81 | if 'href' not in i.attrs: 82 | continue 83 | 84 | href = i.attrs['href'] 85 | if len(href) > conf.MAX_LEN_URL: 86 | logger.info('TOO LONG URL {}'.format(_url)) 87 | continue 88 | 89 | if href.startswith("/"): 90 | href = self.main_page + href 91 | 92 | if href.startswith("http"): 93 | if not (self.__get_primary_domain(href).endswith(self.main_domain)) and ( 94 | not self.__get_primary_domain(href).endswith(self.domain_redirected)): 95 | logger.debug('We skip {}'.format(href)) 96 | continue 97 | 98 | if href not in self.url_set: 99 | self.url_set.add(href) 100 | logger.info('Scraping {}'.format(href)) 101 | self.recursive_crawl(href) 102 | 103 | def __request(self, _url): 104 | """ 105 | Make the requests and handles different exceptions 106 | :param _url: url 107 | :return: request object 108 | """ 109 | try: 110 | request = requests.get(_url, timeout=conf.TIMEOUT_CRAWL, headers=self.HEADERS, 111 | verify=False) # 10 seconds timeout 112 | except requests.exceptions.ConnectTimeout as e: 113 | logger.error("CONNECT TIMEOUT for {}".format(_url)) 114 | return 115 | except requests.exceptions.ReadTimeout as e: 116 | logger.error("READ TIMEOUT for {}".format(_url)) 117 | return 118 | except requests.exceptions.SSLError as e: 119 | logger.error("SSL Error for {}. Exception {}".format(_url, e)) 120 | return 121 | except requests.exceptions.ConnectionError as e: 122 | try: 123 | if 'nodename nor servname provided, or not known' in e.args[0].reason.args[0]: 124 | logger.error(f'{_url} DOWN') 125 | else: 126 | logger.error(f'Connection error for requesting {_url}') 127 | return 128 | except Exception: 129 | logger.error(f'Connection error for requesting {_url}') 130 | return 131 | except Exception as e: 132 | logger.error(f'NEW error {e} for requesting {_url}') 133 | return 134 | return request 135 | 136 | def __check_for_redirection(self, request): 137 | new_url = request.url 138 | domain = self.__get_primary_domain(new_url) 139 | if domain != self.main_domain: 140 | self.domain_redirected = domain 141 | 142 | def __get_primary_domain(self, _url): 143 | """ 144 | Get primary domain from an URL 145 | :param _url: url 146 | :return: primary domain 147 | """ 148 | if self.lock: 149 | self.lock.acquire() 150 | primary_domain = tldextract.extract(_url).domain + '.' + tldextract.extract(_url).suffix 151 | if self.lock: 152 | self.lock.release() 153 | return primary_domain 154 | 155 | def end_recursive_check(self): 156 | if len(self.url_set) >= self.depth: 157 | logger.info('Depth max {} reached'.format(self.depth)) 158 | raise EndRecursive() # Nice way to cut the process 159 | 160 | @staticmethod 161 | def fix_url(domain): 162 | if "." 
not in domain: 163 | return None 164 | domain = domain.replace('\n', '') 165 | if domain.endswith('.'): 166 | domain = domain[:-1] 167 | if not domain.startswith("http"): 168 | domain = f'http://{domain}' 169 | if domain.endswith('/'): 170 | domain = domain[:-1] 171 | return domain 172 | -------------------------------------------------------------------------------- /src/feeder/crawler/endrecursive.py: -------------------------------------------------------------------------------- 1 | class EndRecursive(Exception): 2 | pass 3 | 4 | -------------------------------------------------------------------------------- /src/feeder/feed_downloader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from abc import ABC, abstractmethod 4 | from dataclasses import dataclass 5 | from typing import List 6 | 7 | from src.feeder.crawler.crawler import Crawler 8 | import conf 9 | 10 | logger = logging.getLogger(conf.LOGGER_NAME) 11 | 12 | 13 | @dataclass 14 | class Url: 15 | """ 16 | Basic format of URL 17 | """ 18 | url: str 19 | source: str 20 | label: str 21 | family: str = None 22 | 23 | def __iter__(self): 24 | """ 25 | We use this function for iterating over a list of URL object 26 | :return: iterator 27 | """ 28 | return iter([self.url, self.source, self.label, self.family]) 29 | 30 | def __post_init__(self): 31 | """ 32 | Function that run after the init and convert to lowercase the label 33 | :return: 34 | """ 35 | self.label = self.label.lower() 36 | assert self.label in ['benign', 'malicious'] 37 | 38 | 39 | class FeedDownloader(ABC): 40 | """ 41 | Abstract feeder class. 42 | 43 | You need to implement the fetch method. 44 | """ 45 | 46 | def run(self): 47 | """ 48 | Runner 49 | :return: list of Url object 50 | """ 51 | list_of_urls = self.fetch() 52 | source = list_of_urls[0].source 53 | logger.info(f'{len(list_of_urls)} downloaded from {source}') 54 | return list_of_urls 55 | 56 | @abstractmethod 57 | def fetch(self) -> List[Url]: 58 | """ 59 | Need to be implemented by each subclass 60 | :return: list of Urls object 61 | """ 62 | raise NotImplementedError 63 | 64 | def fetch_and_save(self, filename="data.csv"): 65 | """ 66 | Fetch and save Urls to CSV 67 | :param filename: path of the csv 68 | :return: list of Urls object 69 | """ 70 | list_of_urls = self.fetch() 71 | self.save_to_csv(list_of_urls, filename) 72 | return list_of_urls 73 | 74 | @staticmethod 75 | def save_to_csv(url_list, filename) -> None: 76 | """ 77 | Save list of Urls to csv 78 | :param url_list: list of Url object 79 | :param filename: path where the csv wil be stored 80 | :return: void 81 | """ 82 | columns = list(Url.__annotations__) 83 | with open(filename, 'w') as csv_file: 84 | wr = csv.writer(csv_file, delimiter=',') 85 | wr.writerow(columns) 86 | for url in url_list: 87 | wr.writerow(list(url)) 88 | logger.info(f'{len(url_list)} URls written into {filename}') 89 | 90 | @staticmethod 91 | def get_urls_from_domain(_url, depth_max=5): 92 | """ 93 | Convert domain to URLs. 
Run the crawler that will recursively look for URLs from the same domain 94 | :param _url: url string 95 | :param depth_max: Max depth for crawling 96 | :return: url set (not Url objects) 97 | """ 98 | crawler_object = Crawler(_url, depth=depth_max) 99 | return crawler_object.run() 100 | 101 | def domains_to_urls(self, domain_list): 102 | """ 103 | List of domains to send to get_urls_from_domain 104 | :param domain_list: list of domains 105 | :return: all urls from all the domains 106 | """ 107 | url_list = [] 108 | for domain in domain_list: 109 | try: 110 | url_list += list(self.get_urls_from_domain(domain)) 111 | except Exception as e: 112 | logger.exception(e) 113 | return url_list 114 | -------------------------------------------------------------------------------- /src/feeder/iscx_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from io import BytesIO 3 | from zipfile import ZipFile 4 | 5 | from src.feeder.feed_downloader import FeedDownloader 6 | from src.feeder.feed_downloader import Url 7 | from src.feeder.crawler.crawler import Crawler 8 | 9 | 10 | class IscxFeedDownloader(FeedDownloader): 11 | DOWNLOAD_URL = "http://205.174.165.80/CICDataset/ISCX-URL-2016/Dataset/ISCXURL2016.zip" 12 | 13 | def fetch(self): 14 | resp = requests.get(self.DOWNLOAD_URL, headers=Crawler.HEADERS).content 15 | zipfile = ZipFile(BytesIO(resp)) 16 | result = [] 17 | for line in zipfile.open("FinalDataset/URL/Benign_list_big_final.csv").readlines(): 18 | result.append(Url(line.decode('utf-8').replace('/\r\n', ""), 'iscx', 'benign')) 19 | return result 20 | -------------------------------------------------------------------------------- /src/feeder/majestic_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import List 3 | 4 | from src.feeder.feed_downloader import FeedDownloader 5 | from src.feeder.feed_downloader import Url 6 | 7 | 8 | class MajesticFeedDownloader(FeedDownloader): 9 | MAX_MAJESTIC = 100 10 | 11 | def fetch(self) -> List[Url]: 12 | majestic_top_1m = pd.read_csv("http://downloads.majesticseo.com/majestic_million.csv", usecols=['Domain']) 13 | urls = self.domains_to_urls(majestic_top_1m['Domain'].head(self.MAX_MAJESTIC)) # 'Domain' matches the column name selected with usecols above 14 | 15 | return [Url(url, 'Majestic', 'Benign') for url in urls] 16 | -------------------------------------------------------------------------------- /src/feeder/openfish_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from typing import List 3 | 4 | from src.feeder.feed_downloader import Url 5 | from src.feeder.feed_downloader import FeedDownloader 6 | 7 | 8 | class OpenPhishFeedDownloader(FeedDownloader): 9 | def fetch(self) -> List[Url]: 10 | openphish_url = "https://openphish.com/feed.txt" 11 | malicious_urls = requests.get(openphish_url).content.decode('utf-8').split('\n') 12 | return [Url(url, 'OpenPhish', 'Malicious', "Phishing") for url in malicious_urls] 13 | -------------------------------------------------------------------------------- /src/feeder/umbrella_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import List 3 | 4 | from src.feeder.feed_downloader import Url 5 | from src.feeder.feed_downloader import FeedDownloader 6 | 7 | 8 | class UmbrellaFeedDownloader(FeedDownloader): 9 | def fetch(self) -> List[Url]: 10 |
umbrella_domains = pd.read_csv("http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip", 11 | names=['rank', 'domain']) 12 | urls = self.domains_to_urls(umbrella_domains['domain']) 13 | 14 | return [Url(url, 'Umbrella', 'Benign') for url in urls] 15 | -------------------------------------------------------------------------------- /src/feeder/urlhaus_feed_downloader.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from src.feeder.feed_downloader import Url 4 | from src.feeder.feed_downloader import FeedDownloader 5 | 6 | 7 | class URLHausFeedDownloader(FeedDownloader): 8 | def fetch(self) -> List[Url]: 9 | import requests 10 | urlhaus_url = "https://urlhaus.abuse.ch/downloads/text_recent/" 11 | malicious_urls = requests.get(urlhaus_url).content.decode('utf-8').split('\r\n') 12 | return [Url(url, 'URLHaus', 'malicious') for url in malicious_urls] 13 | -------------------------------------------------------------------------------- /src/feeder/vt_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import math 3 | import logging 4 | 5 | from src.feeder.feed_downloader import FeedDownloader 6 | from src.feeder.feed_downloader import Url 7 | import conf 8 | 9 | logger = logging.getLogger(conf.LOGGER_NAME) 10 | 11 | 12 | class VtFeedDownloader(FeedDownloader): 13 | """ 14 | This class can be used to bring either benign or malicious URLs. 15 | We currently use it to bring benign URLs. To force VT to return URLs with a path, we require that the path 16 | include the letter "a". Don't forget to store your key in a file. 17 | """ 18 | QUERY = """entity:url path:'a' response_code:200 p:0""" 19 | 20 | def __init__(self): 21 | self.bulk = 300 # Because the API does not accept a limit larger than 300 22 | self.api_key = self.load_key() 23 | self.headers = {'x-apikey': self.api_key} 24 | 25 | def fetch(self): 26 | return self.get_records() 27 | 28 | def get_records(self, query=QUERY, number=1000): 29 | if number <= self.bulk: 30 | bulk = number 31 | else: 32 | bulk = self.bulk 33 | 34 | params = {'query': query, 'limit': bulk} 35 | 36 | r = requests.get('https://www.virustotal.com/api/v3/intelligence/search', params, headers=self.headers) 37 | request_json = r.json() 38 | lst = request_json['data'] 39 | number_of_bulks = math.ceil(((number - bulk) / bulk)) 40 | for i in range(number_of_bulks): 41 | url = request_json['links']['next'] 42 | r = requests.get(url, headers=self.headers) 43 | request_json = r.json() 44 | lst.extend(request_json['data']) 45 | 46 | return [Url(item['attributes']['last_final_url'], 'vt', 'benign') for item in lst] 47 | 48 | @staticmethod 49 | def load_key(_path=conf.VT_KEY): 50 | try: 51 | with open(_path) as f: 52 | key = f.read() 53 | except Exception as e: 54 | logger.error('You need to store the VT API key in a file before continuing.
You can put it in ' 55 | '{}'.format(conf.VT_KEY)) 56 | return 57 | return key 58 | -------------------------------------------------------------------------------- /src/logger_code.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.handlers 3 | import sys 4 | import os 5 | import pathlib 6 | import conf 7 | 8 | 9 | def init_logger(debug_mode=True, logger_file=conf.LOGGER_FILE): 10 | """ 11 | Init logger file and print the log in stdout in debug_mode 12 | :param debug_mode: bool 13 | :param logger_file: path of the file log 14 | :return: logger object 15 | """ 16 | if not os.path.exists(conf.LOG_FOLDER): # we don't use the function from utils to not create a circular dependency 17 | pathlib.Path(conf.LOG_FOLDER).mkdir(parents=True, exist_ok=True) 18 | _logger = logging.getLogger(conf.LOGGER_NAME) 19 | _logger.setLevel(logging.DEBUG) 20 | formatter = logging.Formatter('%(asctime)s - %(module)s - %(levelname)s - %(message)s') 21 | fh = logging.handlers.RotatingFileHandler( 22 | logger_file, maxBytes=conf.LOG_FILE_SIZE, backupCount=conf.LOG_FILE_NUMBER) 23 | 24 | fh.setFormatter(formatter) 25 | _logger.addHandler(fh) 26 | 27 | if debug_mode: 28 | stdout_handler = logging.StreamHandler(sys.stdout) 29 | stdout_handler.setFormatter(formatter) 30 | _logger.addHandler(stdout_handler) 31 | 32 | return _logger 33 | -------------------------------------------------------------------------------- /src/preprocessor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/preprocessor/__init__.py -------------------------------------------------------------------------------- /src/preprocessor/preprocessor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | import os 4 | from random import randint 5 | from abc import ABC, abstractmethod 6 | from tqdm import tqdm 7 | from urllib.parse import urlparse 8 | 9 | from src.utils import create_folder 10 | import conf 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | 14 | 15 | class Preprocessor(ABC): 16 | """ 17 | Abstract class for preprocessing classes 18 | """ 19 | 20 | def run(self, file_path): 21 | """ 22 | Basic runner 23 | :param file_path: path of the main file. In general data.csv 24 | :return: void 25 | """ 26 | df = pd.read_csv(file_path) 27 | df_basic_processed = self.basic_preprocessing(df) 28 | 29 | df_processed = self.process(df_basic_processed) 30 | create_folder(path=conf.DATA) 31 | preprocessed_file_name = os.path.join(conf.DATA, 32 | os.path.basename(file_path).replace('.csv', conf.PREPROCESSED_SUFFIX)) 33 | logger.info(f"Data preprocessed saved in {preprocessed_file_name}") 34 | df_processed.to_csv(preprocessed_file_name, index=False) 35 | return df_processed 36 | 37 | @abstractmethod 38 | def process(self, df): 39 | """ 40 | Abstract method that needs to be implemented. 41 | :param df: DataFrame object. 
42 | :return: DataFrame object after your filter has been applied 43 | """ 44 | raise NotImplementedError 45 | 46 | @staticmethod 47 | def basic_preprocessing(df): 48 | """ 49 | This runs before your preprocessing and extracts some basic features for you 50 | :param df: DataFrame with benign and malicious samples 51 | :return: DataFrame object with the new columns 52 | """ 53 | features_dict_list = [] 54 | df = df.dropna(subset=['url']) 55 | urls = df['url'].unique() 56 | logger.info(f"{len(urls)} unique URLs found in the dataframe (shape {df.shape})") 57 | for url in tqdm(urls): 58 | full_url = url 59 | if full_url.startswith("/"): # it may be only a path 60 | full_url = f"http://randomdomain{str(randint(0, 10 ** 5))}.com{full_url}" 61 | if not (full_url.startswith('http://') or full_url.startswith( 62 | 'https://')): # urlparse does not work well without it 63 | full_url = 'http://{}'.format(full_url) 64 | full_url = full_url.replace('\n', '') 65 | parsed_uri = urlparse(full_url) 66 | extension = '' 67 | 68 | if parsed_uri.path.find('.') != -1 and not parsed_uri.path.endswith('/'): 69 | extension = parsed_uri.path[parsed_uri.path.rfind('.'):] 70 | features_dict_list.append( 71 | {"url": url, 72 | 'full_url': full_url, 73 | 'domain': parsed_uri.netloc, 74 | 'path': parsed_uri.path, 75 | 'params': parsed_uri.params, 76 | 'query': parsed_uri.query, 77 | 'path_len': len(parsed_uri.path), 78 | 'extension': extension, 79 | 'folder_count': parsed_uri.path.count('/')}) 80 | 81 | result = pd.DataFrame(features_dict_list) 82 | result = pd.merge(df, result, how='left', on='url') 83 | result['url'] = result['full_url'] 84 | result = result.drop(columns=['full_url']) 85 | logger.info('Final df shape {}'.format(result.shape)) 86 | logger.info('Unique path: {}'.format(result['path'].nunique())) 87 | logger.info('Unique domain: {}'.format(result['domain'].nunique())) 88 | logger.info('Path with at least one folder: {} \n'.format(result[result['folder_count'] > 1]['path'].nunique())) 89 | return result 90 | -------------------------------------------------------------------------------- /src/preprocessor/preprocessor_basic.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | import pandas as pd 5 | import json 6 | 7 | from src.preprocessor.preprocessor import Preprocessor 8 | from src.regex.regex import Regex 9 | from src.utils import create_folder 10 | 11 | import conf 12 | 13 | logger = logging.getLogger(conf.LOGGER_NAME) 14 | 15 | MIN_LEN = 7 16 | MAX_PATH = 45000 17 | 18 | 19 | class PreprocessorBasic(Preprocessor): 20 | """ 21 | Basic preprocessor class. It filters duplicates, URLs already caught by the old regexes, and overly long paths, 22 | and keeps one path per domain, etc. 23 | """ 24 | 25 | def process(self, df): 26 | """ 27 | Method that has to be implemented. Runs all the sub-methods and returns a filtered DataFrame.
28 | :param df: DataFrame 29 | :return: DataFrame filtered 30 | """ 31 | df_benign = df[df['label'] == 'benign'] 32 | df = df[df['label'] == 'malicious'] 33 | number_of_path_before_cleaning = df['path'].nunique() 34 | df = self.remove_path_duplicates(df) 35 | df = self.keep_one_path_per_domain(df) 36 | df = self.clean_df(df, benign_path=df_benign['path'].unique()) 37 | df = self.keep_path_with_folders(df) 38 | df = self.clean_with_regexes(df) 39 | df = self.check_size(df) 40 | self.show_stat(df) 41 | logger.info( 42 | 'In total we cleaned {} (- {} %) paths'.format(number_of_path_before_cleaning - df['path'].nunique(), 43 | (1 - df[ 44 | 'path'].nunique() / number_of_path_before_cleaning) * 100)) 45 | return pd.concat([df, df_benign], sort=False) 46 | 47 | @staticmethod 48 | def remove_path_duplicates(df): 49 | """ 50 | It should be mandatory. Take unique paths 51 | :param df: DataFrame 52 | :return: DataFrame 53 | """ 54 | logger.info('Shape with path duplicates : {}'.format(df.shape)) 55 | result = df.drop_duplicates(['path']) 56 | logger.info('Shape without path duplicates : {} (-{} %)'.format(result.shape, 57 | round(100 - 100 * result.shape[0] / df.shape[0], 58 | 2))) 59 | return result 60 | 61 | @staticmethod 62 | def keep_one_path_per_domain(df): 63 | """ 64 | To reduce the number of paths and to more generalize the regexes, we can choose only one path per domain 65 | :param df: DataFrame 66 | :return: DataFrame 67 | """ 68 | logger.info('Shape with domain duplicates : {}'.format(df.shape)) 69 | result = df.drop_duplicates(['domain']) 70 | logger.info('Shape without domain duplicates : {} (-{} %)'.format(result.shape, 71 | round( 72 | 100 - 100 * result.shape[0] / df.shape[0], 73 | 2))) 74 | return result 75 | 76 | @staticmethod 77 | def show_stat(df): 78 | """ 79 | Show basic stats 80 | :param df: DataFrame 81 | :return: void 82 | """ 83 | logger.info('Shape {}'.format(df.shape)) 84 | logger.info('Path : {}'.format(df['path'].nunique())) 85 | logger.info('Domain : {}'.format(df['domain'].nunique())) 86 | logger.info('Mean path len: {}'.format(df['path_len'].mean())) 87 | 88 | def clean_df(self, df, benign_path, path_len=MIN_LEN): 89 | """ 90 | Some basic cleaning 91 | :param df: DataFrame 92 | :param benign_path: list of benign path 93 | :param path_len: minimal path len 94 | :return: DataFrame 95 | """ 96 | df['filter_wp'] = df['path'].apply(self.clean_wordpress, args=(MIN_LEN,)) 97 | new_df = df[(df['path_len'] >= path_len) & (~df['path'].isin(benign_path)) & (df['filter_wp'] == False)] 98 | logger.info('Cleaning : {} -- > {} paths (-{}%)'.format(df['path'].nunique(), new_df['path'].nunique(), 99 | round(new_df['path'].nunique() / df['path'].nunique(), 100 | 2))) 101 | return new_df 102 | 103 | def clean_with_regexes(self, df): 104 | """ 105 | Remove from the DataFrame Urls already caught by your existing regexes. 106 | :param df: DataFrame object 107 | :return: DataFrame filtered 108 | """ 109 | if not os.path.exists(conf.LAST_REGEX_LIST): 110 | logger.info('We did not find {}. 
We skip the cleaning with regexes step'.format(conf.LAST_REGEX_LIST)) 111 | return df 112 | with open(conf.LAST_REGEX_LIST) as json_file: 113 | regex_list = json.load(json_file)['regexes'] 114 | already_found = self.regex_test(regex_list, list(df[df['path'].notnull()]['path'].unique()))[0] 115 | logger.info('{} paths are already found with the old regexes'.format(len(already_found))) 116 | df = df[~df['path'].isin(list(already_found))] 117 | return df 118 | 119 | @staticmethod 120 | def regex_test(regex_list, list_to_test, pickle_save=os.path.join(conf.COVERAGE_FOLDER, 'nevada_coverage.pickle')): 121 | """ 122 | 123 | :param regex_list: regex list ( string ) 124 | :param list_to_test: list of urls to test 125 | :param pickle_save: if specified, save the statistics of this test in a pickle that you can open with the 126 | Jupyter Notebook for analysis 127 | :return: tuple (set of Urls found, dictonnary with catches by regex, DataFrame with some stats) 128 | """ 129 | all_found = set() 130 | dict_found = {} 131 | for _re in regex_list: 132 | print(f'Testing regex {_re}') 133 | _, found = Regex.check_regex_list(_re, list_to_test) 134 | print(f'Match {len(found)} paths !') 135 | dict_found[_re] = found 136 | all_found = all_found.union(set(found)) 137 | data = {'regex': list(dict_found.keys()), 'count': [len(x) for x in list(dict_found.values())]} 138 | 139 | df_stat = pd.DataFrame(data) 140 | print(f"Coverage {str(df_stat['count'].sum())} {round(100 * df_stat['count'].sum() / len(list_to_test), 2)} % ") 141 | if pickle_save: 142 | create_folder(pickle_save) 143 | with open(pickle_save, 'wb') as handle: 144 | pickle.dump(dict_found, handle, protocol=pickle.HIGHEST_PROTOCOL) 145 | print(f'Results saved in {pickle_save}') 146 | return all_found, dict_found, df_stat 147 | 148 | @staticmethod 149 | def keep_path_with_folders(df, th=1): 150 | """ 151 | To avoid FP, sometimes we want to catch 'long' URLs, we more than one folder inside the path 152 | :param df: DataFrame 153 | :param th: number of folders minimum 154 | :return: DataFrame filtered 155 | """ 156 | logger.info('Shape : {}'.format(df.shape)) 157 | result = df[df['folder_count'] > th] 158 | logger.info('Shape with at least {} folder(s) : {} (-{} %)'.format(th, result.shape, 159 | round(100 - 100 * result.shape[0] / df.shape[ 160 | 0], 161 | 2))) 162 | return result 163 | 164 | @staticmethod 165 | def clean_wordpress(x, path_len_min): 166 | """ 167 | Wordpress paths create many FP, we filter here the most popular paths 168 | :param x: url 169 | :param path_len_min: path len min 170 | :return: bool 171 | """ 172 | 173 | def return_if_path_len_ok(x, key): 174 | return len(x.replace(key, '')) < path_len_min 175 | 176 | wordpress_dict = {'wp-admin': ['wp-admin/images', 'wp-admin/css', 'wp-admin/js'], 177 | 'wp-includes': ['wp-includes/css', 'wp-includes/js', 'wp-includes/images'], 178 | 'wp-content': ['wp-content/plugins', 'wp-content/themes', 'wp-content/uploads', 179 | 'wp-content/languages/themes', 'wp-content/mu-plugins'], 180 | 'images': ['images/logos.gif', 'images/logo.gif'], 181 | 'contact': ['contact-us'], 182 | 'config.bin': [], 183 | 'admin.php': [], 184 | 'login.php': [], 185 | 'index.php': [], 186 | 'gate.php': [], 187 | '.jpg': [], 188 | '.png': [], 189 | '.php': []} 190 | for wp_key in wordpress_dict: 191 | if wp_key in x: 192 | for wp_key_path in wordpress_dict[wp_key]: 193 | if wp_key_path in x: 194 | return return_if_path_len_ok(x, wp_key_path) 195 | return return_if_path_len_ok(x, wp_key) 196 | return False 197 | 198 | 
@staticmethod 199 | def check_size(df): 200 | """ 201 | To avoid issue with computation, we can specifiy a limit of URLs to keep. 202 | 30k unique paths can use 300GB RAM ... 203 | :param df: DataFrame 204 | :return: DataFrame after filter 205 | """ 206 | if df['path'].nunique() > MAX_PATH: 207 | logger.info('r/!\ NUMBER OF PATHS TOO HIGH {}. WE SAMPLE {} paths'.format(df['path'].nunique(), MAX_PATH)) 208 | return df.sample(MAX_PATH) 209 | return df 210 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Builds, tests, and runs the project ConsoleRegexTurtle. 12 | 13 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build/built-jar.properties: -------------------------------------------------------------------------------- 1 | #Wed, 29 Apr 2015 09:45:34 +0200 2 | 3 | 4 | /home/andrea/NetBeansProjects/RegexGenerator/ConsoleRegexTurtle= 5 | 6 | /home/andrea/NetBeansProjects/RegexGenerator/MaleRegexTree= 7 | 8 | /home/andrea/NetBeansProjects/RegexGenerator/Random\ Regex\ Turtle= 9 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/console/ConsoleRegexTurtle.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/console/ConsoleRegexTurtle.class -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/dto/SimpleConfig.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/dto/SimpleConfig.class -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/ConsoleRegexTurtle.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/ConsoleRegexTurtle.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/README.TXT: -------------------------------------------------------------------------------- 1 | ======================== 2 | BUILD OUTPUT DESCRIPTION 3 | ======================== 4 | 5 | When you build an Java application project that has a main class, the IDE 6 | automatically copies all of the JAR 7 | files on the projects classpath to your projects dist/lib folder. The IDE 8 | also adds each of the JAR files to the Class-Path element in the application 9 | JAR files manifest file (MANIFEST.MF). 
10 | 11 | To run the project from the command line, go to the dist folder and 12 | type the following: 13 | 14 | java -jar "ConsoleRegexTurtle.jar" 15 | 16 | To distribute this project, zip up the dist folder (including the lib folder) 17 | and distribute the ZIP file. 18 | 19 | Notes: 20 | 21 | * If two JAR files on the project classpath have the same name, only the first 22 | JAR file is copied to the lib folder. 23 | * Only JAR files are copied to the lib folder. 24 | If the classpath contains other types of files or folders, these files (folders) 25 | are not copied. 26 | * If a library on the projects classpath also has a Class-Path element 27 | specified in the manifest,the content of the Class-Path element has to be on 28 | the projects runtime path. 29 | * To set a main class in a standard Java project, right-click the project node 30 | in the Projects window and choose Properties. Then click Run and enter the 31 | class name in the Main Class field. Alternatively, you can manually type the 32 | class name in the manifest Main-Class element. 33 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/lib/MaleRegexTree.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/lib/MaleRegexTree.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/lib/Random_Regex_Turtle.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/lib/Random_Regex_Turtle.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/lib/gson-2.2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/lib/gson-2.2.4.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/regexturtle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Executes the command-line version of RegextTurtle; automatically sets the JAVA VM memory size based on the available system memory 3 | MEMSYSTEM=8000 4 | MAXMEM=$(( MEMSYSTEM-512 )) 5 | XMSMEM=$(( MAXMEM/2 )) 6 | echo "System memory:"$MEMSYSTEM "Mbytes" 7 | echo "RegexTurtle is going to use this amount of the system memory:"$MAXMEM "Mbytes" 8 | java -Xmx${MAXMEM}M -Xms${XMSMEM}M -jar "ConsoleRegexTurtle.jar" $@ 9 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-javadoc.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-javadoc.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-sources.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/nblibraries.properties: -------------------------------------------------------------------------------- 1 | libs.CopyLibs.classpath=\ 2 | ${base}/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar 3 | libs.CopyLibs.displayName=CopyLibs Task 4 | libs.CopyLibs.prop-version=2.0 5 | libs.Gson.classpath=\ 6 | ${base}/Gson/gson-2.2.4.jar 7 | libs.Gson.javadoc=\ 8 | ${base}/Gson/gson-2.2.4-javadoc.jar!// 9 | libs.Gson.src=\ 10 | ${base}/Gson/gson-2.2.4-sources.jar!// 11 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/manifest.mf: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | X-COMMENT: Main-Class will be added automatically by build 3 | 4 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/genfiles.properties: -------------------------------------------------------------------------------- 1 | build.xml.data.CRC32=26ed0a59 2 | build.xml.script.CRC32=edeeeee4 3 | build.xml.stylesheet.CRC32=8064a381@1.75.0.48 4 | # This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. 5 | # Do not edit this file. You may delete it but then the IDE will never regenerate such files for you. 
6 | nbproject/build-impl.xml.data.CRC32=26ed0a59 7 | nbproject/build-impl.xml.script.CRC32=4c760918 8 | nbproject/build-impl.xml.stylesheet.CRC32=876e7a8f@1.75.0.48 9 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/private/config.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/nbproject/private/config.properties -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/private/private.properties: -------------------------------------------------------------------------------- 1 | application.args=-t 4 -p 200 -g 500 -e 20.0 -c "interesting evolution" -d testdataset/reduced.json -o ./outputfolder/ 2 | compile.on.save=false 3 | do.depend=false 4 | do.jar=true 5 | javac.debug=true 6 | javadoc.preview=true 7 | user.properties.file=/home/andrea/.netbeans/8.0.1/build.properties 8 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/private/private.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | file:/home/fab/NetBeansProjects/Rilascio%20Month%208/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java 8 | 9 | 10 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/build.xml 11 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/regexturtle.sh 12 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java 13 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/src/it/units/inginf/male/dto/SimpleConfig.java 14 | 15 | 16 | file:/home/fab/Desktop/Month%2012%20Release/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java 17 | file:/home/fab/Desktop/Month%2012%20Release/ConsoleRegexTurtle/src/it/units/inginf/male/console/prova.java 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/project.properties: -------------------------------------------------------------------------------- 1 | annotation.processing.enabled=true 2 | annotation.processing.enabled.in.editor=false 3 | annotation.processing.processors.list= 4 | annotation.processing.run.all.processors=true 5 | annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output 6 | application.title=ConsoleRegexTurtle 7 | application.vendor=fab 8 | build.classes.dir=${build.dir}/classes 9 | build.classes.excludes=**/*.java,**/*.form 10 | # This directory is removed when the project is cleaned: 11 | build.dir=build 12 | build.generated.dir=${build.dir}/generated 13 | build.generated.sources.dir=${build.dir}/generated-sources 14 | # Only compile against the classpath explicitly listed here: 15 | build.sysclasspath=ignore 16 | build.test.classes.dir=${build.dir}/test/classes 17 | build.test.results.dir=${build.dir}/test/results 18 | # Uncomment to specify the preferred debugger connection transport: 19 | #debug.transport=dt_socket 20 | debug.classpath=\ 21 | ${run.classpath} 22 | debug.test.classpath=\ 23 | ${run.test.classpath} 24 | # Files in build.classes.dir which should be excluded from distribution jar 25 | dist.archive.excludes= 26 | 
# This directory is removed when the project is cleaned: 27 | dist.dir=dist 28 | dist.jar=${dist.dir}/ConsoleRegexTurtle.jar 29 | dist.javadoc.dir=${dist.dir}/javadoc 30 | endorsed.classpath= 31 | excludes= 32 | file.reference.WebRegexTurtle-src=../WebRegexTurtle/src 33 | includes=** 34 | jar.compress=false 35 | javac.classpath=\ 36 | ${reference.Random_Regex_Turtle.jar}:\ 37 | ${reference.MaleRegexTree.jar}:\ 38 | ${libs.Gson.classpath} 39 | # Space-separated list of extra javac options 40 | javac.compilerargs= 41 | javac.deprecation=false 42 | javac.processorpath=\ 43 | ${javac.classpath} 44 | javac.source=1.7 45 | javac.target=1.7 46 | javac.test.classpath=\ 47 | ${javac.classpath}:\ 48 | ${build.classes.dir} 49 | javac.test.processorpath=\ 50 | ${javac.test.classpath} 51 | javadoc.additionalparam= 52 | javadoc.author=false 53 | javadoc.encoding=${source.encoding} 54 | javadoc.noindex=false 55 | javadoc.nonavbar=false 56 | javadoc.notree=false 57 | javadoc.private=false 58 | javadoc.splitindex=true 59 | javadoc.use=true 60 | javadoc.version=false 61 | javadoc.windowtitle= 62 | main.class=it.units.inginf.male.console.ConsoleRegexTurtle 63 | manifest.file=manifest.mf 64 | meta.inf.dir=${src.dir}/META-INF 65 | mkdist.disabled=false 66 | platform.active=default_platform 67 | project.license=gpl30 68 | project.MaleRegexTree=../MaleRegexTree 69 | project.Random_Regex_Turtle=../Random Regex Turtle 70 | reference.MaleRegexTree.jar=${project.MaleRegexTree}/dist/MaleRegexTree.jar 71 | reference.Random_Regex_Turtle.jar=${project.Random_Regex_Turtle}/dist/Random_Regex_Turtle.jar 72 | run.classpath=\ 73 | ${javac.classpath}:\ 74 | ${build.classes.dir} 75 | # Space-separated list of JVM arguments used when running the project. 76 | # You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. 
77 | # To set system properties for unit tests define test-sys-prop.name=value: 78 | run.jvmargs= 79 | run.test.classpath=\ 80 | ${javac.test.classpath}:\ 81 | ${build.test.classes.dir} 82 | source.encoding=UTF-8 83 | src.dir=src 84 | test.src.dir=test 85 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | org.netbeans.modules.java.j2seproject 4 | 5 | 6 | ConsoleRegexTurtle 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ./lib/nblibraries.properties 16 | 17 | 18 | 19 | MaleRegexTree 20 | jar 21 | 22 | jar 23 | clean 24 | jar 25 | 26 | 27 | Random_Regex_Turtle 28 | jar 29 | 30 | jar 31 | clean 32 | jar 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/regexturtle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Executes the command-line version of RegextTurtle; automatically sets the JAVA VM memory size based on the available system memory 3 | MEMDATA=$(free -m | grep Mem:) 4 | ARR=($MEMDATA) 5 | MEMSYSTEM=${ARR[1]} 6 | MAXMEM=$(( MEMSYSTEM-512 )) 7 | XMSMEM=$(( MAXMEM/2 )) 8 | echo "System memory:"$MEMSYSTEM "Mbytes" 9 | echo "RegexTurtle is going to use this amount of the system memory:"$MAXMEM "Mbytes" 10 | java -Xmx${MAXMEM}M -Xms${XMSMEM}M -jar "ConsoleRegexTurtle.jar" $@ -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Machine Learning Lab - University of Trieste, 3 | * Italy (http://machinelearning.inginf.units.it/) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 
17 | */ 18 | package it.units.inginf.male.console; 19 | 20 | import com.google.gson.Gson; 21 | import com.google.gson.GsonBuilder; 22 | import it.units.inginf.male.Main; 23 | import it.units.inginf.male.configuration.Configuration; 24 | import it.units.inginf.male.dto.SimpleConfig; 25 | import it.units.inginf.male.inputs.DataSet; 26 | import it.units.inginf.male.inputs.DataSet.Example; 27 | import it.units.inginf.male.outputs.FinalSolution; 28 | import it.units.inginf.male.outputs.Results; 29 | import it.units.inginf.male.postprocessing.BasicPostprocessor; 30 | import it.units.inginf.male.postprocessing.JsonPostProcessor; 31 | import it.units.inginf.male.strategy.ExecutionStrategy; 32 | import it.units.inginf.male.strategy.impl.CoolTextualExecutionListener; 33 | import it.units.inginf.male.utils.Utils; 34 | import java.io.BufferedReader; 35 | import java.io.File; 36 | import java.io.FileInputStream; 37 | import java.io.IOException; 38 | import java.io.InputStreamReader; 39 | import java.util.logging.Level; 40 | import java.util.logging.Logger; 41 | 42 | /** 43 | * Provides a commandline tool for the GP Engine, RandomRegexTurtle. 44 | * 45 | * @author MaleLabTs 46 | */ 47 | public class ConsoleRegexTurtle { 48 | 49 | private static String WARNING_MESSAGE = "\nWARNING\n" 50 | + "The quality of the solution depends on a number of factors, including size and syntactical properties of the learning information.\n" 51 | + "The algorithms embedded in this experimental prototype have always been tested with at least 25 matches over at least 2 examples.\n" 52 | + "It is very unlikely that a smaller number of matches allows obtaining a useful solution.\n"; 53 | 54 | /** 55 | * @param args the command line arguments 56 | */ 57 | public static void main(String[] args) { 58 | SimpleConfig simpleConfiguration = new SimpleConfig(); 59 | 60 | //Set defaults for commandline parameters 61 | simpleConfiguration.datasetName = "./dataset.json"; // -d 62 | simpleConfiguration.outputFolder = "."; // -o 63 | //load simpleconfig defaults 64 | simpleConfiguration.numberOfJobs = 32; // -j 65 | simpleConfiguration.generations = 1000; // -g 66 | simpleConfiguration.numberThreads = 4; // -t 67 | simpleConfiguration.populationSize = 500; //-p 68 | simpleConfiguration.termination = 20; //-e 69 | simpleConfiguration.populateOptionalFields = false; 70 | simpleConfiguration.isStriped = false; 71 | 72 | parseArgs(args, simpleConfiguration); 73 | 74 | try { 75 | simpleConfiguration.dataset = loadDataset(simpleConfiguration.datasetName); 76 | } catch (IOException ex) { 77 | System.out.println("Problem opening the dataset file " + simpleConfiguration.datasetName + "\n"); 78 | Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); 79 | System.exit(1); 80 | } 81 | //Output warning about learning size 82 | String message = null; 83 | int numberPositiveExamples = 0; 84 | for (Example example : simpleConfiguration.dataset.getExamples()) { 85 | if (example.getNumberMatches() > 0) { 86 | numberPositiveExamples++; 87 | } 88 | } 89 | if (simpleConfiguration.dataset.getNumberMatches() < 25 || numberPositiveExamples < 2) { 90 | message = WARNING_MESSAGE; 91 | } 92 | Configuration config = simpleConfiguration.buildConfiguration(); 93 | //change defaults for console usage 94 | config.setPostProcessor(new JsonPostProcessor()); 95 | config.getPostprocessorParameters().put(BasicPostprocessor.PARAMETER_NAME_POPULATE_OPTIONAL_FIELDS, Boolean.toString(simpleConfiguration.populateOptionalFields)); 96 | 
config.setOutputFolderName(simpleConfiguration.outputFolder); 97 | 98 | Results results = new Results(config); 99 | results.setComment(simpleConfiguration.comment); 100 | try { 101 | //This is an optional information 102 | results.setMachineHardwareSpecifications(Utils.cpuInfo()); 103 | } catch (IOException ex) { 104 | Logger.getLogger(ConsoleRegexTurtle.class.getName()).log(Level.SEVERE, null, ex); 105 | } 106 | CoolTextualExecutionListener consolelistener = new CoolTextualExecutionListener(message, config, results); 107 | 108 | long startTime = System.currentTimeMillis(); 109 | ExecutionStrategy strategy = config.getStrategy(); 110 | try { 111 | strategy.execute(config, consolelistener); 112 | } catch (Exception ex) { 113 | Logger.getLogger(ConsoleRegexTurtle.class.getName()).log(Level.SEVERE, null, ex); 114 | } 115 | 116 | if (config.getPostProcessor() != null) { 117 | startTime = System.currentTimeMillis() - startTime; 118 | config.getPostProcessor().elaborate(config, results, startTime); 119 | } 120 | writeBestPerformances(results.getBestSolution(), config.isIsFlagging()); 121 | } 122 | 123 | private static DataSet loadDataset(String dataSetFilename) throws IOException { 124 | FileInputStream fis = new FileInputStream(new File(dataSetFilename)); 125 | InputStreamReader isr = new InputStreamReader(fis); 126 | StringBuilder sb; 127 | try (BufferedReader bufferedReader = new BufferedReader(isr)) { 128 | sb = new StringBuilder(); 129 | String line; 130 | while ((line = bufferedReader.readLine()) != null) { 131 | sb.append(line); 132 | } 133 | } 134 | String json = sb.toString(); 135 | return loadDatasetJson(json); 136 | } 137 | 138 | private static DataSet loadDatasetJson(String jsonDataset) { 139 | Gson gson = new GsonBuilder().disableHtmlEscaping().create(); 140 | DataSet dataset = gson.fromJson(jsonDataset, DataSet.class); 141 | return dataset; 142 | } 143 | 144 | private static void writeBestPerformances(FinalSolution solution, boolean isFlagging) { 145 | if (solution != null) { 146 | System.out.println("Best on learning (JAVA): " + solution.getSolution()); 147 | System.out.println("Best on learning (JS): " + solution.getSolutionJS()); 148 | if (!isFlagging) { 149 | System.out.println("******Stats for Extraction task******"); 150 | System.out.println("******Stats on training******"); 151 | System.out.println("F-measure: " + solution.getTrainingPerformances().get("match f-measure")); 152 | System.out.println("Precision: " + solution.getTrainingPerformances().get("match precision")); 153 | System.out.println("Recall: " + solution.getTrainingPerformances().get("match recall")); 154 | System.out.println("Char precision: " + solution.getTrainingPerformances().get("character precision")); 155 | System.out.println("Char recall: " + solution.getTrainingPerformances().get("character recall")); 156 | System.out.println("******Stats on validation******"); 157 | System.out.println("F-measure " + solution.getValidationPerformances().get("match f-measure")); 158 | System.out.println("Precision: " + solution.getValidationPerformances().get("match precision")); 159 | System.out.println("Recall: " + solution.getValidationPerformances().get("match recall")); 160 | System.out.println("Char precision: " + solution.getValidationPerformances().get("character precision")); 161 | System.out.println("Char recall: " + solution.getValidationPerformances().get("character recall")); 162 | System.out.println("******Stats on learning******"); 163 | System.out.println("F-measure: " + 
solution.getLearningPerformances().get("match f-measure")); 164 | System.out.println("Precision: " + solution.getLearningPerformances().get("match precision")); 165 | System.out.println("Recall: " + solution.getLearningPerformances().get("match recall")); 166 | System.out.println("Char precision: " + solution.getLearningPerformances().get("character precision")); 167 | System.out.println("Char recall: " + solution.getLearningPerformances().get("character recall")); 168 | } else { 169 | System.out.println("******Stats for Flagging task******"); 170 | System.out.println("******Stats on training******"); 171 | System.out.println("Accuracy: " + solution.getTrainingPerformances().get("flag accuracy")); 172 | System.out.println("Fpr: " + solution.getTrainingPerformances().get("flag fpr")); 173 | System.out.println("Fnr: " + solution.getTrainingPerformances().get("flag fnr")); 174 | System.out.println("F-measure: " + solution.getTrainingPerformances().get("flag f-measure")); 175 | System.out.println("Precision: " + solution.getTrainingPerformances().get("flag precision")); 176 | System.out.println("Recall: " + solution.getTrainingPerformances().get("flag recall")); 177 | System.out.println("******Stats on validation******"); 178 | System.out.println("Accuracy: " + solution.getValidationPerformances().get("flag accuracy")); 179 | System.out.println("Fpr: " + solution.getValidationPerformances().get("flag fpr")); 180 | System.out.println("Fnr: " + solution.getValidationPerformances().get("flag fnr")); 181 | System.out.println("F-measure: " + solution.getValidationPerformances().get("flag f-measure")); 182 | System.out.println("Precision: " + solution.getValidationPerformances().get("flag precision")); 183 | System.out.println("Recall: " + solution.getValidationPerformances().get("flag recall")); 184 | System.out.println("******Stats on learning******"); 185 | System.out.println("Accuracy: " + solution.getLearningPerformances().get("flag accuracy")); 186 | System.out.println("Fpr: " + solution.getLearningPerformances().get("flag fpr")); 187 | System.out.println("Fnr: " + solution.getLearningPerformances().get("flag fnr")); 188 | System.out.println("F-measure: " + solution.getLearningPerformances().get("flag f-measure")); 189 | System.out.println("Precision: " + solution.getLearningPerformances().get("flag precision")); 190 | System.out.println("Recall: " + solution.getLearningPerformances().get("flag recall")); 191 | } 192 | } 193 | } 194 | 195 | static private final String HELP_MESSAGE 196 | = "Usage:\n" 197 | + "java -jar ConsoleRegexTurtle -t 4 -p 500 -g 1000 -e 20.0 -c \"interesting evolution\" -x true -d dataset.json -o ./outputfolder/\n" 198 | + "\nOn Linux you can invoke this tool using the alternative script:\n" 199 | + "regexturtle.sh -t 4 -p 500 -g 1000 -e 20.0 -c \"interesting evolution\" -d dataset.json -o ./outputfolder/\n" 200 | + "\nParameters:\n" 201 | + "-t number of threads, default is 2\n" 202 | + "-p population size, default is 500\n" 203 | + "-g maximum number of generations, per Job, default is 1000\n" 204 | + "-j number of Jobs, default is 32\n" 205 | + "-e percentage of the number of generations, defines a threshold for the separate and conquer split criteria: when the best doesn't change for the provided % of generations, the Job evolution separates the dataset.\n" 206 | + " Default is 20%, i.e. 200 generations with the default 1000 generations.\n" 207 | + "-d path of the dataset json file containing the examples, this parameter is mandatory.\n" 208 | + "-o name of the output folder, results.json
is saved into this folder; default is '.'\n" 209 | + "-x boolean, populates an extra field in results file, when 'true' adds all dataset examples in the results file 'examples' field, default is 'false'\n" 210 | + "-s boolean, when 'true' enables dataset striping, striping is an experimental feature, default is disabled: 'false'\n" 211 | + "-c adds an optional comment string\n" 212 | + "-f enables the flagging mode: solves a flagging problem with a separate-and-conquer strategy\n" 213 | + "-h visualizes this help message\n"; 214 | 215 | static private void parseArgs(String[] args, SimpleConfig simpleConfig) { 216 | try { 217 | boolean mandatoryDatasetCheck = true; 218 | if (args.length == 0) { 219 | System.out.println(HELP_MESSAGE); 220 | } 221 | for (int i = 0; i < args.length; i++) { 222 | String string = args[i]; 223 | i = i + 1; 224 | String parameter = args[i]; 225 | switch (string) { 226 | case "-t": 227 | simpleConfig.numberThreads = Integer.valueOf(parameter); 228 | break; 229 | case "-p": 230 | simpleConfig.populationSize = Integer.valueOf(parameter); 231 | break; 232 | case "-d": 233 | simpleConfig.datasetName = parameter; 234 | mandatoryDatasetCheck = false; 235 | break; 236 | case "-o": 237 | simpleConfig.outputFolder = parameter; 238 | break; 239 | case "-g": 240 | simpleConfig.generations = Integer.valueOf(parameter); 241 | break; 242 | case "-j": 243 | simpleConfig.numberOfJobs = Integer.valueOf(parameter); 244 | break; 245 | case "-e": 246 | simpleConfig.termination = Double.valueOf(parameter); 247 | break; 248 | case "-x": 249 | simpleConfig.populateOptionalFields = Boolean.valueOf(parameter); 250 | break; 251 | case "-h": 252 | System.out.println(HELP_MESSAGE); 253 | break; 254 | case "-c": 255 | simpleConfig.comment = parameter; 256 | break; 257 | case "-s": 258 | simpleConfig.isStriped = Boolean.valueOf(parameter); 259 | break; 260 | case "-f": 261 | simpleConfig.isFlagging = true; 262 | i=i-1; //Do not use parameter 263 | break; 264 | } 265 | } 266 | 267 | if (simpleConfig.isStriped && simpleConfig.isFlagging) { 268 | System.out.println("Striping and flagging cannot be enabled toghether.\n" + HELP_MESSAGE); 269 | System.exit(1); 270 | } 271 | 272 | if (mandatoryDatasetCheck) { 273 | System.out.println("Dataset path is needed.\n" + HELP_MESSAGE); 274 | System.exit(1); 275 | } 276 | } catch (RuntimeException ex) { 277 | System.out.println("Problem parsing commandline parameters.\n" + HELP_MESSAGE); 278 | System.out.println("Error details:" + ex.toString()); 279 | System.exit(1); 280 | } 281 | 282 | } 283 | 284 | } 285 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/src/it/units/inginf/male/dto/SimpleConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Machine Learning Lab - University of Trieste, 3 | * Italy (http://machinelearning.inginf.units.it/) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 
14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | */ 18 | package it.units.inginf.male.dto; 19 | 20 | import it.units.inginf.male.configuration.Configuration; 21 | import it.units.inginf.male.configuration.DatasetContainer; 22 | import it.units.inginf.male.generations.EmptyPopulationBuilder; 23 | import it.units.inginf.male.generations.FlaggingNaivePopulationBuilder; 24 | import it.units.inginf.male.generations.TokenizedPopulationBuilder; 25 | import it.units.inginf.male.inputs.DataSet; 26 | import it.units.inginf.male.objective.FlaggingAccuracyPrecisionLengthObjective; 27 | import it.units.inginf.male.selections.best.BasicFlaggingLearningBestSelector; 28 | import it.units.inginf.male.strategy.impl.MultithreadStrategy; 29 | import it.units.inginf.male.terminalsets.FlaggingNgramsTerminalSetBuilder; 30 | import it.units.inginf.male.terminalsets.TokenizedTerminalSetBuilder; 31 | import java.util.Arrays; 32 | import java.util.logging.Logger; 33 | 34 | 35 | /** 36 | * 37 | * @author MaleLabTs 38 | */ 39 | public class SimpleConfig { 40 | //Maximum unmatch_chars/match_chars ratio 41 | //and sets the maximum unmatch_chars/match_chars ratio; this value defines the margin size around the matches 42 | transient private final double STRIPING_DEFAULT_MARGIN_SIZE = 10; 43 | public int numberThreads; 44 | public int numberOfJobs; 45 | public int generations; 46 | public int populationSize; 47 | public DataSet dataset; 48 | public boolean populateOptionalFields; 49 | public boolean isStriped = false; 50 | public boolean isFlagging = false; 51 | 52 | transient public String datasetName; 53 | transient public String outputFolder; 54 | 55 | /** 56 | * Percentange [0,100] of the number of the generations used for the Spared termination 57 | * criteria. 
58 | */ 59 | public double termination = 20.0; 60 | public String comment; 61 | 62 | public Configuration buildConfiguration(){ 63 | assert !(isFlagging&&isStriped); 64 | 65 | // 66 | Configuration configuration = new Configuration(); 67 | configuration.setConfigName("Console config"); 68 | configuration.getEvolutionParameters().setGenerations(generations); 69 | configuration.getEvolutionParameters().setPopulationSize(populationSize); 70 | configuration.setJobs(numberOfJobs); 71 | configuration.getStrategyParameters().put(MultithreadStrategy.THREADS_KEY, String.valueOf(numberThreads)); 72 | 73 | int terminationGenerations = (int)(termination * configuration.getEvolutionParameters().getGenerations() / 100.0); 74 | if(termination==100.0){ 75 | configuration.getStrategyParameters().put("terminationCriteria","false"); 76 | } else { 77 | configuration.getStrategyParameters().put("terminationCriteria","true"); 78 | } 79 | configuration.getStrategyParameters().put("terminationCriteriaGenerations", String.valueOf(terminationGenerations)); 80 | //Added terminationCriteria for the second strategy 81 | configuration.getStrategyParameters().put("terminationCriteria2","false"); 82 | 83 | if(dataset == null){ 84 | throw new IllegalArgumentException("You must define a dataset"); 85 | } 86 | dataset.populateUnmatchesFromMatches(); 87 | DatasetContainer datasetContainer = new DatasetContainer(dataset); 88 | datasetContainer.createDefaultRanges((int) configuration.getInitialSeed()); 89 | //checks if striping is needed 90 | dataset.updateStats(); 91 | if(isStriped){ 92 | Logger.getLogger(this.getClass().getName()).info("Enabled striping."); 93 | datasetContainer.setDataSetsStriped(true); 94 | datasetContainer.setDatasetStripeMarginSize(STRIPING_DEFAULT_MARGIN_SIZE); 95 | datasetContainer.setProposedNormalDatasetInterval(100);//terminationGenerations+50); 96 | } 97 | configuration.setDatasetContainer(datasetContainer); //remind that after setting the DataSetContainer.. we need to update configuration in order to invoke datacontainer update methods 98 | 99 | //FLagging configuration 100 | //is an alternative configuration, experimental, that requires changes into the configuration defaults (extractor configuration) 101 | //Changes: bestSelector, fitness, terminalset builder configuration mod, population builders(?) 102 | configuration.setIsFlagging(isFlagging); 103 | if(this.isFlagging){ 104 | configuration.setStrategy(new MultithreadStrategy()); 105 | configuration.setBestSelector(new BasicFlaggingLearningBestSelector()); 106 | configuration.setObjective(new FlaggingAccuracyPrecisionLengthObjective()); 107 | configuration.setPopulationBuilder(new FlaggingNaivePopulationBuilder()); //disable context generation 108 | configuration.setTerminalSetBuilder(new FlaggingNgramsTerminalSetBuilder()); //disable context generation 109 | //TODO change terminalSet to a more naive version? 
110 | configuration.getTerminalSetBuilderParameters().put("discardWtokens", "false");//Takes significant chars too 111 | configuration.getStrategyParameters().put("isFlagging", "true"); //Enable strategy flagging 112 | //Remove lookarounds 113 | configuration.getOperators().removeAll( 114 | Arrays.asList("it.units.inginf.male.tree.operator.PositiveLookbehind","it.units.inginf.male.tree.operator.NegativeLookbehind", 115 | "it.units.inginf.male.tree.operator.PositiveLookahead", "it.units.inginf.male.tree.operator.NegativeLookahead")); 116 | } 117 | 118 | 119 | 120 | configuration.setup(); //initializes datasetcontainer, populationbuilder and terminalsetbuilder 121 | 122 | return configuration; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/regex/RegexRunner.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/RegexRunner.jar -------------------------------------------------------------------------------- /src/regex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/__init__.py -------------------------------------------------------------------------------- /src/regex/regex.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import subprocess 4 | import os 5 | import shutil 6 | import pandas as pd 7 | from psutil import virtual_memory 8 | 9 | import conf 10 | from src.utils import create_folder 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | 14 | 15 | class Regex(object): 16 | """ 17 | This class handles the creation and testing of regexes on a list of strings. 18 | """ 19 | 20 | def __init__(self, project_name, remove_project=False): 21 | self.project_name = project_name 22 | create_folder(conf.REGEX_FOLDER_OUTPUT) 23 | self.project_folder = os.path.join(conf.REGEX_FOLDER_OUTPUT, project_name) 24 | if remove_project: 25 | shutil.rmtree(self.project_folder, ignore_errors=True, onerror=None) 26 | if not os.path.exists(self.project_folder): 27 | create_folder(self.project_folder) 28 | 29 | def run_with_benign_check(self, _cluster_dict, benign_list, benign_for_retrain=conf.BENIGN_FOR_RETRAIN, 30 | round_max=15, 31 | take_existing_result=False): 32 | """ 33 | Main API. Extracts signatures from a cluster dict and checks the results against a benign list 34 | :param _cluster_dict: Ex: {'cluster_1': {'match': ['pandas', 'gibbon'], 'unmatch': ['monkey']}} 35 | :param benign_list: list of paths to not match 36 | :param benign_for_retrain: benign paths to be added to the regex generation process at each step 37 | :param round_max: Maximum number of rounds for generating the regexes. If -1, run until no FP is found 38 | :param take_existing_result: Load the existing results in self.project_folder and start the round 39 | with the benign samples 40 | :return: a cluster_dict that we should pass to another round if we want to continue the process.
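
        Example (illustrative only; the project name, paths and cluster key below are made up):

            regex = Regex(project_name='demo_project')
            cluster_dict = {'cluster_3_0': {'match': ['/mal/xxx/a.php', '/mal/yyy/a.php', '/mal/zzz/a.php'],
                                            'unmatch': []}}
            remaining = regex.run_with_benign_check(cluster_dict, benign_list=['/index.php', '/images/logo.gif'])
            # 'remaining' contains the clusters whose signatures still match benign paths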
41 | """ 42 | 43 | assert round_max >= 1 44 | benign_list = self.remove_nan_value_from_list(benign_list) 45 | n_path = sum([len(_cluster_dict[cluster]['match']) for cluster in _cluster_dict]) 46 | if take_existing_result: 47 | logger.info("Loading existing signatures in folder {}".format(self.project_folder)) 48 | cluster_result = self.get_cluster_results() 49 | else: 50 | logger.info('Extracting regex for the first time on {} cluster(s)!'.format(len(_cluster_dict))) 51 | cluster_result = self.run_cluster_dict(_cluster_dict) 52 | old_cluster_dict = _cluster_dict 53 | old_cluster_result = cluster_result 54 | cluster_sig_dict = {k: [v] for k, v in cluster_result.items()} 55 | _round = 0 56 | while len(old_cluster_dict) > 0: 57 | if _round == round_max: 58 | break 59 | logger.info(f'Starting Round {str(_round + 1)}') 60 | new_cluster_dict = self.test_cluster_dict(old_cluster_dict, old_cluster_result, benign_list, 61 | limit=benign_for_retrain) 62 | if len(new_cluster_dict) == 0: 63 | if _round == 0: 64 | logger.info("It is too good to be true. I kill the process") 65 | return 'error' 66 | logger.info(f'Creating regexes for {str(len(new_cluster_dict))} cluster(s) : {list(new_cluster_dict)}') 67 | 68 | cluster_result_new = self.run_cluster_dict(new_cluster_dict) 69 | old_cluster_dict = new_cluster_dict 70 | 71 | for cluster in new_cluster_dict: 72 | cluster_sig_dict[cluster].append(cluster_result_new[cluster]) 73 | _round += 1 74 | old_cluster_result = cluster_result_new 75 | last_cluster_dict = self.test_cluster_dict(_cluster_dict, self.get_cluster_results(), benign_list, 76 | limit=1000000) 77 | cluster_with_no_fp = {k: _cluster_dict[k] for k in set(_cluster_dict) - set(last_cluster_dict)} 78 | fp_stat = "" 79 | no_fp_cluster = [] 80 | for cluster in cluster_result: 81 | if cluster in last_cluster_dict: 82 | fp_rate = round(len(last_cluster_dict[cluster]['unmatch']) * 100 / len(benign_list), 3) 83 | fp_stat += f""" {cluster} - {str(len(last_cluster_dict[cluster]['unmatch']))} FP ( {fp_rate} % """ 84 | fp_stat += '\n' 85 | else: 86 | no_fp_cluster.append(cluster) 87 | n_path_no_fp = sum([len(cluster_with_no_fp[k]['match']) for k in cluster_with_no_fp]) 88 | cluster_path_stat = '' 89 | for cluster in dict(sorted(cluster_sig_dict.items(), key=lambda x: len(x[1]))): 90 | cluster_path_stat += f'{cluster} : ' + ' ---> '.join(cluster_sig_dict[cluster]) + '\n' 91 | summary_stat = f""" 92 | #### Summary #### 93 | 94 | Init\n: 95 | N cluster : {len(_cluster_dict)} 96 | N paths: {str(n_path)} 97 | N benign in final test: {len(benign_list)} 98 | Benign number for retraining : {benign_for_retrain} 99 | N round: {round_max} 100 | 101 | Cluster sig paths: 102 | 103 | {cluster_path_stat} 104 | 105 | After final testing: 106 | Cluster with 0 FP: {set(cluster_with_no_fp)} 107 | Number of paths covered with 0 FP: {n_path_no_fp} 108 | Percentage of paths covered with 0 FP: {round(100 * n_path_no_fp / n_path, 2)} % 109 | 110 | ### FP Report ### 111 | 112 | With FP : 113 | 114 | {fp_stat} 115 | 116 | Without: 117 | 118 | {no_fp_cluster} 119 | 120 | """ 121 | logger.info(summary_stat) 122 | return last_cluster_dict 123 | 124 | def test_cluster_dict(self, _cluster_dict, cluster_result, benign_list, limit=10): 125 | """ 126 | 127 | :param _cluster_dict: cluster_dict 128 | :param cluster_result: cluster result. Ex {'cluster_1': 'a{3}b+'} 129 | :param benign_list: list of benign 130 | :param limit: int. Stop testing after limit catches 131 | :return: a cluster_dict that contains only the cluster that have FP. 
132 | """ 133 | new_cluster_dict = {} 134 | for cluster in _cluster_dict: 135 | benign_match = self.check_regex_list(cluster_result[cluster], 136 | list(set(benign_list) - set(_cluster_dict[cluster]['unmatch'])), 137 | limit=limit)[1] 138 | 139 | if len(benign_match) == 0: 140 | logger.info( 141 | 'Signature on cluster {} ( {}) has no benign match !'.format(cluster, cluster_result[cluster])) 142 | continue 143 | logger.info('{} benign match in {}'.format(len(benign_match), cluster)) 144 | new_cluster_dict[cluster] = {'match': _cluster_dict[cluster]['match'], 145 | 'unmatch': benign_match + _cluster_dict[cluster]['unmatch']} 146 | return new_cluster_dict 147 | 148 | def run_cluster_dict(self, _cluster_dict): 149 | """ 150 | Run regex extraction from a cluster dict 151 | :param _cluster_dict: cluster dict 152 | :return: cluster results 153 | """ 154 | for cluster in _cluster_dict: 155 | logger.info(f'Creating regex for cluster {cluster}') 156 | self.create_regex(_cluster_dict[cluster]['match'], cluster_name=cluster, 157 | str_to_not_match=_cluster_dict[cluster]['unmatch']) 158 | cluster_results = self.get_cluster_results() # Here we load all the results ! 159 | logger.info(f'Cluster results {cluster_results}') 160 | return cluster_results 161 | 162 | def create_regex(self, str_list, cluster_name, str_to_not_match=None): 163 | """ 164 | Create regex from a string list and list to not match 165 | :param str_list: list 166 | :param cluster_name: str 167 | :param str_to_not_match: list 168 | :return: void 169 | """ 170 | str_to_not_match = self.remove_nan_value_from_list(str_to_not_match) 171 | json_path = self.create_json_cluster([x + '.' for x in str_list], [x + '.' for x in str_to_not_match], 172 | name=cluster_name) 173 | self.run_regex_extraction(json_path, output_folder=conf.REGEX_TMP) 174 | self.__move_results(cluster_name) 175 | 176 | def __move_results(self, cluster_name): 177 | """ 178 | Move results from tmp to the folder of the project 179 | :param cluster_name: name of the cluster 180 | :return: void 181 | """ 182 | output_file = os.listdir(conf.REGEX_TMP)[0] 183 | shutil.move(os.path.join(conf.REGEX_TMP, output_file), 184 | os.path.join(self.project_folder, 'results_' + cluster_name + '.json')) 185 | 186 | def get_cluster_results(self): 187 | """ 188 | Parse the results outputed by the regex creation process 189 | :return: cluster results 190 | """ 191 | results = {} 192 | for file in os.listdir(self.project_folder): 193 | if not file.startswith('results'): 194 | continue 195 | file_splitted = file.split('_') 196 | 197 | cluster_name = '_'.join([file_splitted[1], file_splitted[2], file_splitted[3]]).replace(".json", "") 198 | with open(os.path.join(self.project_folder, file)) as json_file: 199 | cluster_result = json.load(json_file) 200 | # results[cluster_name] = cluster_result['bestSolution']["solutionJS"] 201 | results[cluster_name] = cluster_result['bestSolution']["solution"] 202 | return results 203 | 204 | @staticmethod 205 | def run_regex_extraction(json_to_extract, output_folder=conf.REGEX_TMP, 206 | mem=round(virtual_memory().available / 10 ** 9) * 1000, threads=None): 207 | """ 208 | Run the Java process that creates the regex 209 | :param json_to_extract: json 210 | :param output_folder: output folder 211 | :param mem: memory to use. By default almost all the memory available 212 | :param threads: number of threads. For example mp.cpu_count(). 
/!\ Can create memory issue 213 | :return: void 214 | """ 215 | 216 | args = ['java', '-Xmx{}M'.format(mem), '-Xms{}M'.format(int(mem / 2)), '-jar', conf.REGEX_JAVA, "-d", 217 | json_to_extract, "-o", output_folder] 218 | if threads: 219 | args += ["-t", str(threads)] 220 | try: 221 | logger.info(f'Running subprocess with input {json_to_extract}') 222 | subprocess.run(args) 223 | except subprocess.TimeoutExpired: 224 | print(f'Timeout reached for input {json_to_extract}') # it should not take more than 1 hour 225 | 226 | def create_json_cluster(self, str_to_match, str_to_not_match=None, name='urls_cluster', description='luda'): 227 | """ 228 | Create the input for the Java process 229 | :param str_to_match: list of str 230 | :param str_to_not_match: list of str 231 | :param name: name of the json 232 | :param description: str 233 | :return: path where the json was created 234 | """ 235 | examples = [] 236 | for el in str_to_match: 237 | examples.append({ 238 | "string": el, 239 | "match": [{"start": 0, "end": len(el) - 1}], 240 | "unmatch": []}) 241 | if len(str_to_not_match) > 0: 242 | for el in str_to_not_match: 243 | examples.append({ 244 | "string": el, 245 | "match": [], 246 | "unmatch": [{"start": 0, "end": len(el) - 1}]}) 247 | 248 | result = { 249 | "name": name, 250 | "description": description, 251 | "regexTarget": "", 252 | "examples": examples} 253 | json_path = os.path.join(self.project_folder, 'input_' + name + '.json') 254 | with open(json_path, 'w') as f: 255 | json.dump(result, f) 256 | return json_path 257 | 258 | @staticmethod 259 | def check_regex_list(sig, path_list, limit=9999999): 260 | """ 261 | Check a regex against of list of str 262 | :param sig: str 263 | :param path_list: list 264 | :param limit: int 265 | :return: tuple 266 | """ 267 | match = 0 268 | urls_match = [] 269 | batch_size = conf.TEST_BATCH_SIZE 270 | for i in range(0, len(path_list), batch_size): 271 | if match > limit: 272 | break 273 | batch = path_list[i:i + batch_size] 274 | # res = js_regex.compile(sig).search(r'{}'.format(path)) 275 | result_list = Regex.run_regex_java(sig, batch) 276 | for j, match_bool in enumerate(result_list): 277 | if match >= limit: 278 | break 279 | if not match_bool: 280 | continue 281 | detected_path = path_list[i + j] 282 | if match < 2: # We want to print only some examples 283 | # logger.info('Match on {}'.format(res.group(0))) 284 | logger.info('Match on {}'.format(detected_path)) 285 | urls_match.append(detected_path) 286 | match += 1 287 | 288 | return match, urls_match 289 | 290 | def create_result_report(self, output_file=None): 291 | if not output_file: 292 | output_file = os.path.join(self.project_folder, f'report_{self.project_name}.csv') 293 | list_of_dict = list() 294 | for file in os.listdir(self.project_folder): 295 | 296 | if 'results' in file: 297 | with open(os.path.join(self.project_folder, file)) as json_file: 298 | cluster_result = json.load(json_file) 299 | with open(os.path.join(self.project_folder, file.replace('results', 'input'))) as json_file: 300 | cluster_input = json.load(json_file) 301 | malicious = 0 302 | benign = 0 303 | example_to_keep = None 304 | for example in cluster_input['examples']: 305 | if len(example['match']) > 0: 306 | if not example_to_keep: 307 | example_to_keep = example['string'] 308 | malicious += 1 309 | else: 310 | benign += 1 311 | tmp = {'name': file.replace('results_', '').replace('.json', ''), 312 | 'regex_js': cluster_result['bestSolution']['solutionJS'], 313 | 'regex_java': 
cluster_result['bestSolution']['solution'], 314 | 'malicious': malicious, 315 | 'benign': benign, 316 | 'round': benign // conf.BENIGN_FOR_RETRAIN, 317 | 'example_malicious': example_to_keep, 318 | 'results_file': file, 319 | 'input_file': file.replace('results', 'input')} 320 | list_of_dict.append(tmp.copy()) 321 | df = pd.DataFrame(list_of_dict) 322 | df.to_csv(output_file) 323 | return df 324 | 325 | @staticmethod 326 | def run_regex_java(regex, list_string): 327 | """ 328 | Run Regex on list of string with Java code 329 | :param regex: regex Java 330 | :param list_string: string to test 331 | :return: list of String ie ['true', 'false', 'false'] 332 | """ 333 | with open(conf.INPUT_REGEX_RUNNER, 'w') as f: 334 | json.dump({'to_test': [x for x in list_string if str(x) != 'nan']}, f) 335 | command = ['java', '-jar', conf.REGEX_RUNNER, regex, conf.INPUT_REGEX_RUNNER, conf.OUTPUT_REGEX_RUNNER] 336 | try: 337 | subprocess.run(command, stdout=subprocess.PIPE) 338 | except subprocess.TimeoutExpired: 339 | print(f'Timeout reached for regex {regex}') # it should not take more than 1 hour ! 340 | with open(conf.OUTPUT_REGEX_RUNNER, 'r') as f: 341 | result = json.load(f) 342 | 343 | return result['results'] 344 | 345 | @staticmethod 346 | def remove_nan_value_from_list(_list): 347 | result = [] 348 | for el in _list: 349 | if str(el) == 'nan': 350 | logger.warning('You have nan value in your list !!. List extract {}'.format(_list[:3])) 351 | continue 352 | result.append(el) 353 | return result 354 | 355 | -------------------------------------------------------------------------------- /src/regex/regexturtle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Executes the command-line version of RegextTurtle; automatically sets the JAVA VM memory size based on the available system memory 3 | MEMSYSTEM=8000 4 | MAXMEM=$(( MEMSYSTEM-512 )) 5 | XMSMEM=$(( MAXMEM/2 )) 6 | echo "System memory:"$MEMSYSTEM "Mbytes" 7 | echo "RegexTurtle is going to use this amount of the system memory:"$MAXMEM "Mbytes" 8 | java -Xmx${MAXMEM}M -Xms${XMSMEM}M -jar "ConsoleRegexTurtle.jar" $@ 9 | -------------------------------------------------------------------------------- /src/regex/src_regexrunner/regexrunner/JsonOperation.java: -------------------------------------------------------------------------------- 1 | package regexrunner; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | 6 | import org.json.simple.JSONArray; 7 | import org.json.simple.JSONObject; 8 | 9 | 10 | 11 | import org.json.simple.parser.JSONParser; 12 | import org.json.simple.parser.ParseException; 13 | 14 | import java.io.FileReader; 15 | import java.io.Reader; 16 | import java.util.ArrayList; 17 | import java.util.Iterator; 18 | public class JsonOperation { 19 | 20 | 21 | 22 | public static ArrayList read(String jsonPath) throws IOException, ParseException { 23 | 24 | ArrayList result = new ArrayList(); 25 | 26 | JSONParser parser = new JSONParser(); 27 | Reader reader = new FileReader(jsonPath); 28 | 29 | JSONObject jsonObject = (JSONObject) parser.parse(reader); 30 | 31 | 32 | // loop array 33 | JSONArray msg = (JSONArray) jsonObject.get("to_test"); 34 | Iterator iterator = msg.iterator(); 35 | while (iterator.hasNext()) { 36 | result.add(iterator.next());} 37 | return result; 38 | } 39 | 40 | 41 | 42 | public static void write(String sig, ArrayList listOfString, String jsonPath) { 43 | int i; 44 | JSONObject obj = new JSONObject(); 45 | 46 | JSONArray list = new 
JSONArray(); 47 | for (i = 0; i < listOfString.size(); i++) { 48 | list.add(Regex.test(sig, listOfString.get(i))); 49 | 50 | // accessing each element of array 51 | } 52 | 53 | obj.put("results", list); 54 | 55 | try (FileWriter file = new FileWriter(jsonPath)) { 56 | file.write(obj.toJSONString()); 57 | } catch (IOException e) { 58 | e.printStackTrace(); 59 | } 60 | 61 | System.out.print("Json written in " + jsonPath); 62 | 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/regex/src_regexrunner/regexrunner/Main.java: -------------------------------------------------------------------------------- 1 | package regexrunner; 2 | 3 | 4 | import org.json.simple.JSONArray; 5 | import org.json.simple.JSONObject; 6 | import org.json.simple.parser.ParseException; 7 | 8 | import java.io.FileWriter; 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | 12 | import regexrunner.JsonOperation; 13 | 14 | 15 | /** 16 | This code parses a list of strings, tests them against a regex and 17 | writes the results to a file 18 | * @author jordang 19 | * @version 1.0 20 | * @since 2020-06-04 21 | */ 22 | 23 | 24 | public class Main{ 25 | 26 | public static void main(String[] args) throws IOException, ParseException { 27 | /** args: regex, input json, output json */ 28 | 29 | ArrayList stringList = JsonOperation.read(args[1]); 30 | JsonOperation.write(args[0], stringList, args[2]); 31 | 32 | } 33 | 34 | } -------------------------------------------------------------------------------- /src/regex/src_regexrunner/regexrunner/Regex.java: -------------------------------------------------------------------------------- 1 | package regexrunner; 2 | 3 | import java.util.ArrayList; 4 | import java.util.regex.*; 5 | 6 | 7 | public class Regex { 8 | 9 | 10 | 11 | public static boolean test(String sig, String my_string) { 12 | Pattern pattern = Pattern.compile(sig); 13 | Matcher matcher = pattern.matcher(my_string); 14 | if (matcher.find()) 15 | return true; 16 | return false; 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/use_case/use_case_clustering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | import logging 4 | import os 5 | import numpy as np 6 | 7 | from src.clustering.distance_matrix import DistanceMatrix 8 | from src.clustering.metrics import DISTANCE_FUNC 9 | 10 | import conf 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | 14 | """ 15 | ### Note about the following FP_LIST ### 16 | 17 | 1. FP (benign) paths should be added at the data step OR filtered at the preprocessing step. If you 18 | don't want (bad practice) to do it there, you can add your benign paths here. 19 | 2. The MAIN use case of the following list is to add benign paths AFTER having run the distance matrix computation 20 | and seen some FP in your clusters. You can filter them here and run a new clustering. 21 | 22 | 23 | """ 24 | 25 | 26 | FP_LIST = ['/wp-content', '/index.php', '/wp-includes', '/gate.php', '/admin.php', '/wp-admin', 27 | '/wp-content/uploads', 28 | '/images/logo.gif', '/login.php'] 29 | 30 | 31 | # If you don't want to rerun the feeders, you can still filter some paths here, but we advise doing it at the data or preprocessing step instead 32 | 33 | def get_clusterer(clusterer_dict): 34 | """ 35 | Return a clustering object for the given config. We advise using DBSCAN. 36 | :param clusterer_dict: dict with clustering parameters.
E.g {"dbscan": {"eps": 10, "min_samples": 8}} 37 | :return: clustering object 38 | """ 39 | if not clusterer_dict: 40 | clusterer_dict = {"dbscan": {"eps": 10, "min_samples": 8}} 41 | if 'dbscan' in clusterer_dict: 42 | from sklearn.cluster import DBSCAN # We import here so we don't need to install something we don't need 43 | return DBSCAN(**clusterer_dict['dbscan'], metric='precomputed') 44 | elif 'hdbscan' in clusterer_dict: 45 | import hdbscan 46 | return hdbscan.HDBSCAN(**clusterer_dict['dbscan'], metric='precomputed') 47 | elif 'complete' in clusterer_dict: 48 | from sklearn.cluster import AgglomerativeClustering 49 | return AgglomerativeClustering(**clusterer_dict['complete'], affinity='precomputed') 50 | 51 | 52 | class UseCaseClustering(object): 53 | 54 | def run(self, file_path, skip_compute_distance=False, save_folder=None, 55 | clusterer=None, filter_th=10): 56 | """ 57 | Compute the distance between URL and cluster them 58 | :param file_path: path of the csv preprocessed 59 | :param skip_compute_distance: bool. If true, does only the clustering 60 | :param save_folder: path. Mandatory if we skip the computation step 61 | :param clusterer: clustering technique 62 | :param filter_th: threshold used to clean the matrix in the function __filter_outlier_and_fp 63 | :return: void 64 | """ 65 | if skip_compute_distance: 66 | assert save_folder is not None 67 | distance_matrix_object = DistanceMatrix.load(save_folder) 68 | else: 69 | df_features = pd.read_csv(file_path) 70 | df_features = df_features[df_features['label'] == 'malicious'] 71 | logger.info(f"For this step, we do not use the " 72 | f" {df_features[df_features['label'] == 'malicious']['path'].nunique()} benign paths ") 73 | df_features = df_features[~df_features['path'].isin(FP_LIST)] 74 | path_list = list(df_features['path'].unique()) # We take only unique !!! 75 | distance_matrix_object = DistanceMatrix(path_list, 76 | distance_func=DISTANCE_FUNC['sw'], folder=save_folder) 77 | distance_matrix_object.run() 78 | distance_matrix_object.matrix = distance_matrix_object.matrix.astype(np.double) 79 | index_to_keep, matrix_filtered = self.__filter_outlier_and_fp(distance_matrix_object.matrix, filter_th, 80 | distance_matrix_object.url_list) 81 | distance_matrix_object.matrix = matrix_filtered 82 | logger.info('We begin the clustering !') 83 | distance_matrix_object.matrix = self.distance_from_sim(distance_matrix_object.matrix) 84 | clust = get_clusterer(clusterer) 85 | clust.fit( 86 | distance_matrix_object.matrix) # we need to pass distance matrix instead of similarity 87 | logger.info('Clustering done') 88 | with open(os.path.join(distance_matrix_object.folder, 'labels.pkl'), 'wb') as f: 89 | pickle.dump(clust.labels_, f) 90 | with open(os.path.join(distance_matrix_object.folder, 'index_to_keep.pkl'), 'wb') as f: 91 | pickle.dump(index_to_keep, f) 92 | logger.info('You can find the results at {}'.format(distance_matrix_object.folder)) 93 | 94 | @staticmethod 95 | def __filter_outlier_and_fp(mat, th, path_list=None): 96 | """ 97 | Clean the matrix from outlier and FP 98 | :param mat: matrix 99 | :param th: int. If a row does not contain a value higher than th, it will be filtered. 100 | :param path_list: list 101 | :return: tuple. 
(list of indexes not filtered, the new matrix filtered) 102 | """ 103 | index_to_remove = [] 104 | if path_list: 105 | for fp in FP_LIST: 106 | if fp in path_list: 107 | index_to_remove.append(path_list.index(fp)) 108 | logger.info('Matrix size before filter {}'.format(mat.shape)) 109 | new_matrix = [] 110 | index_to_keep = [] 111 | for i, el in enumerate(mat): 112 | if i in index_to_remove: 113 | continue 114 | if el.max() >= th: 115 | new_matrix.append(el) 116 | index_to_keep.append(i) 117 | 118 | new_matrix = np.vstack(new_matrix) 119 | new_matrix = new_matrix[:, index_to_keep] 120 | logger.info('Matrix size after filter {}'.format(new_matrix.shape)) 121 | 122 | return index_to_keep, np.vstack(new_matrix) 123 | 124 | @staticmethod 125 | def distance_from_sim(matrix): 126 | """ 127 | 128 | Invert linearly the numbers. Shift min --> max and max--> min. Ensure than the diagonal is 0 129 | The goal is to transform a similarity matrix into a distance matrix. 130 | :param matrix: matrix 2d 131 | :return: matrix 2d 132 | """ 133 | high = conf.SIMILARITY_MAX 134 | result = np.abs(high - matrix) 135 | np.fill_diagonal(result, 0) # we ensure that the edit distance with an element and itself is 0 136 | return result 137 | -------------------------------------------------------------------------------- /src/use_case/use_case_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | import os 4 | 5 | import conf 6 | from src.feeder.feed_downloader import Url 7 | 8 | logger = logging.getLogger(conf.LOGGER_NAME) 9 | 10 | 11 | class UseCaseData(object): 12 | 13 | def run(self, main_file, additional_sources): 14 | final_df = pd.DataFrame(columns=list(Url.__annotations__)) 15 | if os.path.exists(main_file): 16 | df = pd.read_csv(main_file) 17 | assert list(df) == list(Url.__annotations__) 18 | logger.info( 19 | f'{main_file} already exists. We load it and concatenate it with the additional sources ( if exists)') 20 | final_df = pd.concat([final_df, df]) 21 | for path_label in additional_sources: 22 | final_df = pd.concat([final_df, self.get_basic_format_df(path_label['path'], path_label['label'])]) 23 | final_df.to_csv(main_file, index=False) 24 | return final_df 25 | 26 | def get_basic_format_df(self, file, label): 27 | self.check_label(label=label) 28 | df = self.load_df(file) 29 | df['label'] = label.lower() 30 | new_df = pd.DataFrame(df['url']) 31 | other_columns = list(Url.__annotations__) 32 | other_columns.remove('url') 33 | for column in other_columns: 34 | if column in df.columns: 35 | new_df = pd.concat([new_df, df[column]], axis=1) 36 | else: 37 | logger.warning(f'Column {column} not found in {file}. Setting it to None') 38 | new_df[column] = None 39 | logger.info(f"{new_df['url'].nunique()} unique URLs loaded from {file}") 40 | return new_df 41 | 42 | @staticmethod 43 | def check_label(label): 44 | if label.lower() not in conf.DATA_LABELS: 45 | raise Exception('You should specify a label ( malicious or benign) for you data sources') 46 | 47 | def load_df(self, file_path): 48 | SEP = [',', '\t'] # you put here several sep if you have different formats 49 | for sep in SEP: 50 | try: 51 | df = pd.read_csv(file_path, sep=sep, error_bad_lines=False) 52 | try: 53 | self.check_columns(df) 54 | except Exception: # maybe with the next sep, it will work. 
55 | continue 56 | return df 57 | except Exception as e: 58 | raise Exception(f'Failed loading for file {file_path}, {e}') 59 | raise Exception(f'Failed loading for file {file_path}') 60 | 61 | @staticmethod 62 | def check_columns(df): 63 | MANDATORY_COLUMNS = ['url'] 64 | for col in MANDATORY_COLUMNS: 65 | if col not in df.columns: 66 | raise Exception(f"You should have a column named {col} at least") 67 | -------------------------------------------------------------------------------- /src/use_case/use_case_feeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import logging 4 | 5 | from src.utils import create_folder 6 | from src.utils import df_to_url_list 7 | from src.feeder.feed_downloader import FeedDownloader, Url 8 | import conf 9 | 10 | logger = logging.getLogger(conf.LOGGER_NAME) 11 | 12 | 13 | class UseCaseFeeder(object): 14 | 15 | @staticmethod 16 | def fetch(sources): 17 | """ 18 | Once you create your feeder, add it here to call it directly from config.json 19 | :param sources: source list 20 | :return: url list object 21 | """ 22 | url_list = [] 23 | for source in sources: 24 | if source == 'urlhaus': 25 | from src.feeder.urlhaus_feed_downloader import URLHausFeedDownloader 26 | feeder_object = URLHausFeedDownloader() 27 | elif source == 'openfish': 28 | from src.feeder.openfish_feed_downloader import OpenPhishFeedDownloader 29 | feeder_object = OpenPhishFeedDownloader() 30 | elif source == 'alexa': 31 | from src.feeder.alexa_feed_downloader import AlexaFeedDownloader 32 | feeder_object = AlexaFeedDownloader() 33 | elif source == 'majestic': 34 | from src.feeder.majestic_feed_downloader import MajesticFeedDownloader # class name assumed from the other feeders' naming convention 35 | feeder_object = MajesticFeedDownloader() 36 | elif source == 'umbrella': 37 | from src.feeder.umbrella_feed_downloader import UmbrellaFeedDownloader 38 | feeder_object = UmbrellaFeedDownloader() 39 | elif source == 'iscx': 40 | from src.feeder.iscx_feed_downloader import IscxFeedDownloader 41 | feeder_object = IscxFeedDownloader() 42 | elif source == 'vt': 43 | from src.feeder.vt_feed_downloader import VtFeedDownloader 44 | feeder_object = VtFeedDownloader() 45 | else: 46 | continue 47 | url_list += feeder_object.run() 48 | 49 | return url_list 50 | 51 | @staticmethod 52 | def fetch_and_save(sources, filename): 53 | list_of_urls = UseCaseFeeder.fetch(sources) 54 | if os.path.exists(filename): 55 | logger.info(f'Found an existing {filename}. We append the feeders\' results to this file.') 56 | df = pd.read_csv(filename) 57 | assert list(df) == list(Url.__annotations__) 58 | list_of_urls += df_to_url_list(df) 59 | create_folder(filename) 60 | FeedDownloader.save_to_csv(list_of_urls, filename) 61 | -------------------------------------------------------------------------------- /src/use_case/use_case_preprocessor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import conf 4 | 5 | logger = logging.getLogger(conf.LOGGER_NAME) 6 | 7 | 8 | class UseCasePreprocessor(object): 9 | """ 10 | Add your preprocessing technique here, following the syntax of the "basic" one.
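
    For example, a new technique could be wired in with one more branch in run() below (the name
    'my_preprocessor', the module and the class are hypothetical placeholders):

        elif preprocess_name == 'my_preprocessor':
            from src.preprocessor.preprocessor_my_preprocessor import PreprocessorMyPreprocessor
            PreprocessorMyPreprocessor().run(file_path)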
11 | """ 12 | @staticmethod 13 | def run(preprocess_name, file_path): 14 | if preprocess_name == 'basic': 15 | from src.preprocessor.preprocessor_basic import PreprocessorBasic 16 | PreprocessorBasic().run(file_path) 17 | -------------------------------------------------------------------------------- /src/use_case/use_case_regex_generation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import pandas as pd 4 | 5 | import conf 6 | 7 | import logging 8 | 9 | logger = logging.getLogger(conf.LOGGER_NAME) 10 | 11 | 12 | class UseCaseRegexGeneration(object): 13 | def __init__(self, regex_object): 14 | self.regex_object = regex_object 15 | 16 | def run(self, main_file, features_folder, cluster_list, benign_for_retrain=20, 17 | take_existing_result=False, round_max=15, min_path_for_run=1): 18 | """ 19 | Take the data from the clustering step and extract the clusters 20 | :param main_file: file containing all the data. 21 | :param features_folder: folder_path with the features. 22 | :param cluster_list: list of cluster number to process 23 | :param benign_for_retrain: int. Number of str to not match to take into account at each round 24 | :param take_existing_result: bool. if True, will start from the last result. It allows us to run several rounds 25 | not continuously 26 | :param round_max: int. Number of round max before abandoning the cluster because of the FP 27 | :param min_path_for_run: int. minimal number of paths to run the regex extraction process 28 | :return: void 29 | """ 30 | 31 | df, df_benign = self.load_df(main_file, features_folder) 32 | str_to_not_match = df_benign['path'].unique() 33 | if len(df) < min_path_for_run: 34 | logger.error( 35 | 'Not enough path to start the clustering. Paths: {} , Min paths : {}'.format(len(df), min_path_for_run)) 36 | return 37 | cluster_dict = {} 38 | if len(cluster_list) == 0: 39 | cluster_list = df['cluster'].unique() 40 | logger.info('No cluster number given. Creating regex for all cluster.') 41 | for cluster in cluster_list: 42 | if cluster == -1: 43 | continue 44 | cluster_urls = list(df[df['cluster'] == cluster]['path'].unique()) 45 | cluster_dict['cluster_' + str(len(cluster_urls)) + '_' + str(cluster)] = {'match': cluster_urls, 46 | 'unmatch': []} 47 | self.regex_object.run_with_benign_check(_cluster_dict=cluster_dict, benign_list=str_to_not_match, 48 | benign_for_retrain=benign_for_retrain, 49 | take_existing_result=take_existing_result, round_max=round_max) 50 | self.regex_object.create_result_report() 51 | 52 | @staticmethod 53 | def load_df(main_file, features_folder): 54 | """ 55 | Load the df with the labels and the cleaning done in the clustering phase. 56 | :param main_file: main csv file. 57 | :param features_folder: folder_path with the features. 
58 | :return: DataFrame 59 | """ 60 | df = pd.read_csv(main_file).drop_duplicates(['path']) # we ensure that everything is unique 61 | df_benign = df[df['label'] == 'benign'] 62 | df = df[df['label'] == 'malicious'] 63 | with open(os.path.join(features_folder, conf.INDEX_TO_KEEP), 'rb') as f: 64 | index_to_keep = pickle.load(f) 65 | with open(os.path.join(features_folder, conf.LABELS), 'rb') as f: 66 | labels = pickle.load(f) 67 | df = df.iloc[index_to_keep, :] 68 | df['cluster'] = labels 69 | return df, df_benign 70 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import conf 5 | from src.feeder.feed_downloader import Url 6 | 7 | 8 | def create_folder(path): 9 | """ 10 | Create folder if it does not exist. If the function gets a file, it will create all the folders before 11 | :param path: path of a file or folders. Can also be a list 12 | :return: void 13 | """ 14 | 15 | def delete_one_folder(_path): 16 | if "." in os.path.basename(_path): # Check if it's a file 17 | _path = os.path.dirname(_path) 18 | pathlib.Path(_path).mkdir(parents=True, exist_ok=True) 19 | 20 | if isinstance(path, list): 21 | for _path in path: 22 | delete_one_folder(_path) 23 | elif isinstance(path, str): 24 | delete_one_folder(path) 25 | 26 | 27 | def process_file_name(filename): 28 | """ 29 | Process the main csv file. It contains the data before being preprocessed 30 | :param filename: path of the csv 31 | :return: filename fixed 32 | """ 33 | assert filename.endswith(".csv") 34 | if not os.path.isabs(filename): 35 | create_folder(filename) 36 | filename = os.path.join(conf.DATA, os.path.basename(filename)) 37 | return filename 38 | 39 | 40 | def process_preprocessed_file_name(main_file, preprocess_file): 41 | default_preprocess_file = os.path.join(conf.DATA, 42 | f"{os.path.basename(main_file).replace('.csv', '')}" 43 | f"{conf.PREPROCESSED_SUFFIX}") 44 | if preprocess_file is None: 45 | return default_preprocess_file 46 | if not os.path.exists(preprocess_file): 47 | return default_preprocess_file 48 | 49 | return preprocess_file 50 | 51 | 52 | def df_to_url_list(df): 53 | columns = list(Url.__annotations__) 54 | url_list = [Url(row[1][columns[0]], row[1][columns[1]], row[1][columns[2]], row[1][columns[3]]) for row in 55 | df.iterrows()] 56 | return url_list 57 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/test/__init__.py -------------------------------------------------------------------------------- /test/clustering/data/save_test/index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/test/clustering/data/save_test/index.pkl -------------------------------------------------------------------------------- /test/clustering/data/save_test/matrix.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/test/clustering/data/save_test/matrix.pkl -------------------------------------------------------------------------------- /test/clustering/test_distance_matrix.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import mock 4 | import shutil 5 | 6 | from src.clustering.distance_matrix import DistanceMatrix 7 | from src.clustering.metrics import DISTANCE_FUNC 8 | import conf 9 | 10 | word_list = ['verify', 11 | 'palaver', 12 | 'bathrobe', 13 | 'traitorwise', 14 | 'midwatch', 15 | 'onymal', 16 | 'aphlogistic', 17 | 'trustingly', 18 | 'saponifier', 19 | 'moodle', 20 | 'isuret', 21 | 'oedogoniaceous', 22 | 'unhoard', 23 | 'receiptless', 24 | 'unfibbing', 25 | 'header', 26 | 'Tasian', 27 | 'deferral', 28 | 'expansively', 29 | 'hydramnion'] 30 | 31 | 32 | # downloaded with nltk.word.words() 33 | 34 | 35 | class DistanceMatrixSimple(object): 36 | def __init__(self, distance_func=DISTANCE_FUNC['sw']): 37 | self.distance_func = distance_func 38 | 39 | def create_matrix_distance(self, url_list): 40 | n = len(url_list) 41 | distance_matrix = np.zeros(shape=(n, n), dtype=np.double) # we need double for HDBScan 42 | for i in range(n): 43 | for j in range(i + 1, n): 44 | distance_matrix[i, j] = int( 45 | round(100 * self.distance_func(url_list[i], url_list[j]) / max(len(url_list[i]), len(url_list[j])))) 46 | distance_matrix = self.symmetrize(distance_matrix) 47 | np.fill_diagonal(distance_matrix, conf.SIMILARITY_MAX) 48 | return distance_matrix 49 | 50 | @staticmethod 51 | def symmetrize(a): 52 | return a + a.T - np.diag(a.diagonal()) 53 | 54 | 55 | @pytest.fixture() 56 | def distance_matrix_object(): 57 | return DistanceMatrix(word_list, distance_func=DISTANCE_FUNC['sw']) 58 | 59 | 60 | def test_run(distance_matrix_object): 61 | """ 62 | Here we test that the distance matrix with multiprocessing does the same job 63 | as the simple class written above 64 | :return: 65 | """ 66 | expected = DistanceMatrixSimple().create_matrix_distance(word_list) 67 | 68 | output = distance_matrix_object.run() 69 | shutil.rmtree(distance_matrix_object.folder, ignore_errors=True) 70 | 71 | assert np.array_equal(expected, output) 72 | 73 | 74 | def test_load(distance_matrix_object): 75 | distance_matrix_object.folder = 'data/save_test' 76 | distance_matrix_object.run() 77 | 78 | distance_matrix = DistanceMatrix.load('data/save_test') 79 | # shutil.rmtree(distance_matrix_object.folder, ignore_errors=True) 80 | assert np.array_equal(distance_matrix.matrix, distance_matrix_object.matrix) 81 | assert np.array_equal(distance_matrix.url_list, distance_matrix_object.url_list) 82 | 83 | 84 | def test_add_url_list(): 85 | distance_matrix = DistanceMatrix.load('data/save_test') 86 | distance_matrix.__save = mock.Mock() 87 | old_matrix_shape = distance_matrix.matrix.shape 88 | distance_matrix.add_url_list(['jordan', 'jordan.html', 'akamai']) 89 | assert distance_matrix.url_list[-3:] == ['jordan', 'jordan.html', 'akamai'] 90 | assert distance_matrix.matrix.shape == (old_matrix_shape[0] + 3, old_matrix_shape[0] + 3) 91 | 92 | 93 | def test__get_argument_create_matrix(): 94 | distance_object = DistanceMatrix(url_list=list(range(100))) 95 | print(dir(distance_object)) 96 | result = distance_object._DistanceMatrix__get_argument_create_matrix(ncores=5) # works to call private method 97 | assert [(100, 90), (90, 79), (79, 66), (66, 49), (49, 0)] == result 98 | 99 | 100 | @mock.patch('builtins.open') 101 | def test__create_matrix_distance(open_mock, distance_matrix_object): 102 | print('Len word list {}'.format(len(word_list))) 103 | a = 10 104 | b = 15 105 | word_list_processed = word_list[a:b] 106 | 
print(word_list_processed) 107 | result = distance_matrix_object._DistanceMatrix__create_matrix_distance(b, a) 108 | assert result.shape == (b - a, len(word_list)) 109 | assert result[2, 3] == int(round( 110 | 100 * distance_matrix_object.distance_func(word_list[a + 2], word_list[a + 3]) / max(len(word_list[a + 2]), len( 111 | word_list[a + 3])))) 112 | -------------------------------------------------------------------------------- /test/clustering/test_metrics.py: -------------------------------------------------------------------------------- 1 | from src.clustering.metrics import get_sw_distance 2 | from src.clustering.metrics import longest_sub 3 | 4 | 5 | def test_get_sw_distance(): 6 | distance = get_sw_distance(match=1, mismatch=-1, gap_penalty=-1) 7 | assert distance('/mal/xxx/a.php', '/mal/a.php') == 6 8 | 9 | 10 | def test_longest_sub(): 11 | a = 'myurl/abigfolder/home' 12 | b = 'abigfolder/akamai' 13 | assert longest_sub(a, b) == 1 14 | -------------------------------------------------------------------------------- /test/clustering/test_swalign.py: -------------------------------------------------------------------------------- 1 | from src.clustering import swalign 2 | 3 | 4 | def test_align(): 5 | match = 1 6 | mismatch = -1 7 | scoring = swalign.NucleotideScoringMatrix(match, mismatch) 8 | 9 | sw = swalign.LocalAlignment(scoring, gap_penalty=-1) 10 | assert sw.align('/mal/xxx/a.php', '/mal/a.php') == 6 11 | -------------------------------------------------------------------------------- /test/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "main_file": "data_demo.csv", 3 | "data": { 4 | "run": false, 5 | "additional_files": [ 6 | { 7 | "path": "my_data/benign_data.csv", 8 | "label": "benign" 9 | }, 10 | { 11 | "path": "my_data/malicious_traffic.csv", 12 | "label": "malicious"} 13 | 14 | ] 15 | }, 16 | "feeder": { 17 | "run": false, 18 | "sources": [ 19 | "urlhaus", 20 | "openfish", 21 | "alexa" 22 | ] 23 | }, 24 | "preprocessing": { 25 | "run": true, 26 | "name": "basic" 27 | }, 28 | "clustering": { 29 | "run": true, 30 | "preprocessed_file": null, 31 | "skip_distance_computation": false, 32 | "clusterer": { 33 | "dbscan": { 34 | "eps": 20, 35 | "min_samples": 8 36 | } 37 | }, 38 | "metric": "sw", 39 | "features_folder": "luda_output/mymatrix", 40 | "filter_similarity": 30, 41 | "phishing_mode": false 42 | }, 43 | "regex": { 44 | "run": false, 45 | "benign_for_retrain": 30, 46 | "round_max": 10, 47 | "regex_folder": "myregexes", 48 | "take_existing_result": false, 49 | "min_path_for_run": 200, 50 | "cluster_list": [0,4] 51 | } 52 | } -------------------------------------------------------------------------------- /test/coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 70% 19 | 70% 20 | 21 | 22 | -------------------------------------------------------------------------------- /test/feeder/crawler/test_crawler.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.feeder.crawler.crawler import Crawler 3 | 4 | 5 | @pytest.fixture() 6 | def crawler_object(): 7 | return Crawler(_url='akamai.com', depth=2) 8 | 9 | 10 | def test_fix_url(): 11 | result = Crawler.fix_url('randomurl.com/') 12 | assert result == 'http://randomurl.com' 13 | 14 | 15 | def test_run(crawler_object): 16 | url_set = crawler_object.run() 17 | 
print(url_set) 18 | assert len(url_set) >= 2 # depth 19 | -------------------------------------------------------------------------------- /test/feeder/data/vt_key.txt: -------------------------------------------------------------------------------- 1 | my_vt_key -------------------------------------------------------------------------------- /test/feeder/test_alexa_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.logger_code import init_logger 3 | from src.feeder.alexa_feed_downloader import AlexaFeedDownloader 4 | 5 | 6 | @pytest.fixture() 7 | def alexa_feeder(): 8 | feeder = AlexaFeedDownloader() 9 | init_logger() 10 | return feeder 11 | 12 | 13 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the feeder") 14 | def test_fetch(alexa_feeder): 15 | urls = alexa_feeder.fetch() 16 | print(urls) 17 | -------------------------------------------------------------------------------- /test/feeder/test_feed_downloader.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from src.logger_code import init_logger 3 | import pytest 4 | import mock 5 | 6 | from src.feeder.feed_downloader import FeedDownloader 7 | from src.feeder.feed_downloader import Url 8 | 9 | 10 | @pytest.fixture 11 | def feed_downloader_generic(): 12 | class ExampleFeedDownloader(FeedDownloader): 13 | def fetch(self) -> List[Url]: 14 | source = 'Example source' 15 | phishingURL = Url("http://example.com/index.html", source, 'malicious') 16 | malwareURL = Url("http://example.com/index2.html", source, 'malicious') 17 | return [phishingURL, malwareURL] 18 | 19 | example_downloader = ExampleFeedDownloader() 20 | init_logger() 21 | return example_downloader 22 | 23 | 24 | def test_fetch(feed_downloader_generic): 25 | example_urls = feed_downloader_generic.fetch() 26 | assert example_urls[0].source == 'Example source' 27 | assert {url.label for url in example_urls} == {'malicious'} 28 | 29 | 30 | @pytest.mark.skip(reason="functional test. Comment this line to run the function") 31 | def test_save_to_csv(feed_downloader_generic): 32 | list_of_urls = feed_downloader_generic.fetch() 33 | feed_downloader_generic.save_to_csv(list_of_urls) 34 | 35 | 36 | @mock.patch('src.feeder.feed_downloader.Crawler') 37 | def test_get_urls_from_domain(crawler_mock, feed_downloader_generic): 38 | url = 'akamai.com' 39 | depth = 30 40 | feed_downloader_generic.get_urls_from_domain(url, depth_max=depth) 41 | crawler_mock.assert_called_with(url, depth=depth) 42 | -------------------------------------------------------------------------------- /test/feeder/test_iscx_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.logger_code import init_logger 3 | from src.feeder.iscx_feed_downloader import IscxFeedDownloader 4 | 5 | 6 | @pytest.fixture() 7 | def iscx_feeder(): 8 | feeder = IscxFeedDownloader() 9 | init_logger() 10 | return feeder 11 | 12 | @pytest.mark.skip(reason="functional test. Can take time. 
Comment this line to run the feeder") 13 | def test_fetch(iscx_feeder): 14 | urls = iscx_feeder.fetch() 15 | print(urls) 16 | a = 1 17 | -------------------------------------------------------------------------------- /test/feeder/test_vt_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.logger_code import init_logger 3 | from src.feeder.vt_feed_downloader import VtFeedDownloader 4 | 5 | import mock 6 | 7 | 8 | @pytest.fixture() 9 | def vt_feeder(): 10 | feeder = VtFeedDownloader() 11 | init_logger() 12 | return feeder 13 | 14 | 15 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the feeder") 16 | def test_fetch(vt_feeder): 17 | urls = vt_feeder.fetch() 18 | print(urls) 19 | 20 | 21 | @mock.patch('builtins.open', mock.mock_open(read_data='my_vt_key')) 22 | def test_load_key(vt_feeder): 23 | key = vt_feeder.load_key('data/vt_key.txt') 24 | assert key == 'my_vt_key' 25 | 26 | 27 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the feeder") 28 | def test_get_records(vt_feeder): 29 | url_list = vt_feeder.get_records(number=10) 30 | print(url_list) 31 | -------------------------------------------------------------------------------- /test/preprocessor/data/data_preprocessing_test.csv: -------------------------------------------------------------------------------- 1 | url,label 2 | http://173.243.112.132/serve/config.bin,malicious 3 | http://194.15.112.29/2ja/panel/config.bin,malicious 4 | http://216.170.125.134/neat/serverphp/config.bin,malicious 5 | http://58.22.101.109/xz/cfg.bin,malicious 6 | http://83.149.95.197/1/cfg.bin,malicious 7 | http://abbcp.cn/bm_a/controller.php,malicious 8 | http://ajana.com.au/.tmp/server/config.bin,malicious -------------------------------------------------------------------------------- /test/preprocessor/test_preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from src.preprocessor.preprocessor import Preprocessor 4 | 5 | current_folder = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | 8 | @pytest.fixture 9 | def preprocessor(): 10 | return Preprocessor() 11 | 12 | 13 | @pytest.mark.skip(reason="functional test. Can take time. 
Comment this line to run the function") 14 | def test_run(preprocessor): 15 | file_path = os.path.join(current_folder, 'data.csv') 16 | preprocessor.run(file_path) 17 | -------------------------------------------------------------------------------- /test/preprocessor/test_preprocessor_basic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import mock 4 | 5 | from src.preprocessor.preprocessor_basic import PreprocessorBasic 6 | 7 | current_folder = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | 10 | @pytest.fixture 11 | def preprocessor(): 12 | return PreprocessorBasic() 13 | 14 | 15 | @mock.patch('src.preprocessor.preprocessor_basic.create_folder') 16 | @mock.patch('src.preprocessor.preprocessor_basic.pd.DataFrame.to_csv') 17 | def test_run(create_folder, to_csv_mock, preprocessor): 18 | file_path = os.path.join(current_folder, 'data', 'data_preprocessing_test.csv') 19 | df_preprocessed =preprocessor.run(file_path) 20 | assert 'filter_wp' in list(df_preprocessed) 21 | -------------------------------------------------------------------------------- /test/regex/data/input_cluster_2_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "urls_cluster", 3 | "description": "luda", 4 | "regexTarget": "", 5 | "examples": [ 6 | { 7 | "string": "/chase/home/auth/Logging_in.php", 8 | "match": [ 9 | { 10 | "start": 0, 11 | "end": 30 12 | } 13 | ], 14 | "unmatch": [] 15 | }, 16 | { 17 | "string": "/log/home/auth/Logging_in.php", 18 | "match": [ 19 | { 20 | "start": 0, 21 | "end": 28 22 | } 23 | ], 24 | "unmatch": [] 25 | }, 26 | { 27 | "string": "wp-content/uploads", 28 | "match": [], 29 | "unmatch": [ 30 | { 31 | "start": 0, 32 | "end": 17 33 | } 34 | ] 35 | }, 36 | { 37 | "string": "home.php", 38 | "match": [], 39 | "unmatch": [ 40 | { 41 | "start": 0, 42 | "end": 7 43 | } 44 | ] 45 | } 46 | ] 47 | } -------------------------------------------------------------------------------- /test/regex/data/input_correct.json: -------------------------------------------------------------------------------- 1 | {"to_test" : ["/jordan", "/asaf"]} -------------------------------------------------------------------------------- /test/regex/data/json_for_test.json: -------------------------------------------------------------------------------- 1 | {"name": "urls_cluster", "description": "luda", "regexTarget": "", "examples": [{"string": "/chase/home/auth/Logging_in.php", "match": [{"start": 0, "end": 30}], "unmatch": []}, {"string": "/log/home/auth/Logging_in.php", "match": [{"start": 0, "end": 28}], "unmatch": []}, {"string": "wp-content/uploads", "match": [], "unmatch": [{"start": 0, "end": 17}]}, {"string": "home.php", "match": [], "unmatch": [{"start": 0, "end": 7}]}]} -------------------------------------------------------------------------------- /test/regex/data/output_regex_runner.json: -------------------------------------------------------------------------------- 1 | {"results":[true,true]} -------------------------------------------------------------------------------- /test/regex/data/results_cluster_2_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "datasetName": "urls_cluster", 3 | "methodDescription": "Console config", 4 | "experimentDate": "Jul 29, 2021 3:33:38 PM", 5 | "bestSolution": { 6 | "solutionJS": "/(?=(\\w+))\\1(?=(((?=(\\w?))\\4[^@])+))\\2", 7 | "trainingPerformances": { 8 | "character recall": 1.0, 9 | 
"character precision": 1.0, 10 | "match precision": 1.0, 11 | "character accuracy": 1.0, 12 | "match f-measure": 1.0, 13 | "match recall": 1.0 14 | }, 15 | "validationPerformances": { 16 | "character recall": 1.0, 17 | "character precision": 1.0, 18 | "match precision": 1.0, 19 | "character accuracy": 1.0, 20 | "match f-measure": 1.0, 21 | "match recall": 1.0 22 | }, 23 | "learningPerformances": { 24 | "character recall": 1.0, 25 | "character precision": 1.0, 26 | "match precision": 1.0, 27 | "character accuracy": 1.0, 28 | "match f-measure": 1.0, 29 | "match recall": 1.0 30 | }, 31 | "solution": "/\\w++(\\w?+[^@])++", 32 | "fitness": [ 33 | 0.0, 34 | 0.0, 35 | 17.0 36 | ] 37 | }, 38 | "bestExtractions": [ 39 | [ 40 | { 41 | "start": 0, 42 | "end": 30 43 | } 44 | ], 45 | [ 46 | { 47 | "start": 0, 48 | "end": 28 49 | } 50 | ], 51 | [], 52 | [] 53 | ], 54 | "bestExtractionsStrings": [ 55 | [ 56 | "/chase/home/auth/Logging_in.ph" 57 | ], 58 | [ 59 | "/log/home/auth/Logging_in.ph" 60 | ], 61 | [], 62 | [] 63 | ], 64 | "bestExtractionsStats": [ 65 | { 66 | "fp": 0, 67 | "tp": 1 68 | }, 69 | { 70 | "fp": 0, 71 | "tp": 1 72 | }, 73 | { 74 | "fp": 0, 75 | "tp": 0 76 | }, 77 | { 78 | "fp": 0, 79 | "tp": 0 80 | } 81 | ], 82 | "overallExecutionTimeMillis": 145685, 83 | "numberMatches": 2, 84 | "numberUnmatches": 4, 85 | "numberMatchedChars": 58, 86 | "numberUnmatchedChars": 28, 87 | "numberAnnotatedChars": 86, 88 | "numberAllChars": 86, 89 | "numberTrainingMatches": 1, 90 | "numberTrainingUnmatches": 2, 91 | "characterEvaluations": 511388500 92 | } -------------------------------------------------------------------------------- /test/regex/test_regex.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import json 4 | import mock 5 | from unittest.mock import patch 6 | 7 | from src.logger_code import init_logger 8 | from src.regex.regex import Regex 9 | import conf 10 | 11 | current_folder = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | INPUT_CLUSTER = os.path.join(current_folder, 'data/input_cluster_2_1.json') 14 | DATA = os.path.join(current_folder, 'data') 15 | 16 | 17 | @pytest.fixture() 18 | def regex_object(): 19 | init_logger() 20 | return Regex(project_name='test_project') 21 | 22 | 23 | @mock.patch('src.regex.regex.subprocess') 24 | def test_run_regex_extraction(subprocess_mock, regex_object): 25 | regex_object.run_regex_extraction(json_to_extract={'randomjson'}) 26 | assert subprocess_mock.run.called 27 | 28 | 29 | @pytest.mark.skip(reason="regex functional test") 30 | def test_run_regex_extraction_functional(regex_object): 31 | # Uncomment this test to really run the regex extraction. 
Took 2m 26sec on my computer 8cores 16Go RAM 32 | regex_object.run_regex_extraction(json_to_extract=INPUT_CLUSTER) 33 | 34 | 35 | @mock.patch('src.regex.regex.json') 36 | def test_create_json_cluster(json_mock, regex_object): 37 | str_list = ['/chase/home/auth/Logging_in.php', 38 | '/log/home/auth/Logging_in.php'] 39 | str_to_not_match = ['wp-content/uploads', 40 | 'home.php'] 41 | with open(INPUT_CLUSTER) as json_file: 42 | json_for_regex_extraction = json.load(json_file) 43 | with mock.patch('builtins.open') as open_mock: 44 | name = 'test_cluster' 45 | json_for_regex_extraction['name'] = name 46 | json_path = os.path.join(regex_object.project_folder, 'input_' + name + '.json') 47 | 48 | regex_object.create_json_cluster(str_list, str_to_not_match, name=name) 49 | assert json_mock.mock_calls[0][1][0] == json_for_regex_extraction 50 | open_mock.assert_called_with(json_path, 'w') 51 | 52 | 53 | @pytest.mark.skip(reason="regex functional test") 54 | def test_run_with_benign_check(regex_object): 55 | # Uncomment this test to really run the regex extraction. Took 2m 26sec on my computer 8cores 16Go RAM 56 | 57 | """ 58 | Toook 6m 40 sec in my computer ( 8 cpu, 16go RAM) 59 | :param regex_object: 60 | :return: 61 | """ 62 | cluster_dict = {'cluster_7_53': {'match': [ 63 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default/css/style.css', 64 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/guest/css/style.css', 65 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default/css/style_new.css', 66 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default-col/css/style_new.css', 67 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/guest/css/style_new.css', 68 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default-col/css/style.css', 69 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/logon_files/themes/default-col/css/style.css'], 70 | 'unmatch': []}, 71 | 'cluster_11_3': { 72 | 'match': ['/ch18', '/ch16', '/ch13', '/ch10', '/ch17', '/ch1', '/ch11', '/ch12', '/ch19', 73 | '/ch14', '/ch15'], 'unmatch': []}} 74 | 75 | str_to_not_match = ['wp-content/uploads', 76 | 'home.php'] 77 | regex_object.run_with_benign_check(cluster_dict, benign_list=str_to_not_match, 78 | benign_for_retrain=2, take_existing_result=False, round_max=2) 79 | 80 | 81 | def test_check_regex_list(regex_object): 82 | regex_object = Regex(project_name='test') 83 | sig = ".s" 84 | string_list = ['luda', 'superman', 'akamai', 'blackhat', 'awordthatendswiths'] 85 | expected = (1, ['awordthatendswiths']) 86 | match_result = regex_object.check_regex_list(sig, string_list, limit=10) 87 | assert expected == match_result 88 | 89 | 90 | def test_get_cluster_results(regex_object): 91 | """ 92 | To run this test. 
You should have the file test/regex/data/results_cluster_2_1.json 93 | :param regex_object: 94 | :return: 95 | """ 96 | regex_object.project_folder = DATA 97 | cluster_results = regex_object.get_cluster_results() 98 | assert cluster_results == {'cluster_2_1': '/\\w++(\\w?+[^@])++'} 99 | 100 | 101 | @mock.patch('src.regex.regex.pd.DataFrame.to_csv') 102 | def test_create_result_report(to_csv_mock, regex_object): 103 | regex_object.project_folder = DATA 104 | df = regex_object.create_result_report() 105 | assert df['name'].to_list() == ['cluster_2_1'] 106 | assert df['regex_java'].to_list() == ['/\\w++(\\w?+[^@])++'] 107 | 108 | 109 | def test_run_regex_java(): 110 | regex = "(?:\w*+/)*+bt_version_checker\.php" 111 | string_list = ['/spyeye/Main/bt_version_checker.php', '/spye/main/bt_version_checker.php', 112 | '/WP-CD/Main/bt_version_checker.php', '/Net/Main/bt_version_checker.php', 113 | '/main/main/bt_version_checker.php', '/dbase/main/bt_version_checker.php', 114 | '/hits/bt_version_checker.php', '/Main/bt_version_checker.php', '/spy/main/bt_version_checker.php', 115 | '/sy1/bt_version_checker.php', '/grab/main/bt_version_checker.php'] 116 | 117 | result = Regex.run_regex_java(regex, string_list) 118 | assert set(result) == {True} 119 | 120 | 121 | @patch('conf.OUTPUT_REGEX_RUNNER', os.path.join(current_folder, 'data/output_regex_runner.json')) 122 | @patch('conf.INPUT_REGEX_RUNNER', os.path.join(current_folder, 'data/input_correct.json')) 123 | def test_run_regex_java_with_file(): 124 | new_open = open(conf.OUTPUT_REGEX_RUNNER, 'r') 125 | # By doing that, I can patch only the first open call 126 | with mock.patch('builtins.open') as mymock: 127 | mymock.side_effect = [mock.MagicMock(), new_open] 128 | regex = "(\\.?+[^_])++" 129 | result = Regex.run_regex_java(regex, '') 130 | assert result == [True, True] 131 | -------------------------------------------------------------------------------- /test/use_case/test_use_case_clustering.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from src.use_case.use_case_clustering import UseCaseClustering 5 | from src.logger_code import init_logger 6 | 7 | test_folder = os.path.dirname((os.path.dirname(os.path.abspath(__file__)))) 8 | 9 | 10 | @pytest.fixture() 11 | def use_case(): 12 | logger = init_logger() 13 | return UseCaseClustering() 14 | 15 | 16 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the function") 17 | def test_run(use_case): 18 | main_file = os.path.join(test_folder, 'data', 'data_preprocessed.csv') 19 | use_case.run(main_file) 20 | -------------------------------------------------------------------------------- /test/use_case/test_use_case_data.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from src.use_case.use_case_data import UseCaseData 4 | from src.logger_code import init_logger 5 | 6 | test_folder = os.path.dirname((os.path.dirname(os.path.abspath(__file__)))) 7 | 8 | logger = init_logger() 9 | 10 | 11 | @pytest.fixture() 12 | def use_case(): 13 | return UseCaseData() 14 | 15 | 16 | @pytest.mark.skip(reason="functional test. Can take time. 
Comment this line to run the function") 17 | def test_run(use_case): 18 | main_file = os.path.join(test_folder, 'data', 'data.csv') 19 | additional_files = [os.path.join(test_folder, 'data', 'benign_alexa.hql.out'), 20 | os.path.join(test_folder, 'data', 'iscx_benign.csv')] 21 | use_case.run(main_file, additional_files) 22 | --------------------------------------------------------------------------------
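A closing note on the open()-patching trick used in test_run_regex_java_with_file (test/regex/test_regex.py): giving the patched builtins.open a side_effect list makes the first call return a throw-away mock while a later call receives a real, pre-opened file handle. The sketch below reproduces that idea in isolation; it is only illustrative, and write_then_read, output.json and their contents are hypothetical rather than taken from this repository.

import json
import mock  # the test suite uses the standalone mock package; unittest.mock behaves the same


def write_then_read(path):
    # First open(): write a JSON payload. Second open(): read it back.
    with open(path, 'w') as handle:
        json.dump({"results": [True, True]}, handle)
    with open(path) as handle:
        return json.load(handle)["results"]


def test_write_then_read_with_partially_patched_open(tmp_path):
    # Hypothetical test, for illustration only: pre-create the file the second open() should
    # read, and grab a real handle to it before builtins.open is patched (as test_regex.py does).
    real_file = tmp_path / "output.json"
    real_file.write_text('{"results": [true, true]}')
    real_handle = real_file.open()

    with mock.patch('builtins.open') as open_mock:
        # The first call gets a throw-away MagicMock (nothing is written to disk),
        # the second call gets the genuine handle, so only the write is stubbed out.
        open_mock.side_effect = [mock.MagicMock(), real_handle]
        assert write_then_read('ignored.json') == [True, True]

This is the same pattern the original test relies on so that run_regex_java reads the canned output_regex_runner.json while its first open() call, the one that would write the input file, is stubbed out.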