├── .coverage ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Readme.md ├── analysis └── luda_analysis.ipynb ├── conf.py ├── docker-compose.yml ├── main.py ├── requirements.txt ├── src ├── __init__.py ├── clustering │ ├── __init__.py │ ├── distance_matrix.py │ ├── metrics.py │ └── swalign.py ├── feeder │ ├── __init__.py │ ├── alexa_feed_downloader.py │ ├── crawler │ │ ├── __init__.py │ │ ├── crawler.py │ │ └── endrecursive.py │ ├── feed_downloader.py │ ├── iscx_feed_downloader.py │ ├── majestic_feed_downloader.py │ ├── openfish_feed_downloader.py │ ├── umbrella_feed_downloader.py │ ├── urlhaus_feed_downloader.py │ └── vt_feed_downloader.py ├── logger_code.py ├── preprocessor │ ├── __init__.py │ ├── preprocessor.py │ └── preprocessor_basic.py ├── regex │ ├── ConsoleRegexTurtle.jar │ ├── ConsoleRegexTurtle │ │ ├── build.xml │ │ ├── build │ │ │ ├── built-jar.properties │ │ │ └── classes │ │ │ │ ├── gpl.txt │ │ │ │ └── it │ │ │ │ └── units │ │ │ │ └── inginf │ │ │ │ └── male │ │ │ │ ├── console │ │ │ │ └── ConsoleRegexTurtle.class │ │ │ │ └── dto │ │ │ │ └── SimpleConfig.class │ │ ├── dist │ │ │ ├── ConsoleRegexTurtle.jar │ │ │ ├── README.TXT │ │ │ ├── lib │ │ │ │ ├── MaleRegexTree.jar │ │ │ │ ├── Random_Regex_Turtle.jar │ │ │ │ └── gson-2.2.4.jar │ │ │ └── regexturtle.sh │ │ ├── lib │ │ │ ├── CopyLibs │ │ │ │ └── org-netbeans-modules-java-j2seproject-copylibstask.jar │ │ │ ├── Gson │ │ │ │ ├── gson-2.2.4-javadoc.jar │ │ │ │ ├── gson-2.2.4-sources.jar │ │ │ │ └── gson-2.2.4.jar │ │ │ └── nblibraries.properties │ │ ├── manifest.mf │ │ ├── nbproject │ │ │ ├── build-impl.xml │ │ │ ├── genfiles.properties │ │ │ ├── private │ │ │ │ ├── config.properties │ │ │ │ ├── private.properties │ │ │ │ └── private.xml │ │ │ ├── project.properties │ │ │ └── project.xml │ │ ├── regexturtle.sh │ │ └── src │ │ │ ├── gpl.txt │ │ │ └── it │ │ │ └── units │ │ │ └── inginf │ │ │ └── male │ │ │ ├── console │ │ │ └── ConsoleRegexTurtle.java │ │ │ └── dto │ │ │ └── SimpleConfig.java │ ├── RegexRunner.jar │ ├── __init__.py │ ├── regex.py │ ├── regexturtle.sh │ └── src_regexrunner │ │ └── regexrunner │ │ ├── JsonOperation.java │ │ ├── Main.java │ │ └── Regex.java ├── use_case │ ├── use_case_clustering.py │ ├── use_case_data.py │ ├── use_case_feeder.py │ ├── use_case_preprocessor.py │ └── use_case_regex_generation.py └── utils.py └── test ├── __init__.py ├── clustering ├── data │ └── save_test │ │ ├── index.pkl │ │ └── matrix.pkl ├── test_distance_matrix.py ├── test_metrics.py └── test_swalign.py ├── config.json ├── coverage.svg ├── data_demo.csv ├── feeder ├── crawler │ └── test_crawler.py ├── data │ └── vt_key.txt ├── test_alexa_feed_downloader.py ├── test_feed_downloader.py ├── test_iscx_feed_downloader.py └── test_vt_feed_downloader.py ├── preprocessor ├── data │ └── data_preprocessing_test.csv ├── test_preprocessor.py └── test_preprocessor_basic.py ├── regex ├── data │ ├── input_cluster_2_1.json │ ├── input_correct.json │ ├── input_regex_runner.json │ ├── json_for_test.json │ ├── output_regex_runner.json │ └── results_cluster_2_1.json └── test_regex.py └── use_case ├── test_use_case_clustering.py └── test_use_case_data.py /.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/.coverage -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | test 2 | *.log 
3 | log
4 | *.md
5 | !README*.md
6 | README-secret.md
7 | my_data
8 | luda_output
9 | automation
10 | data
11 | *.csv
12 | __pycache__
13 | test
14 | personal
15 | 
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | *.csv
2 | *.json
3 | *.log
4 | *.out
5 | *DS_Store
6 | *.pickle
7 | *.pkl
8 | .idea
9 | .pytest_cache
10 | .ipynb_checkpoints
11 | luda_output
12 | __pycache__
13 | personal
14 | vt_key.txt
15 | luda.md
16 | 
17 | !/test/data_demo.csv
18 | !/test/config.json
19 | 
20 | !/test/clustering/data/save_test/*
21 | !/test/feeder/data/vt_key.txt
22 | !/test/preprocessor/data/data_preprocessing_test.csv
23 | !/test/regex/data/*
24 | 
-------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM python:3.8-slim-buster
2 | 
3 | WORKDIR /code
4 | RUN yes | apt-get update
5 | RUN yes | apt install build-essential
6 | RUN yes | apt-get install manpages-dev
7 | RUN yes | pip3 install Cython
8 | RUN pip3 install notebook
9 | RUN mkdir -p /usr/share/man/man1
10 | RUN yes | apt-get install default-jdk
11 | RUN yes | apt-get install vim
12 | RUN yes | apt-get install screen
13 | RUN apt-get install htop
14 | 
15 | 
16 | COPY requirements.txt requirements.txt
17 | 
18 | RUN pip3 install -r requirements.txt
19 | 
20 | COPY . .
21 | 
22 | CMD [ "/bin/bash" ]
23 | 
-------------------------------------------------------------------------------- /Readme.md: --------------------------------------------------------------------------------
1 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/Naereen/StrapDown.js/graphs/commit-activity)
2 | [![GPLv3 license](https://img.shields.io/badge/License-GPLv3-blue.svg)](http://perso.crans.org/besson/LICENSE.html)
3 | ![Coverage](test/coverage.svg "Coverage")
4 | 
5 | 
6 | # LUDA: Large URLs Dataset Analyzer for security
7 | 
8 | 
9 | _Presented at [BlackHat USA 2021 Arsenal](https://www.blackhat.com/us-21/arsenal/schedule/index.html#luda--large-urls-dataset-analyzer-for-security-23851
10 | )_
11 | 
12 | # Table of Contents
13 | 1. [Download and getting started](#Download-and-getting-started)
14 | 2. [The 5 modules](#The-5-modules)
15 |    1. [Data](#Data)
16 |    2. [Feeders](#Feeders)
17 |    3. [Preprocessing](#Preprocessing)
18 |    4. [Clustering](#Clustering)
19 |    5. [Regex generation](#Regex-generation)
20 | 3. [Deployment with docker to a remote machine](#Deployment-with-docker-to-a-remote-machine)
21 | 4. [Support and contributing to Luda](#Support-and-contributing-to-Luda)
22 | 
23 | 
24 | Malicious actors often reuse code to deploy their malware, phishing websites or C&C servers. As a result, similarities can
25 | be found in URL paths by inspecting internet traffic. Moreover, deep learning models, or even regular ML models, do not
26 | fit inline deployment in terms of runtime performance. However, regexes (or YARA rules) can be deployed on a proxy
27 | and work in real time on all the traffic. LUDA can take a set of malicious and benign URLs and return a list of regexes
28 | ready to be deployed inline!
29 | 
30 | # Download and getting started
31 | 
32 | First of all, clone the repo :)
33 | 
34 | Now copy test/config.json to the main directory.
35 | 
36 | To make sure it works for everyone, we will run everything inside a Docker container.
Assuming you have Docker and docker-compose on your machine, just run from the project directory
39 | 
40 | ```bash
41 | docker-compose up # building the image for the first time can take a few minutes.
42 | ```
43 | 
44 | It will create a container named luda and run a Jupyter notebook that you can access on localhost:5555 (token: luda).
45 | You will notice that it also created a folder "data" at project level that is mapped to the same folder inside the container.
46 | 
47 | Now copy (on the host) test/data_demo.csv to data/data_demo.csv. The file config.json is already set the way we need.
48 | 
49 | Now go into your container with
50 | ```bash
51 | docker exec -it luda bash
52 | ```
53 | and run
54 | ```bash
55 | python main.py # should take less than 1 min with 8 CPUs and 16GB RAM
56 | ```
57 | 
58 | It will preprocess the data and cluster the URLs. Now let's look at the clusters!
59 | Go to localhost:5555 to access the Jupyter notebook hosted on the container and open analysis/luda_analysis.ipynb
60 | 
61 | You can run all cells and then go to the last part, "Cluster analysis". The last output cells should show you the clusters.
62 | You should see something like this
63 | 
64 | ```text
65 | Name: cluster, dtype: int64
66 | #####Cluster 0 - 27 samples: ####
67 | 
68 | ['/neat/serverphp/config.bin',
69 | '/serverphp/config.bin',
70 | ...
71 | '/pus1/serverphp/config.bin',
72 | '/lg/server.php/config.bin',
73 | '/ekene/Severphp/config.bin',
74 | '/server[php]/config.bin',
75 | '/versy/serverphp/config.bin']
76 | 
77 | 
78 | #####Cluster 4 - 17 samples: ####
79 | 
80 | ['/mupanel/post.php',
81 | '/jiz/kbpanel/post.php',
82 | ...
83 | '/low/kbpanel/post.php',
84 | '/1/kbpanel/post.php',
85 | '/new/kbpanel/post.php']
86 | ```
87 | 
88 | Here you can choose the clusters on which you would like to run the regex generation. This last part is CPU and RAM expensive and you should run
89 | it only on the clusters that look "good". Here you can also identify paths that could generate FPs (like "/index.php" for example. Check use_case_clustering.py to see how you can fix FPs at this step).
90 | Let's say you choose only those two clusters (0 and 4). Change config.json (on the container, you can access it directly via the notebook)
91 | to be
92 | 
93 | ```json
94 | {
95 |   "main_file": "data_demo.csv",
96 |   "data": {
97 |     "run": false,
98 |     "additional_files": [
99 |       {
100 |         "path": "my_data/benign_data.csv",
101 |         "label": "benign"
102 |       },
103 |       {
104 |         "path": "my_data/malicious_traffic.csv",
105 |         "label": "malicious"}
106 |     ]
107 |   },
108 |   "feeder": {
109 |     "run": false,
110 |     "sources": [
111 |       "urlhaus",
112 |       "openfish",
113 |       "alexa"
114 |     ]
115 |   },
116 |   "preprocessing": {
117 |     "run": false,
118 |     "name": "basic"
119 |   },
120 |   "clustering": {
121 |     "run": false,
122 |     "preprocessed_file": null,
123 |     "skip_distance_computation": false,
124 |     "clusterer": {
125 |       "dbscan": {
126 |         "eps": 20,
127 |         "min_samples": 8
128 |       }
129 |     },
130 |     "metric": "sw",
131 |     "features_folder": "luda_output/mymatrix",
132 |     "filter_similarity": 30,
133 |     "phishing_mode": false
134 |   },
135 |   "regex": {
136 |     "run": true,
137 |     "benign_for_retrain": 30,
138 |     "round_max": 10,
139 |     "regex_folder": "myregexes",
140 |     "take_existing_result": true,
141 |     "min_path_for_run": 200,
142 |     "cluster_list": [0,4]
143 |   }
144 | }
145 | ```
146 | 
147 | We just turned off all the steps except the regex generation step that we want to run. We also specified that we want to run on clusters
148 | 0 and 4 only.
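
Before launching this long step, you can optionally sanity-check the edited file from inside the container. This is just a convenience snippet (it is not part of LUDA itself) and assumes you kept the structure shown above:

```python
# Optional sanity check of config.json before a long run (run from /code inside the container).
import json

with open("config.json") as f:
    cfg = json.load(f)

# Only the regex step should be enabled for this run, on clusters 0 and 4.
assert cfg["regex"]["run"] is True
assert cfg["regex"]["cluster_list"] == [0, 4]
print({name: section.get("run") for name, section in cfg.items() if isinstance(section, dict)})
```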
149 | 
150 | Now run again (from the container)
151 | 
152 | /!\ This step can take a few hours (~2h on a 48-CPU, 378GB-RAM machine, without using all of its resources)
153 | 
154 | ```bash
155 | python main.py
156 | ```
157 | 
158 | Check the log at luda_output/logs/luda.log: at the end you can see a small report (where you see how each signature evolved at each round)
159 | 
160 | ```txt
161 | N cluster : 2
162 | N paths: 44
163 | N benign in final test: 9486
164 | Benign number for retraining : 30
165 | N round: 10
166 | 
167 | Cluster sig paths:
168 | 
169 | cluster_27_0 : (\.*+[^_])++ ---> [^bin]*+[^\.]*+\.bin
170 | cluster_17_4 : ([^_]\w++)++ ---> [^\.]++\.php ---> (\w*+/)++post\.php ---> [^php]++\w\w\w/?+\w++/post\.php
171 | 
172 | 
173 | After final testing:
174 | Cluster with 0 FP: {'cluster_17_4', 'cluster_27_0'}
175 | Number of paths covered with 0 FP: 44
176 | Percentage of paths covered with 0 FP: 100.0 %
177 | 
178 | ### FP Report ###
179 | 
180 | With FP :
181 | 
182 | 
183 | 
184 | Without:
185 | 
186 | ['cluster_27_0', 'cluster_17_4']
187 | ```
188 | 
189 | You also get a report showing basic info on the run. It's a CSV stored in the "regex_folder" (following the above config, it is luda_output/myregexes/report_myregexes.csv)
190 | 
191 | |id|name|regex_js|regex_java|malicious|benign|round|example_malicious|results_file|input_file|
192 | |------|------------|----------------------------------------------------|-------------------------------|---------|------|-----|---------------------------|-------------------------|-----------------------|
193 | |0|cluster_17_4|(?=([^php]+))\1\w\w\w(?=(/?))\2(?=(\w+))\3/post\.php|[^php]++\w\w\w/?+\w++/post\.php|17|61|3|/mupanel/post.php.|results_cluster_17_4.json|input_cluster_17_4.json|
194 | |1|cluster_27_0|(?=([^bin]*))\1(?=([^\.]*))\2\.bin|[^bin]*+[^\.]*+\.bin|27|30|1|/neat/serverphp/config.bin.|results_cluster_27_0.json|input_cluster_27_0.json|
195 | 
196 | 
197 | Congrats on your first LUDA run. You now have 2 regexes (Java or JS) that can be used to catch malicious URLs belonging to the clusters you found :)
198 | 
199 | In the next part, we will dive into the LUDA architecture to understand each of its components, see what else you can do, and maybe
200 | convince you to contribute to the project!
201 | 
202 | 
203 | 
204 | 
205 | LUDA is composed of **5 modules**: data, feeder, preprocessing, clustering and regex generation.
206 | 
207 | To run LUDA, we first need to configure _config.json_.
208 | 
209 | # The 5 modules
210 | 
211 | Every part is independent and can be run separately with the config file.
212 | 
213 | ## Data
214 | 
215 | 
216 | To provide LUDA with some URLs, you can pass it files. The **only condition** is that they have a column named "url".
217 | However, if you provide the main file (here data_demo.csv), it should have url, source, label, family as columns.
218 | So the easiest way to add your own files is to list them in the additional_files array.
219 | 
220 | LUDA will then load them and store them in its own format, joined with the data coming from the feeders. By default, it will look for the files
221 | in the data folder. Otherwise you can write an absolute path.
222 | The main file does not have to exist. You can add your own files in additional_files and LUDA will combine them.
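
For example, an additional file can be as simple as a CSV with a single url column. The sketch below is one hypothetical way to produce such a file (the my_data/malicious_traffic.csv path matches the example config, and the URLs are placeholders, not a real feed):

```python
# Build a minimal additional file: the only required column is "url".
# The label ("malicious"/"benign") is given in config.json, not inside the file itself.
from pathlib import Path

import pandas as pd

Path("my_data").mkdir(exist_ok=True)

urls = [
    "http://198.51.100.10/kbpanel/post.php",       # placeholder URLs
    "http://198.51.100.11/serverphp/config.bin",
]
pd.DataFrame({"url": urls}).to_csv("my_data/malicious_traffic.csv", index=False)
```

The data section of config.json then simply points to that file: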
224 | ```json
225 |   "main_file": "data_demo.csv",
226 |   "data": {
227 |     "run": false,
228 |     "additional_files": [
229 |       {
230 |         "path": "my_data/benign_data.csv",
231 |         "label": "benign"
232 |       },
233 |       {
234 |         "path": "my_data/malicious_traffic.csv",
235 |         "label": "malicious"}
236 | 
237 |     ]
238 |   },
239 | ```
240 | 
241 | ## Feeders
242 | 
243 | We implemented several feeders for malicious URL sources that bring you the most recent data. Among them are feeders for URLhaus,
244 | OpenPhish, Alexa, Majestic, VT, etc. If your feeder brings domains (not URLs), a crawler is available that can convert your domains
245 | into URLs. We invite you to create your own feeder and share it with this project.
246 | 
247 | ```json
248 |   "feeder": {
249 |     "run": false,
250 |     "sources": [
251 |       "urlhaus",
252 |       "openfish",
253 |       "alexa"
254 |     ]
255 |   }
256 | 
257 | ```
258 | 
259 | ## Preprocessing
260 | 
261 | To get better results and save computation, it is *mandatory* to preprocess the data. You need to filter your
262 | URLs smartly to keep only the ones that "have a chance to create a cluster".
263 | 
264 | We provide a class that implements the "basic" preprocessing techniques that we are currently using.
265 | 
266 | ```json
267 |   "preprocessing": {
268 |     "run": false,
269 |     "name": "basic"
270 |   }
271 | ```
272 | 
273 | ## Clustering
274 | 
275 | ```json
276 |   "clustering": {
277 |     "run": false,
278 |     "preprocessed_file": null,
279 |     "skip_distance_computation": false,
280 |     "clusterer": {
281 |       "dbscan": {
282 |         "eps": 20,
283 |         "min_samples": 8
284 |       }
285 |     },
286 |     "metric": "sw",
287 |     "features_folder": "luda_output/mymatrix",
288 |     "filter_similarity": 30,
289 |     "phishing_mode": false
290 |   }
291 | ```
292 | 
293 | ### Distance matrix computation
294 | 
295 | 
296 | This is a CPU and RAM expensive step. It will use (by default) all your CPUs and can take 300GB of RAM for a list of URLs
297 | longer than 35k... That's why the preprocessing step is very important. At the end of the task, it will save the results
298 | in a folder (specified in the config file) that you can reuse several times to test different parameters of the clustering.
299 | 
300 | If you already have a csv file with your data, you need to write its **absolute** path in config.json in "preprocessed_file".
301 | 
302 | 
303 | ### Clustering algorithm
304 | 
305 | We are currently using DBSCAN since we want to control MinPoints (the minimum number of points in a cluster). Moreover, since we built
306 | the metric ourselves, we understand what Epsilon means! Setting Epsilon to 20, for example, is equivalent
307 | to saying "Group together URLs that are 80% similar".
308 | 
309 | 
310 | This **step is quick**, so you can run it several times to test different parameters.
311 | 
312 | ## Regex generation
313 | 
314 | For this step, we use existing research. The original code can be found here: https://github.com/MaLeLabTs/RegexGenerator
315 | 
316 | We already added to LUDA the dependencies that we need from this project. More details on how to optimize the regex generation process can be found in that repo.
317 | 
318 | We strongly advise you to first look at the clusters that you got from the clustering part before running it on all your clusters.
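
If you prefer to glance at the clusters without opening the notebook, the few lines below do the same thing as the load_result helper in analysis/luda_analysis.ipynb. This is a minimal sketch that assumes the demo paths used in this walkthrough (adapt them if you changed features_folder or the main file):

```python
# Quick look at the clusters outside the notebook (mirrors load_result in analysis/luda_analysis.ipynb).
import os
import pickle

import pandas as pd

FEATURES_FOLDER = "luda_output/mymatrix"
PREPROCESSED = "data/data_demo_preprocessed.csv"

df = pd.read_csv(PREPROCESSED)
with open(os.path.join(FEATURES_FOLDER, "index_to_keep.pkl"), "rb") as f:
    index_to_keep = pickle.load(f)
with open(os.path.join(FEATURES_FOLDER, "labels.pkl"), "rb") as f:
    labels = pickle.load(f)

df = df.iloc[index_to_keep, :]
df["cluster"] = labels
print(df["cluster"].value_counts())           # -1 means "not clustered" (DBSCAN noise)
print(df[df["cluster"] == 0]["path"].head())  # paths of one candidate cluster
```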
319 | Once you chose your clusters, add their ids to "cluster_list"
320 | ```json
321 |   "regex": {
322 |     "run": true,
323 |     "benign_for_retrain": 30,
324 |     "round_max": 10,
325 |     "regex_folder": "myregexes",
326 |     "take_existing_result": false,
327 |     "min_path_for_run": 200,
328 |     "cluster_list": [0,4]
329 |   }
330 | 
331 | ```
332 | # Deployment with docker to a remote machine
333 | 
334 | Getting an environment ready can be achieved with
335 | 
336 | ```bash
337 | docker-compose up
338 | ```
339 | 
340 | By default this creates a container named **luda** and builds an image called **luda_image**. The container also runs
341 | a Jupyter Notebook that you can access on port 5555 (5555 is mapped to 8888 in this version).
342 | 
343 | One of the most efficient ways to run LUDA on another machine is to push **luda_image** to a Docker registry and pull it directly
344 | on the target machine.
345 | 
346 | Locally you can run
347 | 
348 | ```bash
349 | docker-compose build # to create the image
350 | docker tag luda_image:latest your_docker_user/luda_image
351 | docker push your_docker_user/luda_image
352 | ```
353 | 
354 | and on the remote machine you can either run it with docker-compose (you need to copy it there) or run
355 | 
356 | ```bash
357 | docker rm -f luda; sudo docker run -it -v /home/data/:/code/data -p 5555:8888 --name luda your_docker_user/luda_image bash
358 | 
359 | # We first delete luda in case the container already exists. This deletes the whole container, including your notebook.
360 | # Inside the new container you can launch the Jupyter notebook if you want (inside a screen so it stays alive if you close the tab)
361 | 
362 | screen -d -m -S jupyter jupyter notebook --allow-root --no-browser --ip 0.0.0.0 --NotebookApp.token='luda'
363 | ```
364 | Then you just need to send your data
365 | 
366 | ```bash
367 | scp -i yourkey.pem data_preprocessed.csv user@your_powerfull_machine:/home/data # remember we map /home/data to /code/data
368 | ```
369 | An advantage of sending your data separately is that you do not get a big Docker image, and you can also update your code if needed
370 | and still test with your data, since the Docker volume is mapped to a persistent folder on the host machine.
371 | /!\ If you add your data to your Docker image, after several tries your disk might be full.
372 | You can delete all images by running
373 | 
374 | ```bash
375 | docker rmi -f $(docker images -a -q)
376 | ```
377 | 
378 | 
379 | If you always need "sudo" to run docker commands, you can just add your user to the docker group by running
380 | 
381 | ```bash
382 | sudo usermod -a -G docker [user]
383 | newgrp docker
384 | ```
385 | ## Access the remote Jupyter Notebook
386 | 
387 | Once your container is running, you can either access the Jupyter notebook via your browser on port 5555 of your server,
388 | 
389 | OR you can do SSH tunneling (if your machine does not have an open port for inbound connections)
390 | ```bash
391 | ssh -N -f -L 5555:localhost:5555 -i yourkey.pem user@your_powerfull_machine
392 | ```
393 | 
394 | 
395 | # Support and contributing to Luda
396 | 
397 | This code is maintained. You are welcome to ask any questions directly on GitHub. We will try to answer as quickly as possible.
398 | 
399 | We also invite you to contribute to this open source project. Add your feeders, preprocessing techniques, clustering algorithms,
400 | or fix bugs.
401 | It can be done via pull request. More details on how to make a pull request [here](https://www.dataschool.io/how-to-contribute-on-github/
402 | ). Please provide basic tests with your code.
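
If you want to contribute a feeder, its rough shape is sketched below. This is only a hypothetical outline to illustrate the idea (the class and method names here are made up); the real interface to follow is the base class in src/feeder/feed_downloader.py and the existing implementations such as src/feeder/urlhaus_feed_downloader.py:

```python
# Hypothetical outline of a new feeder; align the class and method names with
# src/feeder/feed_downloader.py before opening a pull request.
import pandas as pd
import requests


class MyFeedDownloader:
    """Fetches a public URL feed and exposes it as a DataFrame with a 'url' column."""

    FEED_URL = "https://example.com/my_feed.txt"  # placeholder feed location

    def fetch(self) -> pd.DataFrame:
        response = requests.get(self.FEED_URL, timeout=30)
        response.raise_for_status()
        urls = [line.strip() for line in response.text.splitlines() if line.strip()]
        return pd.DataFrame({"url": urls, "label": "malicious", "source": "my_feed"})
```

Whatever the source, the output should end up with at least the url column so the rest of the pipeline (preprocessing, clustering, regex generation) can consume it.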
403 | 404 | ## Running the tests 405 | 406 | Adding test protect your code but also explain them to others. 407 | Make sure the project as at least 70% coverage. 408 | To check the coverage, pip install those 2 packages 409 | 410 | ```bash 411 | pip install coverage 412 | pip install coverage-badge 413 | ``` 414 | and run from the main luda directory 415 | 416 | ```bash 417 | coverage run -m pytest 418 | coverage report -m --omit="*/test*" # optional - to see the coverage without including tests 419 | coverage-badge -o test/coverage.svg -f # this will create the coverage badge loaded in the Readme 420 | ``` 421 | # Authors 422 | 423 | **Code**: [Jordan Garzon] 424 | **Algorithm**: [Jordan Garzon] and [Asaf Nadler] 425 | 426 | from [Akamai Technologies](https://www.akamai.com) 427 | 428 | 429 | [Jordan Garzon]: https://twitter.com/JordGarzon 430 | [Asaf Nadler]: https://twitter.com/AsafNadler 431 | 432 | ```text 433 | ||| ||| 434 | | | __ | | 435 | |-|_____-----/ |_| |_| \-----_____|-| 436 | |_|_________{ }| (^) |{ }__________|_| 437 | || |_| | ^ | |_| || 438 | | \| /\ |/ | 439 | | \ |--| / | 440 | = \ |__| / = 441 | + \ / + ENJOY ! 442 | \ / 443 | \ / 444 | \ / 445 | \ / 446 | \ / 447 | \ / 448 | \ / 449 | \ / 450 | \/ 451 | 452 | ``` -------------------------------------------------------------------------------- /analysis/luda_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "ExecuteTime": { 7 | "end_time": "2021-07-14T11:35:57.868516Z", 8 | "start_time": "2021-07-14T11:35:57.863398Z" 9 | } 10 | }, 11 | "source": [ 12 | "
\n", 13 | " LUDA ANALYSIS NOTEBOOK
" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "ExecuteTime": { 21 | "end_time": "2021-08-02T09:08:44.346388Z", 22 | "start_time": "2021-08-02T09:08:44.311578Z" 23 | } 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import os\n", 28 | "import pickle\n", 29 | "import pandas as pd\n", 30 | "from pprint import pprint" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "ExecuteTime": { 38 | "end_time": "2021-08-02T09:08:44.490307Z", 39 | "start_time": "2021-08-02T09:08:44.477180Z" 40 | } 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "DATA = '../data/data_demo.csv'\n", 45 | "PREPROCESSED_DATA = '../data/data_demo_preprocessed.csv'\n", 46 | "MATRIX_OUTPUT = '../luda_output/mymatrix/'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "ExecuteTime": { 54 | "end_time": "2021-08-02T09:08:44.642951Z", 55 | "start_time": "2021-08-02T09:08:44.623894Z" 56 | } 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "def value_counts(df_col, limit=None):\n", 61 | " normalized = df_col.value_counts(normalize=True)[:limit]\n", 62 | " normal = df_col.value_counts()[:limit]\n", 63 | " normalized.name, normal.name = 'normalized', 'count'\n", 64 | " return pd.concat([normal, normalized], axis=1)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Explore your data" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "ExecuteTime": { 78 | "end_time": "2021-07-15T10:58:46.241896Z", 79 | "start_time": "2021-07-15T10:58:46.239058Z" 80 | } 81 | }, 82 | "source": [ 83 | "## Data" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": { 90 | "ExecuteTime": { 91 | "end_time": "2021-08-02T09:08:45.756817Z", 92 | "start_time": "2021-08-02T09:08:45.317621Z" 93 | } 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/html": [ 99 | "
\n", 100 | "\n", 113 | "\n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | "
urllabel
0http://173.243.112.132/serve/config.binmalicious
1http://194.15.112.29/2ja/panel/config.binmalicious
2http://216.170.125.134/neat/serverphp/config.binmalicious
3http://58.22.101.109/xz/cfg.binmalicious
4http://83.149.95.197/1/cfg.binmalicious
.........
10195http://fhs.mcmaster.ca/main/benefactors/braley...benign
10196http://youtube.com/watch?v=_WQSaqs-fOsbenign
10197http://randomdomain34623.com/B5iioj3SFI5gE_JbH...benign
10198http://randomdomain42219.com/-xwiPbFONIb8/AAAA...benign
10199http://randomdomain39328.com/a/AATXAJwSYig3P9W...benign
\n", 179 | "

10200 rows × 2 columns

\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " url label\n", 184 | "0 http://173.243.112.132/serve/config.bin malicious\n", 185 | "1 http://194.15.112.29/2ja/panel/config.bin malicious\n", 186 | "2 http://216.170.125.134/neat/serverphp/config.bin malicious\n", 187 | "3 http://58.22.101.109/xz/cfg.bin malicious\n", 188 | "4 http://83.149.95.197/1/cfg.bin malicious\n", 189 | "... ... ...\n", 190 | "10195 http://fhs.mcmaster.ca/main/benefactors/braley... benign\n", 191 | "10196 http://youtube.com/watch?v=_WQSaqs-fOs benign\n", 192 | "10197 http://randomdomain34623.com/B5iioj3SFI5gE_JbH... benign\n", 193 | "10198 http://randomdomain42219.com/-xwiPbFONIb8/AAAA... benign\n", 194 | "10199 http://randomdomain39328.com/a/AATXAJwSYig3P9W... benign\n", 195 | "\n", 196 | "[10200 rows x 2 columns]" 197 | ] 198 | }, 199 | "execution_count": 4, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "df = pd.read_csv(DATA)\n", 206 | "df" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 5, 212 | "metadata": { 213 | "ExecuteTime": { 214 | "end_time": "2021-08-02T09:08:46.405848Z", 215 | "start_time": "2021-08-02T09:08:46.190986Z" 216 | } 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/html": [ 222 | "
\n", 223 | "\n", 236 | "\n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | "
countnormalized
benign100000.980392
malicious2000.019608
\n", 257 | "
" 258 | ], 259 | "text/plain": [ 260 | " count normalized\n", 261 | "benign 10000 0.980392\n", 262 | "malicious 200 0.019608" 263 | ] 264 | }, 265 | "execution_count": 5, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "value_counts(df['label'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 6, 277 | "metadata": { 278 | "ExecuteTime": { 279 | "end_time": "2021-08-02T09:08:46.483380Z", 280 | "start_time": "2021-08-02T09:08:46.428743Z" 281 | } 282 | }, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "200 unique malicious URLs\n", 289 | "9978 unique benign URLs\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "for label in ['malicious', 'benign']:\n", 295 | " _ = df[df['label'] == label]\n", 296 | " print(f\"{_['url'].nunique()} unique {label} URLs\")\n", 297 | "\n" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Preprocessed" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 7, 310 | "metadata": { 311 | "ExecuteTime": { 312 | "end_time": "2021-08-02T09:08:47.142816Z", 313 | "start_time": "2021-08-02T09:08:47.051959Z" 314 | } 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "df_preprocessed = pd.read_csv(PREPROCESSED_DATA)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 8, 324 | "metadata": { 325 | "ExecuteTime": { 326 | "end_time": "2021-08-02T09:08:47.379022Z", 327 | "start_time": "2021-08-02T09:08:47.369022Z" 328 | } 329 | }, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "benign 10000\n", 335 | "malicious 200\n", 336 | "Name: label, dtype: int64" 337 | ] 338 | }, 339 | "execution_count": 8, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "df_preprocessed['label'].value_counts()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "# Cluster analysis" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "Explore your cluster before running the automatic regex generation" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 9, 365 | "metadata": { 366 | "ExecuteTime": { 367 | "end_time": "2021-08-02T09:08:48.366378Z", 368 | "start_time": "2021-08-02T09:08:48.354717Z" 369 | } 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "def load_result(data_path, folder):\n", 374 | " df = pd.read_csv(data_path)\n", 375 | " with open(os.path.join(folder, 'index_to_keep.pkl'), 'rb') as f:\n", 376 | " index_to_keep = pickle.load(f)\n", 377 | " with open(os.path.join(folder, 'labels.pkl'), 'rb') as f:\n", 378 | " labels = pickle.load(f)\n", 379 | " df = df.iloc[index_to_keep, :]\n", 380 | " df['cluster'] = labels\n", 381 | " series_cluster_count = df['cluster'].value_counts()\n", 382 | " print('Clusters : ')\n", 383 | " print(df['cluster'].value_counts())\n", 384 | " for cluster in series_cluster_count.index:\n", 385 | " if cluster == -1:\n", 386 | " continue\n", 387 | " print('#####Cluster {} - {} samples: #### \\n'.format(cluster, series_cluster_count[cluster]))\n", 388 | " pprint(df[(df['cluster']==cluster)]['path'].to_list())\n", 389 | " print('\\n')\n", 390 | " return df\n", 391 | "\n", 392 | "\n", 393 | "def get_stat_cluster(df_features):\n", 394 | " df_features_cluster = pd.DataFrame(df_features.groupby('cluster').agg({'domain': ['nunique'], 
'path': 'count'}).to_records())\n", 395 | " df_features_cluster.columns = ['cluster', 'domain', 'path']\n", 396 | " df_features_cluster = df_features_cluster[df_features_cluster['cluster'] !=-1]\n", 397 | " n_path = df_features_cluster['path'].sum()\n", 398 | " print('{} paths ({} %) clustered from {} domains !'.format(n_path, round(100*n_path/df_features['path'].nunique(), 2), df_features_cluster['domain'].sum()))\n", 399 | " print('Cluster number: {}'.format(df_features_cluster['cluster'].nunique()))\n", 400 | " return df_features_cluster.sort_values('path', ascending=False)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 10, 406 | "metadata": { 407 | "ExecuteTime": { 408 | "end_time": "2021-08-02T09:08:48.687947Z", 409 | "start_time": "2021-08-02T09:08:48.540847Z" 410 | } 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "Clusters : \n", 418 | " 0 27\n", 419 | " 4 17\n", 420 | " 6 16\n", 421 | " 1 16\n", 422 | " 10 15\n", 423 | " 15 12\n", 424 | " 11 12\n", 425 | " 3 10\n", 426 | " 8 9\n", 427 | " 16 8\n", 428 | " 7 8\n", 429 | " 9 8\n", 430 | " 5 8\n", 431 | " 12 8\n", 432 | " 13 8\n", 433 | " 2 8\n", 434 | " 14 8\n", 435 | "-1 2\n", 436 | "Name: cluster, dtype: int64\n", 437 | "#####Cluster 0 - 27 samples: #### \n", 438 | "\n", 439 | "['/neat/serverphp/config.bin',\n", 440 | " '/serverphp/config.bin',\n", 441 | " '/Zeus/serverphp/config.bin',\n", 442 | " '/files/serverphp/config.bin',\n", 443 | " '/high/serverphp/config.bin',\n", 444 | " '/work/server.php/config.bin',\n", 445 | " '/nice/serverphp/config.bin',\n", 446 | " '/online/serverphp/config.bin',\n", 447 | " '/adm/serverphp/config.bin',\n", 448 | " '/plain/serverphp/config.bin',\n", 449 | " '/dbb/serverphp/config.bin',\n", 450 | " '/figo/serverphp/config.bin',\n", 451 | " '/fine/serverphp/config.bin',\n", 452 | " '/sys/serverphp/config.bin',\n", 453 | " '/dbd/serverphp/config.bin',\n", 454 | " '/nku/serverphp/config.bin',\n", 455 | " '/lg/server-php/config.bin',\n", 456 | " '/crome/serverphp/config.bin',\n", 457 | " '/db/serverphp/config.bin',\n", 458 | " '/good/serverphp/config.bin',\n", 459 | " '/serverp/config.bin',\n", 460 | " '/dolls/serverphp/config.bin',\n", 461 | " '/pus1/serverphp/config.bin',\n", 462 | " '/lg/server.php/config.bin',\n", 463 | " '/ekene/Severphp/config.bin',\n", 464 | " '/server[php]/config.bin',\n", 465 | " '/versy/serverphp/config.bin']\n", 466 | "\n", 467 | "\n", 468 | "#####Cluster 4 - 17 samples: #### \n", 469 | "\n", 470 | "['/mupanel/post.php',\n", 471 | " '/jiz/kbpanel/post.php',\n", 472 | " '/sync/kbpanel/post.php',\n", 473 | " '/doc/kbpanel/post.php',\n", 474 | " '/K/kbpanel/post.php',\n", 475 | " '/panel/post.php',\n", 476 | " '/kbpanel/post.php',\n", 477 | " '/_cpanel/post.php',\n", 478 | " '/KB/kbpanel/post.php',\n", 479 | " '/led/kbpanel/post.php',\n", 480 | " '/laww/kbpanel/post.php',\n", 481 | " '/php/kbpanel/post.php',\n", 482 | " '/tru/kbpanel/post.php',\n", 483 | " '/edu/kbpanel/post.php',\n", 484 | " '/low/kbpanel/post.php',\n", 485 | " '/1/kbpanel/post.php',\n", 486 | " '/new/kbpanel/post.php']\n", 487 | "\n", 488 | "\n", 489 | "#####Cluster 6 - 16 samples: #### \n", 490 | "\n", 491 | "['/wp-content/themes/twentyeleven/img5.php',\n", 492 | " '/site/wp-content/themes/twentyeleven/e.php',\n", 493 | " '/wp-content/themes/twentyeleven/get.php',\n", 494 | " '/wp-content/themes/twentytwelve/r.php',\n", 495 | " '/wp-content/themes/twentyeleven/3.php',\n", 496 | " 
'/wp-content/themes/twentyeleven/post.php',\n", 497 | " '/wp-content/themes/twentyeleven/js',\n", 498 | " '/wp-content/themes/twentytwelve/img3.php',\n", 499 | " '/wp-content/themes/twentytwelve/c.php',\n", 500 | " '/wp-content/themes/twentyeleven/ccccc.php',\n", 501 | " '/wp-content/themes/twentytwelve/cccc.php',\n", 502 | " '/wp-content/themes/twentytwelve/rr.php',\n", 503 | " '/wp-content/themes/twentytwelve/g1.php',\n", 504 | " '/wp-content/themes/twentytwelve/st1.exe',\n", 505 | " '/wp-content/themes/twentyeleven/a.php',\n", 506 | " '/wp-content/themes/twentyeleven/cc.php']\n", 507 | "\n", 508 | "\n", 509 | "#####Cluster 1 - 16 samples: #### \n", 510 | "\n", 511 | "['/xz/cfg.bin',\n", 512 | " '/1/cfg.bin',\n", 513 | " '/zs/cfg.bin',\n", 514 | " '/me/cfg.bin',\n", 515 | " '/zsb/cfg.bin',\n", 516 | " '/zex/cfg.bin',\n", 517 | " '/izu/cfg.bin',\n", 518 | " '/zus/cfg.bin',\n", 519 | " '/z/cfg.bin',\n", 520 | " '/2/cfg.bin',\n", 521 | " '/te/cfg.bin',\n", 522 | " '/ZUS/cfg.bin',\n", 523 | " '/ze/cfg.bin',\n", 524 | " '/zv/cfg.bin',\n", 525 | " '/zu/cfg.bin',\n", 526 | " '/c/cfg.bin']\n", 527 | "\n", 528 | "\n", 529 | "#####Cluster 10 - 15 samples: #### \n", 530 | "\n", 531 | "['/wp-content/plugins/wp-db-backup-made/test.php',\n", 532 | " '/wp-content/plugins/wp-db-backup-made/BYA4Ks.php',\n", 533 | " '/RRYZZ/wp-content/plugins/wp-db-backup-made/test.php',\n", 534 | " '/wp-content/plugins/wp-db-backup-made/g1.php',\n", 535 | " '/wp-content/plugins/wp-db-backup-made/ap1.php',\n", 536 | " '/wp-content/plugins/wp-db-backup-made/das.db',\n", 537 | " '/wp-content/plugins/wp-db-backup-made/ap2.php',\n", 538 | " '/wp-content/plugins/wp-db-backup-made/3.php',\n", 539 | " '/wp-content/plugins/wp-db-backup-made/ap5.php',\n", 540 | " '/wp-content/plugins/wp-db-backup-made/ap4.php',\n", 541 | " '/wp-content/plugins/wp-db-backup-made/mein.hlp',\n", 542 | " '/wp-content/plugins/wp-db-backup-made/Xoeyqs.php',\n", 543 | " '/wp-content/plugins/wp-db-backup-made/3ILBop.php',\n", 544 | " '/wp-content/plugins/wp-db-backup-made/c.php',\n", 545 | " '/blog/wp-content/plugins/wp-db-backup-made/d.php']\n", 546 | "\n", 547 | "\n", 548 | "#####Cluster 15 - 12 samples: #### \n", 549 | "\n", 550 | "['/bla31/gate.php',\n", 551 | " '/bla09/gate.php',\n", 552 | " '/bla06/gate.php',\n", 553 | " '/bla10/gate.php',\n", 554 | " '/bla11/gate.php',\n", 555 | " '/bla12/gate.php',\n", 556 | " '/bla07/gate.php',\n", 557 | " '/bla30/gate.php',\n", 558 | " '/bla08/gate.php',\n", 559 | " '/bla19/gate.php',\n", 560 | " '/bla25/gate.php',\n", 561 | " '/bla05/gate.php']\n", 562 | "\n", 563 | "\n", 564 | "#####Cluster 11 - 12 samples: #### \n", 565 | "\n", 566 | "['/serve/config.bin',\n", 567 | " '/.tmp/server/config.bin',\n", 568 | " '/loja/.db/server/config.bin',\n", 569 | " '/zcp/server/config.bin',\n", 570 | " '/new/server/config.bin',\n", 571 | " '/.db/server/config.bin',\n", 572 | " '/server/config.bin',\n", 573 | " '/web/server/config.bin',\n", 574 | " '/go/server/config.bin',\n", 575 | " '/123/server/config.bin',\n", 576 | " '/servero/config.bin',\n", 577 | " '/wpi/server/config.bin']\n", 578 | "\n", 579 | "\n", 580 | "#####Cluster 3 - 10 samples: #### \n", 581 | "\n", 582 | "['/imgs/keybase/post.php',\n", 583 | " '/pcss/keybase/post.php',\n", 584 | " '/grey/keybase/post.php',\n", 585 | " '/key/keybase/post.php',\n", 586 | " '/app/keybase/post.php',\n", 587 | " '/img/keybase/post.php',\n", 588 | " '/k/keybase/post.php',\n", 589 | " '/css/keybase/post.php',\n", 590 | " '/.key/keybase/post.php',\n", 591 | " 
'/old/keybase/post.php']\n", 592 | "\n", 593 | "\n", 594 | "#####Cluster 8 - 9 samples: #### \n", 595 | "\n", 596 | "['/images/config.bin',\n", 597 | " '/images/1/config.bin',\n", 598 | " '/do/images/config.bin',\n", 599 | " '/image3/config.bin',\n", 600 | " '/t1/images/config.bin',\n", 601 | " '/image/config.bin',\n", 602 | " '/css/images/config.bin',\n", 603 | " '/.images/config.bin',\n", 604 | " '/wp-images/config.bin']\n", 605 | "\n", 606 | "\n", 607 | "#####Cluster 16 - 8 samples: #### \n", 608 | "\n", 609 | "['/ade/PHP/index.php',\n", 610 | " '/one/PHP/index.php',\n", 611 | " '/goe/PHP/index.php',\n", 612 | " '/gg/PHP/index.php',\n", 613 | " '/joe/PHP/index.php',\n", 614 | " '/nze/PHP/index.php',\n", 615 | " '/kg/PHP/index.php',\n", 616 | " '/ME/PHP/index.php']\n", 617 | "\n", 618 | "\n", 619 | "#####Cluster 7 - 8 samples: #### \n", 620 | "\n", 621 | "['/lol/web/config/index.php',\n", 622 | " '/web/config/index.php',\n", 623 | " '/pfd/config/index.php',\n", 624 | " '/cach/web/config/index.php',\n", 625 | " '/web/web/config/index.php',\n", 626 | " '/config/index.php',\n", 627 | " '/css/config/index.php',\n", 628 | " '/Fish/web/config/index.php']\n", 629 | "\n", 630 | "\n", 631 | "#####Cluster 9 - 8 samples: #### \n", 632 | "\n", 633 | "['/bm_a/controller.php',\n", 634 | " '/adm/controller.php',\n", 635 | " '/bm/controller.php',\n", 636 | " '/bm_b/controller.php',\n", 637 | " '/4n/controller.php',\n", 638 | " '/J/controller.php',\n", 639 | " '/3/controller.php',\n", 640 | " '/br/controller.php']\n", 641 | "\n", 642 | "\n", 643 | "#####Cluster 5 - 8 samples: #### \n", 644 | "\n", 645 | "['/imagens/logo.gif',\n", 646 | " '/images/logof.gif',\n", 647 | " '/Images/logos.gif',\n", 648 | " '/image/logos.gif',\n", 649 | " '/images/logos1.gif',\n", 650 | " '/images/flogo.gif',\n", 651 | " '/imagens/logos.gif',\n", 652 | " '/images/logo2.gif']\n", 653 | "\n", 654 | "\n", 655 | "#####Cluster 12 - 8 samples: #### \n", 656 | "\n", 657 | "['/TJ/Count.asp',\n", 658 | " '/TJ/count.asp',\n", 659 | " '/1/count.asp',\n", 660 | " '/t/Count.asp',\n", 661 | " '/f/count.asp',\n", 662 | " '/a/Count.asp',\n", 663 | " '/2/count.asp',\n", 664 | " '/jt/count.asp']\n", 665 | "\n", 666 | "\n", 667 | "#####Cluster 13 - 8 samples: #### \n", 668 | "\n", 669 | "['/wp-content/uploads/2018/1Ih',\n", 670 | " '/wp-content/uploads/2017/NVa',\n", 671 | " '/wp-content/uploads/2019/6AP0',\n", 672 | " '/wp-content/uploads/2016/04/',\n", 673 | " '/wp-content/uploads/2018/Cc',\n", 674 | " '/wp-content/uploads/2019/41',\n", 675 | " '/wp-content/uploads/2019/12/app',\n", 676 | " '/wp-content/uploads/2015/KD']\n", 677 | "\n", 678 | "\n", 679 | "#####Cluster 2 - 8 samples: #### \n", 680 | "\n", 681 | "['/office/invoice_22114.doc',\n", 682 | " '/office/invoice_11154.doc',\n", 683 | " '/office/invoice_11148.doc',\n", 684 | " '/office/invoice_221214.doc',\n", 685 | " '/office/invoice_22112.doc',\n", 686 | " '/office/invoice_22113.doc',\n", 687 | " '/office/invoice_22121.doc',\n", 688 | " '/office/invoice_21441.doc']\n", 689 | "\n", 690 | "\n", 691 | "#####Cluster 14 - 8 samples: #### \n", 692 | "\n", 693 | "['/2ja/panel/config.bin',\n", 694 | " '/ghpanel/config.bin',\n", 695 | " '/cmm/panel/config.bin',\n", 696 | " '/ceepanel/config.bin',\n", 697 | " '/idk/panel/config.bin',\n", 698 | " '/cpanel/config.bin',\n", 699 | " '/Panel/config.bin',\n", 700 | " '/ash/panel/config.bin']\n", 701 | "\n", 702 | "\n" 703 | ] 704 | } 705 | ], 706 | "source": [ 707 | "df_features = load_result(PREPROCESSED_DATA, MATRIX_OUTPUT)" 708 | ] 709 | }, 710 | 
{ 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [] 716 | } 717 | ], 718 | "metadata": { 719 | "kernelspec": { 720 | "display_name": "Python 3", 721 | "language": "python", 722 | "name": "python3" 723 | }, 724 | "language_info": { 725 | "codemirror_mode": { 726 | "name": "ipython", 727 | "version": 3 728 | }, 729 | "file_extension": ".py", 730 | "mimetype": "text/x-python", 731 | "name": "python", 732 | "nbconvert_exporter": "python", 733 | "pygments_lexer": "ipython3", 734 | "version": "3.7.4" 735 | }, 736 | "toc": { 737 | "base_numbering": 1, 738 | "nav_menu": {}, 739 | "number_sections": true, 740 | "sideBar": true, 741 | "skip_h1_title": false, 742 | "title_cell": "Table of Contents", 743 | "title_sidebar": "Contents", 744 | "toc_cell": false, 745 | "toc_position": { 746 | "height": "calc(100% - 180px)", 747 | "left": "10px", 748 | "top": "150px", 749 | "width": "282.344px" 750 | }, 751 | "toc_section_display": true, 752 | "toc_window_display": true 753 | } 754 | }, 755 | "nbformat": 4, 756 | "nbformat_minor": 2 757 | } 758 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | conf.py 3 | 4 | We store here alsmot all internal global variables for this project. To configure your project, in general 5 | you'll need to use config.json 6 | """ 7 | 8 | import os 9 | 10 | WORKING_DIR = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | SRC_DIR = os.path.join(WORKING_DIR, 'src') 13 | 14 | LUDA_OUTPUT = os.path.join(WORKING_DIR, 'luda_output') 15 | 16 | CONFIG_FILE = os.path.join(WORKING_DIR, 'config.json') 17 | 18 | # Data 19 | 20 | DATA = os.path.join(WORKING_DIR, 'data') 21 | 22 | PREPROCESSED_SUFFIX = '_preprocessed.csv' 23 | 24 | DATA_LABELS = ['malicious', 'benign'] 25 | 26 | VT_KEY = os.path.join(WORKING_DIR, 'vt_key.txt') 27 | 28 | # Clustering 29 | 30 | MATRIX_FOLDER = os.path.join(LUDA_OUTPUT, 'matrix_output') 31 | 32 | MATRIX_STATS_FOLDER = os.path.join(LUDA_OUTPUT, 'matrix_stats') 33 | 34 | DISTANCE_MATRIX = 'matrix.pkl' 35 | 36 | INDEX = 'index.pkl' 37 | 38 | MATRIX_STATS = 'matrix_stats.pkl' 39 | 40 | SIMILARITY_MAX = 100 41 | 42 | INDEX_TO_KEEP = 'index_to_keep.pkl' 43 | 44 | LABELS = 'labels.pkl' 45 | 46 | # Logs 47 | 48 | LOGGER_NAME = 'luda' 49 | 50 | LOG_FOLDER = os.path.join(LUDA_OUTPUT, 'logs') 51 | 52 | LOGGER_FILE = os.path.join(LOG_FOLDER, 'luda.log') 53 | 54 | LOG_FILE_SIZE = 10 * 1000000 # 10 MB 55 | 56 | LOG_FILE_NUMBER = 5 57 | 58 | # Regex 59 | 60 | REGEX_FOLDER_OUTPUT = os.path.join(LUDA_OUTPUT, 'regex_output') 61 | 62 | REGEX_FOLDER = os.path.join(SRC_DIR, 'regex') 63 | 64 | REGEX_SH = os.path.join(REGEX_FOLDER, "ConsoleRegexTurtle", "dist", 'regexturtle.sh') 65 | 66 | REGEX_JAVA = os.path.join(REGEX_FOLDER, "ConsoleRegexTurtle", "dist", 'ConsoleRegexTurtle.jar') 67 | 68 | REGEX_TMP = os.path.join(REGEX_FOLDER_OUTPUT, 'tmp') 69 | 70 | BENIGN_FOR_RETRAIN = 20 71 | 72 | TEST_BATCH_SIZE = 50000 73 | 74 | REGEX_RUNNER = os.path.join(REGEX_FOLDER, 'RegexRunner.jar') 75 | 76 | INPUT_REGEX_RUNNER = os.path.join(REGEX_FOLDER_OUTPUT, 'input_regex_runner.json') 77 | 78 | OUTPUT_REGEX_RUNNER = os.path.join(REGEX_FOLDER_OUTPUT, 'output_regex_runner.json') 79 | 80 | LAST_REGEX_LIST = os.path.join(DATA, 'regex_list.json') 81 | 82 | # Coverage 83 | 84 | COVERAGE_FOLDER = os.path.join(LUDA_OUTPUT, 'coverage') 85 | 86 | # Crawler 87 | 88 | MAX_LEN_URL = 100 89 | 90 | 
TIMEOUT_CRAWL = 10 91 | 92 | DEPTH_MAX = 10 93 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | luda: 4 | build: . 5 | volumes: 6 | - "./data/:/code/data" 7 | ports: 8 | - "5555:8888" 9 | container_name: luda 10 | image: luda_image 11 | command: "jupyter notebook --allow-root --no-browser --ip 0.0.0.0 --NotebookApp.token='luda' " 12 | 13 | #if you don't want to run jupyter notebook, you can run this below command to keep the container alive 14 | 15 | #command: "tail -F anything" # just to keep it running 16 | 17 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from src.logger_code import init_logger 4 | from src.regex.regex import Regex 5 | from src.use_case.use_case_clustering import UseCaseClustering 6 | from src.use_case.use_case_regex_generation import UseCaseRegexGeneration 7 | from src.use_case.use_case_feeder import UseCaseFeeder 8 | from src.use_case.use_case_preprocessor import UseCasePreprocessor 9 | from src.use_case.use_case_data import UseCaseData 10 | from src.utils import process_file_name 11 | from src.utils import process_preprocessed_file_name 12 | import conf 13 | 14 | __author__ = "Jordan Garzon" 15 | __email__ = "jgarzon@akamai.com" 16 | 17 | with open(conf.CONFIG_FILE) as json_file: 18 | config = json.load(json_file) 19 | 20 | 21 | def main(): 22 | logger = init_logger() 23 | main_file = process_file_name(config['main_file']) 24 | preprocessed_file = process_preprocessed_file_name(main_file, config['clustering']['preprocessed_file']) 25 | if config['data']['run']: 26 | UseCaseData().run(main_file, config['data']['additional_files']) 27 | if config['feeder']['run']: 28 | logger.info('Running the feeders') 29 | UseCaseFeeder().fetch_and_save(config['feeder']['sources'], main_file) 30 | 31 | if config['preprocessing']['run']: 32 | logger.info('Running the preprocessing') 33 | UseCasePreprocessor().run(config['preprocessing']['name'], main_file) 34 | 35 | if config['clustering']['run']: 36 | logger.info('Running the clustering') 37 | 38 | use_case_clustering = UseCaseClustering() 39 | use_case_clustering.run(file_path=preprocessed_file, 40 | skip_compute_distance=config['clustering']['skip_distance_computation'], 41 | save_folder=config['clustering']['features_folder'], 42 | clusterer=config['clustering']['clusterer'], 43 | filter_th=config['clustering']['filter_similarity']) 44 | 45 | # Regex Step 46 | if config['regex']['run']: 47 | logger.info('Running the regexes') 48 | regex_object = Regex(project_name=config['regex']['regex_folder']) 49 | use_case_regex = UseCaseRegexGeneration(regex_object) 50 | 51 | use_case_regex.run(main_file=preprocessed_file, 52 | cluster_list=config['regex']['cluster_list'], 53 | features_folder=config['clustering']['features_folder'], 54 | benign_for_retrain=config['regex']['benign_for_retrain'], 55 | take_existing_result=config['regex']['take_existing_result'], 56 | round_max=config['regex']['round_max'], 57 | min_path_for_run=config['regex']['min_path_for_run']) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | js_regex 2 | Cython 3 | 
sklearn 4 | hdbscan 5 | numpy 6 | python-Levenshtein 7 | pandas 8 | tqdm 9 | psutil 10 | tldextract 11 | beautifulsoup4 12 | urllib3 13 | requests 14 | pytest -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/__init__.py -------------------------------------------------------------------------------- /src/clustering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/clustering/__init__.py -------------------------------------------------------------------------------- /src/clustering/distance_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import shutil 3 | import pickle 4 | import time 5 | import os 6 | import logging 7 | import multiprocessing as mp 8 | from collections import defaultdict 9 | from tqdm import tqdm 10 | 11 | from src.utils import create_folder 12 | from src.clustering.metrics import DISTANCE_FUNC 13 | import conf 14 | 15 | logger = logging.getLogger(conf.LOGGER_NAME) 16 | 17 | 18 | class DistanceMatrix(object): 19 | def __init__(self, url_list, matrix=None, distance_func=None, folder=None): 20 | """ 21 | Compute distance matrix from list of strings 22 | :param url_list: list of urls or paths 23 | :param matrix: ndarray 24 | :param distance_func: Example distance_func=lev.distance or DISTANCE_FUNC['sw'] 25 | :param folder: folder to save the results 26 | """ 27 | self.distance_func = distance_func 28 | if not distance_func: 29 | self.distance_func = DISTANCE_FUNC['sw'] 30 | self.url_list = url_list 31 | self.matrix = matrix 32 | self.folder = folder 33 | self.stats = defaultdict(int) 34 | 35 | def run(self, ncores=mp.cpu_count(), skip_calc=False): 36 | """ 37 | Compute the matrix distances with multiprocessing 38 | :param ncores: number of cores to use 39 | :param skip_calc: bool. If True, skip the computation to the loading phase only 40 | :return: 41 | """ 42 | if not skip_calc: 43 | create_folder(conf.MATRIX_FOLDER) 44 | self.__delete_matrix_stat_folder() 45 | _input_distance = self.__get_argument_create_matrix(ncores) 46 | processes = [mp.Process(target=self.__create_matrix_distance, args=x) for x in _input_distance] 47 | 48 | for p in processes: 49 | p.start() 50 | for p in processes: 51 | p.join() 52 | 53 | matrix = self.__get_big_matrix() # to allocate as much as you want of memory in the kernel echo 1 > /proc/sys/vm/overcommit_memory 54 | self.matrix = matrix 55 | self.stats = self.__get_big_stats() 56 | 57 | self.__save() 58 | self.__delete_matrix_stat_folder(delete_folder=True) 59 | 60 | return matrix 61 | 62 | @classmethod 63 | def load(cls, folder_save): 64 | """ 65 | Load a save folder for a future use, clustering for example. 
66 | :param folder_save: path of the save folder 67 | :return: void 68 | """ 69 | with open(os.path.join(folder_save, conf.DISTANCE_MATRIX), 'rb') as pickle_file: 70 | distance_matrix = pickle.load(pickle_file) 71 | with open(os.path.join(folder_save, conf.INDEX), 'rb') as pickle_file: 72 | index = pickle.load(pickle_file) 73 | return cls(index, distance_matrix, folder=folder_save) 74 | 75 | def __get_argument_create_matrix(self, ncores): 76 | """ 77 | This function will map the computation and will give the arguments to be passed to the function 78 | create_matrix_distance 79 | :param ncores: number of cores to use 80 | :return: list of tuple 81 | """ 82 | nsamples = len(self.url_list) 83 | 84 | distance_number = nsamples * (nsamples + 1) / 2 # we compute only half of the matrix 85 | computation_per_core = round(distance_number / ncores) 86 | computation_tuple_list = [] 87 | a = nsamples 88 | b = nsamples # in case ncores = 1 89 | for i in range(ncores - 1): 90 | b = self.__get_a(a, computation_per_core) 91 | computation_tuple_list.append((a, b)) 92 | a = b 93 | computation_tuple_list.append((b, 0)) 94 | 95 | return computation_tuple_list 96 | 97 | def __create_matrix_distance(self, b, a): 98 | """ 99 | Fill the matrix between line a and b. 100 | It dumps then only the lines filled ( to save space). 101 | :param b: int 102 | :param a: int 103 | :return: void 104 | """ 105 | logger.info('Running Process {}'.format(os.getpid())) 106 | before = time.time() 107 | n = len(self.url_list) 108 | distance_matrix = np.zeros(shape=(b - a, n), 109 | dtype=np.int32) # we need double for HDBScan. We create a rectangle to save memory 110 | for i in tqdm(range(b - a)): 111 | for j in range(a + i): 112 | try: 113 | distance_score = self.distance_func(self.url_list[a + i], self.url_list[j]) 114 | distance_matrix[i, j] = int(round(100 * distance_score / max(len(self.url_list[a + i]), len( 115 | self.url_list[j])))) # we want a unique scale for short and 116 | # long string. Scale 0: 100 117 | except Exception as e: 118 | logger.error( 119 | "Error {} when computing the distance between line {} and column {}".format(e, a + i, j)) 120 | with open(os.path.join(conf.MATRIX_FOLDER, "{}_distance_matrix.pkl".format(a)), 'wb') as f: 121 | 122 | pickle.dump(distance_matrix, f, protocol=4) # protocol=4 to dump matrices bigger than 4GB 123 | 124 | logger.info('Process {} done in {} s'.format(os.getpid(), time.time() - before)) 125 | if self.stats: 126 | logger.info('Dumping stats') 127 | create_folder(conf.MATRIX_STATS_FOLDER) 128 | with open(os.path.join(conf.MATRIX_STATS_FOLDER, '{}_stats.pkl'.format(a)), 'wb') as f: 129 | pickle.dump(dict(self.stats), f) 130 | return distance_matrix 131 | 132 | def __save(self): 133 | """ 134 | Save the final results into a folder. 
This folder can be then used for a clustering for example 135 | :return: void 136 | """ 137 | if not self.folder: 138 | _time = int(time.time() * 1000) 139 | folder = 'save_{}'.format(_time) 140 | logger.info(f'No folder specified, we save the results in {folder}') 141 | os.mkdir(folder) 142 | self.folder = folder 143 | elif not os.path.isdir(self.folder): 144 | os.mkdir(self.folder) 145 | logger.info('Dumping matrix') 146 | with open(os.path.join(self.folder, conf.DISTANCE_MATRIX), 'wb') as f: 147 | pickle.dump(self.matrix, f, protocol=4) 148 | logger.info('Dumping index') 149 | with open(os.path.join(self.folder, conf.INDEX), 'wb') as f: 150 | pickle.dump(self.url_list, f) 151 | if self.stats: 152 | logger.info('Dumping stats') 153 | with open(os.path.join(self.folder, conf.MATRIX_STATS), 'wb') as f: 154 | pickle.dump(dict(self.stats), f) 155 | 156 | @staticmethod 157 | def __get_a(b, s): 158 | """ 159 | In a triangular matrix, the number of cells to compute between line a and line b is 160 | (b-a +1)*(a + b) /2 161 | We solved the equation to be able to get a given b and s. 162 | 163 | The idea is that s should be the same for all the processes 164 | :param b: line b - int 165 | :param s: int 166 | :return: a - int 167 | """ 168 | return int((-1 - np.sqrt(4 * (-2 * s + b ** 2 + b))) / (-2)) 169 | 170 | @staticmethod 171 | def __symmetrize(a): 172 | """ 173 | Return a symmetrized version of a 174 | """ 175 | return a + a.T - np.diag(a.diagonal()) 176 | 177 | def __get_big_matrix(self, complete_with_zero=False): 178 | """ 179 | Load all the matrices dumped by the function create_matrix_distance and symmetrize them 180 | :return: ndarray 181 | """ 182 | matrix_list = [] 183 | for file in sorted(os.listdir(conf.MATRIX_FOLDER), key=lambda x: int(x.split('_')[0])): 184 | logger.info('Loading {}'.format(file)) 185 | with open(os.path.join(conf.MATRIX_FOLDER, file), 'rb') as f: 186 | matrix_list.append(pickle.load(f)) 187 | concatenated_matrix = np.concatenate(matrix_list) 188 | if complete_with_zero: # useful when we add new urls on a computed matrix 189 | concatenated_matrix = np.concatenate( 190 | (np.zeros( 191 | shape=(concatenated_matrix.shape[1] - concatenated_matrix.shape[0], concatenated_matrix.shape[1]), 192 | dtype=np.int32), concatenated_matrix), 193 | axis=0) 194 | full_matrix = self.__symmetrize(concatenated_matrix) 195 | np.fill_diagonal(full_matrix, conf.SIMILARITY_MAX) 196 | return full_matrix 197 | 198 | @staticmethod 199 | def __get_big_stats(): 200 | """ 201 | Load all the stats dumped by the processes and combine them 202 | :return: dict 203 | """ 204 | stats = {} 205 | for file in sorted(os.listdir(conf.MATRIX_STATS_FOLDER)): 206 | logger.info('Loading {}'.format(file)) 207 | with open(os.path.join(conf.MATRIX_STATS_FOLDER, file), 'rb') as f: 208 | stats.update(pickle.load(f)) 209 | return stats 210 | 211 | @staticmethod 212 | def __delete_matrix_stat_folder(delete_folder=False): 213 | """ 214 | Delete all the temp matrices dumped 215 | :param delete_folder: bool. If True delete the folder 216 | :return: void 217 | """ 218 | for folder in [conf.MATRIX_FOLDER, conf.MATRIX_STATS_FOLDER]: 219 | shutil.rmtree(folder, ignore_errors=True) 220 | if not delete_folder: # We clean the old matrix 221 | create_folder([conf.MATRIX_FOLDER, conf.MATRIX_STATS_FOLDER]) 222 | logger.info('Old matrices deleted.') 223 | 224 | def add_url_list(self, url_list_to_add): 225 | """ 226 | Add more samples to a precomputed matrices. Done in single process only. 
227 | :param url_list_to_add: url list to add 228 | :return: void 229 | """ 230 | n = len(self.url_list) 231 | base_matrix = self.matrix 232 | self.url_list += url_list_to_add 233 | self.__delete_matrix_stat_folder() 234 | self.__create_matrix_distance(len(self.url_list), n) 235 | self.matrix = sum([self.__reshape_base_matrix(base_matrix, len(url_list_to_add)), 236 | self.__get_big_matrix(complete_with_zero=True)]) 237 | self.__save() 238 | 239 | @staticmethod 240 | def __reshape_base_matrix(base_matrix, n_to_add): 241 | """ 242 | Use to add new urls to a precomputed distance matrix. It creates cells filled with 0 to the new value to be 243 | computed 244 | :param base_matrix: matrix distance computed 245 | :param n_to_add: number of samples to add 246 | :return: new matrix with the new shape 247 | """ 248 | result = np.concatenate((base_matrix, np.zeros(shape=(n_to_add, base_matrix.shape[0]), dtype=np.int32)), 249 | axis=0) 250 | result = np.concatenate((result, np.zeros(shape=(result.shape[0], n_to_add), dtype=np.int32)), axis=1) 251 | return result 252 | -------------------------------------------------------------------------------- /src/clustering/metrics.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import Levenshtein as lev 4 | from src.clustering import swalign 5 | 6 | STATS = defaultdict(int) 7 | 8 | 9 | def get_sw_distance(match, mismatch, gap_penalty): 10 | scoring = swalign.NucleotideScoringMatrix(match, mismatch) 11 | sw = swalign.LocalAlignment(scoring, gap_penalty=gap_penalty) 12 | return sw.align 13 | 14 | 15 | def longest_sub(str_a, str_b, th=10): 16 | """ 17 | Return the longest common substring between two strings. 18 | It also saves the result into a dictionary to make statistics 19 | :param str_a: str 20 | :param str_b: str 21 | :param th: size of the common string from which we can store it into the stat dict 22 | :return: 0 or 1 23 | """ 24 | global STATS 25 | m = len(str_a) 26 | n = len(str_b) 27 | counter = [[0] * (n + 1) for x in range(m + 1)] 28 | longest = 0 29 | lcs_set = set() 30 | for i in range(m): 31 | for j in range(n): 32 | if str_a[i] == str_b[j]: 33 | c = counter[i][j] + 1 34 | counter[i + 1][j + 1] = c 35 | if c > longest: 36 | lcs_set = set() 37 | longest = c 38 | lcs_set.add(str_a[i - c + 1:i + 1]) 39 | elif c == longest: 40 | lcs_set.add(str_a[i - c + 1:i + 1]) 41 | if len(lcs_set) >= 1: 42 | if len(list(lcs_set)[0]) >= th: 43 | STATS[list(lcs_set)[0]] += 1 44 | return 1 45 | return 0 46 | 47 | 48 | DISTANCE_FUNC = {'sw': get_sw_distance(match=1, mismatch=-1, gap_penalty=-1), 49 | 'lev': lev.distance, 50 | 'longest': longest_sub} 51 | -------------------------------------------------------------------------------- /src/clustering/swalign.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | I took this code from the module swalign and I adapted it to Python 3. I also removed the part not useful 4 | in our use case. 
You can find the original code here https://pypi.org/project/swalign/ 5 | ''' 6 | try: 7 | from StringIO import StringIO 8 | except ImportError: 9 | from io import StringIO 10 | 11 | 12 | class NucleotideScoringMatrix(object): 13 | def __init__(self, match=1, mismatch=-1): 14 | self.match = match 15 | self.mismatch = mismatch 16 | 17 | def score(self, one, two, wildcard=None): 18 | if wildcard and (one in wildcard or two in wildcard): 19 | return self.match 20 | 21 | if one == two: 22 | return self.match 23 | return self.mismatch 24 | 25 | 26 | class Matrix(object): 27 | def __init__(self, rows, cols, init=None): 28 | self.rows = rows 29 | self.cols = cols 30 | self.values = [init, ] * rows * cols 31 | 32 | def get(self, row, col): 33 | return self.values[(row * self.cols) + col] 34 | 35 | def set(self, row, col, val): 36 | self.values[(row * self.cols) + col] = val 37 | 38 | 39 | class LocalAlignment(object): 40 | def __init__(self, scoring_matrix, gap_penalty=-1, gap_extension_penalty=-1, gap_extension_decay=0.0, 41 | prefer_gap_runs=True, verbose=False, globalalign=False, wildcard=None, full_query=False): 42 | self.scoring_matrix = scoring_matrix 43 | self.gap_penalty = gap_penalty 44 | self.gap_extension_penalty = gap_extension_penalty 45 | self.gap_extension_decay = gap_extension_decay 46 | self.verbose = verbose 47 | self.prefer_gap_runs = prefer_gap_runs 48 | self.globalalign = globalalign 49 | self.wildcard = wildcard 50 | self.full_query = full_query 51 | 52 | def align(self, ref, query, ref_name='', query_name='', rc=False): 53 | orig_ref = ref 54 | orig_query = query 55 | 56 | ref = ref.upper() 57 | query = query.upper() 58 | 59 | matrix = Matrix(len(query) + 1, len(ref) + 1, (0, ' ', 0)) 60 | for row in range(1, matrix.rows): 61 | matrix.set(row, 0, (0, 'i', 0)) 62 | 63 | for col in range(1, matrix.cols): 64 | matrix.set(0, col, (0, 'd', 0)) 65 | 66 | max_val = 0 67 | max_row = 0 68 | max_col = 0 69 | 70 | # calculate matrix 71 | for row in range(1, matrix.rows): 72 | for col in range(1, matrix.cols): 73 | mm_val = matrix.get(row - 1, col - 1)[0] + self.scoring_matrix.score(query[row - 1], ref[col - 1], 74 | self.wildcard) 75 | 76 | ins_run = 0 77 | del_run = 0 78 | 79 | if matrix.get(row - 1, col)[1] == 'i': 80 | ins_run = matrix.get(row - 1, col)[2] 81 | if matrix.get(row - 1, col)[0] == 0: 82 | # no penalty to start the alignment 83 | ins_val = 0 84 | else: 85 | if not self.gap_extension_decay: 86 | ins_val = matrix.get(row - 1, col)[0] + self.gap_extension_penalty 87 | else: 88 | ins_val = matrix.get(row - 1, col)[0] + min(0, 89 | self.gap_extension_penalty + ins_run * self.gap_extension_decay) 90 | else: 91 | ins_val = matrix.get(row - 1, col)[0] + self.gap_penalty 92 | 93 | if matrix.get(row, col - 1)[1] == 'd': 94 | del_run = matrix.get(row, col - 1)[2] 95 | if matrix.get(row, col - 1)[0] == 0: 96 | # no penalty to start the alignment 97 | del_val = 0 98 | else: 99 | if not self.gap_extension_decay: 100 | del_val = matrix.get(row, col - 1)[0] + self.gap_extension_penalty 101 | else: 102 | del_val = matrix.get(row, col - 1)[0] + min(0, 103 | self.gap_extension_penalty + del_run * self.gap_extension_decay) 104 | 105 | else: 106 | del_val = matrix.get(row, col - 1)[0] + self.gap_penalty 107 | 108 | if self.globalalign or self.full_query: 109 | cell_val = max(mm_val, del_val, ins_val) 110 | else: 111 | cell_val = max(mm_val, del_val, ins_val, 0) 112 | 113 | if not self.prefer_gap_runs: 114 | ins_run = 0 115 | del_run = 0 116 | 117 | if del_run and cell_val == del_val: 118 | 
val = (cell_val, 'd', del_run + 1) 119 | elif ins_run and cell_val == ins_val: 120 | val = (cell_val, 'i', ins_run + 1) 121 | elif cell_val == mm_val: 122 | val = (cell_val, 'm', 0) 123 | elif cell_val == del_val: 124 | val = (cell_val, 'd', 1) 125 | elif cell_val == ins_val: 126 | val = (cell_val, 'i', 1) 127 | else: 128 | val = (0, 'x', 0) 129 | 130 | if val[0] >= max_val: 131 | max_val = val[0] 132 | max_row = row 133 | max_col = col 134 | 135 | matrix.set(row, col, val) 136 | 137 | # backtrack 138 | if self.globalalign: 139 | # backtrack from last cell 140 | row = matrix.rows - 1 141 | col = matrix.cols - 1 142 | val = matrix.get(row, col)[0] 143 | elif self.full_query: 144 | # backtrack from max in last row 145 | row = matrix.rows - 1 146 | max_val = 0 147 | col = 0 148 | for c in range(1, matrix.cols): 149 | if matrix.get(row, c)[0] > max_val: 150 | col = c 151 | max_val = matrix.get(row, c)[0] 152 | col = matrix.cols - 1 153 | val = matrix.get(row, col)[0] 154 | else: 155 | # backtrack from max 156 | row = max_row 157 | col = max_col 158 | val = max_val 159 | 160 | op = '' 161 | aln = [] 162 | 163 | path = [] 164 | while True: 165 | val, op, runlen = matrix.get(row, col) 166 | 167 | if self.globalalign: 168 | if row == 0 and col == 0: 169 | break 170 | elif self.full_query: 171 | if row == 0: 172 | break 173 | else: 174 | if val <= 0: 175 | break 176 | 177 | path.append((row, col)) 178 | aln.append(op) 179 | 180 | if op == 'm': 181 | row -= 1 182 | col -= 1 183 | elif op == 'i': 184 | row -= 1 185 | elif op == 'd': 186 | col -= 1 187 | else: 188 | break 189 | 190 | aln.reverse() 191 | 192 | if self.verbose: 193 | self.dump_matrix(ref, query, matrix, path) 194 | print(aln) 195 | print((max_row, max_col), max_val) 196 | 197 | cigar = _reduce_cigar(aln) 198 | # return Alignment(orig_query, orig_ref, row, col, cigar, max_val, ref_name, query_name, rc, self.globalalign, self.wildcard) 199 | return max_val 200 | 201 | 202 | def _reduce_cigar(operations): 203 | count = 1 204 | last = None 205 | ret = [] 206 | for op in operations: 207 | if last and op == last: 208 | count += 1 209 | elif last: 210 | ret.append((count, last.upper())) 211 | count = 1 212 | last = op 213 | 214 | if last: 215 | ret.append((count, last.upper())) 216 | return ret 217 | -------------------------------------------------------------------------------- /src/feeder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/feeder/__init__.py -------------------------------------------------------------------------------- /src/feeder/alexa_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | from typing import List 4 | 5 | 6 | from src.feeder.feed_downloader import FeedDownloader 7 | from src.feeder.feed_downloader import Url 8 | import conf 9 | 10 | logger = logging.getLogger(conf.LOGGER_NAME) 11 | 12 | 13 | class AlexaFeedDownloader(FeedDownloader): 14 | MAX_DOMAIN = 5 15 | 16 | def fetch(self) -> List[Url]: 17 | alexa_top_1m = pd.read_csv("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip", names=['rank', 'domain']) 18 | urls = self.domains_to_urls(alexa_top_1m['domain'].head(self.MAX_DOMAIN)) 19 | return [Url(url, 'Alexa', 'Benign') for url in urls] 20 | -------------------------------------------------------------------------------- /src/feeder/crawler/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/feeder/crawler/__init__.py -------------------------------------------------------------------------------- /src/feeder/crawler/crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | import urllib.request 4 | import urllib3 5 | import ssl 6 | import tldextract 7 | from bs4 import BeautifulSoup 8 | 9 | from .endrecursive import EndRecursive 10 | import conf 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 14 | ssl._create_default_https_context = ssl._create_unverified_context 15 | 16 | 17 | class Crawler(object): 18 | """ 19 | This class contains all the methods related to url crawling 20 | """ 21 | HEADERS = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8", 22 | "Accept-Encoding": "gzip, deflate", "Accept-Language": "*", "Connection": "keep-alive", 23 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 24 | "Chrome/86.0.42400.198 Safari/537.36"} 25 | 26 | URLS = 'urls' 27 | opener = urllib.request.build_opener() 28 | opener.addheaders = [(v, k) for k, v in HEADERS.items()] 29 | urllib.request.install_opener(opener) 30 | 31 | def __init__(self, _url, lock=None, depth=conf.DEPTH_MAX): 32 | self.lock = lock 33 | self.main_domain = self.__get_primary_domain(_url) 34 | self.main_page = 'http://' + self.main_domain 35 | self.url = _url 36 | self.url_set = set() 37 | self.domain_redirected = self.main_domain # By default it's the same value 38 | self.depth = depth 39 | 40 | def run(self): 41 | """ 42 | Main method to run the crawler 43 | :return: 44 | """ 45 | url_fixed = self.fix_url(self.url) 46 | try: 47 | self.recursive_crawl(url_fixed) 48 | except EndRecursive: 49 | return self.url_set 50 | 51 | def recursive_crawl(self, _url): 52 | """ 53 | Recursive crawling on a website. It crawls all the urls found. 54 | :param _url: url to crawl 55 | :return: void 56 | """ 57 | if not _url: 58 | return None 59 | self.end_recursive_check() 60 | request = self.__request(_url) 61 | if not request: # request = None 62 | return 63 | if len(request.content) < 5: 64 | logger.info(f'We skip {_url}. 
EMPTY CONTENT') 65 | return 66 | soup = BeautifulSoup(request.content, 'html.parser') 67 | if len(self.url_set) == 0: # The first one, we always process 68 | self.__check_for_redirection(request) 69 | self.url_set.add(_url) 70 | self.end_recursive_check() 71 | self.__parse(_url, soup) 72 | 73 | def __parse(self, _url, soup): 74 | """ 75 | Parse the soup of the URL 76 | :param _url: url 77 | :param soup: bs4 object 78 | :return: void 79 | """ 80 | for i in soup.find_all("a"): 81 | if 'href' not in i.attrs: 82 | continue 83 | 84 | href = i.attrs['href'] 85 | if len(href) > conf.MAX_LEN_URL: 86 | logger.info('TOO LONG URL {}'.format(_url)) 87 | continue 88 | 89 | if href.startswith("/"): 90 | href = self.main_page + href 91 | 92 | if href.startswith("http"): 93 | if not (self.__get_primary_domain(href).endswith(self.main_domain)) and ( 94 | not self.__get_primary_domain(href).endswith(self.domain_redirected)): 95 | logger.debug('We skip {}'.format(href)) 96 | continue 97 | 98 | if href not in self.url_set: 99 | self.url_set.add(href) 100 | logger.info('Scraping {}'.format(href)) 101 | self.recursive_crawl(href) 102 | 103 | def __request(self, _url): 104 | """ 105 | Make the requests and handles different exceptions 106 | :param _url: url 107 | :return: request object 108 | """ 109 | try: 110 | request = requests.get(_url, timeout=conf.TIMEOUT_CRAWL, headers=self.HEADERS, 111 | verify=False) # 10 seconds timeout 112 | except requests.exceptions.ConnectTimeout as e: 113 | logger.error("CONNECT TIMEOUT for {}".format(_url)) 114 | return 115 | except requests.exceptions.ReadTimeout as e: 116 | logger.error("READ TIMEOUT for {}".format(_url)) 117 | return 118 | except requests.exceptions.SSLError as e: 119 | logger.error("SSL Error for {}. Exception {}".format(_url, e)) 120 | return 121 | except requests.exceptions.ConnectionError as e: 122 | try: 123 | if 'nodename nor servname provided, or not known' in e.args[0].reason.args[0]: 124 | logger.error(f'{_url} DOWN') 125 | else: 126 | logger.error(f'Connection error for requesting {_url}') 127 | return 128 | except Exception: 129 | logger.error(f'Connection error for requesting {_url}') 130 | return 131 | except Exception as e: 132 | logger.error(f'NEW error {e} for requesting {_url}') 133 | return 134 | return request 135 | 136 | def __check_for_redirection(self, request): 137 | new_url = request.url 138 | domain = self.__get_primary_domain(new_url) 139 | if domain != self.main_domain: 140 | self.domain_redirected = domain 141 | 142 | def __get_primary_domain(self, _url): 143 | """ 144 | Get primary domain from an URL 145 | :param _url: url 146 | :return: primary domain 147 | """ 148 | if self.lock: 149 | self.lock.acquire() 150 | primary_domain = tldextract.extract(_url).domain + '.' + tldextract.extract(_url).suffix 151 | if self.lock: 152 | self.lock.release() 153 | return primary_domain 154 | 155 | def end_recursive_check(self): 156 | if len(self.url_set) >= self.depth: 157 | logger.info('Depth max {} reached'.format(self.depth)) 158 | raise EndRecursive() # Nice way to cut the process 159 | 160 | @staticmethod 161 | def fix_url(domain): 162 | if "." 
not in domain: 163 | return None 164 | domain = domain.replace('\n', '') 165 | if domain.endswith('.'): 166 | domain = domain[:-1] 167 | if not domain.startswith("http"): 168 | domain = f'http://{domain}' 169 | if domain.endswith('/'): 170 | domain = domain[:-1] 171 | return domain 172 | -------------------------------------------------------------------------------- /src/feeder/crawler/endrecursive.py: -------------------------------------------------------------------------------- 1 | class EndRecursive(Exception): 2 | pass 3 | 4 | -------------------------------------------------------------------------------- /src/feeder/feed_downloader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from abc import ABC, abstractmethod 4 | from dataclasses import dataclass 5 | from typing import List 6 | 7 | from src.feeder.crawler.crawler import Crawler 8 | import conf 9 | 10 | logger = logging.getLogger(conf.LOGGER_NAME) 11 | 12 | 13 | @dataclass 14 | class Url: 15 | """ 16 | Basic format of URL 17 | """ 18 | url: str 19 | source: str 20 | label: str 21 | family: str = None 22 | 23 | def __iter__(self): 24 | """ 25 | We use this function for iterating over a list of URL object 26 | :return: iterator 27 | """ 28 | return iter([self.url, self.source, self.label, self.family]) 29 | 30 | def __post_init__(self): 31 | """ 32 | Function that run after the init and convert to lowercase the label 33 | :return: 34 | """ 35 | self.label = self.label.lower() 36 | assert self.label in ['benign', 'malicious'] 37 | 38 | 39 | class FeedDownloader(ABC): 40 | """ 41 | Abstract feeder class. 42 | 43 | You need to implement the fetch method. 44 | """ 45 | 46 | def run(self): 47 | """ 48 | Runner 49 | :return: list of Url object 50 | """ 51 | list_of_urls = self.fetch() 52 | source = list_of_urls[0].source 53 | logger.info(f'{len(list_of_urls)} downloaded from {source}') 54 | return list_of_urls 55 | 56 | @abstractmethod 57 | def fetch(self) -> List[Url]: 58 | """ 59 | Need to be implemented by each subclass 60 | :return: list of Urls object 61 | """ 62 | raise NotImplementedError 63 | 64 | def fetch_and_save(self, filename="data.csv"): 65 | """ 66 | Fetch and save Urls to CSV 67 | :param filename: path of the csv 68 | :return: list of Urls object 69 | """ 70 | list_of_urls = self.fetch() 71 | self.save_to_csv(list_of_urls, filename) 72 | return list_of_urls 73 | 74 | @staticmethod 75 | def save_to_csv(url_list, filename) -> None: 76 | """ 77 | Save list of Urls to csv 78 | :param url_list: list of Url object 79 | :param filename: path where the csv wil be stored 80 | :return: void 81 | """ 82 | columns = list(Url.__annotations__) 83 | with open(filename, 'w') as csv_file: 84 | wr = csv.writer(csv_file, delimiter=',') 85 | wr.writerow(columns) 86 | for url in url_list: 87 | wr.writerow(list(url)) 88 | logger.info(f'{len(url_list)} URls written into {filename}') 89 | 90 | @staticmethod 91 | def get_urls_from_domain(_url, depth_max=5): 92 | """ 93 | Convert domain to URLs. 
Run the crawler that will recursively look for URLs from the same domain 94 | :param _url: url string 95 | :param depth_max: Max depth for crawling 96 | :return: url set (not Url objects) 97 | """ 98 | crawler_object = Crawler(_url, depth=depth_max) 99 | return crawler_object.run() 100 | 101 | def domains_to_urls(self, domain_list): 102 | """ 103 | List of domains to send to get_urls_from_domain 104 | :param domain_list: list of domains 105 | :return: all urls from all the domains 106 | """ 107 | url_list = [] 108 | for domain in domain_list: 109 | try: 110 | url_list += list(self.get_urls_from_domain(domain)) 111 | except Exception as e: 112 | logger.exception(e) 113 | return url_list 114 | -------------------------------------------------------------------------------- /src/feeder/iscx_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from io import BytesIO 3 | from zipfile import ZipFile 4 | 5 | from src.feeder.feed_downloader import FeedDownloader 6 | from src.feeder.feed_downloader import Url 7 | from src.feeder.crawler.crawler import Crawler 8 | 9 | 10 | class IscxFeedDownloader(FeedDownloader): 11 | DOWNLOAD_URL = "http://205.174.165.80/CICDataset/ISCX-URL-2016/Dataset/ISCXURL2016.zip" 12 | 13 | def fetch(self): 14 | resp = requests.get(self.DOWNLOAD_URL, headers=Crawler.HEADERS).content 15 | zipfile = ZipFile(BytesIO(resp)) 16 | result = [] 17 | for line in zipfile.open("FinalDataset/URL/Benign_list_big_final.csv").readlines(): 18 | result.append(Url(line.decode('utf-8').replace('/\r\n', ""), 'iscx', 'benign')) 19 | return result 20 | -------------------------------------------------------------------------------- /src/feeder/majestic_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import List 3 | 4 | from src.feeder.feed_downloader import FeedDownloader 5 | from src.feeder.feed_downloader import Url 6 | 7 | 8 | class MajesticFeedDownloader(FeedDownloader): 9 | MAX_MAJESTIC = 100 10 | 11 | def fetch(self) -> List[Url]: 12 | majestic_top_1m = pd.read_csv("http://downloads.majesticseo.com/majestic_million.csv", usecols=['Domain']) 13 | urls = self.domains_to_urls(majestic_top_1m['Domain'].head(self.MAX_MAJESTIC)) # 'Domain' matches the column name selected with usecols above 14 | 15 | return [Url(url, 'Majestic', 'Benign') for url in urls] 16 | -------------------------------------------------------------------------------- /src/feeder/openfish_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from typing import List 3 | 4 | from src.feeder.feed_downloader import Url 5 | from src.feeder.feed_downloader import FeedDownloader 6 | 7 | 8 | class OpenPhishFeedDownloader(FeedDownloader): 9 | def fetch(self) -> List[Url]: 10 | openphish_url = "https://openphish.com/feed.txt" 11 | malicious_urls = requests.get(openphish_url).content.decode('utf-8').split('\n') 12 | return [Url(url, 'OpenPhish', 'Malicious', "Phishing") for url in malicious_urls] 13 | -------------------------------------------------------------------------------- /src/feeder/umbrella_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import List 3 | 4 | from src.feeder.feed_downloader import Url 5 | from src.feeder.feed_downloader import FeedDownloader 6 | 7 | 8 | class UmbrellaFeedDownloader(FeedDownloader): 9 | def fetch(self) -> List[Url]: 10 |
umbrella_domains = pd.read_csv("http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip", 11 | names=['rank', 'domain']) 12 | urls = self.domains_to_urls(umbrella_domains['domain']) 13 | 14 | return [Url(url, 'Umbrella', 'Benign') for url in urls] 15 | -------------------------------------------------------------------------------- /src/feeder/urlhaus_feed_downloader.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from src.feeder.feed_downloader import Url 4 | from src.feeder.feed_downloader import FeedDownloader 5 | 6 | 7 | class URLHausFeedDownloader(FeedDownloader): 8 | def fetch(self) -> List[Url]: 9 | import requests 10 | urlhaus_url = "https://urlhaus.abuse.ch/downloads/text_recent/" 11 | malicious_urls = requests.get(urlhaus_url).content.decode('utf-8').split('\r\n') 12 | return [Url(url, 'URLHaus', 'malicious') for url in malicious_urls] 13 | -------------------------------------------------------------------------------- /src/feeder/vt_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import math 3 | import logging 4 | 5 | from src.feeder.feed_downloader import FeedDownloader 6 | from src.feeder.feed_downloader import Url 7 | import conf 8 | 9 | logger = logging.getLogger(conf.LOGGER_NAME) 10 | 11 | 12 | class VtFeedDownloader(FeedDownloader): 13 | """ 14 | This class can be used to bring either benign or malicious URLs. 15 | We currently use it to bring benign URLs. To force VT to return URLs with a path, we require that the path 16 | include the letter "a". Don't forget to store your key in a file. 17 | """ 18 | QUERY = """entity:url path:'a' response_code:200 p:0""" 19 | 20 | def __init__(self): 21 | self.bulk = 300 # Because the API does not accept a limit larger than 300 22 | self.api_key = self.load_key() 23 | self.headers = {'x-apikey': self.api_key} 24 | 25 | def fetch(self): 26 | return self.get_records() 27 | 28 | def get_records(self, query=QUERY, number=1000): 29 | if number <= self.bulk: 30 | bulk = number 31 | else: 32 | bulk = self.bulk 33 | 34 | params = {'query': query, 'limit': bulk} 35 | 36 | r = requests.get('https://www.virustotal.com/api/v3/intelligence/search', params, headers=self.headers) 37 | request_json = r.json() 38 | lst = request_json['data'] 39 | number_of_bulks = math.ceil(((number - bulk) / bulk)) 40 | for i in range(number_of_bulks): 41 | url = request_json['links']['next'] 42 | r = requests.get(url, headers=self.headers) 43 | request_json = r.json() 44 | lst.extend(request_json['data']) 45 | 46 | return [Url(item['attributes']['last_final_url'], 'vt', 'benign') for item in lst] 47 | 48 | @staticmethod 49 | def load_key(_path=conf.VT_KEY): 50 | try: 51 | with open(_path) as f: 52 | key = f.read() 53 | except Exception as e: 54 | logger.error('You need to store the VT API key in a file before continuing.
You can put it in ' 55 | '{}'.format(conf.VT_KEY)) 56 | return 57 | return key 58 | -------------------------------------------------------------------------------- /src/logger_code.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.handlers 3 | import sys 4 | import os 5 | import pathlib 6 | import conf 7 | 8 | 9 | def init_logger(debug_mode=True, logger_file=conf.LOGGER_FILE): 10 | """ 11 | Init logger file and print the log in stdout in debug_mode 12 | :param debug_mode: bool 13 | :param logger_file: path of the file log 14 | :return: logger object 15 | """ 16 | if not os.path.exists(conf.LOG_FOLDER): # we don't use the function from utils to not create a circular dependency 17 | pathlib.Path(conf.LOG_FOLDER).mkdir(parents=True, exist_ok=True) 18 | _logger = logging.getLogger(conf.LOGGER_NAME) 19 | _logger.setLevel(logging.DEBUG) 20 | formatter = logging.Formatter('%(asctime)s - %(module)s - %(levelname)s - %(message)s') 21 | fh = logging.handlers.RotatingFileHandler( 22 | logger_file, maxBytes=conf.LOG_FILE_SIZE, backupCount=conf.LOG_FILE_NUMBER) 23 | 24 | fh.setFormatter(formatter) 25 | _logger.addHandler(fh) 26 | 27 | if debug_mode: 28 | stdout_handler = logging.StreamHandler(sys.stdout) 29 | stdout_handler.setFormatter(formatter) 30 | _logger.addHandler(stdout_handler) 31 | 32 | return _logger 33 | -------------------------------------------------------------------------------- /src/preprocessor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/preprocessor/__init__.py -------------------------------------------------------------------------------- /src/preprocessor/preprocessor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | import os 4 | from random import randint 5 | from abc import ABC, abstractmethod 6 | from tqdm import tqdm 7 | from urllib.parse import urlparse 8 | 9 | from src.utils import create_folder 10 | import conf 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | 14 | 15 | class Preprocessor(ABC): 16 | """ 17 | Abstract class for preprocessing classes 18 | """ 19 | 20 | def run(self, file_path): 21 | """ 22 | Basic runner 23 | :param file_path: path of the main file. In general data.csv 24 | :return: void 25 | """ 26 | df = pd.read_csv(file_path) 27 | df_basic_processed = self.basic_preprocessing(df) 28 | 29 | df_processed = self.process(df_basic_processed) 30 | create_folder(path=conf.DATA) 31 | preprocessed_file_name = os.path.join(conf.DATA, 32 | os.path.basename(file_path).replace('.csv', conf.PREPROCESSED_SUFFIX)) 33 | logger.info(f"Data preprocessed saved in {preprocessed_file_name}") 34 | df_processed.to_csv(preprocessed_file_name, index=False) 35 | return df_processed 36 | 37 | @abstractmethod 38 | def process(self, df): 39 | """ 40 | Abstract method that needs to be implemented. 41 | :param df: DataFrame object. 
42 | :return: DataFrame object after your filter has been applied 43 | """ 44 | raise NotImplementedError 45 | 46 | @staticmethod 47 | def basic_preprocessing(df): 48 | """ 49 | This runs before your preprocessing and extracts some basic features for you 50 | :param df: DataFrame with benign and malicious samples 51 | :return: DataFrame object with the new columns 52 | """ 53 | features_dict_list = [] 54 | df = df.dropna(subset=['url']) 55 | urls = df['url'].unique() 56 | logger.info(f"{len(urls)} unique URLs found in the dataframe (shape {df.shape})") 57 | for url in tqdm(urls): 58 | full_url = url 59 | if full_url.startswith("/"): # it may be only a path 60 | full_url = f"http://randomdomain{str(randint(0, 10 ** 5))}.com{full_url}" 61 | if not (full_url.startswith('http://') or full_url.startswith( 62 | 'https://')): # urlparse does not work well without it 63 | full_url = 'http://{}'.format(full_url) 64 | full_url = full_url.replace('\n', '') 65 | parsed_uri = urlparse(full_url) 66 | extension = '' 67 | 68 | if parsed_uri.path.find('.') != -1 and not parsed_uri.path.endswith('/'): 69 | extension = parsed_uri.path[parsed_uri.path.rfind('.'):] 70 | features_dict_list.append( 71 | {"url": url, 72 | 'full_url': full_url, 73 | 'domain': parsed_uri.netloc, 74 | 'path': parsed_uri.path, 75 | 'params': parsed_uri.params, 76 | 'query': parsed_uri.query, 77 | 'path_len': len(parsed_uri.path), 78 | 'extension': extension, 79 | 'folder_count': parsed_uri.path.count('/')}) 80 | 81 | result = pd.DataFrame(features_dict_list) 82 | result = pd.merge(df, result, how='left', on='url') 83 | result['url'] = result['full_url'] 84 | result = result.drop(columns=['full_url']) 85 | logger.info('Final df shape {}'.format(result.shape)) 86 | logger.info('Unique path: {}'.format(result['path'].nunique())) 87 | logger.info('Unique domain: {}'.format(result['domain'].nunique())) 88 | logger.info('Path with at least one folder: {} \n'.format(result[result['folder_count'] > 1]['path'].nunique())) 89 | return result 90 | -------------------------------------------------------------------------------- /src/preprocessor/preprocessor_basic.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | import pandas as pd 5 | import json 6 | 7 | from src.preprocessor.preprocessor import Preprocessor 8 | from src.regex.regex import Regex 9 | from src.utils import create_folder 10 | 11 | import conf 12 | 13 | logger = logging.getLogger(conf.LOGGER_NAME) 14 | 15 | MIN_LEN = 7 16 | MAX_PATH = 45000 17 | 18 | 19 | class PreprocessorBasic(Preprocessor): 20 | """ 21 | Basic preprocessor class. It filters duplicates, URLs already caught by the old regexes, and overly long paths, 22 | and keeps one path per domain, etc. 23 | """ 24 | 25 | def process(self, df): 26 | """ 27 | Method that has to be implemented. Runs all the sub-methods and returns a filtered DataFrame.
28 | :param df: DataFrame 29 | :return: DataFrame filtered 30 | """ 31 | df_benign = df[df['label'] == 'benign'] 32 | df = df[df['label'] == 'malicious'] 33 | number_of_path_before_cleaning = df['path'].nunique() 34 | df = self.remove_path_duplicates(df) 35 | df = self.keep_one_path_per_domain(df) 36 | df = self.clean_df(df, benign_path=df_benign['path'].unique()) 37 | df = self.keep_path_with_folders(df) 38 | df = self.clean_with_regexes(df) 39 | df = self.check_size(df) 40 | self.show_stat(df) 41 | logger.info( 42 | 'In total we cleaned {} (- {} %) paths'.format(number_of_path_before_cleaning - df['path'].nunique(), 43 | (1 - df[ 44 | 'path'].nunique() / number_of_path_before_cleaning) * 100)) 45 | return pd.concat([df, df_benign], sort=False) 46 | 47 | @staticmethod 48 | def remove_path_duplicates(df): 49 | """ 50 | It should be mandatory. Take unique paths 51 | :param df: DataFrame 52 | :return: DataFrame 53 | """ 54 | logger.info('Shape with path duplicates : {}'.format(df.shape)) 55 | result = df.drop_duplicates(['path']) 56 | logger.info('Shape without path duplicates : {} (-{} %)'.format(result.shape, 57 | round(100 - 100 * result.shape[0] / df.shape[0], 58 | 2))) 59 | return result 60 | 61 | @staticmethod 62 | def keep_one_path_per_domain(df): 63 | """ 64 | To reduce the number of paths and to more generalize the regexes, we can choose only one path per domain 65 | :param df: DataFrame 66 | :return: DataFrame 67 | """ 68 | logger.info('Shape with domain duplicates : {}'.format(df.shape)) 69 | result = df.drop_duplicates(['domain']) 70 | logger.info('Shape without domain duplicates : {} (-{} %)'.format(result.shape, 71 | round( 72 | 100 - 100 * result.shape[0] / df.shape[0], 73 | 2))) 74 | return result 75 | 76 | @staticmethod 77 | def show_stat(df): 78 | """ 79 | Show basic stats 80 | :param df: DataFrame 81 | :return: void 82 | """ 83 | logger.info('Shape {}'.format(df.shape)) 84 | logger.info('Path : {}'.format(df['path'].nunique())) 85 | logger.info('Domain : {}'.format(df['domain'].nunique())) 86 | logger.info('Mean path len: {}'.format(df['path_len'].mean())) 87 | 88 | def clean_df(self, df, benign_path, path_len=MIN_LEN): 89 | """ 90 | Some basic cleaning 91 | :param df: DataFrame 92 | :param benign_path: list of benign path 93 | :param path_len: minimal path len 94 | :return: DataFrame 95 | """ 96 | df['filter_wp'] = df['path'].apply(self.clean_wordpress, args=(MIN_LEN,)) 97 | new_df = df[(df['path_len'] >= path_len) & (~df['path'].isin(benign_path)) & (df['filter_wp'] == False)] 98 | logger.info('Cleaning : {} -- > {} paths (-{}%)'.format(df['path'].nunique(), new_df['path'].nunique(), 99 | round(new_df['path'].nunique() / df['path'].nunique(), 100 | 2))) 101 | return new_df 102 | 103 | def clean_with_regexes(self, df): 104 | """ 105 | Remove from the DataFrame Urls already caught by your existing regexes. 106 | :param df: DataFrame object 107 | :return: DataFrame filtered 108 | """ 109 | if not os.path.exists(conf.LAST_REGEX_LIST): 110 | logger.info('We did not find {}. 
We skip the cleaning with regexes step'.format(conf.LAST_REGEX_LIST)) 111 | return df 112 | with open(conf.LAST_REGEX_LIST) as json_file: 113 | regex_list = json.load(json_file)['regexes'] 114 | already_found = self.regex_test(regex_list, list(df[df['path'].notnull()]['path'].unique()))[0] 115 | logger.info('{} paths are already found with the old regexes'.format(len(already_found))) 116 | df = df[~df['path'].isin(list(already_found))] 117 | return df 118 | 119 | @staticmethod 120 | def regex_test(regex_list, list_to_test, pickle_save=os.path.join(conf.COVERAGE_FOLDER, 'nevada_coverage.pickle')): 121 | """ 122 | 123 | :param regex_list: regex list ( string ) 124 | :param list_to_test: list of urls to test 125 | :param pickle_save: if specified, save the statistics of this test in a pickle that you can open with the 126 | Jupyter Notebook for analysis 127 | :return: tuple (set of Urls found, dictonnary with catches by regex, DataFrame with some stats) 128 | """ 129 | all_found = set() 130 | dict_found = {} 131 | for _re in regex_list: 132 | print(f'Testing regex {_re}') 133 | _, found = Regex.check_regex_list(_re, list_to_test) 134 | print(f'Match {len(found)} paths !') 135 | dict_found[_re] = found 136 | all_found = all_found.union(set(found)) 137 | data = {'regex': list(dict_found.keys()), 'count': [len(x) for x in list(dict_found.values())]} 138 | 139 | df_stat = pd.DataFrame(data) 140 | print(f"Coverage {str(df_stat['count'].sum())} {round(100 * df_stat['count'].sum() / len(list_to_test), 2)} % ") 141 | if pickle_save: 142 | create_folder(pickle_save) 143 | with open(pickle_save, 'wb') as handle: 144 | pickle.dump(dict_found, handle, protocol=pickle.HIGHEST_PROTOCOL) 145 | print(f'Results saved in {pickle_save}') 146 | return all_found, dict_found, df_stat 147 | 148 | @staticmethod 149 | def keep_path_with_folders(df, th=1): 150 | """ 151 | To avoid FP, sometimes we want to catch 'long' URLs, we more than one folder inside the path 152 | :param df: DataFrame 153 | :param th: number of folders minimum 154 | :return: DataFrame filtered 155 | """ 156 | logger.info('Shape : {}'.format(df.shape)) 157 | result = df[df['folder_count'] > th] 158 | logger.info('Shape with at least {} folder(s) : {} (-{} %)'.format(th, result.shape, 159 | round(100 - 100 * result.shape[0] / df.shape[ 160 | 0], 161 | 2))) 162 | return result 163 | 164 | @staticmethod 165 | def clean_wordpress(x, path_len_min): 166 | """ 167 | Wordpress paths create many FP, we filter here the most popular paths 168 | :param x: url 169 | :param path_len_min: path len min 170 | :return: bool 171 | """ 172 | 173 | def return_if_path_len_ok(x, key): 174 | return len(x.replace(key, '')) < path_len_min 175 | 176 | wordpress_dict = {'wp-admin': ['wp-admin/images', 'wp-admin/css', 'wp-admin/js'], 177 | 'wp-includes': ['wp-includes/css', 'wp-includes/js', 'wp-includes/images'], 178 | 'wp-content': ['wp-content/plugins', 'wp-content/themes', 'wp-content/uploads', 179 | 'wp-content/languages/themes', 'wp-content/mu-plugins'], 180 | 'images': ['images/logos.gif', 'images/logo.gif'], 181 | 'contact': ['contact-us'], 182 | 'config.bin': [], 183 | 'admin.php': [], 184 | 'login.php': [], 185 | 'index.php': [], 186 | 'gate.php': [], 187 | '.jpg': [], 188 | '.png': [], 189 | '.php': []} 190 | for wp_key in wordpress_dict: 191 | if wp_key in x: 192 | for wp_key_path in wordpress_dict[wp_key]: 193 | if wp_key_path in x: 194 | return return_if_path_len_ok(x, wp_key_path) 195 | return return_if_path_len_ok(x, wp_key) 196 | return False 197 | 198 | 
@staticmethod 199 | def check_size(df): 200 | """ 201 | To avoid issue with computation, we can specifiy a limit of URLs to keep. 202 | 30k unique paths can use 300GB RAM ... 203 | :param df: DataFrame 204 | :return: DataFrame after filter 205 | """ 206 | if df['path'].nunique() > MAX_PATH: 207 | logger.info('r/!\ NUMBER OF PATHS TOO HIGH {}. WE SAMPLE {} paths'.format(df['path'].nunique(), MAX_PATH)) 208 | return df.sample(MAX_PATH) 209 | return df 210 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | Builds, tests, and runs the project ConsoleRegexTurtle. 12 | 13 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build/built-jar.properties: -------------------------------------------------------------------------------- 1 | #Wed, 29 Apr 2015 09:45:34 +0200 2 | 3 | 4 | /home/andrea/NetBeansProjects/RegexGenerator/ConsoleRegexTurtle= 5 | 6 | /home/andrea/NetBeansProjects/RegexGenerator/MaleRegexTree= 7 | 8 | /home/andrea/NetBeansProjects/RegexGenerator/Random\ Regex\ Turtle= 9 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/console/ConsoleRegexTurtle.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/console/ConsoleRegexTurtle.class -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/dto/SimpleConfig.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/build/classes/it/units/inginf/male/dto/SimpleConfig.class -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/ConsoleRegexTurtle.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/ConsoleRegexTurtle.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/README.TXT: -------------------------------------------------------------------------------- 1 | ======================== 2 | BUILD OUTPUT DESCRIPTION 3 | ======================== 4 | 5 | When you build an Java application project that has a main class, the IDE 6 | automatically copies all of the JAR 7 | files on the projects classpath to your projects dist/lib folder. The IDE 8 | also adds each of the JAR files to the Class-Path element in the application 9 | JAR files manifest file (MANIFEST.MF). 
10 | 11 | To run the project from the command line, go to the dist folder and 12 | type the following: 13 | 14 | java -jar "ConsoleRegexTurtle.jar" 15 | 16 | To distribute this project, zip up the dist folder (including the lib folder) 17 | and distribute the ZIP file. 18 | 19 | Notes: 20 | 21 | * If two JAR files on the project classpath have the same name, only the first 22 | JAR file is copied to the lib folder. 23 | * Only JAR files are copied to the lib folder. 24 | If the classpath contains other types of files or folders, these files (folders) 25 | are not copied. 26 | * If a library on the projects classpath also has a Class-Path element 27 | specified in the manifest,the content of the Class-Path element has to be on 28 | the projects runtime path. 29 | * To set a main class in a standard Java project, right-click the project node 30 | in the Projects window and choose Properties. Then click Run and enter the 31 | class name in the Main Class field. Alternatively, you can manually type the 32 | class name in the manifest Main-Class element. 33 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/lib/MaleRegexTree.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/lib/MaleRegexTree.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/lib/Random_Regex_Turtle.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/lib/Random_Regex_Turtle.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/lib/gson-2.2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/dist/lib/gson-2.2.4.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/dist/regexturtle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Executes the command-line version of RegextTurtle; automatically sets the JAVA VM memory size based on the available system memory 3 | MEMSYSTEM=8000 4 | MAXMEM=$(( MEMSYSTEM-512 )) 5 | XMSMEM=$(( MAXMEM/2 )) 6 | echo "System memory:"$MEMSYSTEM "Mbytes" 7 | echo "RegexTurtle is going to use this amount of the system memory:"$MAXMEM "Mbytes" 8 | java -Xmx${MAXMEM}M -Xms${XMSMEM}M -jar "ConsoleRegexTurtle.jar" $@ 9 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-javadoc.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-javadoc.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4-sources.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/lib/Gson/gson-2.2.4.jar -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/lib/nblibraries.properties: -------------------------------------------------------------------------------- 1 | libs.CopyLibs.classpath=\ 2 | ${base}/CopyLibs/org-netbeans-modules-java-j2seproject-copylibstask.jar 3 | libs.CopyLibs.displayName=CopyLibs Task 4 | libs.CopyLibs.prop-version=2.0 5 | libs.Gson.classpath=\ 6 | ${base}/Gson/gson-2.2.4.jar 7 | libs.Gson.javadoc=\ 8 | ${base}/Gson/gson-2.2.4-javadoc.jar!// 9 | libs.Gson.src=\ 10 | ${base}/Gson/gson-2.2.4-sources.jar!// 11 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/manifest.mf: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | X-COMMENT: Main-Class will be added automatically by build 3 | 4 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/genfiles.properties: -------------------------------------------------------------------------------- 1 | build.xml.data.CRC32=26ed0a59 2 | build.xml.script.CRC32=edeeeee4 3 | build.xml.stylesheet.CRC32=8064a381@1.75.0.48 4 | # This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. 5 | # Do not edit this file. You may delete it but then the IDE will never regenerate such files for you. 
6 | nbproject/build-impl.xml.data.CRC32=26ed0a59 7 | nbproject/build-impl.xml.script.CRC32=4c760918 8 | nbproject/build-impl.xml.stylesheet.CRC32=876e7a8f@1.75.0.48 9 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/private/config.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/ConsoleRegexTurtle/nbproject/private/config.properties -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/private/private.properties: -------------------------------------------------------------------------------- 1 | application.args=-t 4 -p 200 -g 500 -e 20.0 -c "interesting evolution" -d testdataset/reduced.json -o ./outputfolder/ 2 | compile.on.save=false 3 | do.depend=false 4 | do.jar=true 5 | javac.debug=true 6 | javadoc.preview=true 7 | user.properties.file=/home/andrea/.netbeans/8.0.1/build.properties 8 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/private/private.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | file:/home/fab/NetBeansProjects/Rilascio%20Month%208/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java 8 | 9 | 10 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/build.xml 11 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/regexturtle.sh 12 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java 13 | file:/home/fab/NetBeansProjects/Rilascio%20Month%2012/ConsoleRegexTurtle/src/it/units/inginf/male/dto/SimpleConfig.java 14 | 15 | 16 | file:/home/fab/Desktop/Month%2012%20Release/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java 17 | file:/home/fab/Desktop/Month%2012%20Release/ConsoleRegexTurtle/src/it/units/inginf/male/console/prova.java 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/project.properties: -------------------------------------------------------------------------------- 1 | annotation.processing.enabled=true 2 | annotation.processing.enabled.in.editor=false 3 | annotation.processing.processors.list= 4 | annotation.processing.run.all.processors=true 5 | annotation.processing.source.output=${build.generated.sources.dir}/ap-source-output 6 | application.title=ConsoleRegexTurtle 7 | application.vendor=fab 8 | build.classes.dir=${build.dir}/classes 9 | build.classes.excludes=**/*.java,**/*.form 10 | # This directory is removed when the project is cleaned: 11 | build.dir=build 12 | build.generated.dir=${build.dir}/generated 13 | build.generated.sources.dir=${build.dir}/generated-sources 14 | # Only compile against the classpath explicitly listed here: 15 | build.sysclasspath=ignore 16 | build.test.classes.dir=${build.dir}/test/classes 17 | build.test.results.dir=${build.dir}/test/results 18 | # Uncomment to specify the preferred debugger connection transport: 19 | #debug.transport=dt_socket 20 | debug.classpath=\ 21 | ${run.classpath} 22 | debug.test.classpath=\ 23 | ${run.test.classpath} 24 | # Files in build.classes.dir which should be excluded from distribution jar 25 | dist.archive.excludes= 26 | 
# This directory is removed when the project is cleaned: 27 | dist.dir=dist 28 | dist.jar=${dist.dir}/ConsoleRegexTurtle.jar 29 | dist.javadoc.dir=${dist.dir}/javadoc 30 | endorsed.classpath= 31 | excludes= 32 | file.reference.WebRegexTurtle-src=../WebRegexTurtle/src 33 | includes=** 34 | jar.compress=false 35 | javac.classpath=\ 36 | ${reference.Random_Regex_Turtle.jar}:\ 37 | ${reference.MaleRegexTree.jar}:\ 38 | ${libs.Gson.classpath} 39 | # Space-separated list of extra javac options 40 | javac.compilerargs= 41 | javac.deprecation=false 42 | javac.processorpath=\ 43 | ${javac.classpath} 44 | javac.source=1.7 45 | javac.target=1.7 46 | javac.test.classpath=\ 47 | ${javac.classpath}:\ 48 | ${build.classes.dir} 49 | javac.test.processorpath=\ 50 | ${javac.test.classpath} 51 | javadoc.additionalparam= 52 | javadoc.author=false 53 | javadoc.encoding=${source.encoding} 54 | javadoc.noindex=false 55 | javadoc.nonavbar=false 56 | javadoc.notree=false 57 | javadoc.private=false 58 | javadoc.splitindex=true 59 | javadoc.use=true 60 | javadoc.version=false 61 | javadoc.windowtitle= 62 | main.class=it.units.inginf.male.console.ConsoleRegexTurtle 63 | manifest.file=manifest.mf 64 | meta.inf.dir=${src.dir}/META-INF 65 | mkdist.disabled=false 66 | platform.active=default_platform 67 | project.license=gpl30 68 | project.MaleRegexTree=../MaleRegexTree 69 | project.Random_Regex_Turtle=../Random Regex Turtle 70 | reference.MaleRegexTree.jar=${project.MaleRegexTree}/dist/MaleRegexTree.jar 71 | reference.Random_Regex_Turtle.jar=${project.Random_Regex_Turtle}/dist/Random_Regex_Turtle.jar 72 | run.classpath=\ 73 | ${javac.classpath}:\ 74 | ${build.classes.dir} 75 | # Space-separated list of JVM arguments used when running the project. 76 | # You may also define separate properties like run-sys-prop.name=value instead of -Dname=value. 
77 | # To set system properties for unit tests define test-sys-prop.name=value: 78 | run.jvmargs= 79 | run.test.classpath=\ 80 | ${javac.test.classpath}:\ 81 | ${build.test.classes.dir} 82 | source.encoding=UTF-8 83 | src.dir=src 84 | test.src.dir=test 85 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/nbproject/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | org.netbeans.modules.java.j2seproject 4 | 5 | 6 | ConsoleRegexTurtle 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ./lib/nblibraries.properties 16 | 17 | 18 | 19 | MaleRegexTree 20 | jar 21 | 22 | jar 23 | clean 24 | jar 25 | 26 | 27 | Random_Regex_Turtle 28 | jar 29 | 30 | jar 31 | clean 32 | jar 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/regexturtle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Executes the command-line version of RegextTurtle; automatically sets the JAVA VM memory size based on the available system memory 3 | MEMDATA=$(free -m | grep Mem:) 4 | ARR=($MEMDATA) 5 | MEMSYSTEM=${ARR[1]} 6 | MAXMEM=$(( MEMSYSTEM-512 )) 7 | XMSMEM=$(( MAXMEM/2 )) 8 | echo "System memory:"$MEMSYSTEM "Mbytes" 9 | echo "RegexTurtle is going to use this amount of the system memory:"$MAXMEM "Mbytes" 10 | java -Xmx${MAXMEM}M -Xms${XMSMEM}M -jar "ConsoleRegexTurtle.jar" $@ -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/src/it/units/inginf/male/console/ConsoleRegexTurtle.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Machine Learning Lab - University of Trieste, 3 | * Italy (http://machinelearning.inginf.units.it/) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 
17 | */ 18 | package it.units.inginf.male.console; 19 | 20 | import com.google.gson.Gson; 21 | import com.google.gson.GsonBuilder; 22 | import it.units.inginf.male.Main; 23 | import it.units.inginf.male.configuration.Configuration; 24 | import it.units.inginf.male.dto.SimpleConfig; 25 | import it.units.inginf.male.inputs.DataSet; 26 | import it.units.inginf.male.inputs.DataSet.Example; 27 | import it.units.inginf.male.outputs.FinalSolution; 28 | import it.units.inginf.male.outputs.Results; 29 | import it.units.inginf.male.postprocessing.BasicPostprocessor; 30 | import it.units.inginf.male.postprocessing.JsonPostProcessor; 31 | import it.units.inginf.male.strategy.ExecutionStrategy; 32 | import it.units.inginf.male.strategy.impl.CoolTextualExecutionListener; 33 | import it.units.inginf.male.utils.Utils; 34 | import java.io.BufferedReader; 35 | import java.io.File; 36 | import java.io.FileInputStream; 37 | import java.io.IOException; 38 | import java.io.InputStreamReader; 39 | import java.util.logging.Level; 40 | import java.util.logging.Logger; 41 | 42 | /** 43 | * Provides a commandline tool for the GP Engine, RandomRegexTurtle. 44 | * 45 | * @author MaleLabTs 46 | */ 47 | public class ConsoleRegexTurtle { 48 | 49 | private static String WARNING_MESSAGE = "\nWARNING\n" 50 | + "The quality of the solution depends on a number of factors, including size and syntactical properties of the learning information.\n" 51 | + "The algorithms embedded in this experimental prototype have always been tested with at least 25 matches over at least 2 examples.\n" 52 | + "It is very unlikely that a smaller number of matches allows obtaining a useful solution.\n"; 53 | 54 | /** 55 | * @param args the command line arguments 56 | */ 57 | public static void main(String[] args) { 58 | SimpleConfig simpleConfiguration = new SimpleConfig(); 59 | 60 | //Set defaults for commandline parameters 61 | simpleConfiguration.datasetName = "./dataset.json"; // -d 62 | simpleConfiguration.outputFolder = "."; // -o 63 | //load simpleconfig defaults 64 | simpleConfiguration.numberOfJobs = 32; // -j 65 | simpleConfiguration.generations = 1000; // -g 66 | simpleConfiguration.numberThreads = 4; // -t 67 | simpleConfiguration.populationSize = 500; //-p 68 | simpleConfiguration.termination = 20; //-e 69 | simpleConfiguration.populateOptionalFields = false; 70 | simpleConfiguration.isStriped = false; 71 | 72 | parseArgs(args, simpleConfiguration); 73 | 74 | try { 75 | simpleConfiguration.dataset = loadDataset(simpleConfiguration.datasetName); 76 | } catch (IOException ex) { 77 | System.out.println("Problem opening the dataset file " + simpleConfiguration.datasetName + "\n"); 78 | Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); 79 | System.exit(1); 80 | } 81 | //Output warning about learning size 82 | String message = null; 83 | int numberPositiveExamples = 0; 84 | for (Example example : simpleConfiguration.dataset.getExamples()) { 85 | if (example.getNumberMatches() > 0) { 86 | numberPositiveExamples++; 87 | } 88 | } 89 | if (simpleConfiguration.dataset.getNumberMatches() < 25 || numberPositiveExamples < 2) { 90 | message = WARNING_MESSAGE; 91 | } 92 | Configuration config = simpleConfiguration.buildConfiguration(); 93 | //change defaults for console usage 94 | config.setPostProcessor(new JsonPostProcessor()); 95 | config.getPostprocessorParameters().put(BasicPostprocessor.PARAMETER_NAME_POPULATE_OPTIONAL_FIELDS, Boolean.toString(simpleConfiguration.populateOptionalFields)); 96 | 
config.setOutputFolderName(simpleConfiguration.outputFolder); 97 | 98 | Results results = new Results(config); 99 | results.setComment(simpleConfiguration.comment); 100 | try { 101 | //This is an optional information 102 | results.setMachineHardwareSpecifications(Utils.cpuInfo()); 103 | } catch (IOException ex) { 104 | Logger.getLogger(ConsoleRegexTurtle.class.getName()).log(Level.SEVERE, null, ex); 105 | } 106 | CoolTextualExecutionListener consolelistener = new CoolTextualExecutionListener(message, config, results); 107 | 108 | long startTime = System.currentTimeMillis(); 109 | ExecutionStrategy strategy = config.getStrategy(); 110 | try { 111 | strategy.execute(config, consolelistener); 112 | } catch (Exception ex) { 113 | Logger.getLogger(ConsoleRegexTurtle.class.getName()).log(Level.SEVERE, null, ex); 114 | } 115 | 116 | if (config.getPostProcessor() != null) { 117 | startTime = System.currentTimeMillis() - startTime; 118 | config.getPostProcessor().elaborate(config, results, startTime); 119 | } 120 | writeBestPerformances(results.getBestSolution(), config.isIsFlagging()); 121 | } 122 | 123 | private static DataSet loadDataset(String dataSetFilename) throws IOException { 124 | FileInputStream fis = new FileInputStream(new File(dataSetFilename)); 125 | InputStreamReader isr = new InputStreamReader(fis); 126 | StringBuilder sb; 127 | try (BufferedReader bufferedReader = new BufferedReader(isr)) { 128 | sb = new StringBuilder(); 129 | String line; 130 | while ((line = bufferedReader.readLine()) != null) { 131 | sb.append(line); 132 | } 133 | } 134 | String json = sb.toString(); 135 | return loadDatasetJson(json); 136 | } 137 | 138 | private static DataSet loadDatasetJson(String jsonDataset) { 139 | Gson gson = new GsonBuilder().disableHtmlEscaping().create(); 140 | DataSet dataset = gson.fromJson(jsonDataset, DataSet.class); 141 | return dataset; 142 | } 143 | 144 | private static void writeBestPerformances(FinalSolution solution, boolean isFlagging) { 145 | if (solution != null) { 146 | System.out.println("Best on learning (JAVA): " + solution.getSolution()); 147 | System.out.println("Best on learning (JS): " + solution.getSolutionJS()); 148 | if (!isFlagging) { 149 | System.out.println("******Stats for Extraction task******"); 150 | System.out.println("******Stats on training******"); 151 | System.out.println("F-measure: " + solution.getTrainingPerformances().get("match f-measure")); 152 | System.out.println("Precision: " + solution.getTrainingPerformances().get("match precision")); 153 | System.out.println("Recall: " + solution.getTrainingPerformances().get("match recall")); 154 | System.out.println("Char precision: " + solution.getTrainingPerformances().get("character precision")); 155 | System.out.println("Char recall: " + solution.getTrainingPerformances().get("character recall")); 156 | System.out.println("******Stats on validation******"); 157 | System.out.println("F-measure " + solution.getValidationPerformances().get("match f-measure")); 158 | System.out.println("Precision: " + solution.getValidationPerformances().get("match precision")); 159 | System.out.println("Recall: " + solution.getValidationPerformances().get("match recall")); 160 | System.out.println("Char precision: " + solution.getValidationPerformances().get("character precision")); 161 | System.out.println("Char recall: " + solution.getValidationPerformances().get("character recall")); 162 | System.out.println("******Stats on learning******"); 163 | System.out.println("F-measure: " + 
solution.getLearningPerformances().get("match f-measure")); 164 | System.out.println("Precision: " + solution.getLearningPerformances().get("match precision")); 165 | System.out.println("Recall: " + solution.getLearningPerformances().get("match recall")); 166 | System.out.println("Char precision: " + solution.getLearningPerformances().get("character precision")); 167 | System.out.println("Char recall: " + solution.getLearningPerformances().get("character recall")); 168 | } else { 169 | System.out.println("******Stats for Flagging task******"); 170 | System.out.println("******Stats on training******"); 171 | System.out.println("Accuracy: " + solution.getTrainingPerformances().get("flag accuracy")); 172 | System.out.println("Fpr: " + solution.getTrainingPerformances().get("flag fpr")); 173 | System.out.println("Fnr: " + solution.getTrainingPerformances().get("flag fnr")); 174 | System.out.println("F-measure: " + solution.getTrainingPerformances().get("flag f-measure")); 175 | System.out.println("Precision: " + solution.getTrainingPerformances().get("flag precision")); 176 | System.out.println("Recall: " + solution.getTrainingPerformances().get("flag recall")); 177 | System.out.println("******Stats on validation******"); 178 | System.out.println("Accuracy: " + solution.getValidationPerformances().get("flag accuracy")); 179 | System.out.println("Fpr: " + solution.getValidationPerformances().get("flag fpr")); 180 | System.out.println("Fnr: " + solution.getValidationPerformances().get("flag fnr")); 181 | System.out.println("F-measure: " + solution.getValidationPerformances().get("flag f-measure")); 182 | System.out.println("Precision: " + solution.getValidationPerformances().get("flag precision")); 183 | System.out.println("Recall: " + solution.getValidationPerformances().get("flag recall")); 184 | System.out.println("******Stats on learning******"); 185 | System.out.println("Accuracy: " + solution.getLearningPerformances().get("flag accuracy")); 186 | System.out.println("Fpr: " + solution.getLearningPerformances().get("flag fpr")); 187 | System.out.println("Fnr: " + solution.getLearningPerformances().get("flag fnr")); 188 | System.out.println("F-measure: " + solution.getLearningPerformances().get("flag f-measure")); 189 | System.out.println("Precision: " + solution.getLearningPerformances().get("flag precision")); 190 | System.out.println("Recall: " + solution.getLearningPerformances().get("flag recall")); 191 | } 192 | } 193 | } 194 | 195 | static private final String HELP_MESSAGE 196 | = "Usage:\n" 197 | + "java -jar ConsoleRegexTurtle -t 4 -p 500 -g 1000 -e 20.0 -c \"interesting evolution\" -x true -d dataset.json -o ./outputfolder/\n" 198 | + "\nOn Linux you can invoke this tool using the alternative script:\n" 199 | + "regexturtle.sh -t 4 -p 500 -g 1000 -e 20.0 -c \"interesting evolution\" -d dataset.json -o ./outputfolder/\n" 200 | + "\nParameters:\n" 201 | + "-t number of threads, default is 2\n" 202 | + "-p population size, default is 500\n" 203 | + "-g maximum number of generations, per Job, default is 1000\n" 204 | + "-j number of Jobs, default is 32\n" 205 | + "-e percentage of the number of generations, defines a threshold for the separate and conquer split criteria: when the best doesn't change for the provided % of generations, the Job evolution separates the dataset.\n" 206 | + " Default is 20%, i.e. 200 generations with the default 1000 generations.\n" 207 | + "-d path of the dataset json file containing the examples, this parameter is mandatory.\n" 208 | + "-o name of the output folder, results.json
is saved into this folder; default is '.'\n" 209 | + "-x boolean, populates an extra field in results file, when 'true' adds all dataset examples in the results file 'examples' field, default is 'false'\n" 210 | + "-s boolean, when 'true' enables dataset striping, striping is an experimental feature, default is disabled: 'false'\n" 211 | + "-c adds an optional comment string\n" 212 | + "-f enables the flagging mode: solves a flagging problem with a separate-and-conquer strategy\n" 213 | + "-h visualizes this help message\n"; 214 | 215 | static private void parseArgs(String[] args, SimpleConfig simpleConfig) { 216 | try { 217 | boolean mandatoryDatasetCheck = true; 218 | if (args.length == 0) { 219 | System.out.println(HELP_MESSAGE); 220 | } 221 | for (int i = 0; i < args.length; i++) { 222 | String string = args[i]; 223 | i = i + 1; 224 | String parameter = args[i]; 225 | switch (string) { 226 | case "-t": 227 | simpleConfig.numberThreads = Integer.valueOf(parameter); 228 | break; 229 | case "-p": 230 | simpleConfig.populationSize = Integer.valueOf(parameter); 231 | break; 232 | case "-d": 233 | simpleConfig.datasetName = parameter; 234 | mandatoryDatasetCheck = false; 235 | break; 236 | case "-o": 237 | simpleConfig.outputFolder = parameter; 238 | break; 239 | case "-g": 240 | simpleConfig.generations = Integer.valueOf(parameter); 241 | break; 242 | case "-j": 243 | simpleConfig.numberOfJobs = Integer.valueOf(parameter); 244 | break; 245 | case "-e": 246 | simpleConfig.termination = Double.valueOf(parameter); 247 | break; 248 | case "-x": 249 | simpleConfig.populateOptionalFields = Boolean.valueOf(parameter); 250 | break; 251 | case "-h": 252 | System.out.println(HELP_MESSAGE); 253 | break; 254 | case "-c": 255 | simpleConfig.comment = parameter; 256 | break; 257 | case "-s": 258 | simpleConfig.isStriped = Boolean.valueOf(parameter); 259 | break; 260 | case "-f": 261 | simpleConfig.isFlagging = true; 262 | i=i-1; //Do not use parameter 263 | break; 264 | } 265 | } 266 | 267 | if (simpleConfig.isStriped && simpleConfig.isFlagging) { 268 | System.out.println("Striping and flagging cannot be enabled toghether.\n" + HELP_MESSAGE); 269 | System.exit(1); 270 | } 271 | 272 | if (mandatoryDatasetCheck) { 273 | System.out.println("Dataset path is needed.\n" + HELP_MESSAGE); 274 | System.exit(1); 275 | } 276 | } catch (RuntimeException ex) { 277 | System.out.println("Problem parsing commandline parameters.\n" + HELP_MESSAGE); 278 | System.out.println("Error details:" + ex.toString()); 279 | System.exit(1); 280 | } 281 | 282 | } 283 | 284 | } 285 | -------------------------------------------------------------------------------- /src/regex/ConsoleRegexTurtle/src/it/units/inginf/male/dto/SimpleConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Machine Learning Lab - University of Trieste, 3 | * Italy (http://machinelearning.inginf.units.it/) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 
14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | */ 18 | package it.units.inginf.male.dto; 19 | 20 | import it.units.inginf.male.configuration.Configuration; 21 | import it.units.inginf.male.configuration.DatasetContainer; 22 | import it.units.inginf.male.generations.EmptyPopulationBuilder; 23 | import it.units.inginf.male.generations.FlaggingNaivePopulationBuilder; 24 | import it.units.inginf.male.generations.TokenizedPopulationBuilder; 25 | import it.units.inginf.male.inputs.DataSet; 26 | import it.units.inginf.male.objective.FlaggingAccuracyPrecisionLengthObjective; 27 | import it.units.inginf.male.selections.best.BasicFlaggingLearningBestSelector; 28 | import it.units.inginf.male.strategy.impl.MultithreadStrategy; 29 | import it.units.inginf.male.terminalsets.FlaggingNgramsTerminalSetBuilder; 30 | import it.units.inginf.male.terminalsets.TokenizedTerminalSetBuilder; 31 | import java.util.Arrays; 32 | import java.util.logging.Logger; 33 | 34 | 35 | /** 36 | * 37 | * @author MaleLabTs 38 | */ 39 | public class SimpleConfig { 40 | //Maximum unmatch_chars/match_chars ratio 41 | //and sets the maximum unmatch_chars/match_chars ratio; this value defines the margin size around the matches 42 | transient private final double STRIPING_DEFAULT_MARGIN_SIZE = 10; 43 | public int numberThreads; 44 | public int numberOfJobs; 45 | public int generations; 46 | public int populationSize; 47 | public DataSet dataset; 48 | public boolean populateOptionalFields; 49 | public boolean isStriped = false; 50 | public boolean isFlagging = false; 51 | 52 | transient public String datasetName; 53 | transient public String outputFolder; 54 | 55 | /** 56 | * Percentange [0,100] of the number of the generations used for the Spared termination 57 | * criteria. 
58 | */ 59 | public double termination = 20.0; 60 | public String comment; 61 | 62 | public Configuration buildConfiguration(){ 63 | assert !(isFlagging&&isStriped); 64 | 65 | // 66 | Configuration configuration = new Configuration(); 67 | configuration.setConfigName("Console config"); 68 | configuration.getEvolutionParameters().setGenerations(generations); 69 | configuration.getEvolutionParameters().setPopulationSize(populationSize); 70 | configuration.setJobs(numberOfJobs); 71 | configuration.getStrategyParameters().put(MultithreadStrategy.THREADS_KEY, String.valueOf(numberThreads)); 72 | 73 | int terminationGenerations = (int)(termination * configuration.getEvolutionParameters().getGenerations() / 100.0); 74 | if(termination==100.0){ 75 | configuration.getStrategyParameters().put("terminationCriteria","false"); 76 | } else { 77 | configuration.getStrategyParameters().put("terminationCriteria","true"); 78 | } 79 | configuration.getStrategyParameters().put("terminationCriteriaGenerations", String.valueOf(terminationGenerations)); 80 | //Added terminationCriteria for the second strategy 81 | configuration.getStrategyParameters().put("terminationCriteria2","false"); 82 | 83 | if(dataset == null){ 84 | throw new IllegalArgumentException("You must define a dataset"); 85 | } 86 | dataset.populateUnmatchesFromMatches(); 87 | DatasetContainer datasetContainer = new DatasetContainer(dataset); 88 | datasetContainer.createDefaultRanges((int) configuration.getInitialSeed()); 89 | //checks if striping is needed 90 | dataset.updateStats(); 91 | if(isStriped){ 92 | Logger.getLogger(this.getClass().getName()).info("Enabled striping."); 93 | datasetContainer.setDataSetsStriped(true); 94 | datasetContainer.setDatasetStripeMarginSize(STRIPING_DEFAULT_MARGIN_SIZE); 95 | datasetContainer.setProposedNormalDatasetInterval(100);//terminationGenerations+50); 96 | } 97 | configuration.setDatasetContainer(datasetContainer); //remind that after setting the DataSetContainer.. we need to update configuration in order to invoke datacontainer update methods 98 | 99 | //FLagging configuration 100 | //is an alternative configuration, experimental, that requires changes into the configuration defaults (extractor configuration) 101 | //Changes: bestSelector, fitness, terminalset builder configuration mod, population builders(?) 102 | configuration.setIsFlagging(isFlagging); 103 | if(this.isFlagging){ 104 | configuration.setStrategy(new MultithreadStrategy()); 105 | configuration.setBestSelector(new BasicFlaggingLearningBestSelector()); 106 | configuration.setObjective(new FlaggingAccuracyPrecisionLengthObjective()); 107 | configuration.setPopulationBuilder(new FlaggingNaivePopulationBuilder()); //disable context generation 108 | configuration.setTerminalSetBuilder(new FlaggingNgramsTerminalSetBuilder()); //disable context generation 109 | //TODO change terminalSet to a more naive version? 
110 | configuration.getTerminalSetBuilderParameters().put("discardWtokens", "false");//Takes significant chars too 111 | configuration.getStrategyParameters().put("isFlagging", "true"); //Enable strategy flagging 112 | //Remove lookarounds 113 | configuration.getOperators().removeAll( 114 | Arrays.asList("it.units.inginf.male.tree.operator.PositiveLookbehind","it.units.inginf.male.tree.operator.NegativeLookbehind", 115 | "it.units.inginf.male.tree.operator.PositiveLookahead", "it.units.inginf.male.tree.operator.NegativeLookahead")); 116 | } 117 | 118 | 119 | 120 | configuration.setup(); //initializes datasetcontainer, populationbuilder and terminalsetbuilder 121 | 122 | return configuration; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/regex/RegexRunner.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/RegexRunner.jar -------------------------------------------------------------------------------- /src/regex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/src/regex/__init__.py -------------------------------------------------------------------------------- /src/regex/regex.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import subprocess 4 | import os 5 | import shutil 6 | import pandas as pd 7 | from psutil import virtual_memory 8 | 9 | import conf 10 | from src.utils import create_folder 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | 14 | 15 | class Regex(object): 16 | """ 17 | This class handles the creation and testing of regexes on a list of strings. 18 | """ 19 | 20 | def __init__(self, project_name, remove_project=False): 21 | self.project_name = project_name 22 | create_folder(conf.REGEX_FOLDER_OUTPUT) 23 | self.project_folder = os.path.join(conf.REGEX_FOLDER_OUTPUT, project_name) 24 | if remove_project: 25 | shutil.rmtree(self.project_folder, ignore_errors=True, onerror=None) 26 | if not os.path.exists(self.project_folder): 27 | create_folder(self.project_folder) 28 | 29 | def run_with_benign_check(self, _cluster_dict, benign_list, benign_for_retrain=conf.BENIGN_FOR_RETRAIN, 30 | round_max=15, 31 | take_existing_result=False): 32 | """ 33 | Main API. Extracts signatures from a cluster dict and checks the results against a benign list 34 | :param _cluster_dict: Ex: {'cluster_1': {'match': ['pandas', 'gibbon'], 'unmatch': ['monkey']}} 35 | :param benign_list: list of paths to not match 36 | :param benign_for_retrain: benign paths to be added to the regex generation process at each step 37 | :param round_max: Maximum number of rounds for generating the regexes. If -1, run until no FP is found 38 | :param take_existing_result: Load the existing results in self.project_folder and start the round 39 | with the benign samples 40 | :return: a cluster_dict that we should pass to another round if we want to continue the process.
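
        Example (illustrative only; the project name, paths and cluster key below are made up):

            regex = Regex(project_name='demo_project')
            cluster_dict = {'cluster_3_0': {'match': ['/mal/xxx/a.php', '/mal/yyy/a.php', '/mal/zzz/a.php'],
                                            'unmatch': []}}
            remaining = regex.run_with_benign_check(cluster_dict, benign_list=['/index.php', '/images/logo.gif'])
            # 'remaining' contains the clusters whose signatures still match benign paths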
41 | """ 42 | 43 | assert round_max >= 1 44 | benign_list = self.remove_nan_value_from_list(benign_list) 45 | n_path = sum([len(_cluster_dict[cluster]['match']) for cluster in _cluster_dict]) 46 | if take_existing_result: 47 | logger.info("Loading existing signatures in folder {}".format(self.project_folder)) 48 | cluster_result = self.get_cluster_results() 49 | else: 50 | logger.info('Extracting regex for the first time on {} cluster(s)!'.format(len(_cluster_dict))) 51 | cluster_result = self.run_cluster_dict(_cluster_dict) 52 | old_cluster_dict = _cluster_dict 53 | old_cluster_result = cluster_result 54 | cluster_sig_dict = {k: [v] for k, v in cluster_result.items()} 55 | _round = 0 56 | while len(old_cluster_dict) > 0: 57 | if _round == round_max: 58 | break 59 | logger.info(f'Starting Round {str(_round + 1)}') 60 | new_cluster_dict = self.test_cluster_dict(old_cluster_dict, old_cluster_result, benign_list, 61 | limit=benign_for_retrain) 62 | if len(new_cluster_dict) == 0: 63 | if _round == 0: 64 | logger.info("It is too good to be true. I kill the process") 65 | return 'error' 66 | logger.info(f'Creating regexes for {str(len(new_cluster_dict))} cluster(s) : {list(new_cluster_dict)}') 67 | 68 | cluster_result_new = self.run_cluster_dict(new_cluster_dict) 69 | old_cluster_dict = new_cluster_dict 70 | 71 | for cluster in new_cluster_dict: 72 | cluster_sig_dict[cluster].append(cluster_result_new[cluster]) 73 | _round += 1 74 | old_cluster_result = cluster_result_new 75 | last_cluster_dict = self.test_cluster_dict(_cluster_dict, self.get_cluster_results(), benign_list, 76 | limit=1000000) 77 | cluster_with_no_fp = {k: _cluster_dict[k] for k in set(_cluster_dict) - set(last_cluster_dict)} 78 | fp_stat = "" 79 | no_fp_cluster = [] 80 | for cluster in cluster_result: 81 | if cluster in last_cluster_dict: 82 | fp_rate = round(len(last_cluster_dict[cluster]['unmatch']) * 100 / len(benign_list), 3) 83 | fp_stat += f""" {cluster} - {str(len(last_cluster_dict[cluster]['unmatch']))} FP ( {fp_rate} % """ 84 | fp_stat += '\n' 85 | else: 86 | no_fp_cluster.append(cluster) 87 | n_path_no_fp = sum([len(cluster_with_no_fp[k]['match']) for k in cluster_with_no_fp]) 88 | cluster_path_stat = '' 89 | for cluster in dict(sorted(cluster_sig_dict.items(), key=lambda x: len(x[1]))): 90 | cluster_path_stat += f'{cluster} : ' + ' ---> '.join(cluster_sig_dict[cluster]) + '\n' 91 | summary_stat = f""" 92 | #### Summary #### 93 | 94 | Init\n: 95 | N cluster : {len(_cluster_dict)} 96 | N paths: {str(n_path)} 97 | N benign in final test: {len(benign_list)} 98 | Benign number for retraining : {benign_for_retrain} 99 | N round: {round_max} 100 | 101 | Cluster sig paths: 102 | 103 | {cluster_path_stat} 104 | 105 | After final testing: 106 | Cluster with 0 FP: {set(cluster_with_no_fp)} 107 | Number of paths covered with 0 FP: {n_path_no_fp} 108 | Percentage of paths covered with 0 FP: {round(100 * n_path_no_fp / n_path, 2)} % 109 | 110 | ### FP Report ### 111 | 112 | With FP : 113 | 114 | {fp_stat} 115 | 116 | Without: 117 | 118 | {no_fp_cluster} 119 | 120 | """ 121 | logger.info(summary_stat) 122 | return last_cluster_dict 123 | 124 | def test_cluster_dict(self, _cluster_dict, cluster_result, benign_list, limit=10): 125 | """ 126 | 127 | :param _cluster_dict: cluster_dict 128 | :param cluster_result: cluster result. Ex {'cluster_1': 'a{3}b+'} 129 | :param benign_list: list of benign 130 | :param limit: int. Stop testing after limit catches 131 | :return: a cluster_dict that contains only the cluster that have FP. 
132 | """ 133 | new_cluster_dict = {} 134 | for cluster in _cluster_dict: 135 | benign_match = self.check_regex_list(cluster_result[cluster], 136 | list(set(benign_list) - set(_cluster_dict[cluster]['unmatch'])), 137 | limit=limit)[1] 138 | 139 | if len(benign_match) == 0: 140 | logger.info( 141 | 'Signature on cluster {} ( {}) has no benign match !'.format(cluster, cluster_result[cluster])) 142 | continue 143 | logger.info('{} benign match in {}'.format(len(benign_match), cluster)) 144 | new_cluster_dict[cluster] = {'match': _cluster_dict[cluster]['match'], 145 | 'unmatch': benign_match + _cluster_dict[cluster]['unmatch']} 146 | return new_cluster_dict 147 | 148 | def run_cluster_dict(self, _cluster_dict): 149 | """ 150 | Run regex extraction from a cluster dict 151 | :param _cluster_dict: cluster dict 152 | :return: cluster results 153 | """ 154 | for cluster in _cluster_dict: 155 | logger.info(f'Creating regex for cluster {cluster}') 156 | self.create_regex(_cluster_dict[cluster]['match'], cluster_name=cluster, 157 | str_to_not_match=_cluster_dict[cluster]['unmatch']) 158 | cluster_results = self.get_cluster_results() # Here we load all the results ! 159 | logger.info(f'Cluster results {cluster_results}') 160 | return cluster_results 161 | 162 | def create_regex(self, str_list, cluster_name, str_to_not_match=None): 163 | """ 164 | Create regex from a string list and list to not match 165 | :param str_list: list 166 | :param cluster_name: str 167 | :param str_to_not_match: list 168 | :return: void 169 | """ 170 | str_to_not_match = self.remove_nan_value_from_list(str_to_not_match) 171 | json_path = self.create_json_cluster([x + '.' for x in str_list], [x + '.' for x in str_to_not_match], 172 | name=cluster_name) 173 | self.run_regex_extraction(json_path, output_folder=conf.REGEX_TMP) 174 | self.__move_results(cluster_name) 175 | 176 | def __move_results(self, cluster_name): 177 | """ 178 | Move results from tmp to the folder of the project 179 | :param cluster_name: name of the cluster 180 | :return: void 181 | """ 182 | output_file = os.listdir(conf.REGEX_TMP)[0] 183 | shutil.move(os.path.join(conf.REGEX_TMP, output_file), 184 | os.path.join(self.project_folder, 'results_' + cluster_name + '.json')) 185 | 186 | def get_cluster_results(self): 187 | """ 188 | Parse the results outputed by the regex creation process 189 | :return: cluster results 190 | """ 191 | results = {} 192 | for file in os.listdir(self.project_folder): 193 | if not file.startswith('results'): 194 | continue 195 | file_splitted = file.split('_') 196 | 197 | cluster_name = '_'.join([file_splitted[1], file_splitted[2], file_splitted[3]]).replace(".json", "") 198 | with open(os.path.join(self.project_folder, file)) as json_file: 199 | cluster_result = json.load(json_file) 200 | # results[cluster_name] = cluster_result['bestSolution']["solutionJS"] 201 | results[cluster_name] = cluster_result['bestSolution']["solution"] 202 | return results 203 | 204 | @staticmethod 205 | def run_regex_extraction(json_to_extract, output_folder=conf.REGEX_TMP, 206 | mem=round(virtual_memory().available / 10 ** 9) * 1000, threads=None): 207 | """ 208 | Run the Java process that creates the regex 209 | :param json_to_extract: json 210 | :param output_folder: output folder 211 | :param mem: memory to use. By default almost all the memory available 212 | :param threads: number of threads. For example mp.cpu_count(). 
/!\ Can create memory issue 213 | :return: void 214 | """ 215 | 216 | args = ['java', '-Xmx{}M'.format(mem), '-Xms{}M'.format(int(mem / 2)), '-jar', conf.REGEX_JAVA, "-d", 217 | json_to_extract, "-o", output_folder] 218 | if threads: 219 | args += ["-t", str(threads)] 220 | try: 221 | logger.info(f'Running subprocess with input {json_to_extract}') 222 | subprocess.run(args) 223 | except subprocess.TimeoutExpired: 224 | print(f'Timeout reached for input {json_to_extract}') # it should not take more than 1 hour 225 | 226 | def create_json_cluster(self, str_to_match, str_to_not_match=None, name='urls_cluster', description='luda'): 227 | """ 228 | Create the input for the Java process 229 | :param str_to_match: list of str 230 | :param str_to_not_match: list of str 231 | :param name: name of the json 232 | :param description: str 233 | :return: path where the json was created 234 | """ 235 | examples = [] 236 | for el in str_to_match: 237 | examples.append({ 238 | "string": el, 239 | "match": [{"start": 0, "end": len(el) - 1}], 240 | "unmatch": []}) 241 | if len(str_to_not_match) > 0: 242 | for el in str_to_not_match: 243 | examples.append({ 244 | "string": el, 245 | "match": [], 246 | "unmatch": [{"start": 0, "end": len(el) - 1}]}) 247 | 248 | result = { 249 | "name": name, 250 | "description": description, 251 | "regexTarget": "", 252 | "examples": examples} 253 | json_path = os.path.join(self.project_folder, 'input_' + name + '.json') 254 | with open(json_path, 'w') as f: 255 | json.dump(result, f) 256 | return json_path 257 | 258 | @staticmethod 259 | def check_regex_list(sig, path_list, limit=9999999): 260 | """ 261 | Check a regex against of list of str 262 | :param sig: str 263 | :param path_list: list 264 | :param limit: int 265 | :return: tuple 266 | """ 267 | match = 0 268 | urls_match = [] 269 | batch_size = conf.TEST_BATCH_SIZE 270 | for i in range(0, len(path_list), batch_size): 271 | if match > limit: 272 | break 273 | batch = path_list[i:i + batch_size] 274 | # res = js_regex.compile(sig).search(r'{}'.format(path)) 275 | result_list = Regex.run_regex_java(sig, batch) 276 | for j, match_bool in enumerate(result_list): 277 | if match >= limit: 278 | break 279 | if not match_bool: 280 | continue 281 | detected_path = path_list[i + j] 282 | if match < 2: # We want to print only some examples 283 | # logger.info('Match on {}'.format(res.group(0))) 284 | logger.info('Match on {}'.format(detected_path)) 285 | urls_match.append(detected_path) 286 | match += 1 287 | 288 | return match, urls_match 289 | 290 | def create_result_report(self, output_file=None): 291 | if not output_file: 292 | output_file = os.path.join(self.project_folder, f'report_{self.project_name}.csv') 293 | list_of_dict = list() 294 | for file in os.listdir(self.project_folder): 295 | 296 | if 'results' in file: 297 | with open(os.path.join(self.project_folder, file)) as json_file: 298 | cluster_result = json.load(json_file) 299 | with open(os.path.join(self.project_folder, file.replace('results', 'input'))) as json_file: 300 | cluster_input = json.load(json_file) 301 | malicious = 0 302 | benign = 0 303 | example_to_keep = None 304 | for example in cluster_input['examples']: 305 | if len(example['match']) > 0: 306 | if not example_to_keep: 307 | example_to_keep = example['string'] 308 | malicious += 1 309 | else: 310 | benign += 1 311 | tmp = {'name': file.replace('results_', '').replace('.json', ''), 312 | 'regex_js': cluster_result['bestSolution']['solutionJS'], 313 | 'regex_java': 
cluster_result['bestSolution']['solution'], 314 | 'malicious': malicious, 315 | 'benign': benign, 316 | 'round': benign // conf.BENIGN_FOR_RETRAIN, 317 | 'example_malicious': example_to_keep, 318 | 'results_file': file, 319 | 'input_file': file.replace('results', 'input')} 320 | list_of_dict.append(tmp.copy()) 321 | df = pd.DataFrame(list_of_dict) 322 | df.to_csv(output_file) 323 | return df 324 | 325 | @staticmethod 326 | def run_regex_java(regex, list_string): 327 | """ 328 | Run Regex on list of string with Java code 329 | :param regex: regex Java 330 | :param list_string: string to test 331 | :return: list of String ie ['true', 'false', 'false'] 332 | """ 333 | with open(conf.INPUT_REGEX_RUNNER, 'w') as f: 334 | json.dump({'to_test': [x for x in list_string if str(x) != 'nan']}, f) 335 | command = ['java', '-jar', conf.REGEX_RUNNER, regex, conf.INPUT_REGEX_RUNNER, conf.OUTPUT_REGEX_RUNNER] 336 | try: 337 | subprocess.run(command, stdout=subprocess.PIPE) 338 | except subprocess.TimeoutExpired: 339 | print(f'Timeout reached for regex {regex}') # it should not take more than 1 hour ! 340 | with open(conf.OUTPUT_REGEX_RUNNER, 'r') as f: 341 | result = json.load(f) 342 | 343 | return result['results'] 344 | 345 | @staticmethod 346 | def remove_nan_value_from_list(_list): 347 | result = [] 348 | for el in _list: 349 | if str(el) == 'nan': 350 | logger.warning('You have nan value in your list !!. List extract {}'.format(_list[:3])) 351 | continue 352 | result.append(el) 353 | return result 354 | 355 | -------------------------------------------------------------------------------- /src/regex/regexturtle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Executes the command-line version of RegextTurtle; automatically sets the JAVA VM memory size based on the available system memory 3 | MEMSYSTEM=8000 4 | MAXMEM=$(( MEMSYSTEM-512 )) 5 | XMSMEM=$(( MAXMEM/2 )) 6 | echo "System memory:"$MEMSYSTEM "Mbytes" 7 | echo "RegexTurtle is going to use this amount of the system memory:"$MAXMEM "Mbytes" 8 | java -Xmx${MAXMEM}M -Xms${XMSMEM}M -jar "ConsoleRegexTurtle.jar" $@ 9 | -------------------------------------------------------------------------------- /src/regex/src_regexrunner/regexrunner/JsonOperation.java: -------------------------------------------------------------------------------- 1 | package regexrunner; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | 6 | import org.json.simple.JSONArray; 7 | import org.json.simple.JSONObject; 8 | 9 | 10 | 11 | import org.json.simple.parser.JSONParser; 12 | import org.json.simple.parser.ParseException; 13 | 14 | import java.io.FileReader; 15 | import java.io.Reader; 16 | import java.util.ArrayList; 17 | import java.util.Iterator; 18 | public class JsonOperation { 19 | 20 | 21 | 22 | public static ArrayList read(String jsonPath) throws IOException, ParseException { 23 | 24 | ArrayList result = new ArrayList(); 25 | 26 | JSONParser parser = new JSONParser(); 27 | Reader reader = new FileReader(jsonPath); 28 | 29 | JSONObject jsonObject = (JSONObject) parser.parse(reader); 30 | 31 | 32 | // loop array 33 | JSONArray msg = (JSONArray) jsonObject.get("to_test"); 34 | Iterator iterator = msg.iterator(); 35 | while (iterator.hasNext()) { 36 | result.add(iterator.next());} 37 | return result; 38 | } 39 | 40 | 41 | 42 | public static void write(String sig, ArrayList listOfString, String jsonPath) { 43 | int i; 44 | JSONObject obj = new JSONObject(); 45 | 46 | JSONArray list = new 
JSONArray(); 47 | for (i = 0; i < listOfString.size(); i++) { 48 | list.add(Regex.test(sig, listOfString.get(i))); 49 | 50 | // accessing each element of array 51 | } 52 | 53 | obj.put("results", list); 54 | 55 | try (FileWriter file = new FileWriter(jsonPath)) { 56 | file.write(obj.toJSONString()); 57 | } catch (IOException e) { 58 | e.printStackTrace(); 59 | } 60 | 61 | System.out.print("Json written in " + jsonPath); 62 | 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/regex/src_regexrunner/regexrunner/Main.java: -------------------------------------------------------------------------------- 1 | package regexrunner; 2 | 3 | 4 | import org.json.simple.JSONArray; 5 | import org.json.simple.JSONObject; 6 | import org.json.simple.parser.ParseException; 7 | 8 | import java.io.FileWriter; 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | 12 | import regexrunner.JsonOperation; 13 | 14 | 15 | /** 16 | This code parses a list of strings, tests them against a regex and 17 | writes the results to a file 18 | * @author jordang 19 | * @version 1.0 20 | * @since 2020-06-04 21 | */ 22 | 23 | 24 | public class Main{ 25 | 26 | public static void main(String[] args) throws IOException, ParseException { 27 | /** args: regex, input json, output json */ 28 | 29 | ArrayList stringList = JsonOperation.read(args[1]); 30 | JsonOperation.write(args[0], stringList, args[2]); 31 | 32 | } 33 | 34 | } -------------------------------------------------------------------------------- /src/regex/src_regexrunner/regexrunner/Regex.java: -------------------------------------------------------------------------------- 1 | package regexrunner; 2 | 3 | import java.util.ArrayList; 4 | import java.util.regex.*; 5 | 6 | 7 | public class Regex { 8 | 9 | 10 | 11 | public static boolean test(String sig, String my_string) { 12 | Pattern pattern = Pattern.compile(sig); 13 | Matcher matcher = pattern.matcher(my_string); 14 | if (matcher.find()) 15 | return true; 16 | return false; 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/use_case/use_case_clustering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | import logging 4 | import os 5 | import numpy as np 6 | 7 | from src.clustering.distance_matrix import DistanceMatrix 8 | from src.clustering.metrics import DISTANCE_FUNC 9 | 10 | import conf 11 | 12 | logger = logging.getLogger(conf.LOGGER_NAME) 13 | 14 | """ 15 | ### Note about the following FP_LIST ### 16 | 17 | 1. FP (benign) paths should be added at the data step OR filtered at the preprocessing step. If you 18 | don't want (bad practice) to do it there, you can add your benign paths here. 19 | 2. The MAIN use case of the following list is to add benign paths AFTER having run the distance matrix computation 20 | and seen some FP in your clusters. You can filter them here and run a new clustering. 21 | 22 | 23 | """ 24 | 25 | 26 | FP_LIST = ['/wp-content', '/index.php', '/wp-includes', '/gate.php', '/admin.php', '/wp-admin', 27 | '/wp-content/uploads', 28 | '/images/logo.gif', '/login.php'] 29 | 30 | 31 | # If you don't want to rerun the feeders, you can still filter some paths here, but we advise doing it at the data or preprocessing step instead 32 | 33 | def get_clusterer(clusterer_dict): 34 | """ 35 | Return a clustering object for the given config. We advise using DBSCAN. 36 | :param clusterer_dict: dict with clustering parameters.
E.g {"dbscan": {"eps": 10, "min_samples": 8}} 37 | :return: clustering object 38 | """ 39 | if not clusterer_dict: 40 | clusterer_dict = {"dbscan": {"eps": 10, "min_samples": 8}} 41 | if 'dbscan' in clusterer_dict: 42 | from sklearn.cluster import DBSCAN # We import here so we don't need to install something we don't need 43 | return DBSCAN(**clusterer_dict['dbscan'], metric='precomputed') 44 | elif 'hdbscan' in clusterer_dict: 45 | import hdbscan 46 | return hdbscan.HDBSCAN(**clusterer_dict['dbscan'], metric='precomputed') 47 | elif 'complete' in clusterer_dict: 48 | from sklearn.cluster import AgglomerativeClustering 49 | return AgglomerativeClustering(**clusterer_dict['complete'], affinity='precomputed') 50 | 51 | 52 | class UseCaseClustering(object): 53 | 54 | def run(self, file_path, skip_compute_distance=False, save_folder=None, 55 | clusterer=None, filter_th=10): 56 | """ 57 | Compute the distance between URL and cluster them 58 | :param file_path: path of the csv preprocessed 59 | :param skip_compute_distance: bool. If true, does only the clustering 60 | :param save_folder: path. Mandatory if we skip the computation step 61 | :param clusterer: clustering technique 62 | :param filter_th: threshold used to clean the matrix in the function __filter_outlier_and_fp 63 | :return: void 64 | """ 65 | if skip_compute_distance: 66 | assert save_folder is not None 67 | distance_matrix_object = DistanceMatrix.load(save_folder) 68 | else: 69 | df_features = pd.read_csv(file_path) 70 | df_features = df_features[df_features['label'] == 'malicious'] 71 | logger.info(f"For this step, we do not use the " 72 | f" {df_features[df_features['label'] == 'malicious']['path'].nunique()} benign paths ") 73 | df_features = df_features[~df_features['path'].isin(FP_LIST)] 74 | path_list = list(df_features['path'].unique()) # We take only unique !!! 75 | distance_matrix_object = DistanceMatrix(path_list, 76 | distance_func=DISTANCE_FUNC['sw'], folder=save_folder) 77 | distance_matrix_object.run() 78 | distance_matrix_object.matrix = distance_matrix_object.matrix.astype(np.double) 79 | index_to_keep, matrix_filtered = self.__filter_outlier_and_fp(distance_matrix_object.matrix, filter_th, 80 | distance_matrix_object.url_list) 81 | distance_matrix_object.matrix = matrix_filtered 82 | logger.info('We begin the clustering !') 83 | distance_matrix_object.matrix = self.distance_from_sim(distance_matrix_object.matrix) 84 | clust = get_clusterer(clusterer) 85 | clust.fit( 86 | distance_matrix_object.matrix) # we need to pass distance matrix instead of similarity 87 | logger.info('Clustering done') 88 | with open(os.path.join(distance_matrix_object.folder, 'labels.pkl'), 'wb') as f: 89 | pickle.dump(clust.labels_, f) 90 | with open(os.path.join(distance_matrix_object.folder, 'index_to_keep.pkl'), 'wb') as f: 91 | pickle.dump(index_to_keep, f) 92 | logger.info('You can find the results at {}'.format(distance_matrix_object.folder)) 93 | 94 | @staticmethod 95 | def __filter_outlier_and_fp(mat, th, path_list=None): 96 | """ 97 | Clean the matrix from outlier and FP 98 | :param mat: matrix 99 | :param th: int. If a row does not contain a value higher than th, it will be filtered. 100 | :param path_list: list 101 | :return: tuple. 
(list of indexes not filtered, the new matrix filtered) 102 | """ 103 | index_to_remove = [] 104 | if path_list: 105 | for fp in FP_LIST: 106 | if fp in path_list: 107 | index_to_remove.append(path_list.index(fp)) 108 | logger.info('Matrix size before filter {}'.format(mat.shape)) 109 | new_matrix = [] 110 | index_to_keep = [] 111 | for i, el in enumerate(mat): 112 | if i in index_to_remove: 113 | continue 114 | if el.max() >= th: 115 | new_matrix.append(el) 116 | index_to_keep.append(i) 117 | 118 | new_matrix = np.vstack(new_matrix) 119 | new_matrix = new_matrix[:, index_to_keep] 120 | logger.info('Matrix size after filter {}'.format(new_matrix.shape)) 121 | 122 | return index_to_keep, np.vstack(new_matrix) 123 | 124 | @staticmethod 125 | def distance_from_sim(matrix): 126 | """ 127 | 128 | Invert linearly the numbers. Shift min --> max and max--> min. Ensure than the diagonal is 0 129 | The goal is to transform a similarity matrix into a distance matrix. 130 | :param matrix: matrix 2d 131 | :return: matrix 2d 132 | """ 133 | high = conf.SIMILARITY_MAX 134 | result = np.abs(high - matrix) 135 | np.fill_diagonal(result, 0) # we ensure that the edit distance with an element and itself is 0 136 | return result 137 | -------------------------------------------------------------------------------- /src/use_case/use_case_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | import os 4 | 5 | import conf 6 | from src.feeder.feed_downloader import Url 7 | 8 | logger = logging.getLogger(conf.LOGGER_NAME) 9 | 10 | 11 | class UseCaseData(object): 12 | 13 | def run(self, main_file, additional_sources): 14 | final_df = pd.DataFrame(columns=list(Url.__annotations__)) 15 | if os.path.exists(main_file): 16 | df = pd.read_csv(main_file) 17 | assert list(df) == list(Url.__annotations__) 18 | logger.info( 19 | f'{main_file} already exists. We load it and concatenate it with the additional sources ( if exists)') 20 | final_df = pd.concat([final_df, df]) 21 | for path_label in additional_sources: 22 | final_df = pd.concat([final_df, self.get_basic_format_df(path_label['path'], path_label['label'])]) 23 | final_df.to_csv(main_file, index=False) 24 | return final_df 25 | 26 | def get_basic_format_df(self, file, label): 27 | self.check_label(label=label) 28 | df = self.load_df(file) 29 | df['label'] = label.lower() 30 | new_df = pd.DataFrame(df['url']) 31 | other_columns = list(Url.__annotations__) 32 | other_columns.remove('url') 33 | for column in other_columns: 34 | if column in df.columns: 35 | new_df = pd.concat([new_df, df[column]], axis=1) 36 | else: 37 | logger.warning(f'Column {column} not found in {file}. Setting it to None') 38 | new_df[column] = None 39 | logger.info(f"{new_df['url'].nunique()} unique URLs loaded from {file}") 40 | return new_df 41 | 42 | @staticmethod 43 | def check_label(label): 44 | if label.lower() not in conf.DATA_LABELS: 45 | raise Exception('You should specify a label ( malicious or benign) for you data sources') 46 | 47 | def load_df(self, file_path): 48 | SEP = [',', '\t'] # you put here several sep if you have different formats 49 | for sep in SEP: 50 | try: 51 | df = pd.read_csv(file_path, sep=sep, error_bad_lines=False) 52 | try: 53 | self.check_columns(df) 54 | except Exception: # maybe with the next sep, it will work. 
55 | continue 56 | return df 57 | except Exception as e: 58 | raise Exception(f'Failed loading for file {file_path}, {e}') 59 | raise Exception(f'Failed loading for file {file_path}') 60 | 61 | @staticmethod 62 | def check_columns(df): 63 | MANDATORY_COLUMNS = ['url'] 64 | for col in MANDATORY_COLUMNS: 65 | if col not in df.columns: 66 | raise Exception(f"You should have a column named {col} at least") 67 | -------------------------------------------------------------------------------- /src/use_case/use_case_feeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import logging 4 | 5 | from src.utils import create_folder 6 | from src.utils import df_to_url_list 7 | from src.feeder.feed_downloader import FeedDownloader, Url 8 | import conf 9 | 10 | logger = logging.getLogger(conf.LOGGER_NAME) 11 | 12 | 13 | class UseCaseFeeder(object): 14 | 15 | @staticmethod 16 | def fetch(sources): 17 | """ 18 | Once you create your feeder, add it here to call it directly from config.json 19 | :param sources: source list 20 | :return: url list object 21 | """ 22 | url_list = [] 23 | for source in sources: 24 | if source == 'urlhaus': 25 | from src.feeder.urlhaus_feed_downloader import URLHausFeedDownloader 26 | feeder_object = URLHausFeedDownloader() 27 | elif source == 'openfish': 28 | from src.feeder.openfish_feed_downloader import OpenPhishFeedDownloader 29 | feeder_object = OpenPhishFeedDownloader() 30 | elif source == 'alexa': 31 | from src.feeder.alexa_feed_downloader import AlexaFeedDownloader 32 | feeder_object = AlexaFeedDownloader() 33 | elif source == 'majestic': 34 | from src.feeder.majestic_feed_downloader import MajesticFeedDownloader # class name assumed from the other feeders' naming convention 35 | feeder_object = MajesticFeedDownloader() 36 | elif source == 'umbrella': 37 | from src.feeder.umbrella_feed_downloader import UmbrellaFeedDownloader 38 | feeder_object = UmbrellaFeedDownloader() 39 | elif source == 'iscx': 40 | from src.feeder.iscx_feed_downloader import IscxFeedDownloader 41 | feeder_object = IscxFeedDownloader() 42 | elif source == 'vt': 43 | from src.feeder.vt_feed_downloader import VtFeedDownloader 44 | feeder_object = VtFeedDownloader() 45 | else: 46 | continue 47 | url_list += feeder_object.run() 48 | 49 | return url_list 50 | 51 | @staticmethod 52 | def fetch_and_save(sources, filename): 53 | list_of_urls = UseCaseFeeder.fetch(sources) 54 | if os.path.exists(filename): 55 | logger.info(f'Found an existing {filename}. We append the feeders\' results to this file.') 56 | df = pd.read_csv(filename) 57 | assert list(df) == list(Url.__annotations__) 58 | list_of_urls += df_to_url_list(df) 59 | create_folder(filename) 60 | FeedDownloader.save_to_csv(list_of_urls, filename) 61 | -------------------------------------------------------------------------------- /src/use_case/use_case_preprocessor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import conf 4 | 5 | logger = logging.getLogger(conf.LOGGER_NAME) 6 | 7 | 8 | class UseCasePreprocessor(object): 9 | """ 10 | Add your preprocessing technique here, following the syntax of the "basic" one.
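
    For example, a new technique could be wired in with one more branch in run() below (the name
    'my_preprocessor', the module and the class are hypothetical placeholders):

        elif preprocess_name == 'my_preprocessor':
            from src.preprocessor.preprocessor_my_preprocessor import PreprocessorMyPreprocessor
            PreprocessorMyPreprocessor().run(file_path)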
11 | """ 12 | @staticmethod 13 | def run(preprocess_name, file_path): 14 | if preprocess_name == 'basic': 15 | from src.preprocessor.preprocessor_basic import PreprocessorBasic 16 | PreprocessorBasic().run(file_path) 17 | -------------------------------------------------------------------------------- /src/use_case/use_case_regex_generation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import pandas as pd 4 | 5 | import conf 6 | 7 | import logging 8 | 9 | logger = logging.getLogger(conf.LOGGER_NAME) 10 | 11 | 12 | class UseCaseRegexGeneration(object): 13 | def __init__(self, regex_object): 14 | self.regex_object = regex_object 15 | 16 | def run(self, main_file, features_folder, cluster_list, benign_for_retrain=20, 17 | take_existing_result=False, round_max=15, min_path_for_run=1): 18 | """ 19 | Take the data from the clustering step and extract the clusters 20 | :param main_file: file containing all the data. 21 | :param features_folder: folder_path with the features. 22 | :param cluster_list: list of cluster number to process 23 | :param benign_for_retrain: int. Number of str to not match to take into account at each round 24 | :param take_existing_result: bool. if True, will start from the last result. It allows us to run several rounds 25 | not continuously 26 | :param round_max: int. Number of round max before abandoning the cluster because of the FP 27 | :param min_path_for_run: int. minimal number of paths to run the regex extraction process 28 | :return: void 29 | """ 30 | 31 | df, df_benign = self.load_df(main_file, features_folder) 32 | str_to_not_match = df_benign['path'].unique() 33 | if len(df) < min_path_for_run: 34 | logger.error( 35 | 'Not enough path to start the clustering. Paths: {} , Min paths : {}'.format(len(df), min_path_for_run)) 36 | return 37 | cluster_dict = {} 38 | if len(cluster_list) == 0: 39 | cluster_list = df['cluster'].unique() 40 | logger.info('No cluster number given. Creating regex for all cluster.') 41 | for cluster in cluster_list: 42 | if cluster == -1: 43 | continue 44 | cluster_urls = list(df[df['cluster'] == cluster]['path'].unique()) 45 | cluster_dict['cluster_' + str(len(cluster_urls)) + '_' + str(cluster)] = {'match': cluster_urls, 46 | 'unmatch': []} 47 | self.regex_object.run_with_benign_check(_cluster_dict=cluster_dict, benign_list=str_to_not_match, 48 | benign_for_retrain=benign_for_retrain, 49 | take_existing_result=take_existing_result, round_max=round_max) 50 | self.regex_object.create_result_report() 51 | 52 | @staticmethod 53 | def load_df(main_file, features_folder): 54 | """ 55 | Load the df with the labels and the cleaning done in the clustering phase. 56 | :param main_file: main csv file. 57 | :param features_folder: folder_path with the features. 
58 | :return: DataFrame 59 | """ 60 | df = pd.read_csv(main_file).drop_duplicates(['path']) # we ensure that everything is unique 61 | df_benign = df[df['label'] == 'benign'] 62 | df = df[df['label'] == 'malicious'] 63 | with open(os.path.join(features_folder, conf.INDEX_TO_KEEP), 'rb') as f: 64 | index_to_keep = pickle.load(f) 65 | with open(os.path.join(features_folder, conf.LABELS), 'rb') as f: 66 | labels = pickle.load(f) 67 | df = df.iloc[index_to_keep, :] 68 | df['cluster'] = labels 69 | return df, df_benign 70 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import conf 5 | from src.feeder.feed_downloader import Url 6 | 7 | 8 | def create_folder(path): 9 | """ 10 | Create folder if it does not exist. If the function gets a file, it will create all the folders before 11 | :param path: path of a file or folders. Can also be a list 12 | :return: void 13 | """ 14 | 15 | def delete_one_folder(_path): 16 | if "." in os.path.basename(_path): # Check if it's a file 17 | _path = os.path.dirname(_path) 18 | pathlib.Path(_path).mkdir(parents=True, exist_ok=True) 19 | 20 | if isinstance(path, list): 21 | for _path in path: 22 | delete_one_folder(_path) 23 | elif isinstance(path, str): 24 | delete_one_folder(path) 25 | 26 | 27 | def process_file_name(filename): 28 | """ 29 | Process the main csv file. It contains the data before being preprocessed 30 | :param filename: path of the csv 31 | :return: filename fixed 32 | """ 33 | assert filename.endswith(".csv") 34 | if not os.path.isabs(filename): 35 | create_folder(filename) 36 | filename = os.path.join(conf.DATA, os.path.basename(filename)) 37 | return filename 38 | 39 | 40 | def process_preprocessed_file_name(main_file, preprocess_file): 41 | default_preprocess_file = os.path.join(conf.DATA, 42 | f"{os.path.basename(main_file).replace('.csv', '')}" 43 | f"{conf.PREPROCESSED_SUFFIX}") 44 | if preprocess_file is None: 45 | return default_preprocess_file 46 | if not os.path.exists(preprocess_file): 47 | return default_preprocess_file 48 | 49 | return preprocess_file 50 | 51 | 52 | def df_to_url_list(df): 53 | columns = list(Url.__annotations__) 54 | url_list = [Url(row[1][columns[0]], row[1][columns[1]], row[1][columns[2]], row[1][columns[3]]) for row in 55 | df.iterrows()] 56 | return url_list 57 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/test/__init__.py -------------------------------------------------------------------------------- /test/clustering/data/save_test/index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/test/clustering/data/save_test/index.pkl -------------------------------------------------------------------------------- /test/clustering/data/save_test/matrix.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamai/luda/eb2f5d95097f5df97fb154122a221219cafbd4e0/test/clustering/data/save_test/matrix.pkl -------------------------------------------------------------------------------- /test/clustering/test_distance_matrix.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import mock 4 | import shutil 5 | 6 | from src.clustering.distance_matrix import DistanceMatrix 7 | from src.clustering.metrics import DISTANCE_FUNC 8 | import conf 9 | 10 | word_list = ['verify', 11 | 'palaver', 12 | 'bathrobe', 13 | 'traitorwise', 14 | 'midwatch', 15 | 'onymal', 16 | 'aphlogistic', 17 | 'trustingly', 18 | 'saponifier', 19 | 'moodle', 20 | 'isuret', 21 | 'oedogoniaceous', 22 | 'unhoard', 23 | 'receiptless', 24 | 'unfibbing', 25 | 'header', 26 | 'Tasian', 27 | 'deferral', 28 | 'expansively', 29 | 'hydramnion'] 30 | 31 | 32 | # downloaded with nltk.word.words() 33 | 34 | 35 | class DistanceMatrixSimple(object): 36 | def __init__(self, distance_func=DISTANCE_FUNC['sw']): 37 | self.distance_func = distance_func 38 | 39 | def create_matrix_distance(self, url_list): 40 | n = len(url_list) 41 | distance_matrix = np.zeros(shape=(n, n), dtype=np.double) # we need double for HDBScan 42 | for i in range(n): 43 | for j in range(i + 1, n): 44 | distance_matrix[i, j] = int( 45 | round(100 * self.distance_func(url_list[i], url_list[j]) / max(len(url_list[i]), len(url_list[j])))) 46 | distance_matrix = self.symmetrize(distance_matrix) 47 | np.fill_diagonal(distance_matrix, conf.SIMILARITY_MAX) 48 | return distance_matrix 49 | 50 | @staticmethod 51 | def symmetrize(a): 52 | return a + a.T - np.diag(a.diagonal()) 53 | 54 | 55 | @pytest.fixture() 56 | def distance_matrix_object(): 57 | return DistanceMatrix(word_list, distance_func=DISTANCE_FUNC['sw']) 58 | 59 | 60 | def test_run(distance_matrix_object): 61 | """ 62 | Here we test that the distance matrix with multiprocessing does the same job 63 | as the simple class written above 64 | :return: 65 | """ 66 | expected = DistanceMatrixSimple().create_matrix_distance(word_list) 67 | 68 | output = distance_matrix_object.run() 69 | shutil.rmtree(distance_matrix_object.folder, ignore_errors=True) 70 | 71 | assert np.array_equal(expected, output) 72 | 73 | 74 | def test_load(distance_matrix_object): 75 | distance_matrix_object.folder = 'data/save_test' 76 | distance_matrix_object.run() 77 | 78 | distance_matrix = DistanceMatrix.load('data/save_test') 79 | # shutil.rmtree(distance_matrix_object.folder, ignore_errors=True) 80 | assert np.array_equal(distance_matrix.matrix, distance_matrix_object.matrix) 81 | assert np.array_equal(distance_matrix.url_list, distance_matrix_object.url_list) 82 | 83 | 84 | def test_add_url_list(): 85 | distance_matrix = DistanceMatrix.load('data/save_test') 86 | distance_matrix.__save = mock.Mock() 87 | old_matrix_shape = distance_matrix.matrix.shape 88 | distance_matrix.add_url_list(['jordan', 'jordan.html', 'akamai']) 89 | assert distance_matrix.url_list[-3:] == ['jordan', 'jordan.html', 'akamai'] 90 | assert distance_matrix.matrix.shape == (old_matrix_shape[0] + 3, old_matrix_shape[0] + 3) 91 | 92 | 93 | def test__get_argument_create_matrix(): 94 | distance_object = DistanceMatrix(url_list=list(range(100))) 95 | print(dir(distance_object)) 96 | result = distance_object._DistanceMatrix__get_argument_create_matrix(ncores=5) # works to call private method 97 | assert [(100, 90), (90, 79), (79, 66), (66, 49), (49, 0)] == result 98 | 99 | 100 | @mock.patch('builtins.open') 101 | def test__create_matrix_distance(open_mock, distance_matrix_object): 102 | print('Len word list {}'.format(len(word_list))) 103 | a = 10 104 | b = 15 105 | word_list_processed = word_list[a:b] 106 | 
print(word_list_processed) 107 | result = distance_matrix_object._DistanceMatrix__create_matrix_distance(b, a) 108 | assert result.shape == (b - a, len(word_list)) 109 | assert result[2, 3] == int(round( 110 | 100 * distance_matrix_object.distance_func(word_list[a + 2], word_list[a + 3]) / max(len(word_list[a + 2]), len( 111 | word_list[a + 3])))) 112 | -------------------------------------------------------------------------------- /test/clustering/test_metrics.py: -------------------------------------------------------------------------------- 1 | from src.clustering.metrics import get_sw_distance 2 | from src.clustering.metrics import longest_sub 3 | 4 | 5 | def test_get_sw_distance(): 6 | distance = get_sw_distance(match=1, mismatch=-1, gap_penalty=-1) 7 | assert distance('/mal/xxx/a.php', '/mal/a.php') == 6 8 | 9 | 10 | def test_longest_sub(): 11 | a = 'myurl/abigfolder/home' 12 | b = 'abigfolder/akamai' 13 | assert longest_sub(a, b) == 1 14 | -------------------------------------------------------------------------------- /test/clustering/test_swalign.py: -------------------------------------------------------------------------------- 1 | from src.clustering import swalign 2 | 3 | 4 | def test_align(): 5 | match = 1 6 | mismatch = -1 7 | scoring = swalign.NucleotideScoringMatrix(match, mismatch) 8 | 9 | sw = swalign.LocalAlignment(scoring, gap_penalty=-1) 10 | assert sw.align('/mal/xxx/a.php', '/mal/a.php') == 6 11 | -------------------------------------------------------------------------------- /test/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "main_file": "data_demo.csv", 3 | "data": { 4 | "run": false, 5 | "additional_files": [ 6 | { 7 | "path": "my_data/benign_data.csv", 8 | "label": "benign" 9 | }, 10 | { 11 | "path": "my_data/malicious_traffic.csv", 12 | "label": "malicious"} 13 | 14 | ] 15 | }, 16 | "feeder": { 17 | "run": false, 18 | "sources": [ 19 | "urlhaus", 20 | "openfish", 21 | "alexa" 22 | ] 23 | }, 24 | "preprocessing": { 25 | "run": true, 26 | "name": "basic" 27 | }, 28 | "clustering": { 29 | "run": true, 30 | "preprocessed_file": null, 31 | "skip_distance_computation": false, 32 | "clusterer": { 33 | "dbscan": { 34 | "eps": 20, 35 | "min_samples": 8 36 | } 37 | }, 38 | "metric": "sw", 39 | "features_folder": "luda_output/mymatrix", 40 | "filter_similarity": 30, 41 | "phishing_mode": false 42 | }, 43 | "regex": { 44 | "run": false, 45 | "benign_for_retrain": 30, 46 | "round_max": 10, 47 | "regex_folder": "myregexes", 48 | "take_existing_result": false, 49 | "min_path_for_run": 200, 50 | "cluster_list": [0,4] 51 | } 52 | } -------------------------------------------------------------------------------- /test/coverage.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | coverage 17 | coverage 18 | 70% 19 | 70% 20 | 21 | 22 | -------------------------------------------------------------------------------- /test/feeder/crawler/test_crawler.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.feeder.crawler.crawler import Crawler 3 | 4 | 5 | @pytest.fixture() 6 | def crawler_object(): 7 | return Crawler(_url='akamai.com', depth=2) 8 | 9 | 10 | def test_fix_url(): 11 | result = Crawler.fix_url('randomurl.com/') 12 | assert result == 'http://randomurl.com' 13 | 14 | 15 | def test_run(crawler_object): 16 | url_set = crawler_object.run() 17 | 
print(url_set) 18 | assert len(url_set) >= 2 # depth 19 | -------------------------------------------------------------------------------- /test/feeder/data/vt_key.txt: -------------------------------------------------------------------------------- 1 | my_vt_key -------------------------------------------------------------------------------- /test/feeder/test_alexa_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.logger_code import init_logger 3 | from src.feeder.alexa_feed_downloader import AlexaFeedDownloader 4 | 5 | 6 | @pytest.fixture() 7 | def alexa_feeder(): 8 | feeder = AlexaFeedDownloader() 9 | init_logger() 10 | return feeder 11 | 12 | 13 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the feeder") 14 | def test_fetch(alexa_feeder): 15 | urls = alexa_feeder.fetch() 16 | print(urls) 17 | -------------------------------------------------------------------------------- /test/feeder/test_feed_downloader.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from src.logger_code import init_logger 3 | import pytest 4 | import mock 5 | 6 | from src.feeder.feed_downloader import FeedDownloader 7 | from src.feeder.feed_downloader import Url 8 | 9 | 10 | @pytest.fixture 11 | def feed_downloader_generic(): 12 | class ExampleFeedDownloader(FeedDownloader): 13 | def fetch(self) -> List[Url]: 14 | source = 'Example source' 15 | phishingURL = Url("http://example.com/index.html", source, 'malicious') 16 | malwareURL = Url("http://example.com/index2.html", source, 'malicious') 17 | return [phishingURL, malwareURL] 18 | 19 | example_downloader = ExampleFeedDownloader() 20 | init_logger() 21 | return example_downloader 22 | 23 | 24 | def test_fetch(feed_downloader_generic): 25 | example_urls = feed_downloader_generic.fetch() 26 | assert example_urls[0].source == 'Example source' 27 | assert {url.label for url in example_urls} == {'malicious'} 28 | 29 | 30 | @pytest.mark.skip(reason="functional test. Comment this line to run the function") 31 | def test_save_to_csv(feed_downloader_generic): 32 | list_of_urls = feed_downloader_generic.fetch() 33 | feed_downloader_generic.save_to_csv(list_of_urls) 34 | 35 | 36 | @mock.patch('src.feeder.feed_downloader.Crawler') 37 | def test_get_urls_from_domain(crawler_mock, feed_downloader_generic): 38 | url = 'akamai.com' 39 | depth = 30 40 | feed_downloader_generic.get_urls_from_domain(url, depth_max=depth) 41 | crawler_mock.assert_called_with(url, depth=depth) 42 | -------------------------------------------------------------------------------- /test/feeder/test_iscx_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.logger_code import init_logger 3 | from src.feeder.iscx_feed_downloader import IscxFeedDownloader 4 | 5 | 6 | @pytest.fixture() 7 | def iscx_feeder(): 8 | feeder = IscxFeedDownloader() 9 | init_logger() 10 | return feeder 11 | 12 | @pytest.mark.skip(reason="functional test. Can take time. 
Comment this line to run the feeder") 13 | def test_fetch(iscx_feeder): 14 | urls = iscx_feeder.fetch() 15 | print(urls) 16 | a = 1 17 | -------------------------------------------------------------------------------- /test/feeder/test_vt_feed_downloader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src.logger_code import init_logger 3 | from src.feeder.vt_feed_downloader import VtFeedDownloader 4 | 5 | import mock 6 | 7 | 8 | @pytest.fixture() 9 | def vt_feeder(): 10 | feeder = VtFeedDownloader() 11 | init_logger() 12 | return feeder 13 | 14 | 15 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the feeder") 16 | def test_fetch(vt_feeder): 17 | urls = vt_feeder.fetch() 18 | print(urls) 19 | 20 | 21 | @mock.patch('builtins.open', mock.mock_open(read_data='my_vt_key')) 22 | def test_load_key(vt_feeder): 23 | key = vt_feeder.load_key('data/vt_key.txt') 24 | assert key == 'my_vt_key' 25 | 26 | 27 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the feeder") 28 | def test_get_records(vt_feeder): 29 | url_list = vt_feeder.get_records(number=10) 30 | print(url_list) 31 | -------------------------------------------------------------------------------- /test/preprocessor/data/data_preprocessing_test.csv: -------------------------------------------------------------------------------- 1 | url,label 2 | http://173.243.112.132/serve/config.bin,malicious 3 | http://194.15.112.29/2ja/panel/config.bin,malicious 4 | http://216.170.125.134/neat/serverphp/config.bin,malicious 5 | http://58.22.101.109/xz/cfg.bin,malicious 6 | http://83.149.95.197/1/cfg.bin,malicious 7 | http://abbcp.cn/bm_a/controller.php,malicious 8 | http://ajana.com.au/.tmp/server/config.bin,malicious -------------------------------------------------------------------------------- /test/preprocessor/test_preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from src.preprocessor.preprocessor import Preprocessor 4 | 5 | current_folder = os.path.dirname(os.path.abspath(__file__)) 6 | 7 | 8 | @pytest.fixture 9 | def preprocessor(): 10 | return Preprocessor() 11 | 12 | 13 | @pytest.mark.skip(reason="functional test. Can take time. 
Comment this line to run the function") 14 | def test_run(preprocessor): 15 | file_path = os.path.join(current_folder, 'data.csv') 16 | preprocessor.run(file_path) 17 | -------------------------------------------------------------------------------- /test/preprocessor/test_preprocessor_basic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import mock 4 | 5 | from src.preprocessor.preprocessor_basic import PreprocessorBasic 6 | 7 | current_folder = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | 10 | @pytest.fixture 11 | def preprocessor(): 12 | return PreprocessorBasic() 13 | 14 | 15 | @mock.patch('src.preprocessor.preprocessor_basic.create_folder') 16 | @mock.patch('src.preprocessor.preprocessor_basic.pd.DataFrame.to_csv') 17 | def test_run(create_folder, to_csv_mock, preprocessor): 18 | file_path = os.path.join(current_folder, 'data', 'data_preprocessing_test.csv') 19 | df_preprocessed =preprocessor.run(file_path) 20 | assert 'filter_wp' in list(df_preprocessed) 21 | -------------------------------------------------------------------------------- /test/regex/data/input_cluster_2_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "urls_cluster", 3 | "description": "luda", 4 | "regexTarget": "", 5 | "examples": [ 6 | { 7 | "string": "/chase/home/auth/Logging_in.php", 8 | "match": [ 9 | { 10 | "start": 0, 11 | "end": 30 12 | } 13 | ], 14 | "unmatch": [] 15 | }, 16 | { 17 | "string": "/log/home/auth/Logging_in.php", 18 | "match": [ 19 | { 20 | "start": 0, 21 | "end": 28 22 | } 23 | ], 24 | "unmatch": [] 25 | }, 26 | { 27 | "string": "wp-content/uploads", 28 | "match": [], 29 | "unmatch": [ 30 | { 31 | "start": 0, 32 | "end": 17 33 | } 34 | ] 35 | }, 36 | { 37 | "string": "home.php", 38 | "match": [], 39 | "unmatch": [ 40 | { 41 | "start": 0, 42 | "end": 7 43 | } 44 | ] 45 | } 46 | ] 47 | } -------------------------------------------------------------------------------- /test/regex/data/input_correct.json: -------------------------------------------------------------------------------- 1 | {"to_test" : ["/jordan", "/asaf"]} -------------------------------------------------------------------------------- /test/regex/data/json_for_test.json: -------------------------------------------------------------------------------- 1 | {"name": "urls_cluster", "description": "luda", "regexTarget": "", "examples": [{"string": "/chase/home/auth/Logging_in.php", "match": [{"start": 0, "end": 30}], "unmatch": []}, {"string": "/log/home/auth/Logging_in.php", "match": [{"start": 0, "end": 28}], "unmatch": []}, {"string": "wp-content/uploads", "match": [], "unmatch": [{"start": 0, "end": 17}]}, {"string": "home.php", "match": [], "unmatch": [{"start": 0, "end": 7}]}]} -------------------------------------------------------------------------------- /test/regex/data/output_regex_runner.json: -------------------------------------------------------------------------------- 1 | {"results":[true,true]} -------------------------------------------------------------------------------- /test/regex/data/results_cluster_2_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "datasetName": "urls_cluster", 3 | "methodDescription": "Console config", 4 | "experimentDate": "Jul 29, 2021 3:33:38 PM", 5 | "bestSolution": { 6 | "solutionJS": "/(?=(\\w+))\\1(?=(((?=(\\w?))\\4[^@])+))\\2", 7 | "trainingPerformances": { 8 | "character recall": 1.0, 9 | 
"character precision": 1.0, 10 | "match precision": 1.0, 11 | "character accuracy": 1.0, 12 | "match f-measure": 1.0, 13 | "match recall": 1.0 14 | }, 15 | "validationPerformances": { 16 | "character recall": 1.0, 17 | "character precision": 1.0, 18 | "match precision": 1.0, 19 | "character accuracy": 1.0, 20 | "match f-measure": 1.0, 21 | "match recall": 1.0 22 | }, 23 | "learningPerformances": { 24 | "character recall": 1.0, 25 | "character precision": 1.0, 26 | "match precision": 1.0, 27 | "character accuracy": 1.0, 28 | "match f-measure": 1.0, 29 | "match recall": 1.0 30 | }, 31 | "solution": "/\\w++(\\w?+[^@])++", 32 | "fitness": [ 33 | 0.0, 34 | 0.0, 35 | 17.0 36 | ] 37 | }, 38 | "bestExtractions": [ 39 | [ 40 | { 41 | "start": 0, 42 | "end": 30 43 | } 44 | ], 45 | [ 46 | { 47 | "start": 0, 48 | "end": 28 49 | } 50 | ], 51 | [], 52 | [] 53 | ], 54 | "bestExtractionsStrings": [ 55 | [ 56 | "/chase/home/auth/Logging_in.ph" 57 | ], 58 | [ 59 | "/log/home/auth/Logging_in.ph" 60 | ], 61 | [], 62 | [] 63 | ], 64 | "bestExtractionsStats": [ 65 | { 66 | "fp": 0, 67 | "tp": 1 68 | }, 69 | { 70 | "fp": 0, 71 | "tp": 1 72 | }, 73 | { 74 | "fp": 0, 75 | "tp": 0 76 | }, 77 | { 78 | "fp": 0, 79 | "tp": 0 80 | } 81 | ], 82 | "overallExecutionTimeMillis": 145685, 83 | "numberMatches": 2, 84 | "numberUnmatches": 4, 85 | "numberMatchedChars": 58, 86 | "numberUnmatchedChars": 28, 87 | "numberAnnotatedChars": 86, 88 | "numberAllChars": 86, 89 | "numberTrainingMatches": 1, 90 | "numberTrainingUnmatches": 2, 91 | "characterEvaluations": 511388500 92 | } -------------------------------------------------------------------------------- /test/regex/test_regex.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import json 4 | import mock 5 | from unittest.mock import patch 6 | 7 | from src.logger_code import init_logger 8 | from src.regex.regex import Regex 9 | import conf 10 | 11 | current_folder = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | INPUT_CLUSTER = os.path.join(current_folder, 'data/input_cluster_2_1.json') 14 | DATA = os.path.join(current_folder, 'data') 15 | 16 | 17 | @pytest.fixture() 18 | def regex_object(): 19 | init_logger() 20 | return Regex(project_name='test_project') 21 | 22 | 23 | @mock.patch('src.regex.regex.subprocess') 24 | def test_run_regex_extraction(subprocess_mock, regex_object): 25 | regex_object.run_regex_extraction(json_to_extract={'randomjson'}) 26 | assert subprocess_mock.run.called 27 | 28 | 29 | @pytest.mark.skip(reason="regex functional test") 30 | def test_run_regex_extraction_functional(regex_object): 31 | # Uncomment this test to really run the regex extraction. 
Took 2m 26sec on my computer 8cores 16Go RAM 32 | regex_object.run_regex_extraction(json_to_extract=INPUT_CLUSTER) 33 | 34 | 35 | @mock.patch('src.regex.regex.json') 36 | def test_create_json_cluster(json_mock, regex_object): 37 | str_list = ['/chase/home/auth/Logging_in.php', 38 | '/log/home/auth/Logging_in.php'] 39 | str_to_not_match = ['wp-content/uploads', 40 | 'home.php'] 41 | with open(INPUT_CLUSTER) as json_file: 42 | json_for_regex_extraction = json.load(json_file) 43 | with mock.patch('builtins.open') as open_mock: 44 | name = 'test_cluster' 45 | json_for_regex_extraction['name'] = name 46 | json_path = os.path.join(regex_object.project_folder, 'input_' + name + '.json') 47 | 48 | regex_object.create_json_cluster(str_list, str_to_not_match, name=name) 49 | assert json_mock.mock_calls[0][1][0] == json_for_regex_extraction 50 | open_mock.assert_called_with(json_path, 'w') 51 | 52 | 53 | @pytest.mark.skip(reason="regex functional test") 54 | def test_run_with_benign_check(regex_object): 55 | # Uncomment this test to really run the regex extraction. Took 2m 26sec on my computer 8cores 16Go RAM 56 | 57 | """ 58 | Toook 6m 40 sec in my computer ( 8 cpu, 16go RAM) 59 | :param regex_object: 60 | :return: 61 | """ 62 | cluster_dict = {'cluster_7_53': {'match': [ 63 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default/css/style.css', 64 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/guest/css/style.css', 65 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default/css/style_new.css', 66 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default-col/css/style_new.css', 67 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/guest/css/style_new.css', 68 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/Logon_Files/Themes/default-col/css/style.css', 69 | '/wp-admin/jpmorgan/chasebank/chase/home/auth/logon_files/themes/default-col/css/style.css'], 70 | 'unmatch': []}, 71 | 'cluster_11_3': { 72 | 'match': ['/ch18', '/ch16', '/ch13', '/ch10', '/ch17', '/ch1', '/ch11', '/ch12', '/ch19', 73 | '/ch14', '/ch15'], 'unmatch': []}} 74 | 75 | str_to_not_match = ['wp-content/uploads', 76 | 'home.php'] 77 | regex_object.run_with_benign_check(cluster_dict, benign_list=str_to_not_match, 78 | benign_for_retrain=2, take_existing_result=False, round_max=2) 79 | 80 | 81 | def test_check_regex_list(regex_object): 82 | regex_object = Regex(project_name='test') 83 | sig = ".s" 84 | string_list = ['luda', 'superman', 'akamai', 'blackhat', 'awordthatendswiths'] 85 | expected = (1, ['awordthatendswiths']) 86 | match_result = regex_object.check_regex_list(sig, string_list, limit=10) 87 | assert expected == match_result 88 | 89 | 90 | def test_get_cluster_results(regex_object): 91 | """ 92 | To run this test. 
You should have the file test/regex/data/results_cluster_2_1.json 93 | :param regex_object: 94 | :return: 95 | """ 96 | regex_object.project_folder = DATA 97 | cluster_results = regex_object.get_cluster_results() 98 | assert cluster_results == {'cluster_2_1': '/\\w++(\\w?+[^@])++'} 99 | 100 | 101 | @mock.patch('src.regex.regex.pd.DataFrame.to_csv') 102 | def test_create_result_report(to_csv_mock, regex_object): 103 | regex_object.project_folder = DATA 104 | df = regex_object.create_result_report() 105 | assert df['name'].to_list() == ['cluster_2_1'] 106 | assert df['regex_java'].to_list() == ['/\\w++(\\w?+[^@])++'] 107 | 108 | 109 | def test_run_regex_java(): 110 | regex = "(?:\w*+/)*+bt_version_checker\.php" 111 | string_list = ['/spyeye/Main/bt_version_checker.php', '/spye/main/bt_version_checker.php', 112 | '/WP-CD/Main/bt_version_checker.php', '/Net/Main/bt_version_checker.php', 113 | '/main/main/bt_version_checker.php', '/dbase/main/bt_version_checker.php', 114 | '/hits/bt_version_checker.php', '/Main/bt_version_checker.php', '/spy/main/bt_version_checker.php', 115 | '/sy1/bt_version_checker.php', '/grab/main/bt_version_checker.php'] 116 | 117 | result = Regex.run_regex_java(regex, string_list) 118 | assert set(result) == {True} 119 | 120 | 121 | @patch('conf.OUTPUT_REGEX_RUNNER', os.path.join(current_folder, 'data/output_regex_runner.json')) 122 | @patch('conf.INPUT_REGEX_RUNNER', os.path.join(current_folder, 'data/input_correct.json')) 123 | def test_run_regex_java_with_file(): 124 | new_open = open(conf.OUTPUT_REGEX_RUNNER, 'r') 125 | # By doing that, I can patch only the first open call 126 | with mock.patch('builtins.open') as mymock: 127 | mymock.side_effect = [mock.MagicMock(), new_open] 128 | regex = "(\\.?+[^_])++" 129 | result = Regex.run_regex_java(regex, '') 130 | assert result == [True, True] 131 | -------------------------------------------------------------------------------- /test/use_case/test_use_case_clustering.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from src.use_case.use_case_clustering import UseCaseClustering 5 | from src.logger_code import init_logger 6 | 7 | test_folder = os.path.dirname((os.path.dirname(os.path.abspath(__file__)))) 8 | 9 | 10 | @pytest.fixture() 11 | def use_case(): 12 | logger = init_logger() 13 | return UseCaseClustering() 14 | 15 | 16 | @pytest.mark.skip(reason="functional test. Can take time. Comment this line to run the function") 17 | def test_run(use_case): 18 | main_file = os.path.join(test_folder, 'data', 'data_preprocessed.csv') 19 | use_case.run(main_file) 20 | -------------------------------------------------------------------------------- /test/use_case/test_use_case_data.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from src.use_case.use_case_data import UseCaseData 4 | from src.logger_code import init_logger 5 | 6 | test_folder = os.path.dirname((os.path.dirname(os.path.abspath(__file__)))) 7 | 8 | logger = init_logger() 9 | 10 | 11 | @pytest.fixture() 12 | def use_case(): 13 | return UseCaseData() 14 | 15 | 16 | @pytest.mark.skip(reason="functional test. Can take time. 
Comment this line to run the function") 17 | def test_run(use_case): 18 | main_file = os.path.join(test_folder, 'data', 'data.csv') 19 | additional_files = [os.path.join(test_folder, 'data', 'benign_alexa.hql.out'), 20 | os.path.join(test_folder, 'data', 'iscx_benign.csv')] 21 | use_case.run(main_file, additional_files) 22 | --------------------------------------------------------------------------------
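A closing note on the open()-patching trick used in test_run_regex_java_with_file (test/regex/test_regex.py): giving the patched builtins.open a side_effect list makes the first call return a throw-away mock while a later call receives a real, pre-opened file handle. The sketch below reproduces that idea in isolation; it is only illustrative, and write_then_read, output.json and their contents are hypothetical rather than taken from this repository.

import json
import mock  # the test suite uses the standalone mock package; unittest.mock behaves the same


def write_then_read(path):
    # First open(): write a JSON payload. Second open(): read it back.
    with open(path, 'w') as handle:
        json.dump({"results": [True, True]}, handle)
    with open(path) as handle:
        return json.load(handle)["results"]


def test_write_then_read_with_partially_patched_open(tmp_path):
    # Hypothetical test, for illustration only: pre-create the file the second open() should
    # read, and grab a real handle to it before builtins.open is patched (as test_regex.py does).
    real_file = tmp_path / "output.json"
    real_file.write_text('{"results": [true, true]}')
    real_handle = real_file.open()

    with mock.patch('builtins.open') as open_mock:
        # The first call gets a throw-away MagicMock (nothing is written to disk),
        # the second call gets the genuine handle, so only the write is stubbed out.
        open_mock.side_effect = [mock.MagicMock(), real_handle]
        assert write_then_read('ignored.json') == [True, True]

This is the same pattern the original test relies on so that run_regex_java reads the canned output_regex_runner.json while its first open() call, the one that would write the input file, is stubbed out.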