├── runtime.txt ├── src └── ds4n6_lib │ ├── __init__.py │ ├── ml_models │ ├── __init__.py │ ├── seq2seq_lstm.py │ └── transformer.py │ ├── tools.py │ ├── d4.py │ ├── pf.py │ ├── amcache.py │ ├── winreg.py │ ├── svclist.py │ ├── unx.py │ ├── autoruns.py │ ├── kansa.py │ ├── volatility.py │ ├── mactime.py │ ├── fstl.py │ ├── pslist.py │ ├── macrobber.py │ ├── tshark.py │ ├── flist.py │ ├── utils.py │ ├── kape.py │ └── mlgraph.py ├── setup.cfg ├── pyproject.toml ├── MANIFEST.in ├── setup.py ├── requirements.txt ├── README.md └── CONTRIBUTING.md /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.10.12 -------------------------------------------------------------------------------- /src/ds4n6_lib/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/ds4n6_lib/ml_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /src/ds4n6_lib/tools.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def explore(df, col, max_rows=None, max_columns=None): 4 | hist = df[col].value_counts() 5 | with pd.option_context('display.max_rows', max_rows, 'display.max_columns', max_columns): 6 | print("#Count:",len(hist)) 7 | print(hist) -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include requirements.txt 4 | 5 | include src/ds4n6_lib/isim/*.yml 6 | 7 | # Patterns to exclude from any directory 8 | global-exclude *~ 9 | global-exclude *.pyc 10 | global-exclude *.pyo 11 | global-exclude .git 12 | global-exclude .ipynb_checkpoints 13 | -------------------------------------------------------------------------------- /src/ds4n6_lib/d4.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # INFO 3 | ############################################################################### 4 | # Recommended "import as": d4 5 | 6 | ############################################################################### 7 | # VARIABLES 8 | ############################################################################### 9 | # Debug Level (0: min - 5:max) ------------------------------------------------ 10 | # 0: Disabled 11 | # 1: TBD 12 | # 2: Executed functions 13 | # 3: Low detail on executed functions 14 | # 4: Medium detail on executed functions 15 | # 5: High detail on executed functions 16 | debug = 0 17 | 18 | # Other ----------------------------------------------------------------------- 19 | out = None 20 | ipregex="^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$" 
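# Added note (illustrative, not part of the original module): ipregex matches
# the dotted-quad *shape* only, not valid octet ranges, so a string like
# "999.999.999.999" also matches. Example use:
#   import re
#   re.search(ipregex, "10.0.0.1")    # -> <re.Match object ...>
#   re.search(ipregex, "not-an-ip")   # -> None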
21 | 22 | ############################################################################### 23 | # DECLARE VARS 24 | # not_well-formed 25 | main_nwf=[ 26 | {'find':'<\x04Data', 'replace':'', 'replace':''}, 40 | {'find':' Data ', 'replace':' '}, 41 | {'find':' <([a-zA-Z0-9_-]*)> ', 'replace':' \\1 ', 'type':'re'}, 42 | {'find':'::<([a-zA-Z0-9_-]*)>::', 'replace':'::\\1::', 'type':'re'}, 43 | ] 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="ds4n6_lib", 8 | version="0.8.3", 9 | author="Jess Garcia", 10 | author_email="ds4n6@one-esecurity.com", 11 | description="Bringing Data Science & Artificial Intelligence to the fingertips of the average Forensicator, and promote advances in the field", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/ds4n6/ds4n6_lib", 15 | project_urls={ 16 | "Bug Tracker" : "https://github.com/ds4n6/ds4n6_lib/issues", 17 | "Website" : "http://www.ds4n6.io/" 18 | }, 19 | keywords = ['dfir', 'datascience', 'forensics'], 20 | install_requires=[ 21 | 'requests', 22 | 'numpy', 23 | 'pandas', 24 | 'Evtx', 25 | 'python-evtx', 26 | 'ipyaggrid', 27 | 'IPython', 28 | 'ipywidgets', 29 | 'keras', 30 | 'matplotlib', 31 | 'nbformat', 32 | 'numpy', 33 | 'pandas', 34 | 'pyparsing', 35 | 'qgrid', 36 | 'ruamel.yaml', 37 | 'sklearn', 38 | 'tensorflow', 39 | 'tqdm', 40 | 'traitlets', 41 | 'xmltodict', 42 | 'networkx', 43 | 'gensim', 44 | ], 45 | classifiers=[ 46 | "Development Status :: 3 - Alpha", 47 | "Intended Audience :: Developers", 48 | "Intended Audience :: Information Technology", 49 | "Framework :: Jupyter", 50 | "Topic :: Security", 51 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 52 | "Topic :: Software Development :: Libraries :: Python Modules", 53 | "Programming Language :: Python :: 3", 54 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 55 | "Operating System :: OS Independent", 56 | ], 57 | package_dir={"": "src"}, 58 | packages=setuptools.find_packages(where="src"), 59 | python_requires=">=3.10", 60 | ) 61 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | argon2-cffi==20.1.0 3 | astunparse==1.6.3 4 | async-generator==1.10 5 | attrs==20.3.0 6 | backcall==0.2.0 7 | bleach==3.3.0 8 | cachetools==4.2.1 9 | certifi==2020.12.5 10 | cffi==1.14.5 11 | chardet==4.0.0 12 | configparser==4.0.2 13 | cycler==0.10.0 14 | decorator==5.0.5 15 | defusedxml==0.7.1 16 | entrypoints==0.3 17 | evtx==0.8.2 18 | flatbuffers==23.1.21 19 | gast==0.3.3 20 | gensim==4.3.2 21 | google-auth==2.16.0 22 | google-auth-oauthlib==1.0.0 23 | google-pasta==0.2.0 24 | grpcio==1.51.1 25 | h5py==3.8.0 26 | hexdump==3.3 27 | idna==2.10 28 | importlib-metadata==3.10.0 29 | ipyaggrid==0.2.1 30 | ipykernel==5.5.3 31 | ipython==7.22.0 32 | ipython-genutils==0.2.0 33 | ipywidgets==7.6.3 34 | jedi==0.18.0 35 | Jinja2==2.11.3 36 | joblib==1.2.0 37 | jsonschema==3.2.0 38 | jupyter-client==8.0.3 39 | jupyter-core==5.2.0 40 | jupyterlab-pygments==0.2.2 41 | jupyterlab-widgets==3.0.5 42 | Keras==2.13.1 43 | Keras-Preprocessing==1.1.2 44 | kiwisolver==1.3.1 45 | Markdown==3.3.4 46 | 
MarkupSafe==1.1.1 47 | matplotlib==3.7.0 48 | mistune==0.8.4 49 | more-itertools==5.0.0 50 | nbclient==0.5.3 51 | nbconvert==6.0.7 52 | nbformat==5.1.3 53 | nest-asyncio==1.5.1 54 | networkx==2.5 55 | notebook==6.3.0 56 | numpy==1.23.5 57 | oauthlib==3.1.0 58 | opt-einsum==3.3.0 59 | packaging==20.9 60 | pandas==2.1.4 61 | pandocfilters==1.4.3 62 | parso==0.8.2 63 | pexpect==4.8.0 64 | pickleshare==0.7.5 65 | Pillow==8.2.0 66 | prometheus-client==0.10.0 67 | prompt-toolkit==3.0.18 68 | protobuf==4.24.1 69 | ptyprocess==0.7.0 70 | pyasn1==0.4.8 71 | pyasn1-modules==0.2.8 72 | pycparser==2.20 73 | Pygments==2.8.1 74 | pyparsing==2.4.7 75 | pyrsistent==0.17.3 76 | python-dateutil==2.8.2 77 | python-evtx==0.7.4 78 | pytz==2021.1 79 | PyYAML==6.0.1 80 | pyzmq==25.0.2 81 | qgrid==1.3.1 82 | requests==2.25.1 83 | requests-oauthlib==1.3.0 84 | rsa==4.7.2 85 | ruamel.yaml==0.17.21 86 | ruamel.yaml.clib==0.2.7 87 | scikit-learn==1.2.1 88 | scipy==1.10.0 89 | Send2Trash==1.5.0 90 | simplejson==3.17.2 91 | six==1.15.0 92 | sklearn==0.0 93 | tensorboard==2.13.0 94 | tensorflow==2.13.0 95 | tensorflow-estimator==2.13.0 96 | termcolor==1.1.0 97 | terminado==0.9.4 98 | testpath==0.4.4 99 | threadpoolctl==2.1.0 100 | tornado==6.2 101 | tqdm==4.59.0 102 | traitlets==5.9.0 103 | typing-extensions==3.7.4.3 104 | urllib3==1.26.4 105 | wcwidth==0.2.5 106 | webencodings==0.5.1 107 | Werkzeug==1.0.1 108 | widgetsnbextension==3.5.1 109 | wrapt==1.12.1 110 | xmltodict==0.12.0 111 | zipp==1.0.0 -------------------------------------------------------------------------------- /src/ds4n6_lib/pf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from tensorflow.keras import layers 4 | from tensorflow.keras.models import Model 5 | import tensorflow as tf 6 | from tensorflow.keras import losses 7 | import matplotlib.pyplot as plt 8 | import pandas as pd 9 | 10 | 11 | def convert_prefetch_ham_to_hml(df): 12 | df_split = df['file_referenced'].str.split("\\",expand=True) 13 | df_split = df_split.drop(columns=[0]).fillna(value='') 14 | 15 | first_column = df_split.iloc[:, 0] 16 | medium_column = [] 17 | last_column = [] 18 | for i in range(df_split.shape[0]): 19 | arr = [x for x in df_split.iloc[i, 1:] if x != ''] 20 | medium_column.append('\\'.join(arr[:-1])) 21 | last_column.append('\\'.join(arr[-1:])) # [-1:] because some len(arr) == 0 22 | 23 | list_to_df = list(zip(first_column, medium_column, last_column, df['machine_id'])) 24 | new_df = pd.DataFrame(list_to_df, columns =['A', 'B', 'C', 'machine_id']) 25 | return new_df 26 | 27 | 28 | def ml_prefetch_anomalies(df, odalg="simple_autoencoder", latent_dim = 128, epochs = 10, learning_rate = 1e-3): 29 | # Deep Learning 30 | x_train = pd.get_dummies(df).to_numpy() 31 | 32 | class Autoencoder(Model): 33 | def __init__(self, input_dim, latent_dim): 34 | super(Autoencoder, self).__init__() 35 | self.input_dim = input_dim 36 | self.latent_dim = latent_dim 37 | self.encoder = layers.Dense(latent_dim, activation='relu') 38 | self.decoder = layers.Dense(input_dim, activation='sigmoid') 39 | 40 | def call(self, x): 41 | encoded = self.encoder(x) 42 | decoded = self.decoder(encoded) 43 | return decoded 44 | 45 | def train_autoencoder(latent_dim, epochs, learning_rate): 46 | autoencoder = Autoencoder(input_dim=x_train.shape[1], latent_dim=latent_dim) 47 | opt = tf.keras.optimizers.Adam(learning_rate=learning_rate) 48 | autoencoder.compile(optimizer=opt, loss=losses.MeanSquaredError()) 49 | history = 
autoencoder.fit(x_train, x_train, epochs=epochs, shuffle=True, verbose=0) 50 | return autoencoder, history 51 | 52 | model, history = train_autoencoder(latent_dim=latent_dim, 53 | epochs=epochs, 54 | learning_rate=learning_rate) 55 | 56 | 57 | preds = model.predict(x_train) 58 | inference_losses = tf.keras.metrics.mean_squared_error(preds, x_train.astype('float')).numpy() 59 | 60 | ranking = [] 61 | for i, loss in zip(range(len(inference_losses)), inference_losses): 62 | fr = '\\'.join(df.iloc[i, :3]) 63 | 64 | machine_id = df.iloc[i]['machine_id'] 65 | if fr.endswith('.DLL'): 66 | ranking.append((loss, i, fr, machine_id)) 67 | 68 | ranking = sorted(ranking, key=lambda x: -x[0]) 69 | anomdf = pd.DataFrame(ranking, columns=['loss', 'source_index', 'file referenced', 'machine_id']) 70 | return anomdf[['file referenced', 'machine_id']] 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
3 | 4 | 5 | 6 | 7 |
8 | 
9 | 
10 | 
11 | DS4N6 stands for Data Science Forensics.
12 | 
13 | We also refer to this project as DSDFIR, AI4N6 or AIDFIR, since Data Science (DS) includes Artificial Intelligence (AI), and the project goes beyond strict Forensics, covering the whole Digital Forensics & Incident Response (DFIR) discipline (and sometimes even beyond). But hey, we had to give the project a catchy name!
14 | 
15 | The Mission of the DS4N6 project is simple:
16 | 
17 | ```
18 | Bringing Data Science & Artificial Intelligence
19 | to the fingertips of the average Forensicator,
20 | and promote advances in the field
21 | ```
22 | 
23 | The first (modest) alpha version of our ds4n6 python library, together with some easy-to-use python scripts, was originally made public after its presentation at the SANS DFIR Summit US (July 16-17).
24 | **For detailed information about the Project, the Library, its Functions, its Usage, etc., visit the project page: http://www.ds4n6.io/tools/ds4n6.py.html**
25 | 
26 | ## Getting Started
27 | 
28 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
29 | 
30 | https://github.com/ds4n6/ds4n6_lib.git
31 | 
32 | ### Prerequisites
33 | 
34 | The DS4N6 library requires Python 3 (3.10 or later, as declared in setup.py). The module has external dependencies related to data science and to the parsing of forensic evidence.
35 | 
36 | Install requirements:
37 | 
38 | - python-evtx
39 | - Evtx
40 | - ipyaggrid
41 | - IPython
42 | - ipywidgets
43 | - keras
44 | - matplotlib
45 | - nbformat
46 | - numpy
47 | - pandas
48 | - pyparsing
49 | - qgrid
50 | - ruamel.yaml
51 | - sklearn
52 | - tensorflow
53 | - tqdm
54 | - traitlets
55 | - xmltodict
56 | - networkx
57 | - gensim
58 | 
59 | ### Installation
60 | 
61 | The installation can easily be done through pip.
62 | 
63 | #### pip installation
64 | 
65 | ```sh
66 | pip install -r requirements.txt
67 | ```
68 | 
69 | Finally, import the library in your python3 program or Jupyter Notebook as "ds":
70 | 
71 | ```python
72 | import ds4n6_lib as ds
73 | ```
74 | 
75 | ## Contributing
76 | 
77 | If you think you can provide value to the Community by collaborating with Research, Blog Posts, Cheatsheets, Code, etc., contact us!
78 | 
79 | Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests to us.
80 | 
81 | ### Download from GitHub
82 | 
83 | All you need to do is clone the library, create a virtual environment, activate it, and install the requirements.
84 | 
85 | ```sh
86 | 
87 | git clone https://github.com/ds4n6/ds4n6_lib
88 | 
89 | virtualenv -p python3.10 .test
90 | source .test/bin/activate
91 | 
92 | pip install -r requirements.txt
93 | ```
94 | 
95 | ## Authors
96 | 
97 | * **Jess Garcia** - *Initial work* - http://ds4n6.io/community/jess_garcia.html
98 | 
99 | See also the list of [contributors](http://ds4n6.io/community.html) who participated in this project.
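## Usage example

A minimal illustrative session (the CSV path and the column name are hypothetical; `explore()` is defined in `src/ds4n6_lib/tools.py`):

```python
import pandas as pd
import ds4n6_lib.tools as d4tls

# Any DataFrame works; here we assume an artifact table exported to CSV
df = pd.read_csv("evidence/autoruns.csv")

# Print the number of distinct values and a frequency histogram of one column
d4tls.explore(df, "ImagePath")
```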
100 | 101 | ## License 102 | 103 | This project is licensed under the GNU GPL v3.0 License - see the [LICENSE](LICENSE) file for details 104 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct, please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 11 | build. 12 | 2. Update the README.md with details of changes to the interface, this includes new environment 13 | variables, exposed ports, useful file locations and container parameters. 14 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 15 | Pull Request would represent. 16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 17 | do not have permission to do that, you may request the second reviewer to merge it for you. 18 | 19 | ## Code of Conduct 20 | 21 | ### Our Pledge 22 | 23 | In the interest of fostering an open and welcoming environment, we as 24 | contributors and maintainers pledge to making participation in our project and 25 | our community a harassment-free experience for everyone, regardless of age, body 26 | size, disability, ethnicity, gender identity and expression, level of experience, 27 | nationality, personal appearance, race, religion, or sexual identity and 28 | orientation. 29 | 30 | ### Our Standards 31 | 32 | Examples of behavior that contributes to creating a positive environment 33 | include: 34 | 35 | * Using welcoming and inclusive language 36 | * Being respectful of differing viewpoints and experiences 37 | * Gracefully accepting constructive criticism 38 | * Focusing on what is best for the community 39 | * Showing empathy towards other community members 40 | 41 | Examples of unacceptable behavior by participants include: 42 | 43 | * The use of sexualized language or imagery and unwelcome sexual attention or 44 | advances 45 | * Trolling, insulting/derogatory comments, and personal or political attacks 46 | * Public or private harassment 47 | * Publishing others' private information, such as a physical or electronic 48 | address, without explicit permission 49 | * Other conduct which could reasonably be considered inappropriate in a 50 | professional setting 51 | 52 | ### Our Responsibilities 53 | 54 | Project maintainers are responsible for clarifying the standards of acceptable 55 | behavior and are expected to take appropriate and fair corrective action in 56 | response to any instances of unacceptable behavior. 57 | 58 | Project maintainers have the right and responsibility to remove, edit, or 59 | reject comments, commits, code, wiki edits, issues, and other contributions 60 | that are not aligned to this Code of Conduct, or to ban temporarily or 61 | permanently any contributor for other behaviors that they deem inappropriate, 62 | threatening, offensive, or harmful. 63 | 64 | ### Scope 65 | 66 | This Code of Conduct applies both within project spaces and in public spaces 67 | when an individual is representing the project or its community. 
Examples of 68 | representing a project or community include using an official project e-mail 69 | address, posting via an official social media account, or acting as an appointed 70 | representative at an online or offline event. Representation of a project may be 71 | further defined and clarified by project maintainers. 72 | 73 | ### Enforcement 74 | 75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 76 | reported by contacting the project team at ds4n6@one-esecurity.com. All 77 | complaints will be reviewed and investigated and will result in a response that 78 | is deemed necessary and appropriate to the circumstances. The project team is 79 | obligated to maintain confidentiality with regard to the reporter of an incident. 80 | Further details of specific enforcement policies may be posted separately. 81 | 82 | Project maintainers who do not follow or enforce the Code of Conduct in good 83 | faith may face temporary or permanent repercussions as determined by other 84 | members of the project's leadership. 85 | 86 | ### Attribution 87 | 88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 89 | available at [http://contributor-covenant.org/version/1/4][version] 90 | 91 | [homepage]: http://contributor-covenant.org 92 | [version]: http://contributor-covenant.org/version/1/4/ 93 | -------------------------------------------------------------------------------- /src/ds4n6_lib/amcache.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4amch 10 | 11 | ############################################################################### 12 | # IMPORTS 13 | ############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import inspect 23 | import pickle 24 | 25 | # DS IMPORTS ------------------------------------------------------------------ 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | 30 | # DS4N6 IMPORTS --------------------------------------------------------------- 31 | import ds4n6_lib.d4 as d4 32 | import ds4n6_lib.common as d4com 33 | import ds4n6_lib.gui as d4gui 34 | import ds4n6_lib.utils as d4utl 35 | import ds4n6_lib.unx as d4unx 36 | 37 | ############################################################################### 38 | # FUNCTIONS 39 | ############################################################################### 40 | 41 | # ANALYSIS FUNCTIONS ########################################################## 42 | 43 | # simple ====================================================================== 44 | def simple_func(df, *args, **kwargs): 45 | """ Reformat the input df so the data is presented to the analyst in the 46 | friendliest possible way 47 | 48 | Parameters: 49 | df (pd.dataframe): Input data 50 | 51 | Returns: 52 | pd.DataFrame: Optionally it will return the filtered dataframe, 53 | only if ret=True is set, constant & hidden columns included 54 | If 
ret_out=True is set, then the output just as it is shown 55 | (without constant/hidden columns) will be return 56 | """ 57 | 58 | # Artifact-specific argument parsing ======================================= 59 | 60 | # Variables ================================================================ 61 | hiddencols = ['SHA1_', 'FileReference_'] 62 | 63 | dfout = df 64 | 65 | # Maximum number of lines in DF for beautification 66 | maxdfbprintlines = 20 67 | 68 | # Pre-Processing ========================================================== 69 | 70 | # Call to simple_common =================================================== 71 | dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 72 | 73 | # Post-Processing ========================================================= 74 | 75 | # Return ================================================================== 76 | return dfout 77 | 78 | # analysis ==================================================================== 79 | def analysis(obj, *args, **kwargs): 80 | """ Redirects execution to analysis_func() 81 | """ 82 | return analysis_func(obj, *args, **kwargs) 83 | 84 | def analysis_func(obj, *args, **kwargs): 85 | """ Umbrella function that redirects to different types of analysis 86 | available on the input data 87 | 88 | Parameters: 89 | obj: Input data (typically DF or dict of DFs) 90 | 91 | Returns: 92 | pd.DataFrame: Refer to each specific analysis function 93 | """ 94 | 95 | def syntax(): 96 | print('Syntax: analysis(obj, "analysis_type")\n') 97 | d4list("str-help") 98 | return 99 | 100 | def d4list(objtype): 101 | 102 | # Analysis Modules Available for this objective 103 | anlav = False 104 | print("Available Amcache analysis types:") 105 | # if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-XXXXX", objtype): 106 | # anlav = True 107 | # print("- XXXXX_files: No.events XXXXX file (Input: XXXdfs)") 108 | 109 | if anlav == False: 110 | print('- No analysis modules available for this object ('+objtype+').') 111 | 112 | nargs = len(args) 113 | 114 | if nargs == 0: 115 | syntax() 116 | return 117 | 118 | obj = args[0] 119 | 120 | objtype = d4com.data_identify(obj) 121 | 122 | if isinstance(obj, str): 123 | if obj == "list": 124 | d4list(objtype) 125 | return 126 | if obj == "help": 127 | syntax() 128 | return 129 | 130 | if nargs == 1: 131 | syntax() 132 | return 133 | 134 | anltype = args[1] 135 | 136 | if not isinstance(anltype, str): 137 | syntax() 138 | return 139 | 140 | if anltype == "help": 141 | syntax() 142 | return 143 | elif anltype == "list": 144 | d4list(objtype) 145 | return 146 | 147 | # ANALYSIS FUNCTIONS ====================================================== 148 | 149 | # XXXdfs ------------------------------------------------------------------ 150 | # if re.search("^dict-pandas_dataframe-XXXXX", objtype): 151 | # if anltype == "XXXXX_files": 152 | # return analysis_XXXXX_files(*args, **kwargs) 153 | 154 | print("INFO: [d4amch] No analysis functions available for this data type ("+objtype+")") 155 | 156 | # DATAFRAME ACCESSOR ########################################################## 157 | 158 | @pd.api.extensions.register_dataframe_accessor("d4amch") 159 | class Ds4n6AmchAccessor: 160 | def __init__(self, pandas_obj): 161 | self._obj = pandas_obj 162 | 163 | def simple(self, *args, **kwargs): 164 | """ Redirects execution to simple_func() 165 | """ 166 | df = self._obj 167 | return simple_func(df, *args, **kwargs) 168 | 169 | 170 | 
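# Added note: the same simple() entry point is registered twice, under a short
# and a long accessor name, so both spellings work on any DataFrame once the
# module is imported:
#     df.d4amch.simple()       # short alias (registered above)
#     df.d4_amcache.simple()   # long alias (registered below)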
@pd.api.extensions.register_dataframe_accessor("d4_amcache") 171 | class Ds4n6AmcacheAccessor: 172 | def __init__(self, pandas_obj): 173 | self._obj = pandas_obj 174 | 175 | def simple(self, *args, **kwargs): 176 | """ Redirects execution to simple_func() 177 | """ 178 | df = self._obj 179 | return simple_func(df, *args, **kwargs) 180 | -------------------------------------------------------------------------------- /src/ds4n6_lib/winreg.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4reg 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | import pickle 25 | 26 | # DS IMPORTS ------------------------------------------------------------------ 27 | import numpy as np 28 | import pandas as pd 29 | import matplotlib.pyplot as plt 30 | 31 | # DS4N6 IMPORTS --------------------------------------------------------------- 32 | import ds4n6_lib.d4 as d4 33 | import ds4n6_lib.common as d4com 34 | import ds4n6_lib.gui as d4gui 35 | import ds4n6_lib.utils as d4utl 36 | import ds4n6_lib.unx as d4unx 37 | 38 | ############################################################################### 39 | # FUNCTIONS 40 | ############################################################################### 41 | 42 | # ANALYSIS FUNCTIONS ########################################################## 43 | 44 | # simple ====================================================================== 45 | def simple_func(df, *args, **kwargs): 46 | """ Reformat the input df so the data is presented to the analyst in the 47 | friendliest possible way 48 | 49 | Parameters: 50 | df (pd.dataframe): Input data 51 | 52 | Returns: 53 | pd.DataFrame: Optionally it will return the filtered dataframe, 54 | only if ret=True is set, constant & hidden columns included 55 | If ret_out=True is set, then the output just as it is shown 56 | (without constant/hidden columns) will be return 57 | """ 58 | 59 | # Artifact-specific argument parsing ======================================= 60 | 61 | # Variables ================================================================ 62 | hiddencols = ['KeyLastWriteTimestamp_', 'KeyPath_', 'KeyPath-Hash_'] 63 | 64 | dfout = df 65 | 66 | # Maximum number of lines in DF for beautification 67 | maxdfbprintlines = 20 68 | 69 | # Pre-Processing ========================================================== 70 | 71 | # Call to simple_common =================================================== 72 | dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 73 | 74 | # Post-Processing ========================================================= 75 | 76 | # Return ================================================================== 77 | return dfout 78 | 79 | # analysis 
==================================================================== 80 | def analysis(obj, *args, **kwargs): 81 | """ Redirects execution to analysis_func() 82 | """ 83 | return analysis_func(obj, *args, **kwargs) 84 | 85 | def analysis_func(obj, *args, **kwargs): 86 | """ Umbrella function that redirects to different types of analysis 87 | available on the input data 88 | 89 | Parameters: 90 | obj: Input data (typically DF or dict of DFs) 91 | 92 | Returns: 93 | pd.DataFrame: Refer to each specific analysis function 94 | """ 95 | 96 | def syntax(): 97 | print('Syntax: analysis(obj, "analysis_type")\n') 98 | d4list("str-help") 99 | return 100 | 101 | def d4list(objtype): 102 | 103 | # Analysis Modules Available for this objective 104 | anlav = False 105 | print("Available winreg analysis types:") 106 | # if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-winreg_kv", objtype): 107 | # anlav = True 108 | # print("- winreg_files: No.events winreg file (Input: winreg dfs)") 109 | 110 | if not anlav: 111 | print('- No analysis modules available for this object ('+objtype+').') 112 | 113 | nargs = len(args) 114 | 115 | if nargs == 0: 116 | syntax() 117 | return 118 | 119 | obj = args[0] 120 | 121 | objtype = d4com.data_identify(obj) 122 | 123 | if isinstance(obj, str): 124 | if obj == "list": 125 | d4list(objtype) 126 | return 127 | if obj == "help": 128 | syntax() 129 | return 130 | 131 | if nargs == 1: 132 | syntax() 133 | return 134 | 135 | anltype = args[1] 136 | 137 | if not isinstance(anltype, str): 138 | syntax() 139 | return 140 | 141 | if anltype == "help": 142 | syntax() 143 | return 144 | elif anltype == "list": 145 | d4list(objtype) 146 | return 147 | 148 | # ANALYSIS FUNCTIONS ====================================================== 149 | 150 | # XXXdfs ------------------------------------------------------------------ 151 | # if re.search("^dict-pandas_dataframe-XXXXX", objtype): 152 | # if anltype == "XXXXX_files": 153 | # return analysis_XXXXX_files(*args, **kwargs) 154 | 155 | print("INFO: [d4reg] No analysis functions available for this data type ("+objtype+")") 156 | 157 | # DATAFRAME ACCESSOR ########################################################## 158 | 159 | @pd.api.extensions.register_dataframe_accessor("d4reg") 160 | class Ds4n6RegAccessor: 161 | def __init__(self, pandas_obj): 162 | self._obj = pandas_obj 163 | 164 | def simple(self, *args, **kwargs): 165 | """ Redirects execution to simple_func() 166 | """ 167 | df = self._obj 168 | return simple_func(df, *args, **kwargs) 169 | 170 | 171 | @pd.api.extensions.register_dataframe_accessor("d4_winreg") 172 | class Ds4n6WinRegAccessor: 173 | def __init__(self, pandas_obj): 174 | self._obj = pandas_obj 175 | 176 | def simple(self, *args, **kwargs): 177 | """ Redirects execution to simple_func() 178 | """ 179 | df = self._obj 180 | return simple_func(df, *args, **kwargs) 181 | -------------------------------------------------------------------------------- /src/ds4n6_lib/svclist.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4svclst 10 | 11 | 
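# Illustrative usage of the recommended alias (added; svcdf is a hypothetical
# service-list DataFrame):
#   import ds4n6_lib.svclist as d4svclst
#   d4svclst.simple_func(svcdf)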
############################################################################### 12 | # IMPORTS 13 | ############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import inspect 23 | import pickle 24 | 25 | # DS IMPORTS ------------------------------------------------------------------ 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | 30 | # DS4N6 IMPORTS --------------------------------------------------------------- 31 | import ds4n6_lib.d4 as d4 32 | import ds4n6_lib.common as d4com 33 | import ds4n6_lib.gui as d4gui 34 | import ds4n6_lib.utils as d4utl 35 | import ds4n6_lib.unx as d4unx 36 | 37 | ############################################################################### 38 | # FUNCTIONS 39 | ############################################################################### 40 | 41 | # ANALYSIS FUNCTIONS ########################################################## 42 | 43 | # simple ====================================================================== 44 | def simple_func(df, *args, **kwargs): 45 | """ Reformat the input df so the data is presented to the analyst in the 46 | friendliest possible way 47 | 48 | Parameters: 49 | df (pd.dataframe): Input data 50 | 51 | Returns: 52 | pd.DataFrame: Optionally it will return the filtered dataframe, 53 | only if ret=True is set, constant & hidden columns included 54 | If ret_out=True is set, then the output just as it is shown 55 | (without constant/hidden columns) will be return 56 | """ 57 | 58 | if d4.debug >= 3: 59 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 60 | 61 | # Artifact-specific argument parsing ======================================= 62 | hiddencolsuser = kwargs.get('hiddencols', []) 63 | 64 | # Variables ================================================================ 65 | hiddencolsdef = [] 66 | 67 | # Merge artifact hiddencols with user-specified hiddencols + update kwargs 68 | hiddencols = hiddencolsuser + hiddencolsdef 69 | kwargs['hiddencols'] = hiddencols 70 | 71 | dfout = df 72 | 73 | # Maximum number of lines in DF for beautification 74 | maxdfbprintlines = 20 75 | 76 | # Pre-Processing ========================================================== 77 | 78 | # Call to simple_common =================================================== 79 | dfout = d4com.simple_common(df, *args, **kwargs, maxdfbprintlines=maxdfbprintlines) 80 | 81 | # Post-Processing ========================================================= 82 | 83 | # Return ================================================================== 84 | return dfout 85 | 86 | # analysis ==================================================================== 87 | def analysis(obj, *args, **kwargs): 88 | """ Redirects execution to analysis_func() 89 | """ 90 | return analysis_func(obj, *args, **kwargs) 91 | 92 | def analysis_func(obj, *args, **kwargs): 93 | """ Umbrella function that redirects to different types of analysis 94 | available on the input data 95 | 96 | Parameters: 97 | obj: Input data (typically DF or dict of DFs) 98 | 99 | Returns: 100 | pd.DataFrame: Refer to each specific analysis function 101 | """ 102 | 103 | # SUB-FUNCTIONS ########################################################### 
104 | def syntax(): 105 | print('Syntax: analysis(obj, "analysis_type")\n') 106 | d4list("str-help") 107 | return 108 | 109 | def d4list(objtype): 110 | 111 | # Analysis Modules Available for this objective 112 | anlav = False 113 | print("Available XXXXX analysis types:") 114 | if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-XXXXX", objtype): 115 | anlav = True 116 | print("- XXXXX_files: No.events XXXXX file (Input: XXXdfs)") 117 | 118 | if anlav == False: 119 | print('- No analysis modules available for this object ('+objtype+').') 120 | 121 | # FUNCTION BODY ########################################################### 122 | if d4.debug >= 3: 123 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 124 | 125 | thisdatatype = "XXXXXXXX-THIS_DATA_TYPE" 126 | 127 | nargs = len(args) 128 | 129 | if nargs == 0: 130 | syntax() 131 | return 132 | 133 | obj = args[0] 134 | 135 | objtype = d4com.data_identify(obj) 136 | 137 | if isinstance(obj, str): 138 | if obj == "list": 139 | d4list(objtype) 140 | return 141 | if obj == "help": 142 | syntax() 143 | return 144 | 145 | if nargs == 1: 146 | if thisdatatype is not None: 147 | if re.search("^dict-pandas_dataframe-"+thisdatatype, objtype) or re.search("^pandas_dataframe-"+thisdatatype, objtype): 148 | d4list(objtype) 149 | else: 150 | syntax() 151 | else: 152 | syntax() 153 | 154 | return 155 | 156 | anltype = args[1] 157 | 158 | if not isinstance(anltype, str): 159 | syntax() 160 | return 161 | 162 | if anltype == "help": 163 | syntax() 164 | return 165 | elif anltype == "list": 166 | d4list(objtype) 167 | return 168 | 169 | # ANALYSIS FUNCTIONS ====================================================== 170 | 171 | # XXXdfs ------------------------------------------------------------------ 172 | if re.search("^dict-pandas_dataframe-XXXXX", objtype): 173 | if anltype == "XXXXX_files": 174 | return analysis_XXXXX_files(*args, **kwargs) 175 | 176 | print("INFO: [d4XXX] No analysis functions available for this data type ("+objtype+")") 177 | 178 | # DATAFRAME ACCESSOR ########################################################## 179 | 180 | @pd.api.extensions.register_dataframe_accessor("d4svclst") 181 | class Ds4n6SvcListAccessor: 182 | def __init__(self, pandas_obj): 183 | self._obj = pandas_obj 184 | 185 | def simple(self, *args, **kwargs): 186 | """ Redirects execution to simple_func() 187 | """ 188 | df = self._obj 189 | return simple_func(df, *args, **kwargs) 190 | 191 | 192 | @pd.api.extensions.register_dataframe_accessor("d4_svclist") 193 | class Ds4n6SvcListAccessor: 194 | def __init__(self, pandas_obj): 195 | self._obj = pandas_obj 196 | 197 | def simple(self, *args, **kwargs): 198 | """ Redirects execution to simple_func() 199 | """ 200 | df = self._obj 201 | return simple_func(df, *args, **kwargs) 202 | -------------------------------------------------------------------------------- /src/ds4n6_lib/unx.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # IDEAS 9 | ############################################################################### 10 | # dfsed 11 | # multicol -> For a series or DF col, show in multiple cols to optimize screen 12 | # Equiv. 
to Linux: pr -l1 -t -3 /t 13 | 14 | ############################################################################### 15 | # INFO 16 | ############################################################################### 17 | # Recommended "import as": d4unx 18 | 19 | ############################################################################### 20 | # IMPORTS 21 | ############################################################################### 22 | 23 | # DEV IMPORTS ---------------------------------------------------------------- 24 | 25 | # python IMPORTS -------------------------------------------------------------- 26 | import re 27 | import inspect 28 | 29 | # DS IMPORTS ------------------------------------------------------------------ 30 | import numpy as np 31 | import pandas as pd 32 | 33 | ############################################################################### 34 | # FUNCTIONS 35 | ############################################################################### 36 | 37 | def xgrep_func(*args, **kwargs): 38 | 39 | def syntax(): 40 | print('Syntax: xgrep(,[],""[,""])') 41 | print(" [column(s)] -> If object is a DataFrame") 42 | print(" Options: i: case insensitive") 43 | print(" v: reverse") 44 | print(" p: explode cols with series elements") 45 | print(" t: do not apply style (highlight hits)") 46 | 47 | import collections 48 | 49 | nargs=len(args) 50 | 51 | if nargs == 0: 52 | syntax() 53 | return 54 | 55 | obj = args[0] 56 | 57 | if isinstance(obj, dict): 58 | return dictgrep(*args, **kwargs) 59 | elif isinstance(obj, pd.DataFrame): 60 | return dfgrep(*args, **kwargs) 61 | elif isinstance(obj, collections.abc.KeysView): 62 | return keysgrep(*args, **kwargs) 63 | 64 | def xgrep(*args, **kwargs): 65 | return xgrep_func(*args, **kwargs) 66 | 67 | def dfgrep(*args): 68 | """ 69 | Syntax: dfgrep("",""[,""]) 70 | Options: i: case insensitive 71 | v: reverse 72 | p: explode cols with series elements 73 | t: do not apply style (highlight hits) 74 | If your DF has only 1 column, you can skip the column name, 75 | just specify "" 76 | 77 | """ 78 | 79 | nargs=len(args) 80 | 81 | df = args[0] 82 | 83 | # If the user supplies just one arg we will assume that it is the regex 84 | # and that he wants to search the full DF for that regex 85 | if nargs == 2: 86 | cols = "*" 87 | regex = args[1] 88 | else: 89 | cols = args[1] 90 | regex = args[2] 91 | 92 | if nargs == 4: 93 | opt = args[3] 94 | else: 95 | opt = "" 96 | 97 | ndfcols=len(df.columns) 98 | 99 | if ndfcols == 1 and cols == "": 100 | cols = df.columns 101 | 102 | if cols == "*": 103 | cols = df.columns 104 | 105 | if regex == "": 106 | print("ERROR: regex cannot be empty") 107 | return 108 | 109 | # Parse Options 110 | if "v" in opt: 111 | reverse=True 112 | else: 113 | reverse=False 114 | 115 | if "i" in opt: 116 | case=False 117 | else: 118 | case=True 119 | 120 | if "t" in opt: 121 | applystyle = False 122 | else: 123 | applystyle = True 124 | 125 | dfout = pd.DataFrame([]) 126 | 127 | if isinstance(cols, str): 128 | cols=list([cols]) 129 | 130 | for col in cols: 131 | # Check if col is an existing column 132 | if col not in df.columns: 133 | print ('ERROR: column '+col+' not found in DF') 134 | return 135 | 136 | if "p" in opt: 137 | df=df.explode(col) 138 | 139 | # Identify if there are null values and fill them 140 | df=df.copy() 141 | # df[col]=df[col].fillna("d4_null") 142 | 143 | if reverse : 144 | resdf = df[~df[col].astype(str).str.contains(regex,case=case)] 145 | else: 146 | resdf = 
df[df[col].astype(str).str.contains(regex,case=case)]
147 | 
148 |         dfout = pd.concat([dfout, resdf])  # DataFrame.append() was removed in pandas 2.x
149 | 
150 |     # for col in cols:
151 |     #     dfout[col]=dfout[col].fillna("d4_null")
152 | 
153 |     dfout = dfout.drop_duplicates()
154 | 
155 |     if applystyle:
156 |         maxdfoutprintlines = 1000
157 |         if len(dfout) >= maxdfoutprintlines:
158 |             print('WARNING: Too many lines (>'+str(maxdfoutprintlines)+') in DataFrame for formatting. Returning unformatted output.')
159 |             return dfout
160 |         else:
161 |             dfout = dfout.reset_index()
162 |             return dfout.style.apply(lambda x: ["background: yellow" if re.search(regex, str(v)) else '' for v in x], axis=1)
163 |     else:
164 |         return dfout
165 | 
166 | def keysgrep(keys, regex, opt=""):
167 |     df = pd.DataFrame(list(keys), columns=['Key'])
168 |     return df.d4unx.dfgrep('Key', regex, opt)
169 | 
170 | def dictgrep(mydict, regex, opt=""):
171 |     # DFs dict -----------------------------------------------
172 |     if isinstance(mydict[list(mydict.keys())[0]], pd.DataFrame):
173 | 
174 |         outdf = pd.DataFrame([])
175 | 
176 |         # Do not apply style on dfgrep
177 |         dfgrepopt = opt+"t"
178 | 
179 |         for key in mydict.keys():
180 |             thisdf = dfgrep(mydict[key], "*", regex, dfgrepopt)
181 |             thisdf.insert(0, 'dict-Key_', key)
182 |             outdf = pd.concat([outdf, thisdf], ignore_index=True)
183 | 
184 |         # Return resulting DF
185 |         if "t" in opt:
186 |             return outdf.dropna(axis=1, how='all')
187 |         else:
188 |             return outdf.dropna(axis=1, how='all').style.apply(lambda x: ["background: yellow" if re.search(regex, str(v)) else '' for v in x], axis=1)  # str(v): cells may be non-string
189 |     else:
190 |         print("ERROR: dict variant not supported.")
191 | 
192 | def dfsed_func(df,col,regex,repl,opt=""):
193 | 
194 |     df[col] = df[col].str.replace(regex, repl, regex=True)  # regex=True: recent pandas no longer treats the pattern as a regex by default
195 | 
196 |     return df
197 | 
198 | def vc_func(df,col,countfilter="",ascending=False):
199 | 
200 |     dfout = df[col].value_counts(ascending=ascending).rename_axis(col).reset_index(name="Count")  # robust under pandas 2.x, where reset_index() names the count column "count"
201 | 
202 | 
203 |     if countfilter != "":
204 | 
205 |         n=int(countfilter)
206 |         dfout=dfout.query(f'Count == {n}')
207 | 
208 |     return dfout
209 | 
210 | def ddups_func(df):
211 |     dfout=df.drop_duplicates()
212 | 
213 |     return dfout
214 | 
215 | # ACCESSOR ####################################################################
216 | @pd.api.extensions.register_dataframe_accessor("d4unx")
217 | class Ds4n6UnxAccessor:
218 |     def __init__(self, pandas_obj):
219 |         self._obj = pandas_obj
220 | 
221 |     def dfgrep(self, *args, **kwargs):
222 |         obj = self._obj
223 |         return xgrep_func(obj, *args, **kwargs)
224 | 
225 |     def dfsed(self,col,regex,repl,opt=""):
226 |         df=self._obj.copy()
227 |         return dfsed_func(df,col,regex,repl,opt)
228 | 
229 |     def vc(self,col,countfilter="",ascending=False):
230 |         df=self._obj
231 |         return vc_func(df,col,countfilter,ascending)  # pass the filters through instead of dropping them
232 | 
233 |     def ddups(self):
234 |         df=self._obj
235 |         return ddups_func(df)  # was df.ddups_func(), which is not a DataFrame method
236 | 
237 | 
--------------------------------------------------------------------------------
/src/ds4n6_lib/autoruns.py:
--------------------------------------------------------------------------------
1 | # DS4N6
2 | #
3 | # Description: library of functions to apply Data Science in several forensics
4 | # artifacts
5 | #
6 | 
7 | ###############################################################################
8 | # INFO
9 | ###############################################################################
10 | # Recommended "import as": d4atrs
11 | 
12 | ###############################################################################
13 | # IMPORTS
14 | ###############################################################################
15 | # DEV IMPORTS
---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import pickle 23 | import inspect 24 | 25 | # DS IMPORTS ----------------------------------------------------------------- 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | from IPython.display import display, Markdown, HTML 30 | 31 | # DS4N6 IMPORTS --------------------------------------------------------------- 32 | import ds4n6_lib.d4 as d4 33 | import ds4n6_lib.common as d4com 34 | import ds4n6_lib.gui as d4gui 35 | import ds4n6_lib.utils as d4utl 36 | 37 | ############################################################################### 38 | # FUNCTIONS 39 | ############################################################################### 40 | 41 | # FILE READING FUNCTIONS ###################################################### 42 | 43 | def read_data(evdl, **kwargs): 44 | """ Read data from files or a folder 45 | 46 | Args: 47 | evdl (str): path to file/folder source 48 | kwargs: read options 49 | Returns: 50 | pandas.Dataframe or dictionary of pandas.DataFrame 51 | """ 52 | return d4com.read_data_common(evdl, **kwargs) 53 | 54 | # HARMONIZATION FUNCTIONS ##################################################### 55 | 56 | def harmonize(df, **kwargs): 57 | """ Convert DF in HAM format 58 | 59 | Args: 60 | df (pandas.DataFrame): DF to harmonize 61 | kwargs(dict): harmonize options 62 | Returns: 63 | pandas.DataFrame in HAM Format 64 | """ 65 | if d4.debug >= 2: 66 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 67 | 68 | hostname = kwargs.get('hostname', None) 69 | 70 | # Specific Harmonization Pre-Processing ----------------------------------- 71 | 72 | # Generic Harmonization --------------------------------------------------- 73 | df = d4com.harmonize_common(df, **kwargs) 74 | 75 | # Specific Harmonization Post-Processing ---------------------------------- 76 | df['D4_DataType_'] = 'autoruns' 77 | df['D4_Tool_'] = 'autoruns' 78 | if not hostname == None: 79 | df['D4_Hostname_'] = hostname 80 | 81 | # Signed_Verified_ column (boolean) - - - - - - - - - - - - - - - - - - - - 82 | signer_verifiedsr = df['Signer'].str.contains('^\\(Verified\\)') 83 | 84 | col = 'Signer' 85 | newcol = 'Signer_Verified_' 86 | 87 | colloc = df.columns.get_loc(col) 88 | newcolloc = colloc + 1 89 | if newcol not in df.columns: 90 | df.insert(newcolloc, newcol, "-") 91 | df[newcol] = signer_verifiedsr 92 | 93 | # Misc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 94 | df['Time'] = df['Time'].astype(str).replace('', np.NaN).astype('datetime64[ns]') 95 | 96 | return df 97 | 98 | # ANALYSIS FUNCTIONS ########################################################## 99 | 100 | # simple ====================================================================== 101 | 102 | def simple_func(df, *args, **kwargs): 103 | """ Reformat the input df so the data is presented to the analyst in the 104 | friendliest possible way 105 | 106 | Parameters: 107 | df (pd.dataframe): Input data 108 | 109 | Returns: 110 | pd.DataFrame: Optionally it will return the filtered dataframe, 111 | only if ret=True is set, constant & hidden columns included 112 | If ret_out=True is set, then the output just as it is shown 113 | (without constant/hidden columns) will be return 114 | """ 115 | 116 | 
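    # Added note: d4.debug is the library-wide debug level defined in
    # ds4n6_lib/d4.py (0 = disabled ... 5 = high detail); at level 2 and
    # above the library traces executed functions such as this one.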
if d4.debug >= 2: 117 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 118 | 119 | # Variables ---------------------------------------------------------------- 120 | hiddencols = ['MD5','SHA-1','PESHA-1','PESHA-256','SHA-256','RunspaceId','IMP'] 121 | 122 | # Maximum number of lines in DF for beautification 123 | maxdfbprintlines = 20 124 | 125 | # Call to simple_common ---------------------------------------------------- 126 | return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 127 | 128 | # analysis ==================================================================== 129 | def analysis(*args, **kwargs): 130 | """ Redirects execution to analysis_func() 131 | """ 132 | if d4.debug >= 2: 133 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 134 | 135 | return analysis_func(*args, **kwargs) 136 | 137 | 138 | def analysis_func(*args, **kwargs): 139 | """ Umbrella function that redirects to different types of analysis 140 | available on the input data 141 | 142 | Parameters: 143 | obj: Input data (typically DF or dict of DFs) 144 | 145 | Returns: 146 | pd.DataFrame: Refer to each specific analysis function 147 | """ 148 | 149 | if d4.debug >= 2: 150 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 151 | 152 | def syntax(): 153 | print('Syntax: analysis(obj, "analysis_type")\n') 154 | d4list("str-help") 155 | return 156 | 157 | def d4list(objtype): 158 | print("Available autoruns analysis types:") 159 | print("- find_powershell: Analyze data and find powershell") 160 | 161 | nargs = len(args) 162 | 163 | if nargs == 0: 164 | syntax() 165 | return 166 | 167 | obj = args[0] 168 | 169 | objtype = d4com.data_identify(obj) 170 | 171 | if isinstance(obj, str): 172 | if obj == "list": 173 | d4list(objtype) 174 | return 175 | if obj == "help": 176 | syntax() 177 | return 178 | 179 | if nargs == 1: 180 | syntax() 181 | return 182 | 183 | anltype = args[1] 184 | 185 | if not isinstance(anltype, str): 186 | syntax() 187 | return 188 | 189 | if anltype == "help": 190 | syntax() 191 | return 192 | elif anltype == "list": 193 | d4list(objtype) 194 | return 195 | 196 | if re.search("^pandas_dataframe-autoruns", objtype): 197 | if anltype == "find_powershell": 198 | return analysis_find_powershell(*args, **kwargs) 199 | else: 200 | print("ERROR: [autoruns] Unsupported input data.") 201 | return 202 | 203 | def analysis_find_powershell(obj, *args, **kwargs): 204 | """ Analysis that finds poweshell in the DF 205 | 206 | Args: 207 | obj: Input data (typically DF or dict of DFs) 208 | Returns: 209 | pandas.Dataframe with the results of the analysis 210 | 211 | """ 212 | if d4.debug >= 2: 213 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 214 | 215 | df = obj 216 | 217 | return df.xgrep("*", "powershell", "t" ).spl(out=True, ret=True) 218 | 219 | # DATAFRAME ACCESSOR ########################################################## 220 | 221 | @pd.api.extensions.register_dataframe_accessor("d4atrs") 222 | class Ds4n6AtrsAccessor: 223 | def __init__(self, pandas_obj): 224 | self._obj = pandas_obj 225 | 226 | def simple(self, *args, **kwargs): 227 | """ Redirects execution to simple_func() 228 | """ 229 | df = self._obj 230 | return simple_func(df, 
*args, **kwargs) 231 | 232 | @pd.api.extensions.register_dataframe_accessor("d4_autoruns") 233 | class Ds4n6AutorunsAccessor: 234 | def __init__(self, pandas_obj): 235 | self._obj = pandas_obj 236 | 237 | def simple(self, *args, **kwargs): 238 | """ Redirects execution to simple_func() 239 | """ 240 | df = self._obj 241 | return simple_func(df, *args, **kwargs) 242 | 243 | 244 | -------------------------------------------------------------------------------- /src/ds4n6_lib/kansa.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4ksa 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | import xmltodict 25 | import json 26 | import pickle 27 | from tqdm import tqdm 28 | import xml.etree.ElementTree as et 29 | 30 | # DS IMPORTS ------------------------------------------------------------------ 31 | import numpy as np 32 | import pandas as pd 33 | import matplotlib.pyplot as plt 34 | from IPython.display import display, Markdown, HTML 35 | 36 | from sklearn.model_selection import train_test_split 37 | from tensorflow.keras.models import Model, load_model 38 | from tensorflow.keras.layers import Input, Dense 39 | 40 | # DS4N6 IMPORTS --------------------------------------------------------------- 41 | import ds4n6_lib.d4 as d4 42 | import ds4n6_lib.common as d4com 43 | import ds4n6_lib.gui as d4gui 44 | import ds4n6_lib.utils as d4utl 45 | 46 | ############################################################################### 47 | # FUNCTIONS 48 | ############################################################################### 49 | 50 | # FILE READING FUNCTIONS ###################################################### 51 | 52 | def read_data(evdl, **kwargs): 53 | return d4com.read_data_common(evdl, **kwargs) 54 | 55 | # HARMONIZATION FUNCTIONS ##################################################### 56 | 57 | def harmonize(df, **kwargs): 58 | """ Convert DF in HAM format 59 | 60 | Args: 61 | df (pandas.DataFrame): DF to harmonize 62 | kwargs(dict): harmonize options 63 | Returns: 64 | pandas.DataFrame in HAM Format 65 | """ 66 | 67 | # (1) kansa will probably be invoked with 'tool=kansa', but kansa is in 68 | # reality an orchestrator, so we will only populate the D4_Tool_ column 69 | # to kansa only if we have not been able to determine what is the 70 | # underlying tool that kansa is using for execution in the endpoint 71 | 72 | # Specific Harmonization Pre-Processing ----------------------------------- 73 | if not 'D4_Orchestrator_' in df.columns: 74 | df.insert(0, 'D4_Orchestrator_', "kansa") 75 | else: 76 | # If the D4_Orchestrator_ col exists, we are in a recursive call 77 | return df 78 | 79 | objtype = d4com.data_identify(df) 80 | 81 | # Generic Harmonization --------------------------------------------------- 82 | 83 | # Since kansa 
is an orchestrator, let's try to identify the specific
84 |     # data type and apply the corresponding harmonization function.
85 |     # If we cannot, we will fall back to the generic one.
86 |     if "unknown" in objtype:
87 |         df = d4com.harmonize_common(df, **kwargs)
88 |     else:
89 |         # Let's try to harmonize this specific df
90 |         # WARNING: Since we no longer identify datatype by DF cols, this will
91 |         #          not work
92 |         df = d4com.harmonize(df)
93 | 
94 |     # Specific Harmonization Post-Processing ----------------------------------
95 |     df['D4_Hostname_'] = df['PSComputerName']
96 | 
97 |     if df['D4_Plugin_'].iloc[0] == "Tasklistv":
98 |         df['D4_DataType_'] = "pslist"
99 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
100 | 
101 |         # Rename columns
102 |         df = df.rename(columns={'ImageName': 'Name_', 'PID': 'PID_',
103 |                                 'SessionName': 'SessionName_',
104 |                                 'SessionNum': 'Session_', 'MemUsage': 'MemUsage_',
105 |                                 'Status': 'Status_', 'UserName': 'UserName_',
106 |                                 'CPUTime': 'CPUTime_', 'WindowTitle': 'WindowTitle_'
107 |                                 })
108 | 
109 |     elif df['D4_Plugin_'].iloc[0] == "SvcAll":
110 |         df['D4_DataType_'] = "svclist"
111 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
112 | 
113 |         # Rename columns
114 |         df = df.rename(columns={'Name': 'Name_', 'DisplayName': 'DisplayName_',
115 |                                 'PathName': 'FilePath_', 'StartName': 'UserName_',
116 |                                 'StartMode': 'StartMode_', 'State': 'State_',
117 |                                 'TotalSessions': 'TotalSessions_',
118 |                                 'Description': 'Description_'
119 |                                 })
120 | 
121 |     return df
122 | 
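# Example (illustrative sketch, not part of the library API): the plugin
# harmonization above is, at its core, a column-mapping exercise. The helper
# below shows the idea on a fabricated Tasklistv-style row; the input column
# names are real kansa/Tasklistv fields, but the function itself is invented
# for this demo.
def _demo_harmonize_tasklistv_sketch():
    demo = pd.DataFrame({'PSComputerName': ['HOST01'],
                         'ImageName': ['lsass.exe'],
                         'PID': ['652']})
    demo.insert(0, 'D4_Orchestrator_', "kansa")
    demo['D4_Hostname_'] = demo['PSComputerName']
    # Same renaming convention as harmonize(): tool columns -> HAM columns
    return demo.rename(columns={'ImageName': 'Name_', 'PID': 'PID_'})
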
"str-list" or re.search("^dict-pandas_dataframe-kansa", objtype): 178 | anlav = True 179 | print("- kansa_files: No.events kansa file (Input: ksadfs)") 180 | 181 | if anlav == False: 182 | print('- No analysis modules available for this object ('+objtype+').') 183 | 184 | nargs = len(args) 185 | 186 | if nargs == 0: 187 | syntax() 188 | return 189 | 190 | obj = args[0] 191 | 192 | objtype = d4com.data_identify(obj) 193 | 194 | if isinstance(obj, str): 195 | if obj == "list": 196 | d4list(objtype) 197 | return 198 | if obj == "help": 199 | syntax() 200 | return 201 | 202 | if nargs == 1: 203 | syntax() 204 | return 205 | 206 | anltype = args[1] 207 | 208 | if not isinstance(anltype, str): 209 | syntax() 210 | return 211 | 212 | if anltype == "help": 213 | syntax() 214 | return 215 | elif anltype == "list": 216 | d4list(objtype) 217 | return 218 | 219 | # ksadfs ------------------------------------------------------------------ 220 | if re.search("^dict-pandas_dataframe-kansa", objtype): 221 | if anltype == "kansa_files": 222 | return analysis_kansa_files(*args, **kwargs) 223 | 224 | print("INFO: [d4ksa] No analysis functions available for this data type ("+objtype+")") 225 | 226 | def analysis_kansa_files(*args, **kwargs): 227 | """ Analysis that gives kansa files 228 | 229 | Args: 230 | obj: Input data (typically DF or dict of DFs) 231 | Returns: 232 | pandas.Dataframe with the results of the analysis 233 | 234 | """ 235 | dfs = args[0] 236 | 237 | objtype = d4com.data_identify(dfs) 238 | 239 | if objtype != "dict-pandas_dataframe-kansa": 240 | print("ERROR: Invalid object for function: "+objtype) 241 | print(" Input object should be: dict-pandas_dataframe-kansa") 242 | return 243 | 244 | outdf = pd.DataFrame([],columns=['File','NEntries']) 245 | row = pd.Series() 246 | 247 | for key in dfs.keys(): 248 | row['File'] = key 249 | row['NEntries'] = len(dfs[key]) 250 | 251 | outdf = outdf.append(row,ignore_index=True) 252 | 253 | return outdf 254 | 255 | # DATAFRAME ACCESSOR ########################################################## 256 | 257 | @pd.api.extensions.register_dataframe_accessor("d4ksa") 258 | class Ds4n6KsaAccessor: 259 | def __init__(self, pandas_obj): 260 | self._obj = pandas_obj 261 | 262 | def simple(self, *args, **kwargs): 263 | """ Redirects execution to simple_func() 264 | """ 265 | df = self._obj 266 | return simple_func(df, *args, **kwargs) 267 | 268 | @pd.api.extensions.register_dataframe_accessor("d4_kansa") 269 | class Ds4n6KansaAccessor: 270 | def __init__(self, pandas_obj): 271 | self._obj = pandas_obj 272 | 273 | def simple(self, *args, **kwargs): 274 | """ Redirects execution to simple_func() 275 | """ 276 | df = self._obj 277 | return simple_func(df, *args, **kwargs) 278 | 279 | 280 | -------------------------------------------------------------------------------- /src/ds4n6_lib/volatility.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4vol 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS 
255 | # DATAFRAME ACCESSOR ##########################################################
256 | 
257 | @pd.api.extensions.register_dataframe_accessor("d4ksa")
258 | class Ds4n6KsaAccessor:
259 |     def __init__(self, pandas_obj):
260 |         self._obj = pandas_obj
261 | 
262 |     def simple(self, *args, **kwargs):
263 |         """ Redirects execution to simple_func()
264 |         """
265 |         df = self._obj
266 |         return simple_func(df, *args, **kwargs)
267 | 
268 | @pd.api.extensions.register_dataframe_accessor("d4_kansa")
269 | class Ds4n6KansaAccessor:
270 |     def __init__(self, pandas_obj):
271 |         self._obj = pandas_obj
272 | 
273 |     def simple(self, *args, **kwargs):
274 |         """ Redirects execution to simple_func()
275 |         """
276 |         df = self._obj
277 |         return simple_func(df, *args, **kwargs)
278 | 
279 | 
280 | 
--------------------------------------------------------------------------------
/src/ds4n6_lib/volatility.py:
--------------------------------------------------------------------------------
1 | # DS4N6
2 | #
3 | # Description: library of functions to apply Data Science in several forensics
4 | #              artifacts
5 | #
6 | 
7 | ###############################################################################
8 | # INFO
9 | ###############################################################################
10 | # Recommended "import as": d4vol
11 | 
12 | ###############################################################################
13 | # IMPORTS
14 | ###############################################################################
15 | 
16 | # DEV IMPORTS ----------------------------------------------------------------
17 | 
18 | # python IMPORTS --------------------------------------------------------------
19 | import os
20 | import glob
21 | import re
22 | import time
23 | import inspect
24 | import json
25 | import pickle
26 | 
27 | # DS IMPORTS ------------------------------------------------------------------
28 | import numpy as np
29 | import pandas as pd
30 | from IPython.display import display, Markdown, HTML
31 | 
32 | # DS4N6 IMPORTS ---------------------------------------------------------------
33 | import ds4n6_lib.d4 as d4
34 | import ds4n6_lib.common as d4com
35 | import ds4n6_lib.gui as d4gui
36 | import ds4n6_lib.utils as d4utl
37 | 
38 | ###############################################################################
39 | # FUNCTIONS
40 | ###############################################################################
41 | 
42 | # FILE READING FUNCTIONS ######################################################
43 | 
44 | def read_data(evdl, **kwargs):
45 |     """ Read data from files or a folder
46 | 
47 |         Args: 
48 |             evdl (str): path to file/folder source
49 |             kwargs: read options
50 |         Returns: 
51 |             pandas.Dataframe or dictionary of pandas.DataFrame
52 |     """
53 |     if d4.debug >= 3:
54 |         print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")
55 | 
56 |     return d4com.read_data_common(evdl, **kwargs)
57 | 
58 | # HARMONIZATION FUNCTIONS #####################################################
59 | 
60 | def harmonize(df, **kwargs):
61 |     """ Convert DF in HAM format
62 | 
63 |         Args: 
64 |             df (pandas.DataFrame): DF to harmonize
65 |             kwargs(dict): harmonize options
66 |         Returns: 
67 |             pandas.DataFrame in HAM Format
68 |     """
69 |     plugin   = kwargs.get('plugin',   None)
70 |     hostname = kwargs.get('hostname', None)
71 | 
72 |     # Specific Harmonization Pre-Processing -----------------------------------
73 |     if hostname is not None:
74 |         df['D4_Hostname_'] = hostname
75 |     if plugin is not None:
76 |         df['D4_Plugin_'] = plugin
77 |     if not df.index.empty and df.index[0] == ">":
78 |         df.reset_index(drop=True, inplace=True)
79 | 
80 |     # Generic Harmonization ---------------------------------------------------
81 |     df = d4com.harmonize_common(df, **kwargs)
82 | 
83 |     # Specific Harmonization Post-Processing ----------------------------------
84 | 
85 |     # pslist  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
86 |     if plugin == "pslist":
87 |         df['D4_DataType_'] = "pslist"
88 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
89 | 
90 |         # Rename columns
91 |         df = df.rename(columns={'Name': 'Name_', 'PID': 'PID_', 'PPID': 'PPID_',
92 |                                 'Thds': 'Threads_', 'Hnds': 'Handles_',
93 |                                 'Sess': 'Session_', 'Wow64': 'Wow64_',
94 |                                 'Start': 'Start_TStamp_', 'Exit': 'Exit_TStamp_'
95 |                                 })
96 | 
97 |         # Adjust data types (regex=True keeps the pattern semantics on pandas 2.x)
98 |         df['Session_'] = df['Session_'].str.replace('^--*$', '-1', regex=True)
99 |         df['Session_'] = df['Session_'].astype(int)
100 |         df['Handles_'] = df['Handles_'].str.replace('^--*$', '-1', regex=True)
101 |         df['Handles_'] = df['Handles_'].astype(int)
102 |         df['Start_TStamp_'] = pd.to_datetime(df['Start_TStamp_'])
103 |         df['Exit_TStamp_'] = pd.to_datetime(df['Exit_TStamp_'])
104 | 
105 |     # psscan  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
106 |     elif plugin == "psscan":
107 |         df['D4_DataType_'] = "pslist"
108 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
109 | 
110 |         # Rename columns
111 |         df = df.rename(columns={'Name': 'Name_', 'PID': 'PID_', 'PPID': 'PPID_',
112 |                                 'Time created': 'Start_TStamp_', 'Time exited': 'Exit_TStamp_'
113 |                                 })
114 | 
115 |         # Adjust data types
116 |         df['Start_TStamp_'] = pd.to_datetime(df['Start_TStamp_'])
117 |         df['Exit_TStamp_'] = pd.to_datetime(df['Exit_TStamp_'])
118 | 
119 |     # return ------------------------------------------------------------------
120 |     return df
121 | 
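# Example (illustrative sketch, not part of the library API): the data type
# adjustments above turn volatility's textual placeholders ("--") into
# sentinel integers and parse timestamps. The fabricated frame below shows
# the same cleanup with plain pandas.
def _demo_pslist_dtype_cleanup_sketch():
    demo = pd.DataFrame({'Session_': ['0', '--'],
                         'Start_TStamp_': ['2017-01-01 10:00:00', '2017-01-01 10:05:00']})
    demo['Session_'] = demo['Session_'].str.replace('^--*$', '-1', regex=True).astype(int)
    demo['Start_TStamp_'] = pd.to_datetime(demo['Start_TStamp_'])
    return demo
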
122 | # ANALYSIS FUNCTIONS ##########################################################
123 | 
124 | # simple ======================================================================
125 | def simple_func(df, *args, **kwargs):
126 |     """ Reformat the input df so the data is presented to the analyst in the
127 |         friendliest possible way
128 | 
129 |     Parameters:
130 |         df (pd.dataframe): Input data
131 | 
132 |     Returns:
133 |         pd.DataFrame: Optionally it will return the filtered dataframe,
134 |             only if ret=True is set, constant & hidden columns included
135 |             If ret_out=True is set, then the output, just as it is shown
136 |             (without constant/hidden columns), will be returned
137 |     """
138 | 
139 |     if d4.debug >= 4:
140 |         print("DEBUG: [vol] [simple_func()]")
141 | 
142 |     # Variables ----------------------------------------------------------------
143 |     hiddencols = []
144 | 
145 |     # Maximum number of lines in DF for beautification
146 |     maxdfbprintlines = 20
147 | 
148 |     # Call to simple_common ----------------------------------------------------
149 |     return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)
150 | 
151 | # analysis() ==================================================================
152 | def analysis(*args, **kwargs):
153 |     """ Redirects execution to analysis_func()
154 |     """
155 |     return analysis_func(*args, **kwargs)
156 | 
157 | def analysis_func(*args, **kwargs):
158 |     """ Umbrella function that redirects to different types of analysis
159 |         available on the input data
160 | 
161 |     Parameters:
162 |         obj: Input data (typically DF or dict of DFs)
163 | 
164 |     Returns:
165 |         pd.DataFrame: Refer to each specific analysis function
166 |     """
167 |     def syntax():
168 |         print('Syntax: analysis(obj, "analysis_type")\n')
169 |         d4list("str-help")
170 |         return
171 | 
172 |     def d4list(objtype):
173 |         anlav = False
174 |         print("Available volatility analysis types:")
175 |         if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-volatility", objtype):
176 |             anlav = True
177 |             print("- volatility_files: Number of events per volatility file (Input: voldfs)")
178 |         if anlav == False:
179 |             print('- No analysis modules available for this object ('+objtype+').')
180 | 
181 |     nargs = len(args)
182 | 
183 |     if nargs == 0:
184 |         syntax()
185 |         return
186 | 
187 |     obj = args[0]
188 | 
189 |     objtype = d4com.data_identify(obj)
190 | 
191 |     if isinstance(obj, str):
192 |         if obj == "list":
193 |             d4list(objtype)
194 |             return
195 |         if obj == "help":
196 |             syntax()
197 |             return
198 | 
199 |     if nargs == 1:
200 |         syntax()
201 |         return
202 | 
203 |     anltype = args[1]
204 | 
205 |     if not isinstance(anltype, str):
206 |         syntax()
207 |         return
208 | 
209 |     if anltype == "help":
210 |         syntax()
211 |         return
212 |     elif anltype == "list":
213 |         d4list(objtype)
214 |         return
215 | 
216 |     # voldfs ------------------------------------------------------------------
217 |     if re.search("^dict-pandas_dataframe-volatility", objtype):
218 |         if anltype == "volatility_files":
219 |             return analysis_volatility_files(*args, **kwargs)
220 | 
221 |     print("INFO: [d4vol] No analysis functions available for this data type ("+objtype+")")
222 | 
223 | # ANALYSIS FUNCTIONS ==========================================================
224 | 
225 | def analysis_volatility_files(*args, **kwargs):
226 |     """ Analysis that counts the number of entries in each volatility file
227 | 
228 |     Args:
229 |         obj: Input data (typically DF or dict of DFs)
230 |     Returns:
231 |         pandas.Dataframe with the results of the analysis
232 | 
233 |     """
234 |     dfs = args[0]
235 | 
236 |     objtype = d4com.data_identify(dfs)
237 | 
238 |     if not re.search("^dict-pandas_dataframe-volatility", objtype):
239 |         print("ERROR: Invalid object for function: "+objtype)
240 |         print("       Input object should be:     dict-pandas_dataframe-volatility")
241 |         return
242 | 
243 |     # Build one row per file inside the loop (the original append() call sat
244 |     # outside the loop, so only the last file was counted)
245 |     rows = []
246 |     for key in dfs.keys():
247 |         rows.append({'VolFile': key, 'NEntries': len(dfs[key])})
248 | 
249 |     outdf = pd.DataFrame(rows, columns=['NEntries', 'VolFile'])
250 |     outdf = outdf.sort_values(by=['VolFile']).reset_index(drop=True)
251 | 
252 |     return outdf
253 | 
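# Example (illustrative sketch, not part of the library API): the accessors
# below use pandas' extension API, which attaches a custom namespace to every
# DataFrame. A minimal standalone version, with an invented "d4demo"
# namespace, would look like this:
#
#   @pd.api.extensions.register_dataframe_accessor("d4demo")
#   class DemoAccessor:
#       def __init__(self, pandas_obj):
#           self._obj = pandas_obj
#
#       def ncols(self):
#           return len(self._obj.columns)
#
#   pd.DataFrame({'a': [1]}).d4demo.ncols()   # -> 1
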
254 | # DATAFRAME ACCESSOR ##########################################################
255 | 
256 | @pd.api.extensions.register_dataframe_accessor("d4vol")
257 | class Ds4n6VolAccessor:
258 |     def __init__(self, pandas_obj):
259 |         self._obj = pandas_obj
260 | 
261 |     def simple(self, *args, **kwargs):
262 |         """ Redirects execution to simple_func()
263 |         """
264 |         df = self._obj
265 |         return simple_func(df, *args, **kwargs)
266 | 
267 | @pd.api.extensions.register_dataframe_accessor("d4_volatility")
268 | class Ds4n6VolatilityAccessor:
269 |     def __init__(self, pandas_obj):
270 |         self._obj = pandas_obj
271 | 
272 |     def simple(self, *args, **kwargs):
273 |         """ Redirects execution to simple_func()
274 |         """
275 |         df = self._obj
276 |         return simple_func(df, *args, **kwargs)
277 | 
278 | 
--------------------------------------------------------------------------------
/src/ds4n6_lib/mactime.py:
--------------------------------------------------------------------------------
1 | # DS4N6
2 | #
3 | # Description: library of functions to apply Data Science in several forensics
4 | #              artifacts
5 | #
6 | 
7 | ###############################################################################
8 | # INFO
9 | ###############################################################################
10 | # Recommended "import as": d4mctm
11 | 
12 | ###############################################################################
13 | # IMPORTS
14 | ###############################################################################
15 | 
16 | # DEV IMPORTS ----------------------------------------------------------------
17 | 
18 | # python IMPORTS --------------------------------------------------------------
19 | import os
20 | import glob
21 | import re
22 | import time
23 | import inspect
24 | import xmltodict
25 | import json
26 | import pickle
27 | from tqdm import tqdm
28 | import xml.etree.ElementTree as et
29 | 
30 | # DS IMPORTS ------------------------------------------------------------------
31 | import numpy as np
32 | import pandas as pd
33 | import matplotlib.pyplot as plt
34 | from IPython.display import display, Markdown, HTML
35 | 
36 | from sklearn.model_selection import train_test_split
37 | from tensorflow.keras.models import Model, load_model
38 | from tensorflow.keras.layers import Input, Dense
39 | 
40 | # DS4N6 IMPORTS ---------------------------------------------------------------
41 | import ds4n6_lib.d4 as d4
42 | import ds4n6_lib.common as d4com
43 | import ds4n6_lib.gui as d4gui
44 | import ds4n6_lib.utils as d4utl
45 | 
46 | 
############################################################################### 47 | # IDEAS 48 | ############################################################################### 49 | # is_deleted() 50 | # is_file() 51 | # is_dir() / is_folder() - level 52 | # ext() # filter by Extension 53 | # nofn # exclude $FILE_NAME entries 54 | 55 | ############################################################################### 56 | # FUNCTIONS 57 | ############################################################################### 58 | 59 | # FILE READING FUNCTIONS ###################################################### 60 | 61 | # FILE READING FUNCTIONS ###################################################### 62 | 63 | def read_data(evdl, **kwargs): 64 | if d4.debug >= 3: 65 | print("DEBUG: [mctm] read_data") 66 | 67 | return d4com.read_data_common(evdl, **kwargs) 68 | 69 | # HARMONIZATION FUNCTIONS ##################################################### 70 | 71 | def harmonize(df, **kwargs): 72 | """ Convert DF in HAM format 73 | 74 | Args: 75 | df (pandas.DataFrame): DF to harmonize 76 | kwargs(dict): harmonize options 77 | Returns: 78 | pandas.DataFrame in HAM Format 79 | """ 80 | objtype = d4com.data_identify(df) 81 | 82 | if objtype == "pandas_dataframe-mactime-raw": 83 | # Specific Harmonization Pre-Processing ----------------------------------- 84 | df = df.rename(columns={"Type": "MACB"}) 85 | 86 | df['Type_'] = df['Mode'].str.extract('^(.)') 87 | df['PrevType_'] = df['Mode'].str.extract('^..(.)') 88 | df['Permissions_'] = df['Mode'].str.extract('^...(.........)') 89 | 90 | # Deleted / Reallocated 91 | df['Deleted_'] = df['File Name'].str.contains(r'\ \(deleted\)$|\ \(deleted-reallocated\)$') 92 | df['Reallocated_'] = df['File Name'].str.contains(r'\ \(deleted-reallocated\)$') 93 | 94 | # [FT] Tag -> Tag_ | DriveLetter_ | VSS_ | EVOName_ | EvidenceName_ | Partition_ | FilePath_ 95 | # FT 96 | if re.search(r'^[A-Z]\[vss[0-9][0-9]\]{.*}:', df['File Name'].iloc[0]): 97 | fncolsdf = df['File Name'].str.split(":", 1, expand=True).rename(columns={0: "Tag_", 1: "FilePath_"}) 98 | fncolsdf['FilePath-Hash_'] = fncolsdf['FilePath_'].str.lower().apply(hash) 99 | fncolsdf['FSType_'] = '-' 100 | df['Hostname_'] = '-' 101 | df['SHA256_Hash_'] = '-' 102 | 103 | fncols2df = fncolsdf['Tag_'].str.extract(r'([A-Z])\[vss(.*)\]{(.*)}', expand=True).rename(columns={0: "DriveLetter_", 1: "VSS_", 2: "EVOName_"}) 104 | fncols2df['VSS_'] = fncols2df['VSS_'].astype(int) 105 | 106 | fncols3df = fncols2df['EVOName_'].str.extract('(.*)-ft-p(.*)', expand=True).rename(columns={0: "EvidenceName_", 1: "Partition_"}) 107 | fncols3df['Partition_'] = fncols3df['Partition_'].astype(int) 108 | 109 | df = pd.concat([df, fncols2df, fncols3df, fncolsdf], axis=1) 110 | 111 | else: 112 | fncolsdf = df['File Name'].str.split(":", 1, expand=True).rename(columns={0: "Tag_", 1: "FilePath_"}) 113 | df = pd.concat([df, fncolsdf], axis=1) 114 | df['Hostname_'] = '-' 115 | df['EVOName_'] = '-' 116 | df['EvidenceName_'] = '-' 117 | df['Partition_'] = '-' 118 | df['FSType_'] = '-' 119 | df['DriveLetter_'] = '-' 120 | df['VSS_'] = '-' 121 | df['TSNTFSAttr_'] = '-' 122 | df['SHA256_Hash_'] = '-' 123 | 124 | # Deal with "($FILE_NAME)" string 125 | tsntfsattrmap = {True: 'FILE_NAME', False: 'STD_INFO'} 126 | df['TSNTFSAttr_'] = df['FilePath_'].str.contains(r'\ \(\$FILE_NAME\)$').map(tsntfsattrmap) 127 | df['FilePath_'] = df['FilePath_'].str.replace(r'\ \(\$FILE_NAME\)$','') 128 | 129 | df['FilePath_'] = df['FilePath_'].str.replace(r'\ \(deleted\)$|\ 
\(deleted-reallocated\)$','') 130 | 131 | # Generic Harmonization --------------------------------------------------- 132 | df = d4com.harmonize_common(df, **kwargs) 133 | 134 | # Specific Harmonization Post-Processing ---------------------------------- 135 | 136 | return df 137 | 138 | # CORE FUNCTIONS (simple, analysis, etc.) ##################################### 139 | 140 | # simple ====================================================================== 141 | 142 | def simple_func(df, *args, **kwargs): 143 | """ Reformat the input df so the data is presented to the analyst in the 144 | friendliest possible way 145 | 146 | Parameters: 147 | df (pd.dataframe): Input data 148 | 149 | Returns: 150 | pd.DataFrame: Optionally it will return the filtered dataframe, 151 | only if ret=True is set, constant & hidden columns included 152 | If ret_out=True is set, then the output just as it is shown 153 | (without constant/hidden columns) will be return 154 | """ 155 | 156 | if d4.debug >= 3: 157 | print("DEBUG: [mctm] [simple_func()]") 158 | 159 | windows = kwargs.get('windows', True) 160 | 161 | # Variables ---------------------------------------------------------------- 162 | hiddencols = ['File_Name', 'FilePath-Hash_', 'SHA256_Hash_'] 163 | 164 | if windows : 165 | nonwincols = ['UID', 'GID', 'Mode', 'Permissions_'] 166 | hiddencols = hiddencols + nonwincols 167 | 168 | # Maximum number of lines in DF for beautification 169 | maxdfbprintlines = 20 170 | 171 | # Call to simple_common ---------------------------------------------------- 172 | return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 173 | 174 | 175 | # DATAFRAME ACCESSOR ########################################################## 176 | 177 | @pd.api.extensions.register_dataframe_accessor("d4mctm") 178 | class Ds4n6MctmAccessor: 179 | def __init__(self, pandas_obj): 180 | self._obj = pandas_obj 181 | 182 | def simple(self, *args, **kwargs): 183 | """ Redirects execution to simple_func() 184 | """ 185 | df = self._obj 186 | return simple_func(df, *args, **kwargs) 187 | 188 | @pd.api.extensions.register_dataframe_accessor("d4_mactime") 189 | class Ds4n6MactimeAccessor: 190 | def __init__(self, pandas_obj): 191 | self._obj = pandas_obj 192 | 193 | def simple(self, *args, **kwargs): 194 | """ Redirects execution to simple_func() 195 | """ 196 | df = self._obj 197 | return simple_func(df, *args, **kwargs) 198 | 199 | # ANALYSIS #################################################################### 200 | 201 | # analysis() function ========================================================= 202 | def analysis(*args, **kwargs): 203 | """ Redirects execution to analysis_func() 204 | """ 205 | return analysis_func(*args, **kwargs) 206 | 207 | def analysis_func(*args, **kwargs): 208 | """ Umbrella function that redirects to different types of analysis 209 | available on the input data 210 | 211 | Parameters: 212 | obj: Input data (typically DF or dict of DFs) 213 | 214 | Returns: 215 | pd.DataFrame: Refer to each specific analysis function 216 | """ 217 | 218 | def syntax(): 219 | print('Syntax: analysis(obj, "analysis_type")\n') 220 | d4list("str-help") 221 | return 222 | 223 | def d4list(objtype): 224 | 225 | # Analysis Modules Available for this objective 226 | # anlav = False 227 | print("Available fstl analysis types:") 228 | print("- No analysis functions defined yet.") 229 | return 230 | 231 | # TEMPLATE 232 | #if objtype == "str-help" or objtype == "str-list" or 
re.search("^pandas_dataframe-fstl-mactime-standard", objtype): 233 | # anlav = True 234 | # print("- XXXXXXXXXX: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX (Input: fstldf)") 235 | 236 | # if anlav == False: 237 | # print('- No analysis modules available for this object ('+objtype+').') 238 | 239 | nargs = len(args) 240 | 241 | if nargs == 0: 242 | syntax() 243 | return 244 | 245 | obj = args[0] 246 | 247 | objtype = d4com.data_identify(obj) 248 | 249 | if isinstance(obj, str): 250 | if obj == "list": 251 | d4list(objtype) 252 | return 253 | if obj == "help": 254 | syntax() 255 | return 256 | 257 | if nargs == 1: 258 | syntax() 259 | return 260 | 261 | anltype = args[1] 262 | 263 | if not isinstance(anltype, str): 264 | syntax() 265 | return 266 | 267 | if anltype == "help": 268 | syntax() 269 | return 270 | elif anltype == "list": 271 | d4list(objtype) 272 | return 273 | 274 | # TEMPLATE 275 | # If object is a dict of dfs 276 | #elif re.search("^pandas_dataframe-evtx_file_df", objtype): 277 | # if anltype == "XXXXXXXXXXX": 278 | # return XXXXXXXXXXXXXXXXXXXXX(*args, **kwargs) 279 | #else: 280 | # print("ERROR: [fstl] Unsupported input data.") 281 | # return 282 | 283 | -------------------------------------------------------------------------------- /src/ds4n6_lib/fstl.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4fstl 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | 25 | import xmltodict 26 | import json 27 | import pickle 28 | from tqdm import tqdm 29 | import xml.etree.ElementTree as et 30 | 31 | # DS IMPORTS ------------------------------------------------------------------ 32 | import numpy as np 33 | import pandas as pd 34 | import matplotlib.pyplot as plt 35 | from IPython.display import display, Markdown, HTML 36 | 37 | from sklearn.model_selection import train_test_split 38 | from tensorflow.keras.models import Model, load_model 39 | from tensorflow.keras.layers import Input, Dense 40 | 41 | # DS4N6 IMPORTS --------------------------------------------------------------- 42 | import ds4n6_lib.d4 as d4 43 | import ds4n6_lib.common as d4com 44 | import ds4n6_lib.gui as d4gui 45 | import ds4n6_lib.utils as d4utl 46 | 47 | ############################################################################### 48 | # IDEAS 49 | ############################################################################### 50 | # is_deleted() 51 | # is_file() 52 | # is_dir() / is_folder() - level 53 | # ext() # filter by Extension 54 | # nofn # exclude $FILE_NAME entries 55 | 56 | ############################################################################### 57 | # FUNCTIONS 58 | ############################################################################### 59 | # FILE READING FUNCTIONS 
###################################################### 60 | 61 | def read_data(evdl, **kwargs): 62 | return d4com.read_data_common(evdl, **kwargs) 63 | 64 | # CORE FUNCTIONS (simple, analysis, etc.) ##################################### 65 | # simple ====================================================================== 66 | def simple_func(df, *args, **kwargs): 67 | """ Reformat the input df so the data is presented to the analyst in the 68 | friendliest possible way 69 | 70 | Parameters: 71 | df (pd.dataframe): Input data 72 | 73 | Returns: 74 | pd.DataFrame: Optionally it will return the filtered dataframe, 75 | only if ret=True is set, constant & hidden columns included 76 | If ret_out=True is set, then the output just as it is shown 77 | (without constant/hidden columns) will be return 78 | """ 79 | if d4.debug >= 4: 80 | print("DEBUG: [fstl] [simple_func()]") 81 | 82 | # Variables ---------------------------------------------------------------- 83 | hiddencols = [] 84 | 85 | # Maximum number of lines in DF for beautification 86 | maxdfbprintlines = 20 87 | 88 | # Call to simple_common ---------------------------------------------------- 89 | return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 90 | 91 | # DATAFRAME ACCESSOR ########################################################## 92 | 93 | @pd.api.extensions.register_dataframe_accessor("d4fstl") 94 | class Ds4n6FSTLAccessor: 95 | def __init__(self, pandas_obj): 96 | self._obj = pandas_obj 97 | 98 | def simple(self, *args, **kwargs): 99 | """ Redirects execution to simple_func() 100 | """ 101 | df = self._obj 102 | return simple_func(df, *args, **kwargs) 103 | 104 | def nofn(self): 105 | return self._obj[~self._obj['FileName'].str.contains(r"\ \(\$FILE_NAME\)")] 106 | 107 | def is_deleted(self): 108 | if 'FileName' in self._obj.columns: 109 | return self._obj[self._obj['FileName'].str.contains(r"\ \(deleted\)$")] 110 | elif 'Deleted_' in self._obj.columns: 111 | return self._obj.query('Deleted_ == True') 112 | 113 | def is_file(self): 114 | return self._obj.query('Type_ == "r" | PrevType_ == "r"') 115 | 116 | def is_dir(self,level=0): 117 | return self._obj.query('Type_ == "d"') 118 | 119 | # Same as is_dir() 120 | def is_directory(self,level=0): 121 | return self._obj.query('Type_ == "d"') 122 | 123 | # Same as is_dir() 124 | def is_folder(self,level=0): 125 | return self._obj.query('Type_ == "d"') 126 | 127 | def ext(self,ext): 128 | return self._obj[self._obj['FileName'].str.contains(r"\."+ext+"$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(deleted\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)\ \(deleted\)$")] 129 | 130 | def just_basename(self): 131 | return self._obj['FileName'].str.replace('.*/','') 132 | 133 | def ts_m(self,exclusive=False): 134 | if exclusive == False: 135 | return self._obj.query(r'MACB.str.contains("^m...$")',engine="python") 136 | else: 137 | return self._obj.query(r'MACB.str.contains("^m\.\.\.$")',engine="python") 138 | 139 | def ts_a(self,exclusive=False): 140 | if exclusive == False: 141 | return self._obj.query('MACB.str.contains("^.a..$")',engine="python") 142 | else: 143 | return self._obj.query(r'MACB.str.contains("^\.a\.\.$")',engine="python") 144 | 145 | def ts_c(self,exclusive=False): 146 | if exclusive == False: 147 | return self._obj.query('MACB.str.contains("^..c.$")',engine="python") 148 | else: 149 | return 
self._obj.query(r'MACB.str.contains("^\.\.c\.$")',engine="python") 150 | 151 | def ts_b(self,exclusive=False): 152 | if exclusive == False: 153 | return self._obj.query('MACB.str.contains("^...b$")',engine="python") 154 | else: 155 | return self._obj.query(r'MACB.str.contains("^\.\.\.b$")',engine="python") 156 | 157 | @pd.api.extensions.register_dataframe_accessor("d4_fstl") 158 | class Ds4n6_FSTLAccessor: 159 | def __init__(self, pandas_obj): 160 | self._obj = pandas_obj 161 | 162 | def simple(self, *args, **kwargs): 163 | """ Redirects execution to simple_func() 164 | """ 165 | df = self._obj 166 | return simple_func(df, *args, **kwargs) 167 | 168 | def nofn(self): 169 | return self._obj[~self._obj['FileName'].str.contains(r"\ \(\$FILE_NAME\)")] 170 | 171 | def is_deleted(self): 172 | if 'FileName' in self._obj.columns: 173 | return self._obj[self._obj['FileName'].str.contains(r"\ \(deleted\)$")] 174 | elif 'Deleted_' in self._obj.columns: 175 | return self._obj.query('Deleted_ == True') 176 | 177 | def is_file(self): 178 | return self._obj.query('Type_ == "r" | PrevType_ == "r"') 179 | 180 | def is_dir(self,level=0): 181 | return self._obj.query('Type_ == "d"') 182 | 183 | # Same as is_dir() 184 | def is_directory(self,level=0): 185 | return self._obj.query('Type_ == "d"') 186 | 187 | # Same as is_dir() 188 | def is_folder(self,level=0): 189 | return self._obj.query('Type_ == "d"') 190 | 191 | def ext(self,ext): 192 | return self._obj[self._obj['FileName'].str.contains(r"\."+ext+"$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(deleted\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)\ \(deleted\)$")] 193 | 194 | def just_basename(self): 195 | return self._obj['FileName'].str.replace('.*/','') 196 | 197 | def ts_m(self,exclusive=False): 198 | if exclusive == False: 199 | return self._obj.query('MACB.str.contains("^m...$")',engine="python") 200 | else: 201 | return self._obj.query(r'MACB.str.contains("^m\.\.\.$")',engine="python") 202 | 203 | def ts_a(self,exclusive=False): 204 | if exclusive == False: 205 | return self._obj.query('MACB.str.contains("^.a..$")',engine="python") 206 | else: 207 | return self._obj.query(r'MACB.str.contains("^\.a\.\.$")',engine="python") 208 | 209 | def ts_c(self,exclusive=False): 210 | if exclusive == False: 211 | return self._obj.query('MACB.str.contains("^..c.$")',engine="python") 212 | else: 213 | return self._obj.query(r'MACB.str.contains("^\.\.c\.$")',engine="python") 214 | 215 | def ts_b(self,exclusive=False): 216 | if exclusive == False: 217 | return self._obj.query('MACB.str.contains("^...b$")',engine="python") 218 | else: 219 | return self._obj.query(r'MACB.str.contains("^\.\.\.b$")',engine="python") 220 | 221 | # ANALYSIS #################################################################### 222 | 223 | # analysis() function ========================================================= 224 | def analysis(*args, **kwargs): 225 | """ Redirects execution to analysis_func() 226 | """ 227 | return analysis_func(*args, **kwargs) 228 | 229 | def analysis_func(*args, **kwargs): 230 | """ Umbrella function that redirects to different types of analysis 231 | available on the input data 232 | 233 | Parameters: 234 | obj: Input data (typically DF or dict of DFs) 235 | 236 | Returns: 237 | pd.DataFrame: Refer to each specific analysis function 238 | """ 239 | 240 | def syntax(): 241 | print('Syntax: analysis(obj, "analysis_type")\n') 242 | d4list("str-help") 243 | 
return 244 | 245 | def d4list(objtype): 246 | 247 | # Analysis Modules Available for this objective 248 | # anlav = False 249 | print("Available fstl analysis types:") 250 | print("- No analysis functions defined yet.") 251 | return 252 | 253 | # TEMPLATE 254 | #if objtype == "str-help" or objtype == "str-list" or re.search("^pandas_dataframe-fstl-mactime-standard", objtype): 255 | # anlav = True 256 | # print("- XXXXXXXXXX: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX (Input: fstldf)") 257 | 258 | # if anlav == False: 259 | # print('- No analysis modules available for this object ('+objtype+').') 260 | 261 | nargs = len(args) 262 | 263 | if nargs == 0: 264 | syntax() 265 | return 266 | 267 | obj = args[0] 268 | 269 | objtype = d4com.data_identify(obj) 270 | 271 | if isinstance(obj, str): 272 | if obj == "list": 273 | d4list(objtype) 274 | return 275 | if obj == "help": 276 | syntax() 277 | return 278 | 279 | if nargs == 1: 280 | syntax() 281 | return 282 | 283 | anltype = args[1] 284 | 285 | if not isinstance(anltype, str): 286 | syntax() 287 | return 288 | 289 | if anltype == "help": 290 | syntax() 291 | return 292 | elif anltype == "list": 293 | d4list(objtype) 294 | return 295 | 296 | # TEMPLATE 297 | # If object is a dict of dfs 298 | #elif re.search("^pandas_dataframe-evtx_file_df", objtype): 299 | # if anltype == "XXXXXXXXXXX": 300 | # return XXXXXXXXXXXXXXXXXXXXX(*args, **kwargs) 301 | #else: 302 | # print("ERROR: [fstl] Unsupported input data.") 303 | # return 304 | 305 | 306 | -------------------------------------------------------------------------------- /src/ds4n6_lib/pslist.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4pslst 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | import json 25 | import pickle 26 | 27 | # DS IMPORTS ------------------------------------------------------------------ 28 | import numpy as np 29 | import pandas as pd 30 | from IPython.display import display, Markdown, HTML 31 | 32 | # DS4N6 IMPORTS --------------------------------------------------------------- 33 | import ds4n6_lib.d4 as d4 34 | import ds4n6_lib.common as d4com 35 | import ds4n6_lib.gui as d4gui 36 | import ds4n6_lib.utils as d4utl 37 | import ds4n6_lib.unx as d4unx 38 | from ds4n6_lib.knowledge import critical_processes, boot_start_processes, process_parents 39 | 40 | ############################################################################### 41 | # FUNCTIONS 42 | ############################################################################### 43 | 44 | # ANALYSIS FUNCTIONS ########################################################## 45 | 46 | # simple ====================================================================== 47 | def simple_func(df, *args, **kwargs): 48 | """ Reformat the input df so the data is 
presented to the analyst in the
49 |         friendliest possible way
50 | 
51 |     Parameters:
52 |         df (pd.dataframe): Input data
53 | 
54 |     Returns:
55 |         pd.DataFrame: Optionally it will return the filtered dataframe,
56 |             only if ret=True is set, constant & hidden columns included
57 |             If ret_out=True is set, then the output, just as it is shown
58 |             (without constant/hidden columns), will be returned
59 |     """
60 | 
61 |     if d4.debug >= 3:
62 |         print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")
63 | 
64 |     # Variables ----------------------------------------------------------------
65 |     hiddencols = []
66 | 
67 |     # Maximum number of lines in DF for beautification
68 |     maxdfbprintlines = 20
69 | 
70 |     # Call to simple_common ----------------------------------------------------
71 |     return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)
72 | 
73 | # analysis() ==================================================================
74 | def analysis(*args, **kwargs):
75 |     """ Redirects execution to analysis_func()
76 |     """
77 |     return analysis_func(*args, **kwargs)
78 | 
79 | def analysis_func(*args, **kwargs):
80 |     """ Umbrella function that redirects to different types of analysis
81 |         available on the input data
82 | 
83 |     Parameters:
84 |         obj: Input data (typically DF or dict of DFs)
85 | 
86 |     Returns:
87 |         pd.DataFrame: Refer to each specific analysis function
88 |     """
89 |     def syntax():
90 |         print('Syntax: analysis(obj, "analysis_type")\n')
91 |         d4list("str-help")
92 |         return
93 | 
94 |     def d4list(objtype):
95 |         anlav = False
96 |         print("Available pslist analysis types:")
97 |         if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^pandas_dataframe-pslist-ham", objtype):
98 |             anlav = True
99 |             print("- process_stats: Show process statistics (Input: pslistdf)")
100 |             print("- unfrequent_processes: Identify unfrequent processes (Input: pslistdf)")
101 |             print("- boot_time_anomalies: Identify boot time process anomalies (Input: pslistdf)")
102 |             print("- parent_process_anomalies: Identify parent process anomalies (Input: pslistdf)")
103 |         if anlav == False:
104 |             print('- No analysis modules available for this object ('+objtype+').')
105 | 
106 |     if d4.debug >= 3:
107 |         print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")
108 | 
109 |     nargs = len(args)
110 | 
111 |     if nargs == 0:
112 |         syntax()
113 |         return
114 | 
115 |     obj = args[0]
116 | 
117 |     objtype = d4com.data_identify(obj)
118 | 
119 |     if isinstance(obj, str):
120 |         if obj == "list":
121 |             d4list(objtype)
122 |             return
123 |         if obj == "help":
124 |             syntax()
125 |             return
126 | 
127 |     if nargs == 1:
128 |         syntax()
129 |         return
130 | 
131 |     anltype = args[1]
132 | 
133 |     if not isinstance(anltype, str):
134 |         syntax()
135 |         return
136 | 
137 |     if anltype == "help":
138 |         syntax()
139 |         return
140 |     elif anltype == "list":
141 |         d4list(objtype)
142 |         return
143 | 
144 |     # pslistdf ----------------------------------------------------------------
145 |     if re.search("^pandas_dataframe-pslist-ham", objtype):
146 |         if anltype == "process_stats":
147 |             return analysis_process_stats(*args, **kwargs)
148 |         elif anltype == "unfrequent_processes":
149 |             return analysis_unfrequent_processes(*args, **kwargs)
150 |         elif anltype == "boot_time_anomalies":
151 |             return analysis_boot_time_anomalies(*args, **kwargs)
152 |         elif anltype == "parent_process_anomalies":
153 |             return 
analysis_parent_process_anomalies(*args, **kwargs) 154 | 155 | print("INFO: [d4pslst] No analysis functions available for this data type ("+objtype+")") 156 | 157 | # ANALYSIS FUNCTIONS ========================================================== 158 | 159 | def analysis_process_stats(*args, **kwargs): 160 | """ Show Process Statistics 161 | 162 | Args: 163 | obj: Input data (HAM process DF) 164 | Returns: 165 | pandas.Dataframe with the results of the analysis 166 | 167 | """ 168 | 169 | if d4.debug >= 3: 170 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 171 | 172 | # Argument parsing 173 | df = args[0] 174 | 175 | if 'Exit_TStamp_' in df.columns: 176 | print("Running:") 177 | display(df.query('Exit_TStamp_.isna()', engine="python")['Name_'].value_counts()) 178 | print("") 179 | print("Dead:") 180 | display(df.query('Exit_TStamp_.notna()', engine="python")['Name_'].value_counts()) 181 | print("") 182 | else: 183 | display(df['Name_'].value_counts()) 184 | 185 | def analysis_unfrequent_processes(*args, **kwargs): 186 | """ Analysis that find unfrequent processes 187 | 188 | Args: 189 | obj: Input data (typically DF or dict of DFs) 190 | Returns: 191 | pandas.Dataframe with the results of the analysis 192 | 193 | """ 194 | 195 | if d4.debug >= 3: 196 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 197 | 198 | # Argument parsing 199 | pslistdf = args[0] 200 | 201 | n = kwargs.get('n', 3) 202 | 203 | print("Threshold: "+str(n)) 204 | print("") 205 | 206 | pscntdf = pd.DataFrame(pslistdf['Name_'].value_counts()).reset_index().rename(columns={'Name_': 'Count', 'index': 'Name_'}) 207 | pscntdf['Count'] = pscntdf['Count'].astype(int) 208 | pscntndf = pscntdf.query('Count <= @n', engine="python") 209 | 210 | print("No. Processes with less than " + str(n) +" occurrences: " + str(len(pscntndf))) 211 | return pscntndf 212 | 213 | 214 | def analysis_boot_time_anomalies(*args, **kwargs): 215 | """ Find anomalies at boot time 216 | 217 | Parameters: 218 | pslistdf (pd.DataFrame): Dataframe with pslist info 219 | secs (int): Interval allowed for processes to start after boot 220 | 221 | Returns: 222 | pd.DataFrame: Processes that don't follow the standard start time pattern 223 | 224 | """ 225 | 226 | if d4.debug >= 3: 227 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 228 | 229 | # Argument parsing 230 | df = args[0] 231 | 232 | secs = kwargs.get('secs', 30) 233 | 234 | # Verify field requirements 235 | if not 'Start_TStamp_' in df.columns: 236 | print("ERROR: Cannot run analysis. Start_TStamp_ column not present.") 237 | return 238 | 239 | print("Min. 
Start Timestamp Processes:") 240 | display(df[df['Start_TStamp_'] == df['Start_TStamp_'].min()]) 241 | 242 | if 'Session_' in df.columns: 243 | bootps = df[df['Name_'].isin(boot_start_processes) & (df['Session_'] <= 1) & df['Exit_TStamp_'].isnull() ] 244 | else: 245 | bootps = df[df['Name_'].isin(boot_start_processes) & df['Exit_TStamp_'].isnull() ] 246 | 247 | return bootps[bootps['Start_TStamp_'] >= bootps['Start_TStamp_'].min() + pd.Timedelta(seconds=secs)] 248 | 249 | def analysis_parent_process_anomalies(*args, **kwargs): 250 | """ Find anomalies in parent processes 251 | 252 | Parameters: 253 | pslistdf (pd.DataFrame): Dataframe with pslist info 254 | critical_only (bool): Only critical process 255 | 256 | Returns: 257 | None 258 | """ 259 | 260 | if d4.debug >= 3: 261 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 262 | 263 | # Argument parsing 264 | df = args[0] 265 | 266 | critical_only = kwargs.get('critical_only', True) 267 | 268 | # Verify field requirements 269 | if not 'PPID_' in df.columns: 270 | print("ERROR: Cannot run analysis. PPID_ column not present.") 271 | return 272 | 273 | if 'Exit_TStamp_' in df.columns: 274 | df_alive = df[df['Exit_TStamp_'].isna()] 275 | else: 276 | df_alive = df 277 | 278 | hnpid = df_alive[['D4_Hostname_', 'Name_', 'PID_']] 279 | hnppid = df_alive[['D4_Hostname_', 'Name_', 'PPID_']] 280 | family_ext = pd.merge(hnppid, hnpid, left_on=['D4_Hostname_', 'PPID_'], right_on=['D4_Hostname_', 'PID_'], how='left').dropna() 281 | family = family_ext.drop(columns=['D4_Hostname_', 'PPID_', 'PID_']).rename(columns={'Name__x': 'Child', 'Name__y': 'Parent'}).reset_index().drop(columns=['index']) 282 | 283 | if critical_only : 284 | thisfamily = family.query('Child == @critical_processes') 285 | else: 286 | thisfamily = family 287 | 288 | family_unknown = pd.merge(thisfamily, process_parents, indicator=True, how='outer').query( '_merge=="left_only"').drop( '_merge', axis=1) 289 | 290 | display(family_unknown.groupby(["Child", "Parent"]).size().sort_values(ascending=False)) 291 | display(family_unknown) 292 | 293 | # DATAFRAME ACCESSOR ########################################################## 294 | 295 | @pd.api.extensions.register_dataframe_accessor("d4pslst") 296 | class Ds4n6PslstAccessor: 297 | def __init__(self, pandas_obj): 298 | self._obj = pandas_obj 299 | 300 | def simple(self, *args, **kwargs): 301 | """ Redirects execution to simple_func() 302 | """ 303 | df = self._obj 304 | return simple_func(df, *args, **kwargs) 305 | 306 | @pd.api.extensions.register_dataframe_accessor("d4_pslist") 307 | class Ds4n6PslistAccessor: 308 | def __init__(self, pandas_obj): 309 | self._obj = pandas_obj 310 | 311 | def simple(self, *args, **kwargs): 312 | """ Redirects execution to simple_func() 313 | """ 314 | df = self._obj 315 | return simple_func(df, *args, **kwargs) 316 | 317 | -------------------------------------------------------------------------------- /src/ds4n6_lib/macrobber.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4mcrb 10 | 11 | ############################################################################### 12 | 
# IMPORTS
13 | ###############################################################################
14 | 
15 | # python IMPORTS --------------------------------------------------------------
16 | import os
17 | import glob
18 | import re
19 | import time
20 | import inspect
21 | import pickle
22 | 
23 | # DS IMPORTS ------------------------------------------------------------------
24 | import numpy as np
25 | import pandas as pd
26 | import matplotlib.pyplot as plt
27 | 
28 | # DS4N6 IMPORTS ---------------------------------------------------------------
29 | import ds4n6_lib.d4 as d4
30 | import ds4n6_lib.common as d4com
31 | import ds4n6_lib.gui as d4gui
32 | import ds4n6_lib.utils as d4utl
33 | # Note: the former ds4n6_lib.unix module no longer exists; it was replaced by unx
34 | import ds4n6_lib.unx as d4unx
35 | 
36 | ###############################################################################
37 | # VARIABLES
38 | ###############################################################################
39 | 
40 | # Hidden columns in simple() function
41 | hiddencols =  [ 'MTStampEpoch_', 'MTStamp_', 'ATStampEpoch_', 'ATStamp_', 'CTStampEpoch_', 'CTStamp_', 'Meta_', 'FileStem_', 'ParentPath_', 'ParentName_', 'PathSeparator_', 'FilePath-Hash_', 'FileName-Hash_', 'FileStem-Hash_', 'ParentPath-Hash_', 'ParentName-Hash_']
42 | 
43 | ###############################################################################
44 | # FUNCTIONS
45 | ###############################################################################
46 | 
47 | # FILE READING FUNCTIONS ######################################################
48 | 
49 | def read_data(evdl, **kwargs):
50 |     """ Read data from files or a folder
51 | 
52 |         Args: 
53 |             evdl (str): path to file/folder source
54 |             kwargs: read options
55 |         Returns: 
56 |             pandas.Dataframe or dictionary of pandas.DataFrame
57 |     """
58 |     if d4.debug >= 3:
59 |         print("DEBUG: [macrobber-read_data()]")
60 | 
61 |     header_names = ['MD5', 'path', 'inode', 'mode_as_string', 'UID', 'GID', 'size', 'atime', 'mtime', 'ctime', 'block_size']
62 | 
63 |     kwargs['header_names'] = header_names
64 | 
65 |     return d4com.read_data_common(evdl, **kwargs)
66 | 
67 | # HARMONIZATION FUNCTIONS #####################################################
68 | 
69 | def harmonize(df, **kwargs):
70 |     """ Convert DF in HAM format
71 | 
72 |         Args: 
73 |             df (pandas.DataFrame): DF to harmonize
74 |             kwargs(dict): harmonize options
75 |         Returns: 
76 |             pandas.DataFrame in HAM Format
77 |     """
78 |     data_os         = kwargs.get('data_os',         None)
79 |     generate_hashes = kwargs.get('generate_hashes', True)
80 |     path_prefix     = kwargs.get('path_prefix',     None)
81 | 
82 |     # Specific Harmonization Pre-Processing ===================================
83 |     def remove_prefix(df, prefixregex):
84 |         if 'FilePath_' in df.columns:
85 |             df['FilePath_'] = df['FilePath_'].str.replace(prefixregex, '', regex=True)
86 |         return df
87 | 
88 |     # Harmonize to File_List_HAM
89 | 
90 |     # PathSeparator is tool-dependent, not only OS-dependent
91 |     pathsep = '/'
92 | 
93 |     df['MTStampEpoch_'] = df['mtime']
94 |     df['MTStamp_'] = pd.to_datetime(df['mtime'], errors = 'coerce', unit='s')
95 |     df['MTStampDate_'] = df['MTStamp_'].dt.date
96 |     df['MTStampTime_'] = df['MTStamp_'].dt.ceil(freq='s').dt.time
97 |     df['MTStampDoW_'] = df['MTStamp_'].dt.day_name()
98 |     df['ATStampEpoch_'] = df['atime']
99 |     df['ATStamp_'] = pd.to_datetime(df['atime'], errors = 'coerce', unit='s')
100 |     df['ATStampDate_'] = df['ATStamp_'].dt.date
101 |     df['ATStampTime_'] = df['ATStamp_'].dt.ceil(freq='s').dt.time
102 |     df['ATStampDoW_'] = df['ATStamp_'].dt.day_name()
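    # Every timestamp in this artifact follows the same recipe: keep the raw
    # epoch seconds (*Epoch_), parse them into a coerced datetime (*Stamp_),
    # and derive Date / Time / Day-of-Week convenience columns from it. For
    # example, epoch 1483264800 parses to 2017-01-01 10:00:00 (UTC), so the
    # derived date is 2017-01-01 and the day of week is "Sunday".
103 | 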
df['CTStampEpoch_'] = df['ctime'] 104 | df['CTStamp_'] = pd.to_datetime(df['ctime'], errors = 'coerce', unit='s') 105 | df['CTStampDate_'] = df['CTStamp_'].dt.date 106 | df['CTStampTime_'] = df['CTStamp_'].dt.ceil(freq='s').dt.time 107 | df['CTStampDoW_'] = df['CTStamp_'].dt.day_name() 108 | #df['BTStampEpoch_'] = df['btime'] 109 | #df['BTStamp_'] = pd.to_datetime(df['btime'], errors = 'coerce', unit='s') 110 | #df['BTStampDate_'] = df['BTStamp_'].dt.date 111 | #df['BTStampTime_'] = df['BTStamp_'].dt.ceil(freq='s').dt.time 112 | #df['BTStampDoW_'] = df['BTStamp_'].dt.day_name() 113 | df['Size_'] = df['size'].astype('int64') 114 | #df['Mode_'] = None 115 | if not data_os == "windows": 116 | df['UID_'] = df['UID'] 117 | if not data_os == "windows": 118 | df['GID_'] = df['GID'] 119 | df['Meta_'] = df['inode'] 120 | #df['File_Name'] = None 121 | df['Type_'] = df['mode_as_string'].str.extract('^(.)') 122 | #df['PrevType_'] = None 123 | if not data_os == "windows": 124 | df['Permissions_'] = df['mode_as_string'].str.replace('^.','').str.replace(r'\ .*$','') 125 | #df['Deleted_'] = None 126 | #df['Reallocated_'] = None 127 | #df['Hostname_'] = None 128 | if not df['MD5'].iloc[0] == 0: 129 | df['MD5_Hash_'] = df['MD5'] 130 | #df['SHA256_Hash_'] = None 131 | #df['DriveLetter_'] = None 132 | #df['VSS_'] = None 133 | #df['EVOName_' ] = None 134 | #df['EvidenceName_'] = None 135 | #df['Partition_'] = None 136 | #df['Tag_'] = None 137 | df['FilePath_'] = df['path'] 138 | if path_prefix is not None: 139 | df = remove_prefix(df, path_prefix) 140 | df['FileName_'] = df['FilePath_'].str.replace('.*'+pathsep,'') 141 | df['FileStem_'] = df['FileName_'].str.replace(r'\.[^\.]*$','') 142 | df['FileExtension_'] = df['FileName_'].str.replace(r'^[^\.]*$', '').str.replace(r'.*\.','').str.lower() 143 | df['ParentPath_'] = df['FilePath_'].str.replace('(.*)'+pathsep+'.*','\\1') 144 | df['ParentName_'] = df['ParentPath_'].str.replace('.*'+pathsep,'') 145 | df['PathSeparator_'] = pathsep 146 | #df['FSType_'] = None 147 | #df['TSNTFSAttr_'] = None 148 | 149 | 150 | # Path-Hash Fields - - - - - - - - - - - - - - - - - - - - - - - - - - - - 151 | if generate_hashes: 152 | df['FilePath-Hash_'] = df['FilePath_'].str.lower().apply(hash) 153 | df['FileName-Hash_'] = df['FileName_'].str.lower().apply(hash) 154 | df['FileStem-Hash_'] = df['FileStem_'].str.lower().apply(hash) 155 | df['ParentPath-Hash_'] = df['ParentPath_'].str.lower().apply(hash) 156 | df['ParentName-Hash_'] = df['ParentName_'].str.lower().apply(hash) 157 | 158 | # Generic Harmonization =================================================== 159 | df = d4com.harmonize_common(df, datatype='flist', **kwargs) 160 | 161 | # Specific Harmonization Post-Processing ================================== 162 | 163 | # return ================================================================== 164 | 165 | return df 166 | 167 | # ANALYSIS FUNCTIONS ########################################################## 168 | 169 | # simple ====================================================================== 170 | def simple_func(df, *args, **kwargs): 171 | """ Reformat the input df so the data is presented to the analyst in the 172 | friendliest possible way 173 | 174 | Parameters: 175 | df (pd.dataframe): Input data 176 | 177 | Returns: 178 | pd.DataFrame: Optionally it will return the filtered dataframe, 179 | only if ret=True is set, constant & hidden columns included 180 | If ret_out=True is set, then the output just as it is shown 181 | (without constant/hidden columns) will be return 
182 |     """
183 | 
184 |     if d4.debug >= 4:
185 |         print("DEBUG: [mcrb] [simple_func()]")
186 | 
187 |     # Artifact-specific argument parsing =======================================
188 | 
189 |     # Variables ================================================================
190 |     dfout = df
191 | 
192 |     # Maximum number of lines in DF for beautification
193 |     maxdfbprintlines = 20
194 | 
195 |     # Pre-Processing ==========================================================
196 | 
197 |     # Call to simple_common ===================================================
198 |     dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)
199 | 
200 |     # Post-Processing =========================================================
201 | 
202 |     # Return ==================================================================
203 |     return dfout
204 | 
205 | # analysis ====================================================================
206 | def analysis(*args, **kwargs):
207 |     """ Redirects execution to analysis_func()
208 |     """
209 |     return analysis_func(*args, **kwargs)
210 | 
211 | def analysis_func(*args, **kwargs):
212 |     """ Umbrella function that redirects to different types of analysis
213 |         available on the input data
214 | 
215 |     Parameters:
216 |         obj: Input data (typically DF or dict of DFs)
217 | 
218 |     Returns:
219 |         pd.DataFrame: Refer to each specific analysis function
220 |     """
221 | 
222 |     def syntax():
223 |         print('Syntax: analysis(obj, "analysis_type")\n')
224 |         d4list("str-help")
225 |         return
226 | 
227 |     def d4list(objtype):
228 | 
229 |         # Analysis Modules Available for this objective
230 |         anlav = False
231 |         print("Available macrobber analysis types:")
232 |         if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-macrobber", objtype):
233 |             anlav = True
234 |             print("- macrobber_files: Number of events per macrobber file (Input: macrobberdfs)")
235 | 
236 |         if anlav == False:
237 |             print('- No analysis modules available for this object ('+objtype+').')
238 | 
239 |     nargs = len(args)
240 | 
241 |     if nargs == 0:
242 |         syntax()
243 |         return
244 | 
245 |     obj = args[0]
246 | 
247 |     objtype = d4com.data_identify(obj)
248 | 
249 |     if isinstance(obj, str):
250 |         if obj == "list":
251 |             d4list(objtype)
252 |             return
253 |         if obj == "help":
254 |             syntax()
255 |             return
256 | 
257 |     if nargs == 1:
258 |         syntax()
259 |         return
260 | 
261 |     anltype = args[1]
262 | 
263 |     if not isinstance(anltype, str):
264 |         syntax()
265 |         return
266 | 
267 |     if anltype == "help":
268 |         syntax()
269 |         return
270 |     elif anltype == "list":
271 |         d4list(objtype)
272 |         return
273 | 
274 |     # ANALYSIS FUNCTIONS ======================================================
275 | 
276 |     # mcrbdfs ------------------------------------------------------------------
277 |     # if re.search("^dict-pandas_dataframe-macrobber", objtype):
278 |     #     if anltype == "macrobber_files":
279 |     #         return analysis_macrobber_files(*args, **kwargs)
280 | 
281 |     print("INFO: [d4mcrb] No analysis functions available for this data type ("+objtype+")")
282 | 
283 | # DATAFRAME ACCESSOR ##########################################################
284 | 
285 | @pd.api.extensions.register_dataframe_accessor("d4mcrb")
286 | class Ds4n6McrbAccessor:
287 |     def __init__(self, pandas_obj):
288 |         self._obj = pandas_obj
289 | 
290 |     def simple(self, *args, **kwargs):
291 |         """ Redirects execution to simple_func()
292 |         """
293 |         df = self._obj
294 |         return simple_func(df, *args, **kwargs)
295 | 
296 | 
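# Example (illustrative sketch, not part of the library API): mac-robber
# writes pipe-delimited "body"-style records, which is why read_data() above
# passes header_names to the common reader. A minimal standalone parse of one
# fabricated record with plain pandas:
def _demo_read_macrobber_body_sketch():
    import io
    sample = "0|/etc/passwd|131095|-rw-r--r--|0|0|1042|1483264800|1483264800|1483264800|0"
    names = ['MD5', 'path', 'inode', 'mode_as_string', 'UID', 'GID',
             'size', 'atime', 'mtime', 'ctime', 'block_size']
    return pd.read_csv(io.StringIO(sample), sep='|', names=names)

297 | 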
@pd.api.extensions.register_dataframe_accessor("d4_macrobber") 298 | class Ds4n6MacRobberAccessor: 299 | def __init__(self, pandas_obj): 300 | self._obj = pandas_obj 301 | 302 | def simple(self, *args, **kwargs): 303 | """ Redirects execution to simple_func() 304 | """ 305 | df = self._obj 306 | return simple_func(df, *args, **kwargs) 307 | -------------------------------------------------------------------------------- /src/ds4n6_lib/tshark.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4tshrk 10 | 11 | ############################################################################### 12 | # IMPORTS 13 | ############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import inspect 23 | import pickle 24 | import subprocess 25 | import json 26 | 27 | # DS IMPORTS ------------------------------------------------------------------ 28 | import numpy as np 29 | import pandas as pd 30 | import matplotlib.pyplot as plt 31 | 32 | # DS4N6 IMPORTS --------------------------------------------------------------- 33 | import ds4n6_lib.d4 as d4 34 | import ds4n6_lib.common as d4com 35 | import ds4n6_lib.gui as d4gui 36 | import ds4n6_lib.utils as d4utl 37 | import ds4n6_lib.unx as d4unx 38 | 39 | ############################################################################### 40 | # FUNCTIONS 41 | ############################################################################### 42 | 43 | # FILE READING FUNCTIONS ###################################################### 44 | 45 | def read_data(evdl, **kwargs): 46 | if d4.debug >= 3: 47 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 48 | 49 | if bool(re.search(r"\.pcap$", evdl, re.IGNORECASE)): 50 | return read_tshark_pcap(evdl, **kwargs) 51 | 52 | elif bool(re.search(r"\.json$", evdl, re.IGNORECASE)): 53 | return read_pcap_json(evdl, **kwargs) 54 | elif bool(re.search(r"\.csv$", evdl, re.IGNORECASE)): 55 | return read_pcap_csv(evdl, **kwargs) 56 | 57 | else: 58 | print("ERROR: Unable to read input file. 
Unsupported file extension.") 59 | return 60 | 61 | def read_tshark_pcap(evdl, **kwargs): 62 | """ Read pcap data from to json file 63 | Args: 64 | pcapf (str): path to file source 65 | kwargs: read options 66 | Returns: 67 | .json file 68 | """ 69 | cmd = "tshark -r " + evdl + " -T ek -j "'http tcp ip'" -P -V -x > " + evdl+'.json' 70 | print(cmd) 71 | subprocess.Popen(cmd, shell = True, 72 | stdout=subprocess.PIPE) 73 | 74 | evdl = evdl+'.json' 75 | 76 | return read_pcap_json(evdl,**kwargs) 77 | 78 | def read_pcap_json(evdl, **kwargs): 79 | """ Read pcap data from from a json file 80 | Args: 81 | evdl (str): path to file source 82 | kwargs: read options 83 | Returns: 84 | pandas.DataFrame (in the future a dictionary of pandas.DataFrame) 85 | """ 86 | n 87 | if d4.debug >= 3: 88 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 89 | 90 | # Parse Arguments 91 | tool = kwargs.get('tool', '') 92 | hostname = kwargs.get('hostname', '') 93 | do_harmonize = kwargs.get('harmonize', True) 94 | use_pickle = kwargs.get('use_pickle' , True) 95 | 96 | pklrawf = evdl+'.raw.pkl' 97 | pklhtmf = evdl+'.htm.pkl' 98 | 99 | if os.path.exists(pklhtmf) and use_pickle and do_harmonize : 100 | 101 | # Read from pickle 102 | print("- Saved Harmonized pickle file found:") 103 | print(" "+pklhtmf) 104 | print("- Reading data from HAM pickle file...") 105 | dfs = pickle.load(open(pklhtmf, "rb")) 106 | print("- Done.") 107 | print("") 108 | 109 | else: 110 | print("- No saved Harmonized pickle file found.") 111 | print("") 112 | 113 | 114 | 115 | 116 | 117 | with open(evdl, 'r') as f: 118 | data = [json.loads(line) for line in f] 119 | dfs = pd.json_normalize(data) 120 | 121 | return dfs 122 | 123 | def read_pcap_csv(evdl, **kwargs): 124 | """ Read pcap data from from a json file 125 | Args: 126 | evdl (str): path to file source 127 | kwargs: read options 128 | Returns: 129 | pandas.DataFrame (in the future a dictionary of pandas.DataFrame) 130 | """ 131 | 132 | if d4.debug >= 3: 133 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 134 | 135 | # Parse Arguments 136 | tool = kwargs.get('tool', '') 137 | hostname = kwargs.get('hostname', '') 138 | do_harmonize = kwargs.get('harmonize', True) 139 | use_pickle = kwargs.get('use_pickle' , True) 140 | 141 | output = pd.read_csv(evdl) 142 | output = output.rename(columns={'ip.src': 'Source_IP', 'ip.dst': 'Destination_IP', 'tcp.srcport': 'Source_TCP_Port', 'tcp.dstport': 'Destination_TCP_Port', 'frame.time': 'Frame_Time', '_ws.col.Protocol': 'Protocol', '_ws.col.Info': 'Info'}) 143 | 144 | return output 145 | 146 | 147 | # HARMONIZATION FUNCTIONS ##################################################### 148 | 149 | def harmonize(df, **kwargs): 150 | """ Function description 151 | 152 | Args: 153 | 154 | Returns: 155 | 156 | Raises: 157 | """ 158 | 159 | if d4.debug >= 3: 160 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 161 | 162 | orchestrator = kwargs.get('orchestrator', None) 163 | tool = kwargs.get('tool', None) 164 | plugin = kwargs.get('plugin', None) 165 | hostname = kwargs.get('hostname', None) 166 | 167 | # Specific Harmonization Pre-Processing =================================== 168 | 169 | # Generic Harmonization =================================================== 170 | df = d4com.harmonize_common(df, **kwargs) 171 | 172 | # 
# HARMONIZATION FUNCTIONS #####################################################

def harmonize(df, **kwargs):
    """ Harmonize a tshark DataFrame into the common DS4N6 format

        Args:
            df (pd.DataFrame): Input data
            kwargs: harmonization options

        Returns:
            pd.DataFrame: Harmonized data
    """

    if d4.debug >= 3:
        print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")

    orchestrator = kwargs.get('orchestrator', None)
    tool         = kwargs.get('tool', None)
    plugin       = kwargs.get('plugin', None)
    hostname     = kwargs.get('hostname', None)

    # Specific Harmonization Pre-Processing ===================================

    # Generic Harmonization ===================================================
    df = d4com.harmonize_common(df, **kwargs)

    # Specific Harmonization Post-Processing ==================================

    # return ==================================================================
    # WARNING: For artifact-modules only
    # df['D4_DataType_'] = 'DATA_TYPE_HERE'

    return df

# ANALYSIS FUNCTIONS ##########################################################

# simple ======================================================================
def simple_func(df, *args, **kwargs):
    """ Reformat the input df so the data is presented to the analyst in the
        friendliest possible way

        Parameters:
            df (pd.DataFrame): Input data

        Returns:
            pd.DataFrame: The filtered DataFrame is returned only if ret=True
            is set (constant & hidden columns included). If ret_out=True is
            set, the output exactly as it is shown (without constant/hidden
            columns) will be returned.
    """

    if d4.debug >= 3:
        print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")

    # Artifact-specific argument parsing ======================================

    # Variables ===============================================================
    hiddencols = []

    dfout = df

    # Maximum number of lines in DF for beautification
    maxdfbprintlines = 20

    # Pre-Processing ==========================================================

    # Call to simple_common ===================================================
    dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)

    # Post-Processing =========================================================

    # Return ==================================================================
    return dfout

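# Usage sketch (editor's illustration) for the dispatcher defined below:
#
#   import ds4n6_lib.tshark as d4tshrk
#   d4tshrk.analysis("help")        # print syntax help
#   d4tshrk.analysis(dfs, "list")   # list analysis types for this object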
# analysis ====================================================================
def analysis(obj, *args, **kwargs):
    """ Redirects execution to analysis_func()
    """
    return analysis_func(obj, *args, **kwargs)

def analysis_func(obj, *args, **kwargs):
    """ Umbrella function that redirects to the different types of analysis
        available for the input data

        Parameters:
            obj: Input data (typically a DF or a dict of DFs)

        Returns:
            pd.DataFrame: Refer to each specific analysis function
    """

    if d4.debug >= 3:
        print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")

    # SUB-FUNCTIONS ###########################################################
    def syntax():
        print('Syntax: analysis(obj, "analysis_type")\n')
        d4list("str-help")
        return

    def d4list(objtype):
        # Analysis Modules Available for this object type
        # (the XXXXX entries below are template placeholders for future
        #  tshark analysis modules)
        anlav = False
        print("Available XXXXX analysis types:")
        if objtype is None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-XXXXX", objtype):
            anlav = True
            print("- XXXXX_files: No. of events per XXXXX file (Input: XXXdfs)")

        if not anlav:
            print('- No analysis modules available for this object ('+objtype+').')

    # FUNCTION BODY ###########################################################
    thisdatatype = None

    objtype = d4com.data_identify(obj)

    # "list" / "help" may be passed as the object itself
    if isinstance(obj, str):
        if obj == "list":
            d4list(objtype)
            return
        if obj == "help":
            syntax()
            return

    # obj is a named parameter, so args holds only the analysis type
    nargs = len(args)

    if nargs == 0:
        if thisdatatype is not None:
            if re.search("^dict-pandas_dataframe-"+thisdatatype, objtype) or re.search("^pandas_dataframe-"+thisdatatype, objtype):
                d4list(objtype)
            else:
                syntax()
        else:
            syntax()

        return

    anltype = args[0]

    if not isinstance(anltype, str):
        syntax()
        return

    if anltype == "help":
        syntax()
        return
    elif anltype == "list":
        d4list(objtype)
        return

    # ANALYSIS FUNCTIONS ======================================================

    # XXXdfs ------------------------------------------------------------------
    if re.search("^dict-pandas_dataframe-XXXXX", objtype):
        if anltype == "XXXXX_files":
            return analysis_XXXXX_files(*args, **kwargs)

    print("INFO: [d4XXX] No analysis functions available for this data type ("+objtype+")")

# DATAFRAME ACCESSOR ##########################################################

@pd.api.extensions.register_dataframe_accessor("d4tshrk")
class Ds4n6TshrkAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def simple(self, *args, **kwargs):
        """ Redirects execution to simple_func()
        """
        df = self._obj
        return simple_func(df, *args, **kwargs)


@pd.api.extensions.register_dataframe_accessor("d4_tshark")
class Ds4n6TsharkAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def simple(self, *args, **kwargs):
        """ Redirects execution to simple_func()
        """
        df = self._obj
        return simple_func(df, *args, **kwargs)
--------------------------------------------------------------------------------
/src/ds4n6_lib/ml_models/seq2seq_lstm.py:
--------------------------------------------------------------------------------
#
# Description: implementation of ML model: seq2seq - LSTM
#

#############################################################################################
# INFO
#############################################################################################

#############################################################################################
# IMPORTS
#############################################################################################
import re, string, os, time
import pandas as pd
import numpy as np
from ast import literal_eval
from gensim.models import Word2Vec
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf


#############################################################################################
# Class: Seq2seqData
#############################################################################################
class Seq2seqData:
    def __init__(self):
        self.batch_size = 16
        self.vocab_size = 0
self.sequence_length = 0 32 | self.in_vectorization = TextVectorization() 33 | self.out_vectorization = TextVectorization() 34 | self.train_dset = 0 35 | 36 | def load_path_dataset(self, lm_dset, from_date, to_date, min_count): 37 | if type(lm_dset) == str: 38 | lm_dset = pd.read_csv(lm_dset) 39 | lm_dset = lm_dset.astype(str) 40 | lm_dset['path'] = lm_dset['path'].apply(literal_eval) 41 | lm_dset['date'] = pd.to_datetime(lm_dset['date'], format='%Y-%m-%d') 42 | lm_dset = lm_dset[(lm_dset['date'] >= from_date) & (lm_dset['date'] <= to_date)] 43 | model = Word2Vec(list(lm_dset['path']), vector_size=0, min_count=min_count) 44 | node_list = model.wv.index_to_key 45 | self.vocab_size = len(node_list)+5 46 | 47 | ndset = lm_dset.copy() 48 | for idx,row in lm_dset.iterrows(): 49 | for node in row.path: 50 | if node not in node_list: 51 | ndset = ndset.drop(index=idx) 52 | break 53 | return ndset 54 | 55 | def process_train_data(self, lm_dset): 56 | target_data = [] 57 | for i in lm_dset['path']: 58 | target_data.append(['[sos]'] + i + ['[eos]']) 59 | self.sequence_length = max(len(s) for s in target_data) 60 | 61 | train_in = [' '.join(i) for i in lm_dset['path']] 62 | train_out = [' '.join(i) for i in target_data] 63 | return train_in, train_out 64 | 65 | def build_train_dset(self, train_in, train_out): 66 | self._tokenizer(train_in, train_out) 67 | t_in = self.in_vectorization(train_in) 68 | t_out = self.out_vectorization(train_out) 69 | dataset = tf.data.Dataset.from_tensor_slices((t_in, t_out[:, :-1], t_out[:, 1:])) 70 | self.train_dset = dataset.shuffle(len(train_in)).batch(self.batch_size, drop_remainder=True) 71 | 72 | # AUX. FUNCTIONS 73 | def _custom_standardization(self, input_string): 74 | strip_chars = string.punctuation 75 | strip_chars = strip_chars.replace("[", "") 76 | strip_chars = strip_chars.replace("]", "") 77 | strip_chars = strip_chars.replace("-", "") 78 | strip_chars = strip_chars.replace("_", "") 79 | strip_chars = strip_chars.replace(".", "") 80 | strip_chars = strip_chars.replace(":", "") 81 | strip_chars = strip_chars.replace("&", "") 82 | strip_chars = strip_chars.replace("/", "") 83 | strip_chars = strip_chars.replace("\\", "") 84 | strip_chars = strip_chars.replace("@", "") 85 | lowercase = tf.strings.lower(input_string) 86 | return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "") 87 | 88 | def _tokenizer(self, train_in, train_out): 89 | self.in_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length) 90 | self.out_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length + 1, standardize=self._custom_standardization) 91 | self.in_vectorization.adapt(train_in) 92 | self.out_vectorization.adapt(train_out) 93 | 94 | 95 | ############################################################################################# 96 | # Class: Autoencoder (LSTM) 97 | ############################################################################################# 98 | class Autoencoder: 99 | def __init__(self, embed_dim, latent_dim, data): 100 | self.epochs = 10 101 | self.embed_dim = embed_dim 102 | self.latent_dim = latent_dim 103 | self.data = data 104 | self.encoder = None 105 | self.decoder = None 106 | 107 | def set_epochs(self, epochs): 108 | self.epochs = epochs 109 | 110 | def build_autoencoder(self): 111 | self.encoder = Encoder(self.data.vocab_size, self.embed_dim, self.latent_dim) 112 | self.decoder = 
Decoder(self.data.vocab_size, self.embed_dim, self.latent_dim) 113 | 114 | def fit_autoencoder(self): 115 | optimizer = tf.keras.optimizers.Adam(clipnorm=5.0) 116 | checkpoint_dir = './training_ckpt_seq2seq' 117 | checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") 118 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=self.encoder, decoder=self.decoder) 119 | losses, accuracies = self._main_train(self.encoder, self.decoder, self.data.train_dset, self.epochs, self.data.batch_size, optimizer, checkpoint, checkpoint_prefix) 120 | 121 | def decode_sequence(self, input_sentence, node_index_dict): 122 | decoded_err = [] 123 | t_path = input_sentence.split(' ') + (['[eos]']*(self.data.sequence_length)) 124 | tokenized_input_sentence = self.data.in_vectorization([input_sentence]) 125 | en_initial_states = self.encoder.init_states(1) 126 | en_outputs = self.encoder(tf.constant(tokenized_input_sentence), en_initial_states) 127 | de_state_h, de_state_c = en_outputs[1:] 128 | 129 | decoded_path = '[sos]' 130 | for i in range(self.data.sequence_length): 131 | tokenized_target_sentence = self.data.out_vectorization([decoded_path])[:, :-1] 132 | de_output, de_state_h, de_state_c, predictions = self.decoder(tokenized_target_sentence, (de_state_h, de_state_c)) 133 | 134 | n = t_path.pop(0) 135 | index = next((i for i, node in node_index_dict.items() if node == n), None) 136 | err = np.array(predictions)[0][i][index] 137 | decoded_err.append(err) 138 | 139 | sampled_token_index = np.argmax(predictions[0, i, :]) 140 | sampled_token = node_index_dict[sampled_token_index] 141 | decoded_path += ' ' + sampled_token 142 | 143 | if sampled_token == '[eos]': 144 | break 145 | return decoded_path, decoded_err 146 | 147 | def get_anomalies(self, train_in): 148 | node_vocab = self.data.out_vectorization.get_vocabulary() 149 | node_index_dict = dict(zip(range(len(node_vocab)), node_vocab)) 150 | 151 | e_matrix = [] 152 | test_in_paths = [pair for pair in train_in] 153 | for idx,path in enumerate(test_in_paths): 154 | dec_lm, err = self.decode_sequence(path, node_index_dict) 155 | mse = np.square(err).mean() 156 | e_matrix.append([idx, mse]) 157 | error_matrix = np.array(e_matrix) 158 | error_matrix = error_matrix[error_matrix[:, 1].argsort()] 159 | return error_matrix 160 | 161 | # AUX. 
FUNCTIONS 162 | def _main_train(self, encoder, decoder, dataset, n_epochs, batch_size, optimizer, checkpoint, checkpoint_prefix): 163 | losses = [] 164 | accuracies = [] 165 | print('Model: "LSTM"') 166 | print('____________________________________________________________') 167 | for e in range(n_epochs): 168 | start = time.time() 169 | en_initial_states = encoder.init_states(batch_size) 170 | for batch, (input_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)): 171 | loss, accuracy = self._train_step(input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer) 172 | 173 | if batch % 100 == 0: 174 | losses.append(loss) 175 | accuracies.append(accuracy) 176 | print('Epoch {} Batch {} Loss {:.4f} Acc:{:.4f}'.format(e + 1, batch, loss.numpy(), accuracy.numpy())) 177 | if (e + 1) % 2 == 0: 178 | checkpoint.save(file_prefix = checkpoint_prefix) 179 | print('Time taken for 1 epoch {:.4f} sec\n'.format(time.time() - start)) 180 | return losses, accuracies 181 | 182 | def _train_step(self, input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer): 183 | with tf.GradientTape() as tape: 184 | en_outputs = self.encoder(input_seq, en_initial_states) 185 | en_states = en_outputs[1:] 186 | de_states = en_states 187 | de_outputs = self.decoder(target_seq_in, de_states) 188 | logits = de_outputs[0] 189 | loss = self._loss_func(target_seq_out, logits) 190 | acc = self._accuracy_fn(target_seq_out, logits) 191 | 192 | variables = self.encoder.trainable_variables + self.decoder.trainable_variables 193 | gradients = tape.gradient(loss, variables) 194 | optimizer.apply_gradients(zip(gradients, variables)) 195 | return loss, acc 196 | 197 | def _loss_func(self, targets, logits): 198 | crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) 199 | mask = tf.math.logical_not(tf.math.equal(targets, 0)) 200 | mask = tf.cast(mask, dtype=tf.int64) 201 | loss = crossentropy(targets, logits, sample_weight=mask) 202 | return loss 203 | 204 | def _accuracy_fn(self, y_true, y_pred): 205 | pred_values = tf.keras.backend.cast(tf.keras.backend.argmax(y_pred, axis=-1), dtype='int64') 206 | correct = tf.keras.backend.cast(tf.keras.backend.equal(y_true, pred_values), dtype='float32') 207 | 208 | mask = tf.keras.backend.cast(tf.keras.backend.greater(y_true, 0), dtype='float32') 209 | n_correct = tf.keras.backend.sum(mask * correct) 210 | n_total = tf.keras.backend.sum(mask) 211 | return n_correct / n_total 212 | 213 | 214 | ############################################################################################# 215 | # Class: Encoder 216 | ############################################################################################# 217 | class Encoder(tf.keras.Model): 218 | def __init__(self, vocab_size, embedding_dim, hidden_dim): 219 | super(Encoder, self).__init__() 220 | self.hidden_dim = hidden_dim 221 | self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) 222 | self.lstm = tf.keras.layers.LSTM( 223 | hidden_dim, return_sequences=True, return_state=True) 224 | 225 | def call(self, input_sequence, states): 226 | embed = self.embedding(input_sequence) 227 | output, state_h, state_c = self.lstm(embed, initial_state=states) 228 | return output, state_h, state_c 229 | 230 | def init_states(self, batch_size): 231 | return (tf.zeros([batch_size, self.hidden_dim]), 232 | tf.zeros([batch_size, self.hidden_dim])) 233 | 234 | 235 | ############################################################################################# 236 | # Class: Decoder 237 | 
############################################################################################# 238 | class Decoder(tf.keras.Model): 239 | def __init__(self, vocab_size, embedding_dim, hidden_dim): 240 | super(Decoder, self).__init__() 241 | self.hidden_dim = hidden_dim 242 | self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) 243 | self.lstm = tf.keras.layers.LSTM( 244 | hidden_dim, return_sequences=True, return_state=True) 245 | self.dense = tf.keras.layers.Dense(vocab_size) 246 | self.out = tf.keras.layers.Softmax() 247 | 248 | def call(self, input_sequence, state): 249 | embed = self.embedding(input_sequence) 250 | lstm_out, state_h, state_c = self.lstm(embed, state) 251 | logits = self.dense(lstm_out) 252 | out = self.out(logits) 253 | return logits, state_h, state_c, out -------------------------------------------------------------------------------- /src/ds4n6_lib/ml_models/transformer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Description: implementation of ML model: seq2seq - Transformer 3 | # 4 | 5 | ############################################################################################# 6 | # INFO 7 | ############################################################################################# 8 | 9 | ############################################################################################# 10 | # IMPORTS 11 | ############################################################################################# 12 | import re, string 13 | import pandas as pd 14 | import numpy as np 15 | from ast import literal_eval 16 | from gensim.models import Word2Vec 17 | from tensorflow import keras 18 | from tensorflow.keras import layers 19 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization 20 | import tensorflow as tf 21 | 22 | 23 | ############################################################################################# 24 | # Class: Seq2seqData 25 | ############################################################################################# 26 | class Seq2seqData: 27 | def __init__(self): 28 | self.batch_size = 16 29 | self.vocab_size = 0 30 | self.sequence_length = 0 31 | self.in_vectorization = TextVectorization() 32 | self.out_vectorization = TextVectorization() 33 | self.train_dset = 0 34 | 35 | def load_path_dataset(self, lm_dset, from_date, to_date, min_count): 36 | if type(lm_dset) == str: 37 | lm_dset = pd.read_csv(lm_dset) 38 | lm_dset = lm_dset.astype(str) 39 | lm_dset['path'] = lm_dset['path'].apply(literal_eval) 40 | lm_dset['date'] = pd.to_datetime(lm_dset['date'], format='%Y-%m-%d') 41 | lm_dset = lm_dset[(lm_dset['date'] >= from_date) & (lm_dset['date'] <= to_date)] 42 | model = Word2Vec(list(lm_dset['path']), vector_size=0, min_count=min_count) 43 | node_list = model.wv.index_to_key 44 | self.vocab_size = len(node_list) + 5 45 | 46 | ndset = lm_dset.copy() 47 | for idx,row in lm_dset.iterrows(): 48 | for node in row.path: 49 | if node not in node_list: 50 | ndset = ndset.drop(index=idx) 51 | break 52 | return ndset 53 | 54 | def process_train_data(self, lm_dset): 55 | target_data = [] 56 | for i in lm_dset['path']: 57 | target_data.append(['[sos]'] + i + ['[eos]']) 58 | self.sequence_length = max(len(s) for s in target_data) 59 | 60 | train_in = [' '.join(i) for i in lm_dset['path']] 61 | train_out = [' '.join(i) for i in target_data] 62 | return train_in, train_out 63 | 64 | def build_train_dset(self, train_in, train_out): 65 | self._tokenizer(train_in, train_out) 
66 | dataset = tf.data.Dataset.from_tensor_slices((train_in, train_out)) 67 | dataset = dataset.batch(self.batch_size) 68 | dataset = dataset.map(self._format_dataset) 69 | self.train_dset = dataset.shuffle(len(train_in)).prefetch(16).cache() 70 | 71 | # AUX. FUNCTIONS 72 | def _custom_standardization(self, input_string): 73 | strip_chars = string.punctuation 74 | strip_chars = strip_chars.replace("[", "") 75 | strip_chars = strip_chars.replace("]", "") 76 | strip_chars = strip_chars.replace("-", "") 77 | strip_chars = strip_chars.replace("_", "") 78 | strip_chars = strip_chars.replace(".", "") 79 | strip_chars = strip_chars.replace(":", "") 80 | strip_chars = strip_chars.replace("&", "") 81 | strip_chars = strip_chars.replace("/", "") 82 | strip_chars = strip_chars.replace("\\", "") 83 | strip_chars = strip_chars.replace("@", "") 84 | lowercase = tf.strings.lower(input_string) 85 | return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "") 86 | 87 | def _tokenizer(self, train_in, train_out): 88 | self.in_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length) 89 | self.out_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length + 1, standardize=self._custom_standardization) 90 | self.in_vectorization.adapt(train_in) 91 | self.out_vectorization.adapt(train_out) 92 | 93 | def _format_dataset(self, train_in, train_out): 94 | t_in = self.in_vectorization(train_in) 95 | t_out = self.out_vectorization(train_out) 96 | return ({"encoder_inputs": t_in, "decoder_inputs": t_out[:, :-1],}, t_out[:, 1:]) 97 | 98 | 99 | ############################################################################################# 100 | # Class: Autoencoder (Transformer) 101 | ############################################################################################# 102 | class Autoencoder: 103 | def __init__(self, embed_dim, latent_dim, data): 104 | self.epochs = 5 105 | self.num_heads = 1 106 | self.embed_dim = embed_dim 107 | self.latent_dim = latent_dim 108 | self.data = data 109 | self.model = None 110 | 111 | def set_epochs(self, epochs): 112 | self.epochs = epochs 113 | 114 | def set_num_heads(self, num_heads): 115 | self.num_heads = num_heads 116 | 117 | def build_autoencoder(self): 118 | encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs") 119 | x = PositionalEmbedding(self.data.sequence_length, self.data.vocab_size, self.embed_dim)(encoder_inputs) 120 | encoder_outputs = TransformerEncoder(self.embed_dim, self.latent_dim, self.num_heads)(x) 121 | encoder = keras.Model(encoder_inputs, encoder_outputs) 122 | 123 | decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs") 124 | encoded_seq_inputs = keras.Input(shape=(None, self.embed_dim), name="decoder_state_inputs") 125 | x = PositionalEmbedding(self.data.sequence_length, self.data.vocab_size, self.embed_dim)(decoder_inputs) 126 | x = TransformerDecoder(self.embed_dim, self.latent_dim, self.num_heads)(x, encoded_seq_inputs) 127 | x = layers.Dropout(0.6)(x) 128 | decoder_outputs = layers.Dense(self.data.vocab_size, activation="softmax")(x) 129 | decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs) 130 | 131 | decoder_outputs = decoder([decoder_inputs, encoder_outputs]) 132 | transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer") 133 | self.model = transformer 134 | 135 | def 
fit_autoencoder(self):
        self.model.summary()
        self.model.compile("rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
        self.model.fit(self.data.train_dset, epochs=self.epochs)

    def decode_sequence(self, input_sentence, node_index_dict):
        # Greedy step-by-step decoding: at each step, record the probability
        # the model assigns to the true next node, then feed the most likely
        # token back into the decoder
        decoded_err = []
        t_path = input_sentence.split(' ') + (['[eos]']*(self.data.sequence_length))

        tokenized_input_sentence = self.data.in_vectorization([input_sentence])
        decoded_path = '[sos]'
        for i in range(self.data.sequence_length):
            tokenized_target_sentence = self.data.out_vectorization([decoded_path])[:, :-1]
            predictions = self.model([tokenized_input_sentence, tokenized_target_sentence])

            n = t_path.pop(0)
            index = next((i for i, node in node_index_dict.items() if node == n), None)
            err = np.array(predictions)[0][i][index]
            decoded_err.append(err)

            sampled_token_index = np.argmax(predictions[0, i, :])
            sampled_token = node_index_dict[sampled_token_index]
            decoded_path += ' ' + sampled_token

            if sampled_token == '[eos]':
                break
        return decoded_path, decoded_err

    def get_anomalies(self, train_in):
        # Scores each path by the mean squared predicted probability of its
        # true transitions; poorly predicted (anomalous) paths score low and
        # sort to the front of the returned matrix
        node_vocab = self.data.out_vectorization.get_vocabulary()
        node_index_dict = dict(zip(range(len(node_vocab)), node_vocab))

        e_matrix = []
        test_in_paths = [pair for pair in train_in]
        for idx,path in enumerate(test_in_paths):
            dec_lm, err = self.decode_sequence(path, node_index_dict)
            mse = np.square(err).mean()
            e_matrix.append([idx, mse])
        error_matrix = np.array(e_matrix)
        error_matrix = error_matrix[error_matrix[:, 1].argsort()]
        return error_matrix


#############################################################################################
# Class: TransformerEncoder
#############################################################################################
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        else:
            # No mask propagated: attend over all positions
            padding_mask = None
        attention_output = self.attention(query=inputs, value=inputs, key=inputs, attention_mask=padding_mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
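# Editor's note: the decoder further below combines a causal (lower-
# triangular) mask with the padding mask. For a sequence of length 4 the
# causal mask is:
#
#   [[1, 0, 0, 0],
#    [1, 1, 0, 0],
#    [1, 1, 1, 0],
#    [1, 1, 1, 1]]
#
# i.e. position i may only attend to positions j <= i.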
#############################################################################################
# Class: PositionalEmbedding
#############################################################################################
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        # Token id 0 is padding; downstream layers receive this as the mask
        return tf.math.not_equal(inputs, 0)


#############################################################################################
# Class: TransformerDecoder
#############################################################################################
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # No padding information available: rely on the causal mask alone
            padding_mask = None

        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        # Build a (batch, seq, seq) lower-triangular mask so each position
        # can only attend to itself and earlier positions
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)
--------------------------------------------------------------------------------
/src/ds4n6_lib/flist.py:
--------------------------------------------------------------------------------
# DS4N6
#
# Description: Library of functions to apply Data Science to forensics artifacts
#

###############################################################################
# INFO
###############################################################################
# Recommended "import as": d4flst

###############################################################################
# IMPORTS
############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import pickle 23 | import inspect 24 | 25 | # DS IMPORTS ------------------------------------------------------------------ 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | 30 | # DS4N6 IMPORTS --------------------------------------------------------------- 31 | import ds4n6_lib.d4 as d4 32 | import ds4n6_lib.common as d4com 33 | import ds4n6_lib.gui as d4gui 34 | import ds4n6_lib.utils as d4utl 35 | import ds4n6_lib.unx as d4unx 36 | 37 | ############################################################################### 38 | # VARIABLES 39 | ############################################################################### 40 | hiddencols = [ 'MTStampEpoch_', 'MTStamp_', 'ATStampEpoch_', 'ATStamp_', 'CTStampEpoch_', 'CTStamp_', 'Meta_', 'FileStem_', 'ParentName_', 'ParentPath_', 'ParentMeta_', 'PathSeparator_', 'FilePath-Hash_', 'FileName-Hash_', 'FileStem-Hash_', 'ParentPath-Hash_', 'ParentName-Hash_', 'NTFS-SeqNumber_', 'ParentSeqNumber_', 'ParentPath', 'NTFS-ReferenceCount_', 'NTFS-ReparseTarget_', 'IsDirectory_', 'NTFS-HasAds_', 'NTFS-IsAds_', 'NTFS-SI