├── runtime.txt ├── src └── ds4n6_lib │ ├── __init__.py │ ├── ml_models │ ├── __init__.py │ ├── seq2seq_lstm.py │ └── transformer.py │ ├── tools.py │ ├── d4.py │ ├── pf.py │ ├── amcache.py │ ├── winreg.py │ ├── svclist.py │ ├── unx.py │ ├── autoruns.py │ ├── kansa.py │ ├── volatility.py │ ├── mactime.py │ ├── fstl.py │ ├── pslist.py │ ├── macrobber.py │ ├── tshark.py │ ├── flist.py │ ├── utils.py │ ├── kape.py │ └── mlgraph.py ├── setup.cfg ├── pyproject.toml ├── MANIFEST.in ├── setup.py ├── requirements.txt ├── README.md └── CONTRIBUTING.md /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.10.12 -------------------------------------------------------------------------------- /src/ds4n6_lib/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/ds4n6_lib/ml_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | -------------------------------------------------------------------------------- /src/ds4n6_lib/tools.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def explore(df, col, max_rows=None, max_columns=None): 4 | hist = df[col].value_counts() 5 | with pd.option_context('display.max_rows', max_rows, 'display.max_columns', max_columns): 6 | print("#Count:",len(hist)) 7 | print(hist) -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include requirements.txt 4 | 5 | include src/ds4n6_lib/isim/*.yml 6 | 7 | # Patterns to exclude from any directory 8 | global-exclude *~ 9 | global-exclude *.pyc 10 | global-exclude *.pyo 11 | global-exclude .git 12 | global-exclude .ipynb_checkpoints 13 | -------------------------------------------------------------------------------- /src/ds4n6_lib/d4.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # INFO 3 | ############################################################################### 4 | # Recommended "import as": d4 5 | 6 | ############################################################################### 7 | # VARIABLES 8 | ############################################################################### 9 | # Debug Level (0: min - 5:max) ------------------------------------------------ 10 | # 0: Disabled 11 | # 1: TBD 12 | # 2: Executed functions 13 | # 3: Low detail on executed functions 14 | # 4: Medium detail on executed functions 15 | # 5: High detail on executed functions 16 | debug = 0 17 | 18 | # Other ----------------------------------------------------------------------- 19 | out = None 20 | ipregex="^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$" 
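# Added note (illustrative, not part of the original module): ipregex matches
# the dotted-quad *shape* only, not valid octet ranges, so a string like
# "999.999.999.999" also matches. Example use:
#   import re
#   re.search(ipregex, "10.0.0.1")    # -> <re.Match object ...>
#   re.search(ipregex, "not-an-ip")   # -> None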
21 | 22 | ############################################################################### 23 | # DECLARE VARS 24 | # not_well-formed 25 | main_nwf=[ 26 | {'find':'<\x04Data', 'replace':'', 'replace':''}, 40 | {'find':' Data ', 'replace':' '}, 41 | {'find':' <([a-zA-Z0-9_-]*)> ', 'replace':' \\1 ', 'type':'re'}, 42 | {'find':'::<([a-zA-Z0-9_-]*)>::', 'replace':'::\\1::', 'type':'re'}, 43 | ] 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="ds4n6_lib", 8 | version="0.8.3", 9 | author="Jess Garcia", 10 | author_email="ds4n6@one-esecurity.com", 11 | description="Bringing Data Science & Artificial Intelligence to the fingertips of the average Forensicator, and promote advances in the field", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/ds4n6/ds4n6_lib", 15 | project_urls={ 16 | "Bug Tracker" : "https://github.com/ds4n6/ds4n6_lib/issues", 17 | "Website" : "http://www.ds4n6.io/" 18 | }, 19 | keywords = ['dfir', 'datascience', 'forensics'], 20 | install_requires=[ 21 | 'requests', 22 | 'numpy', 23 | 'pandas', 24 | 'Evtx', 25 | 'python-evtx', 26 | 'ipyaggrid', 27 | 'IPython', 28 | 'ipywidgets', 29 | 'keras', 30 | 'matplotlib', 31 | 'nbformat', 32 | 'numpy', 33 | 'pandas', 34 | 'pyparsing', 35 | 'qgrid', 36 | 'ruamel.yaml', 37 | 'sklearn', 38 | 'tensorflow', 39 | 'tqdm', 40 | 'traitlets', 41 | 'xmltodict', 42 | 'networkx', 43 | 'gensim', 44 | ], 45 | classifiers=[ 46 | "Development Status :: 3 - Alpha", 47 | "Intended Audience :: Developers", 48 | "Intended Audience :: Information Technology", 49 | "Framework :: Jupyter", 50 | "Topic :: Security", 51 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 52 | "Topic :: Software Development :: Libraries :: Python Modules", 53 | "Programming Language :: Python :: 3", 54 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 55 | "Operating System :: OS Independent", 56 | ], 57 | package_dir={"": "src"}, 58 | packages=setuptools.find_packages(where="src"), 59 | python_requires=">=3.10", 60 | ) 61 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | argon2-cffi==20.1.0 3 | astunparse==1.6.3 4 | async-generator==1.10 5 | attrs==20.3.0 6 | backcall==0.2.0 7 | bleach==3.3.0 8 | cachetools==4.2.1 9 | certifi==2020.12.5 10 | cffi==1.14.5 11 | chardet==4.0.0 12 | configparser==4.0.2 13 | cycler==0.10.0 14 | decorator==5.0.5 15 | defusedxml==0.7.1 16 | entrypoints==0.3 17 | evtx==0.8.2 18 | flatbuffers==23.1.21 19 | gast==0.3.3 20 | gensim==4.3.2 21 | google-auth==2.16.0 22 | google-auth-oauthlib==1.0.0 23 | google-pasta==0.2.0 24 | grpcio==1.51.1 25 | h5py==3.8.0 26 | hexdump==3.3 27 | idna==2.10 28 | importlib-metadata==3.10.0 29 | ipyaggrid==0.2.1 30 | ipykernel==5.5.3 31 | ipython==7.22.0 32 | ipython-genutils==0.2.0 33 | ipywidgets==7.6.3 34 | jedi==0.18.0 35 | Jinja2==2.11.3 36 | joblib==1.2.0 37 | jsonschema==3.2.0 38 | jupyter-client==8.0.3 39 | jupyter-core==5.2.0 40 | jupyterlab-pygments==0.2.2 41 | jupyterlab-widgets==3.0.5 42 | Keras==2.13.1 43 | Keras-Preprocessing==1.1.2 44 | kiwisolver==1.3.1 45 | Markdown==3.3.4 46 | 
MarkupSafe==1.1.1 47 | matplotlib==3.7.0 48 | mistune==0.8.4 49 | more-itertools==5.0.0 50 | nbclient==0.5.3 51 | nbconvert==6.0.7 52 | nbformat==5.1.3 53 | nest-asyncio==1.5.1 54 | networkx==2.5 55 | notebook==6.3.0 56 | numpy==1.23.5 57 | oauthlib==3.1.0 58 | opt-einsum==3.3.0 59 | packaging==20.9 60 | pandas==2.1.4 61 | pandocfilters==1.4.3 62 | parso==0.8.2 63 | pexpect==4.8.0 64 | pickleshare==0.7.5 65 | Pillow==8.2.0 66 | prometheus-client==0.10.0 67 | prompt-toolkit==3.0.18 68 | protobuf==4.24.1 69 | ptyprocess==0.7.0 70 | pyasn1==0.4.8 71 | pyasn1-modules==0.2.8 72 | pycparser==2.20 73 | Pygments==2.8.1 74 | pyparsing==2.4.7 75 | pyrsistent==0.17.3 76 | python-dateutil==2.8.2 77 | python-evtx==0.7.4 78 | pytz==2021.1 79 | PyYAML==6.0.1 80 | pyzmq==25.0.2 81 | qgrid==1.3.1 82 | requests==2.25.1 83 | requests-oauthlib==1.3.0 84 | rsa==4.7.2 85 | ruamel.yaml==0.17.21 86 | ruamel.yaml.clib==0.2.7 87 | scikit-learn==1.2.1 88 | scipy==1.10.0 89 | Send2Trash==1.5.0 90 | simplejson==3.17.2 91 | six==1.15.0 92 | sklearn==0.0 93 | tensorboard==2.13.0 94 | tensorflow==2.13.0 95 | tensorflow-estimator==2.13.0 96 | termcolor==1.1.0 97 | terminado==0.9.4 98 | testpath==0.4.4 99 | threadpoolctl==2.1.0 100 | tornado==6.2 101 | tqdm==4.59.0 102 | traitlets==5.9.0 103 | typing-extensions==3.7.4.3 104 | urllib3==1.26.4 105 | wcwidth==0.2.5 106 | webencodings==0.5.1 107 | Werkzeug==1.0.1 108 | widgetsnbextension==3.5.1 109 | wrapt==1.12.1 110 | xmltodict==0.12.0 111 | zipp==1.0.0 -------------------------------------------------------------------------------- /src/ds4n6_lib/pf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from tensorflow.keras import layers 4 | from tensorflow.keras.models import Model 5 | import tensorflow as tf 6 | from tensorflow.keras import losses 7 | import matplotlib.pyplot as plt 8 | import pandas as pd 9 | 10 | 11 | def convert_prefetch_ham_to_hml(df): 12 | df_split = df['file_referenced'].str.split("\\",expand=True) 13 | df_split = df_split.drop(columns=[0]).fillna(value='') 14 | 15 | first_column = df_split.iloc[:, 0] 16 | medium_column = [] 17 | last_column = [] 18 | for i in range(df_split.shape[0]): 19 | arr = [x for x in df_split.iloc[i, 1:] if x != ''] 20 | medium_column.append('\\'.join(arr[:-1])) 21 | last_column.append('\\'.join(arr[-1:])) # [-1:] because some len(arr) == 0 22 | 23 | list_to_df = list(zip(first_column, medium_column, last_column, df['machine_id'])) 24 | new_df = pd.DataFrame(list_to_df, columns =['A', 'B', 'C', 'machine_id']) 25 | return new_df 26 | 27 | 28 | def ml_prefetch_anomalies(df, odalg="simple_autoencoder", latent_dim = 128, epochs = 10, learning_rate = 1e-3): 29 | # Deep Learning 30 | x_train = pd.get_dummies(df).to_numpy() 31 | 32 | class Autoencoder(Model): 33 | def __init__(self, input_dim, latent_dim): 34 | super(Autoencoder, self).__init__() 35 | self.input_dim = input_dim 36 | self.latent_dim = latent_dim 37 | self.encoder = layers.Dense(latent_dim, activation='relu') 38 | self.decoder = layers.Dense(input_dim, activation='sigmoid') 39 | 40 | def call(self, x): 41 | encoded = self.encoder(x) 42 | decoded = self.decoder(encoded) 43 | return decoded 44 | 45 | def train_autoencoder(latent_dim, epochs, learning_rate): 46 | autoencoder = Autoencoder(input_dim=x_train.shape[1], latent_dim=latent_dim) 47 | opt = tf.keras.optimizers.Adam(learning_rate=learning_rate) 48 | autoencoder.compile(optimizer=opt, loss=losses.MeanSquaredError()) 49 | history = 
autoencoder.fit(x_train, x_train, epochs=epochs, shuffle=True, verbose=0) 50 | return autoencoder, history 51 | 52 | model, history = train_autoencoder(latent_dim=latent_dim, 53 | epochs=epochs, 54 | learning_rate=learning_rate) 55 | 56 | 57 | preds = model.predict(x_train) 58 | inference_losses = tf.keras.metrics.mean_squared_error(preds, x_train.astype('float')).numpy() 59 | 60 | ranking = [] 61 | for i, loss in zip(range(len(inference_losses)), inference_losses): 62 | fr = '\\'.join(df.iloc[i, :3]) 63 | 64 | machine_id = df.iloc[i]['machine_id'] 65 | if fr.endswith('.DLL'): 66 | ranking.append((loss, i, fr, machine_id)) 67 | 68 | ranking = sorted(ranking, key=lambda x: -x[0]) 69 | anomdf = pd.DataFrame(ranking, columns=['loss', 'source_index', 'file referenced', 'machine_id']) 70 | return anomdf[['file referenced', 'machine_id']] 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
3 | 4 | 5 | 6 | 7 |
8 | 
9 | 
10 | 
11 | DS4N6 stands for Data Science Forensics.
12 | 
13 | We also refer to this project as DSDFIR, AI4N6 or AIDFIR, since Data Science (DS) includes Artificial Intelligence (AI), and the project goes beyond strict Forensics, covering the whole Digital Forensics & Incident Response (DFIR) discipline (and sometimes even beyond). But hey, we had to give the project a catchy name!
14 | 
15 | The Mission of the DS4N6 project is simple:
16 | 
17 | ```
18 | Bringing Data Science & Artificial Intelligence
19 | to the fingertips of the average Forensicator,
20 | and promote advances in the field
21 | ```
22 | 
23 | The first (modest) alpha version of our ds4n6 python library, together with some easy-to-use python scripts, was originally made public after its presentation at the SANS DFIR Summit US (July 16-17).
24 | **For detailed information about the Project, the Library, its Functions, its Usage, etc., visit the project page: http://www.ds4n6.io/tools/ds4n6.py.html**
25 | 
26 | ## Getting Started
27 | 
28 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
29 | 
30 | https://github.com/ds4n6/ds4n6_lib.git
31 | 
32 | ### Prerequisites
33 | 
34 | The DS4N6 library requires Python 3 (3.10 or later, as declared in setup.py). The module has external dependencies related to data science and to the parsing of forensic evidence.
35 | 
36 | Install requirements:
37 | 
38 | - python-evtx
39 | - Evtx
40 | - ipyaggrid
41 | - IPython
42 | - ipywidgets
43 | - keras
44 | - matplotlib
45 | - nbformat
46 | - numpy
47 | - pandas
48 | - pyparsing
49 | - qgrid
50 | - ruamel.yaml
51 | - sklearn
52 | - tensorflow
53 | - tqdm
54 | - traitlets
55 | - xmltodict
56 | - networkx
57 | - gensim
58 | 
59 | ### Installation
60 | 
61 | The installation can easily be done through pip.
62 | 
63 | #### pip installation
64 | 
65 | ```sh
66 | pip install -r requirements.txt
67 | ```
68 | 
69 | Finally, import the library in your python3 program or Jupyter Notebook as "ds":
70 | 
71 | ```python
72 | import ds4n6_lib as ds
73 | ```
74 | 
75 | ## Contributing
76 | 
77 | If you think you can provide value to the Community by collaborating with Research, Blog Posts, Cheatsheets, Code, etc., contact us!
78 | 
79 | Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests to us.
80 | 
81 | ### Download from GitHub
82 | 
83 | All you need to do is clone the library, create a virtual environment, activate it, and install the requirements.
84 | 
85 | ```sh
86 | 
87 | git clone https://github.com/ds4n6/ds4n6_lib
88 | 
89 | virtualenv -p python3.10 .test
90 | source .test/bin/activate
91 | 
92 | pip install -r requirements.txt
93 | ```
94 | 
95 | ## Authors
96 | 
97 | * **Jess Garcia** - *Initial work* - http://ds4n6.io/community/jess_garcia.html
98 | 
99 | See also the list of [contributors](http://ds4n6.io/community.html) who participated in this project.
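## Usage example

A minimal illustrative session (the CSV path and the column name are hypothetical; `explore()` is defined in `src/ds4n6_lib/tools.py`):

```python
import pandas as pd
import ds4n6_lib.tools as d4tls

# Any DataFrame works; here we assume an artifact table exported to CSV
df = pd.read_csv("evidence/autoruns.csv")

# Print the number of distinct values and a frequency histogram of one column
d4tls.explore(df, "ImagePath")
```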
100 | 101 | ## License 102 | 103 | This project is licensed under the GNU GPL v3.0 License - see the [LICENSE](LICENSE) file for details 104 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct, please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 11 | build. 12 | 2. Update the README.md with details of changes to the interface, this includes new environment 13 | variables, exposed ports, useful file locations and container parameters. 14 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 15 | Pull Request would represent. 16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 17 | do not have permission to do that, you may request the second reviewer to merge it for you. 18 | 19 | ## Code of Conduct 20 | 21 | ### Our Pledge 22 | 23 | In the interest of fostering an open and welcoming environment, we as 24 | contributors and maintainers pledge to making participation in our project and 25 | our community a harassment-free experience for everyone, regardless of age, body 26 | size, disability, ethnicity, gender identity and expression, level of experience, 27 | nationality, personal appearance, race, religion, or sexual identity and 28 | orientation. 29 | 30 | ### Our Standards 31 | 32 | Examples of behavior that contributes to creating a positive environment 33 | include: 34 | 35 | * Using welcoming and inclusive language 36 | * Being respectful of differing viewpoints and experiences 37 | * Gracefully accepting constructive criticism 38 | * Focusing on what is best for the community 39 | * Showing empathy towards other community members 40 | 41 | Examples of unacceptable behavior by participants include: 42 | 43 | * The use of sexualized language or imagery and unwelcome sexual attention or 44 | advances 45 | * Trolling, insulting/derogatory comments, and personal or political attacks 46 | * Public or private harassment 47 | * Publishing others' private information, such as a physical or electronic 48 | address, without explicit permission 49 | * Other conduct which could reasonably be considered inappropriate in a 50 | professional setting 51 | 52 | ### Our Responsibilities 53 | 54 | Project maintainers are responsible for clarifying the standards of acceptable 55 | behavior and are expected to take appropriate and fair corrective action in 56 | response to any instances of unacceptable behavior. 57 | 58 | Project maintainers have the right and responsibility to remove, edit, or 59 | reject comments, commits, code, wiki edits, issues, and other contributions 60 | that are not aligned to this Code of Conduct, or to ban temporarily or 61 | permanently any contributor for other behaviors that they deem inappropriate, 62 | threatening, offensive, or harmful. 63 | 64 | ### Scope 65 | 66 | This Code of Conduct applies both within project spaces and in public spaces 67 | when an individual is representing the project or its community. 
Examples of 68 | representing a project or community include using an official project e-mail 69 | address, posting via an official social media account, or acting as an appointed 70 | representative at an online or offline event. Representation of a project may be 71 | further defined and clarified by project maintainers. 72 | 73 | ### Enforcement 74 | 75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 76 | reported by contacting the project team at ds4n6@one-esecurity.com. All 77 | complaints will be reviewed and investigated and will result in a response that 78 | is deemed necessary and appropriate to the circumstances. The project team is 79 | obligated to maintain confidentiality with regard to the reporter of an incident. 80 | Further details of specific enforcement policies may be posted separately. 81 | 82 | Project maintainers who do not follow or enforce the Code of Conduct in good 83 | faith may face temporary or permanent repercussions as determined by other 84 | members of the project's leadership. 85 | 86 | ### Attribution 87 | 88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 89 | available at [http://contributor-covenant.org/version/1/4][version] 90 | 91 | [homepage]: http://contributor-covenant.org 92 | [version]: http://contributor-covenant.org/version/1/4/ 93 | -------------------------------------------------------------------------------- /src/ds4n6_lib/amcache.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4amch 10 | 11 | ############################################################################### 12 | # IMPORTS 13 | ############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import inspect 23 | import pickle 24 | 25 | # DS IMPORTS ------------------------------------------------------------------ 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | 30 | # DS4N6 IMPORTS --------------------------------------------------------------- 31 | import ds4n6_lib.d4 as d4 32 | import ds4n6_lib.common as d4com 33 | import ds4n6_lib.gui as d4gui 34 | import ds4n6_lib.utils as d4utl 35 | import ds4n6_lib.unx as d4unx 36 | 37 | ############################################################################### 38 | # FUNCTIONS 39 | ############################################################################### 40 | 41 | # ANALYSIS FUNCTIONS ########################################################## 42 | 43 | # simple ====================================================================== 44 | def simple_func(df, *args, **kwargs): 45 | """ Reformat the input df so the data is presented to the analyst in the 46 | friendliest possible way 47 | 48 | Parameters: 49 | df (pd.dataframe): Input data 50 | 51 | Returns: 52 | pd.DataFrame: Optionally it will return the filtered dataframe, 53 | only if ret=True is set, constant & hidden columns included 54 | If 
ret_out=True is set, then the output just as it is shown 55 | (without constant/hidden columns) will be return 56 | """ 57 | 58 | # Artifact-specific argument parsing ======================================= 59 | 60 | # Variables ================================================================ 61 | hiddencols = ['SHA1_', 'FileReference_'] 62 | 63 | dfout = df 64 | 65 | # Maximum number of lines in DF for beautification 66 | maxdfbprintlines = 20 67 | 68 | # Pre-Processing ========================================================== 69 | 70 | # Call to simple_common =================================================== 71 | dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 72 | 73 | # Post-Processing ========================================================= 74 | 75 | # Return ================================================================== 76 | return dfout 77 | 78 | # analysis ==================================================================== 79 | def analysis(obj, *args, **kwargs): 80 | """ Redirects execution to analysis_func() 81 | """ 82 | return analysis_func(obj, *args, **kwargs) 83 | 84 | def analysis_func(obj, *args, **kwargs): 85 | """ Umbrella function that redirects to different types of analysis 86 | available on the input data 87 | 88 | Parameters: 89 | obj: Input data (typically DF or dict of DFs) 90 | 91 | Returns: 92 | pd.DataFrame: Refer to each specific analysis function 93 | """ 94 | 95 | def syntax(): 96 | print('Syntax: analysis(obj, "analysis_type")\n') 97 | d4list("str-help") 98 | return 99 | 100 | def d4list(objtype): 101 | 102 | # Analysis Modules Available for this objective 103 | anlav = False 104 | print("Available Amcache analysis types:") 105 | # if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-XXXXX", objtype): 106 | # anlav = True 107 | # print("- XXXXX_files: No.events XXXXX file (Input: XXXdfs)") 108 | 109 | if anlav == False: 110 | print('- No analysis modules available for this object ('+objtype+').') 111 | 112 | nargs = len(args) 113 | 114 | if nargs == 0: 115 | syntax() 116 | return 117 | 118 | obj = args[0] 119 | 120 | objtype = d4com.data_identify(obj) 121 | 122 | if isinstance(obj, str): 123 | if obj == "list": 124 | d4list(objtype) 125 | return 126 | if obj == "help": 127 | syntax() 128 | return 129 | 130 | if nargs == 1: 131 | syntax() 132 | return 133 | 134 | anltype = args[1] 135 | 136 | if not isinstance(anltype, str): 137 | syntax() 138 | return 139 | 140 | if anltype == "help": 141 | syntax() 142 | return 143 | elif anltype == "list": 144 | d4list(objtype) 145 | return 146 | 147 | # ANALYSIS FUNCTIONS ====================================================== 148 | 149 | # XXXdfs ------------------------------------------------------------------ 150 | # if re.search("^dict-pandas_dataframe-XXXXX", objtype): 151 | # if anltype == "XXXXX_files": 152 | # return analysis_XXXXX_files(*args, **kwargs) 153 | 154 | print("INFO: [d4amch] No analysis functions available for this data type ("+objtype+")") 155 | 156 | # DATAFRAME ACCESSOR ########################################################## 157 | 158 | @pd.api.extensions.register_dataframe_accessor("d4amch") 159 | class Ds4n6AmchAccessor: 160 | def __init__(self, pandas_obj): 161 | self._obj = pandas_obj 162 | 163 | def simple(self, *args, **kwargs): 164 | """ Redirects execution to simple_func() 165 | """ 166 | df = self._obj 167 | return simple_func(df, *args, **kwargs) 168 | 169 | 170 | 
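# Added note: the same simple() entry point is registered twice, under a short
# and a long accessor name, so both spellings work on any DataFrame once the
# module is imported:
#     df.d4amch.simple()       # short alias (registered above)
#     df.d4_amcache.simple()   # long alias (registered below)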
@pd.api.extensions.register_dataframe_accessor("d4_amcache") 171 | class Ds4n6AmcacheAccessor: 172 | def __init__(self, pandas_obj): 173 | self._obj = pandas_obj 174 | 175 | def simple(self, *args, **kwargs): 176 | """ Redirects execution to simple_func() 177 | """ 178 | df = self._obj 179 | return simple_func(df, *args, **kwargs) 180 | -------------------------------------------------------------------------------- /src/ds4n6_lib/winreg.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4reg 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | import pickle 25 | 26 | # DS IMPORTS ------------------------------------------------------------------ 27 | import numpy as np 28 | import pandas as pd 29 | import matplotlib.pyplot as plt 30 | 31 | # DS4N6 IMPORTS --------------------------------------------------------------- 32 | import ds4n6_lib.d4 as d4 33 | import ds4n6_lib.common as d4com 34 | import ds4n6_lib.gui as d4gui 35 | import ds4n6_lib.utils as d4utl 36 | import ds4n6_lib.unx as d4unx 37 | 38 | ############################################################################### 39 | # FUNCTIONS 40 | ############################################################################### 41 | 42 | # ANALYSIS FUNCTIONS ########################################################## 43 | 44 | # simple ====================================================================== 45 | def simple_func(df, *args, **kwargs): 46 | """ Reformat the input df so the data is presented to the analyst in the 47 | friendliest possible way 48 | 49 | Parameters: 50 | df (pd.dataframe): Input data 51 | 52 | Returns: 53 | pd.DataFrame: Optionally it will return the filtered dataframe, 54 | only if ret=True is set, constant & hidden columns included 55 | If ret_out=True is set, then the output just as it is shown 56 | (without constant/hidden columns) will be return 57 | """ 58 | 59 | # Artifact-specific argument parsing ======================================= 60 | 61 | # Variables ================================================================ 62 | hiddencols = ['KeyLastWriteTimestamp_', 'KeyPath_', 'KeyPath-Hash_'] 63 | 64 | dfout = df 65 | 66 | # Maximum number of lines in DF for beautification 67 | maxdfbprintlines = 20 68 | 69 | # Pre-Processing ========================================================== 70 | 71 | # Call to simple_common =================================================== 72 | dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 73 | 74 | # Post-Processing ========================================================= 75 | 76 | # Return ================================================================== 77 | return dfout 78 | 79 | # analysis 
==================================================================== 80 | def analysis(obj, *args, **kwargs): 81 | """ Redirects execution to analysis_func() 82 | """ 83 | return analysis_func(obj, *args, **kwargs) 84 | 85 | def analysis_func(obj, *args, **kwargs): 86 | """ Umbrella function that redirects to different types of analysis 87 | available on the input data 88 | 89 | Parameters: 90 | obj: Input data (typically DF or dict of DFs) 91 | 92 | Returns: 93 | pd.DataFrame: Refer to each specific analysis function 94 | """ 95 | 96 | def syntax(): 97 | print('Syntax: analysis(obj, "analysis_type")\n') 98 | d4list("str-help") 99 | return 100 | 101 | def d4list(objtype): 102 | 103 | # Analysis Modules Available for this objective 104 | anlav = False 105 | print("Available winreg analysis types:") 106 | # if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-winreg_kv", objtype): 107 | # anlav = True 108 | # print("- winreg_files: No.events winreg file (Input: winreg dfs)") 109 | 110 | if not anlav: 111 | print('- No analysis modules available for this object ('+objtype+').') 112 | 113 | nargs = len(args) 114 | 115 | if nargs == 0: 116 | syntax() 117 | return 118 | 119 | obj = args[0] 120 | 121 | objtype = d4com.data_identify(obj) 122 | 123 | if isinstance(obj, str): 124 | if obj == "list": 125 | d4list(objtype) 126 | return 127 | if obj == "help": 128 | syntax() 129 | return 130 | 131 | if nargs == 1: 132 | syntax() 133 | return 134 | 135 | anltype = args[1] 136 | 137 | if not isinstance(anltype, str): 138 | syntax() 139 | return 140 | 141 | if anltype == "help": 142 | syntax() 143 | return 144 | elif anltype == "list": 145 | d4list(objtype) 146 | return 147 | 148 | # ANALYSIS FUNCTIONS ====================================================== 149 | 150 | # XXXdfs ------------------------------------------------------------------ 151 | # if re.search("^dict-pandas_dataframe-XXXXX", objtype): 152 | # if anltype == "XXXXX_files": 153 | # return analysis_XXXXX_files(*args, **kwargs) 154 | 155 | print("INFO: [d4reg] No analysis functions available for this data type ("+objtype+")") 156 | 157 | # DATAFRAME ACCESSOR ########################################################## 158 | 159 | @pd.api.extensions.register_dataframe_accessor("d4reg") 160 | class Ds4n6RegAccessor: 161 | def __init__(self, pandas_obj): 162 | self._obj = pandas_obj 163 | 164 | def simple(self, *args, **kwargs): 165 | """ Redirects execution to simple_func() 166 | """ 167 | df = self._obj 168 | return simple_func(df, *args, **kwargs) 169 | 170 | 171 | @pd.api.extensions.register_dataframe_accessor("d4_winreg") 172 | class Ds4n6WinRegAccessor: 173 | def __init__(self, pandas_obj): 174 | self._obj = pandas_obj 175 | 176 | def simple(self, *args, **kwargs): 177 | """ Redirects execution to simple_func() 178 | """ 179 | df = self._obj 180 | return simple_func(df, *args, **kwargs) 181 | -------------------------------------------------------------------------------- /src/ds4n6_lib/svclist.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4svclst 10 | 11 | 
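# Illustrative usage of the recommended alias (added; svcdf is a hypothetical
# service-list DataFrame):
#   import ds4n6_lib.svclist as d4svclst
#   d4svclst.simple_func(svcdf)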
############################################################################### 12 | # IMPORTS 13 | ############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import inspect 23 | import pickle 24 | 25 | # DS IMPORTS ------------------------------------------------------------------ 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | 30 | # DS4N6 IMPORTS --------------------------------------------------------------- 31 | import ds4n6_lib.d4 as d4 32 | import ds4n6_lib.common as d4com 33 | import ds4n6_lib.gui as d4gui 34 | import ds4n6_lib.utils as d4utl 35 | import ds4n6_lib.unx as d4unx 36 | 37 | ############################################################################### 38 | # FUNCTIONS 39 | ############################################################################### 40 | 41 | # ANALYSIS FUNCTIONS ########################################################## 42 | 43 | # simple ====================================================================== 44 | def simple_func(df, *args, **kwargs): 45 | """ Reformat the input df so the data is presented to the analyst in the 46 | friendliest possible way 47 | 48 | Parameters: 49 | df (pd.dataframe): Input data 50 | 51 | Returns: 52 | pd.DataFrame: Optionally it will return the filtered dataframe, 53 | only if ret=True is set, constant & hidden columns included 54 | If ret_out=True is set, then the output just as it is shown 55 | (without constant/hidden columns) will be return 56 | """ 57 | 58 | if d4.debug >= 3: 59 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 60 | 61 | # Artifact-specific argument parsing ======================================= 62 | hiddencolsuser = kwargs.get('hiddencols', []) 63 | 64 | # Variables ================================================================ 65 | hiddencolsdef = [] 66 | 67 | # Merge artifact hiddencols with user-specified hiddencols + update kwargs 68 | hiddencols = hiddencolsuser + hiddencolsdef 69 | kwargs['hiddencols'] = hiddencols 70 | 71 | dfout = df 72 | 73 | # Maximum number of lines in DF for beautification 74 | maxdfbprintlines = 20 75 | 76 | # Pre-Processing ========================================================== 77 | 78 | # Call to simple_common =================================================== 79 | dfout = d4com.simple_common(df, *args, **kwargs, maxdfbprintlines=maxdfbprintlines) 80 | 81 | # Post-Processing ========================================================= 82 | 83 | # Return ================================================================== 84 | return dfout 85 | 86 | # analysis ==================================================================== 87 | def analysis(obj, *args, **kwargs): 88 | """ Redirects execution to analysis_func() 89 | """ 90 | return analysis_func(obj, *args, **kwargs) 91 | 92 | def analysis_func(obj, *args, **kwargs): 93 | """ Umbrella function that redirects to different types of analysis 94 | available on the input data 95 | 96 | Parameters: 97 | obj: Input data (typically DF or dict of DFs) 98 | 99 | Returns: 100 | pd.DataFrame: Refer to each specific analysis function 101 | """ 102 | 103 | # SUB-FUNCTIONS ########################################################### 
104 | def syntax(): 105 | print('Syntax: analysis(obj, "analysis_type")\n') 106 | d4list("str-help") 107 | return 108 | 109 | def d4list(objtype): 110 | 111 | # Analysis Modules Available for this objective 112 | anlav = False 113 | print("Available XXXXX analysis types:") 114 | if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-XXXXX", objtype): 115 | anlav = True 116 | print("- XXXXX_files: No.events XXXXX file (Input: XXXdfs)") 117 | 118 | if anlav == False: 119 | print('- No analysis modules available for this object ('+objtype+').') 120 | 121 | # FUNCTION BODY ########################################################### 122 | if d4.debug >= 3: 123 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 124 | 125 | thisdatatype = "XXXXXXXX-THIS_DATA_TYPE" 126 | 127 | nargs = len(args) 128 | 129 | if nargs == 0: 130 | syntax() 131 | return 132 | 133 | obj = args[0] 134 | 135 | objtype = d4com.data_identify(obj) 136 | 137 | if isinstance(obj, str): 138 | if obj == "list": 139 | d4list(objtype) 140 | return 141 | if obj == "help": 142 | syntax() 143 | return 144 | 145 | if nargs == 1: 146 | if thisdatatype is not None: 147 | if re.search("^dict-pandas_dataframe-"+thisdatatype, objtype) or re.search("^pandas_dataframe-"+thisdatatype, objtype): 148 | d4list(objtype) 149 | else: 150 | syntax() 151 | else: 152 | syntax() 153 | 154 | return 155 | 156 | anltype = args[1] 157 | 158 | if not isinstance(anltype, str): 159 | syntax() 160 | return 161 | 162 | if anltype == "help": 163 | syntax() 164 | return 165 | elif anltype == "list": 166 | d4list(objtype) 167 | return 168 | 169 | # ANALYSIS FUNCTIONS ====================================================== 170 | 171 | # XXXdfs ------------------------------------------------------------------ 172 | if re.search("^dict-pandas_dataframe-XXXXX", objtype): 173 | if anltype == "XXXXX_files": 174 | return analysis_XXXXX_files(*args, **kwargs) 175 | 176 | print("INFO: [d4XXX] No analysis functions available for this data type ("+objtype+")") 177 | 178 | # DATAFRAME ACCESSOR ########################################################## 179 | 180 | @pd.api.extensions.register_dataframe_accessor("d4svclst") 181 | class Ds4n6SvcListAccessor: 182 | def __init__(self, pandas_obj): 183 | self._obj = pandas_obj 184 | 185 | def simple(self, *args, **kwargs): 186 | """ Redirects execution to simple_func() 187 | """ 188 | df = self._obj 189 | return simple_func(df, *args, **kwargs) 190 | 191 | 192 | @pd.api.extensions.register_dataframe_accessor("d4_svclist") 193 | class Ds4n6SvcListAccessor: 194 | def __init__(self, pandas_obj): 195 | self._obj = pandas_obj 196 | 197 | def simple(self, *args, **kwargs): 198 | """ Redirects execution to simple_func() 199 | """ 200 | df = self._obj 201 | return simple_func(df, *args, **kwargs) 202 | -------------------------------------------------------------------------------- /src/ds4n6_lib/unx.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # IDEAS 9 | ############################################################################### 10 | # dfsed 11 | # multicol -> For a series or DF col, show in multiple cols to optimize screen 12 | # Equiv. 
to Linux: pr -l1 -t -3 /t 13 | 14 | ############################################################################### 15 | # INFO 16 | ############################################################################### 17 | # Recommended "import as": d4unx 18 | 19 | ############################################################################### 20 | # IMPORTS 21 | ############################################################################### 22 | 23 | # DEV IMPORTS ---------------------------------------------------------------- 24 | 25 | # python IMPORTS -------------------------------------------------------------- 26 | import re 27 | import inspect 28 | 29 | # DS IMPORTS ------------------------------------------------------------------ 30 | import numpy as np 31 | import pandas as pd 32 | 33 | ############################################################################### 34 | # FUNCTIONS 35 | ############################################################################### 36 | 37 | def xgrep_func(*args, **kwargs): 38 | 39 | def syntax(): 40 | print('Syntax: xgrep(,[],""[,""])') 41 | print(" [column(s)] -> If object is a DataFrame") 42 | print(" Options: i: case insensitive") 43 | print(" v: reverse") 44 | print(" p: explode cols with series elements") 45 | print(" t: do not apply style (highlight hits)") 46 | 47 | import collections 48 | 49 | nargs=len(args) 50 | 51 | if nargs == 0: 52 | syntax() 53 | return 54 | 55 | obj = args[0] 56 | 57 | if isinstance(obj, dict): 58 | return dictgrep(*args, **kwargs) 59 | elif isinstance(obj, pd.DataFrame): 60 | return dfgrep(*args, **kwargs) 61 | elif isinstance(obj, collections.abc.KeysView): 62 | return keysgrep(*args, **kwargs) 63 | 64 | def xgrep(*args, **kwargs): 65 | return xgrep_func(*args, **kwargs) 66 | 67 | def dfgrep(*args): 68 | """ 69 | Syntax: dfgrep("",""[,""]) 70 | Options: i: case insensitive 71 | v: reverse 72 | p: explode cols with series elements 73 | t: do not apply style (highlight hits) 74 | If your DF has only 1 column, you can skip the column name, 75 | just specify "" 76 | 77 | """ 78 | 79 | nargs=len(args) 80 | 81 | df = args[0] 82 | 83 | # If the user supplies just one arg we will assume that it is the regex 84 | # and that he wants to search the full DF for that regex 85 | if nargs == 2: 86 | cols = "*" 87 | regex = args[1] 88 | else: 89 | cols = args[1] 90 | regex = args[2] 91 | 92 | if nargs == 4: 93 | opt = args[3] 94 | else: 95 | opt = "" 96 | 97 | ndfcols=len(df.columns) 98 | 99 | if ndfcols == 1 and cols == "": 100 | cols = df.columns 101 | 102 | if cols == "*": 103 | cols = df.columns 104 | 105 | if regex == "": 106 | print("ERROR: regex cannot be empty") 107 | return 108 | 109 | # Parse Options 110 | if "v" in opt: 111 | reverse=True 112 | else: 113 | reverse=False 114 | 115 | if "i" in opt: 116 | case=False 117 | else: 118 | case=True 119 | 120 | if "t" in opt: 121 | applystyle = False 122 | else: 123 | applystyle = True 124 | 125 | dfout = pd.DataFrame([]) 126 | 127 | if isinstance(cols, str): 128 | cols=list([cols]) 129 | 130 | for col in cols: 131 | # Check if col is an existing column 132 | if col not in df.columns: 133 | print ('ERROR: column '+col+' not found in DF') 134 | return 135 | 136 | if "p" in opt: 137 | df=df.explode(col) 138 | 139 | # Identify if there are null values and fill them 140 | df=df.copy() 141 | # df[col]=df[col].fillna("d4_null") 142 | 143 | if reverse : 144 | resdf = df[~df[col].astype(str).str.contains(regex,case=case)] 145 | else: 146 | resdf = 
df[df[col].astype(str).str.contains(regex,case=case)]
147 | 
148 |         dfout = pd.concat([dfout, resdf])  # DataFrame.append() was removed in pandas 2.x
149 | 
150 |     # for col in cols:
151 |     #     dfout[col]=dfout[col].fillna("d4_null")
152 | 
153 |     dfout = dfout.drop_duplicates()
154 | 
155 |     if applystyle:
156 |         maxdfoutprintlines = 1000
157 |         if len(dfout) >= maxdfoutprintlines:
158 |             print('WARNING: Too many lines (>'+str(maxdfoutprintlines)+') in DataFrame for formatting. Returning unformatted output.')
159 |             return dfout
160 |         else:
161 |             dfout = dfout.reset_index()
162 |             return dfout.style.apply(lambda x: ["background: yellow" if re.search(regex, str(v)) else '' for v in x], axis=1)
163 |     else:
164 |         return dfout
165 | 
166 | def keysgrep(keys, regex, opt=""):
167 |     df = pd.DataFrame(list(keys), columns=['Key'])
168 |     return df.d4unx.dfgrep('Key', regex, opt)
169 | 
170 | def dictgrep(mydict, regex, opt=""):
171 |     # DFs dict -----------------------------------------------
172 |     if isinstance(mydict[list(mydict.keys())[0]], pd.DataFrame):
173 | 
174 |         outdf = pd.DataFrame([])
175 | 
176 |         # Do not apply style on dfgrep
177 |         dfgrepopt = opt+"t"
178 | 
179 |         for key in mydict.keys():
180 |             thisdf = dfgrep(mydict[key], "*", regex, dfgrepopt)
181 |             thisdf.insert(0, 'dict-Key_', key)
182 |             outdf = pd.concat([outdf, thisdf], ignore_index=True)
183 | 
184 |         # Return resulting DF
185 |         if "t" in opt:
186 |             return outdf.dropna(axis=1, how='all')
187 |         else:
188 |             return outdf.dropna(axis=1, how='all').style.apply(lambda x: ["background: yellow" if re.search(regex, str(v)) else '' for v in x], axis=1)  # str(v): cells may be non-string
189 |     else:
190 |         print("ERROR: dict variant not supported.")
191 | 
192 | def dfsed_func(df,col,regex,repl,opt=""):
193 | 
194 |     df[col] = df[col].str.replace(regex, repl, regex=True)  # regex=True: recent pandas no longer treats the pattern as a regex by default
195 | 
196 |     return df
197 | 
198 | def vc_func(df,col,countfilter="",ascending=False):
199 | 
200 |     dfout = df[col].value_counts(ascending=ascending).rename_axis(col).reset_index(name="Count")  # robust under pandas 2.x, where reset_index() names the count column "count"
201 | 
202 | 
203 |     if countfilter != "":
204 | 
205 |         n=int(countfilter)
206 |         dfout=dfout.query(f'Count == {n}')
207 | 
208 |     return dfout
209 | 
210 | def ddups_func(df):
211 |     dfout=df.drop_duplicates()
212 | 
213 |     return dfout
214 | 
215 | # ACCESSOR ####################################################################
216 | @pd.api.extensions.register_dataframe_accessor("d4unx")
217 | class Ds4n6UnxAccessor:
218 |     def __init__(self, pandas_obj):
219 |         self._obj = pandas_obj
220 | 
221 |     def dfgrep(self, *args, **kwargs):
222 |         obj = self._obj
223 |         return xgrep_func(obj, *args, **kwargs)
224 | 
225 |     def dfsed(self,col,regex,repl,opt=""):
226 |         df=self._obj.copy()
227 |         return dfsed_func(df,col,regex,repl,opt)
228 | 
229 |     def vc(self,col,countfilter="",ascending=False):
230 |         df=self._obj
231 |         return vc_func(df,col,countfilter,ascending)  # pass the filters through instead of dropping them
232 | 
233 |     def ddups(self):
234 |         df=self._obj
235 |         return ddups_func(df)  # was df.ddups_func(), which is not a DataFrame method
236 | 
237 | 
--------------------------------------------------------------------------------
/src/ds4n6_lib/autoruns.py:
--------------------------------------------------------------------------------
1 | # DS4N6
2 | #
3 | # Description: library of functions to apply Data Science in several forensics
4 | # artifacts
5 | #
6 | 
7 | ###############################################################################
8 | # INFO
9 | ###############################################################################
10 | # Recommended "import as": d4atrs
11 | 
12 | ###############################################################################
13 | # IMPORTS
14 | ###############################################################################
15 | # DEV IMPORTS
---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import pickle 23 | import inspect 24 | 25 | # DS IMPORTS ----------------------------------------------------------------- 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | from IPython.display import display, Markdown, HTML 30 | 31 | # DS4N6 IMPORTS --------------------------------------------------------------- 32 | import ds4n6_lib.d4 as d4 33 | import ds4n6_lib.common as d4com 34 | import ds4n6_lib.gui as d4gui 35 | import ds4n6_lib.utils as d4utl 36 | 37 | ############################################################################### 38 | # FUNCTIONS 39 | ############################################################################### 40 | 41 | # FILE READING FUNCTIONS ###################################################### 42 | 43 | def read_data(evdl, **kwargs): 44 | """ Read data from files or a folder 45 | 46 | Args: 47 | evdl (str): path to file/folder source 48 | kwargs: read options 49 | Returns: 50 | pandas.Dataframe or dictionary of pandas.DataFrame 51 | """ 52 | return d4com.read_data_common(evdl, **kwargs) 53 | 54 | # HARMONIZATION FUNCTIONS ##################################################### 55 | 56 | def harmonize(df, **kwargs): 57 | """ Convert DF in HAM format 58 | 59 | Args: 60 | df (pandas.DataFrame): DF to harmonize 61 | kwargs(dict): harmonize options 62 | Returns: 63 | pandas.DataFrame in HAM Format 64 | """ 65 | if d4.debug >= 2: 66 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 67 | 68 | hostname = kwargs.get('hostname', None) 69 | 70 | # Specific Harmonization Pre-Processing ----------------------------------- 71 | 72 | # Generic Harmonization --------------------------------------------------- 73 | df = d4com.harmonize_common(df, **kwargs) 74 | 75 | # Specific Harmonization Post-Processing ---------------------------------- 76 | df['D4_DataType_'] = 'autoruns' 77 | df['D4_Tool_'] = 'autoruns' 78 | if not hostname == None: 79 | df['D4_Hostname_'] = hostname 80 | 81 | # Signed_Verified_ column (boolean) - - - - - - - - - - - - - - - - - - - - 82 | signer_verifiedsr = df['Signer'].str.contains('^\\(Verified\\)') 83 | 84 | col = 'Signer' 85 | newcol = 'Signer_Verified_' 86 | 87 | colloc = df.columns.get_loc(col) 88 | newcolloc = colloc + 1 89 | if newcol not in df.columns: 90 | df.insert(newcolloc, newcol, "-") 91 | df[newcol] = signer_verifiedsr 92 | 93 | # Misc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 94 | df['Time'] = df['Time'].astype(str).replace('', np.NaN).astype('datetime64[ns]') 95 | 96 | return df 97 | 98 | # ANALYSIS FUNCTIONS ########################################################## 99 | 100 | # simple ====================================================================== 101 | 102 | def simple_func(df, *args, **kwargs): 103 | """ Reformat the input df so the data is presented to the analyst in the 104 | friendliest possible way 105 | 106 | Parameters: 107 | df (pd.dataframe): Input data 108 | 109 | Returns: 110 | pd.DataFrame: Optionally it will return the filtered dataframe, 111 | only if ret=True is set, constant & hidden columns included 112 | If ret_out=True is set, then the output just as it is shown 113 | (without constant/hidden columns) will be return 114 | """ 115 | 116 | 
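    # Added note: d4.debug is the library-wide debug level defined in
    # ds4n6_lib/d4.py (0 = disabled ... 5 = high detail); at level 2 and
    # above the library traces executed functions such as this one.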
if d4.debug >= 2: 117 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 118 | 119 | # Variables ---------------------------------------------------------------- 120 | hiddencols = ['MD5','SHA-1','PESHA-1','PESHA-256','SHA-256','RunspaceId','IMP'] 121 | 122 | # Maximum number of lines in DF for beautification 123 | maxdfbprintlines = 20 124 | 125 | # Call to simple_common ---------------------------------------------------- 126 | return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 127 | 128 | # analysis ==================================================================== 129 | def analysis(*args, **kwargs): 130 | """ Redirects execution to analysis_func() 131 | """ 132 | if d4.debug >= 2: 133 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 134 | 135 | return analysis_func(*args, **kwargs) 136 | 137 | 138 | def analysis_func(*args, **kwargs): 139 | """ Umbrella function that redirects to different types of analysis 140 | available on the input data 141 | 142 | Parameters: 143 | obj: Input data (typically DF or dict of DFs) 144 | 145 | Returns: 146 | pd.DataFrame: Refer to each specific analysis function 147 | """ 148 | 149 | if d4.debug >= 2: 150 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 151 | 152 | def syntax(): 153 | print('Syntax: analysis(obj, "analysis_type")\n') 154 | d4list("str-help") 155 | return 156 | 157 | def d4list(objtype): 158 | print("Available autoruns analysis types:") 159 | print("- find_powershell: Analyze data and find powershell") 160 | 161 | nargs = len(args) 162 | 163 | if nargs == 0: 164 | syntax() 165 | return 166 | 167 | obj = args[0] 168 | 169 | objtype = d4com.data_identify(obj) 170 | 171 | if isinstance(obj, str): 172 | if obj == "list": 173 | d4list(objtype) 174 | return 175 | if obj == "help": 176 | syntax() 177 | return 178 | 179 | if nargs == 1: 180 | syntax() 181 | return 182 | 183 | anltype = args[1] 184 | 185 | if not isinstance(anltype, str): 186 | syntax() 187 | return 188 | 189 | if anltype == "help": 190 | syntax() 191 | return 192 | elif anltype == "list": 193 | d4list(objtype) 194 | return 195 | 196 | if re.search("^pandas_dataframe-autoruns", objtype): 197 | if anltype == "find_powershell": 198 | return analysis_find_powershell(*args, **kwargs) 199 | else: 200 | print("ERROR: [autoruns] Unsupported input data.") 201 | return 202 | 203 | def analysis_find_powershell(obj, *args, **kwargs): 204 | """ Analysis that finds poweshell in the DF 205 | 206 | Args: 207 | obj: Input data (typically DF or dict of DFs) 208 | Returns: 209 | pandas.Dataframe with the results of the analysis 210 | 211 | """ 212 | if d4.debug >= 2: 213 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 214 | 215 | df = obj 216 | 217 | return df.xgrep("*", "powershell", "t" ).spl(out=True, ret=True) 218 | 219 | # DATAFRAME ACCESSOR ########################################################## 220 | 221 | @pd.api.extensions.register_dataframe_accessor("d4atrs") 222 | class Ds4n6AtrsAccessor: 223 | def __init__(self, pandas_obj): 224 | self._obj = pandas_obj 225 | 226 | def simple(self, *args, **kwargs): 227 | """ Redirects execution to simple_func() 228 | """ 229 | df = self._obj 230 | return simple_func(df, 
*args, **kwargs) 231 | 232 | @pd.api.extensions.register_dataframe_accessor("d4_autoruns") 233 | class Ds4n6AutorunsAccessor: 234 | def __init__(self, pandas_obj): 235 | self._obj = pandas_obj 236 | 237 | def simple(self, *args, **kwargs): 238 | """ Redirects execution to simple_func() 239 | """ 240 | df = self._obj 241 | return simple_func(df, *args, **kwargs) 242 | 243 | 244 | -------------------------------------------------------------------------------- /src/ds4n6_lib/kansa.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4ksa 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | import xmltodict 25 | import json 26 | import pickle 27 | from tqdm import tqdm 28 | import xml.etree.ElementTree as et 29 | 30 | # DS IMPORTS ------------------------------------------------------------------ 31 | import numpy as np 32 | import pandas as pd 33 | import matplotlib.pyplot as plt 34 | from IPython.display import display, Markdown, HTML 35 | 36 | from sklearn.model_selection import train_test_split 37 | from tensorflow.keras.models import Model, load_model 38 | from tensorflow.keras.layers import Input, Dense 39 | 40 | # DS4N6 IMPORTS --------------------------------------------------------------- 41 | import ds4n6_lib.d4 as d4 42 | import ds4n6_lib.common as d4com 43 | import ds4n6_lib.gui as d4gui 44 | import ds4n6_lib.utils as d4utl 45 | 46 | ############################################################################### 47 | # FUNCTIONS 48 | ############################################################################### 49 | 50 | # FILE READING FUNCTIONS ###################################################### 51 | 52 | def read_data(evdl, **kwargs): 53 | return d4com.read_data_common(evdl, **kwargs) 54 | 55 | # HARMONIZATION FUNCTIONS ##################################################### 56 | 57 | def harmonize(df, **kwargs): 58 | """ Convert DF in HAM format 59 | 60 | Args: 61 | df (pandas.DataFrame): DF to harmonize 62 | kwargs(dict): harmonize options 63 | Returns: 64 | pandas.DataFrame in HAM Format 65 | """ 66 | 67 | # (1) kansa will probably be invoked with 'tool=kansa', but kansa is in 68 | # reality an orchestrator, so we will only populate the D4_Tool_ column 69 | # to kansa only if we have not been able to determine what is the 70 | # underlying tool that kansa is using for execution in the endpoint 71 | 72 | # Specific Harmonization Pre-Processing ----------------------------------- 73 | if not 'D4_Orchestrator_' in df.columns: 74 | df.insert(0, 'D4_Orchestrator_', "kansa") 75 | else: 76 | # If the D4_Orchestrator_ col exists, we are in a recursive call 77 | return df 78 | 79 | objtype = d4com.data_identify(df) 80 | 81 | # Generic Harmonization --------------------------------------------------- 82 | 83 | # Since kansa 
is an orchestrator, let's try to identify the specific
84 |     # data type and apply the corresponding harmonization function.
85 |     # If we cannot, we will fall back to the generic one.
86 |     if "unknown" in objtype:
87 |         df = d4com.harmonize_common(df, **kwargs)
88 |     else:
89 |         # Let's try to harmonize this specific df
90 |         # WARNING: Since we no longer identify datatype by DF cols, this will
91 |         #          not work
92 |         df = d4com.harmonize(df)
93 | 
94 |     # Specific Harmonization Post-Processing ----------------------------------
95 |     df['D4_Hostname_'] = df['PSComputerName']
96 | 
97 |     if df['D4_Plugin_'].iloc[0] == "Tasklistv":
98 |         df['D4_DataType_'] = "pslist"
99 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
100 | 
101 |         # Rename columns
102 |         df = df.rename(columns={'ImageName': 'Name_', 'PID': 'PID_',
103 |                                 'SessionName': 'SessionName_',
104 |                                 'SessionNum': 'Session_', 'MemUsage': 'MemUsage_',
105 |                                 'Status': 'Status_', 'UserName': 'UserName_',
106 |                                 'CPUTime': 'CPUTime_', 'WindowTitle': 'WindowTitle_'
107 |                                 })
108 | 
109 |     elif df['D4_Plugin_'].iloc[0] == "SvcAll":
110 |         df['D4_DataType_'] = "svclist"
111 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
112 | 
113 |         # Rename columns
114 |         df = df.rename(columns={'Name': 'Name_', 'DisplayName': 'DisplayName_',
115 |                                 'PathName': 'FilePath_', 'StartName': 'UserName_',
116 |                                 'StartMode': 'StartMode_', 'State': 'State_',
117 |                                 'TotalSessions': 'TotalSessions_',
118 |                                 'Description': 'Description_'
119 |                                 })
120 | 
121 |     return df
122 | 
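# Example (illustrative sketch, not part of the library API): the plugin
# harmonization above is, at its core, a column-mapping exercise. The helper
# below shows the idea on a fabricated Tasklistv-style row; the input column
# names are real kansa/Tasklistv fields, but the function itself is invented
# for this demo.
def _demo_harmonize_tasklistv_sketch():
    demo = pd.DataFrame({'PSComputerName': ['HOST01'],
                         'ImageName': ['lsass.exe'],
                         'PID': ['652']})
    demo.insert(0, 'D4_Orchestrator_', "kansa")
    demo['D4_Hostname_'] = demo['PSComputerName']
    # Same renaming convention as harmonize(): tool columns -> HAM columns
    return demo.rename(columns={'ImageName': 'Name_', 'PID': 'PID_'})
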
"str-list" or re.search("^dict-pandas_dataframe-kansa", objtype): 178 | anlav = True 179 | print("- kansa_files: No.events kansa file (Input: ksadfs)") 180 | 181 | if anlav == False: 182 | print('- No analysis modules available for this object ('+objtype+').') 183 | 184 | nargs = len(args) 185 | 186 | if nargs == 0: 187 | syntax() 188 | return 189 | 190 | obj = args[0] 191 | 192 | objtype = d4com.data_identify(obj) 193 | 194 | if isinstance(obj, str): 195 | if obj == "list": 196 | d4list(objtype) 197 | return 198 | if obj == "help": 199 | syntax() 200 | return 201 | 202 | if nargs == 1: 203 | syntax() 204 | return 205 | 206 | anltype = args[1] 207 | 208 | if not isinstance(anltype, str): 209 | syntax() 210 | return 211 | 212 | if anltype == "help": 213 | syntax() 214 | return 215 | elif anltype == "list": 216 | d4list(objtype) 217 | return 218 | 219 | # ksadfs ------------------------------------------------------------------ 220 | if re.search("^dict-pandas_dataframe-kansa", objtype): 221 | if anltype == "kansa_files": 222 | return analysis_kansa_files(*args, **kwargs) 223 | 224 | print("INFO: [d4ksa] No analysis functions available for this data type ("+objtype+")") 225 | 226 | def analysis_kansa_files(*args, **kwargs): 227 | """ Analysis that gives kansa files 228 | 229 | Args: 230 | obj: Input data (typically DF or dict of DFs) 231 | Returns: 232 | pandas.Dataframe with the results of the analysis 233 | 234 | """ 235 | dfs = args[0] 236 | 237 | objtype = d4com.data_identify(dfs) 238 | 239 | if objtype != "dict-pandas_dataframe-kansa": 240 | print("ERROR: Invalid object for function: "+objtype) 241 | print(" Input object should be: dict-pandas_dataframe-kansa") 242 | return 243 | 244 | outdf = pd.DataFrame([],columns=['File','NEntries']) 245 | row = pd.Series() 246 | 247 | for key in dfs.keys(): 248 | row['File'] = key 249 | row['NEntries'] = len(dfs[key]) 250 | 251 | outdf = outdf.append(row,ignore_index=True) 252 | 253 | return outdf 254 | 255 | # DATAFRAME ACCESSOR ########################################################## 256 | 257 | @pd.api.extensions.register_dataframe_accessor("d4ksa") 258 | class Ds4n6KsaAccessor: 259 | def __init__(self, pandas_obj): 260 | self._obj = pandas_obj 261 | 262 | def simple(self, *args, **kwargs): 263 | """ Redirects execution to simple_func() 264 | """ 265 | df = self._obj 266 | return simple_func(df, *args, **kwargs) 267 | 268 | @pd.api.extensions.register_dataframe_accessor("d4_kansa") 269 | class Ds4n6KansaAccessor: 270 | def __init__(self, pandas_obj): 271 | self._obj = pandas_obj 272 | 273 | def simple(self, *args, **kwargs): 274 | """ Redirects execution to simple_func() 275 | """ 276 | df = self._obj 277 | return simple_func(df, *args, **kwargs) 278 | 279 | 280 | -------------------------------------------------------------------------------- /src/ds4n6_lib/volatility.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4vol 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS 
255 | # DATAFRAME ACCESSOR ##########################################################
256 | 
257 | @pd.api.extensions.register_dataframe_accessor("d4ksa")
258 | class Ds4n6KsaAccessor:
259 |     def __init__(self, pandas_obj):
260 |         self._obj = pandas_obj
261 | 
262 |     def simple(self, *args, **kwargs):
263 |         """ Redirects execution to simple_func()
264 |         """
265 |         df = self._obj
266 |         return simple_func(df, *args, **kwargs)
267 | 
268 | @pd.api.extensions.register_dataframe_accessor("d4_kansa")
269 | class Ds4n6KansaAccessor:
270 |     def __init__(self, pandas_obj):
271 |         self._obj = pandas_obj
272 | 
273 |     def simple(self, *args, **kwargs):
274 |         """ Redirects execution to simple_func()
275 |         """
276 |         df = self._obj
277 |         return simple_func(df, *args, **kwargs)
278 | 
279 | 
280 | 
--------------------------------------------------------------------------------
/src/ds4n6_lib/volatility.py:
--------------------------------------------------------------------------------
1 | # DS4N6
2 | #
3 | # Description: library of functions to apply Data Science in several forensics
4 | #              artifacts
5 | #
6 | 
7 | ###############################################################################
8 | # INFO
9 | ###############################################################################
10 | # Recommended "import as": d4vol
11 | 
12 | ###############################################################################
13 | # IMPORTS
14 | ###############################################################################
15 | 
16 | # DEV IMPORTS ----------------------------------------------------------------
17 | 
18 | # python IMPORTS --------------------------------------------------------------
19 | import os
20 | import glob
21 | import re
22 | import time
23 | import inspect
24 | import json
25 | import pickle
26 | 
27 | # DS IMPORTS ------------------------------------------------------------------
28 | import numpy as np
29 | import pandas as pd
30 | from IPython.display import display, Markdown, HTML
31 | 
32 | # DS4N6 IMPORTS ---------------------------------------------------------------
33 | import ds4n6_lib.d4 as d4
34 | import ds4n6_lib.common as d4com
35 | import ds4n6_lib.gui as d4gui
36 | import ds4n6_lib.utils as d4utl
37 | 
38 | ###############################################################################
39 | # FUNCTIONS
40 | ###############################################################################
41 | 
42 | # FILE READING FUNCTIONS ######################################################
43 | 
44 | def read_data(evdl, **kwargs):
45 |     """ Read data from files or a folder
46 | 
47 |         Args: 
48 |             evdl (str): path to file/folder source
49 |             kwargs: read options
50 |         Returns: 
51 |             pandas.Dataframe or dictionary of pandas.DataFrame
52 |     """
53 |     if d4.debug >= 3:
54 |         print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")
55 | 
56 |     return d4com.read_data_common(evdl, **kwargs)
57 | 
58 | # HARMONIZATION FUNCTIONS #####################################################
59 | 
60 | def harmonize(df, **kwargs):
61 |     """ Convert DF in HAM format
62 | 
63 |         Args: 
64 |             df (pandas.DataFrame): DF to harmonize
65 |             kwargs(dict): harmonize options
66 |         Returns: 
67 |             pandas.DataFrame in HAM Format
68 |     """
69 |     plugin   = kwargs.get('plugin',   None)
70 |     hostname = kwargs.get('hostname', None)
71 | 
72 |     # Specific Harmonization Pre-Processing -----------------------------------
73 |     if hostname is not None:
74 |         df['D4_Hostname_'] = hostname
75 |     if plugin is not None:
76 |         df['D4_Plugin_'] = plugin
77 |     if not df.index.empty and df.index[0] == ">":
78 |         df.reset_index(drop=True, inplace=True)
79 | 
80 |     # Generic Harmonization ---------------------------------------------------
81 |     df = d4com.harmonize_common(df, **kwargs)
82 | 
83 |     # Specific Harmonization Post-Processing ----------------------------------
84 | 
85 |     # pslist  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
86 |     if plugin == "pslist":
87 |         df['D4_DataType_'] = "pslist"
88 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
89 | 
90 |         # Rename columns
91 |         df = df.rename(columns={'Name': 'Name_', 'PID': 'PID_', 'PPID': 'PPID_',
92 |                                 'Thds': 'Threads_', 'Hnds': 'Handles_',
93 |                                 'Sess': 'Session_', 'Wow64': 'Wow64_',
94 |                                 'Start': 'Start_TStamp_', 'Exit': 'Exit_TStamp_'
95 |                                 })
96 | 
97 |         # Adjust data types (regex=True keeps the pattern semantics on pandas 2.x)
98 |         df['Session_'] = df['Session_'].str.replace('^--*$', '-1', regex=True)
99 |         df['Session_'] = df['Session_'].astype(int)
100 |         df['Handles_'] = df['Handles_'].str.replace('^--*$', '-1', regex=True)
101 |         df['Handles_'] = df['Handles_'].astype(int)
102 |         df['Start_TStamp_'] = pd.to_datetime(df['Start_TStamp_'])
103 |         df['Exit_TStamp_'] = pd.to_datetime(df['Exit_TStamp_'])
104 | 
105 |     # psscan  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
106 |     elif plugin == "psscan":
107 |         df['D4_DataType_'] = "pslist"
108 |         df['D4_DataType_'] = df['D4_DataType_'].astype('category')
109 | 
110 |         # Rename columns
111 |         df = df.rename(columns={'Name': 'Name_', 'PID': 'PID_', 'PPID': 'PPID_',
112 |                                 'Time created': 'Start_TStamp_', 'Time exited': 'Exit_TStamp_'
113 |                                 })
114 | 
115 |         # Adjust data types
116 |         df['Start_TStamp_'] = pd.to_datetime(df['Start_TStamp_'])
117 |         df['Exit_TStamp_'] = pd.to_datetime(df['Exit_TStamp_'])
118 | 
119 |     # return ------------------------------------------------------------------
120 |     return df
121 | 
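# Example (illustrative sketch, not part of the library API): the data type
# adjustments above turn volatility's textual placeholders ("--") into
# sentinel integers and parse timestamps. The fabricated frame below shows
# the same cleanup with plain pandas.
def _demo_pslist_dtype_cleanup_sketch():
    demo = pd.DataFrame({'Session_': ['0', '--'],
                         'Start_TStamp_': ['2017-01-01 10:00:00', '2017-01-01 10:05:00']})
    demo['Session_'] = demo['Session_'].str.replace('^--*$', '-1', regex=True).astype(int)
    demo['Start_TStamp_'] = pd.to_datetime(demo['Start_TStamp_'])
    return demo
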
122 | # ANALYSIS FUNCTIONS ##########################################################
123 | 
124 | # simple ======================================================================
125 | def simple_func(df, *args, **kwargs):
126 |     """ Reformat the input df so the data is presented to the analyst in the
127 |         friendliest possible way
128 | 
129 |     Parameters:
130 |         df (pd.dataframe): Input data
131 | 
132 |     Returns:
133 |         pd.DataFrame: Optionally it will return the filtered dataframe,
134 |             only if ret=True is set, constant & hidden columns included
135 |             If ret_out=True is set, then the output, just as it is shown
136 |             (without constant/hidden columns), will be returned
137 |     """
138 | 
139 |     if d4.debug >= 4:
140 |         print("DEBUG: [vol] [simple_func()]")
141 | 
142 |     # Variables ----------------------------------------------------------------
143 |     hiddencols = []
144 | 
145 |     # Maximum number of lines in DF for beautification
146 |     maxdfbprintlines = 20
147 | 
148 |     # Call to simple_common ----------------------------------------------------
149 |     return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)
150 | 
151 | # analysis() ==================================================================
152 | def analysis(*args, **kwargs):
153 |     """ Redirects execution to analysis_func()
154 |     """
155 |     return analysis_func(*args, **kwargs)
156 | 
157 | def analysis_func(*args, **kwargs):
158 |     """ Umbrella function that redirects to different types of analysis
159 |         available on the input data
160 | 
161 |     Parameters:
162 |         obj: Input data (typically DF or dict of DFs)
163 | 
164 |     Returns:
165 |         pd.DataFrame: Refer to each specific analysis function
166 |     """
167 |     def syntax():
168 |         print('Syntax: analysis(obj, "analysis_type")\n')
169 |         d4list("str-help")
170 |         return
171 | 
172 |     def d4list(objtype):
173 |         anlav = False
174 |         print("Available volatility analysis types:")
175 |         if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-volatility", objtype):
176 |             anlav = True
177 |             print("- volatility_files: Number of events per volatility file (Input: voldfs)")
178 |         if anlav == False:
179 |             print('- No analysis modules available for this object ('+objtype+').')
180 | 
181 |     nargs = len(args)
182 | 
183 |     if nargs == 0:
184 |         syntax()
185 |         return
186 | 
187 |     obj = args[0]
188 | 
189 |     objtype = d4com.data_identify(obj)
190 | 
191 |     if isinstance(obj, str):
192 |         if obj == "list":
193 |             d4list(objtype)
194 |             return
195 |         if obj == "help":
196 |             syntax()
197 |             return
198 | 
199 |     if nargs == 1:
200 |         syntax()
201 |         return
202 | 
203 |     anltype = args[1]
204 | 
205 |     if not isinstance(anltype, str):
206 |         syntax()
207 |         return
208 | 
209 |     if anltype == "help":
210 |         syntax()
211 |         return
212 |     elif anltype == "list":
213 |         d4list(objtype)
214 |         return
215 | 
216 |     # voldfs ------------------------------------------------------------------
217 |     if re.search("^dict-pandas_dataframe-volatility", objtype):
218 |         if anltype == "volatility_files":
219 |             return analysis_volatility_files(*args, **kwargs)
220 | 
221 |     print("INFO: [d4vol] No analysis functions available for this data type ("+objtype+")")
222 | 
223 | # ANALYSIS FUNCTIONS ==========================================================
224 | 
225 | def analysis_volatility_files(*args, **kwargs):
226 |     """ Analysis that counts the number of entries in each volatility file
227 | 
228 |     Args:
229 |         obj: Input data (typically DF or dict of DFs)
230 |     Returns:
231 |         pandas.Dataframe with the results of the analysis
232 | 
233 |     """
234 |     dfs = args[0]
235 | 
236 |     objtype = d4com.data_identify(dfs)
237 | 
238 |     if not re.search("^dict-pandas_dataframe-volatility", objtype):
239 |         print("ERROR: Invalid object for function: "+objtype)
240 |         print("       Input object should be:     dict-pandas_dataframe-volatility")
241 |         return
242 | 
243 |     # Build one row per file inside the loop (the original append() call sat
244 |     # outside the loop, so only the last file was counted)
245 |     rows = []
246 |     for key in dfs.keys():
247 |         rows.append({'VolFile': key, 'NEntries': len(dfs[key])})
248 | 
249 |     outdf = pd.DataFrame(rows, columns=['NEntries', 'VolFile'])
250 |     outdf = outdf.sort_values(by=['VolFile']).reset_index(drop=True)
251 | 
252 |     return outdf
253 | 
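# Example (illustrative sketch, not part of the library API): the accessors
# below use pandas' extension API, which attaches a custom namespace to every
# DataFrame. A minimal standalone version, with an invented "d4demo"
# namespace, would look like this:
#
#   @pd.api.extensions.register_dataframe_accessor("d4demo")
#   class DemoAccessor:
#       def __init__(self, pandas_obj):
#           self._obj = pandas_obj
#
#       def ncols(self):
#           return len(self._obj.columns)
#
#   pd.DataFrame({'a': [1]}).d4demo.ncols()   # -> 1
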
254 | # DATAFRAME ACCESSOR ##########################################################
255 | 
256 | @pd.api.extensions.register_dataframe_accessor("d4vol")
257 | class Ds4n6VolAccessor:
258 |     def __init__(self, pandas_obj):
259 |         self._obj = pandas_obj
260 | 
261 |     def simple(self, *args, **kwargs):
262 |         """ Redirects execution to simple_func()
263 |         """
264 |         df = self._obj
265 |         return simple_func(df, *args, **kwargs)
266 | 
267 | @pd.api.extensions.register_dataframe_accessor("d4_volatility")
268 | class Ds4n6VolatilityAccessor:
269 |     def __init__(self, pandas_obj):
270 |         self._obj = pandas_obj
271 | 
272 |     def simple(self, *args, **kwargs):
273 |         """ Redirects execution to simple_func()
274 |         """
275 |         df = self._obj
276 |         return simple_func(df, *args, **kwargs)
277 | 
278 | 
--------------------------------------------------------------------------------
/src/ds4n6_lib/mactime.py:
--------------------------------------------------------------------------------
1 | # DS4N6
2 | #
3 | # Description: library of functions to apply Data Science in several forensics
4 | #              artifacts
5 | #
6 | 
7 | ###############################################################################
8 | # INFO
9 | ###############################################################################
10 | # Recommended "import as": d4mctm
11 | 
12 | ###############################################################################
13 | # IMPORTS
14 | ###############################################################################
15 | 
16 | # DEV IMPORTS ----------------------------------------------------------------
17 | 
18 | # python IMPORTS --------------------------------------------------------------
19 | import os
20 | import glob
21 | import re
22 | import time
23 | import inspect
24 | import xmltodict
25 | import json
26 | import pickle
27 | from tqdm import tqdm
28 | import xml.etree.ElementTree as et
29 | 
30 | # DS IMPORTS ------------------------------------------------------------------
31 | import numpy as np
32 | import pandas as pd
33 | import matplotlib.pyplot as plt
34 | from IPython.display import display, Markdown, HTML
35 | 
36 | from sklearn.model_selection import train_test_split
37 | from tensorflow.keras.models import Model, load_model
38 | from tensorflow.keras.layers import Input, Dense
39 | 
40 | # DS4N6 IMPORTS ---------------------------------------------------------------
41 | import ds4n6_lib.d4 as d4
42 | import ds4n6_lib.common as d4com
43 | import ds4n6_lib.gui as d4gui
44 | import ds4n6_lib.utils as d4utl
45 | 
46 | 
############################################################################### 47 | # IDEAS 48 | ############################################################################### 49 | # is_deleted() 50 | # is_file() 51 | # is_dir() / is_folder() - level 52 | # ext() # filter by Extension 53 | # nofn # exclude $FILE_NAME entries 54 | 55 | ############################################################################### 56 | # FUNCTIONS 57 | ############################################################################### 58 | 59 | # FILE READING FUNCTIONS ###################################################### 60 | 61 | # FILE READING FUNCTIONS ###################################################### 62 | 63 | def read_data(evdl, **kwargs): 64 | if d4.debug >= 3: 65 | print("DEBUG: [mctm] read_data") 66 | 67 | return d4com.read_data_common(evdl, **kwargs) 68 | 69 | # HARMONIZATION FUNCTIONS ##################################################### 70 | 71 | def harmonize(df, **kwargs): 72 | """ Convert DF in HAM format 73 | 74 | Args: 75 | df (pandas.DataFrame): DF to harmonize 76 | kwargs(dict): harmonize options 77 | Returns: 78 | pandas.DataFrame in HAM Format 79 | """ 80 | objtype = d4com.data_identify(df) 81 | 82 | if objtype == "pandas_dataframe-mactime-raw": 83 | # Specific Harmonization Pre-Processing ----------------------------------- 84 | df = df.rename(columns={"Type": "MACB"}) 85 | 86 | df['Type_'] = df['Mode'].str.extract('^(.)') 87 | df['PrevType_'] = df['Mode'].str.extract('^..(.)') 88 | df['Permissions_'] = df['Mode'].str.extract('^...(.........)') 89 | 90 | # Deleted / Reallocated 91 | df['Deleted_'] = df['File Name'].str.contains(r'\ \(deleted\)$|\ \(deleted-reallocated\)$') 92 | df['Reallocated_'] = df['File Name'].str.contains(r'\ \(deleted-reallocated\)$') 93 | 94 | # [FT] Tag -> Tag_ | DriveLetter_ | VSS_ | EVOName_ | EvidenceName_ | Partition_ | FilePath_ 95 | # FT 96 | if re.search(r'^[A-Z]\[vss[0-9][0-9]\]{.*}:', df['File Name'].iloc[0]): 97 | fncolsdf = df['File Name'].str.split(":", 1, expand=True).rename(columns={0: "Tag_", 1: "FilePath_"}) 98 | fncolsdf['FilePath-Hash_'] = fncolsdf['FilePath_'].str.lower().apply(hash) 99 | fncolsdf['FSType_'] = '-' 100 | df['Hostname_'] = '-' 101 | df['SHA256_Hash_'] = '-' 102 | 103 | fncols2df = fncolsdf['Tag_'].str.extract(r'([A-Z])\[vss(.*)\]{(.*)}', expand=True).rename(columns={0: "DriveLetter_", 1: "VSS_", 2: "EVOName_"}) 104 | fncols2df['VSS_'] = fncols2df['VSS_'].astype(int) 105 | 106 | fncols3df = fncols2df['EVOName_'].str.extract('(.*)-ft-p(.*)', expand=True).rename(columns={0: "EvidenceName_", 1: "Partition_"}) 107 | fncols3df['Partition_'] = fncols3df['Partition_'].astype(int) 108 | 109 | df = pd.concat([df, fncols2df, fncols3df, fncolsdf], axis=1) 110 | 111 | else: 112 | fncolsdf = df['File Name'].str.split(":", 1, expand=True).rename(columns={0: "Tag_", 1: "FilePath_"}) 113 | df = pd.concat([df, fncolsdf], axis=1) 114 | df['Hostname_'] = '-' 115 | df['EVOName_'] = '-' 116 | df['EvidenceName_'] = '-' 117 | df['Partition_'] = '-' 118 | df['FSType_'] = '-' 119 | df['DriveLetter_'] = '-' 120 | df['VSS_'] = '-' 121 | df['TSNTFSAttr_'] = '-' 122 | df['SHA256_Hash_'] = '-' 123 | 124 | # Deal with "($FILE_NAME)" string 125 | tsntfsattrmap = {True: 'FILE_NAME', False: 'STD_INFO'} 126 | df['TSNTFSAttr_'] = df['FilePath_'].str.contains(r'\ \(\$FILE_NAME\)$').map(tsntfsattrmap) 127 | df['FilePath_'] = df['FilePath_'].str.replace(r'\ \(\$FILE_NAME\)$','') 128 | 129 | df['FilePath_'] = df['FilePath_'].str.replace(r'\ \(deleted\)$|\ 
\(deleted-reallocated\)$','') 130 | 131 | # Generic Harmonization --------------------------------------------------- 132 | df = d4com.harmonize_common(df, **kwargs) 133 | 134 | # Specific Harmonization Post-Processing ---------------------------------- 135 | 136 | return df 137 | 138 | # CORE FUNCTIONS (simple, analysis, etc.) ##################################### 139 | 140 | # simple ====================================================================== 141 | 142 | def simple_func(df, *args, **kwargs): 143 | """ Reformat the input df so the data is presented to the analyst in the 144 | friendliest possible way 145 | 146 | Parameters: 147 | df (pd.dataframe): Input data 148 | 149 | Returns: 150 | pd.DataFrame: Optionally it will return the filtered dataframe, 151 | only if ret=True is set, constant & hidden columns included 152 | If ret_out=True is set, then the output just as it is shown 153 | (without constant/hidden columns) will be return 154 | """ 155 | 156 | if d4.debug >= 3: 157 | print("DEBUG: [mctm] [simple_func()]") 158 | 159 | windows = kwargs.get('windows', True) 160 | 161 | # Variables ---------------------------------------------------------------- 162 | hiddencols = ['File_Name', 'FilePath-Hash_', 'SHA256_Hash_'] 163 | 164 | if windows : 165 | nonwincols = ['UID', 'GID', 'Mode', 'Permissions_'] 166 | hiddencols = hiddencols + nonwincols 167 | 168 | # Maximum number of lines in DF for beautification 169 | maxdfbprintlines = 20 170 | 171 | # Call to simple_common ---------------------------------------------------- 172 | return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 173 | 174 | 175 | # DATAFRAME ACCESSOR ########################################################## 176 | 177 | @pd.api.extensions.register_dataframe_accessor("d4mctm") 178 | class Ds4n6MctmAccessor: 179 | def __init__(self, pandas_obj): 180 | self._obj = pandas_obj 181 | 182 | def simple(self, *args, **kwargs): 183 | """ Redirects execution to simple_func() 184 | """ 185 | df = self._obj 186 | return simple_func(df, *args, **kwargs) 187 | 188 | @pd.api.extensions.register_dataframe_accessor("d4_mactime") 189 | class Ds4n6MactimeAccessor: 190 | def __init__(self, pandas_obj): 191 | self._obj = pandas_obj 192 | 193 | def simple(self, *args, **kwargs): 194 | """ Redirects execution to simple_func() 195 | """ 196 | df = self._obj 197 | return simple_func(df, *args, **kwargs) 198 | 199 | # ANALYSIS #################################################################### 200 | 201 | # analysis() function ========================================================= 202 | def analysis(*args, **kwargs): 203 | """ Redirects execution to analysis_func() 204 | """ 205 | return analysis_func(*args, **kwargs) 206 | 207 | def analysis_func(*args, **kwargs): 208 | """ Umbrella function that redirects to different types of analysis 209 | available on the input data 210 | 211 | Parameters: 212 | obj: Input data (typically DF or dict of DFs) 213 | 214 | Returns: 215 | pd.DataFrame: Refer to each specific analysis function 216 | """ 217 | 218 | def syntax(): 219 | print('Syntax: analysis(obj, "analysis_type")\n') 220 | d4list("str-help") 221 | return 222 | 223 | def d4list(objtype): 224 | 225 | # Analysis Modules Available for this objective 226 | # anlav = False 227 | print("Available fstl analysis types:") 228 | print("- No analysis functions defined yet.") 229 | return 230 | 231 | # TEMPLATE 232 | #if objtype == "str-help" or objtype == "str-list" or 
re.search("^pandas_dataframe-fstl-mactime-standard", objtype): 233 | # anlav = True 234 | # print("- XXXXXXXXXX: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX (Input: fstldf)") 235 | 236 | # if anlav == False: 237 | # print('- No analysis modules available for this object ('+objtype+').') 238 | 239 | nargs = len(args) 240 | 241 | if nargs == 0: 242 | syntax() 243 | return 244 | 245 | obj = args[0] 246 | 247 | objtype = d4com.data_identify(obj) 248 | 249 | if isinstance(obj, str): 250 | if obj == "list": 251 | d4list(objtype) 252 | return 253 | if obj == "help": 254 | syntax() 255 | return 256 | 257 | if nargs == 1: 258 | syntax() 259 | return 260 | 261 | anltype = args[1] 262 | 263 | if not isinstance(anltype, str): 264 | syntax() 265 | return 266 | 267 | if anltype == "help": 268 | syntax() 269 | return 270 | elif anltype == "list": 271 | d4list(objtype) 272 | return 273 | 274 | # TEMPLATE 275 | # If object is a dict of dfs 276 | #elif re.search("^pandas_dataframe-evtx_file_df", objtype): 277 | # if anltype == "XXXXXXXXXXX": 278 | # return XXXXXXXXXXXXXXXXXXXXX(*args, **kwargs) 279 | #else: 280 | # print("ERROR: [fstl] Unsupported input data.") 281 | # return 282 | 283 | -------------------------------------------------------------------------------- /src/ds4n6_lib/fstl.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4fstl 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | 25 | import xmltodict 26 | import json 27 | import pickle 28 | from tqdm import tqdm 29 | import xml.etree.ElementTree as et 30 | 31 | # DS IMPORTS ------------------------------------------------------------------ 32 | import numpy as np 33 | import pandas as pd 34 | import matplotlib.pyplot as plt 35 | from IPython.display import display, Markdown, HTML 36 | 37 | from sklearn.model_selection import train_test_split 38 | from tensorflow.keras.models import Model, load_model 39 | from tensorflow.keras.layers import Input, Dense 40 | 41 | # DS4N6 IMPORTS --------------------------------------------------------------- 42 | import ds4n6_lib.d4 as d4 43 | import ds4n6_lib.common as d4com 44 | import ds4n6_lib.gui as d4gui 45 | import ds4n6_lib.utils as d4utl 46 | 47 | ############################################################################### 48 | # IDEAS 49 | ############################################################################### 50 | # is_deleted() 51 | # is_file() 52 | # is_dir() / is_folder() - level 53 | # ext() # filter by Extension 54 | # nofn # exclude $FILE_NAME entries 55 | 56 | ############################################################################### 57 | # FUNCTIONS 58 | ############################################################################### 59 | # FILE READING FUNCTIONS 
###################################################### 60 | 61 | def read_data(evdl, **kwargs): 62 | return d4com.read_data_common(evdl, **kwargs) 63 | 64 | # CORE FUNCTIONS (simple, analysis, etc.) ##################################### 65 | # simple ====================================================================== 66 | def simple_func(df, *args, **kwargs): 67 | """ Reformat the input df so the data is presented to the analyst in the 68 | friendliest possible way 69 | 70 | Parameters: 71 | df (pd.dataframe): Input data 72 | 73 | Returns: 74 | pd.DataFrame: Optionally it will return the filtered dataframe, 75 | only if ret=True is set, constant & hidden columns included 76 | If ret_out=True is set, then the output just as it is shown 77 | (without constant/hidden columns) will be return 78 | """ 79 | if d4.debug >= 4: 80 | print("DEBUG: [fstl] [simple_func()]") 81 | 82 | # Variables ---------------------------------------------------------------- 83 | hiddencols = [] 84 | 85 | # Maximum number of lines in DF for beautification 86 | maxdfbprintlines = 20 87 | 88 | # Call to simple_common ---------------------------------------------------- 89 | return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines) 90 | 91 | # DATAFRAME ACCESSOR ########################################################## 92 | 93 | @pd.api.extensions.register_dataframe_accessor("d4fstl") 94 | class Ds4n6FSTLAccessor: 95 | def __init__(self, pandas_obj): 96 | self._obj = pandas_obj 97 | 98 | def simple(self, *args, **kwargs): 99 | """ Redirects execution to simple_func() 100 | """ 101 | df = self._obj 102 | return simple_func(df, *args, **kwargs) 103 | 104 | def nofn(self): 105 | return self._obj[~self._obj['FileName'].str.contains(r"\ \(\$FILE_NAME\)")] 106 | 107 | def is_deleted(self): 108 | if 'FileName' in self._obj.columns: 109 | return self._obj[self._obj['FileName'].str.contains(r"\ \(deleted\)$")] 110 | elif 'Deleted_' in self._obj.columns: 111 | return self._obj.query('Deleted_ == True') 112 | 113 | def is_file(self): 114 | return self._obj.query('Type_ == "r" | PrevType_ == "r"') 115 | 116 | def is_dir(self,level=0): 117 | return self._obj.query('Type_ == "d"') 118 | 119 | # Same as is_dir() 120 | def is_directory(self,level=0): 121 | return self._obj.query('Type_ == "d"') 122 | 123 | # Same as is_dir() 124 | def is_folder(self,level=0): 125 | return self._obj.query('Type_ == "d"') 126 | 127 | def ext(self,ext): 128 | return self._obj[self._obj['FileName'].str.contains(r"\."+ext+"$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(deleted\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)\ \(deleted\)$")] 129 | 130 | def just_basename(self): 131 | return self._obj['FileName'].str.replace('.*/','') 132 | 133 | def ts_m(self,exclusive=False): 134 | if exclusive == False: 135 | return self._obj.query(r'MACB.str.contains("^m...$")',engine="python") 136 | else: 137 | return self._obj.query(r'MACB.str.contains("^m\.\.\.$")',engine="python") 138 | 139 | def ts_a(self,exclusive=False): 140 | if exclusive == False: 141 | return self._obj.query('MACB.str.contains("^.a..$")',engine="python") 142 | else: 143 | return self._obj.query(r'MACB.str.contains("^\.a\.\.$")',engine="python") 144 | 145 | def ts_c(self,exclusive=False): 146 | if exclusive == False: 147 | return self._obj.query('MACB.str.contains("^..c.$")',engine="python") 148 | else: 149 | return 
self._obj.query(r'MACB.str.contains("^\.\.c\.$")',engine="python") 150 | 151 | def ts_b(self,exclusive=False): 152 | if exclusive == False: 153 | return self._obj.query('MACB.str.contains("^...b$")',engine="python") 154 | else: 155 | return self._obj.query(r'MACB.str.contains("^\.\.\.b$")',engine="python") 156 | 157 | @pd.api.extensions.register_dataframe_accessor("d4_fstl") 158 | class Ds4n6_FSTLAccessor: 159 | def __init__(self, pandas_obj): 160 | self._obj = pandas_obj 161 | 162 | def simple(self, *args, **kwargs): 163 | """ Redirects execution to simple_func() 164 | """ 165 | df = self._obj 166 | return simple_func(df, *args, **kwargs) 167 | 168 | def nofn(self): 169 | return self._obj[~self._obj['FileName'].str.contains(r"\ \(\$FILE_NAME\)")] 170 | 171 | def is_deleted(self): 172 | if 'FileName' in self._obj.columns: 173 | return self._obj[self._obj['FileName'].str.contains(r"\ \(deleted\)$")] 174 | elif 'Deleted_' in self._obj.columns: 175 | return self._obj.query('Deleted_ == True') 176 | 177 | def is_file(self): 178 | return self._obj.query('Type_ == "r" | PrevType_ == "r"') 179 | 180 | def is_dir(self,level=0): 181 | return self._obj.query('Type_ == "d"') 182 | 183 | # Same as is_dir() 184 | def is_directory(self,level=0): 185 | return self._obj.query('Type_ == "d"') 186 | 187 | # Same as is_dir() 188 | def is_folder(self,level=0): 189 | return self._obj.query('Type_ == "d"') 190 | 191 | def ext(self,ext): 192 | return self._obj[self._obj['FileName'].str.contains(r"\."+ext+"$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(deleted\)$") | self._obj['FileName'].str.contains(r"\."+ext+r"\ \(\$FILE_NAME\)\ \(deleted\)$")] 193 | 194 | def just_basename(self): 195 | return self._obj['FileName'].str.replace('.*/','') 196 | 197 | def ts_m(self,exclusive=False): 198 | if exclusive == False: 199 | return self._obj.query('MACB.str.contains("^m...$")',engine="python") 200 | else: 201 | return self._obj.query(r'MACB.str.contains("^m\.\.\.$")',engine="python") 202 | 203 | def ts_a(self,exclusive=False): 204 | if exclusive == False: 205 | return self._obj.query('MACB.str.contains("^.a..$")',engine="python") 206 | else: 207 | return self._obj.query(r'MACB.str.contains("^\.a\.\.$")',engine="python") 208 | 209 | def ts_c(self,exclusive=False): 210 | if exclusive == False: 211 | return self._obj.query('MACB.str.contains("^..c.$")',engine="python") 212 | else: 213 | return self._obj.query(r'MACB.str.contains("^\.\.c\.$")',engine="python") 214 | 215 | def ts_b(self,exclusive=False): 216 | if exclusive == False: 217 | return self._obj.query('MACB.str.contains("^...b$")',engine="python") 218 | else: 219 | return self._obj.query(r'MACB.str.contains("^\.\.\.b$")',engine="python") 220 | 221 | # ANALYSIS #################################################################### 222 | 223 | # analysis() function ========================================================= 224 | def analysis(*args, **kwargs): 225 | """ Redirects execution to analysis_func() 226 | """ 227 | return analysis_func(*args, **kwargs) 228 | 229 | def analysis_func(*args, **kwargs): 230 | """ Umbrella function that redirects to different types of analysis 231 | available on the input data 232 | 233 | Parameters: 234 | obj: Input data (typically DF or dict of DFs) 235 | 236 | Returns: 237 | pd.DataFrame: Refer to each specific analysis function 238 | """ 239 | 240 | def syntax(): 241 | print('Syntax: analysis(obj, "analysis_type")\n') 242 | d4list("str-help") 243 | 
return 244 | 245 | def d4list(objtype): 246 | 247 | # Analysis Modules Available for this objective 248 | # anlav = False 249 | print("Available fstl analysis types:") 250 | print("- No analysis functions defined yet.") 251 | return 252 | 253 | # TEMPLATE 254 | #if objtype == "str-help" or objtype == "str-list" or re.search("^pandas_dataframe-fstl-mactime-standard", objtype): 255 | # anlav = True 256 | # print("- XXXXXXXXXX: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX (Input: fstldf)") 257 | 258 | # if anlav == False: 259 | # print('- No analysis modules available for this object ('+objtype+').') 260 | 261 | nargs = len(args) 262 | 263 | if nargs == 0: 264 | syntax() 265 | return 266 | 267 | obj = args[0] 268 | 269 | objtype = d4com.data_identify(obj) 270 | 271 | if isinstance(obj, str): 272 | if obj == "list": 273 | d4list(objtype) 274 | return 275 | if obj == "help": 276 | syntax() 277 | return 278 | 279 | if nargs == 1: 280 | syntax() 281 | return 282 | 283 | anltype = args[1] 284 | 285 | if not isinstance(anltype, str): 286 | syntax() 287 | return 288 | 289 | if anltype == "help": 290 | syntax() 291 | return 292 | elif anltype == "list": 293 | d4list(objtype) 294 | return 295 | 296 | # TEMPLATE 297 | # If object is a dict of dfs 298 | #elif re.search("^pandas_dataframe-evtx_file_df", objtype): 299 | # if anltype == "XXXXXXXXXXX": 300 | # return XXXXXXXXXXXXXXXXXXXXX(*args, **kwargs) 301 | #else: 302 | # print("ERROR: [fstl] Unsupported input data.") 303 | # return 304 | 305 | 306 | -------------------------------------------------------------------------------- /src/ds4n6_lib/pslist.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: library of functions to appy Data Science in several forensics 4 | # artifacts 5 | # 6 | 7 | ############################################################################### 8 | # INFO 9 | ############################################################################### 10 | # Recommended "import as": d4pslst 11 | 12 | ############################################################################### 13 | # IMPORTS 14 | ############################################################################### 15 | 16 | # DEV IMPORTS ---------------------------------------------------------------- 17 | 18 | # python IMPORTS -------------------------------------------------------------- 19 | import os 20 | import glob 21 | import re 22 | import time 23 | import inspect 24 | import json 25 | import pickle 26 | 27 | # DS IMPORTS ------------------------------------------------------------------ 28 | import numpy as np 29 | import pandas as pd 30 | from IPython.display import display, Markdown, HTML 31 | 32 | # DS4N6 IMPORTS --------------------------------------------------------------- 33 | import ds4n6_lib.d4 as d4 34 | import ds4n6_lib.common as d4com 35 | import ds4n6_lib.gui as d4gui 36 | import ds4n6_lib.utils as d4utl 37 | import ds4n6_lib.unx as d4unx 38 | from ds4n6_lib.knowledge import critical_processes, boot_start_processes, process_parents 39 | 40 | ############################################################################### 41 | # FUNCTIONS 42 | ############################################################################### 43 | 44 | # ANALYSIS FUNCTIONS ########################################################## 45 | 46 | # simple ====================================================================== 47 | def simple_func(df, *args, **kwargs): 48 | """ Reformat the input df so the data is 
presented to the analyst in the
49 |         friendliest possible way
50 | 
51 |     Parameters:
52 |         df (pd.dataframe): Input data
53 | 
54 |     Returns:
55 |         pd.DataFrame: Optionally it will return the filtered dataframe,
56 |             only if ret=True is set, constant & hidden columns included
57 |             If ret_out=True is set, then the output, just as it is shown
58 |             (without constant/hidden columns), will be returned
59 |     """
60 | 
61 |     if d4.debug >= 3:
62 |         print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")
63 | 
64 |     # Variables ----------------------------------------------------------------
65 |     hiddencols = []
66 | 
67 |     # Maximum number of lines in DF for beautification
68 |     maxdfbprintlines = 20
69 | 
70 |     # Call to simple_common ----------------------------------------------------
71 |     return d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)
72 | 
73 | # analysis() ==================================================================
74 | def analysis(*args, **kwargs):
75 |     """ Redirects execution to analysis_func()
76 |     """
77 |     return analysis_func(*args, **kwargs)
78 | 
79 | def analysis_func(*args, **kwargs):
80 |     """ Umbrella function that redirects to different types of analysis
81 |         available on the input data
82 | 
83 |     Parameters:
84 |         obj: Input data (typically DF or dict of DFs)
85 | 
86 |     Returns:
87 |         pd.DataFrame: Refer to each specific analysis function
88 |     """
89 |     def syntax():
90 |         print('Syntax: analysis(obj, "analysis_type")\n')
91 |         d4list("str-help")
92 |         return
93 | 
94 |     def d4list(objtype):
95 |         anlav = False
96 |         print("Available pslist analysis types:")
97 |         if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^pandas_dataframe-pslist-ham", objtype):
98 |             anlav = True
99 |             print("- process_stats: Show process statistics (Input: pslistdf)")
100 |             print("- unfrequent_processes: Identify unfrequent processes (Input: pslistdf)")
101 |             print("- boot_time_anomalies: Identify boot time process anomalies (Input: pslistdf)")
102 |             print("- parent_process_anomalies: Identify parent process anomalies (Input: pslistdf)")
103 |         if anlav == False:
104 |             print('- No analysis modules available for this object ('+objtype+').')
105 | 
106 |     if d4.debug >= 3:
107 |         print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")
108 | 
109 |     nargs = len(args)
110 | 
111 |     if nargs == 0:
112 |         syntax()
113 |         return
114 | 
115 |     obj = args[0]
116 | 
117 |     objtype = d4com.data_identify(obj)
118 | 
119 |     if isinstance(obj, str):
120 |         if obj == "list":
121 |             d4list(objtype)
122 |             return
123 |         if obj == "help":
124 |             syntax()
125 |             return
126 | 
127 |     if nargs == 1:
128 |         syntax()
129 |         return
130 | 
131 |     anltype = args[1]
132 | 
133 |     if not isinstance(anltype, str):
134 |         syntax()
135 |         return
136 | 
137 |     if anltype == "help":
138 |         syntax()
139 |         return
140 |     elif anltype == "list":
141 |         d4list(objtype)
142 |         return
143 | 
144 |     # pslistdf ----------------------------------------------------------------
145 |     if re.search("^pandas_dataframe-pslist-ham", objtype):
146 |         if anltype == "process_stats":
147 |             return analysis_process_stats(*args, **kwargs)
148 |         elif anltype == "unfrequent_processes":
149 |             return analysis_unfrequent_processes(*args, **kwargs)
150 |         elif anltype == "boot_time_anomalies":
151 |             return analysis_boot_time_anomalies(*args, **kwargs)
152 |         elif anltype == "parent_process_anomalies":
153 |             return 
analysis_parent_process_anomalies(*args, **kwargs) 154 | 155 | print("INFO: [d4pslst] No analysis functions available for this data type ("+objtype+")") 156 | 157 | # ANALYSIS FUNCTIONS ========================================================== 158 | 159 | def analysis_process_stats(*args, **kwargs): 160 | """ Show Process Statistics 161 | 162 | Args: 163 | obj: Input data (HAM process DF) 164 | Returns: 165 | pandas.Dataframe with the results of the analysis 166 | 167 | """ 168 | 169 | if d4.debug >= 3: 170 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 171 | 172 | # Argument parsing 173 | df = args[0] 174 | 175 | if 'Exit_TStamp_' in df.columns: 176 | print("Running:") 177 | display(df.query('Exit_TStamp_.isna()', engine="python")['Name_'].value_counts()) 178 | print("") 179 | print("Dead:") 180 | display(df.query('Exit_TStamp_.notna()', engine="python")['Name_'].value_counts()) 181 | print("") 182 | else: 183 | display(df['Name_'].value_counts()) 184 | 185 | def analysis_unfrequent_processes(*args, **kwargs): 186 | """ Analysis that find unfrequent processes 187 | 188 | Args: 189 | obj: Input data (typically DF or dict of DFs) 190 | Returns: 191 | pandas.Dataframe with the results of the analysis 192 | 193 | """ 194 | 195 | if d4.debug >= 3: 196 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 197 | 198 | # Argument parsing 199 | pslistdf = args[0] 200 | 201 | n = kwargs.get('n', 3) 202 | 203 | print("Threshold: "+str(n)) 204 | print("") 205 | 206 | pscntdf = pd.DataFrame(pslistdf['Name_'].value_counts()).reset_index().rename(columns={'Name_': 'Count', 'index': 'Name_'}) 207 | pscntdf['Count'] = pscntdf['Count'].astype(int) 208 | pscntndf = pscntdf.query('Count <= @n', engine="python") 209 | 210 | print("No. Processes with less than " + str(n) +" occurrences: " + str(len(pscntndf))) 211 | return pscntndf 212 | 213 | 214 | def analysis_boot_time_anomalies(*args, **kwargs): 215 | """ Find anomalies at boot time 216 | 217 | Parameters: 218 | pslistdf (pd.DataFrame): Dataframe with pslist info 219 | secs (int): Interval allowed for processes to start after boot 220 | 221 | Returns: 222 | pd.DataFrame: Processes that don't follow the standard start time pattern 223 | 224 | """ 225 | 226 | if d4.debug >= 3: 227 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 228 | 229 | # Argument parsing 230 | df = args[0] 231 | 232 | secs = kwargs.get('secs', 30) 233 | 234 | # Verify field requirements 235 | if not 'Start_TStamp_' in df.columns: 236 | print("ERROR: Cannot run analysis. Start_TStamp_ column not present.") 237 | return 238 | 239 | print("Min. 
Start Timestamp Processes:") 240 | display(df[df['Start_TStamp_'] == df['Start_TStamp_'].min()]) 241 | 242 | if 'Session_' in df.columns: 243 | bootps = df[df['Name_'].isin(boot_start_processes) & (df['Session_'] <= 1) & df['Exit_TStamp_'].isnull() ] 244 | else: 245 | bootps = df[df['Name_'].isin(boot_start_processes) & df['Exit_TStamp_'].isnull() ] 246 | 247 | return bootps[bootps['Start_TStamp_'] >= bootps['Start_TStamp_'].min() + pd.Timedelta(seconds=secs)] 248 | 249 | def analysis_parent_process_anomalies(*args, **kwargs): 250 | """ Find anomalies in parent processes 251 | 252 | Parameters: 253 | pslistdf (pd.DataFrame): Dataframe with pslist info 254 | critical_only (bool): Only critical process 255 | 256 | Returns: 257 | None 258 | """ 259 | 260 | if d4.debug >= 3: 261 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 262 | 263 | # Argument parsing 264 | df = args[0] 265 | 266 | critical_only = kwargs.get('critical_only', True) 267 | 268 | # Verify field requirements 269 | if not 'PPID_' in df.columns: 270 | print("ERROR: Cannot run analysis. PPID_ column not present.") 271 | return 272 | 273 | if 'Exit_TStamp_' in df.columns: 274 | df_alive = df[df['Exit_TStamp_'].isna()] 275 | else: 276 | df_alive = df 277 | 278 | hnpid = df_alive[['D4_Hostname_', 'Name_', 'PID_']] 279 | hnppid = df_alive[['D4_Hostname_', 'Name_', 'PPID_']] 280 | family_ext = pd.merge(hnppid, hnpid, left_on=['D4_Hostname_', 'PPID_'], right_on=['D4_Hostname_', 'PID_'], how='left').dropna() 281 | family = family_ext.drop(columns=['D4_Hostname_', 'PPID_', 'PID_']).rename(columns={'Name__x': 'Child', 'Name__y': 'Parent'}).reset_index().drop(columns=['index']) 282 | 283 | if critical_only : 284 | thisfamily = family.query('Child == @critical_processes') 285 | else: 286 | thisfamily = family 287 | 288 | family_unknown = pd.merge(thisfamily, process_parents, indicator=True, how='outer').query( '_merge=="left_only"').drop( '_merge', axis=1) 289 | 290 | display(family_unknown.groupby(["Child", "Parent"]).size().sort_values(ascending=False)) 291 | display(family_unknown) 292 | 293 | # DATAFRAME ACCESSOR ########################################################## 294 | 295 | @pd.api.extensions.register_dataframe_accessor("d4pslst") 296 | class Ds4n6PslstAccessor: 297 | def __init__(self, pandas_obj): 298 | self._obj = pandas_obj 299 | 300 | def simple(self, *args, **kwargs): 301 | """ Redirects execution to simple_func() 302 | """ 303 | df = self._obj 304 | return simple_func(df, *args, **kwargs) 305 | 306 | @pd.api.extensions.register_dataframe_accessor("d4_pslist") 307 | class Ds4n6PslistAccessor: 308 | def __init__(self, pandas_obj): 309 | self._obj = pandas_obj 310 | 311 | def simple(self, *args, **kwargs): 312 | """ Redirects execution to simple_func() 313 | """ 314 | df = self._obj 315 | return simple_func(df, *args, **kwargs) 316 | 317 | -------------------------------------------------------------------------------- /src/ds4n6_lib/macrobber.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4mcrb 10 | 11 | ############################################################################### 12 | 
# IMPORTS
13 | ###############################################################################
14 | 
15 | # python IMPORTS --------------------------------------------------------------
16 | import os
17 | import glob
18 | import re
19 | import time
20 | import inspect
21 | import pickle
22 | 
23 | # DS IMPORTS ------------------------------------------------------------------
24 | import numpy as np
25 | import pandas as pd
26 | import matplotlib.pyplot as plt
27 | 
28 | # DS4N6 IMPORTS ---------------------------------------------------------------
29 | import ds4n6_lib.d4 as d4
30 | import ds4n6_lib.common as d4com
31 | import ds4n6_lib.gui as d4gui
32 | import ds4n6_lib.utils as d4utl
33 | # Note: the former ds4n6_lib.unix module no longer exists; it was replaced by unx
34 | import ds4n6_lib.unx as d4unx
35 | 
36 | ###############################################################################
37 | # VARIABLES
38 | ###############################################################################
39 | 
40 | # Hidden columns in simple() function
41 | hiddencols =  [ 'MTStampEpoch_', 'MTStamp_', 'ATStampEpoch_', 'ATStamp_', 'CTStampEpoch_', 'CTStamp_', 'Meta_', 'FileStem_', 'ParentPath_', 'ParentName_', 'PathSeparator_', 'FilePath-Hash_', 'FileName-Hash_', 'FileStem-Hash_', 'ParentPath-Hash_', 'ParentName-Hash_']
42 | 
43 | ###############################################################################
44 | # FUNCTIONS
45 | ###############################################################################
46 | 
47 | # FILE READING FUNCTIONS ######################################################
48 | 
49 | def read_data(evdl, **kwargs):
50 |     """ Read data from files or a folder
51 | 
52 |         Args: 
53 |             evdl (str): path to file/folder source
54 |             kwargs: read options
55 |         Returns: 
56 |             pandas.Dataframe or dictionary of pandas.DataFrame
57 |     """
58 |     if d4.debug >= 3:
59 |         print("DEBUG: [macrobber-read_data()]")
60 | 
61 |     header_names = ['MD5', 'path', 'inode', 'mode_as_string', 'UID', 'GID', 'size', 'atime', 'mtime', 'ctime', 'block_size']
62 | 
63 |     kwargs['header_names'] = header_names
64 | 
65 |     return d4com.read_data_common(evdl, **kwargs)
66 | 
67 | # HARMONIZATION FUNCTIONS #####################################################
68 | 
69 | def harmonize(df, **kwargs):
70 |     """ Convert DF in HAM format
71 | 
72 |         Args: 
73 |             df (pandas.DataFrame): DF to harmonize
74 |             kwargs(dict): harmonize options
75 |         Returns: 
76 |             pandas.DataFrame in HAM Format
77 |     """
78 |     data_os         = kwargs.get('data_os',         None)
79 |     generate_hashes = kwargs.get('generate_hashes', True)
80 |     path_prefix     = kwargs.get('path_prefix',     None)
81 | 
82 |     # Specific Harmonization Pre-Processing ===================================
83 |     def remove_prefix(df, prefixregex):
84 |         if 'FilePath_' in df.columns:
85 |             df['FilePath_'] = df['FilePath_'].str.replace(prefixregex, '', regex=True)
86 |         return df
87 | 
88 |     # Harmonize to File_List_HAM
89 | 
90 |     # PathSeparator is tool-dependent, not only OS-dependent
91 |     pathsep = '/'
92 | 
93 |     df['MTStampEpoch_'] = df['mtime']
94 |     df['MTStamp_'] = pd.to_datetime(df['mtime'], errors = 'coerce', unit='s')
95 |     df['MTStampDate_'] = df['MTStamp_'].dt.date
96 |     df['MTStampTime_'] = df['MTStamp_'].dt.ceil(freq='s').dt.time
97 |     df['MTStampDoW_'] = df['MTStamp_'].dt.day_name()
98 |     df['ATStampEpoch_'] = df['atime']
99 |     df['ATStamp_'] = pd.to_datetime(df['atime'], errors = 'coerce', unit='s')
100 |     df['ATStampDate_'] = df['ATStamp_'].dt.date
101 |     df['ATStampTime_'] = df['ATStamp_'].dt.ceil(freq='s').dt.time
102 |     df['ATStampDoW_'] = df['ATStamp_'].dt.day_name()
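    # Every timestamp in this artifact follows the same recipe: keep the raw
    # epoch seconds (*Epoch_), parse them into a coerced datetime (*Stamp_),
    # and derive Date / Time / Day-of-Week convenience columns from it. For
    # example, epoch 1483264800 parses to 2017-01-01 10:00:00 (UTC), so the
    # derived date is 2017-01-01 and the day of week is "Sunday".
103 | 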
df['CTStampEpoch_'] = df['ctime'] 104 | df['CTStamp_'] = pd.to_datetime(df['ctime'], errors = 'coerce', unit='s') 105 | df['CTStampDate_'] = df['CTStamp_'].dt.date 106 | df['CTStampTime_'] = df['CTStamp_'].dt.ceil(freq='s').dt.time 107 | df['CTStampDoW_'] = df['CTStamp_'].dt.day_name() 108 | #df['BTStampEpoch_'] = df['btime'] 109 | #df['BTStamp_'] = pd.to_datetime(df['btime'], errors = 'coerce', unit='s') 110 | #df['BTStampDate_'] = df['BTStamp_'].dt.date 111 | #df['BTStampTime_'] = df['BTStamp_'].dt.ceil(freq='s').dt.time 112 | #df['BTStampDoW_'] = df['BTStamp_'].dt.day_name() 113 | df['Size_'] = df['size'].astype('int64') 114 | #df['Mode_'] = None 115 | if not data_os == "windows": 116 | df['UID_'] = df['UID'] 117 | if not data_os == "windows": 118 | df['GID_'] = df['GID'] 119 | df['Meta_'] = df['inode'] 120 | #df['File_Name'] = None 121 | df['Type_'] = df['mode_as_string'].str.extract('^(.)') 122 | #df['PrevType_'] = None 123 | if not data_os == "windows": 124 | df['Permissions_'] = df['mode_as_string'].str.replace('^.','').str.replace(r'\ .*$','') 125 | #df['Deleted_'] = None 126 | #df['Reallocated_'] = None 127 | #df['Hostname_'] = None 128 | if not df['MD5'].iloc[0] == 0: 129 | df['MD5_Hash_'] = df['MD5'] 130 | #df['SHA256_Hash_'] = None 131 | #df['DriveLetter_'] = None 132 | #df['VSS_'] = None 133 | #df['EVOName_' ] = None 134 | #df['EvidenceName_'] = None 135 | #df['Partition_'] = None 136 | #df['Tag_'] = None 137 | df['FilePath_'] = df['path'] 138 | if path_prefix is not None: 139 | df = remove_prefix(df, path_prefix) 140 | df['FileName_'] = df['FilePath_'].str.replace('.*'+pathsep,'') 141 | df['FileStem_'] = df['FileName_'].str.replace(r'\.[^\.]*$','') 142 | df['FileExtension_'] = df['FileName_'].str.replace(r'^[^\.]*$', '').str.replace(r'.*\.','').str.lower() 143 | df['ParentPath_'] = df['FilePath_'].str.replace('(.*)'+pathsep+'.*','\\1') 144 | df['ParentName_'] = df['ParentPath_'].str.replace('.*'+pathsep,'') 145 | df['PathSeparator_'] = pathsep 146 | #df['FSType_'] = None 147 | #df['TSNTFSAttr_'] = None 148 | 149 | 150 | # Path-Hash Fields - - - - - - - - - - - - - - - - - - - - - - - - - - - - 151 | if generate_hashes: 152 | df['FilePath-Hash_'] = df['FilePath_'].str.lower().apply(hash) 153 | df['FileName-Hash_'] = df['FileName_'].str.lower().apply(hash) 154 | df['FileStem-Hash_'] = df['FileStem_'].str.lower().apply(hash) 155 | df['ParentPath-Hash_'] = df['ParentPath_'].str.lower().apply(hash) 156 | df['ParentName-Hash_'] = df['ParentName_'].str.lower().apply(hash) 157 | 158 | # Generic Harmonization =================================================== 159 | df = d4com.harmonize_common(df, datatype='flist', **kwargs) 160 | 161 | # Specific Harmonization Post-Processing ================================== 162 | 163 | # return ================================================================== 164 | 165 | return df 166 | 167 | # ANALYSIS FUNCTIONS ########################################################## 168 | 169 | # simple ====================================================================== 170 | def simple_func(df, *args, **kwargs): 171 | """ Reformat the input df so the data is presented to the analyst in the 172 | friendliest possible way 173 | 174 | Parameters: 175 | df (pd.dataframe): Input data 176 | 177 | Returns: 178 | pd.DataFrame: Optionally it will return the filtered dataframe, 179 | only if ret=True is set, constant & hidden columns included 180 | If ret_out=True is set, then the output just as it is shown 181 | (without constant/hidden columns) will be return 
182 |     """
183 | 
184 |     if d4.debug >= 4:
185 |         print("DEBUG: [mcrb] [simple_func()]")
186 | 
187 |     # Artifact-specific argument parsing =======================================
188 | 
189 |     # Variables ================================================================
190 |     dfout = df
191 | 
192 |     # Maximum number of lines in DF for beautification
193 |     maxdfbprintlines = 20
194 | 
195 |     # Pre-Processing ==========================================================
196 | 
197 |     # Call to simple_common ===================================================
198 |     dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)
199 | 
200 |     # Post-Processing =========================================================
201 | 
202 |     # Return ==================================================================
203 |     return dfout
204 | 
205 | # analysis ====================================================================
206 | def analysis(*args, **kwargs):
207 |     """ Redirects execution to analysis_func()
208 |     """
209 |     return analysis_func(*args, **kwargs)
210 | 
211 | def analysis_func(*args, **kwargs):
212 |     """ Umbrella function that redirects to different types of analysis
213 |         available on the input data
214 | 
215 |     Parameters:
216 |         obj: Input data (typically DF or dict of DFs)
217 | 
218 |     Returns:
219 |         pd.DataFrame: Refer to each specific analysis function
220 |     """
221 | 
222 |     def syntax():
223 |         print('Syntax: analysis(obj, "analysis_type")\n')
224 |         d4list("str-help")
225 |         return
226 | 
227 |     def d4list(objtype):
228 | 
229 |         # Analysis Modules Available for this objective
230 |         anlav = False
231 |         print("Available macrobber analysis types:")
232 |         if objtype == None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-macrobber", objtype):
233 |             anlav = True
234 |             print("- macrobber_files: Number of events per macrobber file (Input: macrobberdfs)")
235 | 
236 |         if anlav == False:
237 |             print('- No analysis modules available for this object ('+objtype+').')
238 | 
239 |     nargs = len(args)
240 | 
241 |     if nargs == 0:
242 |         syntax()
243 |         return
244 | 
245 |     obj = args[0]
246 | 
247 |     objtype = d4com.data_identify(obj)
248 | 
249 |     if isinstance(obj, str):
250 |         if obj == "list":
251 |             d4list(objtype)
252 |             return
253 |         if obj == "help":
254 |             syntax()
255 |             return
256 | 
257 |     if nargs == 1:
258 |         syntax()
259 |         return
260 | 
261 |     anltype = args[1]
262 | 
263 |     if not isinstance(anltype, str):
264 |         syntax()
265 |         return
266 | 
267 |     if anltype == "help":
268 |         syntax()
269 |         return
270 |     elif anltype == "list":
271 |         d4list(objtype)
272 |         return
273 | 
274 |     # ANALYSIS FUNCTIONS ======================================================
275 | 
276 |     # mcrbdfs ------------------------------------------------------------------
277 |     # if re.search("^dict-pandas_dataframe-macrobber", objtype):
278 |     #     if anltype == "macrobber_files":
279 |     #         return analysis_macrobber_files(*args, **kwargs)
280 | 
281 |     print("INFO: [d4mcrb] No analysis functions available for this data type ("+objtype+")")
282 | 
283 | # DATAFRAME ACCESSOR ##########################################################
284 | 
285 | @pd.api.extensions.register_dataframe_accessor("d4mcrb")
286 | class Ds4n6McrbAccessor:
287 |     def __init__(self, pandas_obj):
288 |         self._obj = pandas_obj
289 | 
290 |     def simple(self, *args, **kwargs):
291 |         """ Redirects execution to simple_func()
292 |         """
293 |         df = self._obj
294 |         return simple_func(df, *args, **kwargs)
295 | 
296 | 
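# Example (illustrative sketch, not part of the library API): mac-robber
# writes pipe-delimited "body"-style records, which is why read_data() above
# passes header_names to the common reader. A minimal standalone parse of one
# fabricated record with plain pandas:
def _demo_read_macrobber_body_sketch():
    import io
    sample = "0|/etc/passwd|131095|-rw-r--r--|0|0|1042|1483264800|1483264800|1483264800|0"
    names = ['MD5', 'path', 'inode', 'mode_as_string', 'UID', 'GID',
             'size', 'atime', 'mtime', 'ctime', 'block_size']
    return pd.read_csv(io.StringIO(sample), sep='|', names=names)

297 | 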
@pd.api.extensions.register_dataframe_accessor("d4_macrobber") 298 | class Ds4n6MacRobberAccessor: 299 | def __init__(self, pandas_obj): 300 | self._obj = pandas_obj 301 | 302 | def simple(self, *args, **kwargs): 303 | """ Redirects execution to simple_func() 304 | """ 305 | df = self._obj 306 | return simple_func(df, *args, **kwargs) 307 | -------------------------------------------------------------------------------- /src/ds4n6_lib/tshark.py: -------------------------------------------------------------------------------- 1 | # DS4N6 2 | # 3 | # Description: Library of functions to apply Data Science to forensics artifacts 4 | # 5 | 6 | ############################################################################### 7 | # INFO 8 | ############################################################################### 9 | # Recommended "import as": d4tshrk 10 | 11 | ############################################################################### 12 | # IMPORTS 13 | ############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import inspect 23 | import pickle 24 | import subprocess 25 | import json 26 | 27 | # DS IMPORTS ------------------------------------------------------------------ 28 | import numpy as np 29 | import pandas as pd 30 | import matplotlib.pyplot as plt 31 | 32 | # DS4N6 IMPORTS --------------------------------------------------------------- 33 | import ds4n6_lib.d4 as d4 34 | import ds4n6_lib.common as d4com 35 | import ds4n6_lib.gui as d4gui 36 | import ds4n6_lib.utils as d4utl 37 | import ds4n6_lib.unx as d4unx 38 | 39 | ############################################################################### 40 | # FUNCTIONS 41 | ############################################################################### 42 | 43 | # FILE READING FUNCTIONS ###################################################### 44 | 45 | def read_data(evdl, **kwargs): 46 | if d4.debug >= 3: 47 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 48 | 49 | if bool(re.search(r"\.pcap$", evdl, re.IGNORECASE)): 50 | return read_tshark_pcap(evdl, **kwargs) 51 | 52 | elif bool(re.search(r"\.json$", evdl, re.IGNORECASE)): 53 | return read_pcap_json(evdl, **kwargs) 54 | elif bool(re.search(r"\.csv$", evdl, re.IGNORECASE)): 55 | return read_pcap_csv(evdl, **kwargs) 56 | 57 | else: 58 | print("ERROR: Unable to read input file. 
Unsupported file extension.") 59 | return 60 | 61 | def read_tshark_pcap(evdl, **kwargs): 62 | """ Read pcap data from to json file 63 | Args: 64 | pcapf (str): path to file source 65 | kwargs: read options 66 | Returns: 67 | .json file 68 | """ 69 | cmd = "tshark -r " + evdl + " -T ek -j "'http tcp ip'" -P -V -x > " + evdl+'.json' 70 | print(cmd) 71 | subprocess.Popen(cmd, shell = True, 72 | stdout=subprocess.PIPE) 73 | 74 | evdl = evdl+'.json' 75 | 76 | return read_pcap_json(evdl,**kwargs) 77 | 78 | def read_pcap_json(evdl, **kwargs): 79 | """ Read pcap data from from a json file 80 | Args: 81 | evdl (str): path to file source 82 | kwargs: read options 83 | Returns: 84 | pandas.DataFrame (in the future a dictionary of pandas.DataFrame) 85 | """ 86 | n 87 | if d4.debug >= 3: 88 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 89 | 90 | # Parse Arguments 91 | tool = kwargs.get('tool', '') 92 | hostname = kwargs.get('hostname', '') 93 | do_harmonize = kwargs.get('harmonize', True) 94 | use_pickle = kwargs.get('use_pickle' , True) 95 | 96 | pklrawf = evdl+'.raw.pkl' 97 | pklhtmf = evdl+'.htm.pkl' 98 | 99 | if os.path.exists(pklhtmf) and use_pickle and do_harmonize : 100 | 101 | # Read from pickle 102 | print("- Saved Harmonized pickle file found:") 103 | print(" "+pklhtmf) 104 | print("- Reading data from HAM pickle file...") 105 | dfs = pickle.load(open(pklhtmf, "rb")) 106 | print("- Done.") 107 | print("") 108 | 109 | else: 110 | print("- No saved Harmonized pickle file found.") 111 | print("") 112 | 113 | 114 | 115 | 116 | 117 | with open(evdl, 'r') as f: 118 | data = [json.loads(line) for line in f] 119 | dfs = pd.json_normalize(data) 120 | 121 | return dfs 122 | 123 | def read_pcap_csv(evdl, **kwargs): 124 | """ Read pcap data from from a json file 125 | Args: 126 | evdl (str): path to file source 127 | kwargs: read options 128 | Returns: 129 | pandas.DataFrame (in the future a dictionary of pandas.DataFrame) 130 | """ 131 | 132 | if d4.debug >= 3: 133 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 134 | 135 | # Parse Arguments 136 | tool = kwargs.get('tool', '') 137 | hostname = kwargs.get('hostname', '') 138 | do_harmonize = kwargs.get('harmonize', True) 139 | use_pickle = kwargs.get('use_pickle' , True) 140 | 141 | output = pd.read_csv(evdl) 142 | output = output.rename(columns={'ip.src': 'Source_IP', 'ip.dst': 'Destination_IP', 'tcp.srcport': 'Source_TCP_Port', 'tcp.dstport': 'Destination_TCP_Port', 'frame.time': 'Frame_Time', '_ws.col.Protocol': 'Protocol', '_ws.col.Info': 'Info'}) 143 | 144 | return output 145 | 146 | 147 | # HARMONIZATION FUNCTIONS ##################################################### 148 | 149 | def harmonize(df, **kwargs): 150 | """ Function description 151 | 152 | Args: 153 | 154 | Returns: 155 | 156 | Raises: 157 | """ 158 | 159 | if d4.debug >= 3: 160 | print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]") 161 | 162 | orchestrator = kwargs.get('orchestrator', None) 163 | tool = kwargs.get('tool', None) 164 | plugin = kwargs.get('plugin', None) 165 | hostname = kwargs.get('hostname', None) 166 | 167 | # Specific Harmonization Pre-Processing =================================== 168 | 169 | # Generic Harmonization =================================================== 170 | df = d4com.harmonize_common(df, **kwargs) 171 | 172 | # 
# HARMONIZATION FUNCTIONS #####################################################

def harmonize(df, **kwargs):
    """ Harmonize a tshark DataFrame into the common DS4N6 format

        Args:
            df (pd.DataFrame): Input data
            kwargs: harmonization options

        Returns:
            pd.DataFrame: Harmonized data
    """

    if d4.debug >= 3:
        print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")

    orchestrator = kwargs.get('orchestrator', None)
    tool         = kwargs.get('tool', None)
    plugin       = kwargs.get('plugin', None)
    hostname     = kwargs.get('hostname', None)

    # Specific Harmonization Pre-Processing ===================================

    # Generic Harmonization ===================================================
    df = d4com.harmonize_common(df, **kwargs)

    # Specific Harmonization Post-Processing ==================================

    # return ==================================================================
    # WARNING: For artifact-modules only
    # df['D4_DataType_'] = 'DATA_TYPE_HERE'

    return df

# ANALYSIS FUNCTIONS ##########################################################

# simple ======================================================================
def simple_func(df, *args, **kwargs):
    """ Reformat the input df so the data is presented to the analyst in the
        friendliest possible way

        Parameters:
            df (pd.DataFrame): Input data

        Returns:
            pd.DataFrame: The filtered DataFrame is returned only if ret=True
            is set (constant & hidden columns included). If ret_out=True is
            set, the output exactly as it is shown (without constant/hidden
            columns) will be returned.
    """

    if d4.debug >= 3:
        print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")

    # Artifact-specific argument parsing ======================================

    # Variables ===============================================================
    hiddencols = []

    dfout = df

    # Maximum number of lines in DF for beautification
    maxdfbprintlines = 20

    # Pre-Processing ==========================================================

    # Call to simple_common ===================================================
    dfout = d4com.simple_common(df, *args, **kwargs, hiddencols=hiddencols, maxdfbprintlines=maxdfbprintlines)

    # Post-Processing =========================================================

    # Return ==================================================================
    return dfout

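# Usage sketch (editor's illustration) for the dispatcher defined below:
#
#   import ds4n6_lib.tshark as d4tshrk
#   d4tshrk.analysis("help")        # print syntax help
#   d4tshrk.analysis(dfs, "list")   # list analysis types for this object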
# analysis ====================================================================
def analysis(obj, *args, **kwargs):
    """ Redirects execution to analysis_func()
    """
    return analysis_func(obj, *args, **kwargs)

def analysis_func(obj, *args, **kwargs):
    """ Umbrella function that redirects to the different types of analysis
        available for the input data

        Parameters:
            obj: Input data (typically a DF or a dict of DFs)

        Returns:
            pd.DataFrame: Refer to each specific analysis function
    """

    if d4.debug >= 3:
        print("DEBUG: [DBG"+str(d4.debug)+"] ["+str(os.path.basename(__file__))+"] ["+str(inspect.currentframe().f_code.co_name)+"()]")

    # SUB-FUNCTIONS ###########################################################
    def syntax():
        print('Syntax: analysis(obj, "analysis_type")\n')
        d4list("str-help")
        return

    def d4list(objtype):
        # Analysis Modules Available for this object type
        # (the XXXXX entries below are template placeholders for future
        #  tshark analysis modules)
        anlav = False
        print("Available XXXXX analysis types:")
        if objtype is None or objtype == "str-help" or objtype == "str-list" or re.search("^dict-pandas_dataframe-XXXXX", objtype):
            anlav = True
            print("- XXXXX_files: No. of events per XXXXX file (Input: XXXdfs)")

        if not anlav:
            print('- No analysis modules available for this object ('+objtype+').')

    # FUNCTION BODY ###########################################################
    thisdatatype = None

    objtype = d4com.data_identify(obj)

    # "list" / "help" may be passed as the object itself
    if isinstance(obj, str):
        if obj == "list":
            d4list(objtype)
            return
        if obj == "help":
            syntax()
            return

    # obj is a named parameter, so args holds only the analysis type
    nargs = len(args)

    if nargs == 0:
        if thisdatatype is not None:
            if re.search("^dict-pandas_dataframe-"+thisdatatype, objtype) or re.search("^pandas_dataframe-"+thisdatatype, objtype):
                d4list(objtype)
            else:
                syntax()
        else:
            syntax()

        return

    anltype = args[0]

    if not isinstance(anltype, str):
        syntax()
        return

    if anltype == "help":
        syntax()
        return
    elif anltype == "list":
        d4list(objtype)
        return

    # ANALYSIS FUNCTIONS ======================================================

    # XXXdfs ------------------------------------------------------------------
    if re.search("^dict-pandas_dataframe-XXXXX", objtype):
        if anltype == "XXXXX_files":
            return analysis_XXXXX_files(*args, **kwargs)

    print("INFO: [d4XXX] No analysis functions available for this data type ("+objtype+")")

# DATAFRAME ACCESSOR ##########################################################

@pd.api.extensions.register_dataframe_accessor("d4tshrk")
class Ds4n6TshrkAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def simple(self, *args, **kwargs):
        """ Redirects execution to simple_func()
        """
        df = self._obj
        return simple_func(df, *args, **kwargs)


@pd.api.extensions.register_dataframe_accessor("d4_tshark")
class Ds4n6TsharkAccessor:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def simple(self, *args, **kwargs):
        """ Redirects execution to simple_func()
        """
        df = self._obj
        return simple_func(df, *args, **kwargs)
--------------------------------------------------------------------------------
/src/ds4n6_lib/ml_models/seq2seq_lstm.py:
--------------------------------------------------------------------------------
#
# Description: implementation of ML model: seq2seq - LSTM
#

#############################################################################################
# INFO
#############################################################################################

#############################################################################################
# IMPORTS
#############################################################################################
import re, string, os, time
import pandas as pd
import numpy as np
from ast import literal_eval
from gensim.models import Word2Vec
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf


#############################################################################################
# Class: Seq2seqData
#############################################################################################
class Seq2seqData:
    def __init__(self):
        self.batch_size = 16
        self.vocab_size = 0
self.sequence_length = 0 32 | self.in_vectorization = TextVectorization() 33 | self.out_vectorization = TextVectorization() 34 | self.train_dset = 0 35 | 36 | def load_path_dataset(self, lm_dset, from_date, to_date, min_count): 37 | if type(lm_dset) == str: 38 | lm_dset = pd.read_csv(lm_dset) 39 | lm_dset = lm_dset.astype(str) 40 | lm_dset['path'] = lm_dset['path'].apply(literal_eval) 41 | lm_dset['date'] = pd.to_datetime(lm_dset['date'], format='%Y-%m-%d') 42 | lm_dset = lm_dset[(lm_dset['date'] >= from_date) & (lm_dset['date'] <= to_date)] 43 | model = Word2Vec(list(lm_dset['path']), vector_size=0, min_count=min_count) 44 | node_list = model.wv.index_to_key 45 | self.vocab_size = len(node_list)+5 46 | 47 | ndset = lm_dset.copy() 48 | for idx,row in lm_dset.iterrows(): 49 | for node in row.path: 50 | if node not in node_list: 51 | ndset = ndset.drop(index=idx) 52 | break 53 | return ndset 54 | 55 | def process_train_data(self, lm_dset): 56 | target_data = [] 57 | for i in lm_dset['path']: 58 | target_data.append(['[sos]'] + i + ['[eos]']) 59 | self.sequence_length = max(len(s) for s in target_data) 60 | 61 | train_in = [' '.join(i) for i in lm_dset['path']] 62 | train_out = [' '.join(i) for i in target_data] 63 | return train_in, train_out 64 | 65 | def build_train_dset(self, train_in, train_out): 66 | self._tokenizer(train_in, train_out) 67 | t_in = self.in_vectorization(train_in) 68 | t_out = self.out_vectorization(train_out) 69 | dataset = tf.data.Dataset.from_tensor_slices((t_in, t_out[:, :-1], t_out[:, 1:])) 70 | self.train_dset = dataset.shuffle(len(train_in)).batch(self.batch_size, drop_remainder=True) 71 | 72 | # AUX. FUNCTIONS 73 | def _custom_standardization(self, input_string): 74 | strip_chars = string.punctuation 75 | strip_chars = strip_chars.replace("[", "") 76 | strip_chars = strip_chars.replace("]", "") 77 | strip_chars = strip_chars.replace("-", "") 78 | strip_chars = strip_chars.replace("_", "") 79 | strip_chars = strip_chars.replace(".", "") 80 | strip_chars = strip_chars.replace(":", "") 81 | strip_chars = strip_chars.replace("&", "") 82 | strip_chars = strip_chars.replace("/", "") 83 | strip_chars = strip_chars.replace("\\", "") 84 | strip_chars = strip_chars.replace("@", "") 85 | lowercase = tf.strings.lower(input_string) 86 | return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "") 87 | 88 | def _tokenizer(self, train_in, train_out): 89 | self.in_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length) 90 | self.out_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length + 1, standardize=self._custom_standardization) 91 | self.in_vectorization.adapt(train_in) 92 | self.out_vectorization.adapt(train_out) 93 | 94 | 95 | ############################################################################################# 96 | # Class: Autoencoder (LSTM) 97 | ############################################################################################# 98 | class Autoencoder: 99 | def __init__(self, embed_dim, latent_dim, data): 100 | self.epochs = 10 101 | self.embed_dim = embed_dim 102 | self.latent_dim = latent_dim 103 | self.data = data 104 | self.encoder = None 105 | self.decoder = None 106 | 107 | def set_epochs(self, epochs): 108 | self.epochs = epochs 109 | 110 | def build_autoencoder(self): 111 | self.encoder = Encoder(self.data.vocab_size, self.embed_dim, self.latent_dim) 112 | self.decoder = 
Decoder(self.data.vocab_size, self.embed_dim, self.latent_dim) 113 | 114 | def fit_autoencoder(self): 115 | optimizer = tf.keras.optimizers.Adam(clipnorm=5.0) 116 | checkpoint_dir = './training_ckpt_seq2seq' 117 | checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") 118 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=self.encoder, decoder=self.decoder) 119 | losses, accuracies = self._main_train(self.encoder, self.decoder, self.data.train_dset, self.epochs, self.data.batch_size, optimizer, checkpoint, checkpoint_prefix) 120 | 121 | def decode_sequence(self, input_sentence, node_index_dict): 122 | decoded_err = [] 123 | t_path = input_sentence.split(' ') + (['[eos]']*(self.data.sequence_length)) 124 | tokenized_input_sentence = self.data.in_vectorization([input_sentence]) 125 | en_initial_states = self.encoder.init_states(1) 126 | en_outputs = self.encoder(tf.constant(tokenized_input_sentence), en_initial_states) 127 | de_state_h, de_state_c = en_outputs[1:] 128 | 129 | decoded_path = '[sos]' 130 | for i in range(self.data.sequence_length): 131 | tokenized_target_sentence = self.data.out_vectorization([decoded_path])[:, :-1] 132 | de_output, de_state_h, de_state_c, predictions = self.decoder(tokenized_target_sentence, (de_state_h, de_state_c)) 133 | 134 | n = t_path.pop(0) 135 | index = next((i for i, node in node_index_dict.items() if node == n), None) 136 | err = np.array(predictions)[0][i][index] 137 | decoded_err.append(err) 138 | 139 | sampled_token_index = np.argmax(predictions[0, i, :]) 140 | sampled_token = node_index_dict[sampled_token_index] 141 | decoded_path += ' ' + sampled_token 142 | 143 | if sampled_token == '[eos]': 144 | break 145 | return decoded_path, decoded_err 146 | 147 | def get_anomalies(self, train_in): 148 | node_vocab = self.data.out_vectorization.get_vocabulary() 149 | node_index_dict = dict(zip(range(len(node_vocab)), node_vocab)) 150 | 151 | e_matrix = [] 152 | test_in_paths = [pair for pair in train_in] 153 | for idx,path in enumerate(test_in_paths): 154 | dec_lm, err = self.decode_sequence(path, node_index_dict) 155 | mse = np.square(err).mean() 156 | e_matrix.append([idx, mse]) 157 | error_matrix = np.array(e_matrix) 158 | error_matrix = error_matrix[error_matrix[:, 1].argsort()] 159 | return error_matrix 160 | 161 | # AUX. 
FUNCTIONS 162 | def _main_train(self, encoder, decoder, dataset, n_epochs, batch_size, optimizer, checkpoint, checkpoint_prefix): 163 | losses = [] 164 | accuracies = [] 165 | print('Model: "LSTM"') 166 | print('____________________________________________________________') 167 | for e in range(n_epochs): 168 | start = time.time() 169 | en_initial_states = encoder.init_states(batch_size) 170 | for batch, (input_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)): 171 | loss, accuracy = self._train_step(input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer) 172 | 173 | if batch % 100 == 0: 174 | losses.append(loss) 175 | accuracies.append(accuracy) 176 | print('Epoch {} Batch {} Loss {:.4f} Acc:{:.4f}'.format(e + 1, batch, loss.numpy(), accuracy.numpy())) 177 | if (e + 1) % 2 == 0: 178 | checkpoint.save(file_prefix = checkpoint_prefix) 179 | print('Time taken for 1 epoch {:.4f} sec\n'.format(time.time() - start)) 180 | return losses, accuracies 181 | 182 | def _train_step(self, input_seq, target_seq_in, target_seq_out, en_initial_states, optimizer): 183 | with tf.GradientTape() as tape: 184 | en_outputs = self.encoder(input_seq, en_initial_states) 185 | en_states = en_outputs[1:] 186 | de_states = en_states 187 | de_outputs = self.decoder(target_seq_in, de_states) 188 | logits = de_outputs[0] 189 | loss = self._loss_func(target_seq_out, logits) 190 | acc = self._accuracy_fn(target_seq_out, logits) 191 | 192 | variables = self.encoder.trainable_variables + self.decoder.trainable_variables 193 | gradients = tape.gradient(loss, variables) 194 | optimizer.apply_gradients(zip(gradients, variables)) 195 | return loss, acc 196 | 197 | def _loss_func(self, targets, logits): 198 | crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) 199 | mask = tf.math.logical_not(tf.math.equal(targets, 0)) 200 | mask = tf.cast(mask, dtype=tf.int64) 201 | loss = crossentropy(targets, logits, sample_weight=mask) 202 | return loss 203 | 204 | def _accuracy_fn(self, y_true, y_pred): 205 | pred_values = tf.keras.backend.cast(tf.keras.backend.argmax(y_pred, axis=-1), dtype='int64') 206 | correct = tf.keras.backend.cast(tf.keras.backend.equal(y_true, pred_values), dtype='float32') 207 | 208 | mask = tf.keras.backend.cast(tf.keras.backend.greater(y_true, 0), dtype='float32') 209 | n_correct = tf.keras.backend.sum(mask * correct) 210 | n_total = tf.keras.backend.sum(mask) 211 | return n_correct / n_total 212 | 213 | 214 | ############################################################################################# 215 | # Class: Encoder 216 | ############################################################################################# 217 | class Encoder(tf.keras.Model): 218 | def __init__(self, vocab_size, embedding_dim, hidden_dim): 219 | super(Encoder, self).__init__() 220 | self.hidden_dim = hidden_dim 221 | self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) 222 | self.lstm = tf.keras.layers.LSTM( 223 | hidden_dim, return_sequences=True, return_state=True) 224 | 225 | def call(self, input_sequence, states): 226 | embed = self.embedding(input_sequence) 227 | output, state_h, state_c = self.lstm(embed, initial_state=states) 228 | return output, state_h, state_c 229 | 230 | def init_states(self, batch_size): 231 | return (tf.zeros([batch_size, self.hidden_dim]), 232 | tf.zeros([batch_size, self.hidden_dim])) 233 | 234 | 235 | ############################################################################################# 236 | # Class: Decoder 237 | 
############################################################################################# 238 | class Decoder(tf.keras.Model): 239 | def __init__(self, vocab_size, embedding_dim, hidden_dim): 240 | super(Decoder, self).__init__() 241 | self.hidden_dim = hidden_dim 242 | self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) 243 | self.lstm = tf.keras.layers.LSTM( 244 | hidden_dim, return_sequences=True, return_state=True) 245 | self.dense = tf.keras.layers.Dense(vocab_size) 246 | self.out = tf.keras.layers.Softmax() 247 | 248 | def call(self, input_sequence, state): 249 | embed = self.embedding(input_sequence) 250 | lstm_out, state_h, state_c = self.lstm(embed, state) 251 | logits = self.dense(lstm_out) 252 | out = self.out(logits) 253 | return logits, state_h, state_c, out -------------------------------------------------------------------------------- /src/ds4n6_lib/ml_models/transformer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Description: implementation of ML model: seq2seq - Transformer 3 | # 4 | 5 | ############################################################################################# 6 | # INFO 7 | ############################################################################################# 8 | 9 | ############################################################################################# 10 | # IMPORTS 11 | ############################################################################################# 12 | import re, string 13 | import pandas as pd 14 | import numpy as np 15 | from ast import literal_eval 16 | from gensim.models import Word2Vec 17 | from tensorflow import keras 18 | from tensorflow.keras import layers 19 | from tensorflow.keras.layers.experimental.preprocessing import TextVectorization 20 | import tensorflow as tf 21 | 22 | 23 | ############################################################################################# 24 | # Class: Seq2seqData 25 | ############################################################################################# 26 | class Seq2seqData: 27 | def __init__(self): 28 | self.batch_size = 16 29 | self.vocab_size = 0 30 | self.sequence_length = 0 31 | self.in_vectorization = TextVectorization() 32 | self.out_vectorization = TextVectorization() 33 | self.train_dset = 0 34 | 35 | def load_path_dataset(self, lm_dset, from_date, to_date, min_count): 36 | if type(lm_dset) == str: 37 | lm_dset = pd.read_csv(lm_dset) 38 | lm_dset = lm_dset.astype(str) 39 | lm_dset['path'] = lm_dset['path'].apply(literal_eval) 40 | lm_dset['date'] = pd.to_datetime(lm_dset['date'], format='%Y-%m-%d') 41 | lm_dset = lm_dset[(lm_dset['date'] >= from_date) & (lm_dset['date'] <= to_date)] 42 | model = Word2Vec(list(lm_dset['path']), vector_size=0, min_count=min_count) 43 | node_list = model.wv.index_to_key 44 | self.vocab_size = len(node_list) + 5 45 | 46 | ndset = lm_dset.copy() 47 | for idx,row in lm_dset.iterrows(): 48 | for node in row.path: 49 | if node not in node_list: 50 | ndset = ndset.drop(index=idx) 51 | break 52 | return ndset 53 | 54 | def process_train_data(self, lm_dset): 55 | target_data = [] 56 | for i in lm_dset['path']: 57 | target_data.append(['[sos]'] + i + ['[eos]']) 58 | self.sequence_length = max(len(s) for s in target_data) 59 | 60 | train_in = [' '.join(i) for i in lm_dset['path']] 61 | train_out = [' '.join(i) for i in target_data] 62 | return train_in, train_out 63 | 64 | def build_train_dset(self, train_in, train_out): 65 | self._tokenizer(train_in, train_out) 
66 | dataset = tf.data.Dataset.from_tensor_slices((train_in, train_out)) 67 | dataset = dataset.batch(self.batch_size) 68 | dataset = dataset.map(self._format_dataset) 69 | self.train_dset = dataset.shuffle(len(train_in)).prefetch(16).cache() 70 | 71 | # AUX. FUNCTIONS 72 | def _custom_standardization(self, input_string): 73 | strip_chars = string.punctuation 74 | strip_chars = strip_chars.replace("[", "") 75 | strip_chars = strip_chars.replace("]", "") 76 | strip_chars = strip_chars.replace("-", "") 77 | strip_chars = strip_chars.replace("_", "") 78 | strip_chars = strip_chars.replace(".", "") 79 | strip_chars = strip_chars.replace(":", "") 80 | strip_chars = strip_chars.replace("&", "") 81 | strip_chars = strip_chars.replace("/", "") 82 | strip_chars = strip_chars.replace("\\", "") 83 | strip_chars = strip_chars.replace("@", "") 84 | lowercase = tf.strings.lower(input_string) 85 | return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "") 86 | 87 | def _tokenizer(self, train_in, train_out): 88 | self.in_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length) 89 | self.out_vectorization = TextVectorization(max_tokens=self.vocab_size, output_mode="int", output_sequence_length=self.sequence_length + 1, standardize=self._custom_standardization) 90 | self.in_vectorization.adapt(train_in) 91 | self.out_vectorization.adapt(train_out) 92 | 93 | def _format_dataset(self, train_in, train_out): 94 | t_in = self.in_vectorization(train_in) 95 | t_out = self.out_vectorization(train_out) 96 | return ({"encoder_inputs": t_in, "decoder_inputs": t_out[:, :-1],}, t_out[:, 1:]) 97 | 98 | 99 | ############################################################################################# 100 | # Class: Autoencoder (Transformer) 101 | ############################################################################################# 102 | class Autoencoder: 103 | def __init__(self, embed_dim, latent_dim, data): 104 | self.epochs = 5 105 | self.num_heads = 1 106 | self.embed_dim = embed_dim 107 | self.latent_dim = latent_dim 108 | self.data = data 109 | self.model = None 110 | 111 | def set_epochs(self, epochs): 112 | self.epochs = epochs 113 | 114 | def set_num_heads(self, num_heads): 115 | self.num_heads = num_heads 116 | 117 | def build_autoencoder(self): 118 | encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs") 119 | x = PositionalEmbedding(self.data.sequence_length, self.data.vocab_size, self.embed_dim)(encoder_inputs) 120 | encoder_outputs = TransformerEncoder(self.embed_dim, self.latent_dim, self.num_heads)(x) 121 | encoder = keras.Model(encoder_inputs, encoder_outputs) 122 | 123 | decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs") 124 | encoded_seq_inputs = keras.Input(shape=(None, self.embed_dim), name="decoder_state_inputs") 125 | x = PositionalEmbedding(self.data.sequence_length, self.data.vocab_size, self.embed_dim)(decoder_inputs) 126 | x = TransformerDecoder(self.embed_dim, self.latent_dim, self.num_heads)(x, encoded_seq_inputs) 127 | x = layers.Dropout(0.6)(x) 128 | decoder_outputs = layers.Dense(self.data.vocab_size, activation="softmax")(x) 129 | decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs) 130 | 131 | decoder_outputs = decoder([decoder_inputs, encoder_outputs]) 132 | transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer") 133 | self.model = transformer 134 | 135 | def 
fit_autoencoder(self):
        self.model.summary()
        self.model.compile("rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
        self.model.fit(self.data.train_dset, epochs=self.epochs)

    def decode_sequence(self, input_sentence, node_index_dict):
        # Greedy step-by-step decoding: at each step, record the probability
        # the model assigns to the true next node, then feed the most likely
        # token back into the decoder
        decoded_err = []
        t_path = input_sentence.split(' ') + (['[eos]']*(self.data.sequence_length))

        tokenized_input_sentence = self.data.in_vectorization([input_sentence])
        decoded_path = '[sos]'
        for i in range(self.data.sequence_length):
            tokenized_target_sentence = self.data.out_vectorization([decoded_path])[:, :-1]
            predictions = self.model([tokenized_input_sentence, tokenized_target_sentence])

            n = t_path.pop(0)
            index = next((i for i, node in node_index_dict.items() if node == n), None)
            err = np.array(predictions)[0][i][index]
            decoded_err.append(err)

            sampled_token_index = np.argmax(predictions[0, i, :])
            sampled_token = node_index_dict[sampled_token_index]
            decoded_path += ' ' + sampled_token

            if sampled_token == '[eos]':
                break
        return decoded_path, decoded_err

    def get_anomalies(self, train_in):
        # Scores each path by the mean squared predicted probability of its
        # true transitions; poorly predicted (anomalous) paths score low and
        # sort to the front of the returned matrix
        node_vocab = self.data.out_vectorization.get_vocabulary()
        node_index_dict = dict(zip(range(len(node_vocab)), node_vocab))

        e_matrix = []
        test_in_paths = [pair for pair in train_in]
        for idx,path in enumerate(test_in_paths):
            dec_lm, err = self.decode_sequence(path, node_index_dict)
            mse = np.square(err).mean()
            e_matrix.append([idx, mse])
        error_matrix = np.array(e_matrix)
        error_matrix = error_matrix[error_matrix[:, 1].argsort()]
        return error_matrix


#############################################################################################
# Class: TransformerEncoder
#############################################################################################
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        else:
            # No mask propagated: attend over all positions
            padding_mask = None
        attention_output = self.attention(query=inputs, value=inputs, key=inputs, attention_mask=padding_mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
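# Editor's note: the decoder further below combines a causal (lower-
# triangular) mask with the padding mask. For a sequence of length 4 the
# causal mask is:
#
#   [[1, 0, 0, 0],
#    [1, 1, 0, 0],
#    [1, 1, 1, 0],
#    [1, 1, 1, 1]]
#
# i.e. position i may only attend to positions j <= i.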
#############################################################################################
# Class: PositionalEmbedding
#############################################################################################
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        # Token id 0 is padding; downstream layers receive this as the mask
        return tf.math.not_equal(inputs, 0)


#############################################################################################
# Class: TransformerDecoder
#############################################################################################
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # No padding information available: rely on the causal mask alone
            padding_mask = None

        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        # Build a (batch, seq, seq) lower-triangular mask so each position
        # can only attend to itself and earlier positions
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)
--------------------------------------------------------------------------------
/src/ds4n6_lib/flist.py:
--------------------------------------------------------------------------------
# DS4N6
#
# Description: Library of functions to apply Data Science to forensics artifacts
#

###############################################################################
# INFO
###############################################################################
# Recommended "import as": d4flst

###############################################################################
# IMPORTS
############################################################################### 14 | 15 | # DEV IMPORTS ---------------------------------------------------------------- 16 | 17 | # python IMPORTS -------------------------------------------------------------- 18 | import os 19 | import glob 20 | import re 21 | import time 22 | import pickle 23 | import inspect 24 | 25 | # DS IMPORTS ------------------------------------------------------------------ 26 | import numpy as np 27 | import pandas as pd 28 | import matplotlib.pyplot as plt 29 | 30 | # DS4N6 IMPORTS --------------------------------------------------------------- 31 | import ds4n6_lib.d4 as d4 32 | import ds4n6_lib.common as d4com 33 | import ds4n6_lib.gui as d4gui 34 | import ds4n6_lib.utils as d4utl 35 | import ds4n6_lib.unx as d4unx 36 | 37 | ############################################################################### 38 | # VARIABLES 39 | ############################################################################### 40 | hiddencols = [ 'MTStampEpoch_', 'MTStamp_', 'ATStampEpoch_', 'ATStamp_', 'CTStampEpoch_', 'CTStamp_', 'Meta_', 'FileStem_', 'ParentName_', 'ParentPath_', 'ParentMeta_', 'PathSeparator_', 'FilePath-Hash_', 'FileName-Hash_', 'FileStem-Hash_', 'ParentPath-Hash_', 'ParentName-Hash_', 'NTFS-SeqNumber_', 'ParentSeqNumber_', 'ParentPath', 'NTFS-ReferenceCount_', 'NTFS-ReparseTarget_', 'IsDirectory_', 'NTFS-HasAds_', 'NTFS-IsAds_', 'NTFS-SI