├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── data
│   ├── dash_gov_dot_us.json
│   └── treatment_care.json
├── documentation
│   ├── todo.md
│   └── tutorial.md
├── domain_cat.ipynb
├── domain_cat_module.py
├── images
│   ├── 2d_click.gif
│   ├── 2d_v1.png
│   ├── 2d_zoom.gif
│   ├── 2d_zoom_select.gif
│   ├── 3d_infra.gif
│   ├── 3d_v1.gif
│   ├── 3d_v2.gif
│   ├── build_graph.png
│   ├── config.png
│   ├── credentials.png
│   ├── dash_gov.us_substrings.gif
│   ├── domain_data.png
│   ├── domain_graph_2d.png
│   ├── intro_3d.gif
│   ├── iris.png
│   ├── iris_small.png
│   ├── jupyter_cell.png
│   ├── pivot_heatmap.png
│   ├── pivot_stats.png
│   ├── pivot_value_heatmap.png
│   ├── reading_data.png
│   ├── run_3d.png
│   ├── run_heatmap.png
│   ├── running_a_cell.gif
│   ├── selected_domains.png
│   └── trimmed_domains.png
├── infrastructure_cat.ipynb
├── infrastructure_cat_module.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # dotenv file
7 | .env
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # PyCharm
135 | .idea/
136 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:latest
2 | # install the basics
3 | RUN apt-get update && apt-get -y upgrade
4 | RUN apt-get install -y build-essential python3.6 python3-pip python3-dev
5 | RUN pip3 -q install pip --upgrade
6 |
7 | # install nodejs v12
8 | RUN apt-get install -y curl dirmngr apt-transport-https lsb-release ca-certificates
9 | RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
10 | RUN apt-get install -y nodejs
11 | RUN apt-get install -y gcc g++ make
12 | RUN node --version
13 | RUN npm --version
14 |
15 | # copy dependency files
16 | RUN mkdir src
17 | WORKDIR src/
18 | COPY requirements.txt .
19 |
20 | # install Jupyter, domaincat requirements, and widget extensions
21 | RUN pip3 install -r requirements.txt
22 | ENV NODE_OPTIONS=--max-old-space-size=4096
23 | RUN jupyter labextension install jupyterlab-plotly@4.14.3 --no-build
24 | RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build
25 | RUN jupyter labextension install plotlywidget@4.14.3 --no-build
26 | RUN jupyter lab build --dev-build=False --minimize=False
27 | RUN npm cache clean --force
28 | ENV NODE_OPTIONS=
29 |
30 | # Rest of Files copied
31 | COPY . .
32 |
33 | # Run jupyter lab
34 | CMD ["jupyter", "lab", "--port=9999", "--no-browser", "--ip=0.0.0.0", "--allow-root"]
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 DomainTools
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DomainCAT (Domain Connectivity Analysis Tool)
2 |
3 | ## "See Connections Between Domains Right Meow"
4 |
5 | **The Domain Connectivity Analysis Tool is used to analyze aggregate connectivity patterns across a set of domains during security investigations**
6 |
7 | This project was a collaborative effort between [myself](https://www.linkedin.com/in/jconwell/) and [Matthew Pahl](https://www.linkedin.com/in/security-sme/)
8 |
9 | ## Introduction
10 |
11 | When analyzing pivots during threat hunting, most people approach it from the perspective of “what can a single
12 | pivot tell you?” But often actors will set their domains up to use commodity hosting infrastructure, so the number of
13 | entities associated with a given pivot is so large that it doesn’t really give you any useful information.
14 |
15 | This is where DomainCAT can help. Actors make decisions around domain registration and hosting options when setting
16 | up their malicious infrastructure. These can be considered behavioral choices.
17 | - What registrar(s) do they use?
18 | - What TLDs do they prefer?
19 | - What hosting provider(s) do they like?
20 | - What TLS cert authority do they use?
21 |
22 | All of these decisions, together, make up part of that actor’s infrastructure tools, tactics and procedures (TTPs),
23 | and we can analyze them as a whole to look for patterns across a set of domains.
24 |
25 | ### But wait there's more
26 |
27 | ### Introducing InfraCAT
28 |
29 | What if instead of nodes being domains, they were the infrastructure, and the edges were the connected domains? That was
30 | the thought process behind InfraCAT. By seeing clusters of infrastructure, you can see tightly coupled groups of domains
31 | based on the infrastructure they use.
32 |
33 | DomainCAT and InfraCAT are tools written in Jupyter Notebooks, a web-based interactive environment that lets you combine text,
34 | code, data, and interactive visualizations into your threat hunting toolbelt. The tool analyzes aggregate
35 | connectivity patterns across a set of domains looking at every pivot for every domain, asking; what are the shared
36 | pivots across these domains, how many shared pivots between each domain, do they have a small pivot count or a really
37 | large one? All of these aspects are taken into consideration as it builds out a connectivity graph that models how
38 | connected all the domains in an Iris search are to each other.
39 |
40 | ### Example Visualizations:
41 |
42 | #### 3D visualization of domain to domain connections based on shared infrastructure, registration and naming patterns
43 | 
44 |
45 | #### 2D visualization of domain to domain connections
46 | 
47 |
48 | #### 3D visualization of infra to infra connections
49 | 
50 |
51 | ## DomainCAT Tutorial
52 |
53 | #### Click here for the [DomainCAT Tutorial](documentation/tutorial.md) documentation
54 |
55 | ## Installation Steps: Docker (recommended)
56 |
57 | _Note: building the container takes a fair amount of RAM to compile the resources for the jupyterlab-plotly extension. Bump your RAM up in Docker preferences to around 4GB while building the container. Afterwards you can drop it back down to your normal level to run the container._
58 |
59 | ### Steps:
60 |
61 | Clone the git repository locally
62 |
63 | `$ git clone https://github.com/DomainTools/DomainCAT.git`
64 |
65 | Change directory to the domaincat folder
66 |
67 | `$ cd domaincat`
68 |
69 | Build the jupyter notebook container
70 |
71 | `$ docker build --tag domaincat .`
72 |
73 | Run the jupyter notebook
74 |
75 | `$ docker run -p 9999:9999 -v $(pwd)/data:/src/data --name domaincat domaincat`
76 |
77 | Mounting the data directory as a volume allows you to add new files to the container without having to rebuild it.
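Once the container is running, Jupyter prints a tokenized login URL to the container logs. One way to find it (the exact URL and token will differ):

```
$ docker logs domaincat
...
    http://127.0.0.1:9999/lab?token=<your_token_here>
```

Open that URL in a browser to get to the notebook.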
78 |
79 | ## Installation Steps: Manual (cross your fingers)
80 |
81 | _Note: this project uses JupyterLab Widgets, which requires nodejs >= 12.0.0 to be installed...which is on you_
82 |
83 | ### Steps:
84 |
85 | Clone the git repository locally
86 |
87 | `$ git clone https://github.com/DomainTools/DomainCAT.git`
88 |
89 | Change directory to the domaincat folder
90 |
91 | `$ cd domaincat`
92 |
93 | Install python libraries
94 |
95 | `$ pip install -r requirements.txt`
96 |
97 | JupyterLab widgets extension
98 |
99 | ```
100 | $ jupyter labextension install jupyterlab-plotly@4.14.3 --no-build
101 | $ jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build
102 | $ jupyter labextension install plotlywidget@4.14.3 --no-build
103 | $ jupyter lab build
104 | ```
105 |
106 | Run the jupyter notebook
107 |
108 | `$ jupyter lab`
109 |
110 | ___
111 |
112 | # Release Notes:
113 |
114 | October 25, 2021:
115 | - Initial support for InfraCAT
116 |
117 | August 24, 2021:
118 | - Adding a way to remove domains in the graph that you aren't interested in (look at the bottom of the notebook)
119 | - Refactor of the backend data structures to be a bit more efficient
120 |
121 | April 27, 2021:
122 | - Added support for `dotenv` to store REST API credentials in a `.env` file
123 | - Added logic to support
124 | - comma delimited list of domains
125 | - domains defanged with square brackets
126 |
127 | April 23, 2021:
128 | - Added config flag to only analyze active domains
129 | - Show count of selected domains
130 |
131 | April 19, 2021:
132 | - Bug fix to not normalize risk scores values when calculating node color
133 | - Mo'better sorting of selected domains
134 |
135 | April 15, 2021:
136 | - Bug fix: wrong json element returned when querying search hash
137 |
138 | April 14, 2021:
139 | - Added UI to search either a list of domain names or an Iris search hash
140 | - Added UI to enter Iris REST API username and password
141 |
142 | April 7, 2021:
143 | - Initial commit
144 |
145 | ___
146 |
147 | _Plotly Bug: in the 2D visualization of the domain graph there is a weird bug in the `Plotly` visualization library where,
148 | if your cursor is directly over the center of a node, the node's tooltip with the domain's name will disappear, and
149 | if you click the node, it unselects all nodes. So only click on a node if you can see its tooltip_
150 |
--------------------------------------------------------------------------------
/documentation/todo.md:
--------------------------------------------------------------------------------
1 |
2 | ## DomainCAT to do Tasks
3 | - refactor the code to just use the graph data structure as much as possible and less of the domain_list map
4 | - figure out how to make the create_date pivot a window over n days vs just 1 day
5 | - prune connections that are below some weight threshold
6 | - refactor append_values_with_count(s) functions to share logic
7 | - figure out a better way to normalize registrars
8 | - create a way to type a domain name and select that domain
9 | - create a way to type a pivot (category or value?) and select all domains that are connected
10 | - add every pivot possible. I mostly skipped the whois pivots because they aren't that useful anymore
11 | - address the comment in DomainRelationship.add. Essentially domains that share 2 or more IP addresses could potentially have their edge strength artificially boosted
12 | - maybe play around with normalizing edge weights once the graph is created, but before rendering
13 |
14 | ## Bugs to Fix
15 |
16 | ## Wish List
17 |
18 | when looking at domains that are probably related and created over a short period of time, it would be useful to have some viz that shows / groups the pivots per create date. That way you could see stuff like on day 1 TLD1 and registrar1 were used, then day 2 TLD1 and registrar2 were used, then day 3 TLD2 and registrar2 were used. That kind of thing
19 |
20 | Given a selection of domains, show what attributes they are NOT connected on
21 |
22 | date range of domains
23 | timeline view that shows how tight or loosely connected the domains are for each day or week
24 |
25 | auto identify the clusters and show the pivot table for each cluster
26 |
27 | auto-discover substrings
--------------------------------------------------------------------------------
/documentation/tutorial.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # DomainCAT Tutorial
4 |
5 | ## Overview
6 | DomainCAT is a cybersecurity analysis tool used to explore domain to domain connectivity within a set of domains. It's
7 | useful for identifying clusters of related domains that share some common registration, naming, or infrastructure
8 | pattern during a cybersecurity investigation such as threat hunting.
9 |
10 | It does this by analyzing the aggregate connectivity patterns across the set of domains, looking at every pivot for
11 | every domain and asking: what are the shared pivots across these domains, how many shared pivots are there between each pair of domains, and do
12 | they have a small pivot count or a really large one? All of these aspects are taken into consideration as it builds out
13 | a connectivity graph that models how connected all the domains are to each other.
14 |
15 | Imagine running a search in the DomainTools Iris UI and getting 300 domains
16 |
17 |
18 |
19 | and turning them into an interactive 3D graph visualization where you can explore them by rotating, panning, and
20 | zooming into the data to identify interesting subsets of domains that are connected to each other.
21 |
22 | 
23 |
24 | Or a 2D graph visualization where you can zoom in, select a set of domains and view exactly what pivots
25 | connect those specific domains together.
26 |
27 | 
28 |
29 | So what is a “graph” in this context? A graph is just a bunch of nodes, or circles that represent domains,
30 | connected together by edges, or gray lines that represent the pivots two domains have in common. In the graph examples
31 | above you can see that some domains group tightly together with others to create clusters. Why is this?
32 |
33 | Pairs of domains that have more pivots in common with each other will have "stronger" edges and be closer together.
34 | This layout logic will naturally cluster groups of highly connected domains together into these clusters.
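Under the hood this is a weighted `networkx` graph (see `domain_cat_module.py`). A minimal sketch of the idea, using domain names from later in this tutorial plus a hypothetical weakly connected neighbor:

```python
import networkx as nx

graph = nx.Graph()
# nodes are domains; an edge's weight reflects how many pivots the two domains share
graph.add_edge("visas-esta-gov.us", "visas-estausa-gov.us", weight=3.0)  # many shared pivots
graph.add_edge("visas-esta-gov.us", "some-other-gov.us", weight=0.5)     # one weak shared pivot
```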
35 |
36 | Pairs of domains that have only one or two pivots in common have "weaker" edges, and will be farther apart. These
37 | domains will appear farther out on the periphery of the graph. If the domain(s) you are investigating are in this
38 | set of periphery nodes, then you know right away that your search is going in the wrong direction and you might want
39 | to go back to Iris and adjust your search criteria.
40 |
41 | ## Quick Primer on Jupyter Notebooks
42 | DomainCAT is written in [Jupyter Notebooks](https://jupyter.org/), a web-based interactive environment that lets you
43 | combine text, code, data, and interactive visualizations all into one environment. Notebooks are broken up into
44 | cells, where each cell can contain either code or text. Don't worry, you don't have to know how to write Python or make any
45 | changes to the code (mostly) to use DomainCAT, just how to run a cell.
46 |
47 | The video below is an example of how to run a cell. Just click into a cell and you'll see a blue bar on the left
48 | that indicates the cell now has focus. Then hit Shift+Return to run the cell. There is a little empty circle in the
49 | upper right of a notebook that fills in gray while the cell is running. When the cell is finished running, the
50 | circle becomes empty again, and there will be some sort of output below the code cell that is the results of what
51 | was run.
52 |
53 | 
54 |
55 | _Note: if you happen to double-click any of the text cells, you might see the contents change font and you'll have a
56 | blinking cursor. Don't panic, this just means you are in edit mode. Just hit Shift+Return like you were running a
57 | code cell, and the text will change back to normal._
58 |
59 | If you happen to edit the code by accident and mess things up, it's not a big deal. You can click into the cell
60 | that you changed and use the normal undo/redo hotkeys (or the Edit menu) to get the code back to where it was before
61 | you edited it. Worst case, you can just rebuild the docker container, and you'll get a new unedited notebook with all
62 | the default values.
63 |
64 | ## Initializing The Notebook
65 |
66 | When you start the DomainCAT notebook, you'll need to click into and run the first code cell in the notebook.
67 | This will initialize all the code to do the connectivity analysis. You'll need to do this every time you start the notebook.
68 |
69 | 
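That first cell just loads the DomainCAT module; this is the exact cell from `domain_cat.ipynb`:

```python
# Run This First: imports all the helper functions and sets stuff up
%run domain_cat_module.py

print("DomainCAT is ready to go")
```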
70 |
71 | ## Setting Iris REST API Credentials
72 |
73 | There are two ways you can enter your Iris REST API credentials. The first is entering them into this cell.
74 | The password textbox will not show the password in clear text.
75 |
76 |
77 |
78 | Alternatively you can create a `.env` file in the root folder of the DomainCAT project and add the following block
79 | to it, replacing the username and password with your own:
80 |
81 | ```
82 | # IRIS REST API CREDS
83 | IRIS_API_USERNAME=some_user_name
84 | IRIS_API_KEY=some_password
85 | ```
86 |
87 | When the Jupyter Notebook initializes, the `dotenv` library will read the `.env` file and inject the credentials
88 | into the REST API calls
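This mirrors what `domain_cat_module.py` does at startup:

```python
from dotenv import dotenv_values

# read IRIS_API_USERNAME / IRIS_API_KEY from the .env file in the project root
dcat_config = dotenv_values(".env")
api_username = dcat_config["IRIS_API_USERNAME"]
api_key = dcat_config["IRIS_API_KEY"]
```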
89 |
90 | ## Entering Domains To Query
91 |
92 | The next step is to define the list of domains to query. This cell lets you do this in one of two ways.
93 |
94 | First, you can enter the raw list of domains into the large text box shown below. The domains can be
95 | either newline- or comma-delimited, and DomainCAT supports defanged domains that use square brackets, as in the example below.
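For example, both of these inputs are accepted (using domain names that appear later in this tutorial):

```
visas-esta-gov[.]us
visas-estausa-gov[.]us
```

or, comma delimited:

```
visas-esta-gov[.]us, visas-estausa-gov[.]us
```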
96 |
97 |
98 |
99 | The second way is to enter an Iris investigation search hash into the second text box. This hash represents
100 | an existing investigation in Iris and will query all the domains from the investigation.
101 |
102 | ## Reading Domain Data From DomainTools REST API
103 |
104 | DomainCAT reads domain data by querying the
105 | [DomainTools Iris Investigate REST API](https://www.domaintools.com/resources/api-documentation/iris-investigate)
106 | for any investigation hash that you generate in the Iris UI.
107 |
108 | The next code cell (shown below) has the configuration used to query the Iris REST API.
109 |
110 |
111 |
112 | When you run the cell (Shift+Return), DomainCAT will query the REST API and return the set of
113 | domains from the list of domains you entered or from the Iris search hash. It will also show you the number of domains loaded
114 | into the notebook.
115 |
116 | There are a couple of options to note:
117 | - `save_search_to_disk`: if you change this to `True` the search results will be saved to disk. This way you can
118 | reload the set of domains from your investigation at a later point without having to query the REST API again
119 | - `json_file_path`: this is the path and file name used to save your search results to disk for later use
120 | - `query_api`: if `True`, the REST API will query the investigation hash. If `False`, domain data will be loaded
121 | from the `json_file_path`
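These options correspond to the top of the data loading cell in `domain_cat.ipynb`:

```python
# Data Loading Config
query_api = True
save_search_to_disk = False
json_file_path = "data/dash_gov_dot_us.json"
```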
122 |
123 | DomainCAT ships with a demo dataset called `dash_gov_dot_us.json`. This is a set of domains that use
124 | the `.us` TLD, end in `-gov`, and are less than 2 years old. To load this data, set `query_api` to `False` and run the cell.
125 |
126 | _Performance Note: DomainCAT performs best with less than or equal to 400 domains(ish). Any more than that and
127 | performance and memory can become an issue and cause the notebook to crash._
128 |
129 | ## Configuration Settings
130 |
131 | There are a set of configuration settings that for the most part you shouldn't need to change
132 |
133 |
134 |
135 | ### config.active_domains_only (default: True)
136 | If this setting is `True`, DomainCAT will only analyze domains that are actively registered. Domains that were taken down or have expired will be ignored. If `False`, all domains returned by the Iris REST API will be analyzed.
137 |
138 | ### config.longest_common_substring (default: 6)
139 |
140 | DomainCAT has a new type of pivot called `longest_common_substring`. It compares every domain name to every other
141 | domain name, and creates a pivot between two domains if they share `longest_common_substring` or more consecutive
142 | characters. For example, the domains visas-esta-gov[.]us and visas-estausa-gov[.]us both share the substring
143 | “visas-esta”, so a pivot would be created for the value “visas-esta” that will join every domain with this substring.
144 |
145 | Note: I've found anything less than 5 for this setting will create way too many connections in the domain graph,
146 | and is not useful in an investigation. But try it if you want, it creates a pretty graph.
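For the curious, `domain_cat_module.py` imports Python's `difflib.SequenceMatcher`; the sketch below shows how the longest shared substring between two names can be found this way (illustrative only, not necessarily DomainCAT's exact implementation):

```python
from difflib import SequenceMatcher

def longest_common_substring(a: str, b: str) -> str:
    # find the longest run of consecutive characters the two names share
    match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    return a[match.a:match.a + match.size]

print(longest_common_substring("visas-esta-gov.us", "visas-estausa-gov.us"))  # visas-esta
```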
147 |
148 | ### config.ignore_substrings (default: empty)
149 |
150 | This setting is an array of string values to ignore when looking for `longest_common_substring` pivots. This is useful
151 | when you use a string as part of an Iris search, like all domains ending in "-gov". Every domain will have this substring,
152 | so you want to remove it from consideration when creating substring pivots.
153 |
154 | To turn this setting off, use: `config.ignore_substrings = []`
155 |
156 | If you have more than one string to ignore use the following pattern: `config.ignore_substrings = ["stuff", "things", "hotsauce"]`
157 |
158 | ### config.scale_edge_strength_by_pivot_count (default: True)
159 |
160 | Every pivot in Iris has a pivot count. This is the number of domains globally attached to this pivot. For example,
161 | an IP address might have a pivot count of 1,000, meaning there are 1,000 domains hosted on this IP address. DomainCAT
162 | also has a notion of _local pivots_; these are pivots between domains only within the REST API search results.
163 |
164 | When evaluating how important a pivot is between two domains, DomainCAT can evaluate the global pivot count and weigh
165 | the influence of the pivot in the graph inversely proportional to the global pivot count. This means pivots with a
166 | smaller pivot count would have edges that are stronger in the graph than pivots with a very large global pivot count.
167 |
168 | If this is set to `True`, DomainCAT will use this graph edge weighting strategy. If it is set to `False`, it will
169 | weigh every edge equally.
170 |
171 | _TODO: put a link to the section below about graphs, edges, and weights._
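To build intuition only (the real formula lives in `domain_cat_module.py`), an inverse log scaling like this captures the idea, with `max_domains` being the config value described below:

```python
import math

def edge_strength(global_pivot_count: int, max_domains: int = 100_000_000) -> float:
    # hypothetical inverse scaling: rare pivots (small global counts) produce
    # strong edges; near-ubiquitous pivots contribute almost nothing
    return 1.0 - math.log(global_pivot_count) / math.log(max_domains)

print(edge_strength(10))         # ~0.88: a pivot shared globally by only 10 domains
print(edge_strength(1_000_000))  # ~0.25: a commodity internet infrastructure pivot
```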
172 |
173 | ### config.global_count_threshold (default: sys.maxsize)
174 |
175 | This setting is used to filter out pivots that have a global pivot count greater than `global_count_threshold`. So if it was set to `config.global_count_threshold = 1000`, then any pivot that had a count greater than 1,000 would not be used to create an edge in the graph.
176 |
177 | This setting isn't that useful when `scale_edge_strength_by_pivot_count` is turned on, as the inverse weighting will take care of this. But if `scale_edge_strength_by_pivot_count` is turned off, it can be useful for weeding out really big pivots from commodity internet infrastructure.
178 |
179 | ### config.min_pivot_size (default: 2)
180 |
181 | This setting is used to filter out pivots that have a _local pivot size_ of less than `min_pivot_size`. For example, if this was set to 5, then any pivot that connects 4 or fewer domains returned from the REST API would be removed.
182 |
183 | The default value of 2 keeps every pivot that connects at least 2 domains.
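Conceptually the filter amounts to something like this (a sketch with hypothetical variable names, not DomainCAT's actual code):

```python
# keep only pivots whose local size (number of connected domains
# in the search results) meets the threshold
pivots = {value: domains for value, domains in pivots.items()
          if len(domains) >= config.min_pivot_size}
```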
184 |
185 | ### config.max_domains (default: 400000000)
186 |
187 | This setting is related to `scale_edge_strength_by_pivot_count`. It is the theoretical maximum number of domains that could ever be on a pivot and is used to calculate the inverse pivot count weight. The default value is the approximate number of active domains, give or take a few million: 400,000,000
188 |
189 | ### config.print_debug_output (default: False)
190 |
191 | This setting is used to write extra debug info to Jupyter's log to help with troubleshooting.
192 |
193 | ## Choose Pivots & Build the Domain Graph
194 |
195 | By default, all pivots are turned on (with a few exceptions discussed below). This means that if any two domains returned
196 | by the REST API are connected to each other by any of the below pivots, they will have an edge created to connect them
197 | together in the graph.
198 |
199 | To turn a pivot off, just comment out the pivot by putting a `#` in front of the pivot. For example, to turn off
200 | the `create_date` pivot, just change the line like so:
201 |
202 | ```python
203 | # "create_date",
204 | ```
205 |
206 | To turn a pivot back on, just remove the `#`. After any change to the code in a cell, you will have to re-run the
207 | cell for the code change to take effect.
208 |
209 | Below are the default pivot settings:
210 |
211 |
212 |
213 | You'll note that `ns_host` and `ns_ip` are both commented out. This is because I prefer to use `ns_domain` for name
214 | server based pivots. For example, if a domain has 4 name servers like:
215 | - ns1.somedomain.com
216 | - ns2.somedomain.com
217 | - ns3.somedomain.com
218 | - ns4.somedomain.com
219 |
220 | `ns_host` will create a pivot for each one, and if each name server has its own IP address then there will be
221 | an additional 4 pivots created. That means if two domains share the above name servers, the edge in the graph that
222 | connects them will represent a total of 9 pivots (1 `ns_domain` + 4 `ns_host` + 4 `ns_ip`). This does two things to the graph. First, I've found that this
223 | overemphasizes the name server connectedness in the graph. Second, domains with more name servers will have stronger
224 | edges than domains with fewer name servers, emphasizing domains with more redundant infrastructure. The same logic
225 | applies to MX record based pivots.
226 |
227 | Once you have the pivots turned on/off the way you want, run this cell. DomainCAT will take all the domains returned
228 | by the Iris REST API, analyze their shared pivots, and build a graph that represents the connected structure of
229 | these domains.
230 |
231 | ### Brand New Pivot: longest common substring!
232 | Some might have noticed that there is a new pivot in this list that doesn’t exist in Iris called “longest_common_substring”.
233 | This is a new local pivot that was added into DomainCAT which compares every domain name in the search to every other
234 | domain name, and creates a pivot between two domains if they share 6 or more consecutive characters. For example,
235 | the domains visas-esta-gov[.]us and visas-estausa-gov[.]us both share the substring “visas-esta” so they would be
236 | connected in the graph.
237 |
238 | In fact, you can even turn off all the pivots except “longest_common_substring” which would show how connected all
239 | the domains are based solely on how they were named. This technique can be useful when your investigation centers
240 | around domain name patterns.
241 |
242 | ## Trimmed Domains
243 |
244 | When building the graph of connected domains, there will often be a few domains that are not connected to any other
245 | domain. These are called trimmed domains. DomainCAT will show you a count of domains that were trimmed from the
246 | graph because they were not connected.
247 |
248 | If you want to see which domains were trimmed out, just run the next cell in the notebook and it will print out
249 | the list of trimmed domains.
250 |
251 |
252 |
253 | ## Explore the 3D Domain Graph
254 |
255 | Once the graph is built it's time to visually explore it. Run the next cell in the notebook:
256 |
257 |
258 |
259 | DomainCAT lets you explore the graph of connected domains with an interactive 3D view. You can mouse over any node
260 | and see what the domain name is and click/drag the graph to view it from different directions. This 3D view is really
261 | useful for gaining an understanding of the overall aggregate connectedness of all the domains, and if there are any
262 | clusters of domains that we might want to explore.
263 |
264 | But if you haven't turned any pivots off, your graph might look a little something like this:
265 |
266 | 
267 |
268 | There are so many pivots in this graph, represented by those gray lines, that it obfuscates the visualization and
269 | really makes it hard to see what's going on, especially in the center of the graph. This is because there are some
270 | pivots that belong to most of the domains in the graph, resulting in a crazy spider web egg-sac-looking thing
271 | like above. The good news is that not all pivots are as useful in every investigation, so we can remove them
272 | from the graph.
273 |
274 | For example, the search used to pull these domains together used the TLD “.us”. So every domain would have this pivot
275 | connecting it to every other domain. Luckily, DomainCAT is smart enough to look for pivots like that and automatically
276 | remove them. But there are probably other pivots that are just adding noise to the graph and do not offer much value
277 | to the investigation which we can turn off.
278 |
279 | ## Pivot Stats
280 |
281 | DomainCAT has a Pivot Stats view, which shows different statistics for each pivot that was found in the domains.
282 |
283 |
284 |
285 | Looking through this list, we can see that `ip_country_code` only has 7 pivots (“# of Pivots”), meaning there were
286 | only 7 different country codes found in the domains, but its “# of connections” shows that almost 48% of the domains
287 | are connected to each other with this pivot. This is a good candidate pivot to turn off in the graph and should clean
288 | up the 3D view of the graph a bit.
289 |
290 | Let’s find out. Go back up to the pivots configuration cell and comment out `ip_country_code`, then run that cell
291 | to rebuild the graph. Then scroll down to the cell that calls `build_3d_graph_layout(graph)` to redraw
292 | the 3D visualization. It should look something like this:
293 |
294 | 
295 |
296 | Removing a single pivot really opened up the graph! As the 3D view of the graph pivots around we can see there are four
297 | main clusters: three that seem highly connected to each other, almost in a triangle, and one fairly mixed cluster
298 | pushed farther out to the side.
299 |
300 | ## Pivot Tuning: An Iterative Process
301 |
302 | The 3D graph view looks pretty good after just turning off country code, but sometimes turning off one pivot isn’t
303 | enough. I call this process “pivot tuning”: you look at the 3D view of the graph to see if the center has opened up
304 | enough to see its internal cluster patterns. If the graph view is still too cluttered, look at pivot statistics and
305 | see if you can find another pivot that might be a good candidate to turn off. Pivots whose “# of connections” column
306 | is higher than 15% are often good candidates. Also, pivots whose “# of pivots” column is pretty low but the “# of
307 | domains” is close to 90% or greater can be helpful too. There are some pivots, like ASN or IP country code that are
308 | very coarse and apply to a high percentage of the domains. If your investigation isn’t centered around IP geo or
309 | network diversity, these pivots are also good candidates to turn off.
310 |
311 | You’ll get a feel for what works and what doesn’t as you play with it. But keep in mind that a pivot that looks
312 | like a candidate to turn off might be critical to your investigation. For example, if the investigation is centered
313 | around domains hosted by a single ISP, turning off “ip_isp” might be a good idea, but “ip_address” might remove
314 | important connectivity information from the graph.
315 |
316 | _Note: I like to have multiple cells that call `build_3d_graph_layout(graph)`. This way, as I'm
317 | pivot tuning, I can really see how a change in the pivots affected the graph by comparing it to the
318 | previous graph. I'll often have 3 or 4 different 3D graphs showing my progress while pivot tuning_
319 |
320 | ## Explore the 2D Domain Graph
321 |
322 | With the 3D graph cleaned up a bit we can now dive back into the 2 dimensional view to explore the details of the
323 | different domain clusters or individual domains.
324 |
325 | Run the next cell to create the interactive 2D domain graph.
326 |
327 |
328 |
329 | There are several things we can do in this view of the graph. We can zoom into a cluster to look at what domains are
330 | in it. To do this, just click-drag over a section of the graph you want to zoom into. To zoom back out click the
331 | little house icon in the upper right of the 2D graph view.
332 |
333 | A large cluster is sometimes actually made up of 2-3 smaller clusters that are useful to explore, but that just lump
334 | together in the zoomed-out view. If common domain name patterns are a theme in your investigation,
335 | mousing over domains to view their name is a useful tactic to see which domains are grouped together.
336 |
337 | 
338 |
339 | We can also select a region of the graph (which is different from zooming in) by clicking either the "box select"
340 | or "lasso select" icons in the upper right of the 2D view, and then click-drag the region in the graph to select.
341 |
342 | 
343 |
344 | Once a set of domains are selected (and all other domains become grayed out) DomainCAT will show the list of domain names.
345 |
346 |
347 |
348 | If this set of domains looks like what you are looking for in your investigation, you could export this list back
349 | into your SOAR playbook to build rules around them, or maybe add them to your firewall to block outright. Or you
350 | could copy them back into Iris to do a more focused investigation just on these domains.
351 |
352 | ## Dig into Pivot Connections
353 |
354 | Once you have a set of selected domains, you can dig deeper into what pivots were influential in grouping them together.
355 | Run the next cell in the notebook:
356 |
357 |
358 |
359 | This will analyze all the pivots that are shared across the domains that you just selected, and show you a heatmap view
360 | of which pivots were most influential in connecting the selected domains, ordered from most influential to least.
361 |
362 |
363 |
364 | Looking at the list of selected domain names, it’s not a surprise that “longest_common_substring” was the most
365 | frequent pivot. The number in each square is the total number of pivots of that type from that domain to the other
366 | selected domains. *This view can tell you what pivot categories were most influential in grouping these domains
367 | together*. This information can be really valuable when your investigation didn’t include one of these
368 | pivots in the original search criteria.
369 |
370 | If you want to look at which specific pivot values are responsible for clustering these domains together, the below
371 | view is a more detailed heat map which shows the actual pivot value rather than its key.
372 |
373 |
374 |
375 | From this we can see that “esta-c” is a very common naming pattern for these domains. If this was a pattern in our
376 | source indicator(s), we might go back to Iris and add “domain contains esta-c” as an additional search criteria to
377 | bring in a broader set of domains into our investigation.
378 |
379 | There is another interesting pattern in this view. If you look at the ASN, ISP, IP address, and dynadot[.]com name
380 | server rows you’ll see that they are all used by the same subset of domains in this list. But hawkhost[.]com nameserver
381 | is used by all the other domains. Based on the domain naming pattern it’s reasonable to believe that all these domains
382 | were created by the same actor, but it looks like this actor uses at least two different hosting and name server
383 | infrastructures. This realization could widen out the set of domains that you can now take action on.
384 |
385 | There’s one other piece of functionality in the 2D graph view that is worth mentioning. Sometimes you just want to
386 | see what domains are connected to one specific domain. If you click any node in the graph, it will automatically
387 | select all the nodes that are connected to it and you can then explore the pivots that bind them together.
388 |
389 | 
390 |
391 | This is useful when you have a seed domain in your investigation and you want to just dive right into it and see what
392 | other domains are connected to it. Another useful scenario (shown above) is when you see a “connector domain” that
393 | sits in between two clusters but is highly connected to both. Clicking on that domain and then inspecting the shared
394 | pivots can sometimes yield valuable information about two different infrastructure patterns used by a potential actor.
395 |
396 | # DomainCAT Tips & Best Practices
397 |
398 | ## longest_common_substring
399 | When investigating a set of domains that have obvious common naming patterns, it can be useful to turn off all pivots
400 | except `longest_common_substring` which would show how connected all the domains are based solely on how they were named.
401 | I'll also often combine `longest_common_substring` with only one or two other infrastructure based pivots,
402 | like `ns_domain` or `mx_domain`. This technique can be useful when your investigation centers around domain name
403 | patterns and shared infrastructure.
404 |
405 |
--------------------------------------------------------------------------------
/domain_cat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# DomainCAT: Domain Connectivity Analysis Tool\n",
8 | "\n",
9 | "### Analyzing the domain to domain connectivity of an Iris API Search"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Run This First: imports all the helper functions and sets stuff up\n",
19 | "%run domain_cat_module.py\n",
20 | "\n",
21 | "print(\"DomainCAT is ready to go\")"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Iris REST API Credentials"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "api_username_ui = widgets.Text(placeholder='Iris API Username', description='Username:', layout={'width': '500px'}, value=\"\")\n",
38 | "api_pw_ui = widgets.Password(placeholder='Iris API Password', description='Password:', layout={'width': '500px'}, value=\"\")\n",
39 | "widgets.VBox([api_username_ui, api_pw_ui])"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "## Query Domain Data From Iris Investigate API\n",
47 | "\n",
48 | "Enter either a list of return delimited domains into the Domains text box, _OR_ an Iris search hash into the hash text box.\n",
49 | "\n",
50 | "Note: if both a list of domains _AND_ a search hash is entered, the liast of domains will be queried and the search hash will be ignored"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "domain_list_ui = widgets.Textarea(placeholder='Enter list of domains', description='Domains:', layout={'height': '300px', 'width': '700px'}) \n",
60 | "search_hash_ui = widgets.Text(placeholder='Enter list of domains', description='Hash:', layout={'width': '700px'})\n",
61 | "show_iris_query_ui(domain_list_ui, search_hash_ui)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Data Loading Config\n",
71 | "query_api = True\n",
72 | "save_search_to_disk = False\n",
73 | "json_file_path = \"data/dash_gov_dot_us.json\"\n",
74 | "\n",
75 | "if query_api:\n",
76 | " iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n",
77 | " print(f'Iris API returned {len(iris_results)} domains')\n",
78 | "\n",
79 | " # save search results to disk to be used later\n",
80 | " if save_search_to_disk:\n",
81 | " with open(json_file_path, 'w') as f:\n",
82 | " json.dump(iris_results, f)\n",
83 | "else:\n",
84 | " with open(json_file_path) as json_data:\n",
85 | " iris_results = json.loads(json_data.read())\n",
86 | "\n",
87 | " print(f'Loaded {len(iris_results)} domains from {json_file_path}')"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "## DomainCAT Configuration\n",
95 | "\n",
96 | "Please refer to the DomainCAT documentation for details about these configuration options"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "config = Config()\n",
106 | "\n",
107 | "# only analyze domains that are active (currently registered)\n",
108 | "config.active_domains_only = True\n",
109 | "\n",
110 | "# config for pivoting on matching substrings. Only matching substrings this long or longer will be used to create a pivot\n",
111 | "config.longest_common_substring = 6\n",
112 | "\n",
113 | "# List of substrings to ignore when creating pivots by matching substrings\n",
114 | "config.ignore_substrings = []\n",
115 | "\n",
116 | "# use the pivot count to scale how important the pivot is during graph layout. Smaller pivot counts is more influence, and vice versa\n",
117 | "config.scale_edge_strength_by_pivot_count = True\n",
118 | "\n",
119 | "# Global pivot count threshold. Any pivot with more than this value is discarded. sys.maxsize effectivly keeps all pivots\n",
120 | "config.global_count_threshold = sys.maxsize\n",
121 | "\n",
122 | "# The smallest pivot count size to use. Default of 2 means no pivots are filtered out because it's count is too low\n",
123 | "config.min_pivot_size = 2\n",
124 | "\n",
125 | "# theoretical max pivot size for calculating edge strengths\n",
126 | "config.max_domains = 100000000\n",
127 | "\n",
128 | "# If True DomainCAT will print out some debug info while building the connected graph of domains\n",
129 | "config.print_debug_output = False"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "## Choose Which Pivots To Use & Build Domain Graph\n"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "pivot_category_config = {\n",
146 | " \"adsense\",\n",
147 | " \"google_analytics\",\n",
148 | " \"create_date\",\n",
149 | " \"redirect_domain\",\n",
150 | " \"registrar\",\n",
151 | " \"ip_address\",\n",
152 | " \"ip_country_code\",\n",
153 | " \"ip_isp\",\n",
154 | " \"ip_asn\",\n",
155 | " \"ssl_hash\",\n",
156 | " \"ssl_subject\",\n",
157 | " \"ssl_org\",\n",
158 | " \"ssl_email\",\n",
159 | " \n",
160 | "# # Note: commented out ns_host and ns_ip because they double count ns connectedness when used with ns_domain. \n",
161 | " \"ns_domain\",\n",
162 | "# \"ns_host\", \n",
163 | " \"ns_ip\", \n",
164 | " \n",
165 | "# # Note: commented out mx_host and mx_ip because they double counts mx connectedness when used with mx_domain \n",
166 | " \"mx_domain\",\n",
167 | "# \"mx_host\",\n",
168 | " \"mx_ip\", \n",
169 | " \n",
170 | " \"tld\",\n",
171 | " \"longest_common_substring\",\n",
172 | "}\n",
173 | "\n",
174 | "# Build the domain pivot graph structure\n",
175 | "config.pivot_category_config = pivot_category_config\n",
176 | "graph, pivot_categories, trimmed_domains = build_domain_pivot_graph(iris_results, config)"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "## Trimmed Domains"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "print_trimmed_domains = True\n",
193 | "if print_trimmed_domains:\n",
194 | " if len(trimmed_domains[\"unconnected\"]) > 0:\n",
195 | " print(\"trimmed unconnected domains:\")\n",
196 | " for domain in trimmed_domains[\"unconnected\"]: print(f\" {domain}\")\n",
197 | " if len(trimmed_domains[\"create_date\"]) > 0:\n",
198 | " print(\"\\ntrimmed domains with only create date pivot:\")\n",
199 | " for domain in trimmed_domains[\"create_date\"]: print(f\" {domain}\")"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "## Draw the Domain Graph in an Interactive 3D Layout"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "build_3d_graph_layout(graph)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "build_3d_graph_layout(graph)"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "build_3d_graph_layout(graph)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## Calculate & Show Pivot Statistics"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# Calculate a bunch of pivot statistics to see how well connected all the domains in the search result are\n",
250 | "calc_pivot_stats(graph, pivot_categories)"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "## Draw the Domain Graph in an Interactive 2D Layout"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "# calculate the pivots shared in commmon across all selected domains\n",
267 | "shared_pivots = {}\n",
268 | "def get_2d_shared_pivots(graph, selected_domains):\n",
269 | " global shared_pivots\n",
270 | " shared_pivots = get_shared_pivots(graph, selected_domains)\n",
271 | " \n",
272 | "build_2d_graph_layout(graph, get_2d_shared_pivots)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## Heatmap of which pivots connect the most domains together: by pivot category"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "if len(shared_pivots) == 0:\n",
289 | " print(\"Select a set of domains in the 2D graph\")\n",
290 | "else:\n",
291 | " create_pivot_heatmaps(shared_pivots)"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "## Removing domains from the graph\n",
299 | "\n",
300 | "Sometimes you find disconnected domains in the 3D graph visualization that make pivoting the viz really annoying. To remove domains from the graph, enter the domain(s) you want removed in the text box below and run the second cell. This will remove the domains from the graph structure without having to requery the data.\n",
301 | "\n",
302 | "After you do this, re-run the 3D viz and the domains should be gone."
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "remove_domains_ui = widgets.Textarea(placeholder='Enter domains to remove from graph', description='Domains:', layout={'height': '100px', 'width': '700px'}) \n",
312 | "remove_domains_ui"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# Run this to remove the domains in the above text box from the graph\n",
322 | "graph = remove_domains_from_graph(graph, remove_domains_ui)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": []
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": []
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": []
345 | }
346 | ],
347 | "metadata": {
348 | "kernelspec": {
349 | "display_name": "Python 3 (ipykernel)",
350 | "language": "python",
351 | "name": "python3"
352 | },
353 | "language_info": {
354 | "codemirror_mode": {
355 | "name": "ipython",
356 | "version": 3
357 | },
358 | "file_extension": ".py",
359 | "mimetype": "text/x-python",
360 | "name": "python",
361 | "nbconvert_exporter": "python",
362 | "pygments_lexer": "ipython3",
363 | "version": "3.8.10"
364 | }
365 | },
366 | "nbformat": 4,
367 | "nbformat_minor": 4
368 | }
--------------------------------------------------------------------------------
/domain_cat_module.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re
4 | import json
5 | import math
6 | from difflib import SequenceMatcher
7 | import plotly.graph_objects as go
8 | import requests
9 | import networkx as nx
10 | import pandas as pd
11 | import numpy as np
12 | import scipy
13 | import matplotlib
14 | import matplotlib.pyplot as plt
15 | from ipywidgets import interactive, HBox, VBox
16 | import ipywidgets as widgets
17 | from IPython.display import HTML, display
18 | import tabulate
19 | from dotenv import dotenv_values
20 |
21 |
22 | # load REST API creds from .env file
23 | dcat_config = dotenv_values(".env")
24 |
25 | def show_iris_query_ui(domain_list_ui, search_hash_ui):
26 | lookup_ui = widgets.VBox([
27 | widgets.Label(value="Enter a return delimited list of domains to lookup (no commas, no quotes)"),
28 | domain_list_ui,
29 | widgets.Label(value="Or..."),
30 | widgets.Label(value="Enter an Iris search hassh to lookup"),
31 | search_hash_ui,
32 | ])
33 | return lookup_ui
34 |
35 |
36 | def clean_domain_list(domain_list_ui):
37 | # remove any quotes, spaces, or defanging square brackets
38 | full_domain_list = domain_list_ui.value.strip().replace(' ', '').replace('"', '').replace("'", "").replace('[', '').replace(']', '')
39 | # replace commas with new lines
40 | full_domain_list = full_domain_list.replace(",", "\n")
41 | # update the widget
42 | domain_list_ui.value = full_domain_list
43 | # split into array
44 | return full_domain_list.split("\n")
45 |
46 |
47 | def get_rest_api_creds(api_username_ui, api_pw_ui):
48 | api_username = api_username_ui.value
49 | if len(api_username) == 0:
50 | api_username = dcat_config["IRIS_API_USERNAME"]
51 | api_key = api_pw_ui.value
52 | if len(api_key) == 0:
53 | api_key = dcat_config["IRIS_API_KEY"]
54 | return api_username, api_key
55 |
56 |
57 | def query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui):
58 | api_username, api_key = get_rest_api_creds(api_username_ui, api_pw_ui)
59 | if len(domain_list_ui.value) > 0:
60 | # split list of domains into groups of 100 because of API restrictions
61 | results = []
62 | full_domain_list = clean_domain_list(domain_list_ui)
63 | max_domains = 100
64 | start = 0
65 | end = max_domains
66 | for x in range(math.ceil(len(full_domain_list) / max_domains)):
67 | # slice out max domains to query
68 | partial_domain_list = full_domain_list[start:end]
69 | # build query string
70 | domain_list = ",".join(partial_domain_list)
71 | iris_query = {"api_username": api_username, "api_key": api_key, "domain": domain_list}
72 | # query rest api
73 | print(f"...querying Iris REST API for {len(partial_domain_list)} domains")
74 | iris_results = _query_iris_rest_api(api_username, api_key, iris_query)
75 | # build up the set of return domain objects
76 | results = results + iris_results["response"]["results"]
77 | # update slice indexes
78 | start = end
79 | end += max_domains
80 | return results
81 | elif len(search_hash_ui.value) > 0:
82 | iris_query = {"api_username": api_username, "api_key": api_key, "search_hash": search_hash_ui.value}
83 | iris_results = _query_iris_rest_api(api_username, api_key, iris_query)
84 | iris_results = iris_results["response"]["results"]
85 | return iris_results
86 | else:
87 | print("Domain List and Search Hash text boxes are empty. Please enter either a list of domains or search hash to lookup")
88 | raise Exception("Domain List and Search Hash text boxes are empty")
89 |
90 |
91 | def _query_iris_rest_api(api_username: str, api_key: str, iris_query: dict):
92 | root_api_url = "https://api.domaintools.com/v1/iris-investigate/"
93 | resp = requests.post(root_api_url, data=iris_query)
94 | if resp.status_code != 200:
95 | raise Exception(f'POST /iris-investigate/ {resp.status_code}: {resp.text}')
96 | iris_results = resp.json()
97 | return iris_results
98 |
99 |
100 | def remove_domains_from_graph(graph, remove_domains_ui):
101 | domains = clean_domain_list(remove_domains_ui)
102 | for domain in domains:
103 | if graph.has_node(domain):
104 | graph.remove_node(domain)
105 | return graph
106 |
107 |
108 | class Config(object):
109 | """ Little helper class to hold all the config values"""
110 |
111 |
112 | class Domain(object):
113 | """ Little helper class to hold the domain name and risk score
114 | """
115 | def __init__(self, domain_json):
116 | self.json = domain_json
117 | self.name = domain_json["domain"]
118 | self.risk_score = domain_json["domain_risk"]['risk_score']
119 | self.pivot_categories = {}
120 | self.label=f"{self.name} ({self.risk_score})"
121 |
122 | def __str__(self):
123 | return f"name: {self.name}, risk: {self.risk_score}"
124 |
125 | def __repr__(self):
126 | return str(self)
127 |
128 |
129 | class DomainRelationship(object):
130 | def __init__(self, weight: float, category: str):
131 | # this is the maximum weight that an edge can have.
132 | # Adjust this if you want to play around with stronger edge weights
133 | self.max_weight = 5.0
134 | self.weight = weight
135 | self.categories = [category]
136 |
137 | def __str__(self):
138 | return f"weight: {self.weight}, categories: {self.categories}"
139 |
140 | def __repr__(self):
141 | return str(self)
142 |
143 | def add(self, weight: float, category: str):
144 | """ Note: certain pivot categories can be added more than once for 2 domains;
145 | things like IP and name server. For example, two domains could be on the same set of 5
146 | IP addresses. For now the weights are just summed if there is more than one pivot of
147 | the same category, but maybe we need a different strategy. Since IPs have multiple pivots
148 | (ip address, country code, asn, isp) this means if there were 5 shared IPs between two
149 | domains, the weight would be: 4 * 5 * pivot_weight.
150 | This might over amplify the edge strength
151 | """
152 | if category not in self.categories:
153 | # this helps by not overly boosting the edge weight if two domains share
154 |             # multiple IP addresses
155 | self.weight += weight
156 | if self.weight > self.max_weight:
157 | self.weight = self.max_weight
158 | self.categories.append(category)
159 |
160 | def get_description(self):
161 |         return "\n".join(sorted(self.categories))
162 |
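# Illustrative sketch of how DomainRelationship accumulates and caps edge weight
# (comments only, not executed):
#   rel = DomainRelationship(2.0, "ip_address")
#   rel.add(2.0, "ns_host")     # new category: weight becomes 4.0
#   rel.add(2.0, "ip_address")  # repeated category: ignored, weight stays 4.0
#   rel.add(2.0, "ssl_hash")    # would be 6.0, but capped at max_weight, so 5.0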
163 |
164 | class PivotValue(object):
165 | def __init__(self, pivot_value, pivot_count):
166 | self.pivot_value = pivot_value
167 | self.pivot_count = pivot_count
168 | self.domains = set()
169 |
170 | def union(self, other: "PivotValue"):
171 |         self.domains = self.domains.union(other.domains)
172 |
173 | def __str__(self):
174 | return f"pivot_value: {self.pivot_value}, " \
175 | f"pivot_count: {self.pivot_count}, " \
176 | f"domains: {self.domains}"
177 |
178 | def __repr__(self):
179 | return str(self)
180 |
181 |
182 | def get_edge_count(n: int):
183 | # for a complete graph, the edge count is: n(n-1)/2
184 | return n * (n - 1) / 2
185 |
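# Worked example: for n = 5 domains, get_edge_count(5) = 5 * 4 / 2 = 10 possible edges.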
186 |
187 | def build_domain_pivot_graph(iris_results: list, config: "Config"):
188 | """ Main workflow function that takes the results from an Iris Investigate query and
189 | builds the graph object of how each of the domains in the query are connected to each other"""
190 |
191 | # parse the Iris API Result to build the pivot data structure
192 | graph, pivot_categories = init_local_pivot_graph(iris_results, config)
193 |
194 | # normalize registrar pivots (see note in function comments)
195 | #if "registrar" in pivot_categories and config.normalize_registrars:
196 | # normalize_similar_registrars(pivot_categories["registrar"])
197 |
198 | # create pivots for longest common substrings
199 | pivot_on_matching_substrings(graph, pivot_categories, config)
200 |
201 | # trim pivots from graph that have less than the set count threshold or contain all domains
202 | trim_pivots(pivot_categories, len(graph.nodes), config)
203 |
204 | # trim unconnected domains and domains with only a create date pivot
205 | trimmed_unconnected_domains = trim_unconnected_domains(graph, pivot_categories, config)
206 | trimmed_create_date_domains = trim_domains_with_only_create_date_pivot(graph, pivot_categories)
207 |
208 | print(f"{len(trimmed_unconnected_domains)} "
209 | f"domains trimmed because they were not connected to other domains")
210 | print(f"{len(trimmed_create_date_domains)} "
211 |           f"domains trimmed because create_date was the only pivot")
212 | print(f"{len(graph.nodes)} domains in pivot structure \n")
213 |
214 | # build the graph structure based on the domain pivots
215 | graph = build_domain_graph(graph, pivot_categories, config)
216 | return (graph,
217 | pivot_categories,
218 | {"unconnected": trimmed_unconnected_domains,
219 | "create_date": trimmed_create_date_domains})
220 |
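# Illustrative usage sketch (comments only, not executed; assumes iris_results and config
# are populated as in the notebook):
#   graph, pivot_categories, trimmed = build_domain_pivot_graph(iris_results, config)
#   trimmed["unconnected"]  # domains removed for having no shared pivots
#   trimmed["create_date"]  # domains removed for having only a create_date pivot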
221 |
222 | def init_local_pivot_graph(iris_results: list, config: "Config"):
223 | """ Collect pivot categories found in result set ("ssl_hash" for example)"""
224 | # init empty graph
225 | graph = nx.Graph()
226 | # init pivot categories dict
227 | pivot_categories = {}
228 |
229 | for domain_json in iris_results:
230 |
231 | # check if domain is active or not
232 |         if not domain_json['active'] and config.active_domains_only:
233 | continue
234 |
235 | # create a domain object
236 | domain = Domain(domain_json)
237 |
238 | # add domain node to graph
239 | graph.add_node(domain.name, domain=domain)
240 |
241 | append_value_with_count(pivot_categories, 'adsense', domain_json, domain, config)
242 | append_value_with_count(pivot_categories, 'google_analytics', domain_json, domain, config)
243 | append_value_with_count(pivot_categories, 'create_date', domain_json, domain, config)
244 | append_value_with_count(pivot_categories, 'redirect_domain', domain_json, domain, config)
245 | append_value_with_count(pivot_categories, 'registrar', domain_json, domain, config)
246 |
247 | # haven't seen "ssl_email" in the wild yet, so not sure if it is a value/count or just value
248 | append_values_with_counts(pivot_categories, 'ssl_email', domain_json, domain, config)
249 |
250 | # IPs are composite objects, so pull out each value for each IP
251 | for ip_json in domain_json["ip"]:
252 | # at some point add logic to add /24 in here
253 | append_value_with_count(pivot_categories, 'address', ip_json, domain, config, 'ip_address')
254 | append_value_with_count(pivot_categories, 'country_code', ip_json, domain, config, 'ip_country_code')
255 | append_value_with_count(pivot_categories, 'isp', ip_json, domain, config, 'ip_isp')
256 | append_values_with_counts(pivot_categories, 'asn', ip_json, domain, config, 'ip_asn')
257 |
258 | # name servers are composite objects, so pull out each value for each name server
259 | for ns_json in domain_json["name_server"]:
260 | append_value_with_count(pivot_categories, 'host', ns_json, domain, config, 'ns_host')
261 | append_value_with_count(pivot_categories, 'domain', ns_json, domain, config, 'ns_domain')
262 | append_values_with_counts(pivot_categories, 'ip', ns_json, domain, config, 'ns_ip')
263 |
264 | append_value(pivot_categories, 'tld', domain_json, domain, config)
265 |
266 | # ssl certs are composite objects, so pull out each value for each ssl cert
267 | for ssl_json in domain_json['ssl_info']:
268 | append_value_with_count(pivot_categories, 'hash', ssl_json, domain, config, "ssl_hash")
269 | append_value_with_count(pivot_categories, 'subject', ssl_json, domain, config, "ssl_subject")
270 | append_value_with_count(pivot_categories, 'organization', ssl_json, domain, config, "ssl_org")
271 |
272 | # mx servers are composite objects, so pull out each value for each mx server
273 | for mx_json in domain_json['mx']:
274 | append_value_with_count(pivot_categories, 'host', mx_json, domain, config, "mx_host")
275 | append_value_with_count(pivot_categories, 'domain', mx_json, domain, config, "mx_domain")
276 | append_values_with_counts(pivot_categories, 'ip', mx_json, domain, config, "mx_ip")
277 |         # mx priority might be interesting at some point for node strength
278 | return graph, pivot_categories
279 |
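# Illustrative shape of the returned pivot structure (hypothetical values):
#   pivot_categories = {
#       "ip_address": {"203.0.113.7": PivotValue("203.0.113.7", 42)},
#       "ns_host": {"ns1.example.com": PivotValue("ns1.example.com", 1200)},
#   }
# where each PivotValue.domains is the set of domain names from this result set sharing that value.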
280 |
281 | def append_value(pivot_categories: dict,
282 | pivot_category: str,
283 | json_data: dict,
284 | domain: "Domain",
285 | config: "Config",
286 | new_pivot_category: str = None):
287 | # check if pivot is in domain json
288 | if pivot_category in json_data:
289 | pivot_value = str(json_data[pivot_category]).strip()
290 |
291 | # check we have a value to add
292 | if len(pivot_value) > 0:
293 | _append_value_to_pivot(pivot_categories, pivot_category, pivot_value, None,
294 | domain, config, new_pivot_category)
295 |
296 |
297 | def append_value_with_count(pivot_categories: dict,
298 | pivot_category: str,
299 | json_data: dict,
300 | domain: "Domain",
301 | config: "Config",
302 | new_pivot_category: str = None):
303 | # check if pivot is in domain json
304 | if pivot_category in json_data:
305 | if isinstance(json_data[pivot_category], dict):
306 | pivot_value = str(json_data[pivot_category]["value"]).strip()
307 | global_pivot_count = json_data[pivot_category]["count"]
308 |
309 | # trim pivots that are above the threshold (except create_date)
310 | if global_pivot_count < config.global_count_threshold or pivot_category == "create_date":
311 | # check we have a value to add
312 | if len(pivot_value) > 0 and global_pivot_count > 0:
313 | _append_value_to_pivot(pivot_categories, pivot_category, pivot_value,
314 | global_pivot_count, domain, config, new_pivot_category)
315 |
316 |
317 | def append_values_with_counts(pivot_categories: dict,
318 | pivot_category: str,
319 | json_data: dict,
320 | domain: "Domain",
321 | config: "Config",
322 | new_pivot_category: str = None):
323 | # check if pivot is in domain json
324 | if pivot_category in json_data:
325 | for pivot in json_data[pivot_category]:
326 | pivot_value = str(pivot["value"]).strip()
327 | global_pivot_count = pivot["count"]
328 |
329 | # check if we want to add this value
330 | if len(pivot_value) > 0 and global_pivot_count > 0 and global_pivot_count < config.global_count_threshold:
331 | _append_value_to_pivot(pivot_categories, pivot_category, pivot_value,
332 | global_pivot_count, domain, config, new_pivot_category)
333 |
334 |
335 | def _append_value_to_pivot(pivot_categories: dict,
336 | pivot_category: str,
337 | pivot_value: str,
338 | global_pivot_count: int,
339 | domain: "Domain",
340 | config: "Config",
341 | new_pivot_category: str = None):
342 | # if we pass in a new_pivot_category, replace pivot_category with new_pivot_category
343 | if new_pivot_category:
344 | pivot_category = new_pivot_category
345 |
346 | # check if we're capturing data for this pivot category
347 | if pivot_category not in config.pivot_category_config:
348 | return
349 |
350 | # make sure we have the pivot dictionary
351 | if pivot_category not in pivot_categories:
352 | pivot_categories[pivot_category] = {}
353 |
354 | # make sure we have the pivot value set
355 | if pivot_value not in pivot_categories[pivot_category]:
356 | pivot_categories[pivot_category][pivot_value] = PivotValue(pivot_value, global_pivot_count)
357 |
358 | # add domain to the pivot domain array
359 | pivot_categories[pivot_category][pivot_value].domains.add(domain.name)
360 |
361 | # add pivot category and value to the domain
362 | if pivot_category not in domain.pivot_categories:
363 | domain.pivot_categories[pivot_category] = []
364 | domain.pivot_categories[pivot_category].append(pivot_value)
365 |
366 |
367 | def normalize_similar_registrars(registrar_pivots: dict):
368 | """ The same registrar can often show up in WHOIS records with different string values.
369 | For example:
370 | NAMECHEAP
371 | NAMECHEAP INC
372 | NAMECHEAP. INC
373 | NAMECHEAP, INC
374 | NAMECHEAP, INC.
375 |
376 |     This function splits the registrar string on any non-word character and selects the longest
377 |     word as the normalized registrar value. If any two registrars share the same normalized value,
378 |     then the domains from those two registrars will be merged. The end goal is that all the domains
379 |     from the 5 different NAMECHEAP registrar strings shown above would be merged into one.
380 |
381 |     Note: this isn't a very good solution. There are cases where it will create invalid connections
382 |     between domains. For example, two different registrars that share a common longest word in
383 |     their name, like "NAMECHEAP, INC" and "NOT NAMECHEAP, INC".
384 |
385 | It looks like this happens a lot so turning off the feature for now.
386 |
387 | TODO: this algorithm needs work. it allows things such as
388 | good
389 | PDR LTD. D/B/A PUBLICDOMAINREGISTRY.COM == PDR Ltd. d/b/a PublicDomainRegistry.com
390 | GODADDY.COM, == LLC GODADDY.COM, INC
391 | NAMECHEAP, INC == NameCheap, Inc.
392 | bad
393 | TUCOWS DOMAINS INC == WILD WEST DOMAINS, INC
394 | NETWORK SOLUTIONS, == LLC Network Solutions, LLC
395 | NETWORK SOLUTIONS, == LLC BIGROCK SOLUTIONS LTD
396 | """
397 | return
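    # Illustrative example of the normalization used in the disabled code below (not executed):
    #   sorted(set(re.findall(r"[\w']+", "NAMECHEAP, INC.".lower())), key=len, reverse=True)[0]
    # yields "namecheap" for every NAMECHEAP variant above, but would also match
    # "NOT NAMECHEAP, INC", which is why the feature is turned off.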
398 | # registrars = [registrar for registrar in registrar_pivots]
399 | # for x in range(len(registrars)):
400 | # reg1 = registrars[x]
401 | # if reg1 in registrar_pivots:
402 | # # normalize registrar string
403 | # reg1_norm = sorted(
404 | # list(set(re.findall(r"[\w']+", reg1.lower()))), key=len, reverse=True)[0]
405 | # for y in range(x+1, len(registrars)):
406 | # reg2 = registrars[y]
407 | # # normalize registrar string
408 | # reg2_norm = sorted(
409 | # list(set(re.findall(r"[\w']+", reg2.lower()))), key=len, reverse=True)[0]
410 | # if reg1_norm == reg2_norm:
411 | # # pick the registrar with the most domains
412 | # if registrar_pivots[reg1].pivot_count > registrar_pivots[reg2].pivot_count:
413 | # reg_keep = reg1
414 | # reg_pop = reg2
415 | # else:
416 | # reg_keep = reg2
417 | # reg_pop = reg1
418 | # # combine domains for matching registrars
419 | # registrar_pivots[reg_keep].union(registrar_pivots[reg_pop])
420 | # # remove reg_pop from dictionary of all registrar pivots
421 | # registrar_pivots.pop(reg_pop)
422 | # print(f"Merged registrar {reg_pop} into {reg_keep}")
423 |
424 |
425 | def pivot_on_matching_substrings(graph: "Graph", pivot_categories: dict, config: "Config"):
426 | """Create pivots between domains that share a common substring of
427 | `config.longest_common_substring` chars long.
428 |
429 | Note: SequenceMatcher has some known issues with not finding the longest match in very long
430 | strings, but does a pretty good job with shorter strings such as domain names.
431 | https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings
432 | """
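    # Illustrative example with hypothetical domain names (comments only, not executed):
    #   m = SequenceMatcher(None, "paypal-login", "secure-paypal", False).find_longest_match(0, 12, 0, 13)
    #   "paypal-login"[m.a : m.a + m.size]  ->  "paypal", which pivots the two domains together
    # whenever config.longest_common_substring <= 6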
433 | domains = list(graph.nodes)
434 | for x in range(len(domains)):
435 | domain1 = graph.nodes[domains[x]]["domain"]
436 | string1 = domain1.name.split('.')[0]
437 | # pull out substrings to ignore
438 | if config.ignore_substrings and len(config.ignore_substrings) > 0:
439 | for ignore in config.ignore_substrings:
440 | string1 = string1.replace(ignore, "")
441 | for y in range(x+1, len(domains)):
442 | domain2 = graph.nodes[domains[y]]["domain"]
443 | string2 = domain2.name.split('.')[0]
444 | # pull out substrings to ignore
445 | if config.ignore_substrings and len(config.ignore_substrings) > 0:
446 | for ignore in config.ignore_substrings:
447 | string2 = string2.replace(ignore, "")
448 | # find the longest common substring between the two domains
449 | matcher = SequenceMatcher(None, string1, string2, False)
450 | match = matcher.find_longest_match(0, len(string1), 0, len(string2))
451 | longest_match = string1[match.a: match.a + match.size]
452 | # check if the matching substring is long enough
453 | if len(longest_match) >= config.longest_common_substring:
454 | # add pivots
455 | _append_value_to_pivot(
456 | pivot_categories,
457 | "longest_common_substring",
458 | longest_match, None,
459 | domain1, config)
460 | _append_value_to_pivot(
461 | pivot_categories,
462 | "longest_common_substring",
463 | longest_match, None,
464 | domain2, config)
465 |
466 |
467 | def trim_pivots(pivot_categories: dict, domain_count: int, config: "Config"):
468 |     """ Remove two types of pivots: pivots that contain all the domains from the Iris result
469 |     set, and pivots that have fewer than the set threshold of domains from this result set.
470 |     By default, pivots that only have one domain are removed, but this can be configured by
471 |     setting the min_pivot_size variable to a different value. For example, set min_pivot_size
472 |     to 10 to only use pivots that have 10 or more domains connected to them.
473 | """
474 | for pivot_category_key in pivot_categories:
475 | pivot_category = pivot_categories[pivot_category_key]
476 | total_pivots = 0
477 | del_count = 0
478 | for pivot_value in list(pivot_category.keys()):
479 | total_pivots += 1
480 | if len(pivot_category[pivot_value].domains) < config.min_pivot_size:
481 | # check for pivots with less than the threshold value
482 | del pivot_category[pivot_value]
483 | del_count += 1
484 | elif len(pivot_category[pivot_value].domains) >= domain_count:
485 | # check for pivots with all domains in them
486 | del pivot_category[pivot_value]
487 | if config.print_debug_output:
488 | print(f"deleted {pivot_category_key}:{pivot_value}. Contained all domains")
489 | if config.print_debug_output:
490 | print(f"deleted {del_count} "
491 | f"singleton pivots out of {total_pivots} "
492 | f"pivots from {pivot_category_key}")
493 |
494 |
495 | def trim_unconnected_domains(graph: "Graph", pivot_categories: dict, config: "Config"):
496 |     """ Remove any domains that have no shared connection to any other domain
497 | """
498 | if config.print_debug_output: print(f"{len(graph.nodes)} domains in Iris result set")
499 | connected_domains = set()
500 | for pivot_category_key in pivot_categories:
501 | pivot_category = pivot_categories[pivot_category_key]
502 | for pivot_value in list(pivot_category.keys()):
503 | pivot_domains = pivot_category[pivot_value].domains
504 | connected_domains = connected_domains.union(pivot_domains)
505 |
506 | # get the set of domains that are not connected
507 | domains = set(graph.nodes)
508 | lonely_domains = domains.difference(connected_domains)
509 |
510 | # remove unconnected domains
511 | for domain in lonely_domains:
512 | graph.remove_node(domain)
513 |
514 | if config.print_debug_output:
515 | print(f"{len(connected_domains)} domains are interconnected")
516 | print(f"{len(lonely_domains)} domains are unconnected")
517 | print("Unconnected domains removed from graph:")
518 | for domain in lonely_domains:
519 | print(f" {domain}")
520 |
521 | return lonely_domains
522 |
523 |
524 | def trim_domains_with_only_create_date_pivot(graph: "Graph", pivot_categories: dict):
525 | """ if a domain ONLY has a create_date pivot, then that isn't a very good indicator of
526 | connectedness."""
527 | # identify domains to trim
528 | trimmed_domains = []
529 |     for domain_name in list(graph.nodes):  # copy the node list since nodes are removed below
530 |         domain = graph.nodes[domain_name]["domain"]
531 |         if len(domain.pivot_categories) == 1 and "create_date" in domain.pivot_categories:
532 |             trimmed_domains.append(domain)
533 |             # remove domain from graph and remove it from the main pivot_categories data structure
534 |             graph.remove_node(domain_name)
535 |
536 |             domain_create_date = domain.pivot_categories["create_date"][0]
537 |             pivot_categories["create_date"][domain_create_date].domains.remove(domain_name)
538 |             if len(pivot_categories["create_date"][domain_create_date].domains) == 0:
539 | pivot_categories["create_date"].pop(domain_create_date)
540 | if len(pivot_categories["create_date"]) == 0:
541 | pivot_categories.pop("create_date")
542 |
543 | return trimmed_domains
544 |
545 |
546 | def get_pivot_connection_weight(pivot_category: str,
547 | global_pivot_count: int,
548 | local_pivot_count: int,
549 | config: "Config"):
550 | """ If we aren't using the pivot count to set the edge weight, just return a constant value of
551 | 1 for every pivot. If we do want to use the pivot count, use the function:
552 | 1 - (log(pivot count) / (log(max possible pivot count)))
553 | This creates an inverse log ratio where small pivots have a high edge weight,
554 | and very large pivots have a low edge weight.
555 |
556 | Note: also experimenting with raising this log ratio to different exponents to get greater
557 | separation between large and small pivots: math.pow(1.0 + inverse_log_ratio, 3) - 1
558 | """
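    # Worked example (assuming a hypothetical config.max_domains of 500):
    #   global_pivot_count = 10  -> 1 - ln(11)/ln(501)  ~= 1 - 2.40/6.22 ~= 0.61 (small pivot, strong edge)
    #   global_pivot_count = 400 -> 1 - ln(401)/ln(501) ~= 1 - 5.99/6.22 ~= 0.04 (large pivot, weak edge)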
559 | if pivot_category not in config.pivot_category_config:
560 | raise Exception(f"Unexpected Pivot Category: {pivot_category}")
561 |
562 | # scale the edge strength based on the ratio of the global pivot count vs the max domains
563 | if config.scale_edge_strength_by_pivot_count:
564 | if global_pivot_count is None:
565 | # Some pivots don't have a count. For example, tld or longest common substring.
566 |             # if global pivot count is None, for now return a fixed weight of 0.5 (?)
567 |             # But we probably need to then normalize this weight against the max weight calculated.
568 |             # Also, TLD doesn't have a pivot count because it's often huge. Is that the same
569 |             # importance as common substrings? Probably not.
570 | return 0.5
571 | inv_ratio = 1.0 - math.log(1.0 + global_pivot_count) / math.log(1.0 + config.max_domains)
572 | return inv_ratio
573 | # return math.pow(1.0 + inverse_log_ratio, 3) - 1
574 | return 1
575 |
576 |
577 | def build_domain_graph(graph: "Graph", pivot_categories: dict, config: "Config"):
578 |     # The graph is initialized with all its nodes. Now we need to connect all the nodes
579 | # with each local pivot in the pivot_categories dict
580 | edge_count = 0
581 | for category in pivot_categories:
582 | for pivot_value in pivot_categories[category]:
583 | pivot = pivot_categories[category][pivot_value]
584 | pivot_domains = list(pivot.domains)
585 |
586 | # for each pair of domains in pivot, get the edge weight and create edge
587 | weight = get_pivot_connection_weight(category, pivot.pivot_count, len(pivot_domains), config)
588 | if weight > 0:
589 | for x in range(len(pivot_domains)):
590 | for y in range(x+1, len(pivot_domains)):
591 | d1 = pivot_domains[x]
592 | d2 = pivot_domains[y]
593 | edge_count += 1
594 | if graph.has_edge(d1, d2):
595 | graph[d1][d2]['relationship'].add(weight, category)
596 | else:
597 | graph.add_edge(d1, d2, relationship=DomainRelationship(weight, category))
598 |
599 | # now that all edges are added, set the weight attribute with the adjusted weight
600 | for edge in graph.edges:
601 | graph[edge[0]][edge[1]]['weight'] = graph[edge[0]][edge[1]]['relationship'].weight
602 |
603 | print(f"Total Graph Connections: {edge_count}")
604 | print(f"Distinct Graph Connections: {len(graph.edges)}")
605 | return graph
606 |
607 |
608 | def calc_pivot_stats(graph: "Graph", pivot_categories: dict):
609 | from IPython.display import HTML, display
610 | import tabulate
611 |
612 | # calc the max number of edges possible for this set of domains
613 | max_edge_count = get_edge_count(len(graph.nodes))
614 |
615 | # collect counts for each pivot category
616 | category_domain_counts = {}
617 | category_edge_counts = {}
618 | for category_key in pivot_categories:
619 | category_domain_counts[category_key] = 0
620 | category_edge_counts[category_key] = 0
621 | category = pivot_categories[category_key]
622 | for pivot_value in category:
623 | category_domain_counts[category_key] += len(category[pivot_value].domains)
624 |
625 |             # if all domains share a pivot value, they would form a complete graph,
626 |             # so get the edge count for a complete graph
627 |             edge_count = get_edge_count(len(category[pivot_value].domains))
628 | category_edge_counts[category_key] += round(edge_count)
629 |
630 | total_connections = 0
631 |
632 | headers = ["Pivot Category",
633 | "# of Domains",
634 | "# of Pivots",
635 | "avg domains per pivot",
636 | "# of connections"]
637 | table = []
638 | total_domains = len(graph.nodes)
639 | for category_key in category_domain_counts:
640 | cat_pivot_count = len(pivot_categories[category_key])
641 | if cat_pivot_count > 0:
642 | domain_count = category_domain_counts[category_key]
643 | edge_count = category_edge_counts[category_key]
644 |
645 | total_connections += edge_count
646 |
647 | avg_domains = domain_count / cat_pivot_count
648 | percent_of_total_domains = round(100 * (domain_count / total_domains), 2)
649 | percent_of_total_edges = round(100 * (edge_count / max_edge_count), 2)
650 | table.append([category_key,
651 | f"{domain_count} ({percent_of_total_domains}%)",
652 | cat_pivot_count,
653 | round(avg_domains, 2),
654 | f"{edge_count} ({percent_of_total_edges}%)"])
655 |
656 | print(f"{len(graph.nodes)} Domains in Pivot Structure")
657 | display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html')))
658 |
659 |
660 | def calc_viz_layout(layout: str, graph: "Graph", dimension: int):
661 | # KK layout only
662 | if layout == "kk":
663 | return nx.layout.kamada_kawai_layout(graph, dim=dimension)
664 |
665 | # spring layout only
666 | if layout == "fr":
667 | return nx.layout.spring_layout(graph, dim=dimension)
668 |
669 | # kk layout as initialization for spring layout
670 | if layout == "kk_to_fr":
671 | pos = nx.layout.kamada_kawai_layout(graph, dim=dimension, weight=None)
672 | return nx.layout.spring_layout(graph, pos=pos, dim=dimension)
673 |
674 | # spring layout as initialization for kk layout
675 | if layout == "fr_to_kk":
676 | pos = nx.layout.spring_layout(graph, dim=dimension)
677 | return nx.layout.kamada_kawai_layout(graph, pos=pos, dim=dimension)
678 | raise Exception("invalid layout choice")
679 |
680 |
681 | def build_3d_graph_layout(graph: "Graph"):
682 | """ Build the graph layout based on the specified algorithm and get the node positions
683 | in xyz dimensions"""
684 | pos = calc_viz_layout("kk_to_fr", graph, 3)
685 |
686 | node_labels, node_risk_scores, Xn, Yn, Zn = [], [], [], [], []
687 | for name in graph.nodes:
688 | # build x,y,z coordinates data structure for nodes
689 | Xn.append(pos[name][0])
690 | Yn.append(pos[name][1])
691 | Zn.append(pos[name][2])
692 |
693 | # get domain colors by risk score
694 | domain = graph.nodes[name]["domain"]
695 | node_labels.append(domain.label)
696 | node_risk_scores.append(domain.risk_score)
697 |
698 | # build x,y,z coordinates data structure for edges
699 | Xe, Ye, Ze = [], [], []
700 | for e in graph.edges:
701 | u = pos[e[0]]
702 | v = pos[e[1]]
703 | Xe+=[u[0], v[0], None]
704 | Ye+=[u[1], v[1], None]
705 | Ze+=[u[2], v[2], None]
706 |
707 | # Create the 3d Plotly graph and render it
708 | # build line objects for our edges
709 | trace1=go.Scatter3d(x=Xe, y=Ye, z=Ze,
710 | mode='lines',
711 | name='edges',
712 | line=dict(color='rgb(125,125,125)', width=0.5),
713 | opacity=0.9,
714 | hoverinfo='none')
715 |
716 | trace2=go.Scatter3d(
717 | x=Xn, y=Yn, z=Zn,
718 | mode='markers',
719 | name='domains',
720 | marker=dict(
721 | symbol='circle',
722 | size=6,
723 | showscale=True,
724 | color=node_risk_scores,
725 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']],
726 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color
727 | cmin=0, cmax=100,
728 | reversescale=True,
729 | line=dict(color='rgb(50,50,50)', width=0.5),
730 | colorbar=dict(
731 | thickness=15,
732 | title='Risk Score',
733 | xanchor='left',
734 | titleside='right'
735 | ),
736 | ),
737 | text=node_labels,
738 | hoverinfo='text')
739 |
740 | # background definition, but everything is turned off
741 | axis=dict(showbackground=False,
742 | showline=False,
743 | zeroline=False,
744 | showgrid=False,
745 | showticklabels=False,
746 | title='')
747 |
748 | layout = go.Layout(
749 | title=f"Graph of interconnected domains ({len(node_labels)} domains)",
750 | width=1000, height=1000,
751 | showlegend=False,
752 | scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)),
753 | margin=dict(t=100), hovermode='closest')
754 |
755 | data=[trace1, trace2]
756 | fig=go.Figure(data=data, layout=layout)
757 | return fig
758 |
759 |
760 | def build_2d_graph_layout(graph: "Graph", get_2d_shared_pivots: "function"):
761 | """ build the graph layout based on the specified algorithm and get the node positions
762 | in xy dimensions"""
763 | pos = calc_viz_layout("kk_to_fr", graph, 2)
764 | # pos = calc_viz_layout("fr_to_kk", g, 2)
765 |
766 | # build edge data
767 | edge_x, edge_y = [], []
768 | for e in graph.edges():
769 | x0, y0 = pos[e[0]]
770 | x1, y1 = pos[e[1]]
771 | edge_x.append(x0)
772 | edge_x.append(x1)
773 | edge_x.append(None)
774 | edge_y.append(y0)
775 | edge_y.append(y1)
776 | edge_y.append(None)
777 |
778 | # create edge scatter plot
779 | edge_trace = go.Scatter(
780 | x=edge_x, y=edge_y,
781 | line=dict(width=0.5, color='#888'),
782 | hoverinfo='none',
783 | mode='lines',
784 | opacity=0.6
785 | )
786 |
787 | # build node data
788 | node_adjacencies, node_risk_scores, node_text, node_x, node_y = [], [], [], [], []
789 | names = list(graph.nodes)
790 | for name in names:
791 | domain = graph.nodes[name]["domain"]
792 | x, y = pos[name]
793 | node_x.append(x)
794 | node_y.append(y)
795 | # get the domain's connected nodes
796 | neighbors = list(graph.neighbors(name))
797 | node_adjacencies.append(neighbors)
798 | # get the node text
799 | node_text.append(f'{name}: risk {domain.risk_score}, connections {len(neighbors)}')
800 | # get the domain risk score
801 | node_risk_scores.append(domain.risk_score)
802 |
803 | # build node scatter plot
804 | node_trace = go.Scatter(
805 | x=node_x, y=node_y,
806 | mode='markers',
807 | hoverinfo='text',
808 | text=node_text,
809 | customdata=node_adjacencies,
810 | marker=dict(
811 | showscale=True,
812 | reversescale=True,
813 | color=node_risk_scores,
814 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']],
815 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color
816 | cmin=0, cmax=100,
817 | size=10,
818 | colorbar=dict(
819 | thickness=15,
820 | title='Risk Score',
821 | xanchor='left',
822 | titleside='right'
823 | ),
824 | line_width=2))
825 |
826 |     # create the jupyter widget holder for plotly
827 | fig = go.FigureWidget(
828 | [edge_trace, node_trace],
829 | layout=go.Layout(
830 | title=f'Graph of interconnected domains ({len(node_text)} domains)',
831 | titlefont_size=16,
832 | showlegend=False,
833 | hovermode='closest',
834 | margin=dict(b=5,l=5,r=5,t=30),
835 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
836 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
837 | )
838 |
839 | # handle selection of domains
840 | def node_selection_fn(trace, points, selector):
841 | selected_domains = [names[idx] for idx in points.point_inds]
842 | update_selected_domains(selected_domains)
843 |
844 | # handle node click events
845 | def node_click_fn(trace, points, selector):
846 | if len(points.point_inds) > 1:
847 | print(f"node_click passed in more than 1 point: {points.point_inds}")
848 |
849 | # clear the old selected points
850 | trace.selectedpoints = []
851 | if len(points.point_inds) == 0:
852 | return
853 |
854 | # get the list of selected domain names
855 | selected_domains = [names[idx] for idx in points.point_inds]
856 |         for idx in points.point_inds:
857 |             selected_domains = selected_domains + trace.customdata[idx]
858 |
859 |         # set the new selected points
860 |         # don't like having to loop in a loop to get the domain index, but I don't know a better way
861 |         trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[idx]]
862 |
863 | update_selected_domains(selected_domains)
864 |
865 | def update_selected_domains(selected_domains):
866 | if len(selected_domains) == 0:
867 | return
868 |
869 | # sort domains by length, then alpha
870 | selected_domains.sort(key=len, reverse=True)
871 | with out:
872 | # write selected domains to the output widget
873 | print(f"Selected Domains: ({len(selected_domains)})\n")
874 | for selected_domain in selected_domains:
875 | print(selected_domain)
876 | out.clear_output(wait=True)
877 |
878 | # calc pivots selected domains have in common
879 | get_2d_shared_pivots(graph, selected_domains)
880 |
881 | # event handler for node selection
882 | fig.data[1].on_selection(node_selection_fn)
883 |     # event handler for node click
884 | fig.data[1].on_click(node_click_fn)
885 |
886 |     # Create an Output widget that displays the list of selected domains
887 | out = widgets.Output(layout={'border': '1px solid black'})
888 | domain_ui = widgets.VBox((fig, out))
889 | return domain_ui
890 |
891 |
892 | def get_shared_pivots(graph: "Graph", selected_domains: list):
893 | shared_pivots = {}
894 | for name in selected_domains:
895 | domain = graph.nodes[name]["domain"]
896 | for cat in domain.pivot_categories:
897 | for cat_value in domain.pivot_categories[cat]:
898 | key = f"{cat}: {cat_value}"
899 | if key not in shared_pivots:
900 | shared_pivots[key] = []
901 | shared_pivots[key].append(domain)
902 |
903 |     # filter to pivots shared by at least 3 of the selected domains
904 | shared_pivots = {k: v for k, v in shared_pivots.items() if len(v) >= 3}
905 | return shared_pivots
906 |
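# Illustrative shape of the returned dict (hypothetical values): keys are "category: value"
# strings and values are the selected Domain objects sharing that pivot, e.g.
#   {"ns_host: ns1.example.com": [<Domain a>, <Domain b>, <Domain c>]}
# keeping only pivots shared by at least 3 of the selected domains.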
907 |
908 | def create_pivot_heatmaps(shared_pivots: dict):
909 | print("\n Heatmap of which pivots connect the most domains together: by pivot category")
910 | pivot_cat_crosstab, pivot_value_crosstab = create_pivot_tables(shared_pivots)
911 | fig, ax = plt.subplots(figsize=(10, 10))
912 | im = heatmap(
913 | pivot_cat_crosstab,
914 | pivot_cat_crosstab.index,
915 | pivot_cat_crosstab.columns,
916 | ax=ax,
917 | cmap="Blues")
918 | texts = annotate_heatmap(im, valfmt="{x}")
919 | fig.tight_layout()
920 | plt.show()
921 |
922 | print("\n Heatmap of which pivots connect the most domains together: by pivot value")
923 | fig, ax = plt.subplots(figsize=(10, 10))
924 | im = heatmap(
925 | pivot_value_crosstab,
926 | pivot_value_crosstab.index,
927 | pivot_value_crosstab.columns,
928 | ax=ax,
929 | cmap="Blues")
930 | texts = annotate_heatmap(im, valfmt="{x}")
931 | fig.tight_layout()
932 | plt.show()
933 |
934 | print("\n List of the most frequent pivot values")
935 | create_pivot_summary(pivot_value_crosstab)
936 |
937 |
938 | def create_pivot_tables(shared_pivots: dict):
939 | # Create the pandas DataFrame
940 | data = []
941 | for pivot_value in shared_pivots:
942 | for d in shared_pivots[pivot_value]:
943 | pivot_cat = pivot_value.split(": ")[0]
944 | data.append([d.name, pivot_cat, pivot_value])
945 | df = pd.DataFrame(data, columns = ['domain', 'pivot_cat', 'pivot'])
946 |
947 | # Build contingency table of domains to pivot
948 | pivot_cat_crosstab = pd.crosstab(df['pivot_cat'], df['domain'])
949 | pivot_value_crosstab = pd.crosstab(df['pivot'], df['domain'])
950 |
951 | # sort rows by total # of pivots
952 | pivot_cat_crosstab['sum'] = pivot_cat_crosstab[list(pivot_cat_crosstab.columns)].sum(axis=1)
953 |     pivot_cat_crosstab.sort_values("sum", axis=0, ascending=False, inplace=True)
954 |     pivot_cat_crosstab.drop(columns="sum", inplace=True)
955 |
956 | # sort rows by total # of pivots
957 | pivot_value_crosstab['sum'] = pivot_value_crosstab[list(pivot_value_crosstab.columns)].sum(axis=1)
958 |     pivot_value_crosstab.sort_values("sum", axis=0, ascending=False, inplace=True)
959 |     pivot_value_crosstab.drop(columns="sum", inplace=True)
960 |
961 | return pivot_cat_crosstab, pivot_value_crosstab
962 |
963 |
964 | def create_pivot_summary(pivot_value_crosstab: "Pandas_CrossTab"):
965 |     # show just an output view of pivot name and count for selection
    # HTML/display/tabulate are used below, so import them locally here as well,
    # mirroring calc_pivot_stats above
    from IPython.display import HTML, display
    import tabulate
966 | summary = pivot_value_crosstab.copy()
967 | summary['count'] = summary[list(summary.columns)].sum(axis=1)
968 |     summary.sort_values("count", axis=0, ascending=False, inplace=True)
969 | summary = summary[["count"]]
970 |
971 |     headers = ["Pivot Category", "Pivot Value", "Count"]
972 | table = []
973 | for index, row in summary.iterrows():
974 | cat, pivot = index.split(": ")
975 | table.append([cat, pivot, row["count"]])
976 | display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html')))
977 |
978 |
979 |
980 | def heatmap(data, row_labels, col_labels, ax=None, cbar_kw={}, cbarlabel="", **kwargs):
981 | """
982 | Create a heatmap from a numpy array and two lists of labels.
983 |
984 | Parameters
985 | ----------
986 | data
987 | A 2D numpy array of shape (N, M).
988 | row_labels
989 | A list or array of length N with the labels for the rows.
990 | col_labels
991 | A list or array of length M with the labels for the columns.
992 | ax
993 | A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If
994 | not provided, use current axes or create a new one. Optional.
995 | cbar_kw
996 | A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.
997 | cbarlabel
998 | The label for the colorbar. Optional.
999 | **kwargs
1000 | All other arguments are forwarded to `imshow`.
1001 | """
1002 |
1003 | if not ax:
1004 | ax = plt.gca()
1005 |
1006 | # Plot the heatmap
1007 | im = ax.imshow(data, **kwargs)
1008 |
1009 | # We want to show all ticks...
1010 | ax.set_xticks(np.arange(data.shape[1]))
1011 | ax.set_yticks(np.arange(data.shape[0]))
1012 | # ... and label them with the respective list entries.
1013 | ax.set_xticklabels(col_labels)
1014 | ax.set_yticklabels(row_labels)
1015 |
1016 | # Let the horizontal axes labeling appear on top.
1017 | ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)
1018 |
1019 | # Rotate the tick labels and set their alignment.
1020 | plt.setp(ax.get_xticklabels(), rotation=-30, ha="right", rotation_mode="anchor")
1021 |
1022 | # Turn spines off and create white grid.
1023 | for edge, spine in ax.spines.items():
1024 | spine.set_visible(False)
1025 |
1026 | ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
1027 | ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
1028 | ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
1029 | ax.tick_params(which="minor", bottom=False, left=False)
1030 |
1031 | return im
1032 |
1033 |
1034 | def annotate_heatmap(im, data=None, valfmt="{x:.2f}", textcolors=["black", "white"],
1035 | threshold=None, **textkw):
1036 | """
1037 | A function to annotate a heatmap.
1038 |
1039 | Parameters
1040 | ----------
1041 | im
1042 | The AxesImage to be labeled.
1043 | data
1044 | Data used to annotate. If None, the image's data is used. Optional.
1045 | valfmt
1046 | The format of the annotations inside the heatmap. This should either
1047 | use the string format method, e.g. "$ {x:.2f}", or be a
1048 | `matplotlib.ticker.Formatter`. Optional.
1049 | textcolors
1050 | A list or array of two color specifications. The first is used for
1051 | values below a threshold, the second for those above. Optional.
1052 | threshold
1053 | Value in data units according to which the colors from textcolors are
1054 | applied. If None (the default) uses the middle of the colormap as
1055 | separation. Optional.
1056 | **kwargs
1057 | All other arguments are forwarded to each call to `text` used to create
1058 | the text labels.
1059 | """
1060 |
1061 | if not isinstance(data, (list, np.ndarray)):
1062 | data = im.get_array()
1063 |
1064 | # Normalize the threshold to the images color range.
1065 | if threshold is not None:
1066 | threshold = im.norm(threshold)
1067 | else:
1068 | threshold = im.norm(data.max())/2.
1069 |
1070 | # Set default alignment to center, but allow it to be
1071 | # overwritten by textkw.
1072 | kw = dict(horizontalalignment="center",
1073 | verticalalignment="center")
1074 | kw.update(textkw)
1075 |
1076 | # Get the formatter in case a string is supplied
1077 | if isinstance(valfmt, str):
1078 | valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)
1079 |
1080 | # Loop over the data and create a `Text` for each "pixel".
1081 | # Change the text's color depending on the data.
1082 | texts = []
1083 | for i in range(data.shape[0]):
1084 | for j in range(data.shape[1]):
1085 | kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
1086 | text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
1087 | texts.append(text)
1088 |
1089 | return texts
--------------------------------------------------------------------------------
/images/2d_click.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_click.gif
--------------------------------------------------------------------------------
/images/2d_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_v1.png
--------------------------------------------------------------------------------
/images/2d_zoom.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_zoom.gif
--------------------------------------------------------------------------------
/images/2d_zoom_select.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_zoom_select.gif
--------------------------------------------------------------------------------
/images/3d_infra.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_infra.gif
--------------------------------------------------------------------------------
/images/3d_v1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_v1.gif
--------------------------------------------------------------------------------
/images/3d_v2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_v2.gif
--------------------------------------------------------------------------------
/images/build_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/build_graph.png
--------------------------------------------------------------------------------
/images/config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/config.png
--------------------------------------------------------------------------------
/images/credentials.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/credentials.png
--------------------------------------------------------------------------------
/images/dash_gov.us_substrings.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/dash_gov.us_substrings.gif
--------------------------------------------------------------------------------
/images/domain_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/domain_data.png
--------------------------------------------------------------------------------
/images/domain_graph_2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/domain_graph_2d.png
--------------------------------------------------------------------------------
/images/intro_3d.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/intro_3d.gif
--------------------------------------------------------------------------------
/images/iris.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/iris.png
--------------------------------------------------------------------------------
/images/iris_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/iris_small.png
--------------------------------------------------------------------------------
/images/jupyter_cell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/jupyter_cell.png
--------------------------------------------------------------------------------
/images/pivot_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_heatmap.png
--------------------------------------------------------------------------------
/images/pivot_stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_stats.png
--------------------------------------------------------------------------------
/images/pivot_value_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_value_heatmap.png
--------------------------------------------------------------------------------
/images/reading_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/reading_data.png
--------------------------------------------------------------------------------
/images/run_3d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/run_3d.png
--------------------------------------------------------------------------------
/images/run_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/run_heatmap.png
--------------------------------------------------------------------------------
/images/running_a_cell.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/running_a_cell.gif
--------------------------------------------------------------------------------
/images/selected_domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/selected_domains.png
--------------------------------------------------------------------------------
/images/trimmed_domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/trimmed_domains.png
--------------------------------------------------------------------------------
/infrastructure_cat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# InfraCAT: Infrastructure Connectivity Analysis Tool\n",
8 | "\n",
9 | "### Analyzing the infrastructure connectivity of an Iris API Search"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {
16 | "collapsed": false,
17 | "jupyter": {
18 | "outputs_hidden": false
19 | },
20 | "pycharm": {
21 | "name": "#%%\n"
22 | }
23 | },
24 | "outputs": [
25 | {
26 | "name": "stdout",
27 | "output_type": "stream",
28 | "text": [
29 | "InfraCAT is ready to go\n"
30 | ]
31 | }
32 | ],
33 | "source": [
34 | "# Run This First: imports all the helper functions and sets stuff up\n",
35 | "%run infrastructure_cat_module.py\n",
36 | "\n",
37 | "print(\"InfraCAT is ready to go\")"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Iris REST API Credentials"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 2,
50 | "metadata": {
51 | "collapsed": false,
52 | "jupyter": {
53 | "outputs_hidden": false
54 | },
55 | "pycharm": {
56 | "name": "#%%\n"
57 | }
58 | },
59 | "outputs": [
60 | {
61 | "data": {
62 | "application/vnd.jupyter.widget-view+json": {
63 | "model_id": "842b0bc026174d7da491004391ec1055",
64 | "version_major": 2,
65 | "version_minor": 0
66 | },
67 | "text/plain": [
68 | "VBox(children=(Text(value='', description='Username:', layout=Layout(width='500px'), placeholder='Iris API Use…"
69 | ]
70 | },
71 | "metadata": {},
72 | "output_type": "display_data"
73 | }
74 | ],
75 | "source": [
76 | "api_username_ui = widgets.Text(placeholder='Iris API Username', description='Username:', layout={'width': '500px'}, value=\"\")\n",
77 | "api_pw_ui = widgets.Password(placeholder='Iris API Password', description='Password:', layout={'width': '500px'}, value=\"\")\n",
78 | "widgets.VBox([api_username_ui, api_pw_ui])"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "## Query Domain Data From Iris Investigate API\n",
86 | "\n",
87 | "Enter either a list of return delimited domains into the Domains text box, _OR_ an Iris search hash into the hash text box.\n",
88 | "\n",
89 |     "Note: if both a list of domains _AND_ a search hash are entered, the list of domains will be queried and the search hash will be ignored"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 3,
95 | "metadata": {
96 | "collapsed": false,
97 | "jupyter": {
98 | "outputs_hidden": false
99 | },
100 | "pycharm": {
101 | "name": "#%%\n"
102 | }
103 | },
104 | "outputs": [
105 | {
106 | "data": {
107 | "application/vnd.jupyter.widget-view+json": {
108 | "model_id": "e81fe61e74f24a0e952d36f48c13dbe9",
109 | "version_major": 2,
110 | "version_minor": 0
111 | },
112 | "text/plain": [
113 | "VBox(children=(Label(value='Enter a return delimited list of domains to lookup (no commas, no quotes)'), Texta…"
114 | ]
115 | },
116 | "metadata": {},
117 | "output_type": "display_data"
118 | }
119 | ],
120 | "source": [
121 | "domain_list_ui = widgets.Textarea(placeholder='Enter list of domains', description='Domains:', layout={'height': '300px', 'width': '700px'})\n",
122 |     "search_hash_ui = widgets.Text(placeholder='Enter search hash', description='Hash:', layout={'width': '700px'})\n",
123 | "show_iris_query_ui(domain_list_ui, search_hash_ui)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 15,
129 | "metadata": {
130 | "collapsed": false,
131 | "jupyter": {
132 | "outputs_hidden": false
133 | },
134 | "pycharm": {
135 | "name": "#%%\n"
136 | }
137 | },
138 | "outputs": [],
139 | "source": [
140 | "config = Config()\n",
141 | "\n",
142 | "# exclude certain infrastructure from graph\n",
143 | "# config.exclude_list = [\"EMAIL DOMAIN\"]\n",
144 | "config.exclude_list = []\n",
145 | "\n",
146 | "# only show infrastructure that is under the pivot threshold\n",
147 | "config.pivot_threshold = 500\n",
148 | "\n",
149 | "# Minimum should be 1 which means more than one domain has to show up in an edge\n",
150 | "config.edge_threshold = 1\n",
151 | "\n",
152 |     "# set whether or not to set node size to the unique number of domains in the edge\n",
153 | "config.node_size = True"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 16,
159 | "metadata": {
160 | "collapsed": false,
161 | "jupyter": {
162 | "outputs_hidden": false
163 | },
164 | "pycharm": {
165 | "name": "#%%\n"
166 | }
167 | },
168 | "outputs": [
169 | {
170 | "name": "stdout",
171 | "output_type": "stream",
172 | "text": [
173 | "Loaded 338 domains from data/dash_gov_dot_us.json\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "query_api = False\n",
179 | "save_search_to_disk = False\n",
180 | "json_file_path = \"data/dash_gov_dot_us.json\"\n",
181 | "\n",
182 | "if query_api:\n",
183 | " iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n",
184 | " print(f'Iris API returned {len(iris_results)} domains')\n",
185 | "\n",
186 | " # save search results to disk to be used later\n",
187 | " if save_search_to_disk:\n",
188 | " with open(json_file_path, 'w') as f:\n",
189 | " json.dump(iris_results, f)\n",
190 | "else:\n",
191 | " with open(json_file_path) as json_data:\n",
192 | " iris_results = json.loads(json_data.read())\n",
193 | "\n",
194 | " print(f'Loaded {len(iris_results)} domains from {json_file_path}')\n"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 20,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "380\n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "graph, config = build_infra_graph(iris_results, config)\n",
212 | "\n",
213 | "print(len(graph.nodes))"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 21,
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "data": {
223 | "application/vnd.jupyter.widget-view+json": {
224 | "model_id": "d37f5c75ad8c4a42b01667bd93d6fb71",
225 | "version_major": 2,
226 | "version_minor": 0
227 | },
228 | "text/plain": [
229 | "VBox(children=(FigureWidget({\n",
230 | " 'data': [{'hoverinfo': 'none',\n",
231 | " 'line': {'color': '#888', 'widt…"
232 | ]
233 | },
234 | "metadata": {},
235 | "output_type": "display_data"
236 | }
237 | ],
238 | "source": [
239 | "build_2d_graph_layout(graph, config)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 22,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "application/vnd.jupyter.widget-view+json": {
250 | "model_id": "5c83bc927cca4294aaba3fb6c85f728c",
251 | "version_major": 2,
252 | "version_minor": 0
253 | },
254 | "text/plain": [
255 | "VBox(children=(FigureWidget({\n",
256 | " 'data': [{'hoverinfo': 'none',\n",
257 | " 'line': {'color': 'rgb(125,125,…"
258 | ]
259 | },
260 | "metadata": {},
261 | "output_type": "display_data"
262 | }
263 | ],
264 | "source": [
265 | "build_3d_graph_layout(graph, config)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 34,
271 | "metadata": {
272 | "collapsed": false,
273 | "jupyter": {
274 | "outputs_hidden": false
275 | },
276 | "pycharm": {
277 | "name": "#%%\n"
278 | }
279 | },
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "Loaded 195 domains from data/treatment_care.json\n"
286 | ]
287 | }
288 | ],
289 | "source": [
290 | "query_api = False\n",
291 | "save_search_to_disk = False\n",
292 | "json_file_path = \"data/treatment_care.json\"\n",
293 | "\n",
294 | "if query_api:\n",
295 | " iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n",
296 | " print(f'Iris API returned {len(iris_results)} domains')\n",
297 | "\n",
298 | " # save search results to disk to be used later\n",
299 | " if save_search_to_disk:\n",
300 | " with open(json_file_path, 'w') as f:\n",
301 | " json.dump(iris_results, f)\n",
302 | "else:\n",
303 | " with open(json_file_path) as json_data:\n",
304 | " iris_results = json.loads(json_data.read())\n",
305 | "\n",
306 | " print(f'Loaded {len(iris_results)} domains from {json_file_path}')"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 44,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "config = Config()\n",
316 | "\n",
317 | "# exclude certain infrastructure from graph\n",
318 | "# config.exclude_list = [\"EMAIL DOMAIN\"]\n",
319 | "config.exclude_list = []\n",
320 | "\n",
321 | "# only show infrastructure that is under the pivot threshold\n",
322 | "config.pivot_threshold = 50000\n",
323 | "\n",
324 | "# Minimum should be 1 which means more than one domain has to show up in an edge\n",
325 | "config.edge_threshold = 1\n",
326 | "\n",
327 |     "# set whether or not to set node size to the unique number of domains in the edge\n",
328 | "config.node_size = True"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 45,
334 | "metadata": {
335 | "collapsed": false,
336 | "jupyter": {
337 | "outputs_hidden": false
338 | },
339 | "pycharm": {
340 | "name": "#%%\n"
341 | }
342 | },
343 | "outputs": [
344 | {
345 | "name": "stdout",
346 | "output_type": "stream",
347 | "text": [
348 | "6\n"
349 | ]
350 | }
351 | ],
352 | "source": [
353 | "graph, config = build_infra_graph(iris_results, config)\n",
354 | "\n",
355 | "print(len(graph.nodes))"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 46,
361 | "metadata": {
362 | "collapsed": false,
363 | "jupyter": {
364 | "outputs_hidden": false
365 | },
366 | "pycharm": {
367 | "name": "#%%\n"
368 | }
369 | },
370 | "outputs": [
371 | {
372 | "data": {
373 | "application/vnd.jupyter.widget-view+json": {
374 | "model_id": "98caf40adfc84751a057c517b69975bb",
375 | "version_major": 2,
376 | "version_minor": 0
377 | },
378 | "text/plain": [
379 | "VBox(children=(FigureWidget({\n",
380 | " 'data': [{'hoverinfo': 'none',\n",
381 | " 'line': {'color': '#888', 'widt…"
382 | ]
383 | },
384 | "metadata": {},
385 | "output_type": "display_data"
386 | }
387 | ],
388 | "source": [
389 | "build_2d_graph_layout(graph, config)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 47,
395 | "metadata": {
396 | "scrolled": true,
397 | "tags": []
398 | },
399 | "outputs": [
400 | {
401 | "data": {
402 | "application/vnd.jupyter.widget-view+json": {
403 | "model_id": "8f6935bd23bd491c9b8446814721dfdd",
404 | "version_major": 2,
405 | "version_minor": 0
406 | },
407 | "text/plain": [
408 | "VBox(children=(FigureWidget({\n",
409 | " 'data': [{'hoverinfo': 'none',\n",
410 | " 'line': {'color': 'rgb(125,125,…"
411 | ]
412 | },
413 | "metadata": {},
414 | "output_type": "display_data"
415 | }
416 | ],
417 | "source": [
418 | "build_3d_graph_layout(graph, config)"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 48,
424 | "metadata": {
425 | "tags": []
426 | },
427 | "outputs": [
428 | {
429 | "name": "stdout",
430 | "output_type": "stream",
431 | "text": [
432 | "18\n"
433 | ]
434 | }
435 | ],
436 | "source": [
437 | "pair_graph, pair_config = build_pair_infra_graph(iris_results, config)\n",
438 | "\n",
439 | "print(len(pair_graph.nodes))"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 49,
445 | "metadata": {},
446 | "outputs": [
447 | {
448 | "data": {
449 | "application/vnd.jupyter.widget-view+json": {
450 | "model_id": "a601dde8107842a39561970e8fb5f981",
451 | "version_major": 2,
452 | "version_minor": 0
453 | },
454 | "text/plain": [
455 | "VBox(children=(FigureWidget({\n",
456 | " 'data': [{'hoverinfo': 'none',\n",
457 | " 'line': {'color': '#888', 'widt…"
458 | ]
459 | },
460 | "metadata": {},
461 | "output_type": "display_data"
462 | }
463 | ],
464 | "source": [
465 | "build_2d_graph_layout(pair_graph, pair_config)"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 50,
471 | "metadata": {},
472 | "outputs": [
473 | {
474 | "data": {
475 | "application/vnd.jupyter.widget-view+json": {
476 | "model_id": "d4f902e406d8446c9b6eb441fde9f99d",
477 | "version_major": 2,
478 | "version_minor": 0
479 | },
480 | "text/plain": [
481 | "VBox(children=(FigureWidget({\n",
482 | " 'data': [{'hoverinfo': 'none',\n",
483 | " 'line': {'color': 'rgb(125,125,…"
484 | ]
485 | },
486 | "metadata": {},
487 | "output_type": "display_data"
488 | }
489 | ],
490 | "source": [
491 | "build_3d_graph_layout(pair_graph, pair_config)"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": null,
497 | "metadata": {
498 | "collapsed": false,
499 | "jupyter": {
500 | "outputs_hidden": false
501 | },
502 | "pycharm": {
503 | "name": "#%%\n"
504 | }
505 | },
506 | "outputs": [],
507 | "source": []
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": null,
512 | "metadata": {
513 | "collapsed": false,
514 | "jupyter": {
515 | "outputs_hidden": false
516 | },
517 | "pycharm": {
518 | "name": "#%%\n"
519 | }
520 | },
521 | "outputs": [],
522 | "source": []
523 | }
524 | ],
525 | "metadata": {
526 | "kernelspec": {
527 | "display_name": "Python 3 (ipykernel)",
528 | "language": "python",
529 | "name": "python3"
530 | },
531 | "language_info": {
532 | "codemirror_mode": {
533 | "name": "ipython",
534 | "version": 3
535 | },
536 | "file_extension": ".py",
537 | "mimetype": "text/x-python",
538 | "name": "python",
539 | "nbconvert_exporter": "python",
540 | "pygments_lexer": "ipython3",
541 | "version": "3.8.10"
542 | }
543 | },
544 | "nbformat": 4,
545 | "nbformat_minor": 4
546 | }
--------------------------------------------------------------------------------
/infrastructure_cat_module.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re
4 | import json
5 | import math
6 | from difflib import SequenceMatcher
7 | import plotly.graph_objects as go
8 | import requests
9 | import networkx as nx
10 | import pandas as pd
11 | import numpy as np
12 | import scipy
13 | import matplotlib
14 | import matplotlib.pyplot as plt
15 | from ipywidgets import interactive, HBox, VBox
16 | import ipywidgets as widgets
17 | from IPython.display import HTML, display
18 | import tabulate
19 | from dotenv import dotenv_values
20 | from domaintools import API
21 | from configparser import ConfigParser
22 |
23 | import itertools
27 |
28 | # load REST API creds from .env file
29 | dcat_config = dotenv_values(".env")
30 |
31 |
32 | def show_iris_query_ui(domain_list_ui, search_hash_ui):
33 | lookup_ui = widgets.VBox([
34 |         widgets.Label(value="Enter a return-delimited list of domains to look up (no commas, no quotes)"),
35 | domain_list_ui,
36 | widgets.Label(value="Or..."),
37 |         widgets.Label(value="Enter an Iris search hash to look up"),
38 | search_hash_ui,
39 | ])
40 | return lookup_ui
41 |
42 |
43 | def clean_domain_list(domain_list_ui):
44 | # remove any quotes, spaces, or defanging square brackets
45 |     full_domain_list = (domain_list_ui.value.strip()
46 |                         .replace(' ', '').replace('"', '').replace("'", "")
47 |                         .replace('[', '').replace(']', ''))
48 | # replace commas with new lines
49 | full_domain_list = full_domain_list.replace(",", "\n")
50 | # update the widget
51 | domain_list_ui.value = full_domain_list
52 | # split into array
53 | return full_domain_list.split("\n")
54 |
55 |
56 | def get_rest_api_creds(api_username_ui, api_pw_ui):
57 | api_username = api_username_ui.value
58 | if len(api_username) == 0:
59 | api_username = dcat_config["IRIS_API_USERNAME"]
60 | api_key = api_pw_ui.value
61 | if len(api_key) == 0:
62 | api_key = dcat_config["IRIS_API_KEY"]
63 | return api_username, api_key
64 |
65 |
66 | def query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui):
67 | api_username, api_key = get_rest_api_creds(api_username_ui, api_pw_ui)
68 | api = API(api_username, api_key)
69 | if len(domain_list_ui.value) > 0:
70 | # split list of domains into groups of 100 because of API restrictions
71 | results = []
72 | full_domain_list = clean_domain_list(domain_list_ui)
73 | max_domains = 100
74 | start = 0
75 | end = max_domains
76 | for _ in range(math.ceil(len(full_domain_list) / max_domains)):
77 | # slice out max domains to query
78 | partial_domain_list = full_domain_list[start:end]
79 | # build query string
80 | domain_list = ",".join(partial_domain_list)
81 | iris_query = {"domains": domain_list}
82 | # query rest api
83 | print(f"...querying Iris REST API for {len(partial_domain_list)} domains")
84 | iris_results = api.iris_investigate(**iris_query)
85 | # build up the set of return domain objects
86 | results += iris_results.response().get('results', {})
87 | # update slice indexes
88 | start = end
89 | end += max_domains
90 | return results
91 | elif len(search_hash_ui.value) > 0:
92 | iris_query = {"search_hash": search_hash_ui.value}
93 | iris_results = api.iris_investigate(**iris_query)
94 | # print(iris_results.status)
95 | iris_results = iris_results.response().get('results', {})
96 | return iris_results
97 | else:
98 |         print(
99 |             "Domain List and Search Hash text boxes are empty. Please enter either a list of domains or a search hash to look up")
100 | raise Exception("Domain List and Search Hash text boxes are empty")
101 |
102 |
103 | class Config(object):
104 | """ Little helper class to hold all the config values"""
105 |
106 |
107 | class Domain(object):
108 | """ Little helper class to hold the domain name and risk score
109 | """
110 |
111 | def __init__(self, domain_json):
112 | self.json = domain_json
113 | self.name = domain_json["domain"]
114 | self.risk_score = domain_json["domain_risk"]['risk_score']
115 | self.pivots = {}
116 | self.label = f"{self.name} ({self.risk_score})"
117 |
118 | def __str__(self):
119 | return f"name: {self.name}, risk: {self.risk_score}"
120 |
121 | def __repr__(self):
122 | return str(self)
123 |
124 |
125 | class DomainRelationship(object):
126 | def __init__(self, weight: float, category: str):
127 | # this is the maximum weight that an edge can have.
128 | # Adjust this if you want to play around with stronger edge weights
129 | self.max_weight = 5.0
130 | self.weight = weight
131 | self.categories = [category]
132 |
133 | def __str__(self):
134 | return f"weight: {self.weight}, categories: {self.categories}"
135 |
136 | def __repr__(self):
137 | return str(self)
138 |
139 | def add(self, weight: float, category: str):
140 | """ Note: certain pivot categories can be added more than once for 2 domains;
141 | things like IP and name server. For example, two domains could be on the same set of 5
142 |         IP addresses. For now the weights are just summed if there is more than one pivot of
143 | the same category, but maybe we need a different strategy. Since IPs have multiple pivots
144 | (ip address, country code, asn, isp) this means if there were 5 shared IPs between two
145 | domains, the weight would be: 4 * 5 * pivot_weight.
146 |         This might over-amplify the edge strength.
147 | """
148 | if category not in self.categories:
149 | # this helps by not overly boosting the edge weight if two domains share
150 |             # multiple IP addresses
151 | self.weight += weight
152 | self.weight = min(self.weight, self.max_weight)
153 | self.categories.append(category)
154 |
155 | def get_description(self):
156 |         return "<br>".join(sorted(self.categories))
157 |
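# A minimal sketch (not part of the original workflow) showing how the weight cap
# in DomainRelationship.add() keeps many shared pivots from dominating a single
# edge. The pivot categories below are hypothetical illustration data.
def _demo_relationship_weight_cap():
    rel = DomainRelationship(2.0, "ip_address")
    rel.add(2.0, "ip_country_code")  # new category: weights sum to 4.0
    rel.add(2.0, "ip_isp")           # sum would be 6.0, clamped to max_weight 5.0
    rel.add(2.0, "ip_isp")           # repeated category: ignored, weight stays 5.0
    return rel.weight, rel.categories  # (5.0, ['ip_address', 'ip_country_code', 'ip_isp'])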
158 |
159 | class Pivot(object):
160 | def __init__(self, category, value, global_count):
161 | self.category = category
162 | self.value = value
163 | self.global_count = global_count
164 | self.domains = set()
165 |
166 | # def union(self, other: "Pivot"):
167 | # self.domains.union(other.domains)
168 |
169 | def label(self):
170 | # return f"category: {self.category}: value: {self.value} ({self.global_count})"
171 | return f"{self.category}: {self.value} ({self.global_count})"
172 |
173 | def __str__(self):
174 | return f"category: {self.category}, " \
175 | f"value: {self.value}, " \
176 | f"global_count: {self.global_count}, " \
177 | f"domains: {self.domains}"
178 |
179 | def __repr__(self):
180 | return str(self)
181 |
182 |
183 | # build graph
184 | def get_edge_count(n: int):
185 | # for a complete graph, the edge count is: n(n-1)/2
186 | return n * (n - 1) / 2
187 |
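# Worked example: 5 fully connected domains yield 5 * 4 / 2 = 10 edges, while
# 100 domains yield 100 * 99 / 2 = 4950 -- quadratic growth, which is why the
# pivot and edge thresholds in Config matter for keeping the graphs readable.
#   get_edge_count(5)    # -> 10.0
#   get_edge_count(100)  # -> 4950.0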
188 |
189 | # def pivot_on_matching_substrings(graph: "Graph", domains: dict, config: "Config"):
190 | # """Create pivots between domains that share a common substring of
191 | # `config.longest_common_substring` chars long.
192 | #
193 | # Note: SequenceMatcher has some known issues with not finding the longest match in very long
194 | # strings, but does a pretty good job with shorter strings such as domain names.
195 | # https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings
196 | # """
197 | # domain_names = list(domains.keys())
198 | # for x in range(len(domain_names)):
199 | # domain1 = domain_names[x]
200 | # string1 = domain1.split('.')[0]
201 | # # pull out substrings to ignore
202 | # if config.ignore_substrings and len(config.ignore_substrings) > 0:
203 | # for ignore in config.ignore_substrings:
204 | # string1 = string1.replace(ignore, "")
205 | # for y in range(x + 1, len(domain_names)):
206 | # domain2 = domain_names[y]
207 | # string2 = domain2.split('.')[0]
208 | # # pull out substrings to ignore
209 | # if config.ignore_substrings and len(config.ignore_substrings) > 0:
210 | # for ignore in config.ignore_substrings:
211 | # string2 = string2.replace(ignore, "")
212 | # # find the longest common substring between the two domains
213 | # matcher = SequenceMatcher(None, string1, string2, False)
214 | # match = matcher.find_longest_match(0, len(string1), 0, len(string2))
215 | # longest_match = string1[match.a: match.a + match.size]
216 | # # check if the matching substring is long enough
217 | # if len(longest_match) >= config.longest_common_substring:
218 | # # add pivots
219 | # _append_value_to_pivot(
220 | # graph,
221 | # "longest_common_substring",
222 | # longest_match, None,
223 | # domains[domain1], config)
224 | # _append_value_to_pivot(
225 | # graph,
226 | # "longest_common_substring",
227 | # longest_match, None,
228 | # domains[domain2], config)
229 |
230 |
231 | def build_pivot_graph(iris_results: list, config: "Config"):
232 |     """ Main workflow function that takes the results from an Iris Investigate query and
233 |     builds a graph of how the domains in the query are connected to each other"""
234 |
235 | # parse the Iris API Result to build the pivot data structure
236 | graph, domains = init_local_pivot_graph(iris_results, config)
237 | print(len(graph.nodes))
238 | print()
239 |
240 | # normalize registrar pivots (see note in function comments)
241 | # if "registrar" in pivot_categories and config.normalize_registrars:
242 | # normalize_similar_registrars(pivot_categories["registrar"])
243 |
244 | # create pivots for longest common substrings
245 | # pivot_on_matching_substrings(graph, domains, config)
246 | # print(len(graph.nodes))
247 | # print()
248 |
249 | # trim pivots from graph that have less than the set count threshold or contain all domains
250 | # graph = trim_pivots(graph, len(domains), config)
251 | # print(len(graph.nodes))
252 | # print()
253 |
254 | # trim unconnected domains and domains with only a create date pivot
255 | # TURBO: I'm not sure yet how to do this
256 | # trimmed_unconnected_domains = trim_unconnected_domains(graph, domains, config)
257 | # print(len(graph.nodes))
258 | # print()
259 |
260 | # trimmed_create_date_domains = trim_domains_with_only_create_date_pivot(graph, pivot_categories)
261 | # print(len(graph.nodes))
262 | # print()
263 |
264 | # print(f"{len(trimmed_unconnected_domains)} "
265 | # f"domains trimmed because they were not connected to other domains")
266 | # print(f"{len(trimmed_create_date_domains)} "
267 | # f"domains trimmed because create_date was the only pivot")
268 | print(f"{len(graph.nodes)} nodes in graph structure \n")
269 |
270 | # build the graph structure based on the domain pivots
271 | graph = build_local_pivot_graph(graph, domains, config)
272 | return (graph, domains,
273 | {
274 | # "unconnected": trimmed_unconnected_domains,
275 | # "create_date": trimmed_create_date_domains
276 | }
277 | )
278 |
279 |
280 | def get_pivots(data_obj, name, return_data=None, count=0, pivot_threshold=500):
281 | """
282 |     Recursively walks a nested data object, collecting (value, count) pairs whose count is greater than 1 and below the pivot threshold.
283 |     Args:
284 |         data_obj: Either a list or dict to inspect for pivot counts
285 |         name: pivot category name
286 |         return_data: Accumulates the pivots to return once the whole data_obj has been walked
287 |         count: Tracks recursion depth so we know when the top-level data_obj is finished
288 |         pivot_threshold: Maximum global count for a value to be included as a pivot.
289 | """
290 | if return_data is None:
291 | return_data = []
292 | count += 1
293 | if isinstance(data_obj, dict) and len(data_obj):
294 | temp_name = name
295 | for k, v in data_obj.items():
296 | if isinstance(data_obj[k], (dict, list)):
297 | name = "{}_{}".format(name, k)
298 | temp_data = get_pivots(
299 | data_obj[k], name, return_data, count, pivot_threshold
300 | )
301 | if temp_data:
302 | return_data.append([name[1:].upper().replace("_", " "), temp_data])
303 | name = temp_name
304 | if "count" in data_obj and (1 < data_obj["count"] < pivot_threshold):
305 | return data_obj["value"], data_obj["count"]
306 | elif isinstance(data_obj, list) and len(data_obj):
307 | for index, item in enumerate(data_obj):
308 | temp_data = get_pivots(item, name, return_data, count, pivot_threshold)
309 | if temp_data:
310 | if isinstance(temp_data, list):
311 | for x in temp_data:
312 | return_data.append(x)
313 | elif isinstance(temp_data, tuple):
314 | return_data.append([name[1:].upper().replace("_", " "), temp_data])
315 | count -= 1
316 | if count:
317 | return
318 | else:
319 | return return_data
320 |
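# A minimal usage sketch with illustrative data (not the full Iris API schema):
# get_pivots() flattens a nested Iris result into [CATEGORY, (value, count)]
# pairs, keeping only values whose global count is above 1 and below the threshold.
#
#   domain = {"ip": [{"address":      {"value": "1.2.3.4", "count": 42},
#                     "country_code": {"value": "US", "count": 900000}}]}
#   get_pivots(domain, "", pivot_threshold=500)
#   # -> [['IP ADDRESS', ('1.2.3.4', 42)]]  (country_code exceeds the threshold)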
321 |
322 | def build_infra_graph(iris_results: list, config: "Config"):
323 | graph = nx.Graph()
324 | pv_dict = {}
325 | config.domain_risk_dict = {}
326 | for domain in iris_results:
327 | if domain["domain"] not in config.domain_risk_dict:
328 | config.domain_risk_dict[domain["domain"]] = domain.get("domain_risk", {}).get("risk_score", 0)
329 | # GET PIVOTS
330 | nps = get_pivots(domain, "", pivot_threshold=config.pivot_threshold)
331 | pv_list = []
332 | for p in nps:
333 | if p[0] not in config.exclude_list:
334 | pv_list.append("{}_{}".format(p[0], p[1][0]))
335 | # CREATE POSSIBLE NODES AND POSSIBLE EDGES
336 | x = itertools.combinations(pv_list, 2)
337 | for g in x:
338 | if "{}:::{}".format(g[0], g[1]) in pv_dict:
339 | if domain["domain"] not in pv_dict["{}:::{}".format(g[0], g[1])]:
340 | pv_dict["{}:::{}".format(g[0], g[1])].append(domain["domain"])
341 | else:
342 | pv_dict["{}:::{}".format(g[0], g[1])] = [domain["domain"]]
343 |
344 | b_pv_list = []
345 | my_set = set()
346 |
347 | # FILTER OUT EDGES THAT DON'T MEET THRESHOLD
348 | for k, v in pv_dict.items():
349 | if len(v) > config.edge_threshold:
350 | a = k.split(":::")
351 | b_pv_list.append([a[0], a[1], v, len(v)])
352 | my_set.add(a[0])
353 | my_set.add(a[1])
354 | # print(k, v, len(v))
355 |
356 | # CREATE NODES
357 | for m in my_set:
358 | graph.add_node(m, color='blue', size=0)
359 |
360 | # CREATE EDGES
361 | for m in b_pv_list:
362 | graph.add_edge(m[0], m[1], domains=m[2], length=m[3])
363 | return graph, config
364 |
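# Sketch of the ":::" edge-key scheme used above, with hypothetical pivot values:
# every pair of pivots seen on the same domain becomes one key in pv_dict, and the
# domains sharing that pair become edge data once len(v) clears config.edge_threshold.
#
#   pv_list = ["IP ADDRESS_1.2.3.4", "NAME SERVER HOST_ns1.example.net"]
#   # itertools.combinations(pv_list, 2) yields one pair, stored under the key
#   # "IP ADDRESS_1.2.3.4:::NAME SERVER HOST_ns1.example.net"; splitting the key
#   # back on ":::" turns the two pivot values into nodes joined by an edge.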
365 |
366 | def build_pair_infra_graph(iris_results: list, config: "Config"):
367 | graph = nx.Graph()
368 | pv_dict = {}
369 | config.domain_risk_dict = {}
370 | for domain in iris_results:
371 | if domain["domain"] not in config.domain_risk_dict:
372 | config.domain_risk_dict[domain["domain"]] = domain.get("domain_risk", {}).get("risk_score", 0)
373 | # GET PIVOTS
374 | nps = get_pivots(domain, "", pivot_threshold=config.pivot_threshold)
375 | pv_list = [
376 | "{}_{}".format(p[0], p[1][0])
377 | for p in nps
378 | if p[0] not in config.exclude_list
379 | ]
380 |
381 | # CREATE POSSIBLE NODES AND POSSIBLE EDGES
382 | x = itertools.combinations(pv_list, 2)
383 | # print(x)
384 | i_list = []
385 | for g in x:
386 | # print("{}:::{}".format(g[0], g[1]))
387 | if "{}:::{}".format(g[0], g[1]) not in i_list and g[0] != g[1]:
388 | i_list.append("{}:::{}".format(g[0], g[1]))
389 | y = itertools.combinations(i_list, 2)
390 | for g in y:
391 |
392 | if "{}|||{}".format(g[0], g[1]) in pv_dict:
393 | if domain["domain"] not in pv_dict["{}|||{}".format(g[0], g[1])]:
394 | pv_dict["{}|||{}".format(g[0], g[1])].append(domain["domain"])
395 | else:
396 | pv_dict["{}|||{}".format(g[0], g[1])] = [domain["domain"]]
397 | # print(pv_dict)
398 | b_pv_list = []
399 | my_set = set()
400 |
401 | # FILTER OUT EDGES THAT DON'T MEET THRESHOLD
402 | for k, v in pv_dict.items():
403 | if len(v) > config.edge_threshold:
404 | a = k.split("|||")
405 | if a[0] != a[1]:
406 | b_pv_list.append([a[0], a[1], v, len(v)])
407 | my_set.add(a[0])
408 | my_set.add(a[1])
409 | # print(k, v, len(v))
410 |
411 | # CREATE NODES
412 | for m in my_set:
413 | graph.add_node(m, color='blue', size=0)
414 |
415 | # CREATE EDGES
416 | for m in b_pv_list:
417 | graph.add_edge(m[0], m[1], domains=m[2], length=m[3])
418 | return graph, config
419 |
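# Sketch of the pair-graph keying above (hypothetical values): each node is itself
# a ":::"-joined pivot pair, and "|||" joins two such pairs into an edge key, so
# edges connect pivot pairs that co-occur on the same domains.
#
#   pair_a = "IP ADDRESS_1.2.3.4:::NAME SERVER HOST_ns1.example.net"
#   pair_b = "IP ADDRESS_1.2.3.4:::REGISTRAR_some registrar"
#   # pv_dict key: pair_a + "|||" + pair_b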
420 |
421 | def calc_viz_layout(layout: str, graph: "Graph", dimension: int):
422 | # KK layout only
423 | if layout == "kk":
424 | return nx.layout.kamada_kawai_layout(graph, dim=dimension)
425 |
426 | # spring layout only
427 | if layout == "fr":
428 | return nx.layout.spring_layout(graph, dim=dimension)
429 |
430 | # kk layout as initialization for spring layout
431 | if layout == "kk_to_fr":
432 | pos = nx.layout.kamada_kawai_layout(graph, dim=dimension, weight=None)
433 | return nx.layout.spring_layout(graph, pos=pos, dim=dimension)
434 |
435 | # spring layout as initialization for kk layout
436 | if layout == "fr_to_kk":
437 | pos = nx.layout.spring_layout(graph, dim=dimension)
438 | return nx.layout.kamada_kawai_layout(graph, pos=pos, dim=dimension)
439 | raise Exception("invalid layout choice")
440 |
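# Usage sketch: both layout builders below call this with "kk_to_fr", seeding the
# Fruchterman-Reingold spring layout with Kamada-Kawai positions, which tends to
# untangle dense graphs better than either algorithm alone.
#
#   g = nx.complete_graph(4)
#   pos = calc_viz_layout("kk_to_fr", g, dimension=3)
#   # pos maps each node to an array of 3 coordinates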
441 |
442 | def average_risk_score(domain_list, domain_dict):
443 | total = sum(domain_dict[d] for d in domain_list)
444 | avg_risk_score = int(total / len(domain_list))
445 | # print(avg_risk_score)
446 | if avg_risk_score >= 90:
447 | color = 'red'
448 | elif avg_risk_score >= 75:
449 | color = 'orange'
450 | elif avg_risk_score >= 55:
451 | color = 'yellow'
452 | else:
453 | color = 'green'
454 | return color, avg_risk_score
455 |
456 |
457 | def build_3d_graph_layout(graph: "Graph", config):
458 | """ Build the graph layout based on the specified algorithm and get the node positions
459 | in xyz dimensions"""
460 |
461 | pos = calc_viz_layout("kk_to_fr", graph, 3)
462 |
463 |     node_labels, node_risk_scores, node_size, names = [], [], [], []
464 |     Xn, Yn, Zn = [], [], []
465 | for node in graph.nodes(data=True):
466 | # build x,y,z coordinates data structure for nodes
467 | Xn.append(pos[node[0]][0])
468 | Yn.append(pos[node[0]][1])
469 | Zn.append(pos[node[0]][2])
470 | domain_set = set()
471 | for e in graph.edges(node[0], data=True):
472 | domain_set.update(e[2]['domains'])
473 | domain_list = list(domain_set)
474 | color, avg_risk_score = average_risk_score(domain_list, config.domain_risk_dict)
475 |         node_labels.append(
476 |             "{}<br>Avg Risk Score: {}<br>Number of unique domains on edges: {}".format(
477 |                 node[0], avg_risk_score, len(domain_list)))
478 | node_risk_scores.append(color)
479 | node_size.append(len(domain_list))
480 | names.append(domain_list)
481 |
482 | if not config.node_size:
483 | node_size = 6
484 |
485 | # build x,y,z coordinates data structure for edges
486 | Xe, Ye, Ze = [], [], []
487 | for e in graph.edges:
488 | u = pos[e[0]]
489 | v = pos[e[1]]
490 | Xe += [u[0], v[0], None]
491 | Ye += [u[1], v[1], None]
492 | Ze += [u[2], v[2], None]
493 |
494 | # Create the 3d Plotly graph and render it
495 | # build line objects for our edges
496 | trace1 = go.Scatter3d(x=Xe, y=Ye, z=Ze,
497 | mode='lines',
498 | name='domains',
499 | line=dict(color='rgb(125,125,125)', width=0.5),
500 | opacity=0.9,
501 | hoverinfo='none')
502 |
503 | trace2 = go.Scatter3d(
504 | x=Xn, y=Yn, z=Zn,
505 | mode='markers',
506 | name='pivots',
507 | marker=dict(
508 | symbol='circle',
509 | size=node_size,
510 | color=node_risk_scores,
511 | line=dict(color='rgb(50,50,50)', width=0.5),
512 | ),
513 | text=node_labels,
514 | hoverinfo='text')
515 |
516 | # background definition, but everything is turned off
517 | axis = dict(showbackground=False,
518 | showline=False,
519 | zeroline=False,
520 | showgrid=False,
521 | showticklabels=False,
522 | title='')
523 |
524 | layout = go.Layout(
525 | title=f"Graph of interconnected infrastructure ({len(node_labels)} infra nodes)",
526 | width=1000, height=1000,
527 | showlegend=False,
528 | scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)),
529 | margin=dict(t=100), hovermode='closest')
530 |
531 | data = [trace1, trace2]
532 | fig = go.FigureWidget(data=data, layout=layout)
533 |
534 | # handle selection of domains
535 | # def node_selection_fn(trace, points, selector):
536 | # selected_domains = [names[idx] for idx in points.point_inds]
537 | # update_selected_domains(selected_domains)
538 |
539 | # handle node click events
540 | def node_click_fn(trace, points, selector):
541 | if len(points.point_inds) > 1:
542 | print(f"node_click passed in more than 1 point: {points.point_inds}")
543 |
544 | # clear the old selected points
545 | # trace.selectedpoints = []
546 | # if len(points.point_inds) == 0:
547 | # return
548 |
549 | # get the list of selected domain names
550 | selected_domains = [names[idx] for idx in points.point_inds]
551 | # for id in points.point_inds:
552 | # selected_domains = selected_domains + trace.customdata[id]
553 |
554 | # set the new selected points
555 | # don't like having to loop in a loop to get the domain index, but I don't know a better way
556 | # trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[id]]
557 |
558 | update_selected_domains(selected_domains)
559 |
560 | def update_selected_domains(selected_domains):
561 | if len(selected_domains) == 0:
562 | return
563 |
564 | # sort domains by length, then alpha
565 | selected_domains.sort(key=len, reverse=True)
566 | with out:
567 | # write selected domains to the output widget
568 | print(f"Selected Infra: ({len(selected_domains)})\n")
569 | for selected_domain in selected_domains:
570 | print(selected_domain)
571 | out.clear_output(wait=True)
572 |
573 | # calc pivots selected domains have in common
574 | # get_2d_shared_pivots(graph, selected_domains)
575 |
576 | # event handler for node selection
577 | # fig.data[1].on_selection(node_selection_fn)
578 | # event handle for node click
579 | fig.data[1].on_click(node_click_fn)
580 |
581 | # Create a table FigureWidget that updates the list of selected domains
582 | out = widgets.Output(layout={'border': '1px solid black'})
583 | domain_ui = widgets.VBox((fig, out))
584 | return domain_ui
585 |
586 |
587 | def build_2d_graph_layout(graph: "Graph", config):
588 |     """ Build the graph layout based on the specified algorithm and get the node positions
589 | in xy dimensions"""
590 | pos = calc_viz_layout("kk_to_fr", graph, 2)
591 | # pos = calc_viz_layout("fr_to_kk", g, 2)
592 |
593 | # build edge data
594 | edge_x, edge_y = [], []
595 | for e in graph.edges():
596 | x0, y0 = pos[e[0]]
597 | x1, y1 = pos[e[1]]
598 | edge_x.append(x0)
599 | edge_x.append(x1)
600 | edge_x.append(None)
601 | edge_y.append(y0)
602 | edge_y.append(y1)
603 | edge_y.append(None)
604 |
605 | # create edge scatter plot
606 | edge_trace = go.Scatter(
607 | x=edge_x, y=edge_y,
608 | line=dict(width=0.5, color='#888'),
609 | hoverinfo='none',
610 | mode='lines',
611 | opacity=0.6
612 | )
613 |
614 | # build node data
615 |     node_adjacencies, node_risk_scores, node_labels, node_size, node_x, node_y = [], [], [], [], [], []
616 | names = list(graph.nodes)
617 | for name in graph.nodes(data=True):
618 | domain = graph.nodes[name[0]]
619 | x, y = pos[name[0]]
620 | node_x.append(x)
621 | node_y.append(y)
622 | # get the domain's connected nodes
623 | neighbors = list(graph.neighbors(name[0]))
624 | node_adjacencies.append(neighbors)
625 | domain_set = set()
626 | for e in graph.edges(name[0], data=True):
627 | domain_set.update(e[2]['domains'])
628 | domain_list = list(domain_set)
629 | color, avg_risk_score = average_risk_score(domain_list, config.domain_risk_dict)
630 |         node_labels.append(
631 |             "{}<br>Avg Risk Score: {}<br>Number of unique domains on edges: {}".format(
632 |                 name[0], avg_risk_score, len(domain_list)))
633 | node_risk_scores.append(color)
634 | node_size.append(len(domain_list))
635 | names.append(domain_list)
636 |
637 | if not config.node_size:
638 | node_size = 6
639 |
640 | # build node scatter plot
641 | node_trace = go.Scatter(
642 | x=node_x, y=node_y,
643 | mode='markers',
644 | hoverinfo='text',
645 | text=node_labels,
646 | customdata=node_adjacencies,
647 | marker=dict(
648 | showscale=True,
649 | reversescale=True,
650 | color=node_risk_scores,
651 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']],
652 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color
653 | cmin=0, cmax=100,
654 | size=node_size,
655 | colorbar=dict(
656 | thickness=15,
657 | title='Risk Score',
658 | xanchor='left',
659 | titleside='right'
660 | ),
661 | line_width=2))
662 |
663 | # create the jup widget holder for plotly
664 | fig = go.FigureWidget(
665 | [edge_trace, node_trace],
666 | layout=go.Layout(
667 | title=f'Graph of interconnected infrastructure ({len(node_labels)} infra nodes)',
668 | titlefont_size=16,
669 | showlegend=False,
670 | hovermode='closest',
671 | margin=dict(b=5, l=5, r=5, t=30),
672 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
673 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
674 | )
675 |
676 | # handle selection of domains
677 | def node_selection_fn(trace, points, selector):
678 | selected_domains = [names[idx] for idx in points.point_inds]
679 | update_selected_domains(selected_domains)
680 |
681 | # handle node click events
682 | def node_click_fn(trace, points, selector):
683 | if len(points.point_inds) > 1:
684 | print(f"node_click passed in more than 1 point: {points.point_inds}")
685 |
686 | # clear the old selected points
687 | trace.selectedpoints = []
688 | if len(points.point_inds) == 0:
689 | return
690 |
691 | # get the list of selected domain names
692 | selected_domains = [names[idx] for idx in points.point_inds]
693 | for id in points.point_inds:
694 | selected_domains = selected_domains + trace.customdata[id]
695 |
696 | # set the new selected points
697 | # don't like having to loop in a loop to get the domain index, but I don't know a better way
698 | trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[id]]
699 |
700 | update_selected_domains(selected_domains)
701 |
702 | def update_selected_domains(selected_domains):
703 |         if len(selected_domains) == 0:
704 |             return
705 |
706 | # sort domains by length, then alpha
707 | selected_domains.sort(key=len, reverse=True)
708 | with out:
709 | # write selected domains to the output widget
710 | print(f"Selected Infra: ({len(selected_domains)})\n")
711 | for selected_domain in selected_domains:
712 | print(selected_domain)
713 | out.clear_output(wait=True)
714 |
715 |
716 | # event handler for node selection
717 | fig.data[1].on_selection(node_selection_fn)
718 | # event handle for node click
719 | fig.data[1].on_click(node_click_fn)
720 |
721 | # Create a table FigureWidget that updates the list of selected domains
722 | out = widgets.Output(layout={'border': '1px solid black'})
723 | domain_ui = widgets.VBox((fig, out))
724 | return domain_ui
725 |
726 |
727 | def get_shared_pivots(graph: "Graph", selected_domains: list):
728 | shared_pivots = {}
729 | for name in selected_domains:
730 | domain = graph.nodes[name]["domain"]
731 | for cat in domain.pivot_categories:
732 | for cat_value in domain.pivot_categories[cat]:
733 | key = f"{cat}: {cat_value}"
734 | if key not in shared_pivots:
735 | shared_pivots[key] = []
736 | shared_pivots[key].append(domain)
737 |
738 | # filter by pivots that have >= n domains
739 | shared_pivots = {k: v for k, v in shared_pivots.items() if len(v) >= 3}
740 | return shared_pivots
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | ipywidgets>=7.5
3 | networkx
4 | plotly==4.14.3
5 | tabulate
6 | numpy
7 | scipy
8 | matplotlib
9 | pandas
10 | python-dotenv
11 | domaintools-api
--------------------------------------------------------------------------------