├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── data
│   ├── dash_gov_dot_us.json
│   └── treatment_care.json
├── documentation
│   ├── todo.md
│   └── tutorial.md
├── domain_cat.ipynb
├── domain_cat_module.py
├── images
│   ├── 2d_click.gif
│   ├── 2d_v1.png
│   ├── 2d_zoom.gif
│   ├── 2d_zoom_select.gif
│   ├── 3d_infra.gif
│   ├── 3d_v1.gif
│   ├── 3d_v2.gif
│   ├── build_graph.png
│   ├── config.png
│   ├── credentials.png
│   ├── dash_gov.us_substrings.gif
│   ├── domain_data.png
│   ├── domain_graph_2d.png
│   ├── intro_3d.gif
│   ├── iris.png
│   ├── iris_small.png
│   ├── jupyter_cell.png
│   ├── pivot_heatmap.png
│   ├── pivot_stats.png
│   ├── pivot_value_heatmap.png
│   ├── reading_data.png
│   ├── run_3d.png
│   ├── run_heatmap.png
│   ├── running_a_cell.gif
│   ├── selected_domains.png
│   └── trimmed_domains.png
├── infrastructure_cat.ipynb
├── infrastructure_cat_module.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # dotenv file
7 | .env
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # PyCharm
135 | .idea/
136 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:latest
2 | # install the basics
3 | RUN apt-get update && apt-get -y upgrade
4 | RUN apt-get install -y build-essential python3.6 python3-pip python3-dev
5 | RUN pip3 -q install pip --upgrade
6 |
7 | # install nodejs v12
8 | RUN apt-get install -y curl dirmngr apt-transport-https lsb-release ca-certificates
9 | RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
10 | RUN apt-get install -y nodejs
11 | RUN apt-get install -y gcc g++ make
12 | RUN node --version
13 | RUN npm --version
14 |
15 | # copy dependency files
16 | RUN mkdir src
17 | WORKDIR src/
18 | COPY requirements.txt .
19 |
20 | # install Jupyter, domaincat requirements, and widget extensions
21 | RUN pip3 install -r requirements.txt
22 | ENV NODE_OPTIONS=--max-old-space-size=4096
23 | RUN jupyter labextension install jupyterlab-plotly@4.14.3 --no-build
24 | RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build
25 | RUN jupyter labextension install plotlywidget@4.14.3 --no-build
26 | RUN jupyter lab build --dev-build=False --minimize=False
27 | RUN npm cache clean --force
28 | ENV NODE_OPTIONS=
29 |
30 | # Rest of Files copied
31 | COPY . .
32 |
33 | # Run jupyter lab
34 | CMD ["jupyter", "lab", "--port=9999", "--no-browser", "--ip=0.0.0.0", "--allow-root"]
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 DomainTools
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DomainCAT (Domain Connectivity Analysis Tool)
2 |
3 | ## "See Connections Between Domains Right Meow"
4 |
5 | **The Domain Connectivity Analysis Tool is used to analyze aggregate connectivity patterns across a set of domains during security investigations**
6 |
7 | This project was a collaborative effort between [myself](https://www.linkedin.com/in/jconwell/) and [Matthew Pahl](https://www.linkedin.com/in/security-sme/)
8 |
9 | ## Introduction
10 |
11 | When analyzing pivots during threat hunting, most people approach it from the perspective of “what can a single
12 | pivot tell you?” But often actors will set their domains up to use commodity hosting infrastructure, so the number of
13 | entities associated with a given pivot is so large that it doesn’t really give you any useful information.
14 |
15 | This is where DomainCAT can help. Actors make decisions around domain registration and hosting options when setting
16 | up their malicious infrastructure. These can be considered behavioral choices.
17 | - What registrar(s) do they use?
18 | - What TLDs do they prefer?
19 | - What hosting provider(s) do they like?
20 | - What TLS cert authority do they use?
21 |
22 | All of these decisions, together, make up part of that actor’s infrastructure tools, tactics and procedures (TTPs),
23 | and we can analyze them as a whole to look for patterns across a set of domains.
24 |
25 | ### But wait there's more
26 |
27 | ### Introducing InfraCAT
28 |
29 | What if instead of nodes being domains, they were the infrastructure, and the edges were the connected domains? That was
30 | the thought process behind InfraCAT. By seeing clusters of infrastructure, you can see tightly coupled groups of domains
31 | based on the infrastructure they use.
32 |
33 | DomainCAT and InfraCAT are tools written in Jupyter Notebooks, a web-based interactive environment that lets you combine text,
34 | code, data, and interactive visualizations into your threat hunting toolbelt. The tool analyzes aggregate
35 | connectivity patterns across a set of domains looking at every pivot for every domain, asking; what are the shared
36 | pivots across these domains, how many shared pivots between each domain, do they have a small pivot count or a really
37 | large one? All of these aspects are taken into consideration as it builds out a connectivity graph that models how
38 | connected all the domains in an Iris search are to each other.
39 |
40 | ### Example Visualizations:
41 |
42 | #### 3D visualization of domain to domain connections based on shared infrastructure, registration and naming patterns
43 | 
44 |
45 | #### 2D visualization of domain to domain connections
46 | 
47 |
48 | #### 3D visualization of infra to infra connections
49 | 
50 |
51 | ## DomainCAT Tutorial
52 |
53 | #### Click here for the [DomainCAT Tutorial](documentation/tutorial.md) documentation
54 |
55 | ## Installation Steps: Docker (recommended)
56 |
57 | _Note: building the container takes a fair amount of RAM to compile the resources for the jupyterlab-plotly extension. Bump your RAM up in Docker preferences to around 4GB while building the container. Afterwards you can drop it back down to your normal level to run the container._
58 |
59 | ### Steps:
60 |
61 | Clone the git repository locally
62 |
63 | `$ git clone https://github.com/DomainTools/DomainCAT.git`
64 |
65 | Change directory to the domaincat folder
66 |
67 | `$ cd domaincat`
68 |
69 | Build the jupyter notebook container
70 |
71 | `$ docker build --tag domaincat .`
72 |
73 | Run the jupyter notebook
74 |
75 | `$ docker run -p 9999:9999 -v $(pwd)/data:/src/data --name domaincat domaincat`
76 |
77 | Mounting the data directory as a volume allows you to add new files to the container without having to rebuild it.
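Once the container is running, Jupyter prints a tokenized login URL to the container logs. One way to find it (the exact URL and token will differ):

```
$ docker logs domaincat
...
    http://127.0.0.1:9999/lab?token=<your_token_here>
```

Open that URL in a browser to get to the notebook.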
78 |
79 | ## Installation Steps: Manual (cross your fingers)
80 |
81 | _Note: this project uses JupyterLab Widgets, which requires nodejs >= 12.0.0 to be installed...which is on you_
82 |
83 | ### Steps:
84 |
85 | Clone the git repository locally
86 |
87 | `$ git clone https://github.com/DomainTools/DomainCAT.git`
88 |
89 | Change directory to the domaincat folder
90 |
91 | `$ cd domaincat`
92 |
93 | Install python libraries
94 |
95 | `$ pip install -r requirements.txt`
96 |
97 | JupyterLab widgets extension
98 |
99 | ```
100 | $ jupyter labextension install jupyterlab-plotly@4.14.3 --no-build
101 | $ jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build
102 | $ jupyter labextension install plotlywidget@4.14.3 --no-build
103 | $ jupyter lab build
104 | ```
105 |
106 | Run the jupyter notebook
107 |
108 | `$ jupyter lab`
109 |
110 | ___
111 |
112 | # Release Notes:
113 |
114 | October 25, 2021:
115 | - Initial support for InfraCAT
116 |
117 | August 24, 2021:
118 | - Adding a way to remove domains in the graph that you aren't interested in (look at the bottom of the notebook)
119 | - Refactor of the backend data structures to be a bit more efficient
120 |
121 | April 27, 2021:
122 | - Added support for `dotenv` to store REST API credentials in a `.env` file
123 | - Added logic to support
124 | - comma delimited list of domains
125 | - domains defanged with square brackets
126 |
127 | April 23, 2021:
128 | - Added config flag to only analyze active domains
129 | - Show count of selected domains
130 |
131 | April 19, 2021:
132 | - Bug fix to not normalize risk scores values when calculating node color
133 | - Mo'better sorting of selected domains
134 |
135 | April 15, 2021:
136 | - Bug fix: wrong json element returned when querying search hash
137 |
138 | April 14, 2021:
139 | - Added UI to search either a list of domain names or an Iris search hash
140 | - Added UI to enter Iris REST API username and password
141 |
142 | April 7, 2021:
143 | - Initial commit
144 |
145 | ___
146 |
147 | _Plotly Bug: in the 2D visualization of the domain graph there is a weird bug in the `Plotly` visualization library where,
148 | if your cursor is directly over the center of a node, the node's tooltip with the domain's name will disappear, and
149 | if you click the node, it unselects all nodes. So only click on a node if you can see its tooltip_
150 |
--------------------------------------------------------------------------------
/documentation/todo.md:
--------------------------------------------------------------------------------
1 |
2 | ## DomainCAT to do Tasks
3 | - refactor the code to just use the graph data structure as much as possible and less of the domain_list map
4 | - figure out how to make the create_date pivot a window over n days vs just 1 day
5 | - prune connections that are below some weight threshold
6 | - refactor append_values_with_count(s) functions to share logic
7 | - figure out a better way to normalize registrars
8 | - create a way to type a domain name and select that domain
9 | - create a way to type a pivot (category or value?) and select all domains that are connected
10 | - add every pivot possible. I mostly skipped the whois pivots because they aren't that useful anymore
11 | - address the comment in DomainRelationship.add. Essentially domains that share 2 or more IP addresses could potentially have their edge strength artificially boosted
12 | - maybe play around with normalizing edge weights once the graph is created, but before rendering
13 |
14 | ## Bugs to Fix
15 |
16 | ## Wish List
17 |
18 | when looking at domains that are probably related and created over a short period of time, it would be useful to have some viz that shows / groups the pivots per create date. That way you could see stuff like on day 1 TLD1 and registrar1 were used, then day 2 TLD1 and registrar2 were used, then day 3 TLD2 and registrar2 were used. That kind of thing
19 |
20 | Given a selection of domains, show what attributes they are NOT connected on
21 |
22 | date range of domains
23 | timeline view that shows how tight or loosely connected the domains are for each day or week
24 |
25 | auto identify the clusters and show the pivot table for each cluster
26 |
27 | auto-discover substrings
--------------------------------------------------------------------------------
/documentation/tutorial.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # DomainCAT Tutorial
4 |
5 | ## Overview
6 | DomainCAT is a cybersecurity analysis tool used to explore domain to domain connectivity within a set of domains. It's
7 | useful for identifying clusters of related domains that share some common registration, naming, or infrastructure
8 | pattern during a cybersecurity investigation such as threat hunting.
9 |
10 | It does this by analyzing the aggregate connectivity patterns across the set of domains, looking at every pivot for
11 | every domain and asking: what are the shared pivots across these domains, how many shared pivots are there between each pair of domains, and do
12 | they have a small pivot count or a really large one? All of these aspects are taken into consideration as it builds out
13 | a connectivity graph that models how connected all the domains are to each other.
14 |
15 | Imagine running a search in the DomainTools Iris UI and getting 300 domains
16 |
17 |
18 |
19 | and turning them into an interactive 3D graph visualization where you can explore them by rotating, panning, and
20 | zooming into the data to identify interesting subsets of domains that are connected to each other.
21 |
22 | 
23 |
24 | Or a 2D graph visualization where you can zoom in, select a set of domains and view exactly what pivots
25 | connect those specific domains together.
26 |
27 | 
28 |
29 | So what is a “graph” in this context? A graph is just a bunch of nodes, or circles that represent domains,
30 | connected together by edges, or gray lines that represent the pivots two domains have in common. In the graph examples
31 | above you can see that some domains group tightly together with others to create clusters. Why is this?
32 |
33 | Pairs of domains that have more pivots in common with each other will have "stronger" edges and be closer together.
34 | This layout logic will naturally cluster groups of highly connected domains together into these clusters.
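Under the hood this is a weighted `networkx` graph (see `domain_cat_module.py`). A minimal sketch of the idea, using domain names from later in this tutorial plus a hypothetical weakly connected neighbor:

```python
import networkx as nx

graph = nx.Graph()
# nodes are domains; an edge's weight reflects how many pivots the two domains share
graph.add_edge("visas-esta-gov.us", "visas-estausa-gov.us", weight=3.0)  # many shared pivots
graph.add_edge("visas-esta-gov.us", "some-other-gov.us", weight=0.5)     # one weak shared pivot
```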
35 |
36 | Pairs of domains that have only one or two pivots in common have "weaker" edges, and will be farther apart. These
37 | domains will appear farther out on the periphery of the graph. If the domain(s) you are investigating are in this
38 | set of periphery nodes, then you know right away that your search is going in the wrong direction and you might want
39 | to go back to Iris and adjust your search criteria.
40 |
41 | ## Quick Primer on Jupyter Notebooks
42 | DomainCAT is written in [Jupyter Notebooks](https://jupyter.org/), a web-based interactive environment that lets you
43 | combine text, code, data, and interactive visualizations all into one environment. Notebooks are broken up into
44 | cells, where each cell can contain either code or text. Don't worry, you don't have to know how to write Python or make any
45 | changes to the code (mostly) to use DomainCAT, just how to run a cell.
46 |
47 | The video below is an example of how to run a cell. Just click into a cell and you'll see a blue bar on the left
48 | that indicates the cell now has focus. Then hit Shift+Return to run the cell. There is a little empty circle in the
49 | upper right of a notebook that fills in gray while the cell is running. When the cell is finished running, the
50 | circle becomes empty again, and there will be some sort of output below the code cell that is the results of what
51 | was run.
52 |
53 | 
54 |
55 | _Note: if you happen to double-click any of the text cells, you might see the contents change font and you'll have a
56 | blinking cursor. Don't panic, this just means you are in edit mode. Just hit Shift+Return like you were running a
57 | code cell, and the text will change back to normal._
58 |
59 | If you happen to edit the code by accident and mess things up, it's not a big deal. You can click into the cell
60 | that you changed and use the normal undo/redo hotkeys (or the Edit menu) to get the code back to where it was before
61 | you edited it. Worst case, you can just rebuild the docker container, and you'll get a new unedited notebook with all
62 | the default values.
63 |
64 | ## Initializing The Notebook
65 |
66 | When you start the DomainCAT notebook, you'll need to click into and run the first code cell in the notebook.
67 | This will initialize all the code to do the connectivity analysis. You'll need to do this every time you start the notebook.
68 |
69 | 
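That first cell just loads the DomainCAT module; this is the exact cell from `domain_cat.ipynb`:

```python
# Run This First: imports all the helper functions and sets stuff up
%run domain_cat_module.py

print("DomainCAT is ready to go")
```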
70 |
71 | ## Setting Iris REST API Credentials
72 |
73 | There are two ways you can enter your Iris REST API credentials. The first is entering them into this cell.
74 | The password textbox will not show the password in clear text.
75 |
76 |
77 |
78 | Alternatively you can create a `.env` file in the root folder of the DomainCAT project and add the following block
79 | to it, replacing the username and password with your own:
80 |
81 | ```
82 | # IRIS REST API CREDS
83 | IRIS_API_USERNAME=some_user_name
84 | IRIS_API_KEY=some_password
85 | ```
86 |
87 | When the Jupyter Notebook initializes, the `dotenv` library will read the `.env` file and inject the credentials
88 | into the REST API calls
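This mirrors what `domain_cat_module.py` does at startup:

```python
from dotenv import dotenv_values

# read IRIS_API_USERNAME / IRIS_API_KEY from the .env file in the project root
dcat_config = dotenv_values(".env")
api_username = dcat_config["IRIS_API_USERNAME"]
api_key = dcat_config["IRIS_API_KEY"]
```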
89 |
90 | ## Entering Domains To Query
91 |
92 | The next step is to define the list of domains to query. This cell lets you do this in one of two ways.
93 |
94 | First, you can enter the raw list of domains into the large text box shown below. The domains can be
95 | either newline- or comma-delimited, and DomainCAT supports defanged domains that use square brackets, as in the example below.
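For example, both of these inputs are accepted (using domain names that appear later in this tutorial):

```
visas-esta-gov[.]us
visas-estausa-gov[.]us
```

or, comma delimited:

```
visas-esta-gov[.]us, visas-estausa-gov[.]us
```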
96 |
97 |
98 |
99 | The second way is to enter an Iris investigation search hash into the second text box. This hash represents
100 | an existing investigation in Iris and will query all the domains from the investigation.
101 |
102 | ## Reading Domain Data From DomainTools REST API
103 |
104 | DomainCAT reads domain data by querying the
105 | [DomainTools Iris Investigate REST API](https://www.domaintools.com/resources/api-documentation/iris-investigate)
106 | for any investigation hash that you generate in the Iris UI.
107 |
108 | The next code cell (shown below) has the configuration used to query the Iris REST API.
109 |
110 |
111 |
112 | When you run the cell (Shift+Return), DomainCAT will query the REST API and return the set of
113 | domains from the list of domains you entered or from the Iris search hash. It will also show you the number of domains loaded
114 | into the notebook.
115 |
116 | There are a couple of options to note:
117 | - `save_search_to_disk`: if you change this to `True` the search results will be saved to disk. This way you can
118 | reload the set of domains from your investigation at a later point without having to query the REST API again
119 | - `json_file_path`: this is the path and file name used to save your search results to disk for later use
120 | - `query_api`: if `True`, the REST API will query the investigation hash. If `False`, domain data will be loaded
121 | from the `json_file_path`
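These options correspond to the top of the data loading cell in `domain_cat.ipynb`:

```python
# Data Loading Config
query_api = True
save_search_to_disk = False
json_file_path = "data/dash_gov_dot_us.json"
```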
122 |
123 | DomainCAT ships with a demo dataset called `dash_gov_dot_us.json`. This is a set of domains that use
124 | the `.us` TLD, end in `-gov`, and are less than 2 years old. To load this data, set `query_api` to `False` and run the cell.
125 |
126 | _Performance Note: DomainCAT performs best with less than or equal to 400 domains(ish). Any more than that and
127 | performance and memory can become an issue and cause the notebook to crash._
128 |
129 | ## Configuration Settings
130 |
131 | There are a set of configuration settings that for the most part you shouldn't need to change
132 |
133 |
134 |
135 | ### config.active_domains_only (default: True)
136 | If this setting is `True`, DomainCAT will only analyze domains that are actively registered. Domains that were taken down or have expired will be ignored. If `False`, all domains returned by the Iris REST API will be analyzed.
137 |
138 | ### config.longest_common_substring (default: 6)
139 |
140 | DomainCAT has a new type of pivot called `longest_common_substring`. It compares every domain name to every other
141 | domain name, and creates a pivot between two domains if they share `longest_common_substring` or more consecutive
142 | characters. For example, the domains visas-esta-gov[.]us and visas-estausa-gov[.]us both share the substring
143 | “visas-esta”, so a pivot would be created for the value “visas-esta” that will join every domain with this substring.
144 |
145 | Note: I've found anything less than 5 for this setting will create way too many connections in the domain graph,
146 | and is not useful in an investigation. But try it if you want, it creates a pretty graph.
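For the curious, `domain_cat_module.py` imports Python's `difflib.SequenceMatcher`; the sketch below shows how the longest shared substring between two names can be found this way (illustrative only, not necessarily DomainCAT's exact implementation):

```python
from difflib import SequenceMatcher

def longest_common_substring(a: str, b: str) -> str:
    # find the longest run of consecutive characters the two names share
    match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    return a[match.a:match.a + match.size]

print(longest_common_substring("visas-esta-gov.us", "visas-estausa-gov.us"))  # visas-esta
```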
147 |
148 | ### config.ignore_substrings (default: empty)
149 |
150 | This setting is an array of string values to ignore when looking for `longest_common_substring` pivots. This is useful
151 | when you use a string as part of an Iris search, like all domains ending in "-gov". Every domain will have this substring,
152 | so you want to remove it from consideration when creating substring pivots.
153 |
154 | To turn this setting off, use: `config.ignore_substrings = []`
155 |
156 | If you have more than one string to ignore use the following pattern: `config.ignore_substrings = ["stuff", "things", "hotsauce"]`
157 |
158 | ### config.scale_edge_strength_by_pivot_count (default: True)
159 |
160 | Every pivot in Iris has a pivot count. This is the number of domains globally attached to this pivot. For example,
161 | an IP address might have a pivot count of 1,000, meaning there are 1,000 domains hosted on this IP address. DomainCAT
162 | also has a notion of _local pivots_; these are pivots between domains only within the REST API search results.
163 |
164 | When evaluating how important a pivot is between two domains, DomainCAT can evaluate the global pivot count and weigh
165 | the influence of the pivot in the graph inversely proportional to the global pivot count. This means pivots with a
166 | smaller pivot count would have edges that are stronger in the graph than pivots with a very large global pivot count.
167 |
168 | If this is set to `True`, DomainCAT will use this graph edge weighting strategy. If it is set to `False`, it will
169 | weigh every edge equally.
170 |
171 | _TODO: put a link to the section below about graphs, edges, and weights._
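To build intuition only (the real formula lives in `domain_cat_module.py`), an inverse log scaling like this captures the idea, with `max_domains` being the config value described below:

```python
import math

def edge_strength(global_pivot_count: int, max_domains: int = 100_000_000) -> float:
    # hypothetical inverse scaling: rare pivots (small global counts) produce
    # strong edges; near-ubiquitous pivots contribute almost nothing
    return 1.0 - math.log(global_pivot_count) / math.log(max_domains)

print(edge_strength(10))         # ~0.88: a pivot shared globally by only 10 domains
print(edge_strength(1_000_000))  # ~0.25: a commodity internet infrastructure pivot
```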
172 |
173 | ### config.global_count_threshold (default: sys.maxsize)
174 |
175 | This setting is used to filter out pivots that have a global pivot count greater than `global_count_threshold`. So if it was set to `config.global_count_threshold = 1000`, then any pivot that had a count greater than 1,000 would not be used to create an edge in the graph.
176 |
177 | This setting isn't that useful when `scale_edge_strength_by_pivot_count` is turned on, as the inverse weighting will take care of this. But if `scale_edge_strength_by_pivot_count` is turned off, it can be useful for weeding out really big pivots from commodity internet infrastructure.
178 |
179 | ### config.min_pivot_size (default: 2)
180 |
181 | This setting is used to filter out pivots that have a _local pivot size_ of less than `min_pivot_size`. For example, if this was set to 5, then any pivot that connects 4 or fewer domains returned from the REST API would be removed.
182 |
183 | The default value of 2 keeps every pivot that connects at least 2 domains.
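Conceptually the filter amounts to something like this (a sketch with hypothetical variable names, not DomainCAT's actual code):

```python
# keep only pivots whose local size (number of connected domains
# in the search results) meets the threshold
pivots = {value: domains for value, domains in pivots.items()
          if len(domains) >= config.min_pivot_size}
```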
184 |
185 | ### config.max_domains (default: 400000000)
186 |
187 | This setting is related to `scale_edge_strength_by_pivot_count`. It is the theoretical maximum number of domains that could ever be on a pivot and is used to calculate the inverse pivot count weight. The default value is the approximate number of active domains, give or take a few million: 400,000,000
188 |
189 | ### config.print_debug_output (default: False)
190 |
191 | This setting is used to write extra debug info to Jupyter's log to help with troubleshooting.
192 |
193 | ## Choose Pivots & Build the Domain Graph
194 |
195 | By default, all pivots are turned on (with a few exceptions discussed below). This means that if any two domains returned
196 | by the REST API are connected to each other by any of the below pivots, they will have an edge created to connect them
197 | together in the graph.
198 |
199 | To turn a pivot off, just comment out the pivot by putting a `#` in front of the pivot. For example, to turn off
200 | the `create_date` pivot, just change the line like so:
201 |
202 | ```python
203 | # "create_date",
204 | ```
205 |
206 | To turn a pivot back on, just remove the `#`. After any change to the code in a cell, you will have to re-run the
207 | cell for the code change to take effect.
208 |
209 | Below are the default pivot settings:
210 |
211 |
212 |
213 | You'll note that `ns_host` and `ns_ip` are both commented out. This is because I prefer to use `ns_domain` for name
214 | server based pivots. For example, if a domain has 4 name servers like:
215 | - ns1.somedomain.com
216 | - ns2.somedomain.com
217 | - ns3.somedomain.com
218 | - ns4.somedomain.com
219 |
220 | `ns_host` will create a pivot for each one, and if each name server has its own IP address then there will be
221 | an additional 4 pivots created. That means if two domains share the above name servers, the edge in the graph that
222 | connects them will represent a total of 9 pivots (1 `ns_domain` + 4 `ns_host` + 4 `ns_ip`). This does two things to the graph. First, I've found that this
223 | overemphasizes the name server connectedness in the graph. Second, domains with more name servers will have stronger
224 | edges than domains with fewer name servers, emphasizing domains with more redundant infrastructure. The same logic
225 | applies to MX record based pivots.
226 |
227 | Once you have the pivots turned on/off the way you want, run this cell. DomainCAT will take all the domains returned
228 | by the Iris REST API, analyze their shared pivots, and build a graph that represents the connected structure of
229 | these domains.
230 |
231 | ### Brand New Pivot: longest common substring!
232 | Some might have noticed that there is a new pivot in this list that doesn’t exist in Iris called “longest_common_substring”.
233 | This is a new local pivot that was added into DomainCAT which compares every domain name in the search to every other
234 | domain name, and creates a pivot between two domains if they share 6 or more consecutive characters. For example,
235 | the domains visas-esta-gov[.]us and visas-estausa-gov[.]us both share the substring “visas-esta” so they would be
236 | connected in the graph.
237 |
238 | In fact, you can even turn off all the pivots except “longest_common_substring” which would show how connected all
239 | the domains are based solely on how they were named. This technique can be useful when your investigation centers
240 | around domain name patterns.
241 |
242 | ## Trimmed Domains
243 |
244 | When building the graph of connected domains, there will often be a few domains that are not connected to any other
245 | domain. These are called trimmed domains. DomainCAT will show you a count of domains that were trimmed from the
246 | graph because they were not connected.
247 |
248 | If you want to see which domains were trimmed out, just run the next cell in the notebook and it will print out
249 | the list of trimmed domains.
250 |
251 |
252 |
253 | ## Explore the 3D Domain Graph
254 |
255 | Once the graph is built it's time to visually explore it. Run the next cell in the notebook:
256 |
257 |
258 |
259 | DomainCAT lets you explore the graph of connected domains with an interactive 3D view. You can mouse over any node
260 | and see what the domain name is and click/drag the graph to view it from different directions. This 3D view is really
261 | useful for gaining an understanding of the overall aggregate connectedness of all the domains, and if there are any
262 | clusters of domains that we might want to explore.
263 |
264 | But if you haven't turned any pivots off, your graph might look a little something like this:
265 |
266 | 
267 |
268 | There are so many pivots in this graph, represented by those gray lines, that it obfuscates the visualization and
269 | really makes it hard to see what's going on, especially in the center of the graph. This is because there are some
270 | pivots that belong to most of the domains in the graph, resulting in a crazy spider web egg-sac-looking thing
271 | like above. The good news is that not all pivots are as useful in every investigation, so we can remove them
272 | from the graph.
273 |
274 | For example, the search used to pull these domains together used the TLD “.us”. So every domain would have this pivot
275 | connecting it to every other domain. Luckily, DomainCAT is smart enough to look for pivots like that and automatically
276 | remove them. But there are probably other pivots that are just adding noise to the graph and do not offer much value
277 | to the investigation which we can turn off.
278 |
279 | ## Pivot Stats
280 |
281 | DomainCAT has a Pivot Stats view, which shows different statistics for each pivot that was found in the domains.
282 |
283 |
284 |
285 | Looking through this list, we can see that `ip_country_code` only has 7 pivots (“# of Pivots”), meaning there were
286 | only 7 different country codes found in the domains, but its “# of connections” shows that almost 48% of the domains
287 | are connected to each other with this pivot. This is a good candidate pivot to turn off in the graph and should clean
288 | up the 3D view of the graph a bit.
289 |
290 | Let’s find out. Go back up to the pivots configuration cell and comment out `ip_country_code`, then run that cell
291 | to rebuild the graph. Then scroll down to the cell that calls `build_3d_graph_layout(graph)` to redraw
292 | the 3D visualization. It should look something like this:
293 |
294 | 
295 |
296 | Removing a single pivot really opened up the graph! As the 3D view of the graph pivots around we can see there are four
297 | main clusters: three that seem highly connected to each other, almost in a triangle, and one fairly mixed cluster
298 | pushed farther out to the side.
299 |
300 | ## Pivot Tuning: An Iterative Process
301 |
302 | The 3D graph view looks pretty good after just turning off country code, but sometimes turning off one pivot isn’t
303 | enough. I call this process “pivot tuning”: you look at the 3D view of the graph to see if the center has opened up
304 | enough to see its internal cluster patterns. If the graph view is still too cluttered, look at pivot statistics and
305 | see if you can find another pivot that might be a good candidate to turn off. Pivots whose “# of connections” column
306 | is higher than 15% are often good candidates. Also, pivots whose “# of pivots” column is pretty low but the “# of
307 | domains” is close to 90% or greater can be helpful too. There are some pivots, like ASN or IP country code that are
308 | very coarse and apply to a high percentage of the domains. If your investigation isn’t centered around IP geo or
309 | network diversity, these pivots are also good candidates to turn off.
310 |
311 | You’ll get a feel for what works and what doesn’t as you play with it. But keep in mind that a pivot that looks
312 | like a candidate to turn off might be critical to your investigation. For example, if the investigation is centered
313 | around domains hosted by a single ISP, turning off “ip_isp” might be a good idea, but “ip_address” might remove
314 | important connectivity information from the graph.
315 |
316 | _Note: I like to have multiple cells that call `build_3d_graph_layout(graph)`. This way, as I'm
317 | pivot tuning, I can really see how a change in the pivots affected the graph by comparing it to the
318 | previous graph. I'll often have 3 or 4 different 3D graphs showing my progress while pivot tuning_
319 |
320 | ## Explore the 2D Domain Graph
321 |
322 | With the 3D graph cleaned up a bit we can now dive back into the 2 dimensional view to explore the details of the
323 | different domain clusters or individual domains.
324 |
325 | Run the next cell to create the interactive 2D domain graph.
326 |
327 |
328 |
329 | There are several things we can do in this view of the graph. We can zoom into a cluster to look at what domains are
330 | in it. To do this, just click-drag over a section of the graph you want to zoom into. To zoom back out click the
331 | little house icon in the upper right of the 2D graph view.
332 |
333 | A large cluster is sometimes actually made up of 2-3 smaller clusters that are useful to explore, but that just lump
334 | together in the zoomed-out view. If common domain name patterns are a theme in your investigation,
335 | mousing over domains to view their name is a useful tactic to see which domains are grouped together.
336 |
337 | 
338 |
339 | We can also select a region of the graph (which is different from zooming in) by clicking either the "box select"
340 | or "lasso select" icons in the upper right of the 2D view, and then click-drag the region in the graph to select.
341 |
342 | 
343 |
344 | Once a set of domains are selected (and all other domains become grayed out) DomainCAT will show the list of domain names.
345 |
346 |
347 |
348 | If this set of domains looks like what you are looking for in your investigation, you could export this list back
349 | into your SOAR playbook to build rules around them, or maybe add them to your firewall to block outright. Or you
350 | could copy them back into Iris to do a more focused investigation just on these domains.
351 |
352 | ## Dig into Pivot Connections
353 |
354 | Once you have a set of selected domains, you can dig deeper into what pivots were influential in grouping them together.
355 | Run the next cell in the notebook:
356 |
357 |
358 |
359 | This will analyze all the pivots that are shared across the domains that you just selected, and show you a heatmap view
360 | of which pivots were most influential in connecting the selected domains, ordered from most influential to least.
361 |
362 |
363 |
364 | Looking at the list of selected domain names, it’s not a surprise that “longest_common_substring” was the most
365 | frequent pivot. The number in each square is the total number of pivots of that type from that domain to the other
366 | selected domains. *This view can tell you what pivot categories were most influential in grouping these domains
367 | together*. This information can be really valuable when your investigation didn’t include one of these
368 | pivots in the original search criteria.
369 |
370 | If you want to look at which specific pivot values are responsible for clustering these domains together, the below
371 | view is a more detailed heat map which shows the actual pivot value rather than its key.
372 |
373 |
374 |
375 | From this we can see that “esta-c” is a very common naming pattern for these domains. If this was a pattern in our
376 | source indicator(s), we might go back to Iris and add “domain contains esta-c” as an additional search criteria to
377 | bring in a broader set of domains into our investigation.
378 |
379 | There is another interesting pattern in this view. If you look at the ASN, ISP, IP address, and dynadot[.]com name
380 | server rows you’ll see that they are all used by the same subset of domains in this list. But hawkhost[.]com nameserver
381 | is used by all the other domains. Based on the domain naming pattern it’s reasonable to believe that all these domains
382 | were created by the same actor, but it looks like this actor uses at least two different hosting and name server
383 | infrastructures. This realization could widen out the set of domains that you can now take action on.
384 |
385 | There’s one other piece of functionality in the 2D graph view that is worth mentioning. Sometimes you just want to
386 | see what domains are connected to one specific domain. If you click any node in the graph, it will automatically
387 | select all the nodes that are connected to it and you can then explore the pivots that bind them together.
388 |
389 | 
390 |
391 | This is useful when you have a seed domain in your investigation and you want to just dive right into it and see what
392 | other domains are connected to it. Another useful scenario (shown above) is when you see a “connector domain” that
393 | sits in between two clusters but is highly connected to both. Clicking on that domain and then inspecting the shared
394 | pivots can sometimes yield valuable information about two different infrastructure patterns used by a potential actor.
395 |
396 | # DomainCAT Tips & Best Practices
397 |
398 | ## longest_common_substring
399 | When investigating a set of domains that have obvious common naming patterns, it can be useful to turn off all pivots
400 | except `longest_common_substring` which would show how connected all the domains are based solely on how they were named.
401 | I'll also often combine `longest_common_substring` with only one or two other infrastructure based pivots,
402 | like `ns_domain` or `mx_domain`. This technique can be useful when your investigation centers around domain name
403 | patterns and shared infrastructure.
404 |
405 |
--------------------------------------------------------------------------------
/domain_cat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# DomainCAT: Domain Connectivity Analysis Tool\n",
8 | "\n",
9 | "### Analyzing the domain to domain connectivity of an Iris API Search"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Run This First: imports all the helper functions and sets stuff up\n",
19 | "%run domain_cat_module.py\n",
20 | "\n",
21 | "print(\"DomainCAT is ready to go\")"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Iris REST API Credentials"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "api_username_ui = widgets.Text(placeholder='Iris API Username', description='Username:', layout={'width': '500px'}, value=\"\")\n",
38 | "api_pw_ui = widgets.Password(placeholder='Iris API Password', description='Password:', layout={'width': '500px'}, value=\"\")\n",
39 | "widgets.VBox([api_username_ui, api_pw_ui])"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "## Query Domain Data From Iris Investigate API\n",
47 | "\n",
48 | "Enter either a list of return delimited domains into the Domains text box, _OR_ an Iris search hash into the hash text box.\n",
49 | "\n",
50 | "Note: if both a list of domains _AND_ a search hash is entered, the liast of domains will be queried and the search hash will be ignored"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "domain_list_ui = widgets.Textarea(placeholder='Enter list of domains', description='Domains:', layout={'height': '300px', 'width': '700px'}) \n",
60 | "search_hash_ui = widgets.Text(placeholder='Enter list of domains', description='Hash:', layout={'width': '700px'})\n",
61 | "show_iris_query_ui(domain_list_ui, search_hash_ui)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Data Loading Config\n",
71 | "query_api = True\n",
72 | "save_search_to_disk = False\n",
73 | "json_file_path = \"data/dash_gov_dot_us.json\"\n",
74 | "\n",
75 | "if query_api:\n",
76 | " iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n",
77 | " print(f'Iris API returned {len(iris_results)} domains')\n",
78 | "\n",
79 | " # save search results to disk to be used later\n",
80 | " if save_search_to_disk:\n",
81 | " with open(json_file_path, 'w') as f:\n",
82 | " json.dump(iris_results, f)\n",
83 | "else:\n",
84 | " with open(json_file_path) as json_data:\n",
85 | " iris_results = json.loads(json_data.read())\n",
86 | "\n",
87 | " print(f'Loaded {len(iris_results)} domains from {json_file_path}')"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "## DomainCAT Configuration\n",
95 | "\n",
96 | "Please refer to the DomainCAT documentation for details about these configuration options"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "config = Config()\n",
106 | "\n",
107 | "# only analyze domains that are active (currently registered)\n",
108 | "config.active_domains_only = True\n",
109 | "\n",
110 | "# config for pivoting on matching substrings. Only matching substrings this long or longer will be used to create a pivot\n",
111 | "config.longest_common_substring = 6\n",
112 | "\n",
113 | "# List of substrings to ignore when creating pivots by matching substrings\n",
114 | "config.ignore_substrings = []\n",
115 | "\n",
116 | "# use the pivot count to scale how important the pivot is during graph layout. Smaller pivot counts is more influence, and vice versa\n",
117 | "config.scale_edge_strength_by_pivot_count = True\n",
118 | "\n",
119 | "# Global pivot count threshold. Any pivot with more than this value is discarded. sys.maxsize effectivly keeps all pivots\n",
120 | "config.global_count_threshold = sys.maxsize\n",
121 | "\n",
122 | "# The smallest pivot count size to use. Default of 2 means no pivots are filtered out because it's count is too low\n",
123 | "config.min_pivot_size = 2\n",
124 | "\n",
125 | "# theoretical max pivot size for calculating edge strengths\n",
126 | "config.max_domains = 100000000\n",
127 | "\n",
128 | "# If True DomainCAT will print out some debug info while building the connected graph of domains\n",
129 | "config.print_debug_output = False"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "## Choose Which Pivots To Use & Build Domain Graph\n"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "pivot_category_config = {\n",
146 | " \"adsense\",\n",
147 | " \"google_analytics\",\n",
148 | " \"create_date\",\n",
149 | " \"redirect_domain\",\n",
150 | " \"registrar\",\n",
151 | " \"ip_address\",\n",
152 | " \"ip_country_code\",\n",
153 | " \"ip_isp\",\n",
154 | " \"ip_asn\",\n",
155 | " \"ssl_hash\",\n",
156 | " \"ssl_subject\",\n",
157 | " \"ssl_org\",\n",
158 | " \"ssl_email\",\n",
159 | " \n",
160 | "# # Note: commented out ns_host and ns_ip because they double count ns connectedness when used with ns_domain. \n",
161 | " \"ns_domain\",\n",
162 | "# \"ns_host\", \n",
163 | " \"ns_ip\", \n",
164 | " \n",
165 | "# # Note: commented out mx_host and mx_ip because they double counts mx connectedness when used with mx_domain \n",
166 | " \"mx_domain\",\n",
167 | "# \"mx_host\",\n",
168 | " \"mx_ip\", \n",
169 | " \n",
170 | " \"tld\",\n",
171 | " \"longest_common_substring\",\n",
172 | "}\n",
173 | "\n",
174 | "# Build the domain pivot graph structure\n",
175 | "config.pivot_category_config = pivot_category_config\n",
176 | "graph, pivot_categories, trimmed_domains = build_domain_pivot_graph(iris_results, config)"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "## Trimmed Domains"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "print_trimmed_domains = True\n",
193 | "if print_trimmed_domains:\n",
194 | " if len(trimmed_domains[\"unconnected\"]) > 0:\n",
195 | " print(\"trimmed unconnected domains:\")\n",
196 | " for domain in trimmed_domains[\"unconnected\"]: print(f\" {domain}\")\n",
197 | " if len(trimmed_domains[\"create_date\"]) > 0:\n",
198 | " print(\"\\ntrimmed domains with only create date pivot:\")\n",
199 | " for domain in trimmed_domains[\"create_date\"]: print(f\" {domain}\")"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "## Draw the Domain Graph in an Interactive 3D Layout"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "build_3d_graph_layout(graph)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "build_3d_graph_layout(graph)"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "build_3d_graph_layout(graph)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## Calculate & Show Pivot Statistics"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# Calculate a bunch of pivot statistics to see how well connected all the domains in the search result are\n",
250 | "calc_pivot_stats(graph, pivot_categories)"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "## Draw the Domain Graph in an Interactive 2D Layout"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "# calculate the pivots shared in commmon across all selected domains\n",
267 | "shared_pivots = {}\n",
268 | "def get_2d_shared_pivots(graph, selected_domains):\n",
269 | " global shared_pivots\n",
270 | " shared_pivots = get_shared_pivots(graph, selected_domains)\n",
271 | " \n",
272 | "build_2d_graph_layout(graph, get_2d_shared_pivots)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "## Heatmap of which pivots connect the most domains together: by pivot category"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "if len(shared_pivots) == 0:\n",
289 | " print(\"Select a set of domains in the 2D graph\")\n",
290 | "else:\n",
291 | " create_pivot_heatmaps(shared_pivots)"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "## Removing domains from the graph\n",
299 | "\n",
300 | "Sometimes you find disconnected domains in the 3D graph visualization that make pivoting the viz really annoying. To remove domains from the graph, enter the domain(s) you want removed in the text box below and run the second cell. This will remove the domains from the graph structure without having to requery the data.\n",
301 | "\n",
302 | "After you do this, re-run the 3D viz and the domains should be gone."
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "remove_domains_ui = widgets.Textarea(placeholder='Enter domains to remove from graph', description='Domains:', layout={'height': '100px', 'width': '700px'}) \n",
312 | "remove_domains_ui"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# Run this to remove the domains in the above text box from the graph\n",
322 | "graph = remove_domains_from_graph(graph, remove_domains_ui)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": []
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": []
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": []
345 | }
346 | ],
347 | "metadata": {
348 | "kernelspec": {
349 | "display_name": "Python 3 (ipykernel)",
350 | "language": "python",
351 | "name": "python3"
352 | },
353 | "language_info": {
354 | "codemirror_mode": {
355 | "name": "ipython",
356 | "version": 3
357 | },
358 | "file_extension": ".py",
359 | "mimetype": "text/x-python",
360 | "name": "python",
361 | "nbconvert_exporter": "python",
362 | "pygments_lexer": "ipython3",
363 | "version": "3.8.10"
364 | }
365 | },
366 | "nbformat": 4,
367 | "nbformat_minor": 4
368 | }
--------------------------------------------------------------------------------
/domain_cat_module.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re
4 | import json
5 | import math
6 | from difflib import SequenceMatcher
7 | import plotly.graph_objects as go
8 | import requests
9 | import networkx as nx
10 | import pandas as pd
11 | import numpy as np
12 | import scipy
13 | import matplotlib
14 | import matplotlib.pyplot as plt
15 | from ipywidgets import interactive, HBox, VBox
16 | import ipywidgets as widgets
17 | from IPython.display import HTML, display
18 | import tabulate
19 | from dotenv import dotenv_values
20 |
21 |
22 | # load REST API creds from .env file
23 | dcat_config = dotenv_values(".env")
24 |
25 | def show_iris_query_ui(domain_list_ui, search_hash_ui):
26 | lookup_ui = widgets.VBox([
27 | widgets.Label(value="Enter a return delimited list of domains to lookup (no commas, no quotes)"),
28 | domain_list_ui,
29 | widgets.Label(value="Or..."),
30 | widgets.Label(value="Enter an Iris search hassh to lookup"),
31 | search_hash_ui,
32 | ])
33 | return lookup_ui
34 |
35 |
36 | def clean_domain_list(domain_list_ui):
37 | # remove any quotes, spaces, or defanging square brackets
38 | full_domain_list = domain_list_ui.value.strip().replace(' ', '').replace('"', '').replace("'", "").replace('[', '').replace(']', '')
39 | # replace commas with new lines
40 | full_domain_list = full_domain_list.replace(",", "\n")
41 | # update the widget
42 | domain_list_ui.value = full_domain_list
43 | # split into array
44 | return full_domain_list.split("\n")
45 |
46 |
47 | def get_rest_api_creds(api_username_ui, api_pw_ui):
48 | api_username = api_username_ui.value
49 | if len(api_username) == 0:
50 | api_username = dcat_config["IRIS_API_USERNAME"]
51 | api_key = api_pw_ui.value
52 | if len(api_key) == 0:
53 | api_key = dcat_config["IRIS_API_KEY"]
54 | return api_username, api_key
55 |
56 |
57 | def query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui):
58 | api_username, api_key = get_rest_api_creds(api_username_ui, api_pw_ui)
59 | if len(domain_list_ui.value) > 0:
60 | # split list of domains into groups of 100 because of API restrictions
61 | results = []
62 | full_domain_list = clean_domain_list(domain_list_ui)
63 | max_domains = 100
64 | start = 0
65 | end = max_domains
66 | for x in range(math.ceil(len(full_domain_list) / max_domains)):
67 | # slice out max domains to query
68 | partial_domain_list = full_domain_list[start:end]
69 | # build query string
70 | domain_list = ",".join(partial_domain_list)
71 | iris_query = {"api_username": api_username, "api_key": api_key, "domain": domain_list}
72 | # query rest api
73 | print(f"...querying Iris REST API for {len(partial_domain_list)} domains")
74 | iris_results = _query_iris_rest_api(api_username, api_key, iris_query)
75 | # build up the set of return domain objects
76 | results = results + iris_results["response"]["results"]
77 | # update slice indexes
78 | start = end
79 | end += max_domains
80 | return results
81 | elif len(search_hash_ui.value) > 0:
82 | iris_query = {"api_username": api_username, "api_key": api_key, "search_hash": search_hash_ui.value}
83 | iris_results = _query_iris_rest_api(api_username, api_key, iris_query)
84 | iris_results = iris_results["response"]["results"]
85 | return iris_results
86 | else:
87 | print("Domain List and Search Hash text boxes are empty. Please enter either a list of domains or search hash to lookup")
88 | raise Exception("Domain List and Search Hash text boxes are empty")
89 |
90 |
91 | def _query_iris_rest_api(api_username: str, api_key: str, iris_query: dict):
92 | root_api_url = "https://api.domaintools.com/v1/iris-investigate/"
93 | resp = requests.post(root_api_url, data=iris_query)
94 | if resp.status_code != 200:
95 | raise Exception(f'POST /iris-investigate/ {resp.status_code}: {resp.text}')
96 | iris_results = resp.json()
97 | return iris_results
98 |
99 |
100 | def remove_domains_from_graph(graph, remove_domains_ui):
101 | domains = clean_domain_list(remove_domains_ui)
102 | for domain in domains:
103 | if graph.has_node(domain):
104 | graph.remove_node(domain)
105 | return graph
106 |
107 |
108 | class Config(object):
109 | """ Little helper class to hold all the config values"""
110 |
111 |
112 | class Domain(object):
113 | """ Little helper class to hold the domain name and risk score
114 | """
115 | def __init__(self, domain_json):
116 | self.json = domain_json
117 | self.name = domain_json["domain"]
118 | self.risk_score = domain_json["domain_risk"]['risk_score']
119 | self.pivot_categories = {}
120 | self.label=f"{self.name} ({self.risk_score})"
121 |
122 | def __str__(self):
123 | return f"name: {self.name}, risk: {self.risk_score}"
124 |
125 | def __repr__(self):
126 | return str(self)
127 |
128 |
129 | class DomainRelationship(object):
130 | def __init__(self, weight: float, category: str):
131 | # this is the maximum weight that an edge can have.
132 | # Adjust this if you want to play around with stronger edge weights
133 | self.max_weight = 5.0
134 | self.weight = weight
135 | self.categories = [category]
136 |
137 | def __str__(self):
138 | return f"weight: {self.weight}, categories: {self.categories}"
139 |
140 | def __repr__(self):
141 | return str(self)
142 |
143 | def add(self, weight: float, category: str):
144 | """ Note: certain pivot categories can be added more than once for 2 domains;
145 | things like IP and name server. For example, two domains could be on the same set of 5
146 | IP addresses. For now the weights are just summed if there is more than one pivot of
147 | the same category, but maybe we need a different strategy. Since IPs have multiple pivots
148 | (ip address, country code, asn, isp) this means if there were 5 shared IPs between two
149 | domains, the weight would be: 4 * 5 * pivot_weight.
150 | This might over amplify the edge strength
151 | """
152 | if category not in self.categories:
153 | # this helps by not overly boosting the edge weight if two domains share
154 |             # multiple IP addresses
155 | self.weight += weight
156 | if self.weight > self.max_weight:
157 | self.weight = self.max_weight
158 | self.categories.append(category)
159 |
160 | def get_description(self):
161 |         return "\n".join(sorted(self.categories))
162 |
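# Illustrative sketch of how DomainRelationship accumulates and caps edge weight
# (comments only, not executed):
#   rel = DomainRelationship(2.0, "ip_address")
#   rel.add(2.0, "ns_host")     # new category: weight becomes 4.0
#   rel.add(2.0, "ip_address")  # repeated category: ignored, weight stays 4.0
#   rel.add(2.0, "ssl_hash")    # would be 6.0, but capped at max_weight, so 5.0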
163 |
164 | class PivotValue(object):
165 | def __init__(self, pivot_value, pivot_count):
166 | self.pivot_value = pivot_value
167 | self.pivot_count = pivot_count
168 | self.domains = set()
169 |
170 | def union(self, other: "PivotValue"):
171 |         self.domains = self.domains.union(other.domains)
172 |
173 | def __str__(self):
174 | return f"pivot_value: {self.pivot_value}, " \
175 | f"pivot_count: {self.pivot_count}, " \
176 | f"domains: {self.domains}"
177 |
178 | def __repr__(self):
179 | return str(self)
180 |
181 |
182 | def get_edge_count(n: int):
183 | # for a complete graph, the edge count is: n(n-1)/2
184 | return n * (n - 1) / 2
185 |
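# Worked example: for n = 5 domains, get_edge_count(5) = 5 * 4 / 2 = 10 possible edges.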
186 |
187 | def build_domain_pivot_graph(iris_results: list, config: "Config"):
188 | """ Main workflow function that takes the results from an Iris Investigate query and
189 | builds the graph object of how each of the domains in the query are connected to each other"""
190 |
191 | # parse the Iris API Result to build the pivot data structure
192 | graph, pivot_categories = init_local_pivot_graph(iris_results, config)
193 |
194 | # normalize registrar pivots (see note in function comments)
195 | #if "registrar" in pivot_categories and config.normalize_registrars:
196 | # normalize_similar_registrars(pivot_categories["registrar"])
197 |
198 | # create pivots for longest common substrings
199 | pivot_on_matching_substrings(graph, pivot_categories, config)
200 |
201 | # trim pivots from graph that have less than the set count threshold or contain all domains
202 | trim_pivots(pivot_categories, len(graph.nodes), config)
203 |
204 | # trim unconnected domains and domains with only a create date pivot
205 | trimmed_unconnected_domains = trim_unconnected_domains(graph, pivot_categories, config)
206 | trimmed_create_date_domains = trim_domains_with_only_create_date_pivot(graph, pivot_categories)
207 |
208 | print(f"{len(trimmed_unconnected_domains)} "
209 | f"domains trimmed because they were not connected to other domains")
210 | print(f"{len(trimmed_create_date_domains)} "
211 |           f"domains trimmed because create_date was the only pivot")
212 | print(f"{len(graph.nodes)} domains in pivot structure \n")
213 |
214 | # build the graph structure based on the domain pivots
215 | graph = build_domain_graph(graph, pivot_categories, config)
216 | return (graph,
217 | pivot_categories,
218 | {"unconnected": trimmed_unconnected_domains,
219 | "create_date": trimmed_create_date_domains})
220 |
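# Illustrative usage sketch (comments only, not executed; assumes iris_results and config
# are populated as in the notebook):
#   graph, pivot_categories, trimmed = build_domain_pivot_graph(iris_results, config)
#   trimmed["unconnected"]  # domains removed for having no shared pivots
#   trimmed["create_date"]  # domains removed for having only a create_date pivot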
221 |
222 | def init_local_pivot_graph(iris_results: list, config: "Config"):
223 | """ Collect pivot categories found in result set ("ssl_hash" for example)"""
224 | # init empty graph
225 | graph = nx.Graph()
226 | # init pivot categories dict
227 | pivot_categories = {}
228 |
229 | for domain_json in iris_results:
230 |
231 | # check if domain is active or not
232 |         if not domain_json['active'] and config.active_domains_only:
233 | continue
234 |
235 | # create a domain object
236 | domain = Domain(domain_json)
237 |
238 | # add domain node to graph
239 | graph.add_node(domain.name, domain=domain)
240 |
241 | append_value_with_count(pivot_categories, 'adsense', domain_json, domain, config)
242 | append_value_with_count(pivot_categories, 'google_analytics', domain_json, domain, config)
243 | append_value_with_count(pivot_categories, 'create_date', domain_json, domain, config)
244 | append_value_with_count(pivot_categories, 'redirect_domain', domain_json, domain, config)
245 | append_value_with_count(pivot_categories, 'registrar', domain_json, domain, config)
246 |
247 | # haven't seen "ssl_email" in the wild yet, so not sure if it is a value/count or just value
248 | append_values_with_counts(pivot_categories, 'ssl_email', domain_json, domain, config)
249 |
250 | # IPs are composite objects, so pull out each value for each IP
251 | for ip_json in domain_json["ip"]:
252 | # at some point add logic to add /24 in here
253 | append_value_with_count(pivot_categories, 'address', ip_json, domain, config, 'ip_address')
254 | append_value_with_count(pivot_categories, 'country_code', ip_json, domain, config, 'ip_country_code')
255 | append_value_with_count(pivot_categories, 'isp', ip_json, domain, config, 'ip_isp')
256 | append_values_with_counts(pivot_categories, 'asn', ip_json, domain, config, 'ip_asn')
257 |
258 | # name servers are composite objects, so pull out each value for each name server
259 | for ns_json in domain_json["name_server"]:
260 | append_value_with_count(pivot_categories, 'host', ns_json, domain, config, 'ns_host')
261 | append_value_with_count(pivot_categories, 'domain', ns_json, domain, config, 'ns_domain')
262 | append_values_with_counts(pivot_categories, 'ip', ns_json, domain, config, 'ns_ip')
263 |
264 | append_value(pivot_categories, 'tld', domain_json, domain, config)
265 |
266 | # ssl certs are composite objects, so pull out each value for each ssl cert
267 | for ssl_json in domain_json['ssl_info']:
268 | append_value_with_count(pivot_categories, 'hash', ssl_json, domain, config, "ssl_hash")
269 | append_value_with_count(pivot_categories, 'subject', ssl_json, domain, config, "ssl_subject")
270 | append_value_with_count(pivot_categories, 'organization', ssl_json, domain, config, "ssl_org")
271 |
272 | # mx servers are composite objects, so pull out each value for each mx server
273 | for mx_json in domain_json['mx']:
274 | append_value_with_count(pivot_categories, 'host', mx_json, domain, config, "mx_host")
275 | append_value_with_count(pivot_categories, 'domain', mx_json, domain, config, "mx_domain")
276 | append_values_with_counts(pivot_categories, 'ip', mx_json, domain, config, "mx_ip")
277 |         # mx priority might be interesting at some point for node strength
278 | return graph, pivot_categories
279 |
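# Illustrative shape of the returned pivot structure (hypothetical values):
#   pivot_categories = {
#       "ip_address": {"203.0.113.7": PivotValue("203.0.113.7", 42)},
#       "ns_host": {"ns1.example.com": PivotValue("ns1.example.com", 1200)},
#   }
# where each PivotValue.domains is the set of domain names from this result set sharing that value.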
280 |
281 | def append_value(pivot_categories: dict,
282 | pivot_category: str,
283 | json_data: dict,
284 | domain: "Domain",
285 | config: "Config",
286 | new_pivot_category: str = None):
287 | # check if pivot is in domain json
288 | if pivot_category in json_data:
289 | pivot_value = str(json_data[pivot_category]).strip()
290 |
291 | # check we have a value to add
292 | if len(pivot_value) > 0:
293 | _append_value_to_pivot(pivot_categories, pivot_category, pivot_value, None,
294 | domain, config, new_pivot_category)
295 |
296 |
297 | def append_value_with_count(pivot_categories: dict,
298 | pivot_category: str,
299 | json_data: dict,
300 | domain: "Domain",
301 | config: "Config",
302 | new_pivot_category: str = None):
303 | # check if pivot is in domain json
304 | if pivot_category in json_data:
305 | if isinstance(json_data[pivot_category], dict):
306 | pivot_value = str(json_data[pivot_category]["value"]).strip()
307 | global_pivot_count = json_data[pivot_category]["count"]
308 |
309 | # trim pivots that are above the threshold (except create_date)
310 | if global_pivot_count < config.global_count_threshold or pivot_category == "create_date":
311 | # check we have a value to add
312 | if len(pivot_value) > 0 and global_pivot_count > 0:
313 | _append_value_to_pivot(pivot_categories, pivot_category, pivot_value,
314 | global_pivot_count, domain, config, new_pivot_category)
315 |
316 |
317 | def append_values_with_counts(pivot_categories: dict,
318 | pivot_category: str,
319 | json_data: dict,
320 | domain: "Domain",
321 | config: "Config",
322 | new_pivot_category: str = None):
323 | # check if pivot is in domain json
324 | if pivot_category in json_data:
325 | for pivot in json_data[pivot_category]:
326 | pivot_value = str(pivot["value"]).strip()
327 | global_pivot_count = pivot["count"]
328 |
329 | # check if we want to add this value
330 | if len(pivot_value) > 0 and global_pivot_count > 0 and global_pivot_count < config.global_count_threshold:
331 | _append_value_to_pivot(pivot_categories, pivot_category, pivot_value,
332 | global_pivot_count, domain, config, new_pivot_category)
333 |
334 |
335 | def _append_value_to_pivot(pivot_categories: dict,
336 | pivot_category: str,
337 | pivot_value: str,
338 | global_pivot_count: int,
339 | domain: "Domain",
340 | config: "Config",
341 | new_pivot_category: str = None):
342 | # if we pass in a new_pivot_category, replace pivot_category with new_pivot_category
343 | if new_pivot_category:
344 | pivot_category = new_pivot_category
345 |
346 | # check if we're capturing data for this pivot category
347 | if pivot_category not in config.pivot_category_config:
348 | return
349 |
350 | # make sure we have the pivot dictionary
351 | if pivot_category not in pivot_categories:
352 | pivot_categories[pivot_category] = {}
353 |
354 | # make sure we have the pivot value set
355 | if pivot_value not in pivot_categories[pivot_category]:
356 | pivot_categories[pivot_category][pivot_value] = PivotValue(pivot_value, global_pivot_count)
357 |
358 | # add domain to the pivot domain array
359 | pivot_categories[pivot_category][pivot_value].domains.add(domain.name)
360 |
361 | # add pivot category and value to the domain
362 | if pivot_category not in domain.pivot_categories:
363 | domain.pivot_categories[pivot_category] = []
364 | domain.pivot_categories[pivot_category].append(pivot_value)
365 |
366 |
367 | def normalize_similar_registrars(registrar_pivots: dict):
368 | """ The same registrar can often show up in WHOIS records with different string values.
369 | For example:
370 | NAMECHEAP
371 | NAMECHEAP INC
372 | NAMECHEAP. INC
373 | NAMECHEAP, INC
374 | NAMECHEAP, INC.
375 |
376 |     This function splits the registrar string on any non-word character and selects the longest
377 |     word as the normalized registrar value. If any two registrars share the same normalized value,
378 |     then the domains from those two registrars will be merged. The end goal is that all the domains
379 |     from the 5 different NAMECHEAP registrar strings shown above would be merged into one.
380 |
381 |     Note: this isn't a very good solution. There are cases where it will create invalid connections
382 |     between domains. For example, two different registrars that share a common longest word in
383 |     their name, like "NAMECHEAP, INC" and "NOT NAMECHEAP, INC".
384 |
385 | It looks like this happens a lot so turning off the feature for now.
386 |
387 | TODO: this algorithm needs work. it allows things such as
388 | good
389 | PDR LTD. D/B/A PUBLICDOMAINREGISTRY.COM == PDR Ltd. d/b/a PublicDomainRegistry.com
390 | GODADDY.COM, == LLC GODADDY.COM, INC
391 | NAMECHEAP, INC == NameCheap, Inc.
392 | bad
393 | TUCOWS DOMAINS INC == WILD WEST DOMAINS, INC
394 | NETWORK SOLUTIONS, == LLC Network Solutions, LLC
395 | NETWORK SOLUTIONS, == LLC BIGROCK SOLUTIONS LTD
396 | """
397 | return
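    # Illustrative example of the normalization used in the disabled code below (not executed):
    #   sorted(set(re.findall(r"[\w']+", "NAMECHEAP, INC.".lower())), key=len, reverse=True)[0]
    # yields "namecheap" for every NAMECHEAP variant above, but would also match
    # "NOT NAMECHEAP, INC", which is why the feature is turned off.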
398 | # registrars = [registrar for registrar in registrar_pivots]
399 | # for x in range(len(registrars)):
400 | # reg1 = registrars[x]
401 | # if reg1 in registrar_pivots:
402 | # # normalize registrar string
403 | # reg1_norm = sorted(
404 | # list(set(re.findall(r"[\w']+", reg1.lower()))), key=len, reverse=True)[0]
405 | # for y in range(x+1, len(registrars)):
406 | # reg2 = registrars[y]
407 | # # normalize registrar string
408 | # reg2_norm = sorted(
409 | # list(set(re.findall(r"[\w']+", reg2.lower()))), key=len, reverse=True)[0]
410 | # if reg1_norm == reg2_norm:
411 | # # pick the registrar with the most domains
412 | # if registrar_pivots[reg1].pivot_count > registrar_pivots[reg2].pivot_count:
413 | # reg_keep = reg1
414 | # reg_pop = reg2
415 | # else:
416 | # reg_keep = reg2
417 | # reg_pop = reg1
418 | # # combine domains for matching registrars
419 | # registrar_pivots[reg_keep].union(registrar_pivots[reg_pop])
420 | # # remove reg_pop from dictionary of all registrar pivots
421 | # registrar_pivots.pop(reg_pop)
422 | # print(f"Merged registrar {reg_pop} into {reg_keep}")
423 |
424 |
425 | def pivot_on_matching_substrings(graph: "Graph", pivot_categories: dict, config: "Config"):
426 | """Create pivots between domains that share a common substring of
427 | `config.longest_common_substring` chars long.
428 |
429 | Note: SequenceMatcher has some known issues with not finding the longest match in very long
430 | strings, but does a pretty good job with shorter strings such as domain names.
431 | https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings
432 | """
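    # Illustrative example with hypothetical domain names (comments only, not executed):
    #   m = SequenceMatcher(None, "paypal-login", "secure-paypal", False).find_longest_match(0, 12, 0, 13)
    #   "paypal-login"[m.a : m.a + m.size]  ->  "paypal", which pivots the two domains together
    # whenever config.longest_common_substring <= 6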
433 | domains = list(graph.nodes)
434 | for x in range(len(domains)):
435 | domain1 = graph.nodes[domains[x]]["domain"]
436 | string1 = domain1.name.split('.')[0]
437 | # pull out substrings to ignore
438 | if config.ignore_substrings and len(config.ignore_substrings) > 0:
439 | for ignore in config.ignore_substrings:
440 | string1 = string1.replace(ignore, "")
441 | for y in range(x+1, len(domains)):
442 | domain2 = graph.nodes[domains[y]]["domain"]
443 | string2 = domain2.name.split('.')[0]
444 | # pull out substrings to ignore
445 | if config.ignore_substrings and len(config.ignore_substrings) > 0:
446 | for ignore in config.ignore_substrings:
447 | string2 = string2.replace(ignore, "")
448 | # find the longest common substring between the two domains
449 | matcher = SequenceMatcher(None, string1, string2, False)
450 | match = matcher.find_longest_match(0, len(string1), 0, len(string2))
451 | longest_match = string1[match.a: match.a + match.size]
452 | # check if the matching substring is long enough
453 | if len(longest_match) >= config.longest_common_substring:
454 | # add pivots
455 | _append_value_to_pivot(
456 | pivot_categories,
457 | "longest_common_substring",
458 | longest_match, None,
459 | domain1, config)
460 | _append_value_to_pivot(
461 | pivot_categories,
462 | "longest_common_substring",
463 | longest_match, None,
464 | domain2, config)
465 |
466 |
467 | def trim_pivots(pivot_categories: dict, domain_count: int, config: "Config"):
468 |     """ Remove two types of pivots: pivots that contain all the domains from the Iris result
469 |     set, and pivots that have fewer than the set threshold of domains from this result set.
470 |     By default, pivots that only have one domain are removed, but this can be configured by
471 |     setting the min_pivot_size variable to a different value. For example, set min_pivot_size
472 |     to 10 to only use pivots that have 10 or more domains connected to them.
473 | """
474 | for pivot_category_key in pivot_categories:
475 | pivot_category = pivot_categories[pivot_category_key]
476 | total_pivots = 0
477 | del_count = 0
478 | for pivot_value in list(pivot_category.keys()):
479 | total_pivots += 1
480 | if len(pivot_category[pivot_value].domains) < config.min_pivot_size:
481 | # check for pivots with less than the threshold value
482 | del pivot_category[pivot_value]
483 | del_count += 1
484 | elif len(pivot_category[pivot_value].domains) >= domain_count:
485 | # check for pivots with all domains in them
486 | del pivot_category[pivot_value]
487 | if config.print_debug_output:
488 | print(f"deleted {pivot_category_key}:{pivot_value}. Contained all domains")
489 | if config.print_debug_output:
490 | print(f"deleted {del_count} "
491 | f"singleton pivots out of {total_pivots} "
492 | f"pivots from {pivot_category_key}")
493 |
494 |
495 | def trim_unconnected_domains(graph: "Graph", pivot_categories: dict, config: "Config"):
496 |     """ Remove any domains that have no shared connection to any other domain
497 | """
498 | if config.print_debug_output: print(f"{len(graph.nodes)} domains in Iris result set")
499 | connected_domains = set()
500 | for pivot_category_key in pivot_categories:
501 | pivot_category = pivot_categories[pivot_category_key]
502 | for pivot_value in list(pivot_category.keys()):
503 | pivot_domains = pivot_category[pivot_value].domains
504 | connected_domains = connected_domains.union(pivot_domains)
505 |
506 | # get the set of domains that are not connected
507 | domains = set(graph.nodes)
508 | lonely_domains = domains.difference(connected_domains)
509 |
510 | # remove unconnected domains
511 | for domain in lonely_domains:
512 | graph.remove_node(domain)
513 |
514 | if config.print_debug_output:
515 | print(f"{len(connected_domains)} domains are interconnected")
516 | print(f"{len(lonely_domains)} domains are unconnected")
517 | print("Unconnected domains removed from graph:")
518 | for domain in lonely_domains:
519 | print(f" {domain}")
520 |
521 | return lonely_domains
522 |
523 |
524 | def trim_domains_with_only_create_date_pivot(graph: "Graph", pivot_categories: dict):
525 | """ if a domain ONLY has a create_date pivot, then that isn't a very good indicator of
526 | connectedness."""
527 | # identify domains to trim
528 | trimmed_domains = []
529 |     for domain_name in list(graph.nodes):  # copy the node list since nodes are removed below
530 |         domain = graph.nodes[domain_name]["domain"]
531 |         if len(domain.pivot_categories) == 1 and "create_date" in domain.pivot_categories:
532 |             trimmed_domains.append(domain)
533 |             # remove domain from graph and remove it from the main pivot_categories data structure
534 |             graph.remove_node(domain_name)
535 |
536 |             domain_create_date = domain.pivot_categories["create_date"][0]
537 |             pivot_categories["create_date"][domain_create_date].domains.remove(domain_name)
538 |             if len(pivot_categories["create_date"][domain_create_date].domains) == 0:
539 | pivot_categories["create_date"].pop(domain_create_date)
540 | if len(pivot_categories["create_date"]) == 0:
541 | pivot_categories.pop("create_date")
542 |
543 | return trimmed_domains
544 |
545 |
546 | def get_pivot_connection_weight(pivot_category: str,
547 | global_pivot_count: int,
548 | local_pivot_count: int,
549 | config: "Config"):
550 | """ If we aren't using the pivot count to set the edge weight, just return a constant value of
551 | 1 for every pivot. If we do want to use the pivot count, use the function:
552 | 1 - (log(pivot count) / (log(max possible pivot count)))
553 | This creates an inverse log ratio where small pivots have a high edge weight,
554 | and very large pivots have a low edge weight.
555 |
556 | Note: also experimenting with raising this log ratio to different exponents to get greater
557 | separation between large and small pivots: math.pow(1.0 + inverse_log_ratio, 3) - 1
558 | """
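    # Worked example (assuming a hypothetical config.max_domains of 500):
    #   global_pivot_count = 10  -> 1 - ln(11)/ln(501)  ~= 1 - 2.40/6.22 ~= 0.61 (small pivot, strong edge)
    #   global_pivot_count = 400 -> 1 - ln(401)/ln(501) ~= 1 - 5.99/6.22 ~= 0.04 (large pivot, weak edge)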
559 | if pivot_category not in config.pivot_category_config:
560 | raise Exception(f"Unexpected Pivot Category: {pivot_category}")
561 |
562 | # scale the edge strength based on the ratio of the global pivot count vs the max domains
563 | if config.scale_edge_strength_by_pivot_count:
564 | if global_pivot_count is None:
565 | # Some pivots don't have a count. For example, tld or longest common substring.
566 |             # if global pivot count is None, for now return a fixed weight of 0.5 (?)
567 |             # But we probably need to then normalize this weight against the max weight calculated.
568 |             # Also, TLD doesn't have a pivot count because it's often huge. Is that the same
569 |             # importance as common substrings? Probably not.
570 | return 0.5
571 | inv_ratio = 1.0 - math.log(1.0 + global_pivot_count) / math.log(1.0 + config.max_domains)
572 | return inv_ratio
573 | # return math.pow(1.0 + inverse_log_ratio, 3) - 1
574 | return 1
575 |
576 |
577 | def build_domain_graph(graph: "Graph", pivot_categories: dict, config: "Config"):
578 |     # The graph is initialized with all its nodes. Now we need to connect all the nodes
579 | # with each local pivot in the pivot_categories dict
580 | edge_count = 0
581 | for category in pivot_categories:
582 | for pivot_value in pivot_categories[category]:
583 | pivot = pivot_categories[category][pivot_value]
584 | pivot_domains = list(pivot.domains)
585 |
586 | # for each pair of domains in pivot, get the edge weight and create edge
587 | weight = get_pivot_connection_weight(category, pivot.pivot_count, len(pivot_domains), config)
588 | if weight > 0:
589 | for x in range(len(pivot_domains)):
590 | for y in range(x+1, len(pivot_domains)):
591 | d1 = pivot_domains[x]
592 | d2 = pivot_domains[y]
593 | edge_count += 1
594 | if graph.has_edge(d1, d2):
595 | graph[d1][d2]['relationship'].add(weight, category)
596 | else:
597 | graph.add_edge(d1, d2, relationship=DomainRelationship(weight, category))
598 |
599 | # now that all edges are added, set the weight attribute with the adjusted weight
600 | for edge in graph.edges:
601 | graph[edge[0]][edge[1]]['weight'] = graph[edge[0]][edge[1]]['relationship'].weight
602 |
603 | print(f"Total Graph Connections: {edge_count}")
604 | print(f"Distinct Graph Connections: {len(graph.edges)}")
605 | return graph
606 |
607 |
608 | def calc_pivot_stats(graph: "Graph", pivot_categories: dict):
609 | from IPython.display import HTML, display
610 | import tabulate
611 |
612 | # calc the max number of edges possible for this set of domains
613 | max_edge_count = get_edge_count(len(graph.nodes))
614 |
615 | # collect counts for each pivot category
616 | category_domain_counts = {}
617 | category_edge_counts = {}
618 | for category_key in pivot_categories:
619 | category_domain_counts[category_key] = 0
620 | category_edge_counts[category_key] = 0
621 | category = pivot_categories[category_key]
622 | for pivot_value in category:
623 | category_domain_counts[category_key] += len(category[pivot_value].domains)
624 |
625 |             # if all domains share a pivot value, they would form a complete graph,
626 |             # so get the edge count for a complete graph
627 |             edge_count = get_edge_count(len(category[pivot_value].domains))
628 | category_edge_counts[category_key] += round(edge_count)
629 |
630 | total_connections = 0
631 |
632 | headers = ["Pivot Category",
633 | "# of Domains",
634 | "# of Pivots",
635 | "avg domains per pivot",
636 | "# of connections"]
637 | table = []
638 | total_domains = len(graph.nodes)
639 | for category_key in category_domain_counts:
640 | cat_pivot_count = len(pivot_categories[category_key])
641 | if cat_pivot_count > 0:
642 | domain_count = category_domain_counts[category_key]
643 | edge_count = category_edge_counts[category_key]
644 |
645 | total_connections += edge_count
646 |
647 | avg_domains = domain_count / cat_pivot_count
648 | percent_of_total_domains = round(100 * (domain_count / total_domains), 2)
649 | percent_of_total_edges = round(100 * (edge_count / max_edge_count), 2)
650 | table.append([category_key,
651 | f"{domain_count} ({percent_of_total_domains}%)",
652 | cat_pivot_count,
653 | round(avg_domains, 2),
654 | f"{edge_count} ({percent_of_total_edges}%)"])
655 |
656 | print(f"{len(graph.nodes)} Domains in Pivot Structure")
657 | display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html')))
658 |
659 |
660 | def calc_viz_layout(layout: str, graph: "Graph", dimension: int):
661 | # KK layout only
662 | if layout == "kk":
663 | return nx.layout.kamada_kawai_layout(graph, dim=dimension)
664 |
665 | # spring layout only
666 | if layout == "fr":
667 | return nx.layout.spring_layout(graph, dim=dimension)
668 |
669 | # kk layout as initialization for spring layout
670 | if layout == "kk_to_fr":
671 | pos = nx.layout.kamada_kawai_layout(graph, dim=dimension, weight=None)
672 | return nx.layout.spring_layout(graph, pos=pos, dim=dimension)
673 |
674 | # spring layout as initialization for kk layout
675 | if layout == "fr_to_kk":
676 | pos = nx.layout.spring_layout(graph, dim=dimension)
677 | return nx.layout.kamada_kawai_layout(graph, pos=pos, dim=dimension)
678 | raise Exception("invalid layout choice")
679 |
680 |
681 | def build_3d_graph_layout(graph: "Graph"):
682 | """ Build the graph layout based on the specified algorithm and get the node positions
683 | in xyz dimensions"""
684 | pos = calc_viz_layout("kk_to_fr", graph, 3)
685 |
686 | node_labels, node_risk_scores, Xn, Yn, Zn = [], [], [], [], []
687 | for name in graph.nodes:
688 | # build x,y,z coordinates data structure for nodes
689 | Xn.append(pos[name][0])
690 | Yn.append(pos[name][1])
691 | Zn.append(pos[name][2])
692 |
693 | # get domain colors by risk score
694 | domain = graph.nodes[name]["domain"]
695 | node_labels.append(domain.label)
696 | node_risk_scores.append(domain.risk_score)
697 |
698 | # build x,y,z coordinates data structure for edges
699 | Xe, Ye, Ze = [], [], []
700 | for e in graph.edges:
701 | u = pos[e[0]]
702 | v = pos[e[1]]
703 | Xe+=[u[0], v[0], None]
704 | Ye+=[u[1], v[1], None]
705 | Ze+=[u[2], v[2], None]
706 |
707 | # Create the 3d Plotly graph and render it
708 | # build line objects for our edges
709 | trace1=go.Scatter3d(x=Xe, y=Ye, z=Ze,
710 | mode='lines',
711 | name='edges',
712 | line=dict(color='rgb(125,125,125)', width=0.5),
713 | opacity=0.9,
714 | hoverinfo='none')
715 |
716 | trace2=go.Scatter3d(
717 | x=Xn, y=Yn, z=Zn,
718 | mode='markers',
719 | name='domains',
720 | marker=dict(
721 | symbol='circle',
722 | size=6,
723 | showscale=True,
724 | color=node_risk_scores,
725 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']],
726 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color
727 | cmin=0, cmax=100,
728 | reversescale=True,
729 | line=dict(color='rgb(50,50,50)', width=0.5),
730 | colorbar=dict(
731 | thickness=15,
732 | title='Risk Score',
733 | xanchor='left',
734 | titleside='right'
735 | ),
736 | ),
737 | text=node_labels,
738 | hoverinfo='text')
739 |
740 | # background definition, but everything is turned off
741 | axis=dict(showbackground=False,
742 | showline=False,
743 | zeroline=False,
744 | showgrid=False,
745 | showticklabels=False,
746 | title='')
747 |
748 | layout = go.Layout(
749 | title=f"Graph of interconnected domains ({len(node_labels)} domains)",
750 | width=1000, height=1000,
751 | showlegend=False,
752 | scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)),
753 | margin=dict(t=100), hovermode='closest')
754 |
755 | data=[trace1, trace2]
756 | fig=go.Figure(data=data, layout=layout)
757 | return fig
758 |
759 |
760 | def build_2d_graph_layout(graph: "Graph", get_2d_shared_pivots: "function"):
761 | """ build the graph layout based on the specified algorithm and get the node positions
762 | in xy dimensions"""
763 | pos = calc_viz_layout("kk_to_fr", graph, 2)
764 | # pos = calc_viz_layout("fr_to_kk", g, 2)
765 |
766 | # build edge data
767 | edge_x, edge_y = [], []
768 | for e in graph.edges():
769 | x0, y0 = pos[e[0]]
770 | x1, y1 = pos[e[1]]
771 | edge_x.append(x0)
772 | edge_x.append(x1)
773 | edge_x.append(None)
774 | edge_y.append(y0)
775 | edge_y.append(y1)
776 | edge_y.append(None)
777 |
778 | # create edge scatter plot
779 | edge_trace = go.Scatter(
780 | x=edge_x, y=edge_y,
781 | line=dict(width=0.5, color='#888'),
782 | hoverinfo='none',
783 | mode='lines',
784 | opacity=0.6
785 | )
786 |
787 | # build node data
788 | node_adjacencies, node_risk_scores, node_text, node_x, node_y = [], [], [], [], []
789 | names = list(graph.nodes)
790 | for name in names:
791 | domain = graph.nodes[name]["domain"]
792 | x, y = pos[name]
793 | node_x.append(x)
794 | node_y.append(y)
795 | # get the domain's connected nodes
796 | neighbors = list(graph.neighbors(name))
797 | node_adjacencies.append(neighbors)
798 | # get the node text
799 | node_text.append(f'{name}: risk {domain.risk_score}, connections {len(neighbors)}')
800 | # get the domain risk score
801 | node_risk_scores.append(domain.risk_score)
802 |
803 | # build node scatter plot
804 | node_trace = go.Scatter(
805 | x=node_x, y=node_y,
806 | mode='markers',
807 | hoverinfo='text',
808 | text=node_text,
809 | customdata=node_adjacencies,
810 | marker=dict(
811 | showscale=True,
812 | reversescale=True,
813 | color=node_risk_scores,
814 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']],
815 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color
816 | cmin=0, cmax=100,
817 | size=10,
818 | colorbar=dict(
819 | thickness=15,
820 | title='Risk Score',
821 | xanchor='left',
822 | titleside='right'
823 | ),
824 | line_width=2))
825 |
826 |     # create the jupyter widget holder for plotly
827 | fig = go.FigureWidget(
828 | [edge_trace, node_trace],
829 | layout=go.Layout(
830 | title=f'Graph of interconnected domains ({len(node_text)} domains)',
831 | titlefont_size=16,
832 | showlegend=False,
833 | hovermode='closest',
834 | margin=dict(b=5,l=5,r=5,t=30),
835 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
836 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
837 | )
838 |
839 | # handle selection of domains
840 | def node_selection_fn(trace, points, selector):
841 | selected_domains = [names[idx] for idx in points.point_inds]
842 | update_selected_domains(selected_domains)
843 |
844 | # handle node click events
845 | def node_click_fn(trace, points, selector):
846 | if len(points.point_inds) > 1:
847 | print(f"node_click passed in more than 1 point: {points.point_inds}")
848 |
849 | # clear the old selected points
850 | trace.selectedpoints = []
851 | if len(points.point_inds) == 0:
852 | return
853 |
854 | # get the list of selected domain names
855 | selected_domains = [names[idx] for idx in points.point_inds]
856 |         for idx in points.point_inds:
857 |             selected_domains = selected_domains + trace.customdata[idx]
858 |
859 |         # set the new selected points
860 |         # don't like having to loop in a loop to get the domain index, but I don't know a better way
861 |         trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[idx]]
862 |
863 | update_selected_domains(selected_domains)
864 |
865 | def update_selected_domains(selected_domains):
866 | if len(selected_domains) == 0:
867 | return
868 |
869 | # sort domains by length, then alpha
870 | selected_domains.sort(key=len, reverse=True)
871 | with out:
872 | # write selected domains to the output widget
873 | print(f"Selected Domains: ({len(selected_domains)})\n")
874 | for selected_domain in selected_domains:
875 | print(selected_domain)
876 | out.clear_output(wait=True)
877 |
878 | # calc pivots selected domains have in common
879 | get_2d_shared_pivots(graph, selected_domains)
880 |
881 | # event handler for node selection
882 | fig.data[1].on_selection(node_selection_fn)
883 |     # event handler for node click
884 | fig.data[1].on_click(node_click_fn)
885 |
886 |     # Create an Output widget that displays the list of selected domains
887 | out = widgets.Output(layout={'border': '1px solid black'})
888 | domain_ui = widgets.VBox((fig, out))
889 | return domain_ui
890 |
891 |
892 | def get_shared_pivots(graph: "Graph", selected_domains: list):
893 | shared_pivots = {}
894 | for name in selected_domains:
895 | domain = graph.nodes[name]["domain"]
896 | for cat in domain.pivot_categories:
897 | for cat_value in domain.pivot_categories[cat]:
898 | key = f"{cat}: {cat_value}"
899 | if key not in shared_pivots:
900 | shared_pivots[key] = []
901 | shared_pivots[key].append(domain)
902 |
903 |     # filter to pivots shared by at least 3 of the selected domains
904 | shared_pivots = {k: v for k, v in shared_pivots.items() if len(v) >= 3}
905 | return shared_pivots
906 |
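# Illustrative shape of the returned dict (hypothetical values): keys are "category: value"
# strings and values are the selected Domain objects sharing that pivot, e.g.
#   {"ns_host: ns1.example.com": [<Domain a>, <Domain b>, <Domain c>]}
# keeping only pivots shared by at least 3 of the selected domains.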
907 |
908 | def create_pivot_heatmaps(shared_pivots: dict):
909 | print("\n Heatmap of which pivots connect the most domains together: by pivot category")
910 | pivot_cat_crosstab, pivot_value_crosstab = create_pivot_tables(shared_pivots)
911 | fig, ax = plt.subplots(figsize=(10, 10))
912 | im = heatmap(
913 | pivot_cat_crosstab,
914 | pivot_cat_crosstab.index,
915 | pivot_cat_crosstab.columns,
916 | ax=ax,
917 | cmap="Blues")
918 | texts = annotate_heatmap(im, valfmt="{x}")
919 | fig.tight_layout()
920 | plt.show()
921 |
922 | print("\n Heatmap of which pivots connect the most domains together: by pivot value")
923 | fig, ax = plt.subplots(figsize=(10, 10))
924 | im = heatmap(
925 | pivot_value_crosstab,
926 | pivot_value_crosstab.index,
927 | pivot_value_crosstab.columns,
928 | ax=ax,
929 | cmap="Blues")
930 | texts = annotate_heatmap(im, valfmt="{x}")
931 | fig.tight_layout()
932 | plt.show()
933 |
934 | print("\n List of the most frequent pivot values")
935 | create_pivot_summary(pivot_value_crosstab)
936 |
937 |
938 | def create_pivot_tables(shared_pivots: dict):
939 | # Create the pandas DataFrame
940 | data = []
941 | for pivot_value in shared_pivots:
942 | for d in shared_pivots[pivot_value]:
943 | pivot_cat = pivot_value.split(": ")[0]
944 | data.append([d.name, pivot_cat, pivot_value])
945 | df = pd.DataFrame(data, columns = ['domain', 'pivot_cat', 'pivot'])
946 |
947 | # Build contingency table of domains to pivot
948 | pivot_cat_crosstab = pd.crosstab(df['pivot_cat'], df['domain'])
949 | pivot_value_crosstab = pd.crosstab(df['pivot'], df['domain'])
950 |
951 | # sort rows by total # of pivots
952 | pivot_cat_crosstab['sum'] = pivot_cat_crosstab[list(pivot_cat_crosstab.columns)].sum(axis=1)
953 |     pivot_cat_crosstab.sort_values("sum", axis=0, ascending=False, inplace=True)
954 |     pivot_cat_crosstab.drop(columns="sum", inplace=True)
955 |
956 | # sort rows by total # of pivots
957 | pivot_value_crosstab['sum'] = pivot_value_crosstab[list(pivot_value_crosstab.columns)].sum(axis=1)
958 |     pivot_value_crosstab.sort_values("sum", axis=0, ascending=False, inplace=True)
959 |     pivot_value_crosstab.drop(columns="sum", inplace=True)
960 |
961 | return pivot_cat_crosstab, pivot_value_crosstab
962 |
963 |
964 | def create_pivot_summary(pivot_value_crosstab: "Pandas_CrossTab"):
965 |     # show just an output view of pivot name and count for selection
    # HTML/display/tabulate are used below, so import them locally here as well,
    # mirroring calc_pivot_stats above
    from IPython.display import HTML, display
    import tabulate
966 | summary = pivot_value_crosstab.copy()
967 | summary['count'] = summary[list(summary.columns)].sum(axis=1)
968 |     summary.sort_values("count", axis=0, ascending=False, inplace=True)
969 | summary = summary[["count"]]
970 |
971 |     headers = ["Pivot Category", "Pivot Value", "Count"]
972 | table = []
973 | for index, row in summary.iterrows():
974 | cat, pivot = index.split(": ")
975 | table.append([cat, pivot, row["count"]])
976 | display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html')))
977 |
978 |
979 |
980 | def heatmap(data, row_labels, col_labels, ax=None, cbar_kw={}, cbarlabel="", **kwargs):
981 | """
982 | Create a heatmap from a numpy array and two lists of labels.
983 |
984 | Parameters
985 | ----------
986 | data
987 | A 2D numpy array of shape (N, M).
988 | row_labels
989 | A list or array of length N with the labels for the rows.
990 | col_labels
991 | A list or array of length M with the labels for the columns.
992 | ax
993 | A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If
994 | not provided, use current axes or create a new one. Optional.
995 | cbar_kw
996 | A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.
997 | cbarlabel
998 | The label for the colorbar. Optional.
999 | **kwargs
1000 | All other arguments are forwarded to `imshow`.
1001 | """
1002 |
1003 | if not ax:
1004 | ax = plt.gca()
1005 |
1006 | # Plot the heatmap
1007 | im = ax.imshow(data, **kwargs)
1008 |
1009 | # We want to show all ticks...
1010 | ax.set_xticks(np.arange(data.shape[1]))
1011 | ax.set_yticks(np.arange(data.shape[0]))
1012 | # ... and label them with the respective list entries.
1013 | ax.set_xticklabels(col_labels)
1014 | ax.set_yticklabels(row_labels)
1015 |
1016 | # Let the horizontal axes labeling appear on top.
1017 | ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)
1018 |
1019 | # Rotate the tick labels and set their alignment.
1020 | plt.setp(ax.get_xticklabels(), rotation=-30, ha="right", rotation_mode="anchor")
1021 |
1022 | # Turn spines off and create white grid.
1023 | for edge, spine in ax.spines.items():
1024 | spine.set_visible(False)
1025 |
1026 | ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
1027 | ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
1028 | ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
1029 | ax.tick_params(which="minor", bottom=False, left=False)
1030 |
1031 | return im
1032 |
1033 |
1034 | def annotate_heatmap(im, data=None, valfmt="{x:.2f}", textcolors=["black", "white"],
1035 | threshold=None, **textkw):
1036 | """
1037 | A function to annotate a heatmap.
1038 |
1039 | Parameters
1040 | ----------
1041 | im
1042 | The AxesImage to be labeled.
1043 | data
1044 | Data used to annotate. If None, the image's data is used. Optional.
1045 | valfmt
1046 | The format of the annotations inside the heatmap. This should either
1047 | use the string format method, e.g. "$ {x:.2f}", or be a
1048 | `matplotlib.ticker.Formatter`. Optional.
1049 | textcolors
1050 | A list or array of two color specifications. The first is used for
1051 | values below a threshold, the second for those above. Optional.
1052 | threshold
1053 | Value in data units according to which the colors from textcolors are
1054 | applied. If None (the default) uses the middle of the colormap as
1055 | separation. Optional.
1056 | **kwargs
1057 | All other arguments are forwarded to each call to `text` used to create
1058 | the text labels.
1059 | """
1060 |
1061 | if not isinstance(data, (list, np.ndarray)):
1062 | data = im.get_array()
1063 |
1064 | # Normalize the threshold to the images color range.
1065 | if threshold is not None:
1066 | threshold = im.norm(threshold)
1067 | else:
1068 | threshold = im.norm(data.max())/2.
1069 |
1070 | # Set default alignment to center, but allow it to be
1071 | # overwritten by textkw.
1072 | kw = dict(horizontalalignment="center",
1073 | verticalalignment="center")
1074 | kw.update(textkw)
1075 |
1076 | # Get the formatter in case a string is supplied
1077 | if isinstance(valfmt, str):
1078 | valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)
1079 |
1080 | # Loop over the data and create a `Text` for each "pixel".
1081 | # Change the text's color depending on the data.
1082 | texts = []
1083 | for i in range(data.shape[0]):
1084 | for j in range(data.shape[1]):
1085 | kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
1086 | text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
1087 | texts.append(text)
1088 |
1089 | return texts
--------------------------------------------------------------------------------
/images/2d_click.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_click.gif
--------------------------------------------------------------------------------
/images/2d_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_v1.png
--------------------------------------------------------------------------------
/images/2d_zoom.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_zoom.gif
--------------------------------------------------------------------------------
/images/2d_zoom_select.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_zoom_select.gif
--------------------------------------------------------------------------------
/images/3d_infra.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_infra.gif
--------------------------------------------------------------------------------
/images/3d_v1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_v1.gif
--------------------------------------------------------------------------------
/images/3d_v2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_v2.gif
--------------------------------------------------------------------------------
/images/build_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/build_graph.png
--------------------------------------------------------------------------------
/images/config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/config.png
--------------------------------------------------------------------------------
/images/credentials.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/credentials.png
--------------------------------------------------------------------------------
/images/dash_gov.us_substrings.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/dash_gov.us_substrings.gif
--------------------------------------------------------------------------------
/images/domain_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/domain_data.png
--------------------------------------------------------------------------------
/images/domain_graph_2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/domain_graph_2d.png
--------------------------------------------------------------------------------
/images/intro_3d.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/intro_3d.gif
--------------------------------------------------------------------------------
/images/iris.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/iris.png
--------------------------------------------------------------------------------
/images/iris_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/iris_small.png
--------------------------------------------------------------------------------
/images/jupyter_cell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/jupyter_cell.png
--------------------------------------------------------------------------------
/images/pivot_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_heatmap.png
--------------------------------------------------------------------------------
/images/pivot_stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_stats.png
--------------------------------------------------------------------------------
/images/pivot_value_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_value_heatmap.png
--------------------------------------------------------------------------------
/images/reading_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/reading_data.png
--------------------------------------------------------------------------------
/images/run_3d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/run_3d.png
--------------------------------------------------------------------------------
/images/run_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/run_heatmap.png
--------------------------------------------------------------------------------
/images/running_a_cell.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/running_a_cell.gif
--------------------------------------------------------------------------------
/images/selected_domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/selected_domains.png
--------------------------------------------------------------------------------
/images/trimmed_domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/trimmed_domains.png
--------------------------------------------------------------------------------
/infrastructure_cat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# InfraCAT: Infrastructure Connectivity Analysis Tool\n",
8 | "\n",
9 | "### Analyzing the infrastructure connectivity of an Iris API Search"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {
16 | "collapsed": false,
17 | "jupyter": {
18 | "outputs_hidden": false
19 | },
20 | "pycharm": {
21 | "name": "#%%\n"
22 | }
23 | },
24 | "outputs": [
25 | {
26 | "name": "stdout",
27 | "output_type": "stream",
28 | "text": [
29 | "InfraCAT is ready to go\n"
30 | ]
31 | }
32 | ],
33 | "source": [
34 | "# Run This First: imports all the helper functions and sets stuff up\n",
35 | "%run infrastructure_cat_module.py\n",
36 | "\n",
37 | "print(\"InfraCAT is ready to go\")"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Iris REST API Credentials"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 2,
50 | "metadata": {
51 | "collapsed": false,
52 | "jupyter": {
53 | "outputs_hidden": false
54 | },
55 | "pycharm": {
56 | "name": "#%%\n"
57 | }
58 | },
59 | "outputs": [
60 | {
61 | "data": {
62 | "application/vnd.jupyter.widget-view+json": {
63 | "model_id": "842b0bc026174d7da491004391ec1055",
64 | "version_major": 2,
65 | "version_minor": 0
66 | },
67 | "text/plain": [
68 | "VBox(children=(Text(value='', description='Username:', layout=Layout(width='500px'), placeholder='Iris API Use…"
69 | ]
70 | },
71 | "metadata": {},
72 | "output_type": "display_data"
73 | }
74 | ],
75 | "source": [
76 | "api_username_ui = widgets.Text(placeholder='Iris API Username', description='Username:', layout={'width': '500px'}, value=\"\")\n",
77 | "api_pw_ui = widgets.Password(placeholder='Iris API Password', description='Password:', layout={'width': '500px'}, value=\"\")\n",
78 | "widgets.VBox([api_username_ui, api_pw_ui])"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "## Query Domain Data From Iris Investigate API\n",
86 | "\n",
87 | "Enter either a list of return delimited domains into the Domains text box, _OR_ an Iris search hash into the hash text box.\n",
88 | "\n",
89 |     "Note: if both a list of domains _AND_ a search hash are entered, the list of domains will be queried and the search hash will be ignored"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 3,
95 | "metadata": {
96 | "collapsed": false,
97 | "jupyter": {
98 | "outputs_hidden": false
99 | },
100 | "pycharm": {
101 | "name": "#%%\n"
102 | }
103 | },
104 | "outputs": [
105 | {
106 | "data": {
107 | "application/vnd.jupyter.widget-view+json": {
108 | "model_id": "e81fe61e74f24a0e952d36f48c13dbe9",
109 | "version_major": 2,
110 | "version_minor": 0
111 | },
112 | "text/plain": [
113 | "VBox(children=(Label(value='Enter a return delimited list of domains to lookup (no commas, no quotes)'), Texta…"
114 | ]
115 | },
116 | "metadata": {},
117 | "output_type": "display_data"
118 | }
119 | ],
120 | "source": [
121 | "domain_list_ui = widgets.Textarea(placeholder='Enter list of domains', description='Domains:', layout={'height': '300px', 'width': '700px'})\n",
122 |     "search_hash_ui = widgets.Text(placeholder='Enter search hash', description='Hash:', layout={'width': '700px'})\n",
123 | "show_iris_query_ui(domain_list_ui, search_hash_ui)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 15,
129 | "metadata": {
130 | "collapsed": false,
131 | "jupyter": {
132 | "outputs_hidden": false
133 | },
134 | "pycharm": {
135 | "name": "#%%\n"
136 | }
137 | },
138 | "outputs": [],
139 | "source": [
140 | "config = Config()\n",
141 | "\n",
142 | "# exclude certain infrastructure from graph\n",
143 | "# config.exclude_list = [\"EMAIL DOMAIN\"]\n",
144 | "config.exclude_list = []\n",
145 | "\n",
146 | "# only show infrastructure that is under the pivot threshold\n",
147 | "config.pivot_threshold = 500\n",
148 | "\n",
149 | "# Minimum should be 1 which means more than one domain has to show up in an edge\n",
150 | "config.edge_threshold = 1\n",
151 | "\n",
152 |     "# set whether or not to set node size to the unique number of domains in the edge\n",
153 | "config.node_size = True"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 16,
159 | "metadata": {
160 | "collapsed": false,
161 | "jupyter": {
162 | "outputs_hidden": false
163 | },
164 | "pycharm": {
165 | "name": "#%%\n"
166 | }
167 | },
168 | "outputs": [
169 | {
170 | "name": "stdout",
171 | "output_type": "stream",
172 | "text": [
173 | "Loaded 338 domains from data/dash_gov_dot_us.json\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "query_api = False\n",
179 | "save_search_to_disk = False\n",
180 | "json_file_path = \"data/dash_gov_dot_us.json\"\n",
181 | "\n",
182 | "if query_api:\n",
183 | " iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n",
184 | " print(f'Iris API returned {len(iris_results)} domains')\n",
185 | "\n",
186 | " # save search results to disk to be used later\n",
187 | " if save_search_to_disk:\n",
188 | " with open(json_file_path, 'w') as f:\n",
189 | " json.dump(iris_results, f)\n",
190 | "else:\n",
191 | " with open(json_file_path) as json_data:\n",
192 | " iris_results = json.loads(json_data.read())\n",
193 | "\n",
194 | " print(f'Loaded {len(iris_results)} domains from {json_file_path}')\n"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 20,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "380\n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "graph, config = build_infra_graph(iris_results, config)\n",
212 | "\n",
213 | "print(len(graph.nodes))"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 21,
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "data": {
223 | "application/vnd.jupyter.widget-view+json": {
224 | "model_id": "d37f5c75ad8c4a42b01667bd93d6fb71",
225 | "version_major": 2,
226 | "version_minor": 0
227 | },
228 | "text/plain": [
229 | "VBox(children=(FigureWidget({\n",
230 | " 'data': [{'hoverinfo': 'none',\n",
231 | " 'line': {'color': '#888', 'widt…"
232 | ]
233 | },
234 | "metadata": {},
235 | "output_type": "display_data"
236 | }
237 | ],
238 | "source": [
239 | "build_2d_graph_layout(graph, config)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 22,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "application/vnd.jupyter.widget-view+json": {
250 | "model_id": "5c83bc927cca4294aaba3fb6c85f728c",
251 | "version_major": 2,
252 | "version_minor": 0
253 | },
254 | "text/plain": [
255 | "VBox(children=(FigureWidget({\n",
256 | " 'data': [{'hoverinfo': 'none',\n",
257 | " 'line': {'color': 'rgb(125,125,…"
258 | ]
259 | },
260 | "metadata": {},
261 | "output_type": "display_data"
262 | }
263 | ],
264 | "source": [
265 | "build_3d_graph_layout(graph, config)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 34,
271 | "metadata": {
272 | "collapsed": false,
273 | "jupyter": {
274 | "outputs_hidden": false
275 | },
276 | "pycharm": {
277 | "name": "#%%\n"
278 | }
279 | },
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "Loaded 195 domains from data/treatment_care.json\n"
286 | ]
287 | }
288 | ],
289 | "source": [
290 | "query_api = False\n",
291 | "save_search_to_disk = False\n",
292 | "json_file_path = \"data/treatment_care.json\"\n",
293 | "\n",
294 | "if query_api:\n",
295 | " iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n",
296 | " print(f'Iris API returned {len(iris_results)} domains')\n",
297 | "\n",
298 | " # save search results to disk to be used later\n",
299 | " if save_search_to_disk:\n",
300 | " with open(json_file_path, 'w') as f:\n",
301 | " json.dump(iris_results, f)\n",
302 | "else:\n",
303 | " with open(json_file_path) as json_data:\n",
304 | " iris_results = json.loads(json_data.read())\n",
305 | "\n",
306 | " print(f'Loaded {len(iris_results)} domains from {json_file_path}')"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 44,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "config = Config()\n",
316 | "\n",
317 | "# exclude certain infrastructure from graph\n",
318 | "# config.exclude_list = [\"EMAIL DOMAIN\"]\n",
319 | "config.exclude_list = []\n",
320 | "\n",
321 | "# only show infrastructure that is under the pivot threshold\n",
322 | "config.pivot_threshold = 50000\n",
323 | "\n",
324 | "# Minimum should be 1 which means more than one domain has to show up in an edge\n",
325 | "config.edge_threshold = 1\n",
326 | "\n",
327 |     "# set whether or not to set node size to the unique number of domains in the edge\n",
328 | "config.node_size = True"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 45,
334 | "metadata": {
335 | "collapsed": false,
336 | "jupyter": {
337 | "outputs_hidden": false
338 | },
339 | "pycharm": {
340 | "name": "#%%\n"
341 | }
342 | },
343 | "outputs": [
344 | {
345 | "name": "stdout",
346 | "output_type": "stream",
347 | "text": [
348 | "6\n"
349 | ]
350 | }
351 | ],
352 | "source": [
353 | "graph, config = build_infra_graph(iris_results, config)\n",
354 | "\n",
355 | "print(len(graph.nodes))"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 46,
361 | "metadata": {
362 | "collapsed": false,
363 | "jupyter": {
364 | "outputs_hidden": false
365 | },
366 | "pycharm": {
367 | "name": "#%%\n"
368 | }
369 | },
370 | "outputs": [
371 | {
372 | "data": {
373 | "application/vnd.jupyter.widget-view+json": {
374 | "model_id": "98caf40adfc84751a057c517b69975bb",
375 | "version_major": 2,
376 | "version_minor": 0
377 | },
378 | "text/plain": [
379 | "VBox(children=(FigureWidget({\n",
380 | " 'data': [{'hoverinfo': 'none',\n",
381 | " 'line': {'color': '#888', 'widt…"
382 | ]
383 | },
384 | "metadata": {},
385 | "output_type": "display_data"
386 | }
387 | ],
388 | "source": [
389 | "build_2d_graph_layout(graph, config)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 47,
395 | "metadata": {
396 | "scrolled": true,
397 | "tags": []
398 | },
399 | "outputs": [
400 | {
401 | "data": {
402 | "application/vnd.jupyter.widget-view+json": {
403 | "model_id": "8f6935bd23bd491c9b8446814721dfdd",
404 | "version_major": 2,
405 | "version_minor": 0
406 | },
407 | "text/plain": [
408 | "VBox(children=(FigureWidget({\n",
409 | " 'data': [{'hoverinfo': 'none',\n",
410 | " 'line': {'color': 'rgb(125,125,…"
411 | ]
412 | },
413 | "metadata": {},
414 | "output_type": "display_data"
415 | }
416 | ],
417 | "source": [
418 | "build_3d_graph_layout(graph, config)"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 48,
424 | "metadata": {
425 | "tags": []
426 | },
427 | "outputs": [
428 | {
429 | "name": "stdout",
430 | "output_type": "stream",
431 | "text": [
432 | "18\n"
433 | ]
434 | }
435 | ],
436 | "source": [
437 | "pair_graph, pair_config = build_pair_infra_graph(iris_results, config)\n",
438 | "\n",
439 | "print(len(pair_graph.nodes))"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 49,
445 | "metadata": {},
446 | "outputs": [
447 | {
448 | "data": {
449 | "application/vnd.jupyter.widget-view+json": {
450 | "model_id": "a601dde8107842a39561970e8fb5f981",
451 | "version_major": 2,
452 | "version_minor": 0
453 | },
454 | "text/plain": [
455 | "VBox(children=(FigureWidget({\n",
456 | " 'data': [{'hoverinfo': 'none',\n",
457 | " 'line': {'color': '#888', 'widt…"
458 | ]
459 | },
460 | "metadata": {},
461 | "output_type": "display_data"
462 | }
463 | ],
464 | "source": [
465 | "build_2d_graph_layout(pair_graph, pair_config)"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 50,
471 | "metadata": {},
472 | "outputs": [
473 | {
474 | "data": {
475 | "application/vnd.jupyter.widget-view+json": {
476 | "model_id": "d4f902e406d8446c9b6eb441fde9f99d",
477 | "version_major": 2,
478 | "version_minor": 0
479 | },
480 | "text/plain": [
481 | "VBox(children=(FigureWidget({\n",
482 | " 'data': [{'hoverinfo': 'none',\n",
483 | " 'line': {'color': 'rgb(125,125,…"
484 | ]
485 | },
486 | "metadata": {},
487 | "output_type": "display_data"
488 | }
489 | ],
490 | "source": [
491 | "build_3d_graph_layout(pair_graph, pair_config)"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": null,
497 | "metadata": {
498 | "collapsed": false,
499 | "jupyter": {
500 | "outputs_hidden": false
501 | },
502 | "pycharm": {
503 | "name": "#%%\n"
504 | }
505 | },
506 | "outputs": [],
507 | "source": []
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": null,
512 | "metadata": {
513 | "collapsed": false,
514 | "jupyter": {
515 | "outputs_hidden": false
516 | },
517 | "pycharm": {
518 | "name": "#%%\n"
519 | }
520 | },
521 | "outputs": [],
522 | "source": []
523 | }
524 | ],
525 | "metadata": {
526 | "kernelspec": {
527 | "display_name": "Python 3 (ipykernel)",
528 | "language": "python",
529 | "name": "python3"
530 | },
531 | "language_info": {
532 | "codemirror_mode": {
533 | "name": "ipython",
534 | "version": 3
535 | },
536 | "file_extension": ".py",
537 | "mimetype": "text/x-python",
538 | "name": "python",
539 | "nbconvert_exporter": "python",
540 | "pygments_lexer": "ipython3",
541 | "version": "3.8.10"
542 | }
543 | },
544 | "nbformat": 4,
545 | "nbformat_minor": 4
546 | }
--------------------------------------------------------------------------------
/infrastructure_cat_module.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re
4 | import json
5 | import math
6 | from difflib import SequenceMatcher
7 | import plotly.graph_objects as go
8 | import requests
9 | import networkx as nx
10 | import pandas as pd
11 | import numpy as np
12 | import scipy
13 | import matplotlib
14 | import matplotlib.pyplot as plt
15 | from ipywidgets import interactive, HBox, VBox
16 | import ipywidgets as widgets
17 | from IPython.display import HTML, display
18 | import tabulate
19 | from dotenv import dotenv_values
20 | from domaintools import API
21 | from configparser import ConfigParser
22 |
23 | import itertools
27 |
28 | # load REST API creds from .env file
29 | dcat_config = dotenv_values(".env")
30 |
31 |
32 | def show_iris_query_ui(domain_list_ui, search_hash_ui):
33 | lookup_ui = widgets.VBox([
34 |         widgets.Label(value="Enter a return-delimited list of domains to look up (no commas, no quotes)"),
35 | domain_list_ui,
36 | widgets.Label(value="Or..."),
37 |         widgets.Label(value="Enter an Iris search hash to look up"),
38 | search_hash_ui,
39 | ])
40 | return lookup_ui
41 |
42 |
43 | def clean_domain_list(domain_list_ui):
44 | # remove any quotes, spaces, or defanging square brackets
45 |     full_domain_list = (domain_list_ui.value.strip()
46 |                         .replace(' ', '').replace('"', '').replace("'", "")
47 |                         .replace('[', '').replace(']', ''))
48 | # replace commas with new lines
49 | full_domain_list = full_domain_list.replace(",", "\n")
50 | # update the widget
51 | domain_list_ui.value = full_domain_list
52 | # split into array
53 | return full_domain_list.split("\n")
54 |
55 |
56 | def get_rest_api_creds(api_username_ui, api_pw_ui):
57 | api_username = api_username_ui.value
58 | if len(api_username) == 0:
59 | api_username = dcat_config["IRIS_API_USERNAME"]
60 | api_key = api_pw_ui.value
61 | if len(api_key) == 0:
62 | api_key = dcat_config["IRIS_API_KEY"]
63 | return api_username, api_key
64 |
65 |
66 | def query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui):
67 | api_username, api_key = get_rest_api_creds(api_username_ui, api_pw_ui)
68 | api = API(api_username, api_key)
69 | if len(domain_list_ui.value) > 0:
70 | # split list of domains into groups of 100 because of API restrictions
71 | results = []
72 | full_domain_list = clean_domain_list(domain_list_ui)
73 | max_domains = 100
74 | start = 0
75 | end = max_domains
76 | for _ in range(math.ceil(len(full_domain_list) / max_domains)):
77 | # slice out max domains to query
78 | partial_domain_list = full_domain_list[start:end]
79 | # build query string
80 | domain_list = ",".join(partial_domain_list)
81 | iris_query = {"domains": domain_list}
82 | # query rest api
83 | print(f"...querying Iris REST API for {len(partial_domain_list)} domains")
84 | iris_results = api.iris_investigate(**iris_query)
85 | # build up the set of return domain objects
86 | results += iris_results.response().get('results', {})
87 | # update slice indexes
88 | start = end
89 | end += max_domains
90 | return results
91 | elif len(search_hash_ui.value) > 0:
92 | iris_query = {"search_hash": search_hash_ui.value}
93 | iris_results = api.iris_investigate(**iris_query)
94 | # print(iris_results.status)
95 | iris_results = iris_results.response().get('results', {})
96 | return iris_results
97 | else:
98 |         print(
99 |             "Domain List and Search Hash text boxes are empty. Please enter either a list of domains or a search hash to look up")
100 | raise Exception("Domain List and Search Hash text boxes are empty")
101 |
102 |
103 | class Config(object):
104 | """ Little helper class to hold all the config values"""
105 |
106 |
107 | class Domain(object):
108 | """ Little helper class to hold the domain name and risk score
109 | """
110 |
111 | def __init__(self, domain_json):
112 | self.json = domain_json
113 | self.name = domain_json["domain"]
114 | self.risk_score = domain_json["domain_risk"]['risk_score']
115 | self.pivots = {}
116 | self.label = f"{self.name} ({self.risk_score})"
117 |
118 | def __str__(self):
119 | return f"name: {self.name}, risk: {self.risk_score}"
120 |
121 | def __repr__(self):
122 | return str(self)
123 |
124 |
125 | class DomainRelationship(object):
126 | def __init__(self, weight: float, category: str):
127 | # this is the maximum weight that an edge can have.
128 | # Adjust this if you want to play around with stronger edge weights
129 | self.max_weight = 5.0
130 | self.weight = weight
131 | self.categories = [category]
132 |
133 | def __str__(self):
134 | return f"weight: {self.weight}, categories: {self.categories}"
135 |
136 | def __repr__(self):
137 | return str(self)
138 |
139 | def add(self, weight: float, category: str):
140 | """ Note: certain pivot categories can be added more than once for 2 domains;
141 | things like IP and name server. For example, two domains could be on the same set of 5
142 |         IP addresses. For now the weights are just summed if there is more than one pivot of
143 | the same category, but maybe we need a different strategy. Since IPs have multiple pivots
144 | (ip address, country code, asn, isp) this means if there were 5 shared IPs between two
145 | domains, the weight would be: 4 * 5 * pivot_weight.
146 |         This might over-amplify the edge strength.
147 | """
148 | if category not in self.categories:
149 | # this helps by not overly boosting the edge weight if two domains share
150 |             # multiple IP addresses
151 | self.weight += weight
152 | self.weight = min(self.weight, self.max_weight)
153 | self.categories.append(category)
154 |
155 | def get_description(self):
156 |         return "<br>".join(sorted(self.categories))
157 |
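# A minimal sketch (not part of the original workflow) showing how the weight cap
# in DomainRelationship.add() keeps many shared pivots from dominating a single
# edge. The pivot categories below are hypothetical illustration data.
def _demo_relationship_weight_cap():
    rel = DomainRelationship(2.0, "ip_address")
    rel.add(2.0, "ip_country_code")  # new category: weights sum to 4.0
    rel.add(2.0, "ip_isp")           # sum would be 6.0, clamped to max_weight 5.0
    rel.add(2.0, "ip_isp")           # repeated category: ignored, weight stays 5.0
    return rel.weight, rel.categories  # (5.0, ['ip_address', 'ip_country_code', 'ip_isp'])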
158 |
159 | class Pivot(object):
160 | def __init__(self, category, value, global_count):
161 | self.category = category
162 | self.value = value
163 | self.global_count = global_count
164 | self.domains = set()
165 |
166 | # def union(self, other: "Pivot"):
167 | # self.domains.union(other.domains)
168 |
169 | def label(self):
170 | # return f"category: {self.category}: value: {self.value} ({self.global_count})"
171 | return f"{self.category}: {self.value} ({self.global_count})"
172 |
173 | def __str__(self):
174 | return f"category: {self.category}, " \
175 | f"value: {self.value}, " \
176 | f"global_count: {self.global_count}, " \
177 | f"domains: {self.domains}"
178 |
179 | def __repr__(self):
180 | return str(self)
181 |
182 |
183 | # build graph
184 | def get_edge_count(n: int):
185 | # for a complete graph, the edge count is: n(n-1)/2
186 | return n * (n - 1) / 2
187 |
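# Worked example: 5 fully connected domains yield 5 * 4 / 2 = 10 edges, while
# 100 domains yield 100 * 99 / 2 = 4950 -- quadratic growth, which is why the
# pivot and edge thresholds in Config matter for keeping the graphs readable.
#   get_edge_count(5)    # -> 10.0
#   get_edge_count(100)  # -> 4950.0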
188 |
189 | # def pivot_on_matching_substrings(graph: "Graph", domains: dict, config: "Config"):
190 | # """Create pivots between domains that share a common substring of
191 | # `config.longest_common_substring` chars long.
192 | #
193 | # Note: SequenceMatcher has some known issues with not finding the longest match in very long
194 | # strings, but does a pretty good job with shorter strings such as domain names.
195 | # https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings
196 | # """
197 | # domain_names = list(domains.keys())
198 | # for x in range(len(domain_names)):
199 | # domain1 = domain_names[x]
200 | # string1 = domain1.split('.')[0]
201 | # # pull out substrings to ignore
202 | # if config.ignore_substrings and len(config.ignore_substrings) > 0:
203 | # for ignore in config.ignore_substrings:
204 | # string1 = string1.replace(ignore, "")
205 | # for y in range(x + 1, len(domain_names)):
206 | # domain2 = domain_names[y]
207 | # string2 = domain2.split('.')[0]
208 | # # pull out substrings to ignore
209 | # if config.ignore_substrings and len(config.ignore_substrings) > 0:
210 | # for ignore in config.ignore_substrings:
211 | # string2 = string2.replace(ignore, "")
212 | # # find the longest common substring between the two domains
213 | # matcher = SequenceMatcher(None, string1, string2, False)
214 | # match = matcher.find_longest_match(0, len(string1), 0, len(string2))
215 | # longest_match = string1[match.a: match.a + match.size]
216 | # # check if the matching substring is long enough
217 | # if len(longest_match) >= config.longest_common_substring:
218 | # # add pivots
219 | # _append_value_to_pivot(
220 | # graph,
221 | # "longest_common_substring",
222 | # longest_match, None,
223 | # domains[domain1], config)
224 | # _append_value_to_pivot(
225 | # graph,
226 | # "longest_common_substring",
227 | # longest_match, None,
228 | # domains[domain2], config)
229 |
230 |
231 | def build_pivot_graph(iris_results: list, config: "Config"):
232 |     """ Main workflow function that takes the results from an Iris Investigate query and
233 |     builds a graph of how the domains in the query are connected to each other"""
234 |
235 | # parse the Iris API Result to build the pivot data structure
236 | graph, domains = init_local_pivot_graph(iris_results, config)
237 | print(len(graph.nodes))
238 | print()
239 |
240 | # normalize registrar pivots (see note in function comments)
241 | # if "registrar" in pivot_categories and config.normalize_registrars:
242 | # normalize_similar_registrars(pivot_categories["registrar"])
243 |
244 | # create pivots for longest common substrings
245 | # pivot_on_matching_substrings(graph, domains, config)
246 | # print(len(graph.nodes))
247 | # print()
248 |
249 | # trim pivots from graph that have less than the set count threshold or contain all domains
250 | # graph = trim_pivots(graph, len(domains), config)
251 | # print(len(graph.nodes))
252 | # print()
253 |
254 | # trim unconnected domains and domains with only a create date pivot
255 | # TURBO: I'm not sure yet how to do this
256 | # trimmed_unconnected_domains = trim_unconnected_domains(graph, domains, config)
257 | # print(len(graph.nodes))
258 | # print()
259 |
260 | # trimmed_create_date_domains = trim_domains_with_only_create_date_pivot(graph, pivot_categories)
261 | # print(len(graph.nodes))
262 | # print()
263 |
264 | # print(f"{len(trimmed_unconnected_domains)} "
265 | # f"domains trimmed because they were not connected to other domains")
266 | # print(f"{len(trimmed_create_date_domains)} "
267 | # f"domains trimmed because create_date was the only pivot")
268 | print(f"{len(graph.nodes)} nodes in graph structure \n")
269 |
270 | # build the graph structure based on the domain pivots
271 | graph = build_local_pivot_graph(graph, domains, config)
272 | return (graph, domains,
273 | {
274 | # "unconnected": trimmed_unconnected_domains,
275 | # "create_date": trimmed_create_date_domains
276 | }
277 | )
278 |
279 |
280 | def get_pivots(data_obj, name, return_data=None, count=0, pivot_threshold=500):
281 | """
282 |     Recursively walks a nested data object, collecting (value, count) pairs whose count is greater than 1 and below the pivot threshold.
283 |     Args:
284 |         data_obj: Either a list or dict to inspect for pivot counts
285 |         name: pivot category name
286 |         return_data: Accumulates the pivots to return once the whole data_obj has been walked
287 |         count: Tracks recursion depth so we know when the top-level data_obj is finished
288 |         pivot_threshold: Maximum global count for a value to be included as a pivot.
289 | """
290 | if return_data is None:
291 | return_data = []
292 | count += 1
293 | if isinstance(data_obj, dict) and len(data_obj):
294 | temp_name = name
295 | for k, v in data_obj.items():
296 | if isinstance(data_obj[k], (dict, list)):
297 | name = "{}_{}".format(name, k)
298 | temp_data = get_pivots(
299 | data_obj[k], name, return_data, count, pivot_threshold
300 | )
301 | if temp_data:
302 | return_data.append([name[1:].upper().replace("_", " "), temp_data])
303 | name = temp_name
304 | if "count" in data_obj and (1 < data_obj["count"] < pivot_threshold):
305 | return data_obj["value"], data_obj["count"]
306 | elif isinstance(data_obj, list) and len(data_obj):
307 | for index, item in enumerate(data_obj):
308 | temp_data = get_pivots(item, name, return_data, count, pivot_threshold)
309 | if temp_data:
310 | if isinstance(temp_data, list):
311 | for x in temp_data:
312 | return_data.append(x)
313 | elif isinstance(temp_data, tuple):
314 | return_data.append([name[1:].upper().replace("_", " "), temp_data])
315 | count -= 1
316 | if count:
317 | return
318 | else:
319 | return return_data
320 |
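# A minimal usage sketch with illustrative data (not the full Iris API schema):
# get_pivots() flattens a nested Iris result into [CATEGORY, (value, count)]
# pairs, keeping only values whose global count is above 1 and below the threshold.
#
#   domain = {"ip": [{"address":      {"value": "1.2.3.4", "count": 42},
#                     "country_code": {"value": "US", "count": 900000}}]}
#   get_pivots(domain, "", pivot_threshold=500)
#   # -> [['IP ADDRESS', ('1.2.3.4', 42)]]  (country_code exceeds the threshold)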
321 |
322 | def build_infra_graph(iris_results: list, config: "Config"):
323 | graph = nx.Graph()
324 | pv_dict = {}
325 | config.domain_risk_dict = {}
326 | for domain in iris_results:
327 | if domain["domain"] not in config.domain_risk_dict:
328 | config.domain_risk_dict[domain["domain"]] = domain.get("domain_risk", {}).get("risk_score", 0)
329 | # GET PIVOTS
330 | nps = get_pivots(domain, "", pivot_threshold=config.pivot_threshold)
331 | pv_list = []
332 | for p in nps:
333 | if p[0] not in config.exclude_list:
334 | pv_list.append("{}_{}".format(p[0], p[1][0]))
335 | # CREATE POSSIBLE NODES AND POSSIBLE EDGES
336 | x = itertools.combinations(pv_list, 2)
337 | for g in x:
338 | if "{}:::{}".format(g[0], g[1]) in pv_dict:
339 | if domain["domain"] not in pv_dict["{}:::{}".format(g[0], g[1])]:
340 | pv_dict["{}:::{}".format(g[0], g[1])].append(domain["domain"])
341 | else:
342 | pv_dict["{}:::{}".format(g[0], g[1])] = [domain["domain"]]
343 |
344 | b_pv_list = []
345 | my_set = set()
346 |
347 | # FILTER OUT EDGES THAT DON'T MEET THRESHOLD
348 | for k, v in pv_dict.items():
349 | if len(v) > config.edge_threshold:
350 | a = k.split(":::")
351 | b_pv_list.append([a[0], a[1], v, len(v)])
352 | my_set.add(a[0])
353 | my_set.add(a[1])
354 | # print(k, v, len(v))
355 |
356 | # CREATE NODES
357 | for m in my_set:
358 | graph.add_node(m, color='blue', size=0)
359 |
360 | # CREATE EDGES
361 | for m in b_pv_list:
362 | graph.add_edge(m[0], m[1], domains=m[2], length=m[3])
363 | return graph, config
364 |
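# Sketch of the ":::" edge-key scheme used above, with hypothetical pivot values:
# every pair of pivots seen on the same domain becomes one key in pv_dict, and the
# domains sharing that pair become edge data once len(v) clears config.edge_threshold.
#
#   pv_list = ["IP ADDRESS_1.2.3.4", "NAME SERVER HOST_ns1.example.net"]
#   # itertools.combinations(pv_list, 2) yields one pair, stored under the key
#   # "IP ADDRESS_1.2.3.4:::NAME SERVER HOST_ns1.example.net"; splitting the key
#   # back on ":::" turns the two pivot values into nodes joined by an edge.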
365 |
366 | def build_pair_infra_graph(iris_results: list, config: "Config"):
367 | graph = nx.Graph()
368 | pv_dict = {}
369 | config.domain_risk_dict = {}
370 | for domain in iris_results:
371 | if domain["domain"] not in config.domain_risk_dict:
372 | config.domain_risk_dict[domain["domain"]] = domain.get("domain_risk", {}).get("risk_score", 0)
373 | # GET PIVOTS
374 | nps = get_pivots(domain, "", pivot_threshold=config.pivot_threshold)
375 | pv_list = [
376 | "{}_{}".format(p[0], p[1][0])
377 | for p in nps
378 | if p[0] not in config.exclude_list
379 | ]
380 |
381 | # CREATE POSSIBLE NODES AND POSSIBLE EDGES
382 | x = itertools.combinations(pv_list, 2)
383 | # print(x)
384 | i_list = []
385 | for g in x:
386 | # print("{}:::{}".format(g[0], g[1]))
387 | if "{}:::{}".format(g[0], g[1]) not in i_list and g[0] != g[1]:
388 | i_list.append("{}:::{}".format(g[0], g[1]))
389 | y = itertools.combinations(i_list, 2)
390 | for g in y:
391 |
392 | if "{}|||{}".format(g[0], g[1]) in pv_dict:
393 | if domain["domain"] not in pv_dict["{}|||{}".format(g[0], g[1])]:
394 | pv_dict["{}|||{}".format(g[0], g[1])].append(domain["domain"])
395 | else:
396 | pv_dict["{}|||{}".format(g[0], g[1])] = [domain["domain"]]
397 | # print(pv_dict)
398 | b_pv_list = []
399 | my_set = set()
400 |
401 | # FILTER OUT EDGES THAT DON'T MEET THRESHOLD
402 | for k, v in pv_dict.items():
403 | if len(v) > config.edge_threshold:
404 | a = k.split("|||")
405 | if a[0] != a[1]:
406 | b_pv_list.append([a[0], a[1], v, len(v)])
407 | my_set.add(a[0])
408 | my_set.add(a[1])
409 | # print(k, v, len(v))
410 |
411 | # CREATE NODES
412 | for m in my_set:
413 | graph.add_node(m, color='blue', size=0)
414 |
415 | # CREATE EDGES
416 | for m in b_pv_list:
417 | graph.add_edge(m[0], m[1], domains=m[2], length=m[3])
418 | return graph, config
419 |
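# Sketch of the pair-graph keying above (hypothetical values): each node is itself
# a ":::"-joined pivot pair, and "|||" joins two such pairs into an edge key, so
# edges connect pivot pairs that co-occur on the same domains.
#
#   pair_a = "IP ADDRESS_1.2.3.4:::NAME SERVER HOST_ns1.example.net"
#   pair_b = "IP ADDRESS_1.2.3.4:::REGISTRAR_some registrar"
#   # pv_dict key: pair_a + "|||" + pair_b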
420 |
421 | def calc_viz_layout(layout: str, graph: "Graph", dimension: int):
422 | # KK layout only
423 | if layout == "kk":
424 | return nx.layout.kamada_kawai_layout(graph, dim=dimension)
425 |
426 | # spring layout only
427 | if layout == "fr":
428 | return nx.layout.spring_layout(graph, dim=dimension)
429 |
430 | # kk layout as initialization for spring layout
431 | if layout == "kk_to_fr":
432 | pos = nx.layout.kamada_kawai_layout(graph, dim=dimension, weight=None)
433 | return nx.layout.spring_layout(graph, pos=pos, dim=dimension)
434 |
435 | # spring layout as initialization for kk layout
436 | if layout == "fr_to_kk":
437 | pos = nx.layout.spring_layout(graph, dim=dimension)
438 | return nx.layout.kamada_kawai_layout(graph, pos=pos, dim=dimension)
439 | raise Exception("invalid layout choice")
440 |
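# Usage sketch: both layout builders below call this with "kk_to_fr", seeding the
# Fruchterman-Reingold spring layout with Kamada-Kawai positions, which tends to
# untangle dense graphs better than either algorithm alone.
#
#   g = nx.complete_graph(4)
#   pos = calc_viz_layout("kk_to_fr", g, dimension=3)
#   # pos maps each node to an array of 3 coordinates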
441 |
442 | def average_risk_score(domain_list, domain_dict):
443 | total = sum(domain_dict[d] for d in domain_list)
444 | avg_risk_score = int(total / len(domain_list))
445 | # print(avg_risk_score)
446 | if avg_risk_score >= 90:
447 | color = 'red'
448 | elif avg_risk_score >= 75:
449 | color = 'orange'
450 | elif avg_risk_score >= 55:
451 | color = 'yellow'
452 | else:
453 | color = 'green'
454 | return color, avg_risk_score
455 |
456 |
457 | def build_3d_graph_layout(graph: "Graph", config):
458 | """ Build the graph layout based on the specified algorithm and get the node positions
459 | in xyz dimensions"""
460 |
461 | pos = calc_viz_layout("kk_to_fr", graph, 3)
462 |
463 |     node_labels, node_risk_scores, node_size, names = [], [], [], []
464 |     Xn, Yn, Zn = [], [], []
465 | for node in graph.nodes(data=True):
466 | # build x,y,z coordinates data structure for nodes
467 | Xn.append(pos[node[0]][0])
468 | Yn.append(pos[node[0]][1])
469 | Zn.append(pos[node[0]][2])
470 | domain_set = set()
471 | for e in graph.edges(node[0], data=True):
472 | domain_set.update(e[2]['domains'])
473 | domain_list = list(domain_set)
474 | color, avg_risk_score = average_risk_score(domain_list, config.domain_risk_dict)
475 |         node_labels.append(
476 |             "{}<br>Avg Risk Score: {}<br>Number of unique domains on edges: {}".format(
477 |                 node[0], avg_risk_score, len(domain_list)))
478 | node_risk_scores.append(color)
479 | node_size.append(len(domain_list))
480 | names.append(domain_list)
481 |
482 | if not config.node_size:
483 | node_size = 6
484 |
485 | # build x,y,z coordinates data structure for edges
486 | Xe, Ye, Ze = [], [], []
487 | for e in graph.edges:
488 | u = pos[e[0]]
489 | v = pos[e[1]]
490 | Xe += [u[0], v[0], None]
491 | Ye += [u[1], v[1], None]
492 | Ze += [u[2], v[2], None]
493 |
494 | # Create the 3d Plotly graph and render it
495 | # build line objects for our edges
496 | trace1 = go.Scatter3d(x=Xe, y=Ye, z=Ze,
497 | mode='lines',
498 | name='domains',
499 | line=dict(color='rgb(125,125,125)', width=0.5),
500 | opacity=0.9,
501 | hoverinfo='none')
502 |
503 | trace2 = go.Scatter3d(
504 | x=Xn, y=Yn, z=Zn,
505 | mode='markers',
506 | name='pivots',
507 | marker=dict(
508 | symbol='circle',
509 | size=node_size,
510 | color=node_risk_scores,
511 | line=dict(color='rgb(50,50,50)', width=0.5),
512 | ),
513 | text=node_labels,
514 | hoverinfo='text')
515 |
516 | # background definition, but everything is turned off
517 | axis = dict(showbackground=False,
518 | showline=False,
519 | zeroline=False,
520 | showgrid=False,
521 | showticklabels=False,
522 | title='')
523 |
524 | layout = go.Layout(
525 | title=f"Graph of interconnected infrastructure ({len(node_labels)} infra nodes)",
526 | width=1000, height=1000,
527 | showlegend=False,
528 | scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)),
529 | margin=dict(t=100), hovermode='closest')
530 |
531 | data = [trace1, trace2]
532 | fig = go.FigureWidget(data=data, layout=layout)
533 |
534 | # handle selection of domains
535 | # def node_selection_fn(trace, points, selector):
536 | # selected_domains = [names[idx] for idx in points.point_inds]
537 | # update_selected_domains(selected_domains)
538 |
539 | # handle node click events
540 | def node_click_fn(trace, points, selector):
541 | if len(points.point_inds) > 1:
542 | print(f"node_click passed in more than 1 point: {points.point_inds}")
543 |
544 | # clear the old selected points
545 | # trace.selectedpoints = []
546 | # if len(points.point_inds) == 0:
547 | # return
548 |
549 | # get the list of selected domain names
550 | selected_domains = [names[idx] for idx in points.point_inds]
551 | # for id in points.point_inds:
552 | # selected_domains = selected_domains + trace.customdata[id]
553 |
554 | # set the new selected points
555 | # don't like having to loop in a loop to get the domain index, but I don't know a better way
556 | # trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[id]]
557 |
558 | update_selected_domains(selected_domains)
559 |
560 | def update_selected_domains(selected_domains):
561 | if len(selected_domains) == 0:
562 | return
563 |
564 | # sort domains by length, then alpha
565 | selected_domains.sort(key=len, reverse=True)
566 | with out:
567 | # write selected domains to the output widget
568 | print(f"Selected Infra: ({len(selected_domains)})\n")
569 | for selected_domain in selected_domains:
570 | print(selected_domain)
571 | out.clear_output(wait=True)
572 |
573 | # calc pivots selected domains have in common
574 | # get_2d_shared_pivots(graph, selected_domains)
575 |
576 | # event handler for node selection
577 | # fig.data[1].on_selection(node_selection_fn)
578 | # event handle for node click
579 | fig.data[1].on_click(node_click_fn)
580 |
581 | # Create a table FigureWidget that updates the list of selected domains
582 | out = widgets.Output(layout={'border': '1px solid black'})
583 | domain_ui = widgets.VBox((fig, out))
584 | return domain_ui
585 |
586 |
587 | def build_2d_graph_layout(graph: "Graph", config):
588 |     """ Build the graph layout based on the specified algorithm and get the node positions
589 | in xy dimensions"""
590 | pos = calc_viz_layout("kk_to_fr", graph, 2)
591 | # pos = calc_viz_layout("fr_to_kk", g, 2)
592 |
593 | # build edge data
594 | edge_x, edge_y = [], []
595 | for e in graph.edges():
596 | x0, y0 = pos[e[0]]
597 | x1, y1 = pos[e[1]]
598 | edge_x.append(x0)
599 | edge_x.append(x1)
600 | edge_x.append(None)
601 | edge_y.append(y0)
602 | edge_y.append(y1)
603 | edge_y.append(None)
604 |
605 | # create edge scatter plot
606 | edge_trace = go.Scatter(
607 | x=edge_x, y=edge_y,
608 | line=dict(width=0.5, color='#888'),
609 | hoverinfo='none',
610 | mode='lines',
611 | opacity=0.6
612 | )
613 |
614 | # build node data
615 |     node_adjacencies, node_risk_scores, node_labels, node_size, node_x, node_y = [], [], [], [], [], []
616 | names = list(graph.nodes)
617 | for name in graph.nodes(data=True):
618 | domain = graph.nodes[name[0]]
619 | x, y = pos[name[0]]
620 | node_x.append(x)
621 | node_y.append(y)
622 | # get the domain's connected nodes
623 | neighbors = list(graph.neighbors(name[0]))
624 | node_adjacencies.append(neighbors)
625 | domain_set = set()
626 | for e in graph.edges(name[0], data=True):
627 | domain_set.update(e[2]['domains'])
628 | domain_list = list(domain_set)
629 | color, avg_risk_score = average_risk_score(domain_list, config.domain_risk_dict)
630 |         node_labels.append(
631 |             "{}<br>Avg Risk Score: {}<br>Number of unique domains on edges: {}".format(
632 |                 name[0], avg_risk_score, len(domain_list)))
633 | node_risk_scores.append(color)
634 | node_size.append(len(domain_list))
635 | names.append(domain_list)
636 |
637 | if not config.node_size:
638 | node_size = 6
639 |
640 | # build node scatter plot
641 | node_trace = go.Scatter(
642 | x=node_x, y=node_y,
643 | mode='markers',
644 | hoverinfo='text',
645 | text=node_labels,
646 | customdata=node_adjacencies,
647 | marker=dict(
648 | showscale=True,
649 | reversescale=True,
650 | color=node_risk_scores,
651 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']],
652 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color
653 | cmin=0, cmax=100,
654 | size=node_size,
655 | colorbar=dict(
656 | thickness=15,
657 | title='Risk Score',
658 | xanchor='left',
659 | titleside='right'
660 | ),
661 | line_width=2))
662 |
663 | # create the jup widget holder for plotly
664 | fig = go.FigureWidget(
665 | [edge_trace, node_trace],
666 | layout=go.Layout(
667 | title=f'Graph of interconnected infrastructure ({len(node_labels)} infra nodes)',
668 | titlefont_size=16,
669 | showlegend=False,
670 | hovermode='closest',
671 | margin=dict(b=5, l=5, r=5, t=30),
672 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
673 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
674 | )
675 |
676 | # handle selection of domains
677 | def node_selection_fn(trace, points, selector):
678 | selected_domains = [names[idx] for idx in points.point_inds]
679 | update_selected_domains(selected_domains)
680 |
681 | # handle node click events
682 | def node_click_fn(trace, points, selector):
683 | if len(points.point_inds) > 1:
684 | print(f"node_click passed in more than 1 point: {points.point_inds}")
685 |
686 | # clear the old selected points
687 | trace.selectedpoints = []
688 | if len(points.point_inds) == 0:
689 | return
690 |
691 | # get the list of selected domain names
692 | selected_domains = [names[idx] for idx in points.point_inds]
693 | for id in points.point_inds:
694 | selected_domains = selected_domains + trace.customdata[id]
695 |
696 | # set the new selected points
697 | # don't like having to loop in a loop to get the domain index, but I don't know a better way
698 | trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[id]]
699 |
700 | update_selected_domains(selected_domains)
701 |
702 | def update_selected_domains(selected_domains):
703 |         if len(selected_domains) == 0:
704 |             return
705 |
706 | # sort domains by length, then alpha
707 | selected_domains.sort(key=len, reverse=True)
708 | with out:
709 | # write selected domains to the output widget
710 | print(f"Selected Infra: ({len(selected_domains)})\n")
711 | for selected_domain in selected_domains:
712 | print(selected_domain)
713 | out.clear_output(wait=True)
714 |
715 |
716 | # event handler for node selection
717 | fig.data[1].on_selection(node_selection_fn)
718 | # event handle for node click
719 | fig.data[1].on_click(node_click_fn)
720 |
721 | # Create a table FigureWidget that updates the list of selected domains
722 | out = widgets.Output(layout={'border': '1px solid black'})
723 | domain_ui = widgets.VBox((fig, out))
724 | return domain_ui
725 |
726 |
727 | def get_shared_pivots(graph: "Graph", selected_domains: list):
728 | shared_pivots = {}
729 | for name in selected_domains:
730 | domain = graph.nodes[name]["domain"]
731 | for cat in domain.pivot_categories:
732 | for cat_value in domain.pivot_categories[cat]:
733 | key = f"{cat}: {cat_value}"
734 | if key not in shared_pivots:
735 | shared_pivots[key] = []
736 | shared_pivots[key].append(domain)
737 |
738 | # filter by pivots that have >= n domains
739 | shared_pivots = {k: v for k, v in shared_pivots.items() if len(v) >= 3}
740 | return shared_pivots
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | ipywidgets>=7.5
3 | networkx
4 | plotly==4.14.3
5 | tabulate
6 | numpy
7 | scipy
8 | matplotlib
9 | pandas
10 | python-dotenv
11 | domaintools-api
--------------------------------------------------------------------------------