├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── data
│   ├── dash_gov_dot_us.json
│   └── treatment_care.json
├── documentation
│   ├── todo.md
│   └── tutorial.md
├── domain_cat.ipynb
├── domain_cat_module.py
├── images
│   ├── 2d_click.gif
│   ├── 2d_v1.png
│   ├── 2d_zoom.gif
│   ├── 2d_zoom_select.gif
│   ├── 3d_infra.gif
│   ├── 3d_v1.gif
│   ├── 3d_v2.gif
│   ├── build_graph.png
│   ├── config.png
│   ├── credentials.png
│   ├── dash_gov.us_substrings.gif
│   ├── domain_data.png
│   ├── domain_graph_2d.png
│   ├── intro_3d.gif
│   ├── iris.png
│   ├── iris_small.png
│   ├── jupyter_cell.png
│   ├── pivot_heatmap.png
│   ├── pivot_stats.png
│   ├── pivot_value_heatmap.png
│   ├── reading_data.png
│   ├── run_3d.png
│   ├── run_heatmap.png
│   ├── running_a_cell.gif
│   ├── selected_domains.png
│   └── trimmed_domains.png
├── infrastructure_cat.ipynb
├── infrastructure_cat_module.py
└── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# dotenv file
.env

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# PyCharm
.idea/

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:latest
# install the basics
RUN apt-get update && apt-get -y upgrade
RUN apt-get install -y build-essential python3 python3-pip python3-dev
RUN pip3 -q install pip --upgrade

# install nodejs v12
RUN apt-get install -y curl dirmngr apt-transport-https lsb-release ca-certificates
RUN curl -sL https://deb.nodesource.com/setup_12.x | bash -
RUN apt-get install -y nodejs
RUN apt-get install -y gcc g++ make
RUN node --version
RUN npm --version

# copy dependency files
RUN mkdir src
WORKDIR src/
COPY requirements.txt .

# install Jupyter, domaincat requirements, and widget extensions
RUN pip3 install -r requirements.txt
# NODE_OPTIONS must be set via ENV: a `RUN export` only lasts for that single RUN shell
ENV NODE_OPTIONS=--max-old-space-size=4096
RUN jupyter labextension install jupyterlab-plotly@4.14.3 --no-build
RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build
RUN jupyter labextension install plotlywidget@4.14.3 --no-build
RUN jupyter lab build --dev-build=False --minimize=False
RUN npm cache clean --force
ENV NODE_OPTIONS=

# copy the rest of the files
COPY . .

# Run jupyter lab
CMD ["jupyter", "lab", "--port=9999", "--no-browser", "--ip=0.0.0.0", "--allow-root"]

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 DomainTools

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DomainCAT (Domain Connectivity Analysis Tool)

## "See Connections Between Domains Right Meow"

**The Domain Connectivity Analysis Tool is used to analyze aggregate connectivity patterns across a set of domains during security investigations**

This project was a collaborative effort between [myself](https://www.linkedin.com/in/jconwell/) and [Matthew Pahl](https://www.linkedin.com/in/security-sme/)

## Introduction

When analyzing pivots during threat hunting, most people approach it from the perspective of "what can a single pivot tell you?" But often actors will set their domains up to use commodity hosting infrastructure, so the number of entities associated with a given pivot is so large that it doesn't really give you any useful information.

This is where DomainCAT can help. Actors make decisions around domain registration and hosting options when setting up their malicious infrastructure. These can be considered behavioral choices.
- What registrar(s) do they use?
- What TLDs do they prefer?
- What hosting provider(s) do they like?
- What TLS cert authority do they use?

All of these decisions, together, make up part of that actor's infrastructure tools, tactics and procedures (TTPs), and we can analyze them as a whole to look for patterns across a set of domains.

### But wait, there's more

### Introducing InfraCAT

What if, instead of nodes being domains, they were the infrastructure, and the edges were the connected domains? That was the thought process behind InfraCAT. By seeing clusters of infrastructure, you can see tightly coupled groups of domains based on the infrastructure they use.

DomainCAT and InfraCAT are tools written in Jupyter Notebooks, a web-based interactive environment that lets you combine text, code, data, and interactive visualizations into your threat hunting toolbelt. The tool analyzes aggregate connectivity patterns across a set of domains, looking at every pivot for every domain and asking: what are the shared pivots across these domains, how many shared pivots are there between each domain, and do they have a small pivot count or a really large one? All of these aspects are taken into consideration as it builds out a connectivity graph that models how connected all the domains in an Iris search are to each other.

### Example Visualizations:

#### 3D visualization of domain to domain connections based on shared infrastructure, registration and naming patterns
![SegmentLocal](images/intro_3d.gif "segment")

#### 2D visualization of domain to domain connections
![domain_graph2d.png](images/2d_zoom.gif "segment")

#### 3D visualization of infra to infra connections
![3dinfra](images/3d_infra.gif)

## DomainCAT Tutorial

#### Click here for the [DomainCAT Tutorial](documentation/tutorial.md) documentation

## Installation Steps: Docker (recommended)

_Note: building the container takes a bit of RAM to compile the resources for the jupyterlab-plotly extension. Bump up your RAM in Docker preferences to around 4GB while building the container.
Then afterwards you can drop it back down to your normal level to run the container._

### Steps:

Clone the git repository locally

`$ git clone https://github.com/DomainTools/DomainCAT.git`

Change directory to the domaincat folder

`$ cd domaincat`

Build the jupyter notebook container

`$ docker build --tag domaincat .`

Run the jupyter notebook

`$ docker run -p 9999:9999 -v $(pwd)/data:/src/data --name domaincat domaincat`

Mounting the data directory as a volume allows you to add new files to the container without having to rebuild it.

## Installation Steps: Manual (cross your fingers)

_Note: this project uses JupyterLab Widgets, which requires nodejs >= 12.0.0 to be installed...which is on you_

### Steps:

Clone the git repository locally

`$ git clone https://github.com/DomainTools/DomainCAT.git`

Change directory to the domaincat folder

`$ cd domaincat`

Install python libraries

`$ pip install -r requirements.txt`

Install the JupyterLab widget extensions

```
$ jupyter labextension install jupyterlab-plotly@4.14.3 --no-build
$ jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build
$ jupyter labextension install plotlywidget@4.14.3 --no-build
$ jupyter lab build
```

Run the jupyter notebook

`$ jupyter lab`

___

# Release Notes:

October 25, 2021:
- Initial support for InfraCAT

August 24, 2021:
- Added a way to remove domains in the graph that you aren't interested in (look at the bottom of the notebook)
- Refactored the backend data structures to be a bit more efficient

April 27, 2021:
- Added support for `dotenv` to store REST API credentials in a `.env` file
- Added logic to support
  - comma delimited list of domains
  - domains defanged with square brackets

April 23, 2021:
- Added config flag to only analyze active domains
- Show count of selected domains

April 19, 2021:
- Bug fix to not normalize risk score values when calculating node color
- Mo'better sorting of selected domains

April 15, 2021:
- Bug fix: wrong json element returned when querying search hash

April 14, 2021:
- Added UI to search either a list of domain names or an Iris search hash
- Added UI to enter Iris REST API username and password

April 7, 2021:
- Initial commit

___

_Plotly Bug: in the 2D visualization of the domain graph there is a weird bug in the `Plotly` visualization library where, if your cursor is directly over the center of a node, the node's tooltip with the domain's name will disappear, and if you click the node, it unselects all nodes.
So only click on a node if you see its tooltip._

--------------------------------------------------------------------------------
/documentation/todo.md:
--------------------------------------------------------------------------------

## DomainCAT To Do Tasks
- refactor the code to use the graph data structure as much as possible, and less of the domain_list map
- figure out how to make the create_date pivot a window over n days vs just 1 day
- prune connections that are below some weight threshold
- refactor the append_values_with_count(s) functions to share logic
- figure out a better way to normalize registrars
- create a way to type a domain name and select that domain
- create a way to type a pivot (category or value?) and select all domains that are connected
- add every pivot possible. I mostly skipped the whois pivots because they aren't that useful anymore
- address the comment in DomainRelationship.add. Essentially, domains that share 2 or more IP addresses could potentially have their edge strength artificially boosted
- maybe play around with normalizing edge weights once the graph is created, but before rendering

## Bugs to Fix

## Wish List

When looking at domains that are probably related and created over a short period of time, it would be useful to have some viz that shows / groups the pivots per create date. That way you could see stuff like: on day 1, TLD1 and registrar1 were used; on day 2, TLD1 and registrar2 were used; on day 3, TLD2 and registrar2 were used. That kind of thing.

Given a selection of domains, show what attributes they are NOT connected on

Date range of domains

Timeline view that shows how tightly or loosely connected the domains are for each day or week

Auto identify the clusters and show the pivot table for each cluster

Auto-discover substrings

--------------------------------------------------------------------------------
/documentation/tutorial.md:
--------------------------------------------------------------------------------

# DomainCAT Tutorial

## Overview
DomainCAT is a cybersecurity analysis tool used to explore domain to domain connectivity within a set of domains. It's useful for identifying clusters of related domains that share some common registration, naming, or infrastructure pattern during a cybersecurity investigation such as threat hunting.

It does this by analyzing the aggregate connectivity patterns across the set of domains, looking at every pivot for every domain and asking: what are the shared pivots across these domains, how many shared pivots are there between each domain, and do they have a small pivot count or a really large one? All of these aspects are taken into consideration as it builds out a connectivity graph that models how connected all the domains are to each other.

Imagine running a search in the DomainTools Iris UI and getting 300 domains

DomainTools Iris

and turning them into an interactive 3D graph visualization where you can explore them by rotating, panning, and zooming into the data to identify interesting subsets of domains that are connected to each other.

![3D Domain Connectivity Graph](../images/intro_3d.gif "segment")

Or a 2D graph visualization where you can zoom in, select a set of domains, and view exactly what pivots connect those specific domains together.
![2D Domain Connectivity Graph](../images/2d_zoom.gif "segment")

So what is a "graph" in this context? A graph is just a bunch of nodes, or circles that represent domains, connected together by edges, or gray lines that represent the pivots two domains have in common. In the graph examples above you can see that some domains group tightly together with others to create clusters. Why is this?

Pairs of domains that have more pivots in common with each other will have "stronger" edges and be closer together. This layout logic will naturally group sets of highly connected domains together into these clusters.

Pairs of domains that have only one or two pivots in common have "weaker" edges, and will be farther apart. These domains will appear farther out on the periphery of the graph. If the domain(s) you are investigating are in this set of periphery nodes, then you know right away that your search is going in the wrong direction, and you might want to go back to Iris and adjust your search criteria.

## Quick Primer on Jupyter Notebooks
DomainCAT is written in [Jupyter Notebooks](https://jupyter.org/), a web-based interactive environment that lets you combine text, code, data, and interactive visualizations all into one environment. Notebooks are broken up into cells, where each cell can have either code or text. Don't worry, you don't have to know how to code Python or make any changes to the code (mostly) to use DomainCAT, just how to run a cell.

The video below is an example of how to run a cell. Just click into a cell and you'll see a blue bar on the left that indicates the cell now has focus. Then hit Shift+Return to run the cell. There is a little empty circle in the upper right of the notebook that fills in gray while the cell is running. When the cell is finished running, the circle becomes empty again, and there will be some sort of output below the code cell showing the results of what was run.

![Running a notebook cell](../images/running_a_cell.gif)

_Note: if you happen to double-click any of the text cells, you might see the contents change font and you'll have a blinking cursor. Don't panic, this just means you are in edit mode. Just hit Shift+Return like you were running a code cell, and the text will change back to normal._

If you happen to edit the code by accident and mess things up, it's not a big deal. You can click into the cell that you changed and use the normal undo/redo hotkeys (or the Edit menu) to get the code back to where it was before you edited it. Worst case, you can just rebuild the docker container, and you'll get a new unedited notebook with all the default values.

## Initializing The Notebook

When you start the DomainCAT notebook, you'll need to click into and run the first code cell in the notebook. This will initialize all the code to do the connectivity analysis. You'll need to do this every time you start the notebook.

![Running a notebook cell](../images/running_a_cell.gif)

## Setting Iris REST API Credentials

There are two ways you can enter your Iris REST API credentials. The first is entering them into this cell. The password textbox will not show the password in clear text.
75 | 76 | Entering Iris Rest API credentials 77 | 78 | Alternatively you can create a `.env` fine in the root folder of the DomainCAT project and add the following block 79 | to it, replacing the username and password with your own 80 | 81 | ``` 82 | # IRIS REST API CREDS 83 | IRIS_API_USERNAME=some_user_name 84 | IRIS_API_KEY=some_password 85 | ``` 86 | 87 | When the Jupyter Notebook initializes, the `dotenv` library will read the `.env` file and inject the credentials 88 | into the REST API calls 89 | 90 | ## Entering Domains To Query 91 | 92 | The next step is to define the list of domains to query. This cell lets you do this in one of two ways. 93 | 94 | First, you can enter the raw list of domains into the large spare textbox shown below. The domains can be 95 | either new-line or comma delimited, and DomainCAT support defanged domains that use square brackets. 96 | 97 | Entering Domains To Query 98 | 99 | The second way is to enter an Iris investigation search hash into the second text box. This hash represents 100 | an existing investigation in Iris and will query all the domains from the investigation. 101 | 102 | ## Reading Domain Data From DomainTools REST API 103 | 104 | DomainCAT reads domain data by querying the 105 | [DomainTools Iris Investigate REST API](https://www.domaintools.com/resources/api-documentation/iris-investigate) 106 | for any investigation hash that you generate in the Iris UI. 107 | 108 | The next code cell (shown below) has the configuration used to query the Iris REST API. 109 | 110 | Running a notebook cell 111 | 112 | If you run the cell (Shift+Return) DomainCAT will query the REST api using the investigation hash are return the set of 113 | domains from the list of domains you enterd or the Iris search hash. It will also show you the number of domains loaded 114 | into the notebook. 115 | 116 | There are a couple of options to note: 117 | - `save_search_to_disk`: if you change this to `True` the search results will be saved to disk. This way you can 118 | reload the set of domains from your investigation at a later point without having to query the REST API again 119 | - `json_file_path`: this is the path and file name used to save your search results to disk for later use 120 | - `query_api`: if `True`, the REST API will query the investigation hash. If `False`, domain data will be loaded 121 | from the `json_file_path` 122 | 123 | DomainCAT ships with a demo dataset of ab called `dash_gov_dot_us.json`. This is a set of domains that use 124 | the `.us` TLD, end in `-gov`, and are less than 2 years old. To load this data set `query_api` to False and run the cell. 125 | 126 | _Performance Note: DomainCAT performs best with less than or equal to 400 domains(ish). Any more than that and 127 | performance and memory can become an issue and cause the notebook to crash._ 128 | 129 | ## Configuration Settings 130 | 131 | There are a set of configuration settings that for the most part you shouldn't need to change 132 | 133 | Running a notebook cell 134 | 135 | ### config.active_domains_only (default: True) 136 | If this setting is `True` DomainCAT will only analyze domains that are activly registered. Domains that were taken down or expired will be ignored. If `False`, all domains returned by Iris REST API will be analyzed. 137 | 138 | ### config.longest_common_substring (default: 6) 139 | 140 | DomainCAT has a new type of pivot called `longest_common_substring`. 
Note: I've found that anything less than 5 for this setting will create way too many connections in the domain graph, and they are not useful in an investigation. But try it if you want; it creates a pretty graph.

### config.ignore_substrings (default: empty)

This setting is an array of string values to ignore when looking for `longest_common_substring` pivots. This is useful when you use a string as part of the Iris search, like all domains ending in "-gov". Every domain will have this substring, so you want to remove it from consideration when creating substring pivots.

To turn this setting off, use: `config.ignore_substrings = []`

If you have more than one string to ignore, use the following pattern: `config.ignore_substrings = ["stuff", "things", "hotsauce"]`

### config.scale_edge_strength_by_pivot_count (default: True)

Every pivot in Iris has a pivot count. This is the number of domains globally attached to this pivot. For example, an IP address might have a pivot count of 1,000, meaning there are 1,000 domains hosted on this IP address. DomainCAT also has a notion of _local pivots_; these are pivots between domains only within the REST API search results.

When evaluating how important a pivot is between two domains, DomainCAT can evaluate the global pivot count and weigh the influence of the pivot in the graph inversely proportional to the global pivot count. This means pivots with a smaller pivot count will have stronger edges in the graph than pivots with a very large global pivot count.

If this is set to `True`, DomainCAT will use this graph edge weighting strategy. If it is set to `False`, it will weigh every edge equally.

_TODO: put link to section below about graphs, edges, and weights_

### config.count_threshold (default: sys.maxsize)

This setting is used to filter out pivots that have a global pivot count greater than `count_threshold`. So if it was set to `config.count_threshold = 1000`, then any pivot with a count greater than 1,000 would not be used to create an edge in the graph.

This setting isn't that useful when `scale_edge_strength_by_pivot_count` is turned on, as the inverse weighting will take care of this. But if `scale_edge_strength_by_pivot_count` is turned off, it can be useful for weeding out really big pivots from commodity internet infrastructure.

### config.min_pivot_size (default: 2)

This setting is used to filter out pivots that have a _local pivot size_ of less than `min_pivot_size`. For example, if this was set to 5, then any pivot that connects 4 or fewer domains returned from the REST API would be removed.

The default value of 2 keeps every pivot that connects at least 2 domains.

### config.max_domains (default: 400000000)

This setting is related to `scale_edge_strength_by_pivot_count`. It is the theoretical maximum number of domains that could ever be on a pivot, and it is used to calculate the inverse pivot count weight. The default value is the approximate number of active domains, give or take a few million: 400,000,000.
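As a rough illustration of the inverse weighting idea (this is a sketch of the concept, not DomainCAT's exact formula), an edge weight can be scaled by how rare a pivot is relative to `max_domains`:

```python
import math

MAX_DOMAINS = 400_000_000  # the documented config.max_domains default

def inverse_pivot_weight(global_pivot_count: int) -> float:
    """Rare pivots (few domains globally) score near 1.0; commodity
    pivots shared by millions of domains score near 0.0."""
    return 1.0 - (math.log10(global_pivot_count) / math.log10(MAX_DOMAINS))

print(round(inverse_pivot_weight(10), 2))          # 0.88 -> strong edge
print(round(inverse_pivot_weight(10_000_000), 2))  # 0.19 -> weak edge
```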
### config.print_debug_output (default: False)

This setting is used to write extra debug info to Jupyter's log to help with troubleshooting.

## Choose Pivots & Build the Domain Graph

By default, all pivots are turned on (with a few exceptions discussed below). This means that if any two domains returned by the REST API are connected to each other by any of the below pivots, they will have an edge created to connect them together in the graph.

To turn a pivot off, just comment out the pivot by putting a `#` in front of it. For example, to turn off the `create_date` pivot, just change the line like so:

```python
# "create_date",
```

To turn a pivot back on, just remove the `#`. After any change to the code in a cell, you will have to re-run the cell for the code change to take effect.

Below are the default pivot settings:

Choose pivots

You'll note that `ns_host` and `ns_ip` are both commented out. This is because I prefer to use `ns_domain` for name server based pivots. For example, if a domain has 4 name servers like:
- ns1.somedomain.com
- ns2.somedomain.com
- ns3.somedomain.com
- ns4.somedomain.com

`ns_host` will create a pivot for each one, and if each name server has its own IP address then there will be an additional 4 pivots created. That means if two domains share the above name servers, the edge in the graph that connects them will represent a total of 9 pivots. This does two things to the graph. First, I've found that this overemphasizes the name server connectedness in the graph. Second, domains with more name servers will have stronger edges than domains with fewer name servers, emphasizing domains with more redundant infrastructure. The same logic applies to MX record based pivots.

Once you have the pivots turned on/off the way you want, run this cell. DomainCAT will take all the domains returned by the Iris REST API, analyze their shared pivots, and build a graph that represents the connected structure of these domains.

### Brand New Pivot: longest common substring!
Some might have noticed that there is a new pivot in this list that doesn't exist in Iris called "longest_common_substring". This is a new local pivot that was added into DomainCAT which compares every domain name in the search to every other domain name, and creates a pivot between two domains if they share 6 or more consecutive characters. For example, the domains visas-esta-gov[.]us and visas-estausa-gov[.]us both share the substring "visas-esta", so they would be connected in the graph.

In fact, you can even turn off all the pivots except "longest_common_substring", which would show how connected all the domains are based solely on how they were named. This technique can be useful when your investigation centers around domain name patterns and shared infrastructure.

## Trimmed Domains

When building the graph of connected domains, there will often be a few domains that are not connected to any other domain.
These are called trimmed domains. DomainCAT will show you a count of domains that were trimmed from the graph because they were not connected.

If you want to see which domains were trimmed out, just run the next cell in the notebook and it will print out the list of trimmed domains.

Trimmed Domains

## Explore the 3D Domain Graph

Once the graph is built, it's time to visually explore it. Run the next cell in the notebook:

Run 3D Visualization

DomainCAT lets you explore the graph of connected domains with an interactive 3D view. You can mouse over any node to see what the domain name is, and click/drag the graph to view it from different directions. This 3D view is really useful for gaining an understanding of the overall aggregate connectedness of all the domains, and whether there are any clusters of domains that we might want to explore.

But if you haven't turned any pivots off, your graph might look a little something like this:

![3d domain graph](../images/3d_v1.gif)

There are so many pivots in this graph, represented by those gray lines, that they obfuscate the visualization and really make it hard to see what's going on, especially in the center of the graph. This is because there are some pivots that belong to most of the domains in the graph, resulting in a crazy spider web egg-sac-looking thing like above. The good news is that not all pivots are as useful in every investigation, so we can remove them from the graph.

For example: the search used to pull these domains together used the TLD ".us", so every domain would have this pivot connecting it to every other domain. Luckily, DomainCAT is smart enough to look for pivots like that and automatically remove them. But there are probably other pivots that are just adding noise to the graph and do not offer much value to the investigation, which we can turn off.

## Pivot Stats

DomainCAT has a Pivot Stats view, which shows different statistics for each pivot that was found in the domains.

Pivot Stats

Looking through this list, we can see that `ip_country_code` only has 7 pivots ("# of Pivots"), meaning there were only 7 different country codes found in the domains, but its "# of connections" shows that almost 48% of the domains are connected to each other with this pivot. This is a good candidate pivot to turn off in the graph, and doing so should clean up the 3D view a bit.

Let's find out. Go back up to the pivots configuration cell and comment out `ip_country_code`, then run that cell to rebuild the graph. Then scroll down to the cell that calls `build_3d_graph_layout(graph)` to redraw the 3D visualization. It should look something like this:

![3d domain graph](../images/3d_v2.gif)

Removing a single pivot really opened up the graph! As the 3D view of the graph pivots around, we can see there are four main clusters: three that seem highly connected to each other, almost in a triangle, and one fairly mixed cluster pushed farther out to the side.

## Pivot Tuning: An Iterative Process

The 3D graph view looks pretty good after just turning off country code, but sometimes turning off one pivot isn't enough.
I call this process "pivot tuning": you look at the 3D view of the graph to see if the center has opened up enough to see its internal cluster patterns. If the graph view is still too cluttered, look at the pivot statistics and see if you can find another pivot that might be a good candidate to turn off. Pivots whose "# of connections" column is higher than 15% are often good candidates. Also, pivots whose "# of pivots" column is pretty low but whose "# of domains" is close to 90% or greater can be helpful too. There are some pivots, like ASN or IP country code, that are very coarse and apply to a high percentage of the domains. If your investigation isn't centered around IP geo or network diversity, these pivots are also good candidates to turn off.

You'll get a feel for what works and what doesn't as you play with it. But keep in mind that a pivot that looks like a candidate to turn off might be critical to your investigation. For example: if the investigation is centered around domains hosted by a single ISP, turning off "ip_isp" might be a good idea, but turning off "ip_address" might remove important connectivity information from the graph.

_Note: I like to have multiple cells that call `build_3d_graph_layout(graph)`. This way, as I'm pivot tuning, I can really see how a change in the pivots affected the graph by comparing it to the previous graph. I'll often have 3 or 4 different 3D graphs showing my progress while pivot tuning._

## Explore the 2D Domain Graph

With the 3D graph cleaned up a bit, we can now dive back into the two-dimensional view to explore the details of the different domain clusters or individual domains.

Run the next cell to create the interactive 2D domain graph.

2D viz

There are several things we can do in this view of the graph. We can zoom into a cluster to look at what domains are in it. To do this, just click-drag over the section of the graph you want to zoom into. To zoom back out, click the little house icon in the upper right of the 2D graph view.

A large cluster is sometimes actually made up of 2-3 smaller clusters that are useful to explore, but that just lump together in the zoomed-out view. If common domain name patterns are a theme in your investigation, mousing over domains to view their names is a useful tactic to see which domains are grouped together.

![3d domain graph](../images/2d_zoom.gif)

We can also select a region of the graph (which is different from zooming in) by clicking either the "box select" or "lasso select" icon in the upper right of the 2D view, and then click-dragging over the region of the graph to select.

![3d domain graph](../images/2d_zoom_select.gif)

Once a set of domains is selected (and all other domains become grayed out), DomainCAT will show the list of domain names.

2D viz

If this set of domains looks like what you are looking for in your investigation, you could export this list back into your SOAR playbook to build rules around them, or maybe add them to your firewall to block outright. Or you could copy them back into Iris to do a more focused investigation on just these domains.

## Dig into Pivot Connections

Once you have a set of selected domains, you can dig deeper into what pivots were influential in grouping them together.
Run the next cell in the notebook:

2D viz

This will analyze all the pivots that are shared across the domains that you just selected, and show you a heatmap view of which pivots were most influential in connecting the selected domains, ordered from most influential to least.

2D viz

Looking at the list of selected domain names, it's not a surprise that "longest_common_substring" was the most frequent pivot. The number in each square is the total number of pivots of that type from that domain to the other selected domains. *This view can tell you what pivot categories were most influential in grouping these domains together*. This information can be really valuable when your investigation didn't include one of these pivots in the original search criteria.

If you want to look at which specific pivot values are responsible for clustering these domains together, the below view is a more detailed heatmap which shows the actual pivot value rather than its key.

2D viz

From this we can see that "esta-c" is a very common naming pattern for these domains. If this was a pattern in our source indicator(s), we might go back to Iris and add "domain contains esta-c" as an additional search criterion to bring a broader set of domains into our investigation.

There is another interesting pattern in this view. If you look at the ASN, ISP, IP address, and dynadot[.]com name server rows, you'll see that they are all used by the same subset of domains in this list. But the hawkhost[.]com name server is used by all the other domains. Based on the domain naming pattern it's reasonable to believe that all these domains were created by the same actor, but it looks like this actor uses at least two different hosting and name server infrastructures. This realization could widen out the set of domains that you can now take action on.

There's one other piece of functionality in the 2D graph view that is worth mentioning. Sometimes you just want to see what domains are connected to one specific domain. If you click any node in the graph, it will automatically select all the nodes that are connected to it, and you can then explore the pivots that bind them together.

![3d domain graph](../images/2d_click.gif)

This is useful when you have a seed domain in your investigation and you want to just dive right into it and see what other domains are connected to it. Another useful scenario (shown above) is when you see a "connector domain" that sits in between two clusters but is highly connected to both. Clicking on that domain and then inspecting the shared pivots can sometimes yield valuable information about two different infrastructure patterns used by a potential actor.

# DomainCAT Tips & Best Practices

## longest_common_substring
When investigating a set of domains that have obvious common naming patterns, it can be useful to turn off all pivots except `longest_common_substring`, which shows how connected all the domains are based solely on how they were named. I'll also often combine `longest_common_substring` with only one or two other infrastructure based pivots, like `ns_domain` or `mx_domain`. This technique can be useful when your investigation centers around domain name patterns and shared infrastructure. A pivot set for that kind of run is sketched below.
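For example, the notebook's `pivot_category_config` cell for a naming-pattern-focused run might be trimmed down to something like this (illustrative; swap in whichever infrastructure pivot fits your investigation):

```python
pivot_category_config = {
    "longest_common_substring",  # connect domains purely by shared name substrings
    "ns_domain",                 # plus one infrastructure pivot for hosting context
}
```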
--------------------------------------------------------------------------------
/domain_cat.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DomainCAT: Domain Connectivity Analysis Tool\n",
    "\n",
    "### Analyzing the domain to domain connectivity of an Iris API Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run This First: imports all the helper functions and sets stuff up\n",
    "%run domain_cat_module.py\n",
    "\n",
    "print(\"DomainCAT is ready to go\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Iris REST API Credentials"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "api_username_ui = widgets.Text(placeholder='Iris API Username', description='Username:', layout={'width': '500px'}, value=\"\")\n",
    "api_pw_ui = widgets.Password(placeholder='Iris API Password', description='Password:', layout={'width': '500px'}, value=\"\")\n",
    "widgets.VBox([api_username_ui, api_pw_ui])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query Domain Data From Iris Investigate API\n",
    "\n",
    "Enter either a list of return delimited domains into the Domains text box, _OR_ an Iris search hash into the hash text box.\n",
    "\n",
    "Note: if both a list of domains _AND_ a search hash are entered, the list of domains will be queried and the search hash will be ignored"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "domain_list_ui = widgets.Textarea(placeholder='Enter list of domains', description='Domains:', layout={'height': '300px', 'width': '700px'}) \n",
    "search_hash_ui = widgets.Text(placeholder='Enter an Iris search hash', description='Hash:', layout={'width': '700px'})\n",
    "show_iris_query_ui(domain_list_ui, search_hash_ui)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Loading Config\n",
    "query_api = True\n",
    "save_search_to_disk = False\n",
    "json_file_path = \"data/dash_gov_dot_us.json\"\n",
    "\n",
    "if query_api:\n",
    "    iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n",
    "    print(f'Iris API returned {len(iris_results)} domains')\n",
    "\n",
    "    # save search results to disk to be used later\n",
    "    if save_search_to_disk:\n",
    "        with open(json_file_path, 'w') as f:\n",
    "            json.dump(iris_results, f)\n",
    "else:\n",
    "    with open(json_file_path) as json_data:\n",
    "        iris_results = json.loads(json_data.read())\n",
    "\n",
    "    print(f'Loaded {len(iris_results)} domains from {json_file_path}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DomainCAT Configuration\n",
    "\n",
    "Please refer to the DomainCAT documentation for details about these configuration options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = Config()\n",
    "\n",
"# only analyze domains that are active (currently registered)\n", 108 | "config.active_domains_only = True\n", 109 | "\n", 110 | "# config for pivoting on matching substrings. Only matching substrings this long or longer will be used to create a pivot\n", 111 | "config.longest_common_substring = 6\n", 112 | "\n", 113 | "# List of substrings to ignore when creating pivots by matching substrings\n", 114 | "config.ignore_substrings = []\n", 115 | "\n", 116 | "# use the pivot count to scale how important the pivot is during graph layout. Smaller pivot counts is more influence, and vice versa\n", 117 | "config.scale_edge_strength_by_pivot_count = True\n", 118 | "\n", 119 | "# Global pivot count threshold. Any pivot with more than this value is discarded. sys.maxsize effectivly keeps all pivots\n", 120 | "config.global_count_threshold = sys.maxsize\n", 121 | "\n", 122 | "# The smallest pivot count size to use. Default of 2 means no pivots are filtered out because it's count is too low\n", 123 | "config.min_pivot_size = 2\n", 124 | "\n", 125 | "# theoretical max pivot size for calculating edge strengths\n", 126 | "config.max_domains = 100000000\n", 127 | "\n", 128 | "# If True DomainCAT will print out some debug info while building the connected graph of domains\n", 129 | "config.print_debug_output = False" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Choose Which Pivots To Use & Build Domain Graph\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "pivot_category_config = {\n", 146 | " \"adsense\",\n", 147 | " \"google_analytics\",\n", 148 | " \"create_date\",\n", 149 | " \"redirect_domain\",\n", 150 | " \"registrar\",\n", 151 | " \"ip_address\",\n", 152 | " \"ip_country_code\",\n", 153 | " \"ip_isp\",\n", 154 | " \"ip_asn\",\n", 155 | " \"ssl_hash\",\n", 156 | " \"ssl_subject\",\n", 157 | " \"ssl_org\",\n", 158 | " \"ssl_email\",\n", 159 | " \n", 160 | "# # Note: commented out ns_host and ns_ip because they double count ns connectedness when used with ns_domain. 
\n", 161 | " \"ns_domain\",\n", 162 | "# \"ns_host\", \n", 163 | " \"ns_ip\", \n", 164 | " \n", 165 | "# # Note: commented out mx_host and mx_ip because they double counts mx connectedness when used with mx_domain \n", 166 | " \"mx_domain\",\n", 167 | "# \"mx_host\",\n", 168 | " \"mx_ip\", \n", 169 | " \n", 170 | " \"tld\",\n", 171 | " \"longest_common_substring\",\n", 172 | "}\n", 173 | "\n", 174 | "# Build the domain pivot graph structure\n", 175 | "config.pivot_category_config = pivot_category_config\n", 176 | "graph, pivot_categories, trimmed_domains = build_domain_pivot_graph(iris_results, config)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## Trimmed Domains" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "print_trimmed_domains = True\n", 193 | "if print_trimmed_domains:\n", 194 | " if len(trimmed_domains[\"unconnected\"]) > 0:\n", 195 | " print(\"trimmed unconnected domains:\")\n", 196 | " for domain in trimmed_domains[\"unconnected\"]: print(f\" {domain}\")\n", 197 | " if len(trimmed_domains[\"create_date\"]) > 0:\n", 198 | " print(\"\\ntrimmed domains with only create date pivot:\")\n", 199 | " for domain in trimmed_domains[\"create_date\"]: print(f\" {domain}\")" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Draw the Domain Graph in an Interactive 3D Layout" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "build_3d_graph_layout(graph)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "build_3d_graph_layout(graph)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "build_3d_graph_layout(graph)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Calculate & Show Pivot Statistics" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Calculate a bunch of pivot statistics to see how well connected all the domains in the search result are\n", 250 | "calc_pivot_stats(graph, pivot_categories)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## Draw the Domain Graph in an Interactive 2D Layout" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "# calculate the pivots shared in commmon across all selected domains\n", 267 | "shared_pivots = {}\n", 268 | "def get_2d_shared_pivots(graph, selected_domains):\n", 269 | " global shared_pivots\n", 270 | " shared_pivots = get_shared_pivots(graph, selected_domains)\n", 271 | " \n", 272 | "build_2d_graph_layout(graph, get_2d_shared_pivots)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "## Heatmap of which pivots connect the most domains together: by pivot category" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "if len(shared_pivots) == 0:\n", 289 | " print(\"Select a set 
    "else:\n",
    "    create_pivot_heatmaps(shared_pivots)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Removing domains from the graph\n",
    "\n",
    "Sometimes you find disconnected domains in the 3D graph visualization that make pivoting the viz really annoying. To remove domains from the graph, enter the domain(s) you want removed in the text box below and run the second cell. This will remove the domains from the graph structure without having to requery the data.\n",
    "\n",
    "After you do this, re-run the 3D viz and the domains should be gone."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "remove_domains_ui = widgets.Textarea(placeholder='Enter domains to remove from graph', description='Domains:', layout={'height': '100px', 'width': '700px'}) \n",
    "remove_domains_ui"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run this to remove the domains in the above text box from the graph\n",
    "graph = remove_domains_from_graph(graph, remove_domains_ui)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
--------------------------------------------------------------------------------
/domain_cat_module.py:
--------------------------------------------------------------------------------
import os
import sys
import re
import json
import math
from difflib import SequenceMatcher
import plotly.graph_objects as go
import requests
import networkx as nx
import pandas as pd
import numpy as np
import scipy
import matplotlib
import matplotlib.pyplot as plt
from ipywidgets import interactive, HBox, VBox
import ipywidgets as widgets
from IPython.display import HTML, display
import tabulate
from dotenv import dotenv_values


# load REST API creds from .env file
dcat_config = dotenv_values(".env")

def show_iris_query_ui(domain_list_ui, search_hash_ui):
    lookup_ui = widgets.VBox([
        widgets.Label(value="Enter a return delimited list of domains to lookup (no commas, no quotes)"),
        domain_list_ui,
        widgets.Label(value="Or..."),
        widgets.Label(value="Enter an Iris search hash to lookup"),
        search_hash_ui,
    ])
    return lookup_ui


def clean_domain_list(domain_list_ui):
    # remove any quotes, spaces, or defanging square brackets
    full_domain_list = domain_list_ui.value.strip().replace(' ', '').replace('"', '').replace("'", "").replace('[', '').replace(']', '')
    # replace commas with new lines
    full_domain_list = full_domain_list.replace(",", "\n")
    # update the widget
    domain_list_ui.value = full_domain_list
    # split into array
    return full_domain_list.split("\n")


def get_rest_api_creds(api_username_ui, api_pw_ui):
    api_username = api_username_ui.value
    if len(api_username) == 0:
        api_username = dcat_config["IRIS_API_USERNAME"]
    api_key = api_pw_ui.value
    if len(api_key) == 0:
        api_key = dcat_config["IRIS_API_KEY"]
    return api_username, api_key


def query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui):
    api_username, api_key = get_rest_api_creds(api_username_ui, api_pw_ui)
    if len(domain_list_ui.value) > 0:
        # split the list of domains into groups of 100 because of API restrictions
        results = []
        full_domain_list = clean_domain_list(domain_list_ui)
        max_domains = 100
        start = 0
        end = max_domains
        for x in range(math.ceil(len(full_domain_list) / max_domains)):
            # slice out the max number of domains to query
            partial_domain_list = full_domain_list[start:end]
            # build the query string
            domain_list = ",".join(partial_domain_list)
            iris_query = {"api_username": api_username, "api_key": api_key, "domain": domain_list}
            # query the rest api
            print(f"...querying Iris REST API for {len(partial_domain_list)} domains")
            iris_results = _query_iris_rest_api(api_username, api_key, iris_query)
            # build up the set of returned domain objects
            results = results + iris_results["response"]["results"]
            # update slice indexes
            start = end
            end += max_domains
        return results
    elif len(search_hash_ui.value) > 0:
        iris_query = {"api_username": api_username, "api_key": api_key, "search_hash": search_hash_ui.value}
        iris_results = _query_iris_rest_api(api_username, api_key, iris_query)
        iris_results = iris_results["response"]["results"]
        return iris_results
    else:
        print("Domain List and Search Hash text boxes are empty. Please enter either a list of domains or a search hash to lookup")
        raise Exception("Domain List and Search Hash text boxes are empty")


def _query_iris_rest_api(api_username: str, api_key: str, iris_query: dict):
    root_api_url = "https://api.domaintools.com/v1/iris-investigate/"
    resp = requests.post(root_api_url, data=iris_query)
    if resp.status_code != 200:
        raise Exception(f'POST /iris-investigate/ {resp.status_code}: {resp.text}')
    iris_results = resp.json()
    return iris_results


def remove_domains_from_graph(graph, remove_domains_ui):
    domains = clean_domain_list(remove_domains_ui)
    for domain in domains:
        if graph.has_node(domain):
            graph.remove_node(domain)
    return graph


class Config(object):
    """ Little helper class to hold all the config values"""


class Domain(object):
    """ Little helper class to hold the domain name and risk score
    """
    def __init__(self, domain_json):
        self.json = domain_json
        self.name = domain_json["domain"]
        self.risk_score = domain_json["domain_risk"]['risk_score']
        self.pivot_categories = {}
        self.label = f"{self.name} ({self.risk_score})"

    def __str__(self):
        return f"name: {self.name}, risk: {self.risk_score}"

    def __repr__(self):
        return str(self)


class DomainRelationship(object):
    def __init__(self, weight: float, category: str):
        # this is the maximum weight that an edge can have.
        # Adjust this if you want to play around with stronger edge weights
        self.max_weight = 5.0
        self.weight = weight
        self.categories = [category]

    def __str__(self):
        return f"weight: {self.weight}, categories: {self.categories}"

    def __repr__(self):
        return str(self)

    def add(self, weight: float, category: str):
        """ Note: certain pivot categories can be added more than once for 2 domains;
        things like IP and name server. For example, two domains could be on the same set of 5
        IP addresses. For now the weights are just summed if there is more than one pivot of
        the same category, but maybe we need a different strategy. Since IPs have multiple pivots
        (ip address, country code, asn, isp) this means if there were 5 shared IPs between two
        domains, the weight would be: 4 * 5 * pivot_weight.
        This might over amplify the edge strength
        """
        if category not in self.categories:
            # this helps by not overly boosting the edge weight if two domains share
            # multiple IP addresses
            self.weight += weight
            if self.weight > self.max_weight:
                self.weight = self.max_weight
            self.categories.append(category)

    def get_description(self):
        return "\n".join(sorted(self.categories))


class PivotValue(object):
    def __init__(self, pivot_value, pivot_count):
        self.pivot_value = pivot_value
        self.pivot_count = pivot_count
        self.domains = set()

    def union(self, other: "PivotValue"):
        # set.union returns a new set rather than mutating, so assign the result back
        self.domains = self.domains.union(other.domains)

    def __str__(self):
        return f"pivot_value: {self.pivot_value}, " \
               f"pivot_count: {self.pivot_count}, " \
               f"domains: {self.domains}"

    def __repr__(self):
        return str(self)


def get_edge_count(n: int):
    # for a complete graph, the edge count is: n(n-1)/2
    return n * (n - 1) / 2


def build_domain_pivot_graph(iris_results: list, config: "Config"):
    """ Main workflow function that takes the results from an Iris Investigate query and
    builds the graph object of how each of the domains in the query are connected to each other"""

    # parse the Iris API result to build the pivot data structure
    graph, pivot_categories = init_local_pivot_graph(iris_results, config)

    # normalize registrar pivots (see note in function comments)
    #if "registrar" in pivot_categories and config.normalize_registrars:
    #    normalize_similar_registrars(pivot_categories["registrar"])

    # create pivots for longest common substrings
    pivot_on_matching_substrings(graph, pivot_categories, config)

    # trim pivots from graph that have less than the set count threshold or contain all domains
    trim_pivots(pivot_categories, len(graph.nodes), config)

    # trim unconnected domains and domains with only a create date pivot
    trimmed_unconnected_domains = trim_unconnected_domains(graph, pivot_categories, config)
    trimmed_create_date_domains = trim_domains_with_only_create_date_pivot(graph, pivot_categories)

    print(f"{len(trimmed_unconnected_domains)} "
          f"domains trimmed because they were not connected to other domains")
    print(f"{len(trimmed_create_date_domains)} "
          f"domains trimmed because create_date was the only pivot")
    print(f"{len(graph.nodes)} domains in pivot structure \n")

    # build the graph structure based on the domain pivots
    graph = build_domain_graph(graph, pivot_categories, config)
    return (graph,
            pivot_categories,
            {"unconnected": trimmed_unconnected_domains,
             "create_date": trimmed_create_date_domains})


def init_local_pivot_graph(iris_results: list, config: "Config"):
    """ Collect pivot categories found in result set ("ssl_hash" for example)"""
    # init empty graph
    graph = nx.Graph()
    # init pivot categories dict
    pivot_categories = {}

    for domain_json in iris_results:

        # check if domain is active or not
        if domain_json['active'] == False and config.active_domains_only:
            continue

        # create a domain object
        domain = Domain(domain_json)

        # add domain node to graph
        graph.add_node(domain.name, domain=domain)

        append_value_with_count(pivot_categories, 'adsense', domain_json, domain, config)
        append_value_with_count(pivot_categories, 'google_analytics', domain_json, domain, config)
        append_value_with_count(pivot_categories, 'create_date', domain_json, domain, config)
        append_value_with_count(pivot_categories, 'redirect_domain', domain_json, domain, config)
        append_value_with_count(pivot_categories, 'registrar', domain_json, domain,
246 | 
247 |         # haven't seen "ssl_email" in the wild yet, so not sure if it is a value/count or just a value
248 |         append_values_with_counts(pivot_categories, 'ssl_email', domain_json, domain, config)
249 | 
250 |         # IPs are composite objects, so pull out each value for each IP
251 |         for ip_json in domain_json["ip"]:
252 |             # at some point add logic to add /24 in here
253 |             append_value_with_count(pivot_categories, 'address', ip_json, domain, config, 'ip_address')
254 |             append_value_with_count(pivot_categories, 'country_code', ip_json, domain, config, 'ip_country_code')
255 |             append_value_with_count(pivot_categories, 'isp', ip_json, domain, config, 'ip_isp')
256 |             append_values_with_counts(pivot_categories, 'asn', ip_json, domain, config, 'ip_asn')
257 | 
258 |         # name servers are composite objects, so pull out each value for each name server
259 |         for ns_json in domain_json["name_server"]:
260 |             append_value_with_count(pivot_categories, 'host', ns_json, domain, config, 'ns_host')
261 |             append_value_with_count(pivot_categories, 'domain', ns_json, domain, config, 'ns_domain')
262 |             append_values_with_counts(pivot_categories, 'ip', ns_json, domain, config, 'ns_ip')
263 | 
264 |         append_value(pivot_categories, 'tld', domain_json, domain, config)
265 | 
266 |         # ssl certs are composite objects, so pull out each value for each ssl cert
267 |         for ssl_json in domain_json['ssl_info']:
268 |             append_value_with_count(pivot_categories, 'hash', ssl_json, domain, config, "ssl_hash")
269 |             append_value_with_count(pivot_categories, 'subject', ssl_json, domain, config, "ssl_subject")
270 |             append_value_with_count(pivot_categories, 'organization', ssl_json, domain, config, "ssl_org")
271 | 
272 |         # mx servers are composite objects, so pull out each value for each mx server
273 |         for mx_json in domain_json['mx']:
274 |             append_value_with_count(pivot_categories, 'host', mx_json, domain, config, "mx_host")
275 |             append_value_with_count(pivot_categories, 'domain', mx_json, domain, config, "mx_domain")
276 |             append_values_with_counts(pivot_categories, 'ip', mx_json, domain, config, "mx_ip")
277 |         # mx priority might be interesting at some point for node strength
278 |     return graph, pivot_categories
279 | 
280 | 
281 | def append_value(pivot_categories: dict,
282 |                  pivot_category: str,
283 |                  json_data: dict,
284 |                  domain: "Domain",
285 |                  config: "Config",
286 |                  new_pivot_category: str = None):
287 |     # check if pivot is in domain json
288 |     if pivot_category in json_data:
289 |         pivot_value = str(json_data[pivot_category]).strip()
290 | 
291 |         # check we have a value to add
292 |         if len(pivot_value) > 0:
293 |             _append_value_to_pivot(pivot_categories, pivot_category, pivot_value, None,
294 |                                    domain, config, new_pivot_category)
295 | 
296 | 
297 | def append_value_with_count(pivot_categories: dict,
298 |                             pivot_category: str,
299 |                             json_data: dict,
300 |                             domain: "Domain",
301 |                             config: "Config",
302 |                             new_pivot_category: str = None):
303 |     # check if pivot is in domain json
304 |     if pivot_category in json_data:
305 |         if isinstance(json_data[pivot_category], dict):
306 |             pivot_value = str(json_data[pivot_category]["value"]).strip()
307 |             global_pivot_count = json_data[pivot_category]["count"]
308 | 
309 |             # trim pivots that are above the threshold (except create_date)
310 |             if global_pivot_count < config.global_count_threshold or pivot_category == "create_date":
311 |                 # check we have a value to add
312 |                 if len(pivot_value) > 0 and global_pivot_count > 0:
313 |                     _append_value_to_pivot(pivot_categories, pivot_category, pivot_value,
314 |                                            global_pivot_count, domain, config, new_pivot_category)
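# Illustrative sketch of the thresholding above (hypothetical helper and config
# values, not used elsewhere in this module): a pivot whose global count is at
# or above config.global_count_threshold is considered too common to be an
# interesting connection and is skipped; create_date is the one exception.
def _demo_global_count_threshold():
    cfg = Config()
    cfg.global_count_threshold = 1000
    cfg.pivot_category_config = {"registrar": 1.0}
    pivots = {}
    d = Domain({"domain": "example.com", "domain_risk": {"risk_score": 10}})
    append_value_with_count(pivots, "registrar", {"registrar": {"value": "REG-A", "count": 42}}, d, cfg)
    append_value_with_count(pivots, "registrar", {"registrar": {"value": "REG-B", "count": 2000000}}, d, cfg)
    assert "REG-A" in pivots["registrar"]      # rare enough: kept
    assert "REG-B" not in pivots["registrar"]  # too common: trimmed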
315 | 
316 | 
317 | def append_values_with_counts(pivot_categories: dict,
318 |                               pivot_category: str,
319 |                               json_data: dict,
320 |                               domain: "Domain",
321 |                               config: "Config",
322 |                               new_pivot_category: str = None):
323 |     # check if pivot is in domain json
324 |     if pivot_category in json_data:
325 |         for pivot in json_data[pivot_category]:
326 |             pivot_value = str(pivot["value"]).strip()
327 |             global_pivot_count = pivot["count"]
328 | 
329 |             # check if we want to add this value
330 |             if len(pivot_value) > 0 and global_pivot_count > 0 and global_pivot_count < config.global_count_threshold:
331 |                 _append_value_to_pivot(pivot_categories, pivot_category, pivot_value,
332 |                                        global_pivot_count, domain, config, new_pivot_category)
333 | 
334 | 
335 | def _append_value_to_pivot(pivot_categories: dict,
336 |                            pivot_category: str,
337 |                            pivot_value: str,
338 |                            global_pivot_count: int,
339 |                            domain: "Domain",
340 |                            config: "Config",
341 |                            new_pivot_category: str = None):
342 |     # if we pass in a new_pivot_category, replace pivot_category with new_pivot_category
343 |     if new_pivot_category:
344 |         pivot_category = new_pivot_category
345 | 
346 |     # check if we're capturing data for this pivot category
347 |     if pivot_category not in config.pivot_category_config:
348 |         return
349 | 
350 |     # make sure we have the pivot dictionary
351 |     if pivot_category not in pivot_categories:
352 |         pivot_categories[pivot_category] = {}
353 | 
354 |     # make sure we have the pivot value set
355 |     if pivot_value not in pivot_categories[pivot_category]:
356 |         pivot_categories[pivot_category][pivot_value] = PivotValue(pivot_value, global_pivot_count)
357 | 
358 |     # add domain to the pivot domain array
359 |     pivot_categories[pivot_category][pivot_value].domains.add(domain.name)
360 | 
361 |     # add pivot category and value to the domain
362 |     if pivot_category not in domain.pivot_categories:
363 |         domain.pivot_categories[pivot_category] = []
364 |     domain.pivot_categories[pivot_category].append(pivot_value)
365 | 
366 | 
367 | def normalize_similar_registrars(registrar_pivots: dict):
368 |     """ The same registrar can often show up in WHOIS records with different string values.
369 |     For example:
370 |         NAMECHEAP
371 |         NAMECHEAP INC
372 |         NAMECHEAP. INC
373 |         NAMECHEAP, INC
374 |         NAMECHEAP, INC.
375 | 
376 |     This function splits the registrar string on any non-word character and selects the longest
377 |     word as the normalized registrar value. If any two registrars share the same normalized value,
378 |     then the domains from those two registrars will be merged. The end goal is that all the domains
379 |     from the 5 different Namecheap registrar string values shown above would be merged into one.
380 | 
381 |     Note: this isn't a very good solution. There are cases where this will create invalid connections
382 |     between domains. For example, two different registrars that share a common longest word in
383 |     their name, like "NAMECHEAP, INC" and "NOT NAMECHEAP, INC".
384 | 
385 |     It looks like this happens a lot, so the feature is turned off for now.
386 | 
387 |     TODO: this algorithm needs work. It allows things such as
388 |     good
389 |         PDR LTD. D/B/A PUBLICDOMAINREGISTRY.COM == PDR Ltd. d/b/a PublicDomainRegistry.com
390 |         GODADDY.COM, == LLC GODADDY.COM, INC
391 |         NAMECHEAP, INC == NameCheap, Inc.
392 | bad 393 | TUCOWS DOMAINS INC == WILD WEST DOMAINS, INC 394 | NETWORK SOLUTIONS, == LLC Network Solutions, LLC 395 | NETWORK SOLUTIONS, == LLC BIGROCK SOLUTIONS LTD 396 | """ 397 | return 398 | # registrars = [registrar for registrar in registrar_pivots] 399 | # for x in range(len(registrars)): 400 | # reg1 = registrars[x] 401 | # if reg1 in registrar_pivots: 402 | # # normalize registrar string 403 | # reg1_norm = sorted( 404 | # list(set(re.findall(r"[\w']+", reg1.lower()))), key=len, reverse=True)[0] 405 | # for y in range(x+1, len(registrars)): 406 | # reg2 = registrars[y] 407 | # # normalize registrar string 408 | # reg2_norm = sorted( 409 | # list(set(re.findall(r"[\w']+", reg2.lower()))), key=len, reverse=True)[0] 410 | # if reg1_norm == reg2_norm: 411 | # # pick the registrar with the most domains 412 | # if registrar_pivots[reg1].pivot_count > registrar_pivots[reg2].pivot_count: 413 | # reg_keep = reg1 414 | # reg_pop = reg2 415 | # else: 416 | # reg_keep = reg2 417 | # reg_pop = reg1 418 | # # combine domains for matching registrars 419 | # registrar_pivots[reg_keep].union(registrar_pivots[reg_pop]) 420 | # # remove reg_pop from dictionary of all registrar pivots 421 | # registrar_pivots.pop(reg_pop) 422 | # print(f"Merged registrar {reg_pop} into {reg_keep}") 423 | 424 | 425 | def pivot_on_matching_substrings(graph: "Graph", pivot_categories: dict, config: "Config"): 426 | """Create pivots between domains that share a common substring of 427 | `config.longest_common_substring` chars long. 428 | 429 | Note: SequenceMatcher has some known issues with not finding the longest match in very long 430 | strings, but does a pretty good job with shorter strings such as domain names. 431 | https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings 432 | """ 433 | domains = list(graph.nodes) 434 | for x in range(len(domains)): 435 | domain1 = graph.nodes[domains[x]]["domain"] 436 | string1 = domain1.name.split('.')[0] 437 | # pull out substrings to ignore 438 | if config.ignore_substrings and len(config.ignore_substrings) > 0: 439 | for ignore in config.ignore_substrings: 440 | string1 = string1.replace(ignore, "") 441 | for y in range(x+1, len(domains)): 442 | domain2 = graph.nodes[domains[y]]["domain"] 443 | string2 = domain2.name.split('.')[0] 444 | # pull out substrings to ignore 445 | if config.ignore_substrings and len(config.ignore_substrings) > 0: 446 | for ignore in config.ignore_substrings: 447 | string2 = string2.replace(ignore, "") 448 | # find the longest common substring between the two domains 449 | matcher = SequenceMatcher(None, string1, string2, False) 450 | match = matcher.find_longest_match(0, len(string1), 0, len(string2)) 451 | longest_match = string1[match.a: match.a + match.size] 452 | # check if the matching substring is long enough 453 | if len(longest_match) >= config.longest_common_substring: 454 | # add pivots 455 | _append_value_to_pivot( 456 | pivot_categories, 457 | "longest_common_substring", 458 | longest_match, None, 459 | domain1, config) 460 | _append_value_to_pivot( 461 | pivot_categories, 462 | "longest_common_substring", 463 | longest_match, None, 464 | domain2, config) 465 | 466 | 467 | def trim_pivots(pivot_categories: dict, domain_count: int, config: "Config"): 468 | """ Remove two types of pivots. Pivots that contain all the domains from the Iris result set, 469 | and pivots that have less than the set threshold of domains in them from this Iris result set. 
470 |     By default, pivots that only have one domain are removed, but this can be configured by
471 |     setting the min_pivot_size variable to a different value; for example, if you only wanted
472 |     to use pivots that had 10 or more domains connected to them.
473 |     """
474 |     for pivot_category_key in pivot_categories:
475 |         pivot_category = pivot_categories[pivot_category_key]
476 |         total_pivots = 0
477 |         del_count = 0
478 |         for pivot_value in list(pivot_category.keys()):
479 |             total_pivots += 1
480 |             if len(pivot_category[pivot_value].domains) < config.min_pivot_size:
481 |                 # check for pivots with less than the threshold value
482 |                 del pivot_category[pivot_value]
483 |                 del_count += 1
484 |             elif len(pivot_category[pivot_value].domains) >= domain_count:
485 |                 # check for pivots with all domains in them
486 |                 del pivot_category[pivot_value]
487 |                 if config.print_debug_output:
488 |                     print(f"deleted {pivot_category_key}:{pivot_value}. Contained all domains")
489 |         if config.print_debug_output:
490 |             print(f"deleted {del_count} "
491 |                   f"singleton pivots out of {total_pivots} "
492 |                   f"pivots from {pivot_category_key}")
493 | 
494 | 
495 | def trim_unconnected_domains(graph: "Graph", pivot_categories: dict, config: "Config"):
496 |     """ Remove any domains that have no shared connection to any other domain
497 |     """
498 |     if config.print_debug_output: print(f"{len(graph.nodes)} domains in Iris result set")
499 |     connected_domains = set()
500 |     for pivot_category_key in pivot_categories:
501 |         pivot_category = pivot_categories[pivot_category_key]
502 |         for pivot_value in list(pivot_category.keys()):
503 |             pivot_domains = pivot_category[pivot_value].domains
504 |             connected_domains = connected_domains.union(pivot_domains)
505 | 
506 |     # get the set of domains that are not connected
507 |     domains = set(graph.nodes)
508 |     lonely_domains = domains.difference(connected_domains)
509 | 
510 |     # remove unconnected domains
511 |     for domain in lonely_domains:
512 |         graph.remove_node(domain)
513 | 
514 |     if config.print_debug_output:
515 |         print(f"{len(connected_domains)} domains are interconnected")
516 |         print(f"{len(lonely_domains)} domains are unconnected")
517 |         print("Unconnected domains removed from graph:")
518 |         for domain in lonely_domains:
519 |             print(f"    {domain}")
520 | 
521 |     return lonely_domains
522 | 
523 | 
524 | def trim_domains_with_only_create_date_pivot(graph: "Graph", pivot_categories: dict):
525 |     """ If a domain ONLY has a create_date pivot, then that isn't a very good indicator of
526 |     connectedness."""
527 |     # identify domains to trim (iterate over a copy of the node list, since nodes are removed inside the loop)
528 |     trimmed_domains = []
529 |     for domain_name in list(graph.nodes):
530 |         domain = graph.nodes[domain_name]["domain"]
531 |         if len(domain.pivot_categories) == 1 and "create_date" in domain.pivot_categories:
532 |             trimmed_domains.append(domain)
533 |             # remove domain from graph and remove it from the main pivot_categories data structure
534 |             graph.remove_node(domain_name)
535 | 
536 |             domain_create_date = domain.pivot_categories["create_date"][0]
537 |             pivot_categories["create_date"][domain_create_date].domains.remove(domain_name)
538 |             if len(pivot_categories["create_date"][domain_create_date].domains) == 0:
539 |                 pivot_categories["create_date"].pop(domain_create_date)
540 |             if len(pivot_categories["create_date"]) == 0:
541 |                 pivot_categories.pop("create_date")
542 | 
543 |     return trimmed_domains
544 | 
545 | 
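# Worked example of the inverse-log edge weight computed below (hypothetical
# numbers, assuming config.max_domains = 500):
#   a rare pivot shared globally by ~10 domains gets
#       1 - log(1 + 10) / log(1 + 500)  ~= 0.61
#   a very common pivot with a global count of 400 gets
#       1 - log(1 + 400) / log(1 + 500) ~= 0.04
# so edges created by rare pivots pull their domains together much more strongly.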
546 | def get_pivot_connection_weight(pivot_category: str,
547 |                                 global_pivot_count: int,
548 |                                 local_pivot_count: int,
549 |                                 config: "Config"):
550 |     """ If we aren't using the pivot count to set the edge weight, just return a constant value of
551 |     1 for every pivot. If we do want to use the pivot count, use the function:
552 |         1 - (log(pivot count) / (log(max possible pivot count)))
553 |     This creates an inverse log ratio where small pivots have a high edge weight,
554 |     and very large pivots have a low edge weight.
555 | 
556 |     Note: also experimenting with raising this log ratio to different exponents to get greater
557 |     separation between large and small pivots: math.pow(1.0 + inverse_log_ratio, 3) - 1
558 |     """
559 |     if pivot_category not in config.pivot_category_config:
560 |         raise Exception(f"Unexpected Pivot Category: {pivot_category}")
561 | 
562 |     # scale the edge strength based on the ratio of the global pivot count vs the max domains
563 |     if config.scale_edge_strength_by_pivot_count:
564 |         if global_pivot_count is None:
565 |             # Some pivots don't have a count. For example, tld or longest common substring.
566 |             # If the global pivot count is None, return a fixed weight of 0.5 for now (?)
567 |             # But we probably need to then normalize this weight against the max weight calculated.
568 |             # Also, TLD doesn't have a pivot count because it's often huge. Is that the same
569 |             # importance as common substrings? Probably not.
570 |             return 0.5
571 |         inv_ratio = 1.0 - math.log(1.0 + global_pivot_count) / math.log(1.0 + config.max_domains)
572 |         return inv_ratio
573 |         # return math.pow(1.0 + inverse_log_ratio, 3) - 1
574 |     return 1
575 | 
576 | 
577 | def build_domain_graph(graph: "Graph", pivot_categories: dict, config: "Config"):
578 |     # The graph is initialized with all its nodes. Now we need to connect the nodes
579 |     # with each local pivot in the pivot_categories dict
580 |     edge_count = 0
581 |     for category in pivot_categories:
582 |         for pivot_value in pivot_categories[category]:
583 |             pivot = pivot_categories[category][pivot_value]
584 |             pivot_domains = list(pivot.domains)
585 | 
586 |             # for each pair of domains in the pivot, get the edge weight and create the edge
587 |             weight = get_pivot_connection_weight(category, pivot.pivot_count, len(pivot_domains), config)
588 |             if weight > 0:
589 |                 for x in range(len(pivot_domains)):
590 |                     for y in range(x+1, len(pivot_domains)):
591 |                         d1 = pivot_domains[x]
592 |                         d2 = pivot_domains[y]
593 |                         edge_count += 1
594 |                         if graph.has_edge(d1, d2):
595 |                             graph[d1][d2]['relationship'].add(weight, category)
596 |                         else:
597 |                             graph.add_edge(d1, d2, relationship=DomainRelationship(weight, category))
598 | 
599 |     # now that all edges are added, set the weight attribute with the adjusted weight
600 |     for edge in graph.edges:
601 |         graph[edge[0]][edge[1]]['weight'] = graph[edge[0]][edge[1]]['relationship'].weight
602 | 
603 |     print(f"Total Graph Connections: {edge_count}")
604 |     print(f"Distinct Graph Connections: {len(graph.edges)}")
605 |     return graph
606 | 
607 | 
608 | def calc_pivot_stats(graph: "Graph", pivot_categories: dict):
609 |     from IPython.display import HTML, display
610 |     import tabulate
611 | 
612 |     # calc the max number of edges possible for this set of domains
613 |     max_edge_count = get_edge_count(len(graph.nodes))
614 | 
615 |     # collect counts for each pivot category
616 |     category_domain_counts = {}
617 |     category_edge_counts = {}
618 |     for category_key in pivot_categories:
619 |         category_domain_counts[category_key] = 0
620 |         category_edge_counts[category_key] = 0
621 |         category = pivot_categories[category_key]
622 |         for pivot_value in category:
623 |             category_domain_counts[category_key] += len(category[pivot_value].domains)
624 | 
625 |             # if 
all domains share a pivot value, it would be considered a "connected graph" 626 | # so get the edge count for a connected graph 627 | edge_count = get_edge_count(len(category[pivot_value].domains)) 628 | category_edge_counts[category_key] += round(edge_count) 629 | 630 | total_connections = 0 631 | 632 | headers = ["Pivot Category", 633 | "# of Domains", 634 | "# of Pivots", 635 | "avg domains per pivot", 636 | "# of connections"] 637 | table = [] 638 | total_domains = len(graph.nodes) 639 | for category_key in category_domain_counts: 640 | cat_pivot_count = len(pivot_categories[category_key]) 641 | if cat_pivot_count > 0: 642 | domain_count = category_domain_counts[category_key] 643 | edge_count = category_edge_counts[category_key] 644 | 645 | total_connections += edge_count 646 | 647 | avg_domains = domain_count / cat_pivot_count 648 | percent_of_total_domains = round(100 * (domain_count / total_domains), 2) 649 | percent_of_total_edges = round(100 * (edge_count / max_edge_count), 2) 650 | table.append([category_key, 651 | f"{domain_count} ({percent_of_total_domains}%)", 652 | cat_pivot_count, 653 | round(avg_domains, 2), 654 | f"{edge_count} ({percent_of_total_edges}%)"]) 655 | 656 | print(f"{len(graph.nodes)} Domains in Pivot Structure") 657 | display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html'))) 658 | 659 | 660 | def calc_viz_layout(layout: str, graph: "Graph", dimension: int): 661 | # KK layout only 662 | if layout == "kk": 663 | return nx.layout.kamada_kawai_layout(graph, dim=dimension) 664 | 665 | # spring layout only 666 | if layout == "fr": 667 | return nx.layout.spring_layout(graph, dim=dimension) 668 | 669 | # kk layout as initialization for spring layout 670 | if layout == "kk_to_fr": 671 | pos = nx.layout.kamada_kawai_layout(graph, dim=dimension, weight=None) 672 | return nx.layout.spring_layout(graph, pos=pos, dim=dimension) 673 | 674 | # spring layout as initialization for kk layout 675 | if layout == "fr_to_kk": 676 | pos = nx.layout.spring_layout(graph, dim=dimension) 677 | return nx.layout.kamada_kawai_layout(graph, pos=pos, dim=dimension) 678 | raise Exception("invalid layout choice") 679 | 680 | 681 | def build_3d_graph_layout(graph: "Graph"): 682 | """ Build the graph layout based on the specified algorithm and get the node positions 683 | in xyz dimensions""" 684 | pos = calc_viz_layout("kk_to_fr", graph, 3) 685 | 686 | node_labels, node_risk_scores, Xn, Yn, Zn = [], [], [], [], [] 687 | for name in graph.nodes: 688 | # build x,y,z coordinates data structure for nodes 689 | Xn.append(pos[name][0]) 690 | Yn.append(pos[name][1]) 691 | Zn.append(pos[name][2]) 692 | 693 | # get domain colors by risk score 694 | domain = graph.nodes[name]["domain"] 695 | node_labels.append(domain.label) 696 | node_risk_scores.append(domain.risk_score) 697 | 698 | # build x,y,z coordinates data structure for edges 699 | Xe, Ye, Ze = [], [], [] 700 | for e in graph.edges: 701 | u = pos[e[0]] 702 | v = pos[e[1]] 703 | Xe+=[u[0], v[0], None] 704 | Ye+=[u[1], v[1], None] 705 | Ze+=[u[2], v[2], None] 706 | 707 | # Create the 3d Plotly graph and render it 708 | # build line objects for our edges 709 | trace1=go.Scatter3d(x=Xe, y=Ye, z=Ze, 710 | mode='lines', 711 | name='edges', 712 | line=dict(color='rgb(125,125,125)', width=0.5), 713 | opacity=0.9, 714 | hoverinfo='none') 715 | 716 | trace2=go.Scatter3d( 717 | x=Xn, y=Yn, z=Zn, 718 | mode='markers', 719 | name='domains', 720 | marker=dict( 721 | symbol='circle', 722 | size=6, 723 | showscale=True, 724 | 
color=node_risk_scores, 725 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']], 726 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color 727 | cmin=0, cmax=100, 728 | reversescale=True, 729 | line=dict(color='rgb(50,50,50)', width=0.5), 730 | colorbar=dict( 731 | thickness=15, 732 | title='Risk Score', 733 | xanchor='left', 734 | titleside='right' 735 | ), 736 | ), 737 | text=node_labels, 738 | hoverinfo='text') 739 | 740 | # background definition, but everything is turned off 741 | axis=dict(showbackground=False, 742 | showline=False, 743 | zeroline=False, 744 | showgrid=False, 745 | showticklabels=False, 746 | title='') 747 | 748 | layout = go.Layout( 749 | title=f"Graph of interconnected domains ({len(node_labels)} domains)", 750 | width=1000, height=1000, 751 | showlegend=False, 752 | scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)), 753 | margin=dict(t=100), hovermode='closest') 754 | 755 | data=[trace1, trace2] 756 | fig=go.Figure(data=data, layout=layout) 757 | return fig 758 | 759 | 760 | def build_2d_graph_layout(graph: "Graph", get_2d_shared_pivots: "function"): 761 | """ build the graph layout based on the specified algorithm and get the node positions 762 | in xy dimensions""" 763 | pos = calc_viz_layout("kk_to_fr", graph, 2) 764 | # pos = calc_viz_layout("fr_to_kk", g, 2) 765 | 766 | # build edge data 767 | edge_x, edge_y = [], [] 768 | for e in graph.edges(): 769 | x0, y0 = pos[e[0]] 770 | x1, y1 = pos[e[1]] 771 | edge_x.append(x0) 772 | edge_x.append(x1) 773 | edge_x.append(None) 774 | edge_y.append(y0) 775 | edge_y.append(y1) 776 | edge_y.append(None) 777 | 778 | # create edge scatter plot 779 | edge_trace = go.Scatter( 780 | x=edge_x, y=edge_y, 781 | line=dict(width=0.5, color='#888'), 782 | hoverinfo='none', 783 | mode='lines', 784 | opacity=0.6 785 | ) 786 | 787 | # build node data 788 | node_adjacencies, node_risk_scores, node_text, node_x, node_y = [], [], [], [], [] 789 | names = list(graph.nodes) 790 | for name in names: 791 | domain = graph.nodes[name]["domain"] 792 | x, y = pos[name] 793 | node_x.append(x) 794 | node_y.append(y) 795 | # get the domain's connected nodes 796 | neighbors = list(graph.neighbors(name)) 797 | node_adjacencies.append(neighbors) 798 | # get the node text 799 | node_text.append(f'{name}: risk {domain.risk_score}, connections {len(neighbors)}') 800 | # get the domain risk score 801 | node_risk_scores.append(domain.risk_score) 802 | 803 | # build node scatter plot 804 | node_trace = go.Scatter( 805 | x=node_x, y=node_y, 806 | mode='markers', 807 | hoverinfo='text', 808 | text=node_text, 809 | customdata=node_adjacencies, 810 | marker=dict( 811 | showscale=True, 812 | reversescale=True, 813 | color=node_risk_scores, 814 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']], 815 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color 816 | cmin=0, cmax=100, 817 | size=10, 818 | colorbar=dict( 819 | thickness=15, 820 | title='Risk Score', 821 | xanchor='left', 822 | titleside='right' 823 | ), 824 | line_width=2)) 825 | 826 | # create the jup widget holder for plotly 827 | fig = go.FigureWidget( 828 | [edge_trace, node_trace], 829 | layout=go.Layout( 830 | title=f'Graph of interconnected domains ({len(node_text)} domains)', 831 | titlefont_size=16, 832 | showlegend=False, 833 | hovermode='closest', 834 | margin=dict(b=5,l=5,r=5,t=30), 835 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), 836 | 
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)) 837 | ) 838 | 839 | # handle selection of domains 840 | def node_selection_fn(trace, points, selector): 841 | selected_domains = [names[idx] for idx in points.point_inds] 842 | update_selected_domains(selected_domains) 843 | 844 | # handle node click events 845 | def node_click_fn(trace, points, selector): 846 | if len(points.point_inds) > 1: 847 | print(f"node_click passed in more than 1 point: {points.point_inds}") 848 | 849 | # clear the old selected points 850 | trace.selectedpoints = [] 851 | if len(points.point_inds) == 0: 852 | return 853 | 854 | # get the list of selected domain names 855 | selected_domains = [names[idx] for idx in points.point_inds] 856 | for id in points.point_inds: 857 | selected_domains = selected_domains + trace.customdata[id] 858 | 859 | # set the new selected points 860 | # don't like having to loop in a loop to get the domain index, but I don't know a better way 861 | trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[id]] 862 | 863 | update_selected_domains(selected_domains) 864 | 865 | def update_selected_domains(selected_domains): 866 | if len(selected_domains) == 0: 867 | return 868 | 869 | # sort domains by length, then alpha 870 | selected_domains.sort(key=len, reverse=True) 871 | with out: 872 | # write selected domains to the output widget 873 | print(f"Selected Domains: ({len(selected_domains)})\n") 874 | for selected_domain in selected_domains: 875 | print(selected_domain) 876 | out.clear_output(wait=True) 877 | 878 | # calc pivots selected domains have in common 879 | get_2d_shared_pivots(graph, selected_domains) 880 | 881 | # event handler for node selection 882 | fig.data[1].on_selection(node_selection_fn) 883 | # event handle for node click 884 | fig.data[1].on_click(node_click_fn) 885 | 886 | # Create a table FigureWidget that updates the list of selected domains 887 | out = widgets.Output(layout={'border': '1px solid black'}) 888 | domain_ui = widgets.VBox((fig, out)) 889 | return domain_ui 890 | 891 | 892 | def get_shared_pivots(graph: "Graph", selected_domains: list): 893 | shared_pivots = {} 894 | for name in selected_domains: 895 | domain = graph.nodes[name]["domain"] 896 | for cat in domain.pivot_categories: 897 | for cat_value in domain.pivot_categories[cat]: 898 | key = f"{cat}: {cat_value}" 899 | if key not in shared_pivots: 900 | shared_pivots[key] = [] 901 | shared_pivots[key].append(domain) 902 | 903 | # filter by pivots that have >= n domains 904 | shared_pivots = {k: v for k, v in shared_pivots.items() if len(v) >= 3} 905 | return shared_pivots 906 | 907 | 908 | def create_pivot_heatmaps(shared_pivots: dict): 909 | print("\n Heatmap of which pivots connect the most domains together: by pivot category") 910 | pivot_cat_crosstab, pivot_value_crosstab = create_pivot_tables(shared_pivots) 911 | fig, ax = plt.subplots(figsize=(10, 10)) 912 | im = heatmap( 913 | pivot_cat_crosstab, 914 | pivot_cat_crosstab.index, 915 | pivot_cat_crosstab.columns, 916 | ax=ax, 917 | cmap="Blues") 918 | texts = annotate_heatmap(im, valfmt="{x}") 919 | fig.tight_layout() 920 | plt.show() 921 | 922 | print("\n Heatmap of which pivots connect the most domains together: by pivot value") 923 | fig, ax = plt.subplots(figsize=(10, 10)) 924 | im = heatmap( 925 | pivot_value_crosstab, 926 | pivot_value_crosstab.index, 927 | pivot_value_crosstab.columns, 928 | ax=ax, 929 | cmap="Blues") 930 | texts = annotate_heatmap(im, valfmt="{x}") 931 | fig.tight_layout() 
932 | plt.show() 933 | 934 | print("\n List of the most frequent pivot values") 935 | create_pivot_summary(pivot_value_crosstab) 936 | 937 | 938 | def create_pivot_tables(shared_pivots: dict): 939 | # Create the pandas DataFrame 940 | data = [] 941 | for pivot_value in shared_pivots: 942 | for d in shared_pivots[pivot_value]: 943 | pivot_cat = pivot_value.split(": ")[0] 944 | data.append([d.name, pivot_cat, pivot_value]) 945 | df = pd.DataFrame(data, columns = ['domain', 'pivot_cat', 'pivot']) 946 | 947 | # Build contingency table of domains to pivot 948 | pivot_cat_crosstab = pd.crosstab(df['pivot_cat'], df['domain']) 949 | pivot_value_crosstab = pd.crosstab(df['pivot'], df['domain']) 950 | 951 | # sort rows by total # of pivots 952 | pivot_cat_crosstab['sum'] = pivot_cat_crosstab[list(pivot_cat_crosstab.columns)].sum(axis=1) 953 | pivot_cat_crosstab.sort_values("sum", 0, ascending=False, inplace=True) 954 | pivot_cat_crosstab.drop("sum", 1, inplace=True) 955 | 956 | # sort rows by total # of pivots 957 | pivot_value_crosstab['sum'] = pivot_value_crosstab[list(pivot_value_crosstab.columns)].sum(axis=1) 958 | pivot_value_crosstab.sort_values("sum", 0, ascending=False, inplace=True) 959 | pivot_value_crosstab.drop("sum", 1, inplace=True) 960 | 961 | return pivot_cat_crosstab, pivot_value_crosstab 962 | 963 | 964 | def create_pivot_summary(pivot_value_crosstab: "Pandas_CrossTab"): 965 | # show just an output view of pivot name and count for selection 966 | summary = pivot_value_crosstab.copy() 967 | summary['count'] = summary[list(summary.columns)].sum(axis=1) 968 | summary.sort_values("count", 0, ascending=False, inplace=True) 969 | summary = summary[["count"]] 970 | 971 | headers = ["Pivot Category", "Pivot Values", "Count"] 972 | table = [] 973 | for index, row in summary.iterrows(): 974 | cat, pivot = index.split(": ") 975 | table.append([cat, pivot, row["count"]]) 976 | display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html'))) 977 | 978 | 979 | 980 | def heatmap(data, row_labels, col_labels, ax=None, cbar_kw={}, cbarlabel="", **kwargs): 981 | """ 982 | Create a heatmap from a numpy array and two lists of labels. 983 | 984 | Parameters 985 | ---------- 986 | data 987 | A 2D numpy array of shape (N, M). 988 | row_labels 989 | A list or array of length N with the labels for the rows. 990 | col_labels 991 | A list or array of length M with the labels for the columns. 992 | ax 993 | A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If 994 | not provided, use current axes or create a new one. Optional. 995 | cbar_kw 996 | A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional. 997 | cbarlabel 998 | The label for the colorbar. Optional. 999 | **kwargs 1000 | All other arguments are forwarded to `imshow`. 1001 | """ 1002 | 1003 | if not ax: 1004 | ax = plt.gca() 1005 | 1006 | # Plot the heatmap 1007 | im = ax.imshow(data, **kwargs) 1008 | 1009 | # We want to show all ticks... 1010 | ax.set_xticks(np.arange(data.shape[1])) 1011 | ax.set_yticks(np.arange(data.shape[0])) 1012 | # ... and label them with the respective list entries. 1013 | ax.set_xticklabels(col_labels) 1014 | ax.set_yticklabels(row_labels) 1015 | 1016 | # Let the horizontal axes labeling appear on top. 1017 | ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False) 1018 | 1019 | # Rotate the tick labels and set their alignment. 1020 | plt.setp(ax.get_xticklabels(), rotation=-30, ha="right", rotation_mode="anchor") 1021 | 1022 | # Turn spines off and create white grid. 
1023 | for edge, spine in ax.spines.items(): 1024 | spine.set_visible(False) 1025 | 1026 | ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True) 1027 | ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True) 1028 | ax.grid(which="minor", color="w", linestyle='-', linewidth=3) 1029 | ax.tick_params(which="minor", bottom=False, left=False) 1030 | 1031 | return im 1032 | 1033 | 1034 | def annotate_heatmap(im, data=None, valfmt="{x:.2f}", textcolors=["black", "white"], 1035 | threshold=None, **textkw): 1036 | """ 1037 | A function to annotate a heatmap. 1038 | 1039 | Parameters 1040 | ---------- 1041 | im 1042 | The AxesImage to be labeled. 1043 | data 1044 | Data used to annotate. If None, the image's data is used. Optional. 1045 | valfmt 1046 | The format of the annotations inside the heatmap. This should either 1047 | use the string format method, e.g. "$ {x:.2f}", or be a 1048 | `matplotlib.ticker.Formatter`. Optional. 1049 | textcolors 1050 | A list or array of two color specifications. The first is used for 1051 | values below a threshold, the second for those above. Optional. 1052 | threshold 1053 | Value in data units according to which the colors from textcolors are 1054 | applied. If None (the default) uses the middle of the colormap as 1055 | separation. Optional. 1056 | **kwargs 1057 | All other arguments are forwarded to each call to `text` used to create 1058 | the text labels. 1059 | """ 1060 | 1061 | if not isinstance(data, (list, np.ndarray)): 1062 | data = im.get_array() 1063 | 1064 | # Normalize the threshold to the images color range. 1065 | if threshold is not None: 1066 | threshold = im.norm(threshold) 1067 | else: 1068 | threshold = im.norm(data.max())/2. 1069 | 1070 | # Set default alignment to center, but allow it to be 1071 | # overwritten by textkw. 1072 | kw = dict(horizontalalignment="center", 1073 | verticalalignment="center") 1074 | kw.update(textkw) 1075 | 1076 | # Get the formatter in case a string is supplied 1077 | if isinstance(valfmt, str): 1078 | valfmt = matplotlib.ticker.StrMethodFormatter(valfmt) 1079 | 1080 | # Loop over the data and create a `Text` for each "pixel". 1081 | # Change the text's color depending on the data. 
1082 | texts = [] 1083 | for i in range(data.shape[0]): 1084 | for j in range(data.shape[1]): 1085 | kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)]) 1086 | text = im.axes.text(j, i, valfmt(data[i, j], None), **kw) 1087 | texts.append(text) 1088 | 1089 | return texts -------------------------------------------------------------------------------- /images/2d_click.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_click.gif -------------------------------------------------------------------------------- /images/2d_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_v1.png -------------------------------------------------------------------------------- /images/2d_zoom.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_zoom.gif -------------------------------------------------------------------------------- /images/2d_zoom_select.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/2d_zoom_select.gif -------------------------------------------------------------------------------- /images/3d_infra.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_infra.gif -------------------------------------------------------------------------------- /images/3d_v1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_v1.gif -------------------------------------------------------------------------------- /images/3d_v2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/3d_v2.gif -------------------------------------------------------------------------------- /images/build_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/build_graph.png -------------------------------------------------------------------------------- /images/config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/config.png -------------------------------------------------------------------------------- /images/credentials.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/credentials.png -------------------------------------------------------------------------------- /images/dash_gov.us_substrings.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/dash_gov.us_substrings.gif -------------------------------------------------------------------------------- /images/domain_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/domain_data.png -------------------------------------------------------------------------------- /images/domain_graph_2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/domain_graph_2d.png -------------------------------------------------------------------------------- /images/intro_3d.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/intro_3d.gif -------------------------------------------------------------------------------- /images/iris.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/iris.png -------------------------------------------------------------------------------- /images/iris_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/iris_small.png -------------------------------------------------------------------------------- /images/jupyter_cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/jupyter_cell.png -------------------------------------------------------------------------------- /images/pivot_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_heatmap.png -------------------------------------------------------------------------------- /images/pivot_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_stats.png -------------------------------------------------------------------------------- /images/pivot_value_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/pivot_value_heatmap.png -------------------------------------------------------------------------------- /images/reading_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/reading_data.png -------------------------------------------------------------------------------- /images/run_3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/run_3d.png 
-------------------------------------------------------------------------------- /images/run_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/run_heatmap.png -------------------------------------------------------------------------------- /images/running_a_cell.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/running_a_cell.gif -------------------------------------------------------------------------------- /images/selected_domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/selected_domains.png -------------------------------------------------------------------------------- /images/trimmed_domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DomainTools/DomainCAT/c64a2b70769f18cfde70da4342d7a07dc93ec882/images/trimmed_domains.png -------------------------------------------------------------------------------- /infrastructure_cat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# InfraCAT: Infrastructure Connectivity Analysis Tool\n", 8 | "\n", 9 | "### Analyzing the infrastructure connectivity of an Iris API Search" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": false, 17 | "jupyter": { 18 | "outputs_hidden": false 19 | }, 20 | "pycharm": { 21 | "name": "#%%\n" 22 | } 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "InfraCAT is ready to go\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "# Run This First: imports all the helper functions and sets stuff up\n", 35 | "%run infrastructure_cat_module.py\n", 36 | "\n", 37 | "print(\"InfraCAT is ready to go\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Iris REST API Credentials" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": false, 52 | "jupyter": { 53 | "outputs_hidden": false 54 | }, 55 | "pycharm": { 56 | "name": "#%%\n" 57 | } 58 | }, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "application/vnd.jupyter.widget-view+json": { 63 | "model_id": "842b0bc026174d7da491004391ec1055", 64 | "version_major": 2, 65 | "version_minor": 0 66 | }, 67 | "text/plain": [ 68 | "VBox(children=(Text(value='', description='Username:', layout=Layout(width='500px'), placeholder='Iris API Use…" 69 | ] 70 | }, 71 | "metadata": {}, 72 | "output_type": "display_data" 73 | } 74 | ], 75 | "source": [ 76 | "api_username_ui = widgets.Text(placeholder='Iris API Username', description='Username:', layout={'width': '500px'}, value=\"\")\n", 77 | "api_pw_ui = widgets.Password(placeholder='Iris API Password', description='Password:', layout={'width': '500px'}, value=\"\")\n", 78 | "widgets.VBox([api_username_ui, api_pw_ui])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Query Domain Data From Iris Investigate API\n", 86 | "\n", 87 | "Enter either a list 
of return delimited domains into the Domains text box, _OR_ an Iris search hash into the hash text box.\n", 88 | "\n", 89 | "Note: if both a list of domains _AND_ a search hash are entered, the list of domains will be queried and the search hash will be ignored" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": { 96 | "collapsed": false, 97 | "jupyter": { 98 | "outputs_hidden": false 99 | }, 100 | "pycharm": { 101 | "name": "#%%\n" 102 | } 103 | }, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "application/vnd.jupyter.widget-view+json": { 108 | "model_id": "e81fe61e74f24a0e952d36f48c13dbe9", 109 | "version_major": 2, 110 | "version_minor": 0 111 | }, 112 | "text/plain": [ 113 | "VBox(children=(Label(value='Enter a return delimited list of domains to lookup (no commas, no quotes)'), Texta…" 114 | ] 115 | }, 116 | "metadata": {}, 117 | "output_type": "display_data" 118 | } 119 | ], 120 | "source": [ 121 | "domain_list_ui = widgets.Textarea(placeholder='Enter list of domains', description='Domains:', layout={'height': '300px', 'width': '700px'})\n", 122 | "search_hash_ui = widgets.Text(placeholder='Enter search hash', description='Hash:', layout={'width': '700px'})\n", 123 | "show_iris_query_ui(domain_list_ui, search_hash_ui)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 15, 129 | "metadata": { 130 | "collapsed": false, 131 | "jupyter": { 132 | "outputs_hidden": false 133 | }, 134 | "pycharm": { 135 | "name": "#%%\n" 136 | } 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "config = Config()\n", 141 | "\n", 142 | "# exclude certain infrastructure from graph\n", 143 | "# config.exclude_list = [\"EMAIL DOMAIN\"]\n", 144 | "config.exclude_list = []\n", 145 | "\n", 146 | "# only show infrastructure that is under the pivot threshold\n", 147 | "config.pivot_threshold = 500\n", 148 | "\n", 149 | "# Minimum should be 1 which means more than one domain has to show up in an edge\n", 150 | "config.edge_threshold = 1\n", 151 | "\n", 152 | "# set whether or not to set node size to the unique number of domains in the edge\n", 153 | "config.node_size = True" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 16, 159 | "metadata": { 160 | "collapsed": false, 161 | "jupyter": { 162 | "outputs_hidden": false 163 | }, 164 | "pycharm": { 165 | "name": "#%%\n" 166 | } 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "Loaded 338 domains from data/dash_gov_dot_us.json\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "query_api = False\n", 179 | "save_search_to_disk = False\n", 180 | "json_file_path = \"data/dash_gov_dot_us.json\"\n", 181 | "\n", 182 | "if query_api:\n", 183 | "    iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n", 184 | "    print(f'Iris API returned {len(iris_results)} domains')\n", 185 | "\n", 186 | "    # save search results to disk to be used later\n", 187 | "    if save_search_to_disk:\n", 188 | "        with open(json_file_path, 'w') as f:\n", 189 | "            json.dump(iris_results, f)\n", 190 | "else:\n", 191 | "    with open(json_file_path) as json_data:\n", 192 | "        iris_results = json.loads(json_data.read())\n", 193 | "\n", 194 | "    print(f'Loaded {len(iris_results)} domains from {json_file_path}')\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 20, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 
206 | "380\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "graph, config = build_infra_graph(iris_results, config)\n", 212 | "\n", 213 | "print(len(graph.nodes))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 21, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "application/vnd.jupyter.widget-view+json": { 224 | "model_id": "d37f5c75ad8c4a42b01667bd93d6fb71", 225 | "version_major": 2, 226 | "version_minor": 0 227 | }, 228 | "text/plain": [ 229 | "VBox(children=(FigureWidget({\n", 230 | " 'data': [{'hoverinfo': 'none',\n", 231 | " 'line': {'color': '#888', 'widt…" 232 | ] 233 | }, 234 | "metadata": {}, 235 | "output_type": "display_data" 236 | } 237 | ], 238 | "source": [ 239 | "build_2d_graph_layout(graph, config)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 22, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "application/vnd.jupyter.widget-view+json": { 250 | "model_id": "5c83bc927cca4294aaba3fb6c85f728c", 251 | "version_major": 2, 252 | "version_minor": 0 253 | }, 254 | "text/plain": [ 255 | "VBox(children=(FigureWidget({\n", 256 | " 'data': [{'hoverinfo': 'none',\n", 257 | " 'line': {'color': 'rgb(125,125,…" 258 | ] 259 | }, 260 | "metadata": {}, 261 | "output_type": "display_data" 262 | } 263 | ], 264 | "source": [ 265 | "build_3d_graph_layout(graph, config)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 34, 271 | "metadata": { 272 | "collapsed": false, 273 | "jupyter": { 274 | "outputs_hidden": false 275 | }, 276 | "pycharm": { 277 | "name": "#%%\n" 278 | } 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "Loaded 195 domains from data/treatment_care.json\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "query_api = False\n", 291 | "save_search_to_disk = False\n", 292 | "json_file_path = \"data/treatment_care.json\"\n", 293 | "\n", 294 | "if query_api:\n", 295 | " iris_results = query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui)\n", 296 | " print(f'Iris API returned {len(iris_results)} domains')\n", 297 | "\n", 298 | " # save search results to disk to be used later\n", 299 | " if save_search_to_disk:\n", 300 | " with open(json_file_path, 'w') as f:\n", 301 | " json.dump(iris_results, f)\n", 302 | "else:\n", 303 | " with open(json_file_path) as json_data:\n", 304 | " iris_results = json.loads(json_data.read())\n", 305 | "\n", 306 | " print(f'Loaded {len(iris_results)} domains from {json_file_path}')" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 44, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "config = Config()\n", 316 | "\n", 317 | "# exclude certain infrastructure from graph\n", 318 | "# config.exclude_list = [\"EMAIL DOMAIN\"]\n", 319 | "config.exclude_list = []\n", 320 | "\n", 321 | "# only show infrastructure that is under the pivot threshold\n", 322 | "config.pivot_threshold = 50000\n", 323 | "\n", 324 | "# Minimum should be 1 which means more than one domain has to show up in an edge\n", 325 | "config.edge_threshold = 1\n", 326 | "\n", 327 | "# set whether or no to set node size to the unique number of domains in the edge\n", 328 | "config.node_size = True" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 45, 334 | "metadata": { 335 | "collapsed": false, 336 | "jupyter": { 337 | "outputs_hidden": false 338 | }, 339 | "pycharm": { 340 | "name": "#%%\n" 341 | } 342 
| }, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "6\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "graph, config = build_infra_graph(iris_results, config)\n", 354 | "\n", 355 | "print(len(graph.nodes))" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 46, 361 | "metadata": { 362 | "collapsed": false, 363 | "jupyter": { 364 | "outputs_hidden": false 365 | }, 366 | "pycharm": { 367 | "name": "#%%\n" 368 | } 369 | }, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "application/vnd.jupyter.widget-view+json": { 374 | "model_id": "98caf40adfc84751a057c517b69975bb", 375 | "version_major": 2, 376 | "version_minor": 0 377 | }, 378 | "text/plain": [ 379 | "VBox(children=(FigureWidget({\n", 380 | " 'data': [{'hoverinfo': 'none',\n", 381 | " 'line': {'color': '#888', 'widt…" 382 | ] 383 | }, 384 | "metadata": {}, 385 | "output_type": "display_data" 386 | } 387 | ], 388 | "source": [ 389 | "build_2d_graph_layout(graph, config)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 47, 395 | "metadata": { 396 | "scrolled": true, 397 | "tags": [] 398 | }, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "application/vnd.jupyter.widget-view+json": { 403 | "model_id": "8f6935bd23bd491c9b8446814721dfdd", 404 | "version_major": 2, 405 | "version_minor": 0 406 | }, 407 | "text/plain": [ 408 | "VBox(children=(FigureWidget({\n", 409 | " 'data': [{'hoverinfo': 'none',\n", 410 | " 'line': {'color': 'rgb(125,125,…" 411 | ] 412 | }, 413 | "metadata": {}, 414 | "output_type": "display_data" 415 | } 416 | ], 417 | "source": [ 418 | "build_3d_graph_layout(graph, config)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 48, 424 | "metadata": { 425 | "tags": [] 426 | }, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "18\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "pair_graph, pair_config = build_pair_infra_graph(iris_results, config)\n", 438 | "\n", 439 | "print(len(pair_graph.nodes))" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 49, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "data": { 449 | "application/vnd.jupyter.widget-view+json": { 450 | "model_id": "a601dde8107842a39561970e8fb5f981", 451 | "version_major": 2, 452 | "version_minor": 0 453 | }, 454 | "text/plain": [ 455 | "VBox(children=(FigureWidget({\n", 456 | " 'data': [{'hoverinfo': 'none',\n", 457 | " 'line': {'color': '#888', 'widt…" 458 | ] 459 | }, 460 | "metadata": {}, 461 | "output_type": "display_data" 462 | } 463 | ], 464 | "source": [ 465 | "build_2d_graph_layout(pair_graph, pair_config)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 50, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "application/vnd.jupyter.widget-view+json": { 476 | "model_id": "d4f902e406d8446c9b6eb441fde9f99d", 477 | "version_major": 2, 478 | "version_minor": 0 479 | }, 480 | "text/plain": [ 481 | "VBox(children=(FigureWidget({\n", 482 | " 'data': [{'hoverinfo': 'none',\n", 483 | " 'line': {'color': 'rgb(125,125,…" 484 | ] 485 | }, 486 | "metadata": {}, 487 | "output_type": "display_data" 488 | } 489 | ], 490 | "source": [ 491 | "build_3d_graph_layout(pair_graph, pair_config)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": { 498 | "collapsed": false, 499 | "jupyter": { 500 | "outputs_hidden": false 501 | }, 502 | "pycharm": { 
503 | "name": "#%%\n" 504 | } 505 | }, 506 | "outputs": [], 507 | "source": [] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": { 513 | "collapsed": false, 514 | "jupyter": { 515 | "outputs_hidden": false 516 | }, 517 | "pycharm": { 518 | "name": "#%%\n" 519 | } 520 | }, 521 | "outputs": [], 522 | "source": [] 523 | } 524 | ], 525 | "metadata": { 526 | "kernelspec": { 527 | "display_name": "Python 3 (ipykernel)", 528 | "language": "python", 529 | "name": "python3" 530 | }, 531 | "language_info": { 532 | "codemirror_mode": { 533 | "name": "ipython", 534 | "version": 3 535 | }, 536 | "file_extension": ".py", 537 | "mimetype": "text/x-python", 538 | "name": "python", 539 | "nbconvert_exporter": "python", 540 | "pygments_lexer": "ipython3", 541 | "version": "3.8.10" 542 | } 543 | }, 544 | "nbformat": 4, 545 | "nbformat_minor": 4 546 | } -------------------------------------------------------------------------------- /infrastructure_cat_module.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | import json 5 | import math 6 | from difflib import SequenceMatcher 7 | import plotly.graph_objects as go 8 | import requests 9 | import networkx as nx 10 | import pandas as pd 11 | import numpy as np 12 | import scipy 13 | import matplotlib 14 | import matplotlib.pyplot as plt 15 | from ipywidgets import interactive, HBox, VBox 16 | import ipywidgets as widgets 17 | from IPython.display import HTML, display 18 | import tabulate 19 | from dotenv import dotenv_values 20 | from domaintools import API 21 | from configparser import ConfigParser 22 | 23 | import networkx as nx 24 | import matplotlib.pyplot as plt 25 | 26 | import itertools 27 | 28 | # load REST API creds from .env file 29 | dcat_config = dotenv_values(".env") 30 | 31 | 32 | def show_iris_query_ui(domain_list_ui, search_hash_ui): 33 | lookup_ui = widgets.VBox([ 34 | widgets.Label(value="Enter a return delimited list of domains to lookup (no commas, no quotes)"), 35 | domain_list_ui, 36 | widgets.Label(value="Or..."), 37 | widgets.Label(value="Enter an Iris search hassh to lookup"), 38 | search_hash_ui, 39 | ]) 40 | return lookup_ui 41 | 42 | 43 | def clean_domain_list(domain_list_ui): 44 | # remove any quotes, spaces, or defanging square brackets 45 | full_domain_list = domain_list_ui.value.strip().replace(' ', '').replace('"', '').replace("'", "").replace('[', 46 | '').replace( 47 | ']', '') 48 | # replace commas with new lines 49 | full_domain_list = full_domain_list.replace(",", "\n") 50 | # update the widget 51 | domain_list_ui.value = full_domain_list 52 | # split into array 53 | return full_domain_list.split("\n") 54 | 55 | 56 | def get_rest_api_creds(api_username_ui, api_pw_ui): 57 | api_username = api_username_ui.value 58 | if len(api_username) == 0: 59 | api_username = dcat_config["IRIS_API_USERNAME"] 60 | api_key = api_pw_ui.value 61 | if len(api_key) == 0: 62 | api_key = dcat_config["IRIS_API_KEY"] 63 | return api_username, api_key 64 | 65 | 66 | def query_iris_rest_api(api_username_ui, api_pw_ui, domain_list_ui, search_hash_ui): 67 | api_username, api_key = get_rest_api_creds(api_username_ui, api_pw_ui) 68 | api = API(api_username, api_key) 69 | if len(domain_list_ui.value) > 0: 70 | # split list of domains into groups of 100 because of API restrictions 71 | results = [] 72 | full_domain_list = clean_domain_list(domain_list_ui) 73 | max_domains = 100 74 | start = 0 75 | end = max_domains 76 | for _ in 
range(math.ceil(len(full_domain_list) / max_domains)): 77 | # slice out the next batch of domains to query 78 | partial_domain_list = full_domain_list[start:end] 79 | # build query string 80 | domain_list = ",".join(partial_domain_list) 81 | iris_query = {"domains": domain_list} 82 | # query rest api 83 | print(f"...querying Iris REST API for {len(partial_domain_list)} domains") 84 | iris_results = api.iris_investigate(**iris_query) 85 | # build up the list of returned domain objects 86 | results += iris_results.response().get('results', []) 87 | # update slice indexes 88 | start = end 89 | end += max_domains 90 | return results 91 | elif len(search_hash_ui.value) > 0: 92 | iris_query = {"search_hash": search_hash_ui.value} 93 | iris_results = api.iris_investigate(**iris_query) 94 | # print(iris_results.status) 95 | iris_results = iris_results.response().get('results', []) 96 | return iris_results 97 | else: 98 | print( 99 | "Domain List and Search Hash text boxes are empty. Please enter either a list of domains or a search hash to look up") 100 | raise Exception("Domain List and Search Hash text boxes are empty") 101 | 102 | 103 | class Config(object): 104 | """ Little helper class to hold all the config values""" 105 | 106 | 107 | class Domain(object): 108 | """ Little helper class to hold the domain name and risk score 109 | """ 110 | 111 | def __init__(self, domain_json): 112 | self.json = domain_json 113 | self.name = domain_json["domain"] 114 | self.risk_score = domain_json["domain_risk"]['risk_score'] 115 | self.pivots = {} 116 | self.label = f"{self.name} ({self.risk_score})" 117 | 118 | def __str__(self): 119 | return f"name: {self.name}, risk: {self.risk_score}" 120 | 121 | def __repr__(self): 122 | return str(self) 123 | 124 | 125 | class DomainRelationship(object): 126 | def __init__(self, weight: float, category: str): 127 | # this is the maximum weight that an edge can have. 128 | # Adjust this if you want to play around with stronger edge weights 129 | self.max_weight = 5.0 130 | self.weight = weight 131 | self.categories = [category] 132 | 133 | def __str__(self): 134 | return f"weight: {self.weight}, categories: {self.categories}" 135 | 136 | def __repr__(self): 137 | return str(self) 138 | 139 | def add(self, weight: float, category: str): 140 | """ Note: certain pivot categories can be added more than once for 2 domains; 141 | things like IP and name server. For example, two domains could be on the same set of 5 142 | IP addresses. For now the weights are just summed if there is more than one pivot of 143 | the same category, but maybe we need a different strategy. Since IPs have multiple pivots 144 | (ip address, country code, asn, isp) this means if there were 5 shared IPs between two 145 | domains, the weight would be: 4 * 5 * pivot_weight. 146 | This might over-amplify the edge strength. 147 | """ 148 | if category not in self.categories: 149 | # this helps by not overly boosting the edge weight if two domains share 150 | # multiple IP addresses 151 | self.weight += weight 152 | self.weight = min(self.weight, self.max_weight) 153 | self.categories.append(category) 154 | 155 | def get_description(self): 156 | return "<br>
".join(sorted(self.categories)) 157 | 158 | 159 | class Pivot(object): 160 | def __init__(self, category, value, global_count): 161 | self.category = category 162 | self.value = value 163 | self.global_count = global_count 164 | self.domains = set() 165 | 166 | # def union(self, other: "Pivot"): 167 | # self.domains.union(other.domains) 168 | 169 | def label(self): 170 | # return f"category: {self.category}: value: {self.value} ({self.global_count})" 171 | return f"{self.category}: {self.value} ({self.global_count})" 172 | 173 | def __str__(self): 174 | return f"category: {self.category}, " \ 175 | f"value: {self.value}, " \ 176 | f"global_count: {self.global_count}, " \ 177 | f"domains: {self.domains}" 178 | 179 | def __repr__(self): 180 | return str(self) 181 | 182 | 183 | # build graph 184 | def get_edge_count(n: int): 185 | # for a complete graph, the edge count is: n(n-1)/2 186 | return n * (n - 1) / 2 187 | 188 | 189 | # def pivot_on_matching_substrings(graph: "Graph", domains: dict, config: "Config"): 190 | # """Create pivots between domains that share a common substring of 191 | # `config.longest_common_substring` chars long. 192 | # 193 | # Note: SequenceMatcher has some known issues with not finding the longest match in very long 194 | # strings, but does a pretty good job with shorter strings such as domain names. 195 | # https://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings 196 | # """ 197 | # domain_names = list(domains.keys()) 198 | # for x in range(len(domain_names)): 199 | # domain1 = domain_names[x] 200 | # string1 = domain1.split('.')[0] 201 | # # pull out substrings to ignore 202 | # if config.ignore_substrings and len(config.ignore_substrings) > 0: 203 | # for ignore in config.ignore_substrings: 204 | # string1 = string1.replace(ignore, "") 205 | # for y in range(x + 1, len(domain_names)): 206 | # domain2 = domain_names[y] 207 | # string2 = domain2.split('.')[0] 208 | # # pull out substrings to ignore 209 | # if config.ignore_substrings and len(config.ignore_substrings) > 0: 210 | # for ignore in config.ignore_substrings: 211 | # string2 = string2.replace(ignore, "") 212 | # # find the longest common substring between the two domains 213 | # matcher = SequenceMatcher(None, string1, string2, False) 214 | # match = matcher.find_longest_match(0, len(string1), 0, len(string2)) 215 | # longest_match = string1[match.a: match.a + match.size] 216 | # # check if the matching substring is long enough 217 | # if len(longest_match) >= config.longest_common_substring: 218 | # # add pivots 219 | # _append_value_to_pivot( 220 | # graph, 221 | # "longest_common_substring", 222 | # longest_match, None, 223 | # domains[domain1], config) 224 | # _append_value_to_pivot( 225 | # graph, 226 | # "longest_common_substring", 227 | # longest_match, None, 228 | # domains[domain2], config) 229 | 230 | 231 | def build_pivot_graph(iris_results: list, config: "Config"): 232 | """ Main workflow function that takes the results from an Iris Investigate query and 233 | builds the graph object of how each of the domains in the query are connected to each other""" 234 | 235 | # parse the Iris API Result to build the pivot data structure 236 | graph, domains = init_local_pivot_graph(iris_results, config) 237 | print(len(graph.nodes)) 238 | print() 239 | 240 | # normalize registrar pivots (see note in function comments) 241 | # if "registrar" in pivot_categories and config.normalize_registrars: 242 | # normalize_similar_registrars(pivot_categories["registrar"]) 243 | 244 | # 
create pivots for longest common substrings 245 | # pivot_on_matching_substrings(graph, domains, config) 246 | # print(len(graph.nodes)) 247 | # print() 248 | 249 | # trim pivots from graph that have less than the set count threshold or contain all domains 250 | # graph = trim_pivots(graph, len(domains), config) 251 | # print(len(graph.nodes)) 252 | # print() 253 | 254 | # trim unconnected domains and domains with only a create date pivot 255 | # TURBO: I'm not sure yet how to do this 256 | # trimmed_unconnected_domains = trim_unconnected_domains(graph, domains, config) 257 | # print(len(graph.nodes)) 258 | # print() 259 | 260 | # trimmed_create_date_domains = trim_domains_with_only_create_date_pivot(graph, pivot_categories) 261 | # print(len(graph.nodes)) 262 | # print() 263 | 264 | # print(f"{len(trimmed_unconnected_domains)} " 265 | # f"domains trimmed because they were not connected to other domains") 266 | # print(f"{len(trimmed_create_date_domains)} " 267 | # f"domains trimmed because create_date was the only pivot") 268 | print(f"{len(graph.nodes)} nodes in graph structure\n") 269 | 270 | # build the graph structure based on the domain pivots 271 | graph = build_local_pivot_graph(graph, domains, config) 272 | return (graph, domains, 273 | { 274 | # "unconnected": trimmed_unconnected_domains, 275 | # "create_date": trimmed_create_date_domains 276 | } 277 | ) 278 | 279 | 280 | def get_pivots(data_obj, name, return_data=None, count=0, pivot_threshold=500): 281 | """ 282 | Recursively walks an Iris result object and collects every value whose global count qualifies it as a pivot. 283 | Args: 284 | data_obj: The list or dict (possibly nested) to search for pivot counts 285 | name: Pivot category name, built up from the nested keys as the walk descends 286 | return_data: Accumulator holding the pivots found so far 287 | count: Recursion depth counter that tells us when the walk of data_obj is finished 288 | pivot_threshold: Upper bound on global count; values with a count above 1 and below this threshold are included as pivots. 
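Returns: For the top-level call, a list of [category_name, (value, global_count)] entries for every field whose count qualifies; recursive calls return None (or a (value, count) tuple for a qualifying nested dict) while deeper levels of data_obj are still being walked.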
289 | """ 290 | if return_data is None: 291 | return_data = [] 292 | count += 1 293 | if isinstance(data_obj, dict) and len(data_obj): 294 | temp_name = name 295 | for k, v in data_obj.items(): 296 | if isinstance(data_obj[k], (dict, list)): 297 | name = "{}_{}".format(name, k) 298 | temp_data = get_pivots( 299 | data_obj[k], name, return_data, count, pivot_threshold 300 | ) 301 | if temp_data: 302 | return_data.append([name[1:].upper().replace("_", " "), temp_data]) 303 | name = temp_name 304 | if "count" in data_obj and (1 < data_obj["count"] < pivot_threshold): 305 | return data_obj["value"], data_obj["count"] 306 | elif isinstance(data_obj, list) and len(data_obj): 307 | for index, item in enumerate(data_obj): 308 | temp_data = get_pivots(item, name, return_data, count, pivot_threshold) 309 | if temp_data: 310 | if isinstance(temp_data, list): 311 | for x in temp_data: 312 | return_data.append(x) 313 | elif isinstance(temp_data, tuple): 314 | return_data.append([name[1:].upper().replace("_", " "), temp_data]) 315 | count -= 1 316 | if count: 317 | return 318 | else: 319 | return return_data 320 | 321 | 322 | def build_infra_graph(iris_results: list, config: "Config"): 323 | graph = nx.Graph() 324 | pv_dict = {} 325 | config.domain_risk_dict = {} 326 | for domain in iris_results: 327 | if domain["domain"] not in config.domain_risk_dict: 328 | config.domain_risk_dict[domain["domain"]] = domain.get("domain_risk", {}).get("risk_score", 0) 329 | # GET PIVOTS 330 | nps = get_pivots(domain, "", pivot_threshold=config.pivot_threshold) 331 | pv_list = [] 332 | for p in nps: 333 | if p[0] not in config.exclude_list: 334 | pv_list.append("{}_{}".format(p[0], p[1][0])) 335 | # CREATE POSSIBLE NODES AND POSSIBLE EDGES 336 | x = itertools.combinations(pv_list, 2) 337 | for g in x: 338 | if "{}:::{}".format(g[0], g[1]) in pv_dict: 339 | if domain["domain"] not in pv_dict["{}:::{}".format(g[0], g[1])]: 340 | pv_dict["{}:::{}".format(g[0], g[1])].append(domain["domain"]) 341 | else: 342 | pv_dict["{}:::{}".format(g[0], g[1])] = [domain["domain"]] 343 | 344 | b_pv_list = [] 345 | my_set = set() 346 | 347 | # FILTER OUT EDGES THAT DON'T MEET THRESHOLD 348 | for k, v in pv_dict.items(): 349 | if len(v) > config.edge_threshold: 350 | a = k.split(":::") 351 | b_pv_list.append([a[0], a[1], v, len(v)]) 352 | my_set.add(a[0]) 353 | my_set.add(a[1]) 354 | # print(k, v, len(v)) 355 | 356 | # CREATE NODES 357 | for m in my_set: 358 | graph.add_node(m, color='blue', size=0) 359 | 360 | # CREATE EDGES 361 | for m in b_pv_list: 362 | graph.add_edge(m[0], m[1], domains=m[2], length=m[3]) 363 | return graph, config 364 | 365 | 366 | def build_pair_infra_graph(iris_results: list, config: "Config"): 367 | graph = nx.Graph() 368 | pv_dict = {} 369 | config.domain_risk_dict = {} 370 | for domain in iris_results: 371 | if domain["domain"] not in config.domain_risk_dict: 372 | config.domain_risk_dict[domain["domain"]] = domain.get("domain_risk", {}).get("risk_score", 0) 373 | # GET PIVOTS 374 | nps = get_pivots(domain, "", pivot_threshold=config.pivot_threshold) 375 | pv_list = [ 376 | "{}_{}".format(p[0], p[1][0]) 377 | for p in nps 378 | if p[0] not in config.exclude_list 379 | ] 380 | 381 | # CREATE POSSIBLE NODES AND POSSIBLE EDGES 382 | x = itertools.combinations(pv_list, 2) 383 | # print(x) 384 | i_list = [] 385 | for g in x: 386 | # print("{}:::{}".format(g[0], g[1])) 387 | if "{}:::{}".format(g[0], g[1]) not in i_list and g[0] != g[1]: 388 | i_list.append("{}:::{}".format(g[0], g[1])) 389 | y = 
itertools.combinations(i_list, 2) 390 | for g in y: 391 | 392 | if "{}|||{}".format(g[0], g[1]) in pv_dict: 393 | if domain["domain"] not in pv_dict["{}|||{}".format(g[0], g[1])]: 394 | pv_dict["{}|||{}".format(g[0], g[1])].append(domain["domain"]) 395 | else: 396 | pv_dict["{}|||{}".format(g[0], g[1])] = [domain["domain"]] 397 | # print(pv_dict) 398 | b_pv_list = [] 399 | my_set = set() 400 | 401 | # FILTER OUT EDGES THAT DON'T MEET THRESHOLD 402 | for k, v in pv_dict.items(): 403 | if len(v) > config.edge_threshold: 404 | a = k.split("|||") 405 | if a[0] != a[1]: 406 | b_pv_list.append([a[0], a[1], v, len(v)]) 407 | my_set.add(a[0]) 408 | my_set.add(a[1]) 409 | # print(k, v, len(v)) 410 | 411 | # CREATE NODES 412 | for m in my_set: 413 | graph.add_node(m, color='blue', size=0) 414 | 415 | # CREATE EDGES 416 | for m in b_pv_list: 417 | graph.add_edge(m[0], m[1], domains=m[2], length=m[3]) 418 | return graph, config 419 | 420 | 421 | def calc_viz_layout(layout: str, graph: "Graph", dimension: int): 422 | # KK layout only 423 | if layout == "kk": 424 | return nx.layout.kamada_kawai_layout(graph, dim=dimension) 425 | 426 | # spring layout only 427 | if layout == "fr": 428 | return nx.layout.spring_layout(graph, dim=dimension) 429 | 430 | # kk layout as initialization for spring layout 431 | if layout == "kk_to_fr": 432 | pos = nx.layout.kamada_kawai_layout(graph, dim=dimension, weight=None) 433 | return nx.layout.spring_layout(graph, pos=pos, dim=dimension) 434 | 435 | # spring layout as initialization for kk layout 436 | if layout == "fr_to_kk": 437 | pos = nx.layout.spring_layout(graph, dim=dimension) 438 | return nx.layout.kamada_kawai_layout(graph, pos=pos, dim=dimension) 439 | raise Exception("invalid layout choice") 440 | 441 | 442 | def average_risk_score(domain_list, domain_dict): 443 | total = sum(domain_dict[d] for d in domain_list) 444 | avg_risk_score = int(total / len(domain_list)) 445 | # print(avg_risk_score) 446 | if avg_risk_score >= 90: 447 | color = 'red' 448 | elif avg_risk_score >= 75: 449 | color = 'orange' 450 | elif avg_risk_score >= 55: 451 | color = 'yellow' 452 | else: 453 | color = 'green' 454 | return color, avg_risk_score 455 | 456 | 457 | def build_3d_graph_layout(graph: "Graph", config): 458 | """ Build the graph layout based on the specified algorithm and get the node positions 459 | in xyz dimensions""" 460 | 461 | pos = calc_viz_layout("kk_to_fr", graph, 3) 462 | 463 | node_labels, node_risk_scores, node_size, names, Xn, Yn, Zn = [], [], [], [], [], [], [] 464 | i = 0 465 | for node in graph.nodes(data=True): 466 | # build x,y,z coordinates data structure for nodes 467 | Xn.append(pos[node[0]][0]) 468 | Yn.append(pos[node[0]][1]) 469 | Zn.append(pos[node[0]][2]) 470 | domain_set = set() 471 | for e in graph.edges(node[0], data=True): 472 | domain_set.update(e[2]['domains']) 473 | domain_list = list(domain_set) 474 | color, avg_risk_score = average_risk_score(domain_list, config.domain_risk_dict) 475 | node_labels.append( 476 | "{}
<br>Avg Risk Score: {}<br>
Number of unique domains on edges: {}".format(node[0], avg_risk_score, 477 | len(domain_list))) 478 | node_risk_scores.append(color) 479 | node_size.append(len(domain_list)) 480 | names.append(domain_list) 481 | 482 | if not config.node_size: 483 | node_size = 6 484 | 485 | # build x,y,z coordinates data structure for edges 486 | Xe, Ye, Ze = [], [], [] 487 | for e in graph.edges: 488 | u = pos[e[0]] 489 | v = pos[e[1]] 490 | Xe += [u[0], v[0], None] 491 | Ye += [u[1], v[1], None] 492 | Ze += [u[2], v[2], None] 493 | 494 | # Create the 3d Plotly graph and render it 495 | # build line objects for our edges 496 | trace1 = go.Scatter3d(x=Xe, y=Ye, z=Ze, 497 | mode='lines', 498 | name='domains', 499 | line=dict(color='rgb(125,125,125)', width=0.5), 500 | opacity=0.9, 501 | hoverinfo='none') 502 | 503 | trace2 = go.Scatter3d( 504 | x=Xn, y=Yn, z=Zn, 505 | mode='markers', 506 | name='pivots', 507 | marker=dict( 508 | symbol='circle', 509 | size=node_size, 510 | color=node_risk_scores, 511 | line=dict(color='rgb(50,50,50)', width=0.5), 512 | ), 513 | text=node_labels, 514 | hoverinfo='text') 515 | 516 | # background definition, but everything is turned off 517 | axis = dict(showbackground=False, 518 | showline=False, 519 | zeroline=False, 520 | showgrid=False, 521 | showticklabels=False, 522 | title='') 523 | 524 | layout = go.Layout( 525 | title=f"Graph of interconnected infrastructure ({len(node_labels)} infra nodes)", 526 | width=1000, height=1000, 527 | showlegend=False, 528 | scene=dict(xaxis=dict(axis), yaxis=dict(axis), zaxis=dict(axis)), 529 | margin=dict(t=100), hovermode='closest') 530 | 531 | data = [trace1, trace2] 532 | fig = go.FigureWidget(data=data, layout=layout) 533 | 534 | # handle selection of domains 535 | # def node_selection_fn(trace, points, selector): 536 | # selected_domains = [names[idx] for idx in points.point_inds] 537 | # update_selected_domains(selected_domains) 538 | 539 | # handle node click events 540 | def node_click_fn(trace, points, selector): 541 | if len(points.point_inds) > 1: 542 | print(f"node_click passed in more than 1 point: {points.point_inds}") 543 | 544 | # clear the old selected points 545 | # trace.selectedpoints = [] 546 | # if len(points.point_inds) == 0: 547 | # return 548 | 549 | # get the list of selected domain names 550 | selected_domains = [names[idx] for idx in points.point_inds] 551 | # for id in points.point_inds: 552 | # selected_domains = selected_domains + trace.customdata[id] 553 | 554 | # set the new selected points 555 | # don't like having to loop in a loop to get the domain index, but I don't know a better way 556 | # trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[id]] 557 | 558 | update_selected_domains(selected_domains) 559 | 560 | def update_selected_domains(selected_domains): 561 | if len(selected_domains) == 0: 562 | return 563 | 564 | # sort domains by length, then alpha 565 | selected_domains.sort(key=len, reverse=True) 566 | with out: 567 | # write selected domains to the output widget 568 | print(f"Selected Infra: ({len(selected_domains)})\n") 569 | for selected_domain in selected_domains: 570 | print(selected_domain) 571 | out.clear_output(wait=True) 572 | 573 | # calc pivots selected domains have in common 574 | # get_2d_shared_pivots(graph, selected_domains) 575 | 576 | # event handler for node selection 577 | # fig.data[1].on_selection(node_selection_fn) 578 | # event handle for node click 579 | fig.data[1].on_click(node_click_fn) 580 | 581 | # Create a table FigureWidget 
that updates the list of selected domains 582 | out = widgets.Output(layout={'border': '1px solid black'}) 583 | domain_ui = widgets.VBox((fig, out)) 584 | return domain_ui 585 | 586 | 587 | def build_2d_graph_layout(graph: "Graph", config): 588 | """ build the graph layout based on the specified algorithm and get the node positions 589 | in xy dimensions""" 590 | pos = calc_viz_layout("kk_to_fr", graph, 2) 591 | # pos = calc_viz_layout("fr_to_kk", g, 2) 592 | 593 | # build edge data 594 | edge_x, edge_y = [], [] 595 | for e in graph.edges(): 596 | x0, y0 = pos[e[0]] 597 | x1, y1 = pos[e[1]] 598 | edge_x.append(x0) 599 | edge_x.append(x1) 600 | edge_x.append(None) 601 | edge_y.append(y0) 602 | edge_y.append(y1) 603 | edge_y.append(None) 604 | 605 | # create edge scatter plot 606 | edge_trace = go.Scatter( 607 | x=edge_x, y=edge_y, 608 | line=dict(width=0.5, color='#888'), 609 | hoverinfo='none', 610 | mode='lines', 611 | opacity=0.6 612 | ) 613 | 614 | # build node data 615 | node_adjacencies, node_risk_scores, node_text, node_labels, node_size, node_x, node_y = [], [], [], [], [], [], [] 616 | names = list(graph.nodes) 617 | for name in graph.nodes(data=True): 618 | domain = graph.nodes[name[0]] 619 | x, y = pos[name[0]] 620 | node_x.append(x) 621 | node_y.append(y) 622 | # get the domain's connected nodes 623 | neighbors = list(graph.neighbors(name[0])) 624 | node_adjacencies.append(neighbors) 625 | domain_set = set() 626 | for e in graph.edges(name[0], data=True): 627 | domain_set.update(e[2]['domains']) 628 | domain_list = list(domain_set) 629 | color, avg_risk_score = average_risk_score(domain_list, config.domain_risk_dict) 630 | node_labels.append( 631 | "{}
<br>Avg Risk Score: {}<br>
Number of unique domains on edges: {}".format(name[0], avg_risk_score, 632 | len(domain_list))) 633 | node_risk_scores.append(color) 634 | node_size.append(len(domain_list)) 635 | names.append(domain_list) 636 | 637 | if not config.node_size: 638 | node_size = 6 639 | 640 | # build node scatter plot 641 | node_trace = go.Scatter( 642 | x=node_x, y=node_y, 643 | mode='markers', 644 | hoverinfo='text', 645 | text=node_labels, 646 | customdata=node_adjacencies, 647 | marker=dict( 648 | showscale=True, 649 | reversescale=True, 650 | color=node_risk_scores, 651 | colorscale=[[0.0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1.0, 'green']], 652 | # cmin/cmax needed so plotly doesn't normalize the scores to calculate the color 653 | cmin=0, cmax=100, 654 | size=node_size, 655 | colorbar=dict( 656 | thickness=15, 657 | title='Risk Score', 658 | xanchor='left', 659 | titleside='right' 660 | ), 661 | line_width=2)) 662 | 663 | # create the jup widget holder for plotly 664 | fig = go.FigureWidget( 665 | [edge_trace, node_trace], 666 | layout=go.Layout( 667 | title=f'Graph of interconnected infrastructure ({len(node_labels)} infra nodes)', 668 | titlefont_size=16, 669 | showlegend=False, 670 | hovermode='closest', 671 | margin=dict(b=5, l=5, r=5, t=30), 672 | xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), 673 | yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)) 674 | ) 675 | 676 | # handle selection of domains 677 | def node_selection_fn(trace, points, selector): 678 | selected_domains = [names[idx] for idx in points.point_inds] 679 | update_selected_domains(selected_domains) 680 | 681 | # handle node click events 682 | def node_click_fn(trace, points, selector): 683 | if len(points.point_inds) > 1: 684 | print(f"node_click passed in more than 1 point: {points.point_inds}") 685 | 686 | # clear the old selected points 687 | trace.selectedpoints = [] 688 | if len(points.point_inds) == 0: 689 | return 690 | 691 | # get the list of selected domain names 692 | selected_domains = [names[idx] for idx in points.point_inds] 693 | for id in points.point_inds: 694 | selected_domains = selected_domains + trace.customdata[id] 695 | 696 | # set the new selected points 697 | # don't like having to loop in a loop to get the domain index, but I don't know a better way 698 | trace.selectedpoints = points.point_inds + [names.index(name) for name in trace.customdata[id]] 699 | 700 | update_selected_domains(selected_domains) 701 | 702 | def update_selected_domains(selected_domains): 703 | if len(selected_domains): 704 | return 705 | 706 | # sort domains by length, then alpha 707 | selected_domains.sort(key=len, reverse=True) 708 | with out: 709 | # write selected domains to the output widget 710 | print(f"Selected Infra: ({len(selected_domains)})\n") 711 | for selected_domain in selected_domains: 712 | print(selected_domain) 713 | out.clear_output(wait=True) 714 | 715 | 716 | # event handler for node selection 717 | fig.data[1].on_selection(node_selection_fn) 718 | # event handle for node click 719 | fig.data[1].on_click(node_click_fn) 720 | 721 | # Create a table FigureWidget that updates the list of selected domains 722 | out = widgets.Output(layout={'border': '1px solid black'}) 723 | domain_ui = widgets.VBox((fig, out)) 724 | return domain_ui 725 | 726 | 727 | def get_shared_pivots(graph: "Graph", selected_domains: list): 728 | shared_pivots = {} 729 | for name in selected_domains: 730 | domain = graph.nodes[name]["domain"] 731 | for cat in domain.pivot_categories: 732 | for cat_value 
in domain.pivot_categories[cat]: 733 | key = f"{cat}: {cat_value}" 734 | if key not in shared_pivots: 735 | shared_pivots[key] = [] 736 | shared_pivots[key].append(domain) 737 | 738 | # keep only pivots shared by at least 3 domains 739 | shared_pivots = {k: v for k, v in shared_pivots.items() if len(v) >= 3} 740 | return shared_pivots -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | ipywidgets>=7.5 3 | networkx 4 | plotly==4.14.3 5 | tabulate 6 | numpy 7 | scipy 8 | matplotlib 9 | pandas 10 | python-dotenv 11 | domaintools-api --------------------------------------------------------------------------------