├── README.md
├── kupa3.py
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
# Tracking the trackers. Draw connections between scripts and domains on a website.

## Description
Kupa3 allows you to draw connections between the scripts on a specific website. It searches the HTML for JavaScript code and `src` attributes, then crawls them in order to build a dependency graph. This approach can help bug hunters discover subdomains and examine JavaScript calls, and OSINT researchers check which companies are connected to each other or track advertising networks. At the end, the graph is saved in GEXF format for exploring in Gephi.

Background: https://medium.com/@woj_ciech/tracking-the-trackers-draw-connections-between-scripts-and-domains-on-website-360bc6a306df

## Requirements
- Python 3
- BeautifulSoup
- NetworkX
- Matplotlib

```
pip3 install -r requirements.txt
```

## Usage
```
root@kali:~# python kupa3.py -h

          (   ,&&&.
           )  .,.&&
          (   (  \=__/
           )    ,'-'.
          (    (   ,,      _.__|/ /|
           )   /\ -((------((_|___/ |
          (   // | (`'      ((  `'--|
           _ -.;_/ \--._      \ \-._/.
          (_;-// | \ \-'.\    <_,\_\`--'|
          ( `.__ _  ___,')      <_,-'__,'
     jrei  `'(_ )_)(_)_)'  asciiart.eu

Tracking the trackers. Draw connections between scripts and domains on a website.
medium.com/@woj_ciech github.com/woj-ciech
example: python3 kupa3.py https://nsa.gov

usage: kupa3.py [-h] [--url URL]

optional arguments:
  -h, --help  show this help message and exit
  --url URL   URL of website (default: https://nsa.gov)
```

## Output
![Reddit.com](https://i.imgur.com/WcQKMKa.png "Graph for reddit.com")
--------------------------------------------------------------------------------
/kupa3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib.request
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
import networkx as nx
import matplotlib.pyplot as plt
import argparse
import sys

desc = """          (   ,&&&.
           )  .,.&&
          (   (  \=__/
           )    ,'-'.
          (    (   ,,      _.__|/ /|
           )   /\ -((------((_|___/ |
          (   // | (`'      ((  `'--|
           _ -.;_/ \\--._      \\ \-._/.
          (_;-// | \ \-'.\    <_,\_\`--'|
          ( `.__ _  ___,')      <_,-'__,'
     jrei  `'(_ )_)(_)_)'  asciiart.eu\n
Tracking the trackers. Draw connections between scripts and domains on a website.
medium.com/@woj_ciech github.com/woj-ciech
example: python3 kupa3.py https://nsa.gov\n"""

print(desc)

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter  # show the default value in --help output
)

parser.add_argument("--url", help="URL of website", default="https://nsa.gov")
G = nx.Graph()

args = parser.parse_args()

url = args.url
parsed_arg = urlparse(url)  # parsed_arg[1] is the host, used for the output filename below

# Two URL-matching regexes (per the original comment, a single one misses some
# links). Both follow John Gruber's liberal URL pattern: the first alternative
# matches explicit http(s) URLs, the second bare domains with a known TLD.
# NOTE: parts of both patterns were lost in this dump; the missing pieces are
# restored here from Gruber's published pattern.
regex1 = r"""(?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\b\/?(?!@)))"""
regex2 = r"""(?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\b\/?(?!@)))"""
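# Illustrative, hypothetical example of what the patterns catch: for an inline
# snippet such as
#
#     loadScript("https://cdn.tracker.com/pixel.js")
#
# re.findall(regex1, snippet) returns ['https://cdn.tracker.com/pixel.js'],
# which is how URLs are pulled out of inline JavaScript further down.
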
# Find <script> tags in the HTML code
def getscripts(url):
    list_of_scripts = []

    # get website
    try:
        oururl = urllib.request.urlopen(url, timeout=10).read()
    except Exception as e:
        print(e)
        sys.exit()
    soup = BeautifulSoup(oururl, 'html.parser')

    # collect every <script> tag
    # (the tail of this function and the whole getlinks() below were lost in
    # this dump; both are minimal reconstructions of the described behaviour)
    for script in soup.find_all('script'):
        list_of_scripts.append(script)
    return list_of_scripts


# Download a script and connect it to every URL/domain mentioned inside it
def getlinks(script):
    try:
        content = urllib.request.urlopen(script, timeout=10).read().decode('utf-8', errors='ignore')
    except Exception as e:
        print(e)
        return
    for link in re.findall(regex1, content) + re.findall(regex2, content):
        domain = urlparse(link)[1] or link  # keep the host part when there is one
        G.add_edge(script, domain)


# Extract the 'src' attribute from each <script> tag, or pull URLs straight
# out of inline JavaScript
def extractscripts(list_of_scripts):
    urls = []
    for i in list_of_scripts:
        src = False
        try:
            if i.attrs['src']:
                k = i.attrs['src']
                # A link starting with '//' refers to an external resource,
                # for example src=//www.subdomain.domain.com/a.js
                if i.attrs['src'].startswith("//"):
                    k = "https:" + i.attrs['src']
                # One starting with a single '/' refers to a local resource
                # like /application/static/tracking.js
                elif i.attrs['src'].startswith("/"):
                    k = url + i.attrs['src']

                urls.append(k)
                src = True
        except KeyError:  # tag has no 'src' attribute
            src = False

        if not src:  # inline script: find all links in the JavaScript code
            links = re.findall(regex1, str(i))
            for j in links:
                if '.js' in j:
                    # strip the quotes, backslashes, spaces and parens that
                    # often surround URLs embedded in JS
                    clean_link = re.split(r"""\"|\\|'| |\(""", j)
                    urls.append(clean_link[0])
    return urls

# Start
scripts = getscripts(url)  # get all <script> tags
extracted_scripts = extractscripts(scripts)  # extract 'src' or URLs found directly inside the scripts

print("-------------------- " + url + " -----------------------")
for script in extracted_scripts:  # prepend a scheme when one is missing
    if not script.startswith(('http://', 'https://')):
        script = 'https://' + script

    print(script)
    G.add_edge(url, script)
    getlinks(script)

nx.draw(G, with_labels=True)
plt.savefig("simple_path.png")  # save as png
nx.write_gexf(G, parsed_arg[1] + ".gexf")  # parsed_arg[1] is the host, e.g. nsa.gov
print("Saved as " + parsed_arg[1] + ".gexf")
plt.show()  # display
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.7.1
matplotlib==2.0.0
networkx==2.2
requests==2.20.0
urllib3==1.22
--------------------------------------------------------------------------------
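The README suggests exploring the saved graph in Gephi; for a quick look without Gephi, the same GEXF file can be read back with NetworkX. A minimal sketch, assuming a prior run against the default https://nsa.gov wrote `nsa.gov.gexf`:

```python
import networkx as nx

# Load the graph kupa3 saved; the filename is the target host plus ".gexf".
G = nx.read_gexf("nsa.gov.gexf")

# The highest-degree nodes are the scripts and domains with the most
# connections on the page, typically the shared trackers and CDNs.
for node, degree in sorted(G.degree, key=lambda nd: nd[1], reverse=True)[:10]:
    print(degree, node)
```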