├── README.md
├── kupa3.py
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
# Tracking the trackers. Draw connections between scripts and domains on a website.

## Description
Kupa3 allows you to draw connections between the scripts on a specific website. It searches the HTML for JavaScript code and `src` attributes, then crawls them in order to build a dependency graph. This approach can help bug hunters discover subdomains and examine JavaScript calls, and OSINT researchers check which companies are connected to each other or track advertising networks. At the end, the graph is saved in GEXF format for exploring in Gephi.

Background: https://medium.com/@woj_ciech/tracking-the-trackers-draw-connections-between-scripts-and-domains-on-website-360bc6a306df

## Requirements
- Python 3
- BeautifulSoup
- NetworkX
- Matplotlib

```
pip3 install -r requirements.txt
```

## Usage
```
root@kali:~# python kupa3.py -h

          (   ,&&&.
           )  .,.&&
          (   (  \=__/
           )    ,'-'.
          (    (   ,,      _.__|/ /|
           )   /\ -((------((_|___/ |
          (   // | (`'      ((  `'--|
           _ -.;_/ \--._      \ \-._/.
          (_;-// | \ \-'.\    <_,\_\`--'|
          ( `.__ _  ___,')      <_,-'__,'
     jrei  `'(_ )_)(_)_)'  asciiart.eu

Tracking the trackers. Draw connections between scripts and domains on a website.
medium.com/@woj_ciech github.com/woj-ciech
example: python3 kupa3.py https://nsa.gov

usage: kupa3.py [-h] [--url URL]

optional arguments:
  -h, --help  show this help message and exit
  --url URL   URL of website (default: https://nsa.gov)
```

## Output
![Reddit.com](https://i.imgur.com/WcQKMKa.png "Graph for reddit.com")
--------------------------------------------------------------------------------
/kupa3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib.request
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
import networkx as nx
import matplotlib.pyplot as plt
import argparse
import sys

desc = """          (   ,&&&.
           )  .,.&&
          (   (  \=__/
           )    ,'-'.
          (    (   ,,      _.__|/ /|
           )   /\ -((------((_|___/ |
          (   // | (`'      ((  `'--|
           _ -.;_/ \\--._      \\ \-._/.
          (_;-// | \ \-'.\    <_,\_\`--'|
          ( `.__ _  ___,')      <_,-'__,'
     jrei  `'(_ )_)(_)_)'  asciiart.eu\n
Tracking the trackers. Draw connections between scripts and domains on a website.
medium.com/@woj_ciech github.com/woj-ciech
example: python3 kupa3.py https://nsa.gov\n"""

print(desc)

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter  # show the default value in --help output
)

parser.add_argument("--url", help="URL of website", default="https://nsa.gov")
G = nx.Graph()

args = parser.parse_args()

url = args.url
parsed_arg = urlparse(url)  # parsed_arg[1] is the host, used for the output filename below

# Two URL-matching regexes (per the original comment, a single one misses some
# links). Both follow John Gruber's liberal URL pattern: the first alternative
# matches explicit http(s) URLs, the second bare domains with a known TLD.
# NOTE: parts of both patterns were lost in this dump; the missing pieces are
# restored here from Gruber's published pattern.
regex1 = r"""(?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\b\/?(?!@)))"""
regex2 = r"""(?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|biz|int|at|ca|eu|fr)\b\/?(?!@)))"""
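# Illustrative, hypothetical example of what the patterns catch: for an inline
# snippet such as
#
#     loadScript("https://cdn.tracker.com/pixel.js")
#
# re.findall(regex1, snippet) returns ['https://cdn.tracker.com/pixel.js'],
# which is how URLs are pulled out of inline JavaScript further down.
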
# Find <script> tags in the HTML code
def getscripts(url):
    list_of_scripts = []

    # get website
    try:
        oururl = urllib.request.urlopen(url, timeout=10).read()
    except Exception as e:
        print(e)
        sys.exit()
    soup = BeautifulSoup(oururl, 'html.parser')

    # collect every <script> tag
    # (the tail of this function and the whole getlinks() below were lost in
    # this dump; both are minimal reconstructions of the described behaviour)
    for script in soup.find_all('script'):
        list_of_scripts.append(script)
    return list_of_scripts


# Download a script and connect it to every URL/domain mentioned inside it
def getlinks(script):
    try:
        content = urllib.request.urlopen(script, timeout=10).read().decode('utf-8', errors='ignore')
    except Exception as e:
        print(e)
        return
    for link in re.findall(regex1, content) + re.findall(regex2, content):
        domain = urlparse(link)[1] or link  # keep the host part when there is one
        G.add_edge(script, domain)


# Extract the 'src' attribute from each <script> tag, or pull URLs straight
# out of inline JavaScript
def extractscripts(list_of_scripts):
    urls = []
    for i in list_of_scripts:
        src = False
        try:
            if i.attrs['src']:
                k = i.attrs['src']
                # A link starting with '//' refers to an external resource,
                # for example src=//www.subdomain.domain.com/a.js
                if i.attrs['src'].startswith("//"):
                    k = "https:" + i.attrs['src']
                # One starting with a single '/' refers to a local resource
                # like /application/static/tracking.js
                elif i.attrs['src'].startswith("/"):
                    k = url + i.attrs['src']

                urls.append(k)
                src = True
        except KeyError:  # tag has no 'src' attribute
            src = False

        if not src:  # inline script: find all links in the JavaScript code
            links = re.findall(regex1, str(i))
            for j in links:
                if '.js' in j:
                    # strip the quotes, backslashes, spaces and parens that
                    # often surround URLs embedded in JS
                    clean_link = re.split(r"""\"|\\|'| |\(""", j)
                    urls.append(clean_link[0])
    return urls

# Start
scripts = getscripts(url)  # get all <script> tags
extracted_scripts = extractscripts(scripts)  # extract 'src' or URLs found directly inside the scripts

print("-------------------- " + url + " -----------------------")
for script in extracted_scripts:  # prepend a scheme when one is missing
    if not script.startswith(('http://', 'https://')):
        script = 'https://' + script

    print(script)
    G.add_edge(url, script)
    getlinks(script)

nx.draw(G, with_labels=True)
plt.savefig("simple_path.png")  # save as png
nx.write_gexf(G, parsed_arg[1] + ".gexf")  # parsed_arg[1] is the host, e.g. nsa.gov
print("Saved as " + parsed_arg[1] + ".gexf")
plt.show()  # display
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.7.1
matplotlib==2.0.0
networkx==2.2
requests==2.20.0
urllib3==1.22
--------------------------------------------------------------------------------
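The README suggests exploring the saved graph in Gephi; for a quick look without Gephi, the same GEXF file can be read back with NetworkX. A minimal sketch, assuming a prior run against the default https://nsa.gov wrote `nsa.gov.gexf`:

```python
import networkx as nx

# Load the graph kupa3 saved; the filename is the target host plus ".gexf".
G = nx.read_gexf("nsa.gov.gexf")

# The highest-degree nodes are the scripts and domains with the most
# connections on the page, typically the shared trackers and CDNs.
for node, degree in sorted(G.degree, key=lambda nd: nd[1], reverse=True)[:10]:
    print(degree, node)
```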