├── jsa
│   ├── automation
│   │   ├── 404_js_wayback.sh
│   │   ├── js_files_extraction.py
│   │   └── tld_detection.py
│   ├── automation.sh
│   └── jsa.py
├── README.md
├── Dockerfile
└── webpack
    └── unwebpack_sourcemap.py

/jsa/automation/404_js_wayback.sh:
--------------------------------------------------------------------------------
#!/bin/bash

url=$1

status_code=$(curl --insecure --connect-timeout 100 -sL -w "%{http_code}" "$url" -o /dev/null)

if [ "$status_code" != "200" ]
then
    printf "https://web.archive.org/web/20060102150405if_/$url\n"
fi
--------------------------------------------------------------------------------
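404_js_wayback.sh only prints the snapshot URL. The fixed 20060102150405 timestamp looks arbitrary: the Wayback Machine redirects any /web/<timestamp>if_/<url> request to its closest capture, and the if_ modifier serves the raw file without the archive toolbar. A minimal sketch (assuming curl; the target URL and output name are made up) of fetching such a snapshot for offline credential scanning:

    # check a hypothetical JS URL and, if it is no longer live,
    # pull the archived copy for offline scanning
    url="https://example.com/static/app.js"
    code=$(curl -skL --connect-timeout 100 -o /dev/null -w "%{http_code}" "$url")
    if [ "$code" != "200" ]
    then
        curl -skL "https://web.archive.org/web/20060102150405if_/$url" -o app.js.wayback
    fi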
/README.md:
--------------------------------------------------------------------------------
Automatically crawls domains for JS files and searches the Wayback Machine (via gau) for additional .js files.

Finds sensitive strings, credentials and API keys in the crawled JS files with trufflehog.

Just works.

Example usage:

    docker run --dns=8.8.8.8 --rm --privileged=true --ulimit nofile=1048576:1048576 --cpu-shares 256 -v jsa:/jsa 5631/jsa /jsa/URLINPUT.txt /jsa/OUTPUTDIRECTORY >> /jsa/jsa.output
--------------------------------------------------------------------------------
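The image is referenced above as 5631/jsa. One possible way to build it and stage the URL list inside the named jsa volume before running (the build tag, urls.txt and the staging one-liner are assumptions, not part of the repo):

    docker build -t 5631/jsa .
    docker volume create jsa
    # copy a newline-separated list of target URLs into the volume as URLINPUT.txt
    docker run -i --rm -v jsa:/jsa --entrypoint /bin/sh 5631/jsa -c 'cat > /jsa/URLINPUT.txt' < urls.txt

Results stay in the same volume: automation.sh appends the trufflehog findings to OUTPUTDIRECTORY/out.txt.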
/Dockerfile:
--------------------------------------------------------------------------------
FROM golang:latest

COPY . .

ENV HOME /go/jsa
ENV GOPATH=/go/
ENV PATH $PATH:$GOPATH
ENV PATH $PATH:/go/jsa

WORKDIR /go/jsa/

RUN apt -y update && apt -y install git \
    wget \
    python3 \
    python3-pip parallel

RUN GO111MODULE=on go install github.com/lc/gau@latest && GO111MODULE=on go install github.com/jaeles-project/gospider@latest

RUN pip3 install bs4 --break-system-packages && pip3 install requests --break-system-packages

RUN chmod +x automation.sh && chmod +x automation/404_js_wayback.sh

RUN git clone https://github.com/trufflesecurity/trufflehog.git && cd trufflehog && go install

ENTRYPOINT ["automation.sh"]

#RUN pip3 install idna==2.10 && pip3 install tldextract && pip3 install -r /go/linkfinder/requirements.txt
--------------------------------------------------------------------------------
/jsa/automation.sh:
--------------------------------------------------------------------------------
#!/bin/bash

FILENAME=$1

LINES=$(cat $FILENAME)

mkdir $2
mkdir /tmp
mkdir $2/download/
cd $2/download/

task(){

    LINE=$1
    i=$2
    outputdir=$3

    echo i: $i

    sleep 3

    mkdir $outputdir/$i
    printf "Crawl... $LINE\n"

    printf $LINE | timeout 1800 gospider -t 1 --concurrent 1 -d 1 --other-source --include-other-source --delay 1 --timeout 120 --js=false --sitemap --depth 2 --robots --blacklist eot,jpg,jpeg,gif,css,tif,tiff,png,ttf,otf,woff,woff2,ico,pdf,svg,txt,mp4,avi,mpeg4,mp3,webm,ogv,gif,jpg,jpeg,png > $outputdir/$i/gospider.txt
    cat $outputdir/$i/gospider.txt | grep -vE 'https?:\/\/.*\.json' | grep -vE 'jquery|bootstrap|ga.js|watch.js|wp-embed|angular|wf\-|recaptcha|gtm.js|google|sweetalert|i18n' | grep -E 'https?:\/\/.*\.js' -o | sort -u > $outputdir/$i/wget.txt

    ## launching wayback in a "js only" mode to reduce execution time
    printf 'Launching Gau with wayback..\n'
    printf $LINE | xargs -I{} echo "{}/*&filter=mimetype:application/javascript&somevar=" | gau -providers wayback -b eot,jpg,jpeg,gif,css,tif,tiff,png,ttf,otf,woff,woff2,ico,pdf,svg,txt,mp4,avi,mpeg4,mp3,webm,ogv,gif,jpg,jpeg,png | tee $outputdir/$i/gau.txt >/dev/null ##gau
    printf $LINE | xargs -I{} echo "{}/*&filter=mimetype:text/javascript&somevar=" | gau -providers wayback -b eot,jpg,jpeg,gif,css,tif,tiff,png,ttf,otf,woff,woff2,ico,pdf,svg,txt,mp4,avi,mpeg4,mp3,webm,ogv,gif,jpg,jpeg,png | tee -a $outputdir/$i/gau.txt >/dev/null ##gau

    ## if a js file found via wayback no longer returns 200 live, we generate a URL that serves the file's content from wayback's own servers;
    ## that is useless for endpoint discovery, but the old content is still worth searching for credentials, and that is what we do with it
    ## only wayback as of now
    chmod -R 777 $outputdir/$i/

    #printf "Fetching URLs for 404 js files from wayback..\n"
    #cat $outputdir/$i/gau.txt | cut -d '?' -f1 | cut -d '#' -f1 | grep '.*\.js$' | sort -u | parallel --gnu -j 2 "/go/jsa/automation/404_js_wayback.sh {}" | tee -a $outputdir/$i/creds_search.txt >/dev/null
    #cat $outputdir/$i/wget.txt | cut -d '?' -f1 | cut -d '#' -f1 | grep '.*\.js$' | sort -u | parallel --gnu -j 2 "/go/jsa/automation/404_js_wayback.sh {}" | tee -a $outputdir/$i/creds_search.txt >/dev/null
    ## save all endpoints to the file for future processing

    ## extracting js files from js files
    printf "Printing deep-level js files..\n"
    cat $outputdir/$i/wget.txt | parallel --gnu --pipe -j 2 "timeout 6000 python3 /go/jsa/automation/js_files_extraction.py | tee -a $outputdir/$i/wget.txt"

    printf "wget discovered JS files for local creds scan + webpack + api paths\n"
    sed 's/$/.map/' $outputdir/$i/wget.txt > $outputdir/$i/wgetmap.txt

    cat $outputdir/$i/wget.txt | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /' | xargs wget -c --no-directories -P "$outputdir/download/" --retry-on-host-error --tries=5 --content-disposition --no-check-certificate --timeout=160 --trust-server-names
    cat $outputdir/$i/creds_search.txt | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /' | xargs wget -c --no-directories -P "$outputdir/download/" --retry-on-host-error --tries=7 --content-disposition --no-check-certificate --timeout=160 --trust-server-names
    cat $outputdir/$i/wgetmap.txt | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /' | xargs wget -c --no-directories -P "$outputdir/download/" --retry-on-host-error --tries=5 --content-disposition --no-check-certificate --timeout=160 --trust-server-names

    mkdir $outputdir/$i

    outputurl=${LINE//:/.}
    outputurl=${outputurl//\//.}

    python3 /go/webpack/unwebpack_sourcemap.py --make-directory --disable-ssl-verification --detect $LINE $outputdir/$i/$outputurl
}

i=0
for LINE in $LINES
do
    ((i=i+1))
    task "$LINE" "$i" "$2" & #call all domains in parallel
done

wait
pwd

if [ ! -f "/jsa/shasums" ];
then
    touch /jsa/shasums
fi

#get the sha1 sum of each downloaded file and check that it hasn't already been scanned on an earlier run
for filename in *
do
    currentfilehash=$(cat "$filename" | sha1sum | head -c 40)

    if grep -Fxq "$currentfilehash" /jsa/shasums
    then
        rm "$filename"
    else
        echo "$currentfilehash" >> /jsa/shasums
    fi
done

trufflehog filesystem --directory=$2 >> $2/out.txt
--------------------------------------------------------------------------------
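The sed pairing used for the wget download lines above is terse; here is what it actually emits for a single, hypothetical URL:

    printf 'https://example.com/assets/app.js\n' | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /'
    # -> https://example.com/assets/app.js -O https:--example.com-assets-app.js
    # each URL is duplicated, slashes in the copy become dashes, and the two lines are
    # joined into "<url> -O <flat-filename>" so every file lands in one flat directory
    # without name collisions

Note that xargs hands several of these pairs to a single wget invocation, where repeated -O options will clash; running one wget per line (for example with xargs -L1) is probably closer to the intent.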
/jsa/automation/js_files_extraction.py:
--------------------------------------------------------------------------------
import re
import requests
import io
import sys
from datetime import datetime
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from tld_detection import tld_detection

js_file = sys.stdin.readlines()


original_lines = []
js_files_2nd_lvl = []
js_files_3rd_lvl = []
js_files_4th_lvl = []

js_files_3rd_lvl_original = []
tmp_list = []

def deduplication(input, original_lines):  ## filtering + deduplication
    existing_lines = []
    for line in input:  ## Filtering the output of subjs (#$ and ?v=$)
        line = re.sub("\\?v=.*?$", "", line)
        line = re.sub("#.*?$", "", line)
        existing_lines.append(line)
    for line in existing_lines:  ## Deleting duplicates
        line = line.strip()
        if line not in original_lines:
            original_lines.append(line)

clear_url_global = []

def main_func(original_lines, js_files):
    for line in original_lines:  ## main loop

        # tld = re.sub("^[a-z]+\.", "", domain_name)  ## matching TLD

        clear_url0 = re.findall("^(.*?)\\b/", line)
        global clear_url
        clear_url = re.sub("\['|'\]", "", str(clear_url0))  ## matching URL without the js part
        domain_name = tld_detection(clear_url)
        if "[]" in clear_url:
            continue
        ##if str(domain_name) not in str(line):  ## deleting since this is automation and we need a clear output
        ## excluding 3rd party js files & print 'em
        #    print("3rd party JS file has been found: " + line)
        #    continue
        warnings.simplefilter('ignore', InsecureRequestWarning)
        try:
            js_file_status = requests.head(line,
                                           verify=False).status_code  ## quickly (HEAD) find out the status code of the js file url
        except requests.exceptions.RequestException:
            continue  ## skip unreachable or malformed URLs instead of crashing the whole run
        if js_file_status == 200:  ## if js file exists (to reduce time)
            warnings.simplefilter('ignore', InsecureRequestWarning)
            js_file_content = requests.get(line, verify=False)  ## fetching js file's content
            # filename = "%s/%s" % (directory_with_js_files, name_for_wget)
            # os.makedirs(os.path.dirname(filename))  ##creating dir with a js file
            # js_file_write = open(filename, "w")  ## it's for js file downloading

            # js_file_write.write(js_file_content.text)  ## wget for js file into the directory
            u = re.findall("\"\/[a-zA-Z0-9_?&=/\-\#\.]*\"", js_file_content.text)  ## matching "string"
            u = str(u).replace("', '", "\n").replace("[]", "")
            u = re.sub("\['|'\]|\"", "", u)
            u = re.sub(
                ".css|.png|.jpg|.svg|.jpeg|.ico|.gif|.woff|.woff2|.swf", "", u,
                flags=re.M)  ## excluding not desirable file extensions
            u = re.sub(".*?\.(facebook|twitter).(net|com)(/)|(/|/\?|/#|#)$", "", u,
                       flags=re.M)  ##preparing for deduplication with / /? 
# deleting 68 | u = re.sub("(\n\n)", "\n", u, flags=re.M) 69 | if re.findall("^//", u): 70 | u = re.sub("^//(.*?)/", clear_url + "/", u, flags=re.M) 71 | else: 72 | u = re.sub("^", clear_url, u, flags=re.M) 73 | u_lines = io.StringIO(u).readlines() ## endpoints 74 | 75 | for one in u_lines: 76 | if re.findall("\.js$", one): 77 | ##if re.findall("^//", one) and verbose is True: ## excluding 3rd party 2nd lvl js files & print 'em 78 | ##print("3rd party JS file has been found: " + one) ## deleting since this is automation and we need a clear output 79 | if re.findall("^//", one): 80 | one = re.sub("^//(.*?)/", clear_url + "/", one) # one = re.sub("\n", "", one) 81 | js_files.append(one) 82 | if re.findall("^/", one): 83 | one = re.sub("^/", clear_url + "/", one) 84 | if re.findall("^\b", one): ## if js file doesn't have / at ^, it'll be added 85 | one = re.sub("^", clear_url + "/", one) # one = re.sub("\n", "", one) 86 | js_files.append(one) 87 | if re.findall("^\[\]/", one): 88 | one = re.sub("^\[\]", clear_url, one) 89 | js_files.append(one) 90 | else: ## printing js files found on 2nd level 91 | js_files.append(one) 92 | ##elif js_file_status == 404: ## todo make it for subjs output only ## deleting since this is automation and we need a clear output 93 | # print( 94 | # "JS file {} returned 404 code. Check the host and try to apply file upload with path traversal/PUT method file upload.".format(line)) 95 | 96 | 97 | deduplication(js_file, original_lines) 98 | 99 | main_func(original_lines, js_files_2nd_lvl) 100 | 101 | if len(js_files_2nd_lvl) != 0: ## processing 2nd level js files 102 | ##if verbose is True: ## deleting since this is automation and we need a clear output 103 | ##print("\nJS files 2nd level:\n") 104 | js_files_2nd_lvl_original = [] 105 | deduplication(js_files_2nd_lvl, js_files_2nd_lvl_original) ## removing dupes 106 | for l in js_files_2nd_lvl_original: ## printing a list 107 | j2 = re.findall("\.js", l) ## sometimes (I don't know why though), non-js files leak to the list 108 | if len(j2) == 0: 109 | continue 110 | elif l not in original_lines: 111 | print(l) 112 | main_func(js_files_2nd_lvl_original, js_files_3rd_lvl) 113 | 114 | if len(js_files_3rd_lvl) != 0: 115 | ##if verbose is True: 116 | ##print("JS files 3rd level:\n") 117 | deduplication(js_files_3rd_lvl, js_files_3rd_lvl_original) ## removing dupes 118 | for l in js_files_3rd_lvl_original: ## printing a list 119 | j3 = re.findall("\.js$", l) ## sometimes (I don't know why though), non-js files leak to the list 120 | if len(j3) == 0: 121 | continue 122 | elif l not in js_files_2nd_lvl_original and original_lines: 123 | if re.findall("^htt(p|s)(.*?)\w//(.*?)/", l): 124 | l = re.sub("^htt(p|s)(.*?)\w//(.*?)/", clear_url + "/", l, flags=re.M) 125 | tmp_list.append(l) 126 | print(l) 127 | js_files_3rd_lvl_original.clear() 128 | js_files_3rd_lvl_original = tmp_list 129 | 130 | #main_func(js_files_3rd_lvl, js_files_4th_lvl) 131 | 132 | -------------------------------------------------------------------------------- /jsa/jsa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import requests 4 | import io 5 | import os 6 | import argparse 7 | import sys 8 | from datetime import datetime 9 | import warnings 10 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 11 | from tld_detection import tld_detection 12 | 13 | ## Implement reading from specified file 14 | 15 | parser = argparse.ArgumentParser() 16 | 
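# Typical invocations (assumed from the options defined below; file and URL names are placeholders):
#   cat js_urls.txt | python3 jsa.py -v
#   python3 jsa.py -f https://target.example/static/app.js -e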
parser.add_argument('-v', "--verbose", help='verbose', action='store_true') 17 | parser.add_argument('-e', "--exclude", help='exclude & print 3rd party js files', action='store_true') 18 | parser.add_argument('-f', "--file", help='js file URL in format htt(p|ps)://(.*)/name.js', action='append') 19 | 20 | verbose = parser.parse_args().verbose 21 | exclude = parser.parse_args().exclude 22 | if not sys.stdin.isatty(): 23 | global js_file 24 | js_file = sys.stdin.readlines() 25 | elif parser.parse_args().file: 26 | js_file = parser.parse_args().file 27 | else: 28 | print("Please specify js file in STDIN or in argument -f!") 29 | exit() 30 | 31 | # js_file = open("/Users/max/test13.txt", "r").readlines() 32 | 33 | 34 | ## just some containers for future values 35 | 36 | original_lines = [] 37 | 38 | all_endpoints_1st_lvl = [] 39 | all_endpoints_original = [] 40 | js_files_2nd_lvl = [] 41 | 42 | all_endpoints_2nd_lvl = [] 43 | all_endpoints_2nd_lvl_original = [] 44 | js_files_3rd_lvl = [] 45 | 46 | all_endpoints_3rd_lvl = [] 47 | all_endpoints_3rd_lvl_original = [] 48 | tmp_list = [] 49 | js_files_4th_lvl = [] ## just for passing it to the main func, it won't be processed actually 50 | 51 | 52 | #### 53 | # now = datetime.now() 54 | # now = str(now).replace(" ", "_").replace(":", "-") 55 | # now = re.sub("\..*?$", "", now) 56 | 57 | # curpath = os.path.abspath(os.curdir) 58 | 59 | # directory_with_js_files = "%s/js_files/%s/" % (curpath, now) ## directory of downloaded js files for other tools 60 | 61 | 62 | ### 63 | 64 | def deduplication(input, original_lines): ## filtering + deduplication 65 | existing_lines = [] 66 | for line in input: ## Filtering the output of subjs (#$ and ?v=$) 67 | line = re.sub("\\?v=.*?$", "", line) 68 | line = re.sub("#.*?$", "", line) 69 | existing_lines.append(line) 70 | for line in existing_lines: ## Deleting duplicates 71 | line = line.strip() 72 | if line not in original_lines: 73 | original_lines.append(line) 74 | 75 | 76 | def main_func(original_lines, js_files, all_endpoints): 77 | for line in original_lines: ## main loop 78 | 79 | clear_url0 = re.findall("^(.*?)\\b/", line) 80 | global clear_url 81 | clear_url = re.sub("\['|'\]", "", str(clear_url0)) ## matching URL without js part 82 | domain_name = tld_detection(clear_url) 83 | if "[]" in clear_url: 84 | continue 85 | if str(domain_name) not in str(line) and exclude is True: 86 | ## excluding 3rd party js files & print 'em 87 | 88 | print("Possible (if not CDN) 3rd party JS file has been found: " + line) 89 | warnings.simplefilter('ignore', InsecureRequestWarning) 90 | try: 91 | js_file_status = requests.get(line, verify=False).status_code ## finding out a status code of js file url 92 | except Exception: 93 | pass 94 | if js_file_status == 200: ## if js file exists (to reduce time) 95 | warnings.simplefilter('ignore', InsecureRequestWarning) 96 | js_file_content = requests.get(line, verify=False) ## fetching js file's content 97 | 98 | # filename = "%s/%s" % (directory_with_js_files, name_for_wget) 99 | # os.makedirs(os.path.dirname(filename)) ##creating dir with a js file 100 | # js_file_write = open(filename, "w") ## it's for js file downloading 101 | 102 | # js_file_write.write(js_file_content.text) ## wget for js file into the directory 103 | 104 | u = re.findall("\"\/[a-zA-Z0-9_?&=/\-\#\.]*\"", js_file_content.text) ## matching "string" 105 | u = str(u).replace("', '", "\n").replace("[]", "") 106 | u = re.sub("\['|'\]|\"", "", u) 107 | u = re.sub( 108 | 
".css|.png|.jpg|.svg|.jpeg|.ico|.gif|.woff|.woff2|.swf", "", u, 109 | flags=re.M) ## excluding not desirable file extensions 110 | u = re.sub(".*?\.(facebook|twitter).(net|com)(/)|(/|/\?|/#|#)$", "", u, 111 | flags=re.M) ##preparing for deduplication with / /? # deleting 112 | u = re.sub("(\n\n)", "\n", u, flags=re.M) 113 | 114 | if re.findall("^//", u): 115 | u = re.sub("^//(.*?)/", clear_url + "/", u, flags=re.M) ## it's for js files 116 | else: 117 | u = re.sub("^", clear_url, u, flags=re.M) 118 | u_lines = io.StringIO(u).readlines() ## endpoints 119 | 120 | for one in u_lines: 121 | if re.findall("\.js$", one): 122 | if re.findall("^//", one) and verbose is True: ## excluding 3rd party 2nd lvl js files & print 'em 123 | if not re.findall("^//%s" % domain_name, one): 124 | print("Possible (if not CDN) 3rd party JS file has been found: " + one) 125 | if re.findall("^//", one): 126 | one = re.sub("^//(.*?)/", clear_url + "/", one) # one = re.sub("\n", "", one) 127 | js_files.append(one) 128 | if re.findall("^/", one): 129 | one = re.sub("^/", clear_url + "/", one) 130 | if re.findall("^\b", one): ## if js file doesn't have / at ^, it'll be added 131 | one = re.sub("^", clear_url + "/", one) # one = re.sub("\n", "", one) 132 | js_files.append(one) 133 | if re.findall("^\[\]/", one): 134 | one = re.sub("^\[\]", clear_url, one) 135 | js_files.append(one) 136 | else: ## printing js files found on 2nd level 137 | js_files.append(one) 138 | else: 139 | all_endpoints.append(one) ## printing 1st lvl endpoints 140 | elif js_file_status == 404 and verbose is True: ## todo make it for subjs output only 141 | print( 142 | "JS file {} returned 404 code. Check the host and try to apply file upload with path traversal/PUT method file upload.".format( 143 | line)) 144 | 145 | 146 | deduplication(js_file, original_lines) 147 | main_func(original_lines, js_files_2nd_lvl, all_endpoints_1st_lvl) 148 | 149 | if len(all_endpoints_1st_lvl) != 0: 150 | temp0 = [] 151 | for l in all_endpoints_1st_lvl: 152 | 153 | clear_domain = re.findall("http(s)://(.*)(?=/)", l) 154 | clear_domain = re.findall(", '(.*?)'", str(clear_domain)) 155 | clear_domain = ''.join(clear_domain) 156 | 157 | t = re.findall("^(.*?)(?<=com)", l) 158 | l = re.sub("(/|/\?|/#|#|/\.)$", "", l) ## additionally deleting / /? 
/# 159 | 160 | if not re.findall("%s$" % clear_domain, 161 | l): ## removing clear urls without actual endpoints like http(s)://domain.com 162 | if "[]//" in l: 163 | l = l.replace("[]//", "//%s" % clear_domain) 164 | temp0.append(l) 165 | 166 | if not re.findall("%s/\W" % clear_domain, l): ## deleting endpoints containing 167 | ## non-word character (not a-z0-9) http(s)://domain.com/(.|[]{}, 168 | if not re.findall("%s/[a-z0-9]{1}$" % clear_domain, 169 | l): ## deleting endpoints containing 1 word character like http(s)://domain.com/1|a|1a; 170 | temp0.append(l) ## most likely to be an endpoint and not a javascript variable 171 | 172 | all_endpoints_1st_lvl.clear() ## deleting current list w/ endpoints 173 | all_endpoints_1st_lvl = temp0 ##substitution 174 | 175 | deduplication(all_endpoints_1st_lvl, all_endpoints_original) ## deleting dupes 176 | for l in all_endpoints_original: ## printing a list 177 | if "[]" in l: 178 | continue 179 | else: 180 | print(l) 181 | 182 | if len(js_files_2nd_lvl) != 0: ## processing 2nd level js files 183 | printed = False 184 | js_files_2nd_lvl_original = [] 185 | deduplication(js_files_2nd_lvl, js_files_2nd_lvl_original) ## removing dupes 186 | for l in js_files_2nd_lvl_original: ## printing a list 187 | 188 | j2 = re.findall("\.js$", l) ## sometimes (I don't know why though), non-js files leak to the list 189 | if len(j2) == 0: 190 | continue 191 | if l not in original_lines: 192 | if printed is False and verbose is True: ## printing a text only one time if verbose mode 193 | print("\nJS files 2nd level:\n") 194 | printed = True 195 | if verbose is True: 196 | print(l) 197 | 198 | main_func(js_files_2nd_lvl_original, js_files_3rd_lvl, all_endpoints_2nd_lvl) 199 | 200 | if len(js_files_3rd_lvl) != 0: 201 | printed = False 202 | js_files_3rd_lvl_original = [] 203 | deduplication(js_files_3rd_lvl, js_files_3rd_lvl_original) ## removing dupes 204 | for l in js_files_3rd_lvl_original: ## printing a list 205 | 206 | j3 = re.findall("\.js$", l) ## sometimes (I don't know why though), non-js files leak to the list 207 | if len(j3) == 0: 208 | continue 209 | if l not in js_files_2nd_lvl_original and original_lines: 210 | if printed is False and verbose is True: ## printing a text only one time if verbose mode 211 | print("\nJS files 3rd level:\n") 212 | printed = True 213 | if verbose is True: 214 | if re.findall("^htt(p|s)(.*?)\w//(.*?)/", l): 215 | l = re.sub("^htt(p|s)(.*?)\w//(.*?)/", clear_url + "/", l, flags=re.M) 216 | print(l) 217 | 218 | main_func(js_files_3rd_lvl, js_files_4th_lvl, all_endpoints_3rd_lvl) 219 | 220 | 221 | 222 | if all_endpoints_2nd_lvl: ## printing 2nd level endpoints 223 | temp1 = [] 224 | for l in all_endpoints_2nd_lvl: 225 | 226 | clear_domain = re.findall("http(s)://(.*)(?=/)", l) 227 | clear_domain = re.findall(", '(.*?)'", str(clear_domain)) 228 | clear_domain = ''.join(clear_domain) 229 | 230 | t = re.findall("^(.*?)(?<=com)", l) 231 | l = re.sub("(/|/\?|/#|#|/\.)$", "", l) ## additionally deleting / /? 
/# 232 | 233 | if not re.findall("%s$" % clear_domain, 234 | l): ## removing clear urls without actual endpoints like http(s)://domain.com 235 | if "[]//" in l: 236 | l = l.replace("[]//", "//%s" % clear_domain) 237 | temp1.append(l) 238 | 239 | if not re.findall("%s/\W" % clear_domain, l): ## deleting endpoints containing 240 | ## non-word character (not a-z0-9) http(s)://domain.com/(.|[]{}, 241 | if not re.findall("%s/[a-z0-9]{1}$" % clear_domain, 242 | l): ## deleting endpoints containing 1 word character like http(s)://domain.com/1|a|1a; 243 | temp1.append(l) ## most likely to be an endpoint and not a javascript variable 244 | 245 | all_endpoints_2nd_lvl.clear() ## deleting current list w/ endpoints 246 | all_endpoints_2nd_lvl = temp1 ##substitution 247 | printed = False 248 | deduplication(all_endpoints_2nd_lvl, all_endpoints_2nd_lvl_original) ## deleting dupes 249 | for l in all_endpoints_2nd_lvl_original: ## printing a lists 250 | if "[]" in l: 251 | continue 252 | elif l not in all_endpoints_original: 253 | if printed is False and verbose is True: ## printing a text only one time if verbose mode 254 | print("\nEndpoints 2nd level:\n") 255 | printed = True 256 | print(l) ##printing URL with endpoint if it's original 257 | 258 | if all_endpoints_3rd_lvl: 259 | all_endpoints_3rd_lvl_original = [] 260 | temp2 = [] 261 | for l in all_endpoints_3rd_lvl: 262 | 263 | clear_domain = re.findall("http(s)://(.*)(?=/)", l) 264 | clear_domain = re.findall(", '(.*?)'", str(clear_domain)) 265 | clear_domain = ''.join(clear_domain) 266 | 267 | t = re.findall("^(.*?)(?<=com)", l) 268 | l = re.sub("(/|/\?|/#|#|/\.)$", "", l) ## additionally deleting / /? /# 269 | 270 | if not re.findall("%s$" % clear_domain, 271 | l): ## removing clear urls without actual endpoints like http(s)://domain.com 272 | if "[]//" in l: 273 | l = l.replace("[]//", "//%s" % clear_domain) 274 | temp2.append(l) 275 | 276 | if not re.findall("%s/\W" % clear_domain, l): ## deleting endpoints containing 277 | ## non-word character (not a-z0-9) http(s)://domain.com/(.|[]{}, 278 | if not re.findall("%s/[a-z0-9]{1}$" % clear_domain, 279 | l): ## deleting endpoints containing 1 word character like http(s)://domain.com/1|a|1a; 280 | temp2.append(l) ## most likely to be an endpoint and not a javascript variable 281 | 282 | all_endpoints_3rd_lvl.clear() ## deleting current list w/ endpoints 283 | all_endpoints_3rd_lvl = temp2 ##substitution 284 | printed = False 285 | all_endpoints_2nd_lvl_original = [] ## deleting dupes 286 | deduplication(all_endpoints_3rd_lvl, all_endpoints_3rd_lvl_original) 287 | for l in all_endpoints_3rd_lvl_original: ## printing a lists 288 | if "[]" in l: 289 | continue 290 | elif l not in all_endpoints_original and all_endpoints_2nd_lvl_original: 291 | if printed is False and verbose is True: 292 | print("Endpoints 3rd level:\n") 293 | printed = True 294 | print(l) 295 | 296 | # if os.path.exists(directory_with_js_files) is True: 297 | # os.system("retire %s" % directory_with_js_files) 298 | 299 | ## Deleting duplicates from the js files 2nd level 300 | 301 | ## Deleting duplicates from the endpoints 1st level 302 | 303 | ## Deleting duplicates from the js files 3rdnd level 304 | 305 | ## Deleting duplicates from the endpoints 2nd level 306 | -------------------------------------------------------------------------------- /webpack/unwebpack_sourcemap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | unwebpack_sourcemap.py 4 | by rarecoil 
(github.com/rarecoil/unwebpack-sourcemap) 5 | 6 | Reads Webpack source maps and extracts the disclosed 7 | uncompiled/commented source code for review. Can detect and 8 | attempt to read sourcemaps from Webpack bundles with the `-d` 9 | flag. Puts source into a directory structure similar to dev. 10 | """ 11 | 12 | import argparse 13 | import json 14 | import os 15 | import re 16 | import string 17 | import sys 18 | from urllib.parse import urlparse 19 | from unicodedata import normalize 20 | 21 | import requests 22 | from bs4 import BeautifulSoup, SoupStrainer 23 | 24 | 25 | class SourceMapExtractor(object): 26 | """Primary SourceMapExtractor class. Feed this arguments.""" 27 | 28 | _target = None 29 | _is_local = False 30 | _attempt_sourcemap_detection = False 31 | _output_directory = "" 32 | _target_extracted_sourcemaps = [] 33 | 34 | _path_sanitiser = None 35 | 36 | 37 | def __init__(self, options): 38 | """Initialize the class.""" 39 | if 'output_directory' not in options: 40 | raise SourceMapExtractorError("output_directory must be set in options.") 41 | else: 42 | self._output_directory = os.path.abspath(options['output_directory']) 43 | if not os.path.isdir(self._output_directory): 44 | if options['make_directory'] is True: 45 | os.mkdir(self._output_directory) 46 | else: 47 | raise SourceMapExtractorError("output_directory does not exist. Pass --make-directory to auto-make it.") 48 | 49 | self._path_sanitiser = PathSanitiser(self._output_directory) 50 | 51 | if options['disable_ssl_verification'] == True: 52 | self.disable_verify_ssl = True 53 | else: 54 | self.disable_verify_ssl = False 55 | 56 | if options['local'] == True: 57 | self._is_local = True 58 | 59 | if options['detect'] == True: 60 | self._attempt_sourcemap_detection = True 61 | 62 | self._validate_target(options['uri_or_file']) 63 | 64 | 65 | def run(self): 66 | """Run extraction process.""" 67 | if self._is_local == False: 68 | if self._attempt_sourcemap_detection: 69 | detected_sourcemaps = self._detect_js_sourcemaps(self._target) 70 | for sourcemap in detected_sourcemaps: 71 | self._parse_remote_sourcemap(sourcemap) 72 | else: 73 | self._parse_remote_sourcemap(self._target) 74 | 75 | else: 76 | self._parse_sourcemap(self._target) 77 | 78 | 79 | def _validate_target(self, target): 80 | """Do some basic validation on the target.""" 81 | parsed = urlparse(target) 82 | if self._is_local is True: 83 | self._target = os.path.abspath(target) 84 | if not os.path.isfile(self._target): 85 | raise SourceMapExtractorError("uri_or_file is set to be a file, but doesn't seem to exist. check your path.") 86 | else: 87 | if parsed.scheme == "": 88 | raise SourceMapExtractorError("uri_or_file isn't a URI, and --local was not set. 
set --local?") 89 | file, ext = os.path.splitext(parsed.path) 90 | self._target = target 91 | if ext != '.map' and self._attempt_sourcemap_detection is False: 92 | print("WARNING: URI does not have .map extension, and --detect is not flagged.") 93 | 94 | 95 | def _parse_remote_sourcemap(self, uri): 96 | """GET a remote sourcemap and parse it.""" 97 | data, final_uri = self._get_remote_data(uri) 98 | if data is not None: 99 | self._parse_sourcemap(data, True) 100 | else: 101 | print("WARNING: Could not retrieve sourcemap from URI %s" % final_uri) 102 | 103 | 104 | def _detect_js_sourcemaps(self, uri): 105 | """Pull HTML and attempt to find JS files, then read the JS files and look for sourceMappingURL.""" 106 | remote_sourcemaps = [] 107 | data, final_uri = self._get_remote_data(uri) 108 | 109 | # TODO: scan to see if this is a sourcemap instead of assuming HTML 110 | print("Detecting sourcemaps in HTML at %s" % final_uri) 111 | script_strainer = SoupStrainer("script", src=True) 112 | try: 113 | soup = BeautifulSoup(data, "html.parser", parse_only=script_strainer) 114 | except: 115 | raise SourceMapExtractorError("Could not parse HTML at URI %s" % final_uri) 116 | 117 | for script in soup: 118 | source = script['src'] 119 | parsed_uri = urlparse(source) 120 | next_target_uri = "" 121 | if parsed_uri.scheme != '': 122 | next_target_uri = source 123 | else: 124 | current_uri = urlparse(final_uri) 125 | built_uri = current_uri.scheme + "://" + current_uri.netloc + source 126 | next_target_uri = built_uri 127 | 128 | js_data, last_target_uri = self._get_remote_data(next_target_uri) 129 | # get last line of file 130 | last_line = js_data.rstrip().split("\n")[-1] 131 | regex = "\\/\\/#\s*sourceMappingURL=(.*)$" 132 | matches = re.search(regex, last_line) 133 | if matches: 134 | asset = matches.groups(0)[0].strip() 135 | asset_target = urlparse(asset) 136 | if asset_target.scheme != '': 137 | print("Detected sourcemap at remote location %s" % asset) 138 | remote_sourcemaps.append(asset) 139 | else: 140 | current_uri = urlparse(last_target_uri) 141 | asset_uri = current_uri.scheme + '://' + \ 142 | current_uri.netloc + \ 143 | os.path.dirname(current_uri.path) + \ 144 | '/' + asset 145 | print("Detected sourcemap at remote location %s" % asset_uri) 146 | remote_sourcemaps.append(asset_uri) 147 | 148 | return remote_sourcemaps 149 | 150 | 151 | def _parse_sourcemap(self, target, is_str=False): 152 | map_data = "" 153 | if is_str is False: 154 | if os.path.isfile(target): 155 | with open(target, 'r', encoding='utf-8', errors='ignore') as f: 156 | map_data = f.read() 157 | else: 158 | map_data = target 159 | 160 | # with the sourcemap data, pull directory structures 161 | try: 162 | map_object = json.loads(map_data) 163 | except json.JSONDecodeError: 164 | print("ERROR: Failed to parse sourcemap %s. Are you sure this is a sourcemap?" % target) 165 | return False 166 | 167 | # we need `sourcesContent` and `sources`. 168 | # do a basic validation check to make sure these exist and agree. 
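        # For reference, the relevant part of a Webpack sourcemap typically looks like
        # (illustrative, heavily trimmed):
        #   {"version": 3,
        #    "sources": ["webpack:///./src/index.js", "webpack:///./src/api.js"],
        #    "sourcesContent": ["<original index.js text>", "<original api.js text>"],
        #    "mappings": "..."}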
169 | if 'sources' not in map_object or 'sourcesContent' not in map_object: 170 | print("ERROR: Sourcemap does not contain sources and/or sourcesContent, cannot extract.") 171 | return False 172 | 173 | if len(map_object['sources']) != len(map_object['sourcesContent']): 174 | print("WARNING: sources != sourcesContent, filenames may not match content") 175 | 176 | for source, content in zip(map_object['sources'], map_object['sourcesContent']): 177 | # remove webpack:// from paths 178 | # and do some checks on it 179 | write_path = self._get_sanitised_file_path(source) 180 | if write_path is None: 181 | print("ERROR: Could not sanitize path %s" % source) 182 | continue 183 | 184 | os.makedirs(os.path.dirname(write_path), mode=0o755, exist_ok=True) 185 | with open(write_path, 'w', encoding='utf-8', errors='ignore', newline='') as f: 186 | print("Writing %s..." % os.path.basename(write_path)) 187 | f.write(content) 188 | 189 | def _get_sanitised_file_path(self, sourcePath): 190 | """Sanitise webpack paths for separators/relative paths""" 191 | sourcePath = sourcePath.replace("webpack:///", "") 192 | exts = sourcePath.split(" ") 193 | 194 | if exts[0] == "external": 195 | print("WARNING: Found external sourcemap %s, not currently supported. Skipping" % exts[1]) 196 | return None 197 | 198 | path, filename = os.path.split(sourcePath) 199 | if path[:2] == './': 200 | path = path[2:] 201 | if path[:3] == '../': 202 | path = 'parent_dir/' + path[3:] 203 | if path[:1] == '.': 204 | path = "" 205 | 206 | filepath = self._path_sanitiser.make_valid_file_path(path, filename) 207 | return filepath 208 | 209 | def _get_remote_data(self, uri): 210 | """Get remote data via http.""" 211 | 212 | if self.disable_verify_ssl == True: 213 | result = requests.get(uri, timeout=180, verify=False) 214 | else: 215 | result = requests.get(uri, timeout=180, verify=False) 216 | 217 | # Redirect 218 | if not uri == result.url: 219 | return self._get_remote_data(result.url) 220 | 221 | if result.status_code == 200: 222 | return result.text, result.url 223 | else: 224 | print("WARNING: Got status code %d for URI %s" % (result.status_code, result.url)) 225 | return None, result.url 226 | 227 | 228 | class PathSanitiser(object): 229 | """https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python""" 230 | 231 | EMPTY_NAME = "empty" 232 | 233 | empty_idx = 0 234 | root_path = "" 235 | 236 | def __init__(self, root_path): 237 | self.root_path = root_path 238 | 239 | def ensure_directory_exists(self, path_directory): 240 | if not os.path.exists(path_directory): 241 | os.makedirs(path_directory) 242 | 243 | def os_path_separators(self): 244 | seps = [] 245 | for sep in os.path.sep, os.path.altsep: 246 | if sep: 247 | seps.append(sep) 248 | return seps 249 | 250 | def sanitise_filesystem_name(self, potential_file_path_name): 251 | # Sort out unicode characters 252 | valid_filename = normalize('NFKD', potential_file_path_name).encode('ascii', 'ignore').decode('ascii') 253 | # Replace path separators with underscores 254 | for sep in self.os_path_separators(): 255 | valid_filename = valid_filename.replace(sep, '_') 256 | # Ensure only valid characters 257 | valid_chars = "-_.() {0}{1}".format(string.ascii_letters, string.digits) 258 | valid_filename = "".join(ch for ch in valid_filename if ch in valid_chars) 259 | # Ensure at least one letter or number to ignore names such as '..' 
260 | valid_chars = "{0}{1}".format(string.ascii_letters, string.digits) 261 | test_filename = "".join(ch for ch in potential_file_path_name if ch in valid_chars) 262 | if len(test_filename) == 0: 263 | # Replace empty file name or file path part with the following 264 | valid_filename = self.EMPTY_NAME + '_' + str(self.empty_idx) 265 | self.empty_idx += 1 266 | return valid_filename 267 | 268 | def get_root_path(self): 269 | # Replace with your own root file path, e.g. '/place/to/save/files/' 270 | filepath = self.root_path 271 | filepath = os.path.abspath(filepath) 272 | # ensure trailing path separator (/) 273 | if not any(filepath[-1] == sep for sep in self.os_path_separators()): 274 | filepath = '{0}{1}'.format(filepath, os.path.sep) 275 | self.ensure_directory_exists(filepath) 276 | return filepath 277 | 278 | def path_split_into_list(self, path): 279 | # Gets all parts of the path as a list, excluding path separators 280 | parts = [] 281 | while True: 282 | newpath, tail = os.path.split(path) 283 | if newpath == path: 284 | assert not tail 285 | if path and path not in self.os_path_separators(): 286 | parts.append(path) 287 | break 288 | if tail and tail not in self.os_path_separators(): 289 | parts.append(tail) 290 | path = newpath 291 | parts.reverse() 292 | return parts 293 | 294 | def sanitise_filesystem_path(self, potential_file_path): 295 | # Splits up a path and sanitises the name of each part separately 296 | path_parts_list = self.path_split_into_list(potential_file_path) 297 | sanitised_path = '' 298 | for path_component in path_parts_list: 299 | sanitised_path = '{0}{1}{2}'.format(sanitised_path, 300 | self.sanitise_filesystem_name(path_component), 301 | os.path.sep) 302 | return sanitised_path 303 | 304 | def check_if_path_is_under(self, parent_path, child_path): 305 | # Using the function to split paths into lists of component parts, check that one path is underneath another 306 | child_parts = self.path_split_into_list(child_path) 307 | parent_parts = self.path_split_into_list(parent_path) 308 | if len(parent_parts) > len(child_parts): 309 | return False 310 | return all(part1==part2 for part1, part2 in zip(child_parts, parent_parts)) 311 | 312 | def make_valid_file_path(self, path=None, filename=None): 313 | root_path = self.get_root_path() 314 | if path: 315 | sanitised_path = self.sanitise_filesystem_path(path) 316 | if filename: 317 | sanitised_filename = self.sanitise_filesystem_name(filename) 318 | complete_path = os.path.join(root_path, sanitised_path, sanitised_filename) 319 | else: 320 | complete_path = os.path.join(root_path, sanitised_path) 321 | else: 322 | if filename: 323 | sanitised_filename = self.sanitise_filesystem_name(filename) 324 | complete_path = os.path.join(root_path, sanitised_filename) 325 | else: 326 | complete_path = complete_path 327 | complete_path = os.path.abspath(complete_path) 328 | if self.check_if_path_is_under(root_path, complete_path): 329 | return complete_path 330 | else: 331 | return None 332 | 333 | class SourceMapExtractorError(Exception): 334 | pass 335 | 336 | 337 | if __name__ == "__main__": 338 | parser = argparse.ArgumentParser( 339 | description="A tool to extract code from Webpack sourcemaps. 
Turns black boxes into gray ones.") 340 | parser.add_argument("-l", "--local", action="store_true", default=False) 341 | parser.add_argument("-d", "--detect", action="store_true", default=False, 342 | help="Attempt to detect sourcemaps from JS assets in retrieved HTML.") 343 | parser.add_argument("--make-directory", action="store_true", default=False, 344 | help="Make the output directory if it doesn't exist.") 345 | parser.add_argument("--dangerously-write-paths", action="store_true", default=False, 346 | help="Write full paths. WARNING: Be careful here, you are pulling directories from an untrusted source.") 347 | parser.add_argument("--disable-ssl-verification", action="store_true", default=False, 348 | help="The script will not verify the site's SSL certificate.") 349 | 350 | parser.add_argument("uri_or_file", help="The target URI or file.") 351 | parser.add_argument("output_directory", help="Directory to output from sourcemap to.") 352 | 353 | if (len(sys.argv) < 3): 354 | parser.print_usage() 355 | sys.exit(1) 356 | 357 | args = parser.parse_args() 358 | extractor = SourceMapExtractor(vars(args)) 359 | extractor.run() 360 | -------------------------------------------------------------------------------- /jsa/automation/tld_detection.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | tlds_1st_lvl = [ 4 | "aaa", 5 | "aarp", 6 | "abarth", 7 | "abb", 8 | "abbott", 9 | "abbvie", 10 | "abc", 11 | "able", 12 | "abogado", 13 | "abudhabi", 14 | "ac", 15 | "academy", 16 | "accenture", 17 | "accountant", 18 | "accountants", 19 | "aco", 20 | "active", 21 | "actor", 22 | "ad", 23 | "adac", 24 | "ads", 25 | "adult", 26 | "ae", 27 | "aeg", 28 | "aero", 29 | "aetna", 30 | "af", 31 | "afamilycompany", 32 | "afl", 33 | "africa", 34 | "ag", 35 | "agakhan", 36 | "agency", 37 | "ai", 38 | "aig", 39 | "aigo", 40 | "airbus", 41 | "airforce", 42 | "airtel", 43 | "akdn", 44 | "al", 45 | "alfaromeo", 46 | "alibaba", 47 | "alipay", 48 | "allfinanz", 49 | "allstate", 50 | "ally", 51 | "alsace", 52 | "alstom", 53 | "am", 54 | "americanexpress", 55 | "americanfamily", 56 | "amex", 57 | "amfam", 58 | "amica", 59 | "amsterdam", 60 | "an", 61 | "analytics", 62 | "android", 63 | "anquan", 64 | "anz", 65 | "ao", 66 | "aol", 67 | "apartments", 68 | "app", 69 | "apple", 70 | "aq", 71 | "aquarelle", 72 | "ar", 73 | "arab", 74 | "aramco", 75 | "archi", 76 | "army", 77 | "arpa", 78 | "art", 79 | "arte", 80 | "as", 81 | "asda", 82 | "asia", 83 | "associates", 84 | "at", 85 | "athleta", 86 | "attorney", 87 | "au", 88 | "auction", 89 | "audi", 90 | "audible", 91 | "audio", 92 | "auspost", 93 | "author", 94 | "auto", 95 | "autos", 96 | "avianca", 97 | "aw", 98 | "aws", 99 | "ax", 100 | "axa", 101 | "az", 102 | "azure", 103 | "ba", 104 | "baby", 105 | "baidu", 106 | "banamex", 107 | "bananarepublic", 108 | "band", 109 | "bank", 110 | "bar", 111 | "barcelona", 112 | "barclaycard", 113 | "barclays", 114 | "barefoot", 115 | "bargains", 116 | "baseball", 117 | "basketball", 118 | "bauhaus", 119 | "bayern", 120 | "bb", 121 | "bbc", 122 | "bbt", 123 | "bbva", 124 | "bcg", 125 | "bcn", 126 | "bd", 127 | "be", 128 | "beats", 129 | "beauty", 130 | "beer", 131 | "bentley", 132 | "berlin", 133 | "best", 134 | "bestbuy", 135 | "bet", 136 | "bf", 137 | "bg", 138 | "bh", 139 | "bharti", 140 | "bi", 141 | "bible", 142 | "bid", 143 | "bike", 144 | "bing", 145 | "bingo", 146 | "bio", 147 | "biz", 148 | "bj", 149 | "bl", 150 | "black", 151 | "blackfriday", 152 | "blanco", 153 | 
"blockbuster", 154 | "blog", 155 | "bloomberg", 156 | "blue", 157 | "bm", 158 | "bms", 159 | "bmw", 160 | "bn", 161 | "bnl", 162 | "bnpparibas", 163 | "bo", 164 | "boats", 165 | "boehringer", 166 | "bofa", 167 | "bom", 168 | "bond", 169 | "boo", 170 | "book", 171 | "booking", 172 | "boots", 173 | "bosch", 174 | "bostik", 175 | "boston", 176 | "bot", 177 | "boutique", 178 | "box", 179 | "bq", 180 | "br", 181 | "bradesco", 182 | "bridgestone", 183 | "broadway", 184 | "broker", 185 | "brother", 186 | "brussels", 187 | "bs", 188 | "bt", 189 | "budapest", 190 | "bugatti", 191 | "build", 192 | "builders", 193 | "business", 194 | "buy", 195 | "buzz", 196 | "bv", 197 | "bw", 198 | "by", 199 | "bz", 200 | "bzh", 201 | "ca", 202 | "cab", 203 | "cafe", 204 | "cal", 205 | "call", 206 | "calvinklein", 207 | "cam", 208 | "camera", 209 | "camp", 210 | "cancerresearch", 211 | "canon", 212 | "capetown", 213 | "capital", 214 | "capitalone", 215 | "car", 216 | "caravan", 217 | "cards", 218 | "care", 219 | "career", 220 | "careers", 221 | "cars", 222 | "cartier", 223 | "casa", 224 | "case", 225 | "caseih", 226 | "cash", 227 | "casino", 228 | "cat", 229 | "catering", 230 | "catholic", 231 | "cba", 232 | "cbn", 233 | "cbre", 234 | "cbs", 235 | "cc", 236 | "cd", 237 | "ceb", 238 | "center", 239 | "ceo", 240 | "cern", 241 | "cf", 242 | "cfa", 243 | "cfd", 244 | "cg", 245 | "ch", 246 | "chanel", 247 | "channel", 248 | "charity", 249 | "chase", 250 | "chat", 251 | "cheap", 252 | "chintai", 253 | "chloe", 254 | "christmas", 255 | "chrome", 256 | "chrysler", 257 | "church", 258 | "ci", 259 | "cipriani", 260 | "circle", 261 | "cisco", 262 | "citadel", 263 | "citi", 264 | "citic", 265 | "city", 266 | "cityeats", 267 | "ck", 268 | "cl", 269 | "claims", 270 | "cleaning", 271 | "click", 272 | "clinic", 273 | "clinique", 274 | "clothing", 275 | "cloud", 276 | "club", 277 | "clubmed", 278 | "cm", 279 | "cn", 280 | "co", 281 | "coach", 282 | "codes", 283 | "coffee", 284 | "college", 285 | "cologne", 286 | "com", 287 | "comcast", 288 | "commbank", 289 | "community", 290 | "company", 291 | "compare", 292 | "computer", 293 | "comsec", 294 | "condos", 295 | "construction", 296 | "consulting", 297 | "contact", 298 | "contractors", 299 | "cooking", 300 | "cookingchannel", 301 | "cool", 302 | "coop", 303 | "corsica", 304 | "country", 305 | "coupon", 306 | "coupons", 307 | "courses", 308 | "cr", 309 | "credit", 310 | "creditcard", 311 | "creditunion", 312 | "cricket", 313 | "crown", 314 | "crs", 315 | "cruise", 316 | "cruises", 317 | "csc", 318 | "cu", 319 | "cuisinella", 320 | "cv", 321 | "cw", 322 | "cx", 323 | "cy", 324 | "cymru", 325 | "cyou", 326 | "cz", 327 | "dabur", 328 | "dad", 329 | "dance", 330 | "data", 331 | "date", 332 | "dating", 333 | "datsun", 334 | "day", 335 | "dclk", 336 | "dds", 337 | "de", 338 | "deal", 339 | "dealer", 340 | "deals", 341 | "degree", 342 | "delivery", 343 | "dell", 344 | "deloitte", 345 | "delta", 346 | "democrat", 347 | "dental", 348 | "dentist", 349 | "desi", 350 | "design", 351 | "dev", 352 | "dhl", 353 | "diamonds", 354 | "diet", 355 | "digital", 356 | "direct", 357 | "directory", 358 | "discount", 359 | "discover", 360 | "dish", 361 | "diy", 362 | "dj", 363 | "dk", 364 | "dm", 365 | "dnp", 366 | "do", 367 | "docs", 368 | "doctor", 369 | "dodge", 370 | "dog", 371 | "doha", 372 | "domains", 373 | "doosan", 374 | "dot", 375 | "download", 376 | "drive", 377 | "dtv", 378 | "dubai", 379 | "duck", 380 | "dunlop", 381 | "duns", 382 | "dupont", 383 | "durban", 384 | "dvag", 385 | "dvr", 386 | "dz", 
387 | "earth", 388 | "eat", 389 | "ec", 390 | "eco", 391 | "edeka", 392 | "edu", 393 | "education", 394 | "ee", 395 | "eg", 396 | "eh", 397 | "email", 398 | "emerck", 399 | "energy", 400 | "engineer", 401 | "engineering", 402 | "enterprises", 403 | "epost", 404 | "epson", 405 | "equipment", 406 | "er", 407 | "ericsson", 408 | "erni", 409 | "es", 410 | "esq", 411 | "estate", 412 | "esurance", 413 | "et", 414 | "etisalat", 415 | "eu", 416 | "eurovision", 417 | "eus", 418 | "events", 419 | "everbank", 420 | "exchange", 421 | "expert", 422 | "exposed", 423 | "express", 424 | "extraspace", 425 | "fage", 426 | "fail", 427 | "fairwinds", 428 | "faith", 429 | "family", 430 | "fan", 431 | "fans", 432 | "farm", 433 | "farmers", 434 | "fashion", 435 | "fast", 436 | "fedex", 437 | "feedback", 438 | "ferrari", 439 | "ferrero", 440 | "fi", 441 | "fiat", 442 | "fidelity", 443 | "fido", 444 | "film", 445 | "final", 446 | "finance", 447 | "financial", 448 | "fire", 449 | "firestone", 450 | "firmdale", 451 | "fish", 452 | "fishing", 453 | "fit", 454 | "fitness", 455 | "fj", 456 | "fk", 457 | "flickr", 458 | "flights", 459 | "flir", 460 | "florist", 461 | "flowers", 462 | "flsmidth", 463 | "fly", 464 | "fm", 465 | "fo", 466 | "foo", 467 | "food", 468 | "foodnetwork", 469 | "football", 470 | "ford", 471 | "forex", 472 | "forsale", 473 | "forum", 474 | "foundation", 475 | "fox", 476 | "fr", 477 | "free", 478 | "fresenius", 479 | "frl", 480 | "frogans", 481 | "frontdoor", 482 | "frontier", 483 | "ftr", 484 | "fujitsu", 485 | "fujixerox", 486 | "fun", 487 | "fund", 488 | "furniture", 489 | "futbol", 490 | "fyi", 491 | "ga", 492 | "gal", 493 | "gallery", 494 | "gallo", 495 | "gallup", 496 | "game", 497 | "games", 498 | "gap", 499 | "garden", 500 | "gb", 501 | "gbiz", 502 | "gd", 503 | "gdn", 504 | "ge", 505 | "gea", 506 | "gent", 507 | "genting", 508 | "george", 509 | "gf", 510 | "gg", 511 | "ggee", 512 | "gh", 513 | "gi", 514 | "gift", 515 | "gifts", 516 | "gives", 517 | "giving", 518 | "gl", 519 | "glade", 520 | "glass", 521 | "gle", 522 | "global", 523 | "globo", 524 | "gm", 525 | "gmail", 526 | "gmbh", 527 | "gmo", 528 | "gmx", 529 | "gn", 530 | "godaddy", 531 | "gold", 532 | "goldpoint", 533 | "golf", 534 | "goo", 535 | "goodhands", 536 | "goodyear", 537 | "goog", 538 | "google", 539 | "gop", 540 | "got", 541 | "gov", 542 | "gp", 543 | "gq", 544 | "gr", 545 | "grainger", 546 | "graphics", 547 | "gratis", 548 | "green", 549 | "gripe", 550 | "grocery", 551 | "group", 552 | "gs", 553 | "gt", 554 | "gu", 555 | "guardian", 556 | "gucci", 557 | "guge", 558 | "guide", 559 | "guitars", 560 | "guru", 561 | "gw", 562 | "gy", 563 | "hair", 564 | "hamburg", 565 | "hangout", 566 | "haus", 567 | "hbo", 568 | "hdfc", 569 | "hdfcbank", 570 | "health", 571 | "healthcare", 572 | "help", 573 | "helsinki", 574 | "here", 575 | "hermes", 576 | "hgtv", 577 | "hiphop", 578 | "hisamitsu", 579 | "hitachi", 580 | "hiv", 581 | "hk", 582 | "hkt", 583 | "hm", 584 | "hn", 585 | "hockey", 586 | "holdings", 587 | "holiday", 588 | "homedepot", 589 | "homegoods", 590 | "homes", 591 | "homesense", 592 | "honda", 593 | "honeywell", 594 | "horse", 595 | "hospital", 596 | "host", 597 | "hosting", 598 | "hot", 599 | "hoteles", 600 | "hotels", 601 | "hotmail", 602 | "house", 603 | "how", 604 | "hr", 605 | "hsbc", 606 | "ht", 607 | "htc", 608 | "hu", 609 | "hughes", 610 | "hyatt", 611 | "hyundai", 612 | "ibm", 613 | "icbc", 614 | "ice", 615 | "icu", 616 | "id", 617 | "ie", 618 | "ieee", 619 | "ifm", 620 | "iinet", 621 | "ikano", 622 | "il", 623 | 
"im", 624 | "imamat", 625 | "imdb", 626 | "immo", 627 | "immobilien", 628 | "in", 629 | "industries", 630 | "infiniti", 631 | "info", 632 | "ing", 633 | "ink", 634 | "institute", 635 | "insurance", 636 | "insure", 637 | "int", 638 | "intel", 639 | "international", 640 | "intuit", 641 | "investments", 642 | "io", 643 | "ipiranga", 644 | "iq", 645 | "ir", 646 | "irish", 647 | "is", 648 | "iselect", 649 | "ismaili", 650 | "ist", 651 | "istanbul", 652 | "it", 653 | "itau", 654 | "itv", 655 | "iveco", 656 | "iwc", 657 | "jaguar", 658 | "java", 659 | "jcb", 660 | "jcp", 661 | "je", 662 | "jeep", 663 | "jetzt", 664 | "jewelry", 665 | "jio", 666 | "jlc", 667 | "jll", 668 | "jm", 669 | "jmp", 670 | "jnj", 671 | "jo", 672 | "jobs", 673 | "joburg", 674 | "jot", 675 | "joy", 676 | "jp", 677 | "jpmorgan", 678 | "jprs", 679 | "juegos", 680 | "juniper", 681 | "kaufen", 682 | "kddi", 683 | "ke", 684 | "kerryhotels", 685 | "kerrylogistics", 686 | "kerryproperties", 687 | "kfh", 688 | "kg", 689 | "kh", 690 | "ki", 691 | "kia", 692 | "kim", 693 | "kinder", 694 | "kindle", 695 | "kitchen", 696 | "kiwi", 697 | "km", 698 | "kn", 699 | "koeln", 700 | "komatsu", 701 | "kosher", 702 | "kp", 703 | "kpmg", 704 | "kpn", 705 | "kr", 706 | "krd", 707 | "kred", 708 | "kuokgroup", 709 | "kw", 710 | "ky", 711 | "kyoto", 712 | "kz", 713 | "la", 714 | "lacaixa", 715 | "ladbrokes", 716 | "lamborghini", 717 | "lamer", 718 | "lancaster", 719 | "lancia", 720 | "lancome", 721 | "land", 722 | "landrover", 723 | "lanxess", 724 | "lasalle", 725 | "lat", 726 | "latino", 727 | "latrobe", 728 | "law", 729 | "lawyer", 730 | "lb", 731 | "lc", 732 | "lds", 733 | "lease", 734 | "leclerc", 735 | "lefrak", 736 | "legal", 737 | "lego", 738 | "lexus", 739 | "lgbt", 740 | "li", 741 | "liaison", 742 | "lidl", 743 | "life", 744 | "lifeinsurance", 745 | "lifestyle", 746 | "lighting", 747 | "like", 748 | "lilly", 749 | "limited", 750 | "limo", 751 | "lincoln", 752 | "linde", 753 | "link", 754 | "lipsy", 755 | "live", 756 | "living", 757 | "lixil", 758 | "lk", 759 | "llc", 760 | "loan", 761 | "loans", 762 | "locker", 763 | "locus", 764 | "loft", 765 | "lol", 766 | "london", 767 | "lotte", 768 | "lotto", 769 | "love", 770 | "lpl", 771 | "lplfinancial", 772 | "lr", 773 | "ls", 774 | "lt", 775 | "ltd", 776 | "ltda", 777 | "lu", 778 | "lundbeck", 779 | "lupin", 780 | "luxe", 781 | "luxury", 782 | "lv", 783 | "ly", 784 | "ma", 785 | "macys", 786 | "madrid", 787 | "maif", 788 | "maison", 789 | "makeup", 790 | "man", 791 | "management", 792 | "mango", 793 | "map", 794 | "market", 795 | "marketing", 796 | "markets", 797 | "marriott", 798 | "marshalls", 799 | "maserati", 800 | "mattel", 801 | "mba", 802 | "mc", 803 | "mcd", 804 | "mcdonalds", 805 | "mckinsey", 806 | "md", 807 | "me", 808 | "med", 809 | "media", 810 | "meet", 811 | "melbourne", 812 | "meme", 813 | "memorial", 814 | "men", 815 | "menu", 816 | "meo", 817 | "merckmsd", 818 | "metlife", 819 | "mf", 820 | "mg", 821 | "mh", 822 | "miami", 823 | "microsoft", 824 | "mil", 825 | "mini", 826 | "mint", 827 | "mit", 828 | "mitsubishi", 829 | "mk", 830 | "ml", 831 | "mlb", 832 | "mls", 833 | "mm", 834 | "mma", 835 | "mn", 836 | "mo", 837 | "mobi", 838 | "mobile", 839 | "mobily", 840 | "moda", 841 | "moe", 842 | "moi", 843 | "mom", 844 | "monash", 845 | "money", 846 | "monster", 847 | "montblanc", 848 | "mopar", 849 | "mormon", 850 | "mortgage", 851 | "moscow", 852 | "moto", 853 | "motorcycles", 854 | "mov", 855 | "movie", 856 | "movistar", 857 | "mp", 858 | "mq", 859 | "mr", 860 | "ms", 861 | "msd", 862 
| "mt", 863 | "mtn", 864 | "mtpc", 865 | "mtr", 866 | "mu", 867 | "museum", 868 | "mutual", 869 | "mutuelle", 870 | "mv", 871 | "mw", 872 | "mx", 873 | "my", 874 | "mz", 875 | "na", 876 | "nab", 877 | "nadex", 878 | "nagoya", 879 | "name", 880 | "nationwide", 881 | "natura", 882 | "navy", 883 | "nba", 884 | "nc", 885 | "ne", 886 | "nec", 887 | "net", 888 | "netbank", 889 | "netflix", 890 | "network", 891 | "neustar", 892 | "new", 893 | "newholland", 894 | "news", 895 | "next", 896 | "nextdirect", 897 | "nexus", 898 | "nf", 899 | "nfl", 900 | "ng", 901 | "ngo", 902 | "nhk", 903 | "ni", 904 | "nico", 905 | "nike", 906 | "nikon", 907 | "ninja", 908 | "nissan", 909 | "nissay", 910 | "nl", 911 | "no", 912 | "nokia", 913 | "northwesternmutual", 914 | "norton", 915 | "now", 916 | "nowruz", 917 | "nowtv", 918 | "np", 919 | "nr", 920 | "nra", 921 | "nrw", 922 | "ntt", 923 | "nu", 924 | "nyc", 925 | "nz", 926 | "obi", 927 | "observer", 928 | "off", 929 | "office", 930 | "okinawa", 931 | "olayan", 932 | "olayangroup", 933 | "oldnavy", 934 | "ollo", 935 | "om", 936 | "omega", 937 | "one", 938 | "ong", 939 | "onl", 940 | "online", 941 | "onyourside", 942 | "ooo", 943 | "open", 944 | "oracle", 945 | "orange", 946 | "org", 947 | "organic", 948 | "orientexpress", 949 | "origins", 950 | "osaka", 951 | "otsuka", 952 | "ott", 953 | "ovh", 954 | "pa", 955 | "page", 956 | "pamperedchef", 957 | "panasonic", 958 | "panerai", 959 | "paris", 960 | "pars", 961 | "partners", 962 | "parts", 963 | "party", 964 | "passagens", 965 | "pay", 966 | "pccw", 967 | "pe", 968 | "pet", 969 | "pf", 970 | "pfizer", 971 | "pg", 972 | "ph", 973 | "pharmacy", 974 | "phd", 975 | "philips", 976 | "phone", 977 | "photo", 978 | "photography", 979 | "photos", 980 | "physio", 981 | "piaget", 982 | "pics", 983 | "pictet", 984 | "pictures", 985 | "pid", 986 | "pin", 987 | "ping", 988 | "pink", 989 | "pioneer", 990 | "pizza", 991 | "pk", 992 | "pl", 993 | "place", 994 | "play", 995 | "playstation", 996 | "plumbing", 997 | "plus", 998 | "pm", 999 | "pn", 1000 | "pnc", 1001 | "pohl", 1002 | "poker", 1003 | "politie", 1004 | "porn", 1005 | "post", 1006 | "pr", 1007 | "pramerica", 1008 | "praxi", 1009 | "press", 1010 | "prime", 1011 | "pro", 1012 | "prod", 1013 | "productions", 1014 | "prof", 1015 | "progressive", 1016 | "promo", 1017 | "properties", 1018 | "property", 1019 | "protection", 1020 | "pru", 1021 | "prudential", 1022 | "ps", 1023 | "pt", 1024 | "pub", 1025 | "pw", 1026 | "pwc", 1027 | "py", 1028 | "qa", 1029 | "qpon", 1030 | "quebec", 1031 | "quest", 1032 | "qvc", 1033 | "racing", 1034 | "radio", 1035 | "raid", 1036 | "re", 1037 | "read", 1038 | "realestate", 1039 | "realtor", 1040 | "realty", 1041 | "recipes", 1042 | "red", 1043 | "redstone", 1044 | "redumbrella", 1045 | "rehab", 1046 | "reise", 1047 | "reisen", 1048 | "reit", 1049 | "reliance", 1050 | "ren", 1051 | "rent", 1052 | "rentals", 1053 | "repair", 1054 | "report", 1055 | "republican", 1056 | "rest", 1057 | "restaurant", 1058 | "review", 1059 | "reviews", 1060 | "rexroth", 1061 | "rich", 1062 | "richardli", 1063 | "ricoh", 1064 | "rightathome", 1065 | "ril", 1066 | "rio", 1067 | "rip", 1068 | "rmit", 1069 | "ro", 1070 | "rocher", 1071 | "rocks", 1072 | "rodeo", 1073 | "rogers", 1074 | "room", 1075 | "rs", 1076 | "rsvp", 1077 | "ru", 1078 | "rugby", 1079 | "ruhr", 1080 | "run", 1081 | "rw", 1082 | "rwe", 1083 | "ryukyu", 1084 | "sa", 1085 | "saarland", 1086 | "safe", 1087 | "safety", 1088 | "sakura", 1089 | "sale", 1090 | "salon", 1091 | "samsclub", 1092 | "samsung", 1093 | 
"sandvik", 1094 | "sandvikcoromant", 1095 | "sanofi", 1096 | "sap", 1097 | "sapo", 1098 | "sarl", 1099 | "sas", 1100 | "save", 1101 | "saxo", 1102 | "sb", 1103 | "sbi", 1104 | "sbs", 1105 | "sc", 1106 | "sca", 1107 | "scb", 1108 | "schaeffler", 1109 | "schmidt", 1110 | "scholarships", 1111 | "school", 1112 | "schule", 1113 | "schwarz", 1114 | "science", 1115 | "scjohnson", 1116 | "scor", 1117 | "scot", 1118 | "sd", 1119 | "se", 1120 | "search", 1121 | "seat", 1122 | "secure", 1123 | "security", 1124 | "seek", 1125 | "select", 1126 | "sener", 1127 | "services", 1128 | "ses", 1129 | "seven", 1130 | "sew", 1131 | "sex", 1132 | "sexy", 1133 | "sfr", 1134 | "sg", 1135 | "sh", 1136 | "shangrila", 1137 | "sharp", 1138 | "shaw", 1139 | "shell", 1140 | "shia", 1141 | "shiksha", 1142 | "shoes", 1143 | "shop", 1144 | "shopping", 1145 | "shouji", 1146 | "show", 1147 | "showtime", 1148 | "shriram", 1149 | "si", 1150 | "silk", 1151 | "sina", 1152 | "singles", 1153 | "site", 1154 | "sj", 1155 | "sk", 1156 | "ski", 1157 | "skin", 1158 | "sky", 1159 | "skype", 1160 | "sl", 1161 | "sling", 1162 | "sm", 1163 | "smart", 1164 | "smile", 1165 | "sn", 1166 | "sncf", 1167 | "so", 1168 | "soccer", 1169 | "social", 1170 | "softbank", 1171 | "software", 1172 | "sohu", 1173 | "solar", 1174 | "solutions", 1175 | "song", 1176 | "sony", 1177 | "soy", 1178 | "space", 1179 | "spiegel", 1180 | "sport", 1181 | "spot", 1182 | "spreadbetting", 1183 | "sr", 1184 | "srl", 1185 | "srt", 1186 | "ss", 1187 | "st", 1188 | "stada", 1189 | "staples", 1190 | "star", 1191 | "starhub", 1192 | "statebank", 1193 | "statefarm", 1194 | "statoil", 1195 | "stc", 1196 | "stcgroup", 1197 | "stockholm", 1198 | "storage", 1199 | "store", 1200 | "stream", 1201 | "studio", 1202 | "study", 1203 | "style", 1204 | "su", 1205 | "sucks", 1206 | "supplies", 1207 | "supply", 1208 | "support", 1209 | "surf", 1210 | "surgery", 1211 | "suzuki", 1212 | "sv", 1213 | "swatch", 1214 | "swiftcover", 1215 | "swiss", 1216 | "sx", 1217 | "sy", 1218 | "sydney", 1219 | "symantec", 1220 | "systems", 1221 | "sz", 1222 | "tab", 1223 | "taipei", 1224 | "talk", 1225 | "taobao", 1226 | "target", 1227 | "tatamotors", 1228 | "tatar", 1229 | "tattoo", 1230 | "tax", 1231 | "taxi", 1232 | "tc", 1233 | "tci", 1234 | "td", 1235 | "tdk", 1236 | "team", 1237 | "tech", 1238 | "technology", 1239 | "tel", 1240 | "telecity", 1241 | "telefonica", 1242 | "temasek", 1243 | "tennis", 1244 | "teva", 1245 | "tf", 1246 | "tg", 1247 | "th", 1248 | "thd", 1249 | "theater", 1250 | "theatre", 1251 | "tiaa", 1252 | "tickets", 1253 | "tienda", 1254 | "tiffany", 1255 | "tips", 1256 | "tires", 1257 | "tirol", 1258 | "tj", 1259 | "tjmaxx", 1260 | "tjx", 1261 | "tk", 1262 | "tkmaxx", 1263 | "tl", 1264 | "tm", 1265 | "tmall", 1266 | "tn", 1267 | "to", 1268 | "today", 1269 | "tokyo", 1270 | "tools", 1271 | "top", 1272 | "toray", 1273 | "toshiba", 1274 | "total", 1275 | "tours", 1276 | "town", 1277 | "toyota", 1278 | "toys", 1279 | "tp", 1280 | "tr", 1281 | "trade", 1282 | "trading", 1283 | "training", 1284 | "travel", 1285 | "travelchannel", 1286 | "travelers", 1287 | "travelersinsurance", 1288 | "trust", 1289 | "trv", 1290 | "tt", 1291 | "tube", 1292 | "tui", 1293 | "tunes", 1294 | "tushu", 1295 | "tv", 1296 | "tvs", 1297 | "tw", 1298 | "tz", 1299 | "ua", 1300 | "ubank", 1301 | "ubs", 1302 | "uconnect", 1303 | "ug", 1304 | "uk", 1305 | "um", 1306 | "unicom", 1307 | "university", 1308 | "uno", 1309 | "uol", 1310 | "ups", 1311 | "us", 1312 | "uy", 1313 | "uz", 1314 | "va", 1315 | "vacations", 1316 | 
"vana", 1317 | "vanguard", 1318 | "vc", 1319 | "ve", 1320 | "vegas", 1321 | "ventures", 1322 | "verisign", 1323 | "versicherung", 1324 | "vet", 1325 | "vg", 1326 | "vi", 1327 | "viajes", 1328 | "video", 1329 | "vig", 1330 | "viking", 1331 | "villas", 1332 | "vin", 1333 | "vip", 1334 | "virgin", 1335 | "visa", 1336 | "vision", 1337 | "vista", 1338 | "vistaprint", 1339 | "viva", 1340 | "vivo", 1341 | "vlaanderen", 1342 | "vn", 1343 | "vodka", 1344 | "volkswagen", 1345 | "volvo", 1346 | "vote", 1347 | "voting", 1348 | "voto", 1349 | "voyage", 1350 | "vu", 1351 | "vuelos", 1352 | "wales", 1353 | "walmart", 1354 | "walter", 1355 | "wang", 1356 | "wanggou", 1357 | "warman", 1358 | "watch", 1359 | "watches", 1360 | "weather", 1361 | "weatherchannel", 1362 | "webcam", 1363 | "weber", 1364 | "website", 1365 | "wed", 1366 | "wedding", 1367 | "weibo", 1368 | "weir", 1369 | "wf", 1370 | "whoswho", 1371 | "wien", 1372 | "wiki", 1373 | "williamhill", 1374 | "win", 1375 | "windows", 1376 | "wine", 1377 | "winners", 1378 | "wme", 1379 | "wolterskluwer", 1380 | "woodside", 1381 | "work", 1382 | "works", 1383 | "world", 1384 | "wow", 1385 | "ws", 1386 | "wtc", 1387 | "wtf", 1388 | "xbox", 1389 | "xerox", 1390 | "xfinity", 1391 | "xihuan", 1392 | "xin", 1393 | "测试", 1394 | "कॉम", 1395 | "परीक्षा", 1396 | "セール", 1397 | "佛山", 1398 | "ಭಾರತ", 1399 | "慈善", 1400 | "集团", 1401 | "在线", 1402 | "한국", 1403 | "ଭାରତ", 1404 | "大众汽车", 1405 | "点看", 1406 | "คอม", 1407 | "ভাৰত", 1408 | "ভারত", 1409 | "八卦", 1410 | "‏موقع‎", 1411 | "বাংলা", 1412 | "公益", 1413 | "公司", 1414 | "香格里拉", 1415 | "网站", 1416 | "移动", 1417 | "我爱你", 1418 | "москва", 1419 | "испытание", 1420 | "қаз", 1421 | "католик", 1422 | "онлайн", 1423 | "сайт", 1424 | "联通", 1425 | "срб", 1426 | "бг", 1427 | "бел", 1428 | "‏קום‎", 1429 | "时尚", 1430 | "微博", 1431 | "테스트", 1432 | "淡马锡", 1433 | "ファッション", 1434 | "орг", 1435 | "नेट", 1436 | "ストア", 1437 | "삼성", 1438 | "சிங்கப்பூர்", 1439 | "商标", 1440 | "商店", 1441 | "商城", 1442 | "дети", 1443 | "мкд", 1444 | "‏טעסט‎", 1445 | "ею", 1446 | "ポイント", 1447 | "新闻", 1448 | "工行", 1449 | "家電", 1450 | "‏كوم‎", 1451 | "中文网", 1452 | "中信", 1453 | "中国", 1454 | "中國", 1455 | "娱乐", 1456 | "谷歌", 1457 | "భారత్", 1458 | "ලංකා", 1459 | "電訊盈科", 1460 | "购物", 1461 | "測試", 1462 | "クラウド", 1463 | "ભારત", 1464 | "通販", 1465 | "भारतम्", 1466 | "भारत", 1467 | "भारोत", 1468 | "‏آزمایشی‎", 1469 | "பரிட்சை", 1470 | "网店", 1471 | "संगठन", 1472 | "餐厅", 1473 | "网络", 1474 | "ком", 1475 | "укр", 1476 | "香港", 1477 | "诺基亚", 1478 | "食品", 1479 | "δοκιμή", 1480 | "飞利浦", 1481 | "‏إختبار‎", 1482 | "台湾", 1483 | "台灣", 1484 | "手表", 1485 | "手机", 1486 | "мон", 1487 | "‏الجزائر‎", 1488 | "‏عمان‎", 1489 | "‏ارامكو‎", 1490 | "‏ایران‎", 1491 | "‏العليان‎", 1492 | "‏اتصالات‎", 1493 | "‏امارات‎", 1494 | "‏بازار‎", 1495 | "‏موريتانيا‎", 1496 | "‏پاکستان‎", 1497 | "‏الاردن‎", 1498 | "‏موبايلي‎", 1499 | "‏بارت‎", 1500 | "‏بھارت‎", 1501 | "‏المغرب‎", 1502 | "‏ابوظبي‎", 1503 | "‏السعودية‎", 1504 | "‏ڀارت‎", 1505 | "‏كاثوليك‎", 1506 | "‏سودان‎", 1507 | "‏همراه‎", 1508 | "‏عراق‎", 1509 | "‏مليسيا‎", 1510 | "澳門", 1511 | "닷컴", 1512 | "政府", 1513 | "‏شبكة‎", 1514 | "‏بيتك‎", 1515 | "‏عرب‎", 1516 | "გე", 1517 | "机构", 1518 | "组织机构", 1519 | "健康", 1520 | "ไทย", 1521 | "‏سورية‎", 1522 | "招聘", 1523 | "рус", 1524 | "рф", 1525 | "珠宝", 1526 | "‏تونس‎", 1527 | "大拿", 1528 | "みんな", 1529 | "グーグル", 1530 | "ελ", 1531 | "世界", 1532 | "書籍", 1533 | "ഭാരതം", 1534 | "ਭਾਰਤ", 1535 | "网址", 1536 | "닷넷", 1537 | "コム", 1538 | "天主教", 1539 | "游戏", 1540 | "vermögensberater", 1541 | "vermögensberatung", 1542 | "企业", 1543 | "信息", 
1544 | "嘉里大酒店", 1545 | "嘉里", 1546 | "‏مصر‎", 1547 | "‏قطر‎", 1548 | "广东", 1549 | "இலங்கை", 1550 | "இந்தியா", 1551 | "հայ", 1552 | "新加坡", 1553 | "‏فلسطين‎", 1554 | "テスト", 1555 | "政务", 1556 | "xperia", 1557 | "xxx", 1558 | "xyz", 1559 | "yachts", 1560 | "yahoo", 1561 | "yamaxun", 1562 | "yandex", 1563 | "ye", 1564 | "yodobashi", 1565 | "yoga", 1566 | "yokohama", 1567 | "you", 1568 | "youtube", 1569 | "yt", 1570 | "yun", 1571 | "za", 1572 | "zappos", 1573 | "zara", 1574 | "zero", 1575 | "zip", 1576 | "zippo", 1577 | "zm", 1578 | "zone", 1579 | "zuerich", 1580 | "zw",] 1581 | tlds_2nd_lvl = ["ab.ca","ac.ac","ac.ae","ac.at","ac.be","ac.cn","ac.il","ac.in","ac.jp","ac.kr","ac.th","ac.uk","ac.sg","ad.jp","adm.br","adv.br","ah.cn","am.br","arq.br","art.br","arts.ro","asn.au","asso.fr","asso.mc","bc.ca","bio.br","biz.pl","biz.tr","bj.cn","bel.tr","br.com","cn.com","cng.br","cnt.br","co.ac","co.at","co.de","co.gl","co.hk","co.id","co.il","co.in","co.jp","co.kr","co.mg","co.ms","co.nz","co.th","cp.tz","co.uk","co.ve","co.vi","co.za","com.ag","com.ai","com.ar","com.au","com.br","com.co","com.cn","com.cy","com.de","com.do","com.ec","com.es","com.fj","com.fr","com.gl","com.gt","com.hk","com.hr","com.hu","com.kg","com.ki","com.lc","com.mg","com.mm","com.ms","com.mt","com.mu","com.mx","com.my","com.na","com.nf","com.ng","com.ni","com.pa","com.ph","com.pl","com.pt","com.qa","com.ro","com.ru","com.sb","com.sc","com.sg","com.sv","com.tr","com.tw","com.ua","com.uy","com.ve","com.vn","cq.cn","de.com","de.org","ecn.br","ed.jp","edu.au","edu.cn","edu.hk","edu.mm","edu.my","edu.pl","edu.pt","edu.qa","edu.sg","edu.tr","edu.tw","eng.br","ernet.in","esp.br","etc.br","eti.br","eu.com","eu.int","eu.lv","firm.in","firm.ro","fm.br","fot.br","fst.br","g12.br","gb.com","gb.net","gd.cn","gen.in","go.jp","go.kr","go.th","gov.au","gov.az","gov.br","gov.cn","gov.il","gov.in","gov.mm","gov.my","gov.qa","gov.sg","gov.tr","gov.tw","gov.uk","govt.nz","gr.jp","gs.cn","gv.ac","gv.at","gx.cn","gz.cn","he.cn","hi.cn","hk.cn","hl.cn","hu.com","id.au","idv.tw","in.ua","in.th","ind.br","ind.in","inf.br","info.pl","info.ro","info.tr","info.ve","iwi.nz","jl.cn","jor.br","js.cn","jus.br","k12.il","k12.tr","kr.com","lel.br","lg.jp","ln.cn","ltd.uk","maori.nz","mb.ca","me.uk","med.br","mi.th","mil.br","mil.uk","mo.cn","mod.uk","muni.il","nb.ca","ne.jp","ne.kr","net.ag","net.ai","net.au","net.br","net.cn","net.do","net.gl","net.hk","net.il","net.in","net.kg","net.ki","net.lc","net.mg","net.mm","net.mu","net.ni","net.nz","net.pl","net.ru","net.sb","net.sc","net.sg","net.th","net.tr","net.tw","net.uk","net.ve","nf.ca","nhs.uk","nm.cn","nm.kr","no.com","nom.br","nom.ni","nom.ro","ns.ca","nt.ca","nt.ro","ntr.br","nx.cn","odo.br","off.ai","on.ca","or.ac","or.at","or.jp","or.kr","or.th","org.ag","org.ai","org.au","org.br","org.cn","org.do","org.es","org.gl","org.hk","org.in","org.kg","org.ki","org.lc","org.mg","org.mm","org.ms","org.nf","org.ng","org.ni","org.nz","org.pl","org.ro","org.ru","org.sb","org.sc","org.sg","org.tr","org.tw","org.uk","org.ve","pe.ca","plc.uk","police.uk","ppg.br","presse.fr","pro.br","psc.br","psi.br","qc.ca","qc.com","qh.cn","rec.br","rec.ro","res.in","sa.com","sc.cn","sch.uk","se.com","se.net","sh.cn","sk.ca","slg.br","sn.cn","store.ro","tj.cn","tm.fr","tm.mc","tm.ro","tmp.br","tur.br","tv.br","tv.tr","tw.cn","uk.com","uk.net","us.com","uy.com","vet.br","waw.pl","web.ve","www.ro","xj.cn","xz.cn","yk.ca","yn.cn","zj.cn","zlg.br"] 1582 | 1583 | 1584 | def tld_detection(line): 1585 | probe_global = [] 1586 | global 
tld_value   # note: never assigned anywhere, so this global declaration is effectively dead code
1587 |     line = re.sub(r"/$", "", line)  # strip a trailing slash once, instead of on every iteration
1588 |     for tld in tlds_2nd_lvl:
1589 |         # escape the suffix so the dot in e.g. "co.uk" is matched literally
1590 |         probe = re.findall(r"\.%s$" % re.escape(tld), line)
1591 |         if probe:
1592 |             probe_global.append(tld)
1593 |             return tld
1594 | 
1595 |     # no second-level suffix matched; fall back to the single-label TLD list
1596 |     if not probe_global:
1597 |         for tld in tlds_1st_lvl:
1598 |             probe = re.findall(r"\.%s$" % re.escape(tld), line)
1599 |             if probe:
1600 |                 return tld
1601 | 
--------------------------------------------------------------------------------
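The matching logic above boils down to a longest-known-suffix check: try the second-level suffixes first, then fall back to plain TLDs. The snippet below is a minimal, self-contained sketch of that idea for reference only; `demo_tld_detection`, `DEMO_2ND_LVL`, and `DEMO_1ST_LVL` are hypothetical names and trimmed-down lists used purely for illustration, not part of the project's code.

```python
import re

# Hypothetical, trimmed-down suffix lists used only for this demonstration.
DEMO_2ND_LVL = ["co.uk", "com.au", "ac.jp"]
DEMO_1ST_LVL = ["com", "org", "uk"]

def demo_tld_detection(host: str):
    """Return the first known suffix terminating `host`, preferring
    second-level suffixes (e.g. 'co.uk') over plain TLDs (e.g. 'uk')."""
    host = re.sub(r"/$", "", host)  # drop a trailing slash, as tld_detection does
    for tld in DEMO_2ND_LVL + DEMO_1ST_LVL:
        if re.search(r"\.%s$" % re.escape(tld), host):
            return tld
    return None

if __name__ == "__main__":
    print(demo_tld_detection("sub.example.co.uk"))  # -> co.uk
    print(demo_tld_detection("example.org"))        # -> org
```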