├── jsa
│   ├── automation
│   │   ├── 404_js_wayback.sh
│   │   ├── js_files_extraction.py
│   │   └── tld_detection.py
│   ├── automation.sh
│   └── jsa.py
├── README.md
├── Dockerfile
└── webpack
    └── unwebpack_sourcemap.py

/jsa/automation/404_js_wayback.sh:
--------------------------------------------------------------------------------
#!/bin/bash

url=$1

status_code=$(curl --insecure --connect-timeout 100 -sL -w "%{http_code}" "$url" -o /dev/null)

if [ "$status_code" != "200" ]
then
    printf "https://web.archive.org/web/20060102150405if_/$url\n"
fi
--------------------------------------------------------------------------------
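404_js_wayback.sh only prints the snapshot URL. The fixed 20060102150405 timestamp looks arbitrary: the Wayback Machine redirects any /web/<timestamp>if_/<url> request to its closest capture, and the if_ modifier serves the raw file without the archive toolbar. A minimal sketch (assuming curl; the target URL and output name are made up) of fetching such a snapshot for offline credential scanning:

    # check a hypothetical JS URL and, if it is no longer live,
    # pull the archived copy for offline scanning
    url="https://example.com/static/app.js"
    code=$(curl -skL --connect-timeout 100 -o /dev/null -w "%{http_code}" "$url")
    if [ "$code" != "200" ]
    then
        curl -skL "https://web.archive.org/web/20060102150405if_/$url" -o app.js.wayback
    fi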
/README.md:
--------------------------------------------------------------------------------
Automatically crawls domains for JS files and searches the Wayback Machine (via gau) for additional .js files.

Finds sensitive strings, credentials and API keys in the crawled JS files with trufflehog.

Just works.

Example usage:

    docker run --dns=8.8.8.8 --rm --privileged=true --ulimit nofile=1048576:1048576 --cpu-shares 256 -v jsa:/jsa 5631/jsa /jsa/URLINPUT.txt /jsa/OUTPUTDIRECTORY >> /jsa/jsa.output
--------------------------------------------------------------------------------
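The image is referenced above as 5631/jsa. One possible way to build it and stage the URL list inside the named jsa volume before running (the build tag, urls.txt and the staging one-liner are assumptions, not part of the repo):

    docker build -t 5631/jsa .
    docker volume create jsa
    # copy a newline-separated list of target URLs into the volume as URLINPUT.txt
    docker run -i --rm -v jsa:/jsa --entrypoint /bin/sh 5631/jsa -c 'cat > /jsa/URLINPUT.txt' < urls.txt

Results stay in the same volume: automation.sh appends the trufflehog findings to OUTPUTDIRECTORY/out.txt.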
/Dockerfile:
--------------------------------------------------------------------------------
FROM golang:latest

COPY . .

ENV HOME /go/jsa
ENV GOPATH=/go/
ENV PATH $PATH:$GOPATH
ENV PATH $PATH:/go/jsa

WORKDIR /go/jsa/

RUN apt -y update && apt -y install git \
    wget \
    python3 \
    python3-pip parallel

RUN GO111MODULE=on go install github.com/lc/gau@latest && GO111MODULE=on go install github.com/jaeles-project/gospider@latest

RUN pip3 install bs4 --break-system-packages && pip3 install requests --break-system-packages

RUN chmod +x automation.sh && chmod +x automation/404_js_wayback.sh

RUN git clone https://github.com/trufflesecurity/trufflehog.git && cd trufflehog && go install

ENTRYPOINT ["automation.sh"]

#RUN pip3 install idna==2.10 && pip3 install tldextract && pip3 install -r /go/linkfinder/requirements.txt
--------------------------------------------------------------------------------
/jsa/automation.sh:
--------------------------------------------------------------------------------
#!/bin/bash

FILENAME=$1

LINES=$(cat $FILENAME)

mkdir $2
mkdir /tmp
mkdir $2/download/
cd $2/download/

task(){

    LINE=$1
    i=$2
    outputdir=$3

    echo i: $i

    sleep 3

    mkdir $outputdir/$i
    printf "Crawl... $LINE\n"

    printf $LINE | timeout 1800 gospider -t 1 --concurrent 1 -d 1 --other-source --include-other-source --delay 1 --timeout 120 --js=false --sitemap --depth 2 --robots --blacklist eot,jpg,jpeg,gif,css,tif,tiff,png,ttf,otf,woff,woff2,ico,pdf,svg,txt,mp4,avi,mpeg4,mp3,webm,ogv,gif,jpg,jpeg,png > $outputdir/$i/gospider.txt
    cat $outputdir/$i/gospider.txt | grep -vE 'https?:\/\/.*\.json' | grep -vE 'jquery|bootstrap|ga.js|watch.js|wp-embed|angular|wf\-|recaptcha|gtm.js|google|sweetalert|i18n' | grep -E 'https?:\/\/.*\.js' -o | sort -u > $outputdir/$i/wget.txt

    ## launching wayback in a "js only" mode to reduce execution time
    printf 'Launching Gau with wayback..\n'
    printf $LINE | xargs -I{} echo "{}/*&filter=mimetype:application/javascript&somevar=" | gau -providers wayback -b eot,jpg,jpeg,gif,css,tif,tiff,png,ttf,otf,woff,woff2,ico,pdf,svg,txt,mp4,avi,mpeg4,mp3,webm,ogv,gif,jpg,jpeg,png | tee $outputdir/$i/gau.txt >/dev/null ##gau
    printf $LINE | xargs -I{} echo "{}/*&filter=mimetype:text/javascript&somevar=" | gau -providers wayback -b eot,jpg,jpeg,gif,css,tif,tiff,png,ttf,otf,woff,woff2,ico,pdf,svg,txt,mp4,avi,mpeg4,mp3,webm,ogv,gif,jpg,jpeg,png | tee -a $outputdir/$i/gau.txt >/dev/null ##gau

    ## if a js file found via wayback no longer returns 200 live, we generate a URL that serves the file's content from wayback's own servers;
    ## that is useless for endpoint discovery, but the old content is still worth searching for credentials, and that is what we do with it
    ## only wayback as of now
    chmod -R 777 $outputdir/$i/

    #printf "Fetching URLs for 404 js files from wayback..\n"
    #cat $outputdir/$i/gau.txt | cut -d '?' -f1 | cut -d '#' -f1 | grep '.*\.js$' | sort -u | parallel --gnu -j 2 "/go/jsa/automation/404_js_wayback.sh {}" | tee -a $outputdir/$i/creds_search.txt >/dev/null
    #cat $outputdir/$i/wget.txt | cut -d '?' -f1 | cut -d '#' -f1 | grep '.*\.js$' | sort -u | parallel --gnu -j 2 "/go/jsa/automation/404_js_wayback.sh {}" | tee -a $outputdir/$i/creds_search.txt >/dev/null
    ## save all endpoints to the file for future processing

    ## extracting js files from js files
    printf "Printing deep-level js files..\n"
    cat $outputdir/$i/wget.txt | parallel --gnu --pipe -j 2 "timeout 6000 python3 /go/jsa/automation/js_files_extraction.py | tee -a $outputdir/$i/wget.txt"

    printf "wget discovered JS files for local creds scan + webpack + api paths\n"
    sed 's/$/.map/' $outputdir/$i/wget.txt > $outputdir/$i/wgetmap.txt

    cat $outputdir/$i/wget.txt | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /' | xargs wget -c --no-directories -P "$outputdir/download/" --retry-on-host-error --tries=5 --content-disposition --no-check-certificate --timeout=160 --trust-server-names
    cat $outputdir/$i/creds_search.txt | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /' | xargs wget -c --no-directories -P "$outputdir/download/" --retry-on-host-error --tries=7 --content-disposition --no-check-certificate --timeout=160 --trust-server-names
    cat $outputdir/$i/wgetmap.txt | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /' | xargs wget -c --no-directories -P "$outputdir/download/" --retry-on-host-error --tries=5 --content-disposition --no-check-certificate --timeout=160 --trust-server-names

    mkdir $outputdir/$i

    outputurl=${LINE//:/.}
    outputurl=${outputurl//\//.}

    python3 /go/webpack/unwebpack_sourcemap.py --make-directory --disable-ssl-verification --detect $LINE $outputdir/$i/$outputurl
}

i=0
for LINE in $LINES
do
    ((i=i+1))
    task "$LINE" "$i" "$2" & #call all domains in parallel
done

wait
pwd

if [ ! -f "/jsa/shasums" ];
then
    touch /jsa/shasums
fi

#get the sha1 sum of each downloaded file and check that it hasn't already been scanned on an earlier run
for filename in *
do
    currentfilehash=$(cat "$filename" | sha1sum | head -c 40)

    if grep -Fxq "$currentfilehash" /jsa/shasums
    then
        rm "$filename"
    else
        echo "$currentfilehash" >> /jsa/shasums
    fi
done

trufflehog filesystem --directory=$2 >> $2/out.txt
--------------------------------------------------------------------------------
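The sed pairing used for the wget download lines above is terse; here is what it actually emits for a single, hypothetical URL:

    printf 'https://example.com/assets/app.js\n' | sed 'p;s/\//-/g' | sed 'N;s/\n/ -O /'
    # -> https://example.com/assets/app.js -O https:--example.com-assets-app.js
    # each URL is duplicated, slashes in the copy become dashes, and the two lines are
    # joined into "<url> -O <flat-filename>" so every file lands in one flat directory
    # without name collisions

Note that xargs hands several of these pairs to a single wget invocation, where repeated -O options will clash; running one wget per line (for example with xargs -L1) is probably closer to the intent.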
/jsa/automation/js_files_extraction.py:
--------------------------------------------------------------------------------
import re
import requests
import io
import sys
from datetime import datetime
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from tld_detection import tld_detection

js_file = sys.stdin.readlines()


original_lines = []
js_files_2nd_lvl = []
js_files_3rd_lvl = []
js_files_4th_lvl = []

js_files_3rd_lvl_original = []
tmp_list = []

def deduplication(input, original_lines):  ## filtering + deduplication
    existing_lines = []
    for line in input:  ## Filtering the output of subjs (#$ and ?v=$)
        line = re.sub("\\?v=.*?$", "", line)
        line = re.sub("#.*?$", "", line)
        existing_lines.append(line)
    for line in existing_lines:  ## Deleting duplicates
        line = line.strip()
        if line not in original_lines:
            original_lines.append(line)

clear_url_global = []

def main_func(original_lines, js_files):
    for line in original_lines:  ## main loop

        # tld = re.sub("^[a-z]+\.", "", domain_name)  ## matching TLD

        clear_url0 = re.findall("^(.*?)\\b/", line)
        global clear_url
        clear_url = re.sub("\['|'\]", "", str(clear_url0))  ## matching URL without the js part
        domain_name = tld_detection(clear_url)
        if "[]" in clear_url:
            continue
        ##if str(domain_name) not in str(line):  ## deleting since this is automation and we need a clear output
        ## excluding 3rd party js files & print 'em
        #    print("3rd party JS file has been found: " + line)
        #    continue
        warnings.simplefilter('ignore', InsecureRequestWarning)
        try:
            js_file_status = requests.head(line,
                                           verify=False).status_code  ## quickly (HEAD) find out the status code of the js file url
        except requests.exceptions.RequestException:
            continue  ## skip unreachable or malformed URLs instead of crashing the whole run
        if js_file_status == 200:  ## if js file exists (to reduce time)
            warnings.simplefilter('ignore', InsecureRequestWarning)
            js_file_content = requests.get(line, verify=False)  ## fetching js file's content
            # filename = "%s/%s" % (directory_with_js_files, name_for_wget)
            # os.makedirs(os.path.dirname(filename))  ##creating dir with a js file
            # js_file_write = open(filename, "w")  ## it's for js file downloading

            # js_file_write.write(js_file_content.text)  ## wget for js file into the directory
            u = re.findall("\"\/[a-zA-Z0-9_?&=/\-\#\.]*\"", js_file_content.text)  ## matching "string"
            u = str(u).replace("', '", "\n").replace("[]", "")
            u = re.sub("\['|'\]|\"", "", u)
            u = re.sub(
                ".css|.png|.jpg|.svg|.jpeg|.ico|.gif|.woff|.woff2|.swf", "", u,
                flags=re.M)  ## excluding not desirable file extensions
            u = re.sub(".*?\.(facebook|twitter).(net|com)(/)|(/|/\?|/#|#)$", "", u,
                       flags=re.M)  ##preparing for deduplication with / /? 
# deleting 68 | u = re.sub("(\n\n)", "\n", u, flags=re.M) 69 | if re.findall("^//", u): 70 | u = re.sub("^//(.*?)/", clear_url + "/", u, flags=re.M) 71 | else: 72 | u = re.sub("^", clear_url, u, flags=re.M) 73 | u_lines = io.StringIO(u).readlines() ## endpoints 74 | 75 | for one in u_lines: 76 | if re.findall("\.js$", one): 77 | ##if re.findall("^//", one) and verbose is True: ## excluding 3rd party 2nd lvl js files & print 'em 78 | ##print("3rd party JS file has been found: " + one) ## deleting since this is automation and we need a clear output 79 | if re.findall("^//", one): 80 | one = re.sub("^//(.*?)/", clear_url + "/", one) # one = re.sub("\n", "", one) 81 | js_files.append(one) 82 | if re.findall("^/", one): 83 | one = re.sub("^/", clear_url + "/", one) 84 | if re.findall("^\b", one): ## if js file doesn't have / at ^, it'll be added 85 | one = re.sub("^", clear_url + "/", one) # one = re.sub("\n", "", one) 86 | js_files.append(one) 87 | if re.findall("^\[\]/", one): 88 | one = re.sub("^\[\]", clear_url, one) 89 | js_files.append(one) 90 | else: ## printing js files found on 2nd level 91 | js_files.append(one) 92 | ##elif js_file_status == 404: ## todo make it for subjs output only ## deleting since this is automation and we need a clear output 93 | # print( 94 | # "JS file {} returned 404 code. Check the host and try to apply file upload with path traversal/PUT method file upload.".format(line)) 95 | 96 | 97 | deduplication(js_file, original_lines) 98 | 99 | main_func(original_lines, js_files_2nd_lvl) 100 | 101 | if len(js_files_2nd_lvl) != 0: ## processing 2nd level js files 102 | ##if verbose is True: ## deleting since this is automation and we need a clear output 103 | ##print("\nJS files 2nd level:\n") 104 | js_files_2nd_lvl_original = [] 105 | deduplication(js_files_2nd_lvl, js_files_2nd_lvl_original) ## removing dupes 106 | for l in js_files_2nd_lvl_original: ## printing a list 107 | j2 = re.findall("\.js", l) ## sometimes (I don't know why though), non-js files leak to the list 108 | if len(j2) == 0: 109 | continue 110 | elif l not in original_lines: 111 | print(l) 112 | main_func(js_files_2nd_lvl_original, js_files_3rd_lvl) 113 | 114 | if len(js_files_3rd_lvl) != 0: 115 | ##if verbose is True: 116 | ##print("JS files 3rd level:\n") 117 | deduplication(js_files_3rd_lvl, js_files_3rd_lvl_original) ## removing dupes 118 | for l in js_files_3rd_lvl_original: ## printing a list 119 | j3 = re.findall("\.js$", l) ## sometimes (I don't know why though), non-js files leak to the list 120 | if len(j3) == 0: 121 | continue 122 | elif l not in js_files_2nd_lvl_original and original_lines: 123 | if re.findall("^htt(p|s)(.*?)\w//(.*?)/", l): 124 | l = re.sub("^htt(p|s)(.*?)\w//(.*?)/", clear_url + "/", l, flags=re.M) 125 | tmp_list.append(l) 126 | print(l) 127 | js_files_3rd_lvl_original.clear() 128 | js_files_3rd_lvl_original = tmp_list 129 | 130 | #main_func(js_files_3rd_lvl, js_files_4th_lvl) 131 | 132 | -------------------------------------------------------------------------------- /jsa/jsa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import requests 4 | import io 5 | import os 6 | import argparse 7 | import sys 8 | from datetime import datetime 9 | import warnings 10 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 11 | from tld_detection import tld_detection 12 | 13 | ## Implement reading from specified file 14 | 15 | parser = argparse.ArgumentParser() 16 | 
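# Typical invocations (assumed from the options defined below; file and URL names are placeholders):
#   cat js_urls.txt | python3 jsa.py -v
#   python3 jsa.py -f https://target.example/static/app.js -e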
parser.add_argument('-v', "--verbose", help='verbose', action='store_true') 17 | parser.add_argument('-e', "--exclude", help='exclude & print 3rd party js files', action='store_true') 18 | parser.add_argument('-f', "--file", help='js file URL in format htt(p|ps)://(.*)/name.js', action='append') 19 | 20 | verbose = parser.parse_args().verbose 21 | exclude = parser.parse_args().exclude 22 | if not sys.stdin.isatty(): 23 | global js_file 24 | js_file = sys.stdin.readlines() 25 | elif parser.parse_args().file: 26 | js_file = parser.parse_args().file 27 | else: 28 | print("Please specify js file in STDIN or in argument -f!") 29 | exit() 30 | 31 | # js_file = open("/Users/max/test13.txt", "r").readlines() 32 | 33 | 34 | ## just some containers for future values 35 | 36 | original_lines = [] 37 | 38 | all_endpoints_1st_lvl = [] 39 | all_endpoints_original = [] 40 | js_files_2nd_lvl = [] 41 | 42 | all_endpoints_2nd_lvl = [] 43 | all_endpoints_2nd_lvl_original = [] 44 | js_files_3rd_lvl = [] 45 | 46 | all_endpoints_3rd_lvl = [] 47 | all_endpoints_3rd_lvl_original = [] 48 | tmp_list = [] 49 | js_files_4th_lvl = [] ## just for passing it to the main func, it won't be processed actually 50 | 51 | 52 | #### 53 | # now = datetime.now() 54 | # now = str(now).replace(" ", "_").replace(":", "-") 55 | # now = re.sub("\..*?$", "", now) 56 | 57 | # curpath = os.path.abspath(os.curdir) 58 | 59 | # directory_with_js_files = "%s/js_files/%s/" % (curpath, now) ## directory of downloaded js files for other tools 60 | 61 | 62 | ### 63 | 64 | def deduplication(input, original_lines): ## filtering + deduplication 65 | existing_lines = [] 66 | for line in input: ## Filtering the output of subjs (#$ and ?v=$) 67 | line = re.sub("\\?v=.*?$", "", line) 68 | line = re.sub("#.*?$", "", line) 69 | existing_lines.append(line) 70 | for line in existing_lines: ## Deleting duplicates 71 | line = line.strip() 72 | if line not in original_lines: 73 | original_lines.append(line) 74 | 75 | 76 | def main_func(original_lines, js_files, all_endpoints): 77 | for line in original_lines: ## main loop 78 | 79 | clear_url0 = re.findall("^(.*?)\\b/", line) 80 | global clear_url 81 | clear_url = re.sub("\['|'\]", "", str(clear_url0)) ## matching URL without js part 82 | domain_name = tld_detection(clear_url) 83 | if "[]" in clear_url: 84 | continue 85 | if str(domain_name) not in str(line) and exclude is True: 86 | ## excluding 3rd party js files & print 'em 87 | 88 | print("Possible (if not CDN) 3rd party JS file has been found: " + line) 89 | warnings.simplefilter('ignore', InsecureRequestWarning) 90 | try: 91 | js_file_status = requests.get(line, verify=False).status_code ## finding out a status code of js file url 92 | except Exception: 93 | pass 94 | if js_file_status == 200: ## if js file exists (to reduce time) 95 | warnings.simplefilter('ignore', InsecureRequestWarning) 96 | js_file_content = requests.get(line, verify=False) ## fetching js file's content 97 | 98 | # filename = "%s/%s" % (directory_with_js_files, name_for_wget) 99 | # os.makedirs(os.path.dirname(filename)) ##creating dir with a js file 100 | # js_file_write = open(filename, "w") ## it's for js file downloading 101 | 102 | # js_file_write.write(js_file_content.text) ## wget for js file into the directory 103 | 104 | u = re.findall("\"\/[a-zA-Z0-9_?&=/\-\#\.]*\"", js_file_content.text) ## matching "string" 105 | u = str(u).replace("', '", "\n").replace("[]", "") 106 | u = re.sub("\['|'\]|\"", "", u) 107 | u = re.sub( 108 | 
".css|.png|.jpg|.svg|.jpeg|.ico|.gif|.woff|.woff2|.swf", "", u, 109 | flags=re.M) ## excluding not desirable file extensions 110 | u = re.sub(".*?\.(facebook|twitter).(net|com)(/)|(/|/\?|/#|#)$", "", u, 111 | flags=re.M) ##preparing for deduplication with / /? # deleting 112 | u = re.sub("(\n\n)", "\n", u, flags=re.M) 113 | 114 | if re.findall("^//", u): 115 | u = re.sub("^//(.*?)/", clear_url + "/", u, flags=re.M) ## it's for js files 116 | else: 117 | u = re.sub("^", clear_url, u, flags=re.M) 118 | u_lines = io.StringIO(u).readlines() ## endpoints 119 | 120 | for one in u_lines: 121 | if re.findall("\.js$", one): 122 | if re.findall("^//", one) and verbose is True: ## excluding 3rd party 2nd lvl js files & print 'em 123 | if not re.findall("^//%s" % domain_name, one): 124 | print("Possible (if not CDN) 3rd party JS file has been found: " + one) 125 | if re.findall("^//", one): 126 | one = re.sub("^//(.*?)/", clear_url + "/", one) # one = re.sub("\n", "", one) 127 | js_files.append(one) 128 | if re.findall("^/", one): 129 | one = re.sub("^/", clear_url + "/", one) 130 | if re.findall("^\b", one): ## if js file doesn't have / at ^, it'll be added 131 | one = re.sub("^", clear_url + "/", one) # one = re.sub("\n", "", one) 132 | js_files.append(one) 133 | if re.findall("^\[\]/", one): 134 | one = re.sub("^\[\]", clear_url, one) 135 | js_files.append(one) 136 | else: ## printing js files found on 2nd level 137 | js_files.append(one) 138 | else: 139 | all_endpoints.append(one) ## printing 1st lvl endpoints 140 | elif js_file_status == 404 and verbose is True: ## todo make it for subjs output only 141 | print( 142 | "JS file {} returned 404 code. Check the host and try to apply file upload with path traversal/PUT method file upload.".format( 143 | line)) 144 | 145 | 146 | deduplication(js_file, original_lines) 147 | main_func(original_lines, js_files_2nd_lvl, all_endpoints_1st_lvl) 148 | 149 | if len(all_endpoints_1st_lvl) != 0: 150 | temp0 = [] 151 | for l in all_endpoints_1st_lvl: 152 | 153 | clear_domain = re.findall("http(s)://(.*)(?=/)", l) 154 | clear_domain = re.findall(", '(.*?)'", str(clear_domain)) 155 | clear_domain = ''.join(clear_domain) 156 | 157 | t = re.findall("^(.*?)(?<=com)", l) 158 | l = re.sub("(/|/\?|/#|#|/\.)$", "", l) ## additionally deleting / /? 
/# 159 | 160 | if not re.findall("%s$" % clear_domain, 161 | l): ## removing clear urls without actual endpoints like http(s)://domain.com 162 | if "[]//" in l: 163 | l = l.replace("[]//", "//%s" % clear_domain) 164 | temp0.append(l) 165 | 166 | if not re.findall("%s/\W" % clear_domain, l): ## deleting endpoints containing 167 | ## non-word character (not a-z0-9) http(s)://domain.com/(.|[]{}, 168 | if not re.findall("%s/[a-z0-9]{1}$" % clear_domain, 169 | l): ## deleting endpoints containing 1 word character like http(s)://domain.com/1|a|1a; 170 | temp0.append(l) ## most likely to be an endpoint and not a javascript variable 171 | 172 | all_endpoints_1st_lvl.clear() ## deleting current list w/ endpoints 173 | all_endpoints_1st_lvl = temp0 ##substitution 174 | 175 | deduplication(all_endpoints_1st_lvl, all_endpoints_original) ## deleting dupes 176 | for l in all_endpoints_original: ## printing a list 177 | if "[]" in l: 178 | continue 179 | else: 180 | print(l) 181 | 182 | if len(js_files_2nd_lvl) != 0: ## processing 2nd level js files 183 | printed = False 184 | js_files_2nd_lvl_original = [] 185 | deduplication(js_files_2nd_lvl, js_files_2nd_lvl_original) ## removing dupes 186 | for l in js_files_2nd_lvl_original: ## printing a list 187 | 188 | j2 = re.findall("\.js$", l) ## sometimes (I don't know why though), non-js files leak to the list 189 | if len(j2) == 0: 190 | continue 191 | if l not in original_lines: 192 | if printed is False and verbose is True: ## printing a text only one time if verbose mode 193 | print("\nJS files 2nd level:\n") 194 | printed = True 195 | if verbose is True: 196 | print(l) 197 | 198 | main_func(js_files_2nd_lvl_original, js_files_3rd_lvl, all_endpoints_2nd_lvl) 199 | 200 | if len(js_files_3rd_lvl) != 0: 201 | printed = False 202 | js_files_3rd_lvl_original = [] 203 | deduplication(js_files_3rd_lvl, js_files_3rd_lvl_original) ## removing dupes 204 | for l in js_files_3rd_lvl_original: ## printing a list 205 | 206 | j3 = re.findall("\.js$", l) ## sometimes (I don't know why though), non-js files leak to the list 207 | if len(j3) == 0: 208 | continue 209 | if l not in js_files_2nd_lvl_original and original_lines: 210 | if printed is False and verbose is True: ## printing a text only one time if verbose mode 211 | print("\nJS files 3rd level:\n") 212 | printed = True 213 | if verbose is True: 214 | if re.findall("^htt(p|s)(.*?)\w//(.*?)/", l): 215 | l = re.sub("^htt(p|s)(.*?)\w//(.*?)/", clear_url + "/", l, flags=re.M) 216 | print(l) 217 | 218 | main_func(js_files_3rd_lvl, js_files_4th_lvl, all_endpoints_3rd_lvl) 219 | 220 | 221 | 222 | if all_endpoints_2nd_lvl: ## printing 2nd level endpoints 223 | temp1 = [] 224 | for l in all_endpoints_2nd_lvl: 225 | 226 | clear_domain = re.findall("http(s)://(.*)(?=/)", l) 227 | clear_domain = re.findall(", '(.*?)'", str(clear_domain)) 228 | clear_domain = ''.join(clear_domain) 229 | 230 | t = re.findall("^(.*?)(?<=com)", l) 231 | l = re.sub("(/|/\?|/#|#|/\.)$", "", l) ## additionally deleting / /? 
/# 232 | 233 | if not re.findall("%s$" % clear_domain, 234 | l): ## removing clear urls without actual endpoints like http(s)://domain.com 235 | if "[]//" in l: 236 | l = l.replace("[]//", "//%s" % clear_domain) 237 | temp1.append(l) 238 | 239 | if not re.findall("%s/\W" % clear_domain, l): ## deleting endpoints containing 240 | ## non-word character (not a-z0-9) http(s)://domain.com/(.|[]{}, 241 | if not re.findall("%s/[a-z0-9]{1}$" % clear_domain, 242 | l): ## deleting endpoints containing 1 word character like http(s)://domain.com/1|a|1a; 243 | temp1.append(l) ## most likely to be an endpoint and not a javascript variable 244 | 245 | all_endpoints_2nd_lvl.clear() ## deleting current list w/ endpoints 246 | all_endpoints_2nd_lvl = temp1 ##substitution 247 | printed = False 248 | deduplication(all_endpoints_2nd_lvl, all_endpoints_2nd_lvl_original) ## deleting dupes 249 | for l in all_endpoints_2nd_lvl_original: ## printing a lists 250 | if "[]" in l: 251 | continue 252 | elif l not in all_endpoints_original: 253 | if printed is False and verbose is True: ## printing a text only one time if verbose mode 254 | print("\nEndpoints 2nd level:\n") 255 | printed = True 256 | print(l) ##printing URL with endpoint if it's original 257 | 258 | if all_endpoints_3rd_lvl: 259 | all_endpoints_3rd_lvl_original = [] 260 | temp2 = [] 261 | for l in all_endpoints_3rd_lvl: 262 | 263 | clear_domain = re.findall("http(s)://(.*)(?=/)", l) 264 | clear_domain = re.findall(", '(.*?)'", str(clear_domain)) 265 | clear_domain = ''.join(clear_domain) 266 | 267 | t = re.findall("^(.*?)(?<=com)", l) 268 | l = re.sub("(/|/\?|/#|#|/\.)$", "", l) ## additionally deleting / /? /# 269 | 270 | if not re.findall("%s$" % clear_domain, 271 | l): ## removing clear urls without actual endpoints like http(s)://domain.com 272 | if "[]//" in l: 273 | l = l.replace("[]//", "//%s" % clear_domain) 274 | temp2.append(l) 275 | 276 | if not re.findall("%s/\W" % clear_domain, l): ## deleting endpoints containing 277 | ## non-word character (not a-z0-9) http(s)://domain.com/(.|[]{}, 278 | if not re.findall("%s/[a-z0-9]{1}$" % clear_domain, 279 | l): ## deleting endpoints containing 1 word character like http(s)://domain.com/1|a|1a; 280 | temp2.append(l) ## most likely to be an endpoint and not a javascript variable 281 | 282 | all_endpoints_3rd_lvl.clear() ## deleting current list w/ endpoints 283 | all_endpoints_3rd_lvl = temp2 ##substitution 284 | printed = False 285 | all_endpoints_2nd_lvl_original = [] ## deleting dupes 286 | deduplication(all_endpoints_3rd_lvl, all_endpoints_3rd_lvl_original) 287 | for l in all_endpoints_3rd_lvl_original: ## printing a lists 288 | if "[]" in l: 289 | continue 290 | elif l not in all_endpoints_original and all_endpoints_2nd_lvl_original: 291 | if printed is False and verbose is True: 292 | print("Endpoints 3rd level:\n") 293 | printed = True 294 | print(l) 295 | 296 | # if os.path.exists(directory_with_js_files) is True: 297 | # os.system("retire %s" % directory_with_js_files) 298 | 299 | ## Deleting duplicates from the js files 2nd level 300 | 301 | ## Deleting duplicates from the endpoints 1st level 302 | 303 | ## Deleting duplicates from the js files 3rdnd level 304 | 305 | ## Deleting duplicates from the endpoints 2nd level 306 | -------------------------------------------------------------------------------- /webpack/unwebpack_sourcemap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | unwebpack_sourcemap.py 4 | by rarecoil 
(github.com/rarecoil/unwebpack-sourcemap) 5 | 6 | Reads Webpack source maps and extracts the disclosed 7 | uncompiled/commented source code for review. Can detect and 8 | attempt to read sourcemaps from Webpack bundles with the `-d` 9 | flag. Puts source into a directory structure similar to dev. 10 | """ 11 | 12 | import argparse 13 | import json 14 | import os 15 | import re 16 | import string 17 | import sys 18 | from urllib.parse import urlparse 19 | from unicodedata import normalize 20 | 21 | import requests 22 | from bs4 import BeautifulSoup, SoupStrainer 23 | 24 | 25 | class SourceMapExtractor(object): 26 | """Primary SourceMapExtractor class. Feed this arguments.""" 27 | 28 | _target = None 29 | _is_local = False 30 | _attempt_sourcemap_detection = False 31 | _output_directory = "" 32 | _target_extracted_sourcemaps = [] 33 | 34 | _path_sanitiser = None 35 | 36 | 37 | def __init__(self, options): 38 | """Initialize the class.""" 39 | if 'output_directory' not in options: 40 | raise SourceMapExtractorError("output_directory must be set in options.") 41 | else: 42 | self._output_directory = os.path.abspath(options['output_directory']) 43 | if not os.path.isdir(self._output_directory): 44 | if options['make_directory'] is True: 45 | os.mkdir(self._output_directory) 46 | else: 47 | raise SourceMapExtractorError("output_directory does not exist. Pass --make-directory to auto-make it.") 48 | 49 | self._path_sanitiser = PathSanitiser(self._output_directory) 50 | 51 | if options['disable_ssl_verification'] == True: 52 | self.disable_verify_ssl = True 53 | else: 54 | self.disable_verify_ssl = False 55 | 56 | if options['local'] == True: 57 | self._is_local = True 58 | 59 | if options['detect'] == True: 60 | self._attempt_sourcemap_detection = True 61 | 62 | self._validate_target(options['uri_or_file']) 63 | 64 | 65 | def run(self): 66 | """Run extraction process.""" 67 | if self._is_local == False: 68 | if self._attempt_sourcemap_detection: 69 | detected_sourcemaps = self._detect_js_sourcemaps(self._target) 70 | for sourcemap in detected_sourcemaps: 71 | self._parse_remote_sourcemap(sourcemap) 72 | else: 73 | self._parse_remote_sourcemap(self._target) 74 | 75 | else: 76 | self._parse_sourcemap(self._target) 77 | 78 | 79 | def _validate_target(self, target): 80 | """Do some basic validation on the target.""" 81 | parsed = urlparse(target) 82 | if self._is_local is True: 83 | self._target = os.path.abspath(target) 84 | if not os.path.isfile(self._target): 85 | raise SourceMapExtractorError("uri_or_file is set to be a file, but doesn't seem to exist. check your path.") 86 | else: 87 | if parsed.scheme == "": 88 | raise SourceMapExtractorError("uri_or_file isn't a URI, and --local was not set. 
set --local?") 89 | file, ext = os.path.splitext(parsed.path) 90 | self._target = target 91 | if ext != '.map' and self._attempt_sourcemap_detection is False: 92 | print("WARNING: URI does not have .map extension, and --detect is not flagged.") 93 | 94 | 95 | def _parse_remote_sourcemap(self, uri): 96 | """GET a remote sourcemap and parse it.""" 97 | data, final_uri = self._get_remote_data(uri) 98 | if data is not None: 99 | self._parse_sourcemap(data, True) 100 | else: 101 | print("WARNING: Could not retrieve sourcemap from URI %s" % final_uri) 102 | 103 | 104 | def _detect_js_sourcemaps(self, uri): 105 | """Pull HTML and attempt to find JS files, then read the JS files and look for sourceMappingURL.""" 106 | remote_sourcemaps = [] 107 | data, final_uri = self._get_remote_data(uri) 108 | 109 | # TODO: scan to see if this is a sourcemap instead of assuming HTML 110 | print("Detecting sourcemaps in HTML at %s" % final_uri) 111 | script_strainer = SoupStrainer("script", src=True) 112 | try: 113 | soup = BeautifulSoup(data, "html.parser", parse_only=script_strainer) 114 | except: 115 | raise SourceMapExtractorError("Could not parse HTML at URI %s" % final_uri) 116 | 117 | for script in soup: 118 | source = script['src'] 119 | parsed_uri = urlparse(source) 120 | next_target_uri = "" 121 | if parsed_uri.scheme != '': 122 | next_target_uri = source 123 | else: 124 | current_uri = urlparse(final_uri) 125 | built_uri = current_uri.scheme + "://" + current_uri.netloc + source 126 | next_target_uri = built_uri 127 | 128 | js_data, last_target_uri = self._get_remote_data(next_target_uri) 129 | # get last line of file 130 | last_line = js_data.rstrip().split("\n")[-1] 131 | regex = "\\/\\/#\s*sourceMappingURL=(.*)$" 132 | matches = re.search(regex, last_line) 133 | if matches: 134 | asset = matches.groups(0)[0].strip() 135 | asset_target = urlparse(asset) 136 | if asset_target.scheme != '': 137 | print("Detected sourcemap at remote location %s" % asset) 138 | remote_sourcemaps.append(asset) 139 | else: 140 | current_uri = urlparse(last_target_uri) 141 | asset_uri = current_uri.scheme + '://' + \ 142 | current_uri.netloc + \ 143 | os.path.dirname(current_uri.path) + \ 144 | '/' + asset 145 | print("Detected sourcemap at remote location %s" % asset_uri) 146 | remote_sourcemaps.append(asset_uri) 147 | 148 | return remote_sourcemaps 149 | 150 | 151 | def _parse_sourcemap(self, target, is_str=False): 152 | map_data = "" 153 | if is_str is False: 154 | if os.path.isfile(target): 155 | with open(target, 'r', encoding='utf-8', errors='ignore') as f: 156 | map_data = f.read() 157 | else: 158 | map_data = target 159 | 160 | # with the sourcemap data, pull directory structures 161 | try: 162 | map_object = json.loads(map_data) 163 | except json.JSONDecodeError: 164 | print("ERROR: Failed to parse sourcemap %s. Are you sure this is a sourcemap?" % target) 165 | return False 166 | 167 | # we need `sourcesContent` and `sources`. 168 | # do a basic validation check to make sure these exist and agree. 
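        # For reference, the relevant part of a Webpack sourcemap typically looks like
        # (illustrative, heavily trimmed):
        #   {"version": 3,
        #    "sources": ["webpack:///./src/index.js", "webpack:///./src/api.js"],
        #    "sourcesContent": ["<original index.js text>", "<original api.js text>"],
        #    "mappings": "..."}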
169 | if 'sources' not in map_object or 'sourcesContent' not in map_object: 170 | print("ERROR: Sourcemap does not contain sources and/or sourcesContent, cannot extract.") 171 | return False 172 | 173 | if len(map_object['sources']) != len(map_object['sourcesContent']): 174 | print("WARNING: sources != sourcesContent, filenames may not match content") 175 | 176 | for source, content in zip(map_object['sources'], map_object['sourcesContent']): 177 | # remove webpack:// from paths 178 | # and do some checks on it 179 | write_path = self._get_sanitised_file_path(source) 180 | if write_path is None: 181 | print("ERROR: Could not sanitize path %s" % source) 182 | continue 183 | 184 | os.makedirs(os.path.dirname(write_path), mode=0o755, exist_ok=True) 185 | with open(write_path, 'w', encoding='utf-8', errors='ignore', newline='') as f: 186 | print("Writing %s..." % os.path.basename(write_path)) 187 | f.write(content) 188 | 189 | def _get_sanitised_file_path(self, sourcePath): 190 | """Sanitise webpack paths for separators/relative paths""" 191 | sourcePath = sourcePath.replace("webpack:///", "") 192 | exts = sourcePath.split(" ") 193 | 194 | if exts[0] == "external": 195 | print("WARNING: Found external sourcemap %s, not currently supported. Skipping" % exts[1]) 196 | return None 197 | 198 | path, filename = os.path.split(sourcePath) 199 | if path[:2] == './': 200 | path = path[2:] 201 | if path[:3] == '../': 202 | path = 'parent_dir/' + path[3:] 203 | if path[:1] == '.': 204 | path = "" 205 | 206 | filepath = self._path_sanitiser.make_valid_file_path(path, filename) 207 | return filepath 208 | 209 | def _get_remote_data(self, uri): 210 | """Get remote data via http.""" 211 | 212 | if self.disable_verify_ssl == True: 213 | result = requests.get(uri, timeout=180, verify=False) 214 | else: 215 | result = requests.get(uri, timeout=180, verify=False) 216 | 217 | # Redirect 218 | if not uri == result.url: 219 | return self._get_remote_data(result.url) 220 | 221 | if result.status_code == 200: 222 | return result.text, result.url 223 | else: 224 | print("WARNING: Got status code %d for URI %s" % (result.status_code, result.url)) 225 | return None, result.url 226 | 227 | 228 | class PathSanitiser(object): 229 | """https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python""" 230 | 231 | EMPTY_NAME = "empty" 232 | 233 | empty_idx = 0 234 | root_path = "" 235 | 236 | def __init__(self, root_path): 237 | self.root_path = root_path 238 | 239 | def ensure_directory_exists(self, path_directory): 240 | if not os.path.exists(path_directory): 241 | os.makedirs(path_directory) 242 | 243 | def os_path_separators(self): 244 | seps = [] 245 | for sep in os.path.sep, os.path.altsep: 246 | if sep: 247 | seps.append(sep) 248 | return seps 249 | 250 | def sanitise_filesystem_name(self, potential_file_path_name): 251 | # Sort out unicode characters 252 | valid_filename = normalize('NFKD', potential_file_path_name).encode('ascii', 'ignore').decode('ascii') 253 | # Replace path separators with underscores 254 | for sep in self.os_path_separators(): 255 | valid_filename = valid_filename.replace(sep, '_') 256 | # Ensure only valid characters 257 | valid_chars = "-_.() {0}{1}".format(string.ascii_letters, string.digits) 258 | valid_filename = "".join(ch for ch in valid_filename if ch in valid_chars) 259 | # Ensure at least one letter or number to ignore names such as '..' 
260 | valid_chars = "{0}{1}".format(string.ascii_letters, string.digits) 261 | test_filename = "".join(ch for ch in potential_file_path_name if ch in valid_chars) 262 | if len(test_filename) == 0: 263 | # Replace empty file name or file path part with the following 264 | valid_filename = self.EMPTY_NAME + '_' + str(self.empty_idx) 265 | self.empty_idx += 1 266 | return valid_filename 267 | 268 | def get_root_path(self): 269 | # Replace with your own root file path, e.g. '/place/to/save/files/' 270 | filepath = self.root_path 271 | filepath = os.path.abspath(filepath) 272 | # ensure trailing path separator (/) 273 | if not any(filepath[-1] == sep for sep in self.os_path_separators()): 274 | filepath = '{0}{1}'.format(filepath, os.path.sep) 275 | self.ensure_directory_exists(filepath) 276 | return filepath 277 | 278 | def path_split_into_list(self, path): 279 | # Gets all parts of the path as a list, excluding path separators 280 | parts = [] 281 | while True: 282 | newpath, tail = os.path.split(path) 283 | if newpath == path: 284 | assert not tail 285 | if path and path not in self.os_path_separators(): 286 | parts.append(path) 287 | break 288 | if tail and tail not in self.os_path_separators(): 289 | parts.append(tail) 290 | path = newpath 291 | parts.reverse() 292 | return parts 293 | 294 | def sanitise_filesystem_path(self, potential_file_path): 295 | # Splits up a path and sanitises the name of each part separately 296 | path_parts_list = self.path_split_into_list(potential_file_path) 297 | sanitised_path = '' 298 | for path_component in path_parts_list: 299 | sanitised_path = '{0}{1}{2}'.format(sanitised_path, 300 | self.sanitise_filesystem_name(path_component), 301 | os.path.sep) 302 | return sanitised_path 303 | 304 | def check_if_path_is_under(self, parent_path, child_path): 305 | # Using the function to split paths into lists of component parts, check that one path is underneath another 306 | child_parts = self.path_split_into_list(child_path) 307 | parent_parts = self.path_split_into_list(parent_path) 308 | if len(parent_parts) > len(child_parts): 309 | return False 310 | return all(part1==part2 for part1, part2 in zip(child_parts, parent_parts)) 311 | 312 | def make_valid_file_path(self, path=None, filename=None): 313 | root_path = self.get_root_path() 314 | if path: 315 | sanitised_path = self.sanitise_filesystem_path(path) 316 | if filename: 317 | sanitised_filename = self.sanitise_filesystem_name(filename) 318 | complete_path = os.path.join(root_path, sanitised_path, sanitised_filename) 319 | else: 320 | complete_path = os.path.join(root_path, sanitised_path) 321 | else: 322 | if filename: 323 | sanitised_filename = self.sanitise_filesystem_name(filename) 324 | complete_path = os.path.join(root_path, sanitised_filename) 325 | else: 326 | complete_path = complete_path 327 | complete_path = os.path.abspath(complete_path) 328 | if self.check_if_path_is_under(root_path, complete_path): 329 | return complete_path 330 | else: 331 | return None 332 | 333 | class SourceMapExtractorError(Exception): 334 | pass 335 | 336 | 337 | if __name__ == "__main__": 338 | parser = argparse.ArgumentParser( 339 | description="A tool to extract code from Webpack sourcemaps. 
Turns black boxes into gray ones.") 340 | parser.add_argument("-l", "--local", action="store_true", default=False) 341 | parser.add_argument("-d", "--detect", action="store_true", default=False, 342 | help="Attempt to detect sourcemaps from JS assets in retrieved HTML.") 343 | parser.add_argument("--make-directory", action="store_true", default=False, 344 | help="Make the output directory if it doesn't exist.") 345 | parser.add_argument("--dangerously-write-paths", action="store_true", default=False, 346 | help="Write full paths. WARNING: Be careful here, you are pulling directories from an untrusted source.") 347 | parser.add_argument("--disable-ssl-verification", action="store_true", default=False, 348 | help="The script will not verify the site's SSL certificate.") 349 | 350 | parser.add_argument("uri_or_file", help="The target URI or file.") 351 | parser.add_argument("output_directory", help="Directory to output from sourcemap to.") 352 | 353 | if (len(sys.argv) < 3): 354 | parser.print_usage() 355 | sys.exit(1) 356 | 357 | args = parser.parse_args() 358 | extractor = SourceMapExtractor(vars(args)) 359 | extractor.run() 360 | -------------------------------------------------------------------------------- /jsa/automation/tld_detection.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | tlds_1st_lvl = [ 4 | "aaa", 5 | "aarp", 6 | "abarth", 7 | "abb", 8 | "abbott", 9 | "abbvie", 10 | "abc", 11 | "able", 12 | "abogado", 13 | "abudhabi", 14 | "ac", 15 | "academy", 16 | "accenture", 17 | "accountant", 18 | "accountants", 19 | "aco", 20 | "active", 21 | "actor", 22 | "ad", 23 | "adac", 24 | "ads", 25 | "adult", 26 | "ae", 27 | "aeg", 28 | "aero", 29 | "aetna", 30 | "af", 31 | "afamilycompany", 32 | "afl", 33 | "africa", 34 | "ag", 35 | "agakhan", 36 | "agency", 37 | "ai", 38 | "aig", 39 | "aigo", 40 | "airbus", 41 | "airforce", 42 | "airtel", 43 | "akdn", 44 | "al", 45 | "alfaromeo", 46 | "alibaba", 47 | "alipay", 48 | "allfinanz", 49 | "allstate", 50 | "ally", 51 | "alsace", 52 | "alstom", 53 | "am", 54 | "americanexpress", 55 | "americanfamily", 56 | "amex", 57 | "amfam", 58 | "amica", 59 | "amsterdam", 60 | "an", 61 | "analytics", 62 | "android", 63 | "anquan", 64 | "anz", 65 | "ao", 66 | "aol", 67 | "apartments", 68 | "app", 69 | "apple", 70 | "aq", 71 | "aquarelle", 72 | "ar", 73 | "arab", 74 | "aramco", 75 | "archi", 76 | "army", 77 | "arpa", 78 | "art", 79 | "arte", 80 | "as", 81 | "asda", 82 | "asia", 83 | "associates", 84 | "at", 85 | "athleta", 86 | "attorney", 87 | "au", 88 | "auction", 89 | "audi", 90 | "audible", 91 | "audio", 92 | "auspost", 93 | "author", 94 | "auto", 95 | "autos", 96 | "avianca", 97 | "aw", 98 | "aws", 99 | "ax", 100 | "axa", 101 | "az", 102 | "azure", 103 | "ba", 104 | "baby", 105 | "baidu", 106 | "banamex", 107 | "bananarepublic", 108 | "band", 109 | "bank", 110 | "bar", 111 | "barcelona", 112 | "barclaycard", 113 | "barclays", 114 | "barefoot", 115 | "bargains", 116 | "baseball", 117 | "basketball", 118 | "bauhaus", 119 | "bayern", 120 | "bb", 121 | "bbc", 122 | "bbt", 123 | "bbva", 124 | "bcg", 125 | "bcn", 126 | "bd", 127 | "be", 128 | "beats", 129 | "beauty", 130 | "beer", 131 | "bentley", 132 | "berlin", 133 | "best", 134 | "bestbuy", 135 | "bet", 136 | "bf", 137 | "bg", 138 | "bh", 139 | "bharti", 140 | "bi", 141 | "bible", 142 | "bid", 143 | "bike", 144 | "bing", 145 | "bingo", 146 | "bio", 147 | "biz", 148 | "bj", 149 | "bl", 150 | "black", 151 | "blackfriday", 152 | "blanco", 153 | 
"blockbuster", 154 | "blog", 155 | "bloomberg", 156 | "blue", 157 | "bm", 158 | "bms", 159 | "bmw", 160 | "bn", 161 | "bnl", 162 | "bnpparibas", 163 | "bo", 164 | "boats", 165 | "boehringer", 166 | "bofa", 167 | "bom", 168 | "bond", 169 | "boo", 170 | "book", 171 | "booking", 172 | "boots", 173 | "bosch", 174 | "bostik", 175 | "boston", 176 | "bot", 177 | "boutique", 178 | "box", 179 | "bq", 180 | "br", 181 | "bradesco", 182 | "bridgestone", 183 | "broadway", 184 | "broker", 185 | "brother", 186 | "brussels", 187 | "bs", 188 | "bt", 189 | "budapest", 190 | "bugatti", 191 | "build", 192 | "builders", 193 | "business", 194 | "buy", 195 | "buzz", 196 | "bv", 197 | "bw", 198 | "by", 199 | "bz", 200 | "bzh", 201 | "ca", 202 | "cab", 203 | "cafe", 204 | "cal", 205 | "call", 206 | "calvinklein", 207 | "cam", 208 | "camera", 209 | "camp", 210 | "cancerresearch", 211 | "canon", 212 | "capetown", 213 | "capital", 214 | "capitalone", 215 | "car", 216 | "caravan", 217 | "cards", 218 | "care", 219 | "career", 220 | "careers", 221 | "cars", 222 | "cartier", 223 | "casa", 224 | "case", 225 | "caseih", 226 | "cash", 227 | "casino", 228 | "cat", 229 | "catering", 230 | "catholic", 231 | "cba", 232 | "cbn", 233 | "cbre", 234 | "cbs", 235 | "cc", 236 | "cd", 237 | "ceb", 238 | "center", 239 | "ceo", 240 | "cern", 241 | "cf", 242 | "cfa", 243 | "cfd", 244 | "cg", 245 | "ch", 246 | "chanel", 247 | "channel", 248 | "charity", 249 | "chase", 250 | "chat", 251 | "cheap", 252 | "chintai", 253 | "chloe", 254 | "christmas", 255 | "chrome", 256 | "chrysler", 257 | "church", 258 | "ci", 259 | "cipriani", 260 | "circle", 261 | "cisco", 262 | "citadel", 263 | "citi", 264 | "citic", 265 | "city", 266 | "cityeats", 267 | "ck", 268 | "cl", 269 | "claims", 270 | "cleaning", 271 | "click", 272 | "clinic", 273 | "clinique", 274 | "clothing", 275 | "cloud", 276 | "club", 277 | "clubmed", 278 | "cm", 279 | "cn", 280 | "co", 281 | "coach", 282 | "codes", 283 | "coffee", 284 | "college", 285 | "cologne", 286 | "com", 287 | "comcast", 288 | "commbank", 289 | "community", 290 | "company", 291 | "compare", 292 | "computer", 293 | "comsec", 294 | "condos", 295 | "construction", 296 | "consulting", 297 | "contact", 298 | "contractors", 299 | "cooking", 300 | "cookingchannel", 301 | "cool", 302 | "coop", 303 | "corsica", 304 | "country", 305 | "coupon", 306 | "coupons", 307 | "courses", 308 | "cr", 309 | "credit", 310 | "creditcard", 311 | "creditunion", 312 | "cricket", 313 | "crown", 314 | "crs", 315 | "cruise", 316 | "cruises", 317 | "csc", 318 | "cu", 319 | "cuisinella", 320 | "cv", 321 | "cw", 322 | "cx", 323 | "cy", 324 | "cymru", 325 | "cyou", 326 | "cz", 327 | "dabur", 328 | "dad", 329 | "dance", 330 | "data", 331 | "date", 332 | "dating", 333 | "datsun", 334 | "day", 335 | "dclk", 336 | "dds", 337 | "de", 338 | "deal", 339 | "dealer", 340 | "deals", 341 | "degree", 342 | "delivery", 343 | "dell", 344 | "deloitte", 345 | "delta", 346 | "democrat", 347 | "dental", 348 | "dentist", 349 | "desi", 350 | "design", 351 | "dev", 352 | "dhl", 353 | "diamonds", 354 | "diet", 355 | "digital", 356 | "direct", 357 | "directory", 358 | "discount", 359 | "discover", 360 | "dish", 361 | "diy", 362 | "dj", 363 | "dk", 364 | "dm", 365 | "dnp", 366 | "do", 367 | "docs", 368 | "doctor", 369 | "dodge", 370 | "dog", 371 | "doha", 372 | "domains", 373 | "doosan", 374 | "dot", 375 | "download", 376 | "drive", 377 | "dtv", 378 | "dubai", 379 | "duck", 380 | "dunlop", 381 | "duns", 382 | "dupont", 383 | "durban", 384 | "dvag", 385 | "dvr", 386 | "dz", 
387 | "earth", 388 | "eat", 389 | "ec", 390 | "eco", 391 | "edeka", 392 | "edu", 393 | "education", 394 | "ee", 395 | "eg", 396 | "eh", 397 | "email", 398 | "emerck", 399 | "energy", 400 | "engineer", 401 | "engineering", 402 | "enterprises", 403 | "epost", 404 | "epson", 405 | "equipment", 406 | "er", 407 | "ericsson", 408 | "erni", 409 | "es", 410 | "esq", 411 | "estate", 412 | "esurance", 413 | "et", 414 | "etisalat", 415 | "eu", 416 | "eurovision", 417 | "eus", 418 | "events", 419 | "everbank", 420 | "exchange", 421 | "expert", 422 | "exposed", 423 | "express", 424 | "extraspace", 425 | "fage", 426 | "fail", 427 | "fairwinds", 428 | "faith", 429 | "family", 430 | "fan", 431 | "fans", 432 | "farm", 433 | "farmers", 434 | "fashion", 435 | "fast", 436 | "fedex", 437 | "feedback", 438 | "ferrari", 439 | "ferrero", 440 | "fi", 441 | "fiat", 442 | "fidelity", 443 | "fido", 444 | "film", 445 | "final", 446 | "finance", 447 | "financial", 448 | "fire", 449 | "firestone", 450 | "firmdale", 451 | "fish", 452 | "fishing", 453 | "fit", 454 | "fitness", 455 | "fj", 456 | "fk", 457 | "flickr", 458 | "flights", 459 | "flir", 460 | "florist", 461 | "flowers", 462 | "flsmidth", 463 | "fly", 464 | "fm", 465 | "fo", 466 | "foo", 467 | "food", 468 | "foodnetwork", 469 | "football", 470 | "ford", 471 | "forex", 472 | "forsale", 473 | "forum", 474 | "foundation", 475 | "fox", 476 | "fr", 477 | "free", 478 | "fresenius", 479 | "frl", 480 | "frogans", 481 | "frontdoor", 482 | "frontier", 483 | "ftr", 484 | "fujitsu", 485 | "fujixerox", 486 | "fun", 487 | "fund", 488 | "furniture", 489 | "futbol", 490 | "fyi", 491 | "ga", 492 | "gal", 493 | "gallery", 494 | "gallo", 495 | "gallup", 496 | "game", 497 | "games", 498 | "gap", 499 | "garden", 500 | "gb", 501 | "gbiz", 502 | "gd", 503 | "gdn", 504 | "ge", 505 | "gea", 506 | "gent", 507 | "genting", 508 | "george", 509 | "gf", 510 | "gg", 511 | "ggee", 512 | "gh", 513 | "gi", 514 | "gift", 515 | "gifts", 516 | "gives", 517 | "giving", 518 | "gl", 519 | "glade", 520 | "glass", 521 | "gle", 522 | "global", 523 | "globo", 524 | "gm", 525 | "gmail", 526 | "gmbh", 527 | "gmo", 528 | "gmx", 529 | "gn", 530 | "godaddy", 531 | "gold", 532 | "goldpoint", 533 | "golf", 534 | "goo", 535 | "goodhands", 536 | "goodyear", 537 | "goog", 538 | "google", 539 | "gop", 540 | "got", 541 | "gov", 542 | "gp", 543 | "gq", 544 | "gr", 545 | "grainger", 546 | "graphics", 547 | "gratis", 548 | "green", 549 | "gripe", 550 | "grocery", 551 | "group", 552 | "gs", 553 | "gt", 554 | "gu", 555 | "guardian", 556 | "gucci", 557 | "guge", 558 | "guide", 559 | "guitars", 560 | "guru", 561 | "gw", 562 | "gy", 563 | "hair", 564 | "hamburg", 565 | "hangout", 566 | "haus", 567 | "hbo", 568 | "hdfc", 569 | "hdfcbank", 570 | "health", 571 | "healthcare", 572 | "help", 573 | "helsinki", 574 | "here", 575 | "hermes", 576 | "hgtv", 577 | "hiphop", 578 | "hisamitsu", 579 | "hitachi", 580 | "hiv", 581 | "hk", 582 | "hkt", 583 | "hm", 584 | "hn", 585 | "hockey", 586 | "holdings", 587 | "holiday", 588 | "homedepot", 589 | "homegoods", 590 | "homes", 591 | "homesense", 592 | "honda", 593 | "honeywell", 594 | "horse", 595 | "hospital", 596 | "host", 597 | "hosting", 598 | "hot", 599 | "hoteles", 600 | "hotels", 601 | "hotmail", 602 | "house", 603 | "how", 604 | "hr", 605 | "hsbc", 606 | "ht", 607 | "htc", 608 | "hu", 609 | "hughes", 610 | "hyatt", 611 | "hyundai", 612 | "ibm", 613 | "icbc", 614 | "ice", 615 | "icu", 616 | "id", 617 | "ie", 618 | "ieee", 619 | "ifm", 620 | "iinet", 621 | "ikano", 622 | "il", 623 | 
"im", 624 | "imamat", 625 | "imdb", 626 | "immo", 627 | "immobilien", 628 | "in", 629 | "industries", 630 | "infiniti", 631 | "info", 632 | "ing", 633 | "ink", 634 | "institute", 635 | "insurance", 636 | "insure", 637 | "int", 638 | "intel", 639 | "international", 640 | "intuit", 641 | "investments", 642 | "io", 643 | "ipiranga", 644 | "iq", 645 | "ir", 646 | "irish", 647 | "is", 648 | "iselect", 649 | "ismaili", 650 | "ist", 651 | "istanbul", 652 | "it", 653 | "itau", 654 | "itv", 655 | "iveco", 656 | "iwc", 657 | "jaguar", 658 | "java", 659 | "jcb", 660 | "jcp", 661 | "je", 662 | "jeep", 663 | "jetzt", 664 | "jewelry", 665 | "jio", 666 | "jlc", 667 | "jll", 668 | "jm", 669 | "jmp", 670 | "jnj", 671 | "jo", 672 | "jobs", 673 | "joburg", 674 | "jot", 675 | "joy", 676 | "jp", 677 | "jpmorgan", 678 | "jprs", 679 | "juegos", 680 | "juniper", 681 | "kaufen", 682 | "kddi", 683 | "ke", 684 | "kerryhotels", 685 | "kerrylogistics", 686 | "kerryproperties", 687 | "kfh", 688 | "kg", 689 | "kh", 690 | "ki", 691 | "kia", 692 | "kim", 693 | "kinder", 694 | "kindle", 695 | "kitchen", 696 | "kiwi", 697 | "km", 698 | "kn", 699 | "koeln", 700 | "komatsu", 701 | "kosher", 702 | "kp", 703 | "kpmg", 704 | "kpn", 705 | "kr", 706 | "krd", 707 | "kred", 708 | "kuokgroup", 709 | "kw", 710 | "ky", 711 | "kyoto", 712 | "kz", 713 | "la", 714 | "lacaixa", 715 | "ladbrokes", 716 | "lamborghini", 717 | "lamer", 718 | "lancaster", 719 | "lancia", 720 | "lancome", 721 | "land", 722 | "landrover", 723 | "lanxess", 724 | "lasalle", 725 | "lat", 726 | "latino", 727 | "latrobe", 728 | "law", 729 | "lawyer", 730 | "lb", 731 | "lc", 732 | "lds", 733 | "lease", 734 | "leclerc", 735 | "lefrak", 736 | "legal", 737 | "lego", 738 | "lexus", 739 | "lgbt", 740 | "li", 741 | "liaison", 742 | "lidl", 743 | "life", 744 | "lifeinsurance", 745 | "lifestyle", 746 | "lighting", 747 | "like", 748 | "lilly", 749 | "limited", 750 | "limo", 751 | "lincoln", 752 | "linde", 753 | "link", 754 | "lipsy", 755 | "live", 756 | "living", 757 | "lixil", 758 | "lk", 759 | "llc", 760 | "loan", 761 | "loans", 762 | "locker", 763 | "locus", 764 | "loft", 765 | "lol", 766 | "london", 767 | "lotte", 768 | "lotto", 769 | "love", 770 | "lpl", 771 | "lplfinancial", 772 | "lr", 773 | "ls", 774 | "lt", 775 | "ltd", 776 | "ltda", 777 | "lu", 778 | "lundbeck", 779 | "lupin", 780 | "luxe", 781 | "luxury", 782 | "lv", 783 | "ly", 784 | "ma", 785 | "macys", 786 | "madrid", 787 | "maif", 788 | "maison", 789 | "makeup", 790 | "man", 791 | "management", 792 | "mango", 793 | "map", 794 | "market", 795 | "marketing", 796 | "markets", 797 | "marriott", 798 | "marshalls", 799 | "maserati", 800 | "mattel", 801 | "mba", 802 | "mc", 803 | "mcd", 804 | "mcdonalds", 805 | "mckinsey", 806 | "md", 807 | "me", 808 | "med", 809 | "media", 810 | "meet", 811 | "melbourne", 812 | "meme", 813 | "memorial", 814 | "men", 815 | "menu", 816 | "meo", 817 | "merckmsd", 818 | "metlife", 819 | "mf", 820 | "mg", 821 | "mh", 822 | "miami", 823 | "microsoft", 824 | "mil", 825 | "mini", 826 | "mint", 827 | "mit", 828 | "mitsubishi", 829 | "mk", 830 | "ml", 831 | "mlb", 832 | "mls", 833 | "mm", 834 | "mma", 835 | "mn", 836 | "mo", 837 | "mobi", 838 | "mobile", 839 | "mobily", 840 | "moda", 841 | "moe", 842 | "moi", 843 | "mom", 844 | "monash", 845 | "money", 846 | "monster", 847 | "montblanc", 848 | "mopar", 849 | "mormon", 850 | "mortgage", 851 | "moscow", 852 | "moto", 853 | "motorcycles", 854 | "mov", 855 | "movie", 856 | "movistar", 857 | "mp", 858 | "mq", 859 | "mr", 860 | "ms", 861 | "msd", 862 
| "mt", 863 | "mtn", 864 | "mtpc", 865 | "mtr", 866 | "mu", 867 | "museum", 868 | "mutual", 869 | "mutuelle", 870 | "mv", 871 | "mw", 872 | "mx", 873 | "my", 874 | "mz", 875 | "na", 876 | "nab", 877 | "nadex", 878 | "nagoya", 879 | "name", 880 | "nationwide", 881 | "natura", 882 | "navy", 883 | "nba", 884 | "nc", 885 | "ne", 886 | "nec", 887 | "net", 888 | "netbank", 889 | "netflix", 890 | "network", 891 | "neustar", 892 | "new", 893 | "newholland", 894 | "news", 895 | "next", 896 | "nextdirect", 897 | "nexus", 898 | "nf", 899 | "nfl", 900 | "ng", 901 | "ngo", 902 | "nhk", 903 | "ni", 904 | "nico", 905 | "nike", 906 | "nikon", 907 | "ninja", 908 | "nissan", 909 | "nissay", 910 | "nl", 911 | "no", 912 | "nokia", 913 | "northwesternmutual", 914 | "norton", 915 | "now", 916 | "nowruz", 917 | "nowtv", 918 | "np", 919 | "nr", 920 | "nra", 921 | "nrw", 922 | "ntt", 923 | "nu", 924 | "nyc", 925 | "nz", 926 | "obi", 927 | "observer", 928 | "off", 929 | "office", 930 | "okinawa", 931 | "olayan", 932 | "olayangroup", 933 | "oldnavy", 934 | "ollo", 935 | "om", 936 | "omega", 937 | "one", 938 | "ong", 939 | "onl", 940 | "online", 941 | "onyourside", 942 | "ooo", 943 | "open", 944 | "oracle", 945 | "orange", 946 | "org", 947 | "organic", 948 | "orientexpress", 949 | "origins", 950 | "osaka", 951 | "otsuka", 952 | "ott", 953 | "ovh", 954 | "pa", 955 | "page", 956 | "pamperedchef", 957 | "panasonic", 958 | "panerai", 959 | "paris", 960 | "pars", 961 | "partners", 962 | "parts", 963 | "party", 964 | "passagens", 965 | "pay", 966 | "pccw", 967 | "pe", 968 | "pet", 969 | "pf", 970 | "pfizer", 971 | "pg", 972 | "ph", 973 | "pharmacy", 974 | "phd", 975 | "philips", 976 | "phone", 977 | "photo", 978 | "photography", 979 | "photos", 980 | "physio", 981 | "piaget", 982 | "pics", 983 | "pictet", 984 | "pictures", 985 | "pid", 986 | "pin", 987 | "ping", 988 | "pink", 989 | "pioneer", 990 | "pizza", 991 | "pk", 992 | "pl", 993 | "place", 994 | "play", 995 | "playstation", 996 | "plumbing", 997 | "plus", 998 | "pm", 999 | "pn", 1000 | "pnc", 1001 | "pohl", 1002 | "poker", 1003 | "politie", 1004 | "porn", 1005 | "post", 1006 | "pr", 1007 | "pramerica", 1008 | "praxi", 1009 | "press", 1010 | "prime", 1011 | "pro", 1012 | "prod", 1013 | "productions", 1014 | "prof", 1015 | "progressive", 1016 | "promo", 1017 | "properties", 1018 | "property", 1019 | "protection", 1020 | "pru", 1021 | "prudential", 1022 | "ps", 1023 | "pt", 1024 | "pub", 1025 | "pw", 1026 | "pwc", 1027 | "py", 1028 | "qa", 1029 | "qpon", 1030 | "quebec", 1031 | "quest", 1032 | "qvc", 1033 | "racing", 1034 | "radio", 1035 | "raid", 1036 | "re", 1037 | "read", 1038 | "realestate", 1039 | "realtor", 1040 | "realty", 1041 | "recipes", 1042 | "red", 1043 | "redstone", 1044 | "redumbrella", 1045 | "rehab", 1046 | "reise", 1047 | "reisen", 1048 | "reit", 1049 | "reliance", 1050 | "ren", 1051 | "rent", 1052 | "rentals", 1053 | "repair", 1054 | "report", 1055 | "republican", 1056 | "rest", 1057 | "restaurant", 1058 | "review", 1059 | "reviews", 1060 | "rexroth", 1061 | "rich", 1062 | "richardli", 1063 | "ricoh", 1064 | "rightathome", 1065 | "ril", 1066 | "rio", 1067 | "rip", 1068 | "rmit", 1069 | "ro", 1070 | "rocher", 1071 | "rocks", 1072 | "rodeo", 1073 | "rogers", 1074 | "room", 1075 | "rs", 1076 | "rsvp", 1077 | "ru", 1078 | "rugby", 1079 | "ruhr", 1080 | "run", 1081 | "rw", 1082 | "rwe", 1083 | "ryukyu", 1084 | "sa", 1085 | "saarland", 1086 | "safe", 1087 | "safety", 1088 | "sakura", 1089 | "sale", 1090 | "salon", 1091 | "samsclub", 1092 | "samsung", 1093 | 
"sandvik", 1094 | "sandvikcoromant", 1095 | "sanofi", 1096 | "sap", 1097 | "sapo", 1098 | "sarl", 1099 | "sas", 1100 | "save", 1101 | "saxo", 1102 | "sb", 1103 | "sbi", 1104 | "sbs", 1105 | "sc", 1106 | "sca", 1107 | "scb", 1108 | "schaeffler", 1109 | "schmidt", 1110 | "scholarships", 1111 | "school", 1112 | "schule", 1113 | "schwarz", 1114 | "science", 1115 | "scjohnson", 1116 | "scor", 1117 | "scot", 1118 | "sd", 1119 | "se", 1120 | "search", 1121 | "seat", 1122 | "secure", 1123 | "security", 1124 | "seek", 1125 | "select", 1126 | "sener", 1127 | "services", 1128 | "ses", 1129 | "seven", 1130 | "sew", 1131 | "sex", 1132 | "sexy", 1133 | "sfr", 1134 | "sg", 1135 | "sh", 1136 | "shangrila", 1137 | "sharp", 1138 | "shaw", 1139 | "shell", 1140 | "shia", 1141 | "shiksha", 1142 | "shoes", 1143 | "shop", 1144 | "shopping", 1145 | "shouji", 1146 | "show", 1147 | "showtime", 1148 | "shriram", 1149 | "si", 1150 | "silk", 1151 | "sina", 1152 | "singles", 1153 | "site", 1154 | "sj", 1155 | "sk", 1156 | "ski", 1157 | "skin", 1158 | "sky", 1159 | "skype", 1160 | "sl", 1161 | "sling", 1162 | "sm", 1163 | "smart", 1164 | "smile", 1165 | "sn", 1166 | "sncf", 1167 | "so", 1168 | "soccer", 1169 | "social", 1170 | "softbank", 1171 | "software", 1172 | "sohu", 1173 | "solar", 1174 | "solutions", 1175 | "song", 1176 | "sony", 1177 | "soy", 1178 | "space", 1179 | "spiegel", 1180 | "sport", 1181 | "spot", 1182 | "spreadbetting", 1183 | "sr", 1184 | "srl", 1185 | "srt", 1186 | "ss", 1187 | "st", 1188 | "stada", 1189 | "staples", 1190 | "star", 1191 | "starhub", 1192 | "statebank", 1193 | "statefarm", 1194 | "statoil", 1195 | "stc", 1196 | "stcgroup", 1197 | "stockholm", 1198 | "storage", 1199 | "store", 1200 | "stream", 1201 | "studio", 1202 | "study", 1203 | "style", 1204 | "su", 1205 | "sucks", 1206 | "supplies", 1207 | "supply", 1208 | "support", 1209 | "surf", 1210 | "surgery", 1211 | "suzuki", 1212 | "sv", 1213 | "swatch", 1214 | "swiftcover", 1215 | "swiss", 1216 | "sx", 1217 | "sy", 1218 | "sydney", 1219 | "symantec", 1220 | "systems", 1221 | "sz", 1222 | "tab", 1223 | "taipei", 1224 | "talk", 1225 | "taobao", 1226 | "target", 1227 | "tatamotors", 1228 | "tatar", 1229 | "tattoo", 1230 | "tax", 1231 | "taxi", 1232 | "tc", 1233 | "tci", 1234 | "td", 1235 | "tdk", 1236 | "team", 1237 | "tech", 1238 | "technology", 1239 | "tel", 1240 | "telecity", 1241 | "telefonica", 1242 | "temasek", 1243 | "tennis", 1244 | "teva", 1245 | "tf", 1246 | "tg", 1247 | "th", 1248 | "thd", 1249 | "theater", 1250 | "theatre", 1251 | "tiaa", 1252 | "tickets", 1253 | "tienda", 1254 | "tiffany", 1255 | "tips", 1256 | "tires", 1257 | "tirol", 1258 | "tj", 1259 | "tjmaxx", 1260 | "tjx", 1261 | "tk", 1262 | "tkmaxx", 1263 | "tl", 1264 | "tm", 1265 | "tmall", 1266 | "tn", 1267 | "to", 1268 | "today", 1269 | "tokyo", 1270 | "tools", 1271 | "top", 1272 | "toray", 1273 | "toshiba", 1274 | "total", 1275 | "tours", 1276 | "town", 1277 | "toyota", 1278 | "toys", 1279 | "tp", 1280 | "tr", 1281 | "trade", 1282 | "trading", 1283 | "training", 1284 | "travel", 1285 | "travelchannel", 1286 | "travelers", 1287 | "travelersinsurance", 1288 | "trust", 1289 | "trv", 1290 | "tt", 1291 | "tube", 1292 | "tui", 1293 | "tunes", 1294 | "tushu", 1295 | "tv", 1296 | "tvs", 1297 | "tw", 1298 | "tz", 1299 | "ua", 1300 | "ubank", 1301 | "ubs", 1302 | "uconnect", 1303 | "ug", 1304 | "uk", 1305 | "um", 1306 | "unicom", 1307 | "university", 1308 | "uno", 1309 | "uol", 1310 | "ups", 1311 | "us", 1312 | "uy", 1313 | "uz", 1314 | "va", 1315 | "vacations", 1316 | 
"vana", 1317 | "vanguard", 1318 | "vc", 1319 | "ve", 1320 | "vegas", 1321 | "ventures", 1322 | "verisign", 1323 | "versicherung", 1324 | "vet", 1325 | "vg", 1326 | "vi", 1327 | "viajes", 1328 | "video", 1329 | "vig", 1330 | "viking", 1331 | "villas", 1332 | "vin", 1333 | "vip", 1334 | "virgin", 1335 | "visa", 1336 | "vision", 1337 | "vista", 1338 | "vistaprint", 1339 | "viva", 1340 | "vivo", 1341 | "vlaanderen", 1342 | "vn", 1343 | "vodka", 1344 | "volkswagen", 1345 | "volvo", 1346 | "vote", 1347 | "voting", 1348 | "voto", 1349 | "voyage", 1350 | "vu", 1351 | "vuelos", 1352 | "wales", 1353 | "walmart", 1354 | "walter", 1355 | "wang", 1356 | "wanggou", 1357 | "warman", 1358 | "watch", 1359 | "watches", 1360 | "weather", 1361 | "weatherchannel", 1362 | "webcam", 1363 | "weber", 1364 | "website", 1365 | "wed", 1366 | "wedding", 1367 | "weibo", 1368 | "weir", 1369 | "wf", 1370 | "whoswho", 1371 | "wien", 1372 | "wiki", 1373 | "williamhill", 1374 | "win", 1375 | "windows", 1376 | "wine", 1377 | "winners", 1378 | "wme", 1379 | "wolterskluwer", 1380 | "woodside", 1381 | "work", 1382 | "works", 1383 | "world", 1384 | "wow", 1385 | "ws", 1386 | "wtc", 1387 | "wtf", 1388 | "xbox", 1389 | "xerox", 1390 | "xfinity", 1391 | "xihuan", 1392 | "xin", 1393 | "测试", 1394 | "कॉम", 1395 | "परीक्षा", 1396 | "セール", 1397 | "佛山", 1398 | "ಭಾರತ", 1399 | "慈善", 1400 | "集团", 1401 | "在线", 1402 | "한국", 1403 | "ଭାରତ", 1404 | "大众汽车", 1405 | "点看", 1406 | "คอม", 1407 | "ভাৰত", 1408 | "ভারত", 1409 | "八卦", 1410 | "‏موقع‎", 1411 | "বাংলা", 1412 | "公益", 1413 | "公司", 1414 | "香格里拉", 1415 | "网站", 1416 | "移动", 1417 | "我爱你", 1418 | "москва", 1419 | "испытание", 1420 | "қаз", 1421 | "католик", 1422 | "онлайн", 1423 | "сайт", 1424 | "联通", 1425 | "срб", 1426 | "бг", 1427 | "бел", 1428 | "‏קום‎", 1429 | "时尚", 1430 | "微博", 1431 | "테스트", 1432 | "淡马锡", 1433 | "ファッション", 1434 | "орг", 1435 | "नेट", 1436 | "ストア", 1437 | "삼성", 1438 | "சிங்கப்பூர்", 1439 | "商标", 1440 | "商店", 1441 | "商城", 1442 | "дети", 1443 | "мкд", 1444 | "‏טעסט‎", 1445 | "ею", 1446 | "ポイント", 1447 | "新闻", 1448 | "工行", 1449 | "家電", 1450 | "‏كوم‎", 1451 | "中文网", 1452 | "中信", 1453 | "中国", 1454 | "中國", 1455 | "娱乐", 1456 | "谷歌", 1457 | "భారత్", 1458 | "ලංකා", 1459 | "電訊盈科", 1460 | "购物", 1461 | "測試", 1462 | "クラウド", 1463 | "ભારત", 1464 | "通販", 1465 | "भारतम्", 1466 | "भारत", 1467 | "भारोत", 1468 | "‏آزمایشی‎", 1469 | "பரிட்சை", 1470 | "网店", 1471 | "संगठन", 1472 | "餐厅", 1473 | "网络", 1474 | "ком", 1475 | "укр", 1476 | "香港", 1477 | "诺基亚", 1478 | "食品", 1479 | "δοκιμή", 1480 | "飞利浦", 1481 | "‏إختبار‎", 1482 | "台湾", 1483 | "台灣", 1484 | "手表", 1485 | "手机", 1486 | "мон", 1487 | "‏الجزائر‎", 1488 | "‏عمان‎", 1489 | "‏ارامكو‎", 1490 | "‏ایران‎", 1491 | "‏العليان‎", 1492 | "‏اتصالات‎", 1493 | "‏امارات‎", 1494 | "‏بازار‎", 1495 | "‏موريتانيا‎", 1496 | "‏پاکستان‎", 1497 | "‏الاردن‎", 1498 | "‏موبايلي‎", 1499 | "‏بارت‎", 1500 | "‏بھارت‎", 1501 | "‏المغرب‎", 1502 | "‏ابوظبي‎", 1503 | "‏السعودية‎", 1504 | "‏ڀارت‎", 1505 | "‏كاثوليك‎", 1506 | "‏سودان‎", 1507 | "‏همراه‎", 1508 | "‏عراق‎", 1509 | "‏مليسيا‎", 1510 | "澳門", 1511 | "닷컴", 1512 | "政府", 1513 | "‏شبكة‎", 1514 | "‏بيتك‎", 1515 | "‏عرب‎", 1516 | "გე", 1517 | "机构", 1518 | "组织机构", 1519 | "健康", 1520 | "ไทย", 1521 | "‏سورية‎", 1522 | "招聘", 1523 | "рус", 1524 | "рф", 1525 | "珠宝", 1526 | "‏تونس‎", 1527 | "大拿", 1528 | "みんな", 1529 | "グーグル", 1530 | "ελ", 1531 | "世界", 1532 | "書籍", 1533 | "ഭാരതം", 1534 | "ਭਾਰਤ", 1535 | "网址", 1536 | "닷넷", 1537 | "コム", 1538 | "天主教", 1539 | "游戏", 1540 | "vermögensberater", 1541 | "vermögensberatung", 1542 | "企业", 1543 | "信息", 
1544 | "嘉里大酒店", 1545 | "嘉里", 1546 | "‏مصر‎", 1547 | "‏قطر‎", 1548 | "广东", 1549 | "இலங்கை", 1550 | "இந்தியா", 1551 | "հայ", 1552 | "新加坡", 1553 | "‏فلسطين‎", 1554 | "テスト", 1555 | "政务", 1556 | "xperia", 1557 | "xxx", 1558 | "xyz", 1559 | "yachts", 1560 | "yahoo", 1561 | "yamaxun", 1562 | "yandex", 1563 | "ye", 1564 | "yodobashi", 1565 | "yoga", 1566 | "yokohama", 1567 | "you", 1568 | "youtube", 1569 | "yt", 1570 | "yun", 1571 | "za", 1572 | "zappos", 1573 | "zara", 1574 | "zero", 1575 | "zip", 1576 | "zippo", 1577 | "zm", 1578 | "zone", 1579 | "zuerich", 1580 | "zw",] 1581 | tlds_2nd_lvl = ["ab.ca","ac.ac","ac.ae","ac.at","ac.be","ac.cn","ac.il","ac.in","ac.jp","ac.kr","ac.th","ac.uk","ac.sg","ad.jp","adm.br","adv.br","ah.cn","am.br","arq.br","art.br","arts.ro","asn.au","asso.fr","asso.mc","bc.ca","bio.br","biz.pl","biz.tr","bj.cn","bel.tr","br.com","cn.com","cng.br","cnt.br","co.ac","co.at","co.de","co.gl","co.hk","co.id","co.il","co.in","co.jp","co.kr","co.mg","co.ms","co.nz","co.th","cp.tz","co.uk","co.ve","co.vi","co.za","com.ag","com.ai","com.ar","com.au","com.br","com.co","com.cn","com.cy","com.de","com.do","com.ec","com.es","com.fj","com.fr","com.gl","com.gt","com.hk","com.hr","com.hu","com.kg","com.ki","com.lc","com.mg","com.mm","com.ms","com.mt","com.mu","com.mx","com.my","com.na","com.nf","com.ng","com.ni","com.pa","com.ph","com.pl","com.pt","com.qa","com.ro","com.ru","com.sb","com.sc","com.sg","com.sv","com.tr","com.tw","com.ua","com.uy","com.ve","com.vn","cq.cn","de.com","de.org","ecn.br","ed.jp","edu.au","edu.cn","edu.hk","edu.mm","edu.my","edu.pl","edu.pt","edu.qa","edu.sg","edu.tr","edu.tw","eng.br","ernet.in","esp.br","etc.br","eti.br","eu.com","eu.int","eu.lv","firm.in","firm.ro","fm.br","fot.br","fst.br","g12.br","gb.com","gb.net","gd.cn","gen.in","go.jp","go.kr","go.th","gov.au","gov.az","gov.br","gov.cn","gov.il","gov.in","gov.mm","gov.my","gov.qa","gov.sg","gov.tr","gov.tw","gov.uk","govt.nz","gr.jp","gs.cn","gv.ac","gv.at","gx.cn","gz.cn","he.cn","hi.cn","hk.cn","hl.cn","hu.com","id.au","idv.tw","in.ua","in.th","ind.br","ind.in","inf.br","info.pl","info.ro","info.tr","info.ve","iwi.nz","jl.cn","jor.br","js.cn","jus.br","k12.il","k12.tr","kr.com","lel.br","lg.jp","ln.cn","ltd.uk","maori.nz","mb.ca","me.uk","med.br","mi.th","mil.br","mil.uk","mo.cn","mod.uk","muni.il","nb.ca","ne.jp","ne.kr","net.ag","net.ai","net.au","net.br","net.cn","net.do","net.gl","net.hk","net.il","net.in","net.kg","net.ki","net.lc","net.mg","net.mm","net.mu","net.ni","net.nz","net.pl","net.ru","net.sb","net.sc","net.sg","net.th","net.tr","net.tw","net.uk","net.ve","nf.ca","nhs.uk","nm.cn","nm.kr","no.com","nom.br","nom.ni","nom.ro","ns.ca","nt.ca","nt.ro","ntr.br","nx.cn","odo.br","off.ai","on.ca","or.ac","or.at","or.jp","or.kr","or.th","org.ag","org.ai","org.au","org.br","org.cn","org.do","org.es","org.gl","org.hk","org.in","org.kg","org.ki","org.lc","org.mg","org.mm","org.ms","org.nf","org.ng","org.ni","org.nz","org.pl","org.ro","org.ru","org.sb","org.sc","org.sg","org.tr","org.tw","org.uk","org.ve","pe.ca","plc.uk","police.uk","ppg.br","presse.fr","pro.br","psc.br","psi.br","qc.ca","qc.com","qh.cn","rec.br","rec.ro","res.in","sa.com","sc.cn","sch.uk","se.com","se.net","sh.cn","sk.ca","slg.br","sn.cn","store.ro","tj.cn","tm.fr","tm.mc","tm.ro","tmp.br","tur.br","tv.br","tv.tr","tw.cn","uk.com","uk.net","us.com","uy.com","vet.br","waw.pl","web.ve","www.ro","xj.cn","xz.cn","yk.ca","yn.cn","zj.cn","zlg.br"] 1582 | 1583 | 1584 | def tld_detection(line): 1585 | probe_global = [] 1586 | global 
tld_value   # note: never assigned anywhere, so this global declaration is effectively dead code
1587 |     line = re.sub(r"/$", "", line)  # strip a trailing slash once, instead of on every iteration
1588 |     for tld in tlds_2nd_lvl:
1589 |         # escape the suffix so the dot in e.g. "co.uk" is matched literally
1590 |         probe = re.findall(r"\.%s$" % re.escape(tld), line)
1591 |         if probe:
1592 |             probe_global.append(tld)
1593 |             return tld
1594 | 
1595 |     # no second-level suffix matched; fall back to the single-label TLD list
1596 |     if not probe_global:
1597 |         for tld in tlds_1st_lvl:
1598 |             probe = re.findall(r"\.%s$" % re.escape(tld), line)
1599 |             if probe:
1600 |                 return tld
1601 | 
--------------------------------------------------------------------------------
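The matching logic above boils down to a longest-known-suffix check: try the second-level suffixes first, then fall back to plain TLDs. The snippet below is a minimal, self-contained sketch of that idea for reference only; `demo_tld_detection`, `DEMO_2ND_LVL`, and `DEMO_1ST_LVL` are hypothetical names and trimmed-down lists used purely for illustration, not part of the project's code.

```python
import re

# Hypothetical, trimmed-down suffix lists used only for this demonstration.
DEMO_2ND_LVL = ["co.uk", "com.au", "ac.jp"]
DEMO_1ST_LVL = ["com", "org", "uk"]

def demo_tld_detection(host: str):
    """Return the first known suffix terminating `host`, preferring
    second-level suffixes (e.g. 'co.uk') over plain TLDs (e.g. 'uk')."""
    host = re.sub(r"/$", "", host)  # drop a trailing slash, as tld_detection does
    for tld in DEMO_2ND_LVL + DEMO_1ST_LVL:
        if re.search(r"\.%s$" % re.escape(tld), host):
            return tld
    return None

if __name__ == "__main__":
    print(demo_tld_detection("sub.example.co.uk"))  # -> co.uk
    print(demo_tld_detection("example.org"))        # -> org
```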