├── README.md
└── websitesearch.sh


/README.md:
--------------------------------------------------------------------------------
 1 | # wesbitesearch
 2 | ## Website search bash script for SEO comparisons / OSINT. 
 3 | Ignores robots.txt, goes slowly so has the side effect of making some analytics software like Google Analytics think this is organic traffic.
 4 | 
 5 | ## Current version 1.6
 6 | 
 7 | ### Usage:
 8 | ./websitesearch.sh url depth searchquery urllist
 9 | * URL: https:// or http:// url. Note, does not cope well with websites that redirect www. to root
10 | * Depth: Number of levels of links that should be followed
11 | * Search Query: Will search pages for a string and return pages with matches. i.e. passwords, secrets, SEO comparisons
12 | * URLList: Replaces first level of scraping with a file that is a list of URLs
13 | 
14 | ### Features:
15 | * Progress indicator renders on html output while in progress, this is a bit glitchy but functional
16 | * Scrapes website to specified depth
17 | * Returns location of web server on google map
18 | * Shows page structure and location of email addresses with js plugin (visualising link structure)
19 | * Shows h1, h2, h3 tags on pages that contain search term
20 | * Returns emails from all pages
21 | * Recent projects appear in bottom menu
22 | * Does not follow links outisde the chosen domain (this is a feature not a bug)
23 | * Outputs debug file for error checking
24 | 
25 | ### Todo:
26 | * Config file for API keys etc.
27 | * Improve comamnd line args so that search queries etc. are not needed, set default depth for quick search
28 | * Edge link chart for visualising link structure
29 | * Allow comparison searches for the same domain
30 | * Allow specific output html file (not hardcoded)
31 | * Text based output if required
32 | 
33 | ### If i can be bothered:
34 | * Move scraping function to python script and allow multithreading to improve speed
35 | 
36 | -------------------------------------------
37 | 


--------------------------------------------------------------------------------
/websitesearch.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ########################################################
  3 | # websitesearch.sh
  4 | # Version 1.6
  5 | ########################################################
  6 | # Usage:
  7 | # websitesearch.sh url depth searchquery urllist
  8 | #
  9 | # url = $1
 10 | # depth = $2
 11 | # searchquery = $3
 12 | # urllist = $4
 13 | #
 14 | 
 15 | breakloop ()
 16 | {
 17 | 	exitloop=1
 18 | }
 19 | 
 20 | trap breakloop SIGINT SIGTERM
 21 | 
 22 | function ProgressBar {
 23 | let _progress=(${1}*100/${2}*100)/100
 24 | let _done=(${_progress}*4)/10
 25 | let _left=40-$_done
 26 | _fill=$(printf "%${_done}s")
 27 | _empty=$(printf "%${_left}s")
 28 | 
 29 | printf "\rProgress : [${_fill// /\#}${_empty// /-}] ${_progress}%%" | tee progressbar
 30 | progressbarString=$(cat progressbar)
 31 | 
 32 | # Update progress % on html output file
 33 | sed -i "s/Progress.*/${progressbarString}/g" $htmlOutput
 34 |  
 35 | }
 36 | 
 37 | #########################################################
 38 | # Check command line args
 39 | #########################################################
 40 | 
 41 | if [ "$#" -eq 4 ]; then
 42 | 	if [ -s $4 ]; then
 43 | 		listgiven=1
 44 | 		cp $4 level0.links
 45 | 		depth=depth+1
 46 | 	else 
 47 | 		echo "Error: url list file does not exist" >&2; exit 1
 48 | 	fi
 49 | fi
 50 | 
 51 | if [ "$#" -gt 3 ]; then
 52 | 	searchquery=$3
 53 | 	#searchquery=$(echo $3 | sed -e "s/'/'\\\\''/g; 1s/^/'/; \$s/\$/'/")
 54 | 	if [ $searchquery = '' ]; then
 55 | 		echo "Error: search query is invalid" >&2; exit 1
 56 | 	fi	
 57 | fi
 58 | 
 59 | if [ "$#" -gt 2 ]; then
 60 | 	re='^[0-9]+$'
 61 | 	if ! [[ $2 =~ $re ]]; then
 62 |  		echo "Error: Depth is not a number" >&2; exit 1
 63 | 	else
 64 | 		depth=$2
 65 | 	fi
 66 | fi
 67 | 
 68 | if [ "$#" -eq 0 ]; then
 69 | 	echo "Usage: websitesearch url depth searchquery urllist" >&2; exit 1
 70 | fi
 71 | 
 72 | if [ "$#" -gt 1 ]; then
 73 | 	re='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]'
 74 | 	if [[ $1 =~ $re ]]; then 
 75 | 
 76 | 		#Make a folder for the project files
 77 | 		project=$(echo $1 | sed -e 's/https:\/\///g' -e 's/\///g' -e 's/http:\/\///g')
 78 | 		mkdir -p $project
 79 | 		cd $project
 80 | 		touch pages.txt
 81 | 		printf "\r\n"
 82 | 
 83 | 		## Add in HTML Output feature
 84 | 		## Modify the htmlOutput variable to change the location of the output file, will add a command line parameter for this in future
 85 | 		htmlName=$(echo $project | sed -e 's/\/\(.*\)./\1/')
 86 | 		htmlOutput=$(echo "/var/www/html/$htmlName.html")
 87 | 		echo "html file is called: $htmlName"
 88 | 	       	dateStarted=$(date +"%m-%d-%Y")	
 89 | 		
 90 | 		## Add in HTML Header and styles
 91 | 		printf '\r<html>\n<head>\n' > $htmlOutput
 92 | 		echo "<title>Websitesearch $htmlName $dateStarted</title>" >> $htmlOutput
 93 | 		# Auto-refresh page
 94 | 		printf '<meta http-equiv="refresh" content="2" >\n' >> $htmlOutput
 95 | 		# CSS
 96 | 		
 97 | 		# Google maps
 98 | 		printf '<style>#map { width: 100%%; height: 400px; background-color: black; } </style>' >> $htmlOutput
 99 | 
100 | 		# Header menu
101 | 		printf '<style>ul { list-style-type: none; margin: 0; padding: 0; overflow: hidden; background-color: #333; position: fixed; top: 0; width:100%%; }\r\nli { float: left; }\r\nli a { display: block; color: white; text-align: center; padding: 14px 16px; text-decoration: none; }\r\nli a:hover:not(.active) { background-color: #111; }\r\n.active { background-color: #4CAF50;
102 | }\r\n</style>' >> $htmlOutput
103 | 
104 | 		# Footer menu for recent projects
105 | 		printf '<style>projects { list-style-type: none; margin: 0; padding: 0; overflow: hidden; background-color: #333; position: fixed; bottom: 0; width: 100%%;}</style>\r\n' >> $htmlOutput
106 | 
107 | 		# Buttons for pages
108 | 	       printf '\r\n<style>.linkButton { -moz-box-shadow:inset 0px 34px 0px -15px #b54b3a; -webkit-box-shadow:inset 0px 34px 0px -15px #b54b3a; box-shadow:inset 0px 34px 0px -15px #b54b3a; background-color:#a73f2d; border:1px solid #241d13; display:inline-block; cursor:pointer; color:#ffffff; font-family:Arial; font-size:15px; font-weight:bold; padding:9px 23px; text-decoration:none; text-shadow:0px -1px 0px #7a2a1d; }\r\n.linkButton:hover { background-color:#b34332; }\r\n .linkButton:active { position:relative; top:1px; }</style>' >> $htmlOutput	
109 | 
110 | 	        # CSS for filename heading
111 | 		printf '\r\n<style>.filename { -moz-box-shadow:inset 0px 1px 0px 0px #ffffff;-webkit-box-shadow:inset 0px 1px 0px 0px #ffffff;box-shadow:inset 0px 1px 0px 0px #ffffff;background:-webkit-gradient(linear, left top, left bottom, color-stop(0.05, #ffffff), color-stop(1, #f6f6f6));background:-moz-linear-gradient(top, #ffffff 5%%, #f6f6f6 100%%);background:-webkit-linear-gradient(top, #ffffff 5%%, #f6f6f6 100%%);background:-o-linear-gradient(top, #ffffff 5%%, #f6f6f6 100%%);background:-ms-linear-gradient(top, #ffffff 5%%, #f6f6f6 100%%);background:linear-gradient(to bottom, #ffffff 5%%, #f6f6f6 100%%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#ffffff", endColorstr="#f6f6f6",GradientType=0);background-color:#ffffff;-moz-border-radius:6px;-webkit-border-radius:6px;border-radius:6px;border:1px solid #dcdcdc;display:inline-block;cursor:pointer;color:#666666;font-family:Arial;font-size:15px;font-weight:bold;padding:6px 24px;text-decoration:none;text-shadow:0px 1px 0px #ffffff;}.filename:hover {background:-webkit-gradient(linear, left top, left bottom, color-stop(0.05, #f6f6f6), color-stop(1, #ffffff));	background:-moz-linear-gradient(top, #f6f6f6 5%%, #ffffff 100%%);background:-webkit-linear-gradient(top, #f6f6f6 5%%, #ffffff 100%%);background:-o-linear-gradient(top, #f6f6f6 5%%, #ffffff 100%%);background:-ms-linear-gradient(top, #f6f6f6 5%%, #ffffff 100%%);	background:linear-gradient(to bottom, #f6f6f6 5%%, #ffffff 100%%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#f6f6f6", endColorstr="#ffffff",GradientType=0);background-color:#f6f6f6;}.filename:active {position:relative;top:1px;}</style>' >> $htmlOutput
112 | 		
113 | 		# Google fonts
114 | 		printf '<link href="https://fonts.googleapis.com/css?family=Zilla Slab" rel="stylesheet">\n<link href="https://fonts.googleapis.com/css?family=Source Code Pro" rel="stylesheet">\n' >> $htmlOutput
115 | 		printf '<style>\r\na { color: white; }\r\nbody { font-family: "Source Code Pro"; font-size: 12px; background-color: black; }\r\nh1 { color: white; font-size; 35px}\r\nh2 { color: crimson; font-size: 25px }\r\nh3 { color: white; }\r\np { color: lime; }\r\nheading { font-family: "Source Code Pro"; font-size: 18px; color: white; float: right; text-align: right vertical-align: middle}\r\ndiv.search { line-height: 100%%; colour: lime; }\r\ndiv.header { font-family: "Zilla Slab"; font-size: 20px; line-height: 80%%; }\r\ndiv.scan { color: lime; font-size: 15px; line-height: 80%%; }\r\nsearchterm { color: white; font-size 25 }\r\n</style>' >> $htmlOutput
116 | 		
117 | 		# d3 Graph for link structure (emails, 404s)
118 | 	 	printf '<style>\r\n.link {\r\nfill: none;\r\nstroke: #666;\r\nstroke-width: 1.5px;\r\n}\r\n#404\r\n{\r\nfill: red;\r\n}\r\n#ahref {\r\nfill: green;\r\n}\r\n.link.ahref\r\n{\r\nstroke: green;\r\n}\r\n.link.email {\r\nstroke-dasharray: 0,2 1;\r\n}\r\n.link.404 { stroke: red; }\r\ncircle {\r\nfill: #ccc;\r\nstroke: #333;\r\nstroke-width: 1.5px;\r\n}\r\ntext {\r\nfont: 10px sans-serif;\r\npointer-events: none;\r\ntext-shadow: 0 1px 0 #fff, 1px 0 0 #fff, 0 -1px 0 #fff, -1px 0 0 #fff;\r\n}\r\n</style>' >> $htmlOutput
119 | 		printf '\r\n<script src="https://d3js.org/d3.v3.min.js"></script>\r\n' >> $htmlOutput
120 | 		structurePages=$(echo pages.txt)
121 | 
122 | 		# Edge point graph for link structure
123 | 		#printf '<style>\r\n.node {\r\n  font: 300 11px "Helvetica Neue", Helvetica, Arial, sans-serif;\r\n  fill: #bbb;\r\n}\r\n\r\n.node:hover {\r\n  fill: #000;\r\n}\r\n\r\n.link {\r\n  stroke: steelblue;\r\n  stroke-opacity: 0.4;\r\n  fill: none;\r\n  pointer-events: none;\r\n}\r\n\r\n.node:hover,\r\n.node--source,\r\n.node--target {\r\n  font-weight: 700;\r\n}\r\n\r\n.node--source {\r\n  fill: #2ca02c;\r\n}\r\n\r\n.node--target {\r\n  fill: #d62728;\r\n}\r\n\r\n.link--source,\r\n.link--target {\r\n  stroke-opacity: 1;\r\n  stroke-width: 2px;\r\n}\r\n\r\n.link--source {\r\n  stroke: #d62728;\r\n}\r\n\r\n.link--target {\r\n  stroke: #2ca02c;\r\n}\r\n\r\n</style>\r\n' >> $htmlOutput
124 | 
125 | 		# Menu items (header)
126 | 		printf '\n</head>\n\r<body>\r\n<ul>\r\n<li><a class="active" href="#header">Home</a></li>\r\n  <li><a href="#search">Search</a></li>\r\n<li><a href="#files">Files</a></li>\r\n<li><a href="#results">Results</a></li>\r\n<li><a href="#404s">404s</a></li>\r\n <li><a href="#headings">Heading Tags</a></li>\r\n <li><a href="#emails">Emails</a></li>\r\n <li><a href="#otherfiles">Other Files</a></li>\r\n <li><a href="#structure">Link Structure</a></li>\r\n<li><a class="heading">websitesearch Version 1.6  Date: %s  Depth: %d</a></li></ul>\r\n<div class="header" id="header"></br>\r\n</div></br>' $dateStarted $2 >> $htmlOutput
127 | 		
128 | 		# Menu items (footer)
129 | 		printf '<projects>' >> $htmlOutput
130 | 		# Insert current project as active list element
131 | 		printf '<li><a class="active" href="%s">%s</a></li>\r\n' $project $project >> $htmlOutput
132 | 		# Get files in webroot
133 | 		
134 | 		ls /var/www/html/*.html > projectlist
135 | 		sed -i -e 's/\/var\/www\/html\///g' projectlist
136 | 		# remove specific files from projectlist
137 | 		sed -i -e 's/pages.html//g' projectlist
138 | 
139 | 		while read line; do
140 | 			if [[ ${line}!=${project} ]]; then
141 | 				label=$(echo $line | sed -e 's/.html//g')
142 | 				printf '<li><a href="%s">%s</a></li>' $line $label >> $htmlOutput
143 | 			fi
144 | 		done < projectlist
145 | 
146 | 		printf '</projects>' >> $htmlOutput
147 | 
148 | 		# Get the starting (root) file
149 | 		wget -o wgetlog -O startHere $1
150 | 		indexfile="startHere"
151 | 		grep "failed" wgetlog > errors
152 | 		if [ -s errors ]; then
153 | 			rm wgetlog errors
154 | 			rm -Rf "$project"
155 | 			echo "Error: URL is invalid or down, please check" >&2; exit 1
156 | 		elif [[ $3!='' ]]; then
157 | 			echo "Checking" $1 "for references to" $3 "at a depth of" $2
158 | 		else
159 | 			echo "Checking" $1
160 | 		fi
161 | 	else
162 | 		echo "Error: URL is invalid"  >&2; exit 1
163 | 	fi
164 | fi
165 | 
166 | #########################################################
167 | # Geo-location of server
168 | #########################################################
169 | geodata=$(echo "geodata")
170 | printf "<div id='geo_data'>\r\n<p>" > geodatadiv
171 | printf "<h2 id='geo_data'>\r\nServer Geo Information</h2>" >> geodatadiv
172 | server=$(echo ${1} | sed -e 's/^.*\:\/\///g')
173 | #echo "Server:" $server
174 | serverip=$(nslookup $server | grep "Address:" | tail -1 | sed -e 's/Address\://g' | sed -e 's/\#.*$//g' | sed -e 's/\s//g')
175 | ipinfo=`curl -s https://ipinfo.io/${serverip}`
176 | echo "ServerIP:" $serverip
177 | 
178 | # Get geo data from IP
179 | curl -s https://ipvigilante.com/${serverip} | jq '.data.latitude, .data.longitude, .data.city_name, .data.country_name' | \
180 |         while read -r LATITUDE; do
181 |                 read -r LONGITUDE
182 |                 read -r CITY
183 |                 read -r COUNTRY
184 |                 echo "${LATITUDE},${LONGITUDE},${CITY},${COUNTRY}" | tr --delete \" > ${geodata}
185 |         done
186 | cat geodata >> geodatadiv
187 | lat=$(cat geodata | awk -F, '{print $1}')
188 | long=$(cat geodata | awk -F, '{print $2}')
189 | rm geodata
190 | echo "<p>" >> $htmlOutput
191 | echo geodatadiv
192 | cat geodatadiv >> $htmlOutput
193 | echo "</p>" >> $htmlOutput
194 | 
195 | # Put marker on google map
196 | ## NOTE!!! See google maps API key at the end of this line, you'll need to add yours in here for this to work.
197 | printf "<div id='map'></div>\r\n <script>function initMap() {\r\n var server = {lat: ${lat}, lng: ${long}};\r\n var map = new google.maps.Map(\r\n document.getElementById('map'), {zoom: 12, center: server, styles: [\r\n{elementType: 'geometry', stylers: [{color: '#242f3e'}]},\r\n{elementType: 'labels.text.stroke', stylers: [{color: '#242f3e'}]},\r\n{elementType: 'labels.text.fill', stylers: [{color: '#746855'}]},\r\n{\r\nfeatureType: 'administrative.locality',\r\nelementType: 'labels.text.fill',\r\nstylers: [{color: '#d59563'}]\r\n},\r\n{\r\nfeatureType: 'poi',\r\nelementType: 'labels.text.fill',\r\nstylers: [{color: '#d59563'}]\r\n},\r\n{\r\nfeatureType: 'poi.park',\r\nelementType: 'geometry',\r\nstylers: [{color: '#263c3f'}]\r\n},\r\n{\r\nfeatureType: 'poi.park',\r\nelementType: 'labels.text.fill',\r\nstylers: [{color: '#6b9a76'}]\r\n},\r\n{\r\nfeatureType: 'road',\r\nelementType: 'geometry',\r\nstylers: [{color: '#38414e'}]\r\n},\r\n{\r\nfeatureType: 'road',\r\nelementType: 'geometry.stroke',\r\nstylers: [{color: '#212a37'}]\r\n},\r\n{\r\nfeatureType: 'road',\r\nelementType: 'labels.text.fill',\r\nstylers: [{color: '#9ca5b3'}]\r\n},\r\n{\r\nfeatureType: 'road.highway',\r\nelementType: 'geometry',\r\nstylers: [{color: '#746855'}]\r\n},\r\n{\r\nfeatureType: 'road.highway',\r\nelementType: 'geometry.stroke',\r\nstylers: [{color: '#1f2835'}]\r\n},\r\n{\r\nfeatureType: 'road.highway',\r\nelementType: 'labels.text.fill',\r\nstylers: [{color: '#f3d19c'}]\r\n},\r\n{\r\nfeatureType: 'transit',\r\nelementType: 'geometry',\r\nstylers: [{color: '#2f3948'}]\r\n},\r\n{\r\nfeatureType: 'transit.station',\r\nelementType: 'labels.text.fill',\r\nstylers: [{color: '#d59563'}]\r\n},\r\n{\r\nfeatureType: 'water',\r\nelementType: 'geometry',\r\nstylers: [{color: '#070824'}]\r\n},\r\n{\r\nfeatureType: 'water',\r\nelementType: 'labels.text.fill',\r\nstylers: [{color: '#515c6d'}]\r\n},\r\n{\r\nfeatureType: 'water',\r\nelementType: 'labels.text.stroke',\r\nstylers: [{color: '#17263c'}]\r\n}\r\n]});\r\n var marker = new google.maps.Marker({position: server, map: map});\r\n}</script>\r\n <script async defer src='https://maps.googleapis.com/maps/api/js?key=INSERTMAPSAPIKEY&callback=initMap'></script>\r\n" >> $htmlOutput
198 | 
199 | # Close the div
200 | echo "</p></div>" >> $htmlOutput
201 | 
202 | #########################################################
203 | # Scrape level 0
204 | #########################################################
205 | 
206 | sizeoffile=$(wc -l "$indexfile")
207 | 
208 | _start=1
209 | _end=$sizeoffile
210 | n=1
211 | 
212 | startTime=`date +%s`
213 | 
214 | if [ $listgiven!=1 ]; then
215 | 	while read line; do
216 | 		ProgressBar ${n} ${_end}
217 | 		echo $line | grep -o '<a .*href=.*>' | sed -e 's/<a /\n<a /g' | sed -e 's/<a .*href=['"'"'"]//' -e 's/["'"'"'].*$//' -e '/^$/ d' > linkline 
218 | 		cat linkline | grep $1 > isFullPath
219 | 		if [ -s isFullPath ]; then
220 | 			cat linkline >> level0.links
221 | 			linkline=$(cat linkline)
222 | 			if [[ ! -z "$linkline" ]]; then echo $1 "references:" $linkline >> pages.txt; fi			
223 | 			rm isFullPath
224 | 		else
225 | 			sed -i -e "s#^#${1}#" linkline
226 | 			cat linkline >> level0.links
227 | 			linkline=$(cat linkline)
228 | 			if [[ ! -z "$linkline" ]]; then echo $1 "references:" $linkline >> pages.txt; fi
229 | 		fi
230 | 		rm linkline
231 | 		n=$((n+1))
232 | 	done < $indexfile
233 | fi
234 | 
235 | sort level0.links > level0sorted
236 | uniq level0sorted > level0.links
237 | rm level0sorted
238 | 
239 | endTime=`date +%s`
240 | runTime=$((endTime-startTime))
241 | 
242 | printf "\r\n"
243 | echo "                     Completed in ${runTime} seconds"
244 | 
245 | #########################################################
246 | # Scrape deeper levels
247 | #########################################################
248 | n=0
249 | touch checkedpages
250 | touch level1.links
251 | touch level2.links
252 | touch level3.links
253 | 
254 | printf '<div class="Scan" id="search">' >> $htmlOutput
255 | printf "\r\n<h2 id='depth'> </h2>" >> $htmlOutput
256 | 
257 | while [[ $n -le $depth && $exitloop!=1 ]]
258 | do
259 | 	sed -i -e 's/Please wait\.\.\.<\/br>//g' $htmlOutput
260 | 	next=$((n+1))
261 | 	previous=$((n-1))
262 | 	nextlevel=$(echo "level${next}.links")
263 | 	previouslevel=$(echo "level${previous}.links")
264 | 	linkfile=$(echo "level${n}.links")
265 | 	previous=$((n-1))
266 | 	noLinks=1
267 | 	noLevelZeroLinks=$(wc -l "$linkfile" | sed -e "s/${linkfile}//g")
268 | 	_end=$noLevelZeroLinks
269 | 
270 | 	if [[ "$n" == 0 ]]
271 |        		then previousfile=$(echo "/dev/null")
272 |        	else
273 | 	       	previousfile=$(echo "level${previous}.links")
274 | 	fi
275 | 
276 | 	if [[ "$n" == 0 ]] || [[ -s ${previousfile} ]]; then
277 | 		startTime=`date +%s`
278 | 		depthString=$(printf "<h2 id='depth'>Going %d deep: Checking %d links</h2>" $n $noLevelZeroLinks)
279 | 		sed -i -e "s#<h2 id='depth'>.*<\/h2>#${depthString}#g" $htmlOutput
280 | 		printf '\r\n' >> $htmlOutput
281 | 
282 | 		if [[ $n -ne 0 ]]; then sed -i -e "s/<h2>Going.*/${depthString}/g"; fi
283 | 		echo "Going ${n} deep: Checking ${noLevelZeroLinks} links"
284 | 		pleasewait=$(printf '<p>> Please wait...</p>\r\n' $project)
285 | 	        echo $pleasewait >> $htmlOutput
286 | 		startingscan=$(printf '<p>> Progress: starting scan</p>\r\n' $project)
287 | 	        sed -i -e "s#${pleasewait}#${startingscan}#g" $htmlOutput
288 | 
289 | 		# Replace whitespace with new line character on links file to resolve multiple links on single line issue
290 | 		sed -i -e 's/ /\n/g' $linkfile
291 | 
292 | 		while read line; do
293 | 
294 | 		strippedline=$(echo $line | sed -e 's#.*http://##g' | sed -e 's#.*https://##g')
295 | 
296 | 		if [[ "$exitloop" == 1 ]]; then break; fi
297 | 		
298 | 		#printf "\r\n========= Getting %4d of %4d =========\n" $noLinks $noLevelZeroLinks	
299 | 		ProgressBar ${noLinks} ${_end}
300 | 
301 | 		# Check if page has already been crawled
302 | 		if grep -q "$line" checkedpages; then
303 | 			echo "Page" $line "already checked" >> debug.txt
304 | 		else
305 | 			echo $line | grep "#" > hasHash
306 | 			#echo $line | grep -e '^.*\.comhttp.*' > multipleLinks
307 | 			#echo $line | grep -e '^.*\.nethttp.*' >> multipleLinks
308 | 			echo $line | grep -e '^.*(\.nethttp|\.comhttp|\.auhttp|\/htt).*' >> multipleLinks
309 | 
310 | 			if [ -s hasHash ] || [ -s multipleLinks ]; then
311 | 				echo $line "has # in it, or has external link" >> debug.txt
312 | 				rm hasHash multipleLinks
313 | 			else
314 | 				
315 | 				# Remove // from any url's
316 | 				echo $line | sed -e 's#///#/#2' > downloadme
317 | 				sed -i -e 's#//#/#g' downloadme
318 | 				sed -i -e 's#http:#http:/#g' downloadme
319 | 				sed -i -e 's#https:#https:/#g' downloadme
320 | 				downloadme=$(cat downloadme)
321 | 				
322 | 				# Check for specific file types to exclude from downloading
323 | 				grep -E "^.*(\.exe|\.zip|\.pdf|\.tar|\.gz|\.doc|\.docx)" downloadme > ignore
324 | 				rm downloadme
325 | 
326 | 				if [ -s ignore ]; then
327 | 					echo $downloadme "contains ignored filetype" >> debug.txt
328 | 					echo "<p>$downloadme contains ignore filetype</p>" >> otherFiles
329 | 					rm ignore
330 | 				else
331 | 					echo "Getting" $downloadme >> debug.txt
332 | 					wget -nc --timeout=1 --tries=1 -o wgetOutPut $downloadme
333 | 				fi
334 | 
335 | 				baseFile=$(basename "$downloadme")
336 | 
337 | 				if [ "$baseFile" == '#' ]; then
338 | 					baseFile='index'
339 | 				fi
340 | 	
341 | 				#Output if page does not exist; ADD IN which page references this?
342 | 				if grep -q "404" wgetOutPut; then echo $downloadme "is 404" >> debug.txt 
343 | 					echo "$downloadme is 404" >> 404s
344 | 					echo "$downloadme is 404" >> pages.txt
345 | 
346 | 				else
347 | 					if [ -s "$baseFile" ]; then
348 | 					#echo "basefile exists"
349 | 					pages=1
350 | 
351 | 					numberOfLinesBase=$(wc -l $baseFile | sed -e "s/$baseFile//g" )
352 | 
353 | 					while read htmlLine; do
354 | 						if [[ "$exitloop" == 1 ]]; then break; fi
355 | 						
356 | 						# Detect email addresses
357 | 						shopt -s extglob
358 | 						email=''
359 | 						email=$(echo $htmlLine | grep mailto | sed -e 's/*mailto:\(.*\)\s/\1/' | sed -e 's/^.*mailto://' | sed -e 's/".*$//' | sed -e 's/'\''.*$//')
360 | 						echo "$email" >> emails
361 | 						echo $downloadme "email" $email >> pages.txt
362 | 
363 | 						# Detect soft 404s
364 | 						echo $htmlLine | grep "404" > isSoft404
365 | 						if [[ -s isSoft404 ]]; then echo "> $downloadme might be soft 404" >> 404s; 
366 | 						else	
367 | 							echo $htmlLine | grep -o '<a .*href=.*>' | sed -e 's/<a /\n<a /g' | sed -e 's/<a .*href=['"'"'"]//' -e 's/["'"'"'].*$//' -e '/^$/ d' > linkline
368 | 							cat linkline | grep "https" > isFullPath
369 | 							cat linkline | grep "http" >> isFullPath
370 | 							
371 | 							# Replace empty space \s in line with new line character \n to separate multiple links on a line
372 | 							sed -i -e 's#\s#\n#g' linkline
373 | 							
374 | 							if [[ -n linkline ]]; then
375 | 								if [[ linkline != *"#"* ]]; then
376 | 									if [ -s isFullPath ]; then
377 | 										link=$(cat linkline)
378 | 										echo $link >> $nextlevel
379 | 										echo $htmlLine | grep "$3" > containsSearchQuery
380 | 										echo $htmlLine | grep "$project" > isInternalLink
381 | 										if [[ -s containsSearchQuery && -s isInternalLink ]]; then 
382 | 											echo $downloadme "references:" $link >> pages.txt
383 | 										fi
384 | 										rm containsSearchQuery isInternalLink
385 | 										# html output
386 | 										#echo "</br>" $downloadme "references:" $link >> $htmlOutput
387 | 										echo $downloadme >> checkedpages
388 | 										rm isFullPath
389 | 									else
390 | 										cat linkline | grep "/" > hasslash
391 | 										if [ -s hasslash ]; then
392 | 											sed -i -e "s#^#${1}#" linkline
393 | 											rm hasslash
394 | 										else
395 | 											sed -i -e "s#^#${1}#" linkline
396 | 										fi
397 | 	
398 | 										link=$(cat linkline)
399 | 										echo $link >> $nextlevel
400 | 										# If downloaded page contains the search query, add to pages.txt
401 | 										# Make the graph more useful
402 | 										echo $htmlLine | grep "$3" > containsSearchQuery
403 | 										echo $htmlLine | grep "$project" > isInternalLink
404 | 										if [[ -s containsSearchQuery && -s isInternalLink ]]; then 
405 | 											echo $downloadme "references:" $link >> pages.txt
406 | 										fi
407 | 										rm containsSearchQuery isInternalLink
408 | 										#html output
409 | 										#echo '</br> <a href="'${downloadme}'">'${baseFile}'</a> references: '${link}'' >> ${htmlOutput}
410 | 										echo $downloadme >> checkedpages
411 | 									fi
412 | 									sort pages.txt > upages.txt
413 | 									uniq upages.txt > pages.txt
414 | 									sort $nextlevel > sortednext
415 | 									uniq sortednext > $nextlevel
416 | 									#uniqpages=$(wc -l ../pages.txt)
417 | 									#printf "\r==== Checked %6d of %6d lines ====" $pages $numberOfLinesBase
418 | 							       		pages=$((pages+1))
419 | 									sed -i '/^$/d' $nextlevel
420 | 									rm linkline
421 | 								fi
422 | 							fi
423 | 						fi
424 | 					done < $baseFile
425 | 					fi
426 | 				fi
427 | 			fi
428 | 		fi
429 | 		noLinks=$((noLinks+1))
430 | 		#rm baseFile isFullPath startHere
431 | 		done < $linkfile
432 | 	fi
433 | 
434 | 	# Iterate $n to move through depth levels
435 | 	(( n=n+1 ))
436 | 
437 | 	# Time taken to scan depth $n
438 | 	endTime=`date +%s`
439 | 	runTime=$((endTime-startTime))
440 | 	printf '\r\n'
441 | 	printf '<p>> Completed scan depth %d in %d seconds </p>' $n $runTime >> $htmlOutput
442 | done
443 | 
444 | if [[ $exitloop == 1 ]]; then printf '\r\n<p>> Search terminated by user</p>' >> $htmlOutput; fi
445 | 
446 | cp pages.txt /var/www/html/pages.html
447 | 
448 | #########################################################
449 | # Search query
450 | #########################################################
451 | 
452 | echo "Doing a search now..."
453 | 
454 | instancesOfSearch=0
455 | 
456 | #ls | grep -P ".[0-9]{1}" | xargs -d"\n" rm
457 | ls > pagestocheck 
458 | 
459 | sed -i 's/404s//g; s/emails//g; s/headertags//g; s/checkedpages//g; s/\s//g; s/debug.txt//g; s/errors.txt//g; s/hasHash//g; s/level0.links//g; s/level1.links//g; s/level2.links//g; s/level3.links//g; s/sortednext//g; s/index.html/Homepage/g; s/ignore//g; s/linkline//g; s/multipleLinks//g; s/pagestocheck//g; s/pages.txt//g; s/startHere//g; s/wgetlog//g; s/wgetOutPut//g;' pagestocheck
460 | 
461 | if [ -s pagestocheck ]; then
462 | 	while read line; do
463 | 		if [ "$line"!='' ]; then
464 | 			cat "${line}" | grep "$3" > searchOut 
465 | 			if [ -s searchOut ]; then
466 | 				occurences=$(grep -o -i $3 "${line}" | wc -l)
467 | 				
468 | 				# Number of heading tags
469 | 				h1tags=$(cat ${line} |  grep "^<[Hh][1]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h1>//g; s/<strong>//g; s/\&nbsp\;//g' | wc -l)
470 | 				h2tags=$(cat ${line} |  grep "^<[Hh][2]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h2>//g; s/<strong>//g; s/\&nbsp\;//g' | wc -l)
471 | 				h3tags=$(cat ${line} |  grep "^<[Hh][3]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h3>//g; s/<strong>//g; s/\&nbsp\;//g' | wc -l)
472 | 				h4tags=$(cat ${line} |  grep "^<[Hh][4]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h4>//g; s/<strong>//g; s/\&nbsp\;//g' | wc -l)
473 | 				
474 | 				# Extract heading tags
475 | 				echo $line |  grep "^<[Hh][1]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h1>//g; s/<strong>//g; s/\&nbsp\;//g' > h1
476 | 				if [ -s h1 ]; then
477 | 					h1=$(cat h1)
478 | 					echo "Page:" $line "h1 tag:" $h1 >> headertags
479 | 					rm h1
480 | 				fi	
481 | 				echo $line | grep "^<[Hh][2]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h2>//g; s/<strong>//g; s/\&nbsp\;//g' > h2 
482 | 				if [ -s h2 ]; then
483 | 					h2=$(cat h2)
484 | 					echo "Page:" $line "h2 tag:" $h2 >> headertags
485 | 					rm h2
486 | 				fi
487 | 				echo $line | grep "^<[Hh][3]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h3>//g; s/<strong>//g; s/\&nbsp\;//g' > h3
488 | 				if [ -s h3 ]; then
489 | 					h3=$(cat h3)
490 | 					echo "Page:" $line "h3 tag:" $h3 >> headertags
491 | 					rm h3
492 | 				fi	
493 | 				echo $line | grep "^<[Hh][4]>" | sed -e 's/<span.*">//g; s/<\/span.*//g; s/<h4>//g; s/<strong>//g; s/\&nbsp\;//g' > h4
494 | 				if [ -s h4 ]; then
495 | 					h4=$(cat h4)
496 | 					echo "Page:" $line "h4 tag:" $h4 >> headertags
497 | 					rm h4
498 | 				fi		
499 | 
500 | 				echo "DIVID${line}DIVENDheader3-----------${line}-------------endhead3" | tee --append searchReturns 
501 | 				head -20 searchOut | cut -c1-500 | tee --append searchReturns 
502 | 				echo $line >> pagesWithSearchQuery
503 | 				echo ' ' >> searchReturns
504 | 				echo "$3 appears $occurences times" >> searchReturns
505 | 				echo "Number of h1 tags:" $h1tags >> searchReturns
506 | 				echo "Number of h2 tags:" $h2tags >> searchReturns
507 | 				# echo "header3GOTOHOMELINKendhead3" >> searchReturns
508 | 				instancesOfSearch=$((instancesOfSearch+occurences))
509 | 				if [ -s searchOut ]; then rm searchOut; fi
510 | 			fi
511 | 		fi
512 | 	done < pagestocheck
513 | fi
514 | 
515 | echo '<div class="Search" id="files">' >> $htmlOutput
516 | printf '<h2>Search Results:</h2>' >> $htmlOutput
517 | 
518 | if [ -s pagesWithSearchQuery ]; then
519 | 	echo "> Found ${instancesOfSearch} instances of ${3} in the following files:" | tee --append $htmlOutput
520 | 	echo "<p>" >> $htmlOutput
521 | 	#cat "$line" | tee --append $htmlOutput
522 | 	while read line; do
523 | 		echo $line | sed "s/[^ ][^ ]*/<a href='#&' class="linkButton">${line}<\/a>/g" | tee --append $htmlOutput
524 | 	done < pagesWithSearchQuery
525 | 	echo "</p>" >> $htmlOutput
526 | 	# Heading for contents from files with search query
527 | 	echo "</div><div class="Search" id="results"><h2>Contents from files with search query:</h2></br>" >> $htmlOutput
528 | else
529 | 	echo "> No instances of ${3} found" | tee --append $htmlOutput
530 | fi
531 | ##########################################################
532 | # Search Contents
533 | ##########################################################
534 | 
535 | # Show contents from files that have search query / Wrap search term in h3 tags to make it stand out
536 | cat searchReturns | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g; s/"//g; s/'"'"'//g; s/header3/<h3>/g; s/endhead3/<\/h3>/g; s/DIVID/<div id="/g; s/DIVEND/">/g; s/GOTOHOMELINK/<a href="#header">Go to top<\/a>/g' | sed -e "s#${3}#<searchTerm>${3}</searchTerm>#g" | perl -p -e 's/\n/<\/br>/' >> $htmlOutput
537 | 
538 | ##########################################################
539 | # 404's
540 | ##########################################################
541 | 
542 | printf '<h2>404s:</h2>' >> $htmlOutput
543 | printf '<div id="404s">' >> $htmlOutput
544 | 
545 | if [ -s 404s ]; then
546 | 	while read line; do
547 | 		echo "<p> ${line} </p>" >> $htmlOutput
548 | 	done < 404s
549 | else
550 | 	echo "<p> All good :) </p>" >> $htmlOutput
551 | fi
552 | 
553 | printf '</div>' >> $htmlOutput
554 | 
555 | ##########################################################
556 | # Heading Tags
557 | ##########################################################
558 | 
559 | printf '<h2>Heading Tags:</h2>' >> $htmlOutput
560 | printf '<div id="headings">' >> $htmlOutput
561 | 
562 | if [ -s headertags ]; then
563 | 	while read line; do
564 | 		printf '<p> %s </p>' $line >> $htmlOutput
565 | 	done < headertags
566 | else
567 | 	echo "<p>> No heading tags detected </p>" >> $htmlOutput
568 | fi
569 | 
570 | printf '</div>' >> $htmlOutput
571 | 
572 | ##########################################################
573 | # Show detected emails
574 | ##########################################################
575 | 
576 | printf '<h2>Email Addresses:</h2>' >> $htmlOutput
577 | printf '<div id="emails">' >> $htmlOutput
578 | 
579 | if [ -s emails ]; then
580 | 	
581 | 	sort emails > sortedemails
582 | 	uniq sortedemails > emails
583 | 
584 | 	while read line; do
585 | 		printf '<p> %s </p>' $line >> emailsdiv
586 | 	done < emails
587 | 
588 | 	rm sortedemails emails
589 | 
590 | 	if [ -s emailsdiv ]; then cat emailsdiv >> $htmlOutput;	fi
591 | else
592 | 	echo "<p>> No email addresses detected </p>" >> $htmlOutput
593 | fi
594 | 
595 | printf '</div>' >> $htmlOutput
596 | 
597 | ##########################################################
598 | # Other Files
599 | ##########################################################
600 | 
601 | printf '<h2>Other Files:</h2>' >> $htmlOutput
602 | printf '<div id="otherfiles">' >> $htmlOutput
603 | 
604 | if [ -s OtherFiles ]; then
605 | 	while read line; do
606 | 		echo ${line} | sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g; s/"//g; s/'"'"'//g;' > strippedfilelink
607 | 		strippedfilelink=$(cat strippedfilelink)
608 | 	 	rm strippedfilelink
609 | 		echo "<p> ${strippedfilelink} </p>" >> $htmlOutput
610 | 	done < OtherFiles
611 | else
612 | 	echo "<p>> No additional files (docx, doc, pdf, exe, zip etc.) detected </p>" >> $htmlOutput
613 | fi
614 | 
615 | printf '</div>' >> $htmlOutput
616 | 
617 | ##########################################################
618 | # Link Structure (Emails and 404s; links)
619 | ##########################################################
620 | 
621 | printf '<h2>Link structure:</h2>' >> $htmlOutput
622 | printf '<div id="structure">' >> $htmlOutput
623 | printf '<script>\r\nvar links = [' >> $htmlOutput
624 | 
625 | chartHeight=$(wc -l $structurePages | sed -e "s/${structurePages}//g")
626 | chartHeight=$(expr $chartHeight \* 9)
627 | 
628 | # Create data structure
629 | while read line; do
630 | 	trimmedLine=$(echo $line | sed -e 's/${project}//g')
631 | 	is404=$(echo $trimmedLine | awk '{print $2}')
632 | 	level1=$(echo $trimmedLine | awk '{print $1}' | sed -e 's,'"$1"',$,g')
633 | 	level2=$(echo $trimmedLine | awk '{print $3}' | sed -e 's,'"$1"',$,g')
634 | 	isEmail=$(echo $trimmedLine | awk '{print $2}') 
635 | 	if [[ "$level2" != '' ]]; then
636 | 		# is it a 404?
637 | 		if [[ "$is404" == "is" ]]; then
638 | 			printf '{source: "%s", target: "404", type: "404"},\r\n' $level1 >> $htmlOutput
639 | 		# is it an email?
640 | 		elif [[ "$isEmail" == "email" ]]; then
641 | 			printf '{source: "%s", target: "%s", type: "email"},\r\n' $level1 $level2 >> $htmlOutput
642 | 		else
643 | 		# must be a link
644 | 			# is it internal?
645 | 			echo $level2 > lineFile
646 | 			if grep -q "$" lineFile; then
647 | 				printf '{source: "%s", target: "%s", type: "ahref"},\r\n' $level1 $level2 >> $htmlOutput
648 | 			fi
649 | 			rm lineFile
650 | 		fi
651 | 	fi
652 | done < $structurePages
653 | 
654 | # Create chart
655 | printf '];\r\n' >> $htmlOutput
656 | printf 'var nodes = {};\r\nlinks.forEach(function(link) {\r\nlink.source = nodes[link.source] || (nodes[link.source] = {name: link.source});\r\nlink.target = nodes[link.target] || (nodes[link.target] = {name: link.target});\r\n});\r\nvar width = 1200,\r\nheight = %d;\r\nvar force = d3.layout.force()    \r\n.nodes(d3.values(nodes))    \r\n.links(links)    \r\n.size([width, height])    \r\n.linkDistance(50)    \r\n.charge(-200)    \r\n.on("tick", tick)    \r\n.start();\r\nvar svg = d3.select("body").append("svg")    \r\n.attr("width", width)    \r\n.attr("height", height);\r\nsvg.append("defs").selectAll("marker")    \r\n.data(["404", "ahref", "email"])  \r\n.enter().append("marker")    \r\n.attr("id", function(d) { return d; })    \r\n.attr("viewBox", "0 -5 10 10")    \r\n.attr("refX", 10)    \r\n.attr("refY", -1.5)    \r\n.attr("markerWidth", 6)    \r\n.attr("markerHeight", 6)    \r\n.attr("orient", "auto")  \r\n.append("path")    \r\n.attr("d", "M0,-5L10,0L0,5");\r\nvar path = svg.append("g").selectAll("path")    \r\n.data(force.links())  \r\n.enter().append("path")    \r\n.attr("class", function(d) { return "link " + d.type; })    \r\n.attr("marker-end", function(d) { return "url(#" + d.type + ")"; });\r\nvar circle = svg.append("g").selectAll("circle")    \r\n.data(force.nodes())  \r\n.enter().append("circle")    \r\n.attr("r", 6)    \r\n.call(force.drag);\r\nvar text = svg.append("g").selectAll("text")    \r\n.data(force.nodes())  \r\n.enter().append("text")    \r\n.attr("x", 8)    \r\n.attr("y", ".12em")    \r\n.text(function(d) { return d.name; });\r\nfunction tick() {  \r\npath.attr("d", linkArc);  \r\ncircle.attr("transform", transform);  \r\ntext.attr("transform", transform);\r\n}\r\nfunction linkArc(d) {  \r\nvar dx = d.target.x - d.source.x,      \r\ndy = d.target.y - d.source.y,      \r\ndr = Math.sqrt(dx * dx + dy * dy);  \r\nreturn "M" + d.source.x + "," + d.source.y + "A" + dr + "," + dr + " 0 0,1 " + d.target.x + "," + d.target.y;\r\n}\r\nfunction transform(d) {  \r\nreturn "translate(" + d.x + "," + d.y + ")";\r\n}\r\n</script>' $chartHeight >> $htmlOutput
657 | 
658 | printf '</div></div>' >> $htmlOutput
659 | 
660 | printf '<script async src="https://cse.google.com/cse.js?cx=003366497132526601933:yjjgyhat87y"></script>
661 | <div class="gcse-search"></div>' >> $htmlOutput
662 | 
663 | 
664 | ##########################################################
665 | # Clean up
666 | ##########################################################
667 | 
668 | # Close html tags
669 | echo "</div></body></html>" >> $htmlOutput
670 | if [ -s searchReturns ]; then
671 | 	rm searchReturns pagesWithSearchQuery
672 | fi
673 | 
674 | rm hasHash ignore linkline multipleLinks pagestocheck startHere wgetOutPut
675 | 
676 | # Turn off refresh
677 | sed -i 's/<meta http-equiv="refresh" content="2"/<meta /g' $htmlOutput  
678 | 
679 | ##########################################################
680 | ##########################################################
681 | 


--------------------------------------------------------------------------------