├── README.md └── websitesearch.sh /README.md: -------------------------------------------------------------------------------- 1 | # wesbitesearch 2 | ## Website search bash script for SEO comparisons / OSINT. 3 | Ignores robots.txt, goes slowly so has the side effect of making some analytics software like Google Analytics think this is organic traffic. 4 | 5 | ## Current version 1.6 6 | 7 | ### Usage: 8 | ./websitesearch.sh url depth searchquery urllist 9 | * URL: https:// or http:// url. Note, does not cope well with websites that redirect www. to root 10 | * Depth: Number of levels of links that should be followed 11 | * Search Query: Will search pages for a string and return pages with matches. i.e. passwords, secrets, SEO comparisons 12 | * URLList: Replaces first level of scraping with a file that is a list of URLs 13 | 14 | ### Features: 15 | * Progress indicator renders on html output while in progress, this is a bit glitchy but functional 16 | * Scrapes website to specified depth 17 | * Returns location of web server on google map 18 | * Shows page structure and location of email addresses with js plugin (visualising link structure) 19 | * Shows h1, h2, h3 tags on pages that contain search term 20 | * Returns emails from all pages 21 | * Recent projects appear in bottom menu 22 | * Does not follow links outisde the chosen domain (this is a feature not a bug) 23 | * Outputs debug file for error checking 24 | 25 | ### Todo: 26 | * Config file for API keys etc. 27 | * Improve comamnd line args so that search queries etc. are not needed, set default depth for quick search 28 | * Edge link chart for visualising link structure 29 | * Allow comparison searches for the same domain 30 | * Allow specific output html file (not hardcoded) 31 | * Text based output if required 32 | 33 | ### If i can be bothered: 34 | * Move scraping function to python script and allow multithreading to improve speed 35 | 36 | ------------------------------------------- 37 | -------------------------------------------------------------------------------- /websitesearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ######################################################## 3 | # websitesearch.sh 4 | # Version 1.6 5 | ######################################################## 6 | # Usage: 7 | # websitesearch.sh url depth searchquery urllist 8 | # 9 | # url = $1 10 | # depth = $2 11 | # searchquery = $3 12 | # urllist = $4 13 | # 14 | 15 | breakloop () 16 | { 17 | exitloop=1 18 | } 19 | 20 | trap breakloop SIGINT SIGTERM 21 | 22 | function ProgressBar { 23 | let _progress=(${1}*100/${2}*100)/100 24 | let _done=(${_progress}*4)/10 25 | let _left=40-$_done 26 | _fill=$(printf "%${_done}s") 27 | _empty=$(printf "%${_left}s") 28 | 29 | printf "\rProgress : [${_fill// /\#}${_empty// /-}] ${_progress}%%" | tee progressbar 30 | progressbarString=$(cat progressbar) 31 | 32 | # Update progress % on html output file 33 | sed -i "s/Progress.*/${progressbarString}/g" $htmlOutput 34 | 35 | } 36 | 37 | ######################################################### 38 | # Check command line args 39 | ######################################################### 40 | 41 | if [ "$#" -eq 4 ]; then 42 | if [ -s $4 ]; then 43 | listgiven=1 44 | cp $4 level0.links 45 | depth=depth+1 46 | else 47 | echo "Error: url list file does not exist" >&2; exit 1 48 | fi 49 | fi 50 | 51 | if [ "$#" -gt 3 ]; then 52 | searchquery=$3 53 | #searchquery=$(echo $3 | sed -e "s/'/'\\\\''/g; 1s/^/'/; \$s/\$/'/") 54 | if [ $searchquery = '' ]; then 55 | echo "Error: search query is invalid" >&2; exit 1 56 | fi 57 | fi 58 | 59 | if [ "$#" -gt 2 ]; then 60 | re='^[0-9]+$' 61 | if ! [[ $2 =~ $re ]]; then 62 | echo "Error: Depth is not a number" >&2; exit 1 63 | else 64 | depth=$2 65 | fi 66 | fi 67 | 68 | if [ "$#" -eq 0 ]; then 69 | echo "Usage: websitesearch url depth searchquery urllist" >&2; exit 1 70 | fi 71 | 72 | if [ "$#" -gt 1 ]; then 73 | re='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]' 74 | if [[ $1 =~ $re ]]; then 75 | 76 | #Make a folder for the project files 77 | project=$(echo $1 | sed -e 's/https:\/\///g' -e 's/\///g' -e 's/http:\/\///g') 78 | mkdir -p $project 79 | cd $project 80 | touch pages.txt 81 | printf "\r\n" 82 | 83 | ## Add in HTML Output feature 84 | ## Modify the htmlOutput variable to change the location of the output file, will add a command line parameter for this in future 85 | htmlName=$(echo $project | sed -e 's/\/\(.*\)./\1/') 86 | htmlOutput=$(echo "/var/www/html/$htmlName.html") 87 | echo "html file is called: $htmlName" 88 | dateStarted=$(date +"%m-%d-%Y") 89 | 90 | ## Add in HTML Header and styles 91 | printf '\r\n
\n' > $htmlOutput 92 | echo "" > geodatadiv 171 | printf "
" >> $htmlOutput 191 | echo geodatadiv 192 | cat geodatadiv >> $htmlOutput 193 | echo "
" >> $htmlOutput 194 | 195 | # Put marker on google map 196 | ## NOTE!!! See google maps API key at the end of this line, you'll need to add yours in here for this to work. 197 | printf "\r\n \r\n \r\n" >> $htmlOutput 198 | 199 | # Close the div 200 | echo "> Please wait...
\r\n' $project) 285 | echo $pleasewait >> $htmlOutput 286 | startingscan=$(printf '> Progress: starting scan
\r\n' $project) 287 | sed -i -e "s#${pleasewait}#${startingscan}#g" $htmlOutput 288 | 289 | # Replace whitespace with new line character on links file to resolve multiple links on single line issue 290 | sed -i -e 's/ /\n/g' $linkfile 291 | 292 | while read line; do 293 | 294 | strippedline=$(echo $line | sed -e 's#.*http://##g' | sed -e 's#.*https://##g') 295 | 296 | if [[ "$exitloop" == 1 ]]; then break; fi 297 | 298 | #printf "\r\n========= Getting %4d of %4d =========\n" $noLinks $noLevelZeroLinks 299 | ProgressBar ${noLinks} ${_end} 300 | 301 | # Check if page has already been crawled 302 | if grep -q "$line" checkedpages; then 303 | echo "Page" $line "already checked" >> debug.txt 304 | else 305 | echo $line | grep "#" > hasHash 306 | #echo $line | grep -e '^.*\.comhttp.*' > multipleLinks 307 | #echo $line | grep -e '^.*\.nethttp.*' >> multipleLinks 308 | echo $line | grep -e '^.*(\.nethttp|\.comhttp|\.auhttp|\/htt).*' >> multipleLinks 309 | 310 | if [ -s hasHash ] || [ -s multipleLinks ]; then 311 | echo $line "has # in it, or has external link" >> debug.txt 312 | rm hasHash multipleLinks 313 | else 314 | 315 | # Remove // from any url's 316 | echo $line | sed -e 's#///#/#2' > downloadme 317 | sed -i -e 's#//#/#g' downloadme 318 | sed -i -e 's#http:#http:/#g' downloadme 319 | sed -i -e 's#https:#https:/#g' downloadme 320 | downloadme=$(cat downloadme) 321 | 322 | # Check for specific file types to exclude from downloading 323 | grep -E "^.*(\.exe|\.zip|\.pdf|\.tar|\.gz|\.doc|\.docx)" downloadme > ignore 324 | rm downloadme 325 | 326 | if [ -s ignore ]; then 327 | echo $downloadme "contains ignored filetype" >> debug.txt 328 | echo "$downloadme contains ignore filetype
" >> otherFiles 329 | rm ignore 330 | else 331 | echo "Getting" $downloadme >> debug.txt 332 | wget -nc --timeout=1 --tries=1 -o wgetOutPut $downloadme 333 | fi 334 | 335 | baseFile=$(basename "$downloadme") 336 | 337 | if [ "$baseFile" == '#' ]; then 338 | baseFile='index' 339 | fi 340 | 341 | #Output if page does not exist; ADD IN which page references this? 342 | if grep -q "404" wgetOutPut; then echo $downloadme "is 404" >> debug.txt 343 | echo "$downloadme is 404" >> 404s 344 | echo "$downloadme is 404" >> pages.txt 345 | 346 | else 347 | if [ -s "$baseFile" ]; then 348 | #echo "basefile exists" 349 | pages=1 350 | 351 | numberOfLinesBase=$(wc -l $baseFile | sed -e "s/$baseFile//g" ) 352 | 353 | while read htmlLine; do 354 | if [[ "$exitloop" == 1 ]]; then break; fi 355 | 356 | # Detect email addresses 357 | shopt -s extglob 358 | email='' 359 | email=$(echo $htmlLine | grep mailto | sed -e 's/*mailto:\(.*\)\s/\1/' | sed -e 's/^.*mailto://' | sed -e 's/".*$//' | sed -e 's/'\''.*$//') 360 | echo "$email" >> emails 361 | echo $downloadme "email" $email >> pages.txt 362 | 363 | # Detect soft 404s 364 | echo $htmlLine | grep "404" > isSoft404 365 | if [[ -s isSoft404 ]]; then echo "> $downloadme might be soft 404" >> 404s; 366 | else 367 | echo $htmlLine | grep -o '' | sed -e 's/ linkline 368 | cat linkline | grep "https" > isFullPath 369 | cat linkline | grep "http" >> isFullPath 370 | 371 | # Replace empty space \s in line with new line character \n to separate multiple links on a line 372 | sed -i -e 's#\s#\n#g' linkline 373 | 374 | if [[ -n linkline ]]; then 375 | if [[ linkline != *"#"* ]]; then 376 | if [ -s isFullPath ]; then 377 | link=$(cat linkline) 378 | echo $link >> $nextlevel 379 | echo $htmlLine | grep "$3" > containsSearchQuery 380 | echo $htmlLine | grep "$project" > isInternalLink 381 | if [[ -s containsSearchQuery && -s isInternalLink ]]; then 382 | echo $downloadme "references:" $link >> pages.txt 383 | fi 384 | rm containsSearchQuery isInternalLink 385 | # html output 386 | #echo "" $downloadme "references:" $link >> $htmlOutput 387 | echo $downloadme >> checkedpages 388 | rm isFullPath 389 | else 390 | cat linkline | grep "/" > hasslash 391 | if [ -s hasslash ]; then 392 | sed -i -e "s#^#${1}#" linkline 393 | rm hasslash 394 | else 395 | sed -i -e "s#^#${1}#" linkline 396 | fi 397 | 398 | link=$(cat linkline) 399 | echo $link >> $nextlevel 400 | # If downloaded page contains the search query, add to pages.txt 401 | # Make the graph more useful 402 | echo $htmlLine | grep "$3" > containsSearchQuery 403 | echo $htmlLine | grep "$project" > isInternalLink 404 | if [[ -s containsSearchQuery && -s isInternalLink ]]; then 405 | echo $downloadme "references:" $link >> pages.txt 406 | fi 407 | rm containsSearchQuery isInternalLink 408 | #html output 409 | #echo ' '${baseFile}' references: '${link}'' >> ${htmlOutput} 410 | echo $downloadme >> checkedpages 411 | fi 412 | sort pages.txt > upages.txt 413 | uniq upages.txt > pages.txt 414 | sort $nextlevel > sortednext 415 | uniq sortednext > $nextlevel 416 | #uniqpages=$(wc -l ../pages.txt) 417 | #printf "\r==== Checked %6d of %6d lines ====" $pages $numberOfLinesBase 418 | pages=$((pages+1)) 419 | sed -i '/^$/d' $nextlevel 420 | rm linkline 421 | fi 422 | fi 423 | fi 424 | done < $baseFile 425 | fi 426 | fi 427 | fi 428 | fi 429 | noLinks=$((noLinks+1)) 430 | #rm baseFile isFullPath startHere 431 | done < $linkfile 432 | fi 433 | 434 | # Iterate $n to move through depth levels 435 | (( n=n+1 )) 436 | 437 | # Time taken to scan depth $n 438 | endTime=`date +%s` 439 | runTime=$((endTime-startTime)) 440 | printf '\r\n' 441 | printf '> Completed scan depth %d in %d seconds
' $n $runTime >> $htmlOutput 442 | done 443 | 444 | if [[ $exitloop == 1 ]]; then printf '\r\n> Search terminated by user
' >> $htmlOutput; fi 445 | 446 | cp pages.txt /var/www/html/pages.html 447 | 448 | ######################################################### 449 | # Search query 450 | ######################################################### 451 | 452 | echo "Doing a search now..." 453 | 454 | instancesOfSearch=0 455 | 456 | #ls | grep -P ".[0-9]{1}" | xargs -d"\n" rm 457 | ls > pagestocheck 458 | 459 | sed -i 's/404s//g; s/emails//g; s/headertags//g; s/checkedpages//g; s/\s//g; s/debug.txt//g; s/errors.txt//g; s/hasHash//g; s/level0.links//g; s/level1.links//g; s/level2.links//g; s/level3.links//g; s/sortednext//g; s/index.html/Homepage/g; s/ignore//g; s/linkline//g; s/multipleLinks//g; s/pagestocheck//g; s/pages.txt//g; s/startHere//g; s/wgetlog//g; s/wgetOutPut//g;' pagestocheck 460 | 461 | if [ -s pagestocheck ]; then 462 | while read line; do 463 | if [ "$line"!='' ]; then 464 | cat "${line}" | grep "$3" > searchOut 465 | if [ -s searchOut ]; then 466 | occurences=$(grep -o -i $3 "${line}" | wc -l) 467 | 468 | # Number of heading tags 469 | h1tags=$(cat ${line} | grep "^<[Hh][1]>" | sed -e 's/" >> $htmlOutput 521 | #cat "$line" | tee --append $htmlOutput 522 | while read line; do 523 | echo $line | sed "s/[^ ][^ ]*/${line}<\/a>/g" | tee --append $htmlOutput 524 | done < pagesWithSearchQuery 525 | echo "
" >> $htmlOutput 526 | # Heading for contents from files with search query 527 | echo "