├── README.md └── websitesearch.sh /README.md: -------------------------------------------------------------------------------- 1 | # websitesearch 2 | ## Website search bash script for SEO comparisons / OSINT. 3 | Ignores robots.txt and crawls slowly, which has the side effect of making some analytics software, such as Google Analytics, treat the requests as organic traffic. 4 | 5 | ## Current version 1.6 6 | 7 | ### Usage: 8 | ./websitesearch.sh url depth searchquery urllist 9 | * URL: https:// or http:// url. Note: does not cope well with websites that redirect www. to the root domain 10 | * Depth: Number of levels of links that should be followed 11 | * Search Query: Searches pages for a string and returns the pages with matches, e.g. passwords, secrets, SEO comparisons 12 | * URLList: Replaces the first level of scraping with a file containing a list of URLs 13 |
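For example, a depth-2 crawl searching pages for the word password (example.com and urls.txt below are placeholders, not part of the repo):

./websitesearch.sh https://example.com 2 password

./websitesearch.sh https://example.com 2 password urls.txt

The second form seeds level 0 of the crawl from urls.txt rather than from the links found on the start page.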
\$s/\$/'/") 54 | if [ $searchquery = '' ]; then 55 | echo "Error: search query is invalid" >&2; exit 1 56 | fi 57 | fi 58 | 59 | if [ "$#" -gt 2 ]; then 60 | re='^[0-9]+$' 61 | if ! [[ $2 =~ $re ]]; then 62 | echo "Error: Depth is not a number" >&2; exit 1 63 | else 64 | depth=$2 65 | fi 66 | fi 67 | 68 | if [ "$#" -eq 0 ]; then 69 | echo "Usage: websitesearch url depth searchquery urllist" >&2; exit 1 70 | fi 71 | 72 | if [ "$#" -gt 1 ]; then 73 | re='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]' 74 | if [[ $1 =~ $re ]]; then 75 | 76 | #Make a folder for the project files 77 | project=$(echo $1 | sed -e 's/https:\/\///g' -e 's/\///g' -e 's/http:\/\///g') 78 | mkdir -p $project 79 | cd $project 80 | touch pages.txt 81 | printf "\r\n" 82 | 83 | ## Add in HTML Output feature 84 | ## Modify the htmlOutput variable to change the location of the output file, will add a command line parameter for this in future 85 | htmlName=$(echo $project | sed -e 's/\/\(.*\)./\1/') 86 | htmlOutput=$(echo "/var/www/html/$htmlName.html") 87 | echo "html file is called: $htmlName" 88 | dateStarted=$(date +"%m-%d-%Y") 89 | 90 | ## Add in HTML Header and styles 91 | printf '\r\n\n' > $htmlOutput 92 | echo "Websitesearch $htmlName $dateStarted" >> $htmlOutput 93 | # Auto-refresh page 94 | printf '\n' >> $htmlOutput 95 | # CSS 96 | 97 | # Google maps 98 | printf '' >> $htmlOutput 99 | 100 | # Header menu 101 | printf '' >> $htmlOutput 103 | 104 | # Footer menu for recent projects 105 | printf '\r\n' >> $htmlOutput 106 | 107 | # Buttons for pages 108 | printf '\r\n' >> $htmlOutput 109 | 110 | # CSS for filename heading 111 | printf '\r\n' >> $htmlOutput 112 | 113 | # Google fonts 114 | printf '\n\n' >> $htmlOutput 115 | printf '' >> $htmlOutput 116 | 117 | # d3 Graph for link structure (emails, 404s) 118 | printf '' >> $htmlOutput 119 | printf '\r\n\r\n' >> $htmlOutput 120 | structurePages=$(echo pages.txt) 121 | 122 | # Edge point graph for link structure 123 | #printf '\r\n' >> $htmlOutput 124 | 125 | # Menu items (header) 126 | printf '\n\n\r\r\n\r\n
36 | ------------------------------------------- 37 | -------------------------------------------------------------------------------- /websitesearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ######################################################## 3 | # websitesearch.sh 4 | # Version 1.6 5 | ######################################################## 6 | # Usage: 7 | # websitesearch.sh url depth searchquery urllist 8 | # 9 | # url = $1 10 | # depth = $2 11 | # searchquery = $3 12 | # urllist = $4 13 | # 14 | 15 | breakloop () 16 | { 17 | exitloop=1 18 | } 19 | 20 | trap breakloop SIGINT SIGTERM 21 | 22 | function ProgressBar { 23 | let _progress=(${1}*100/${2}*100)/100 24 | let _done=(${_progress}*4)/10 25 | let _left=40-$_done 26 | _fill=$(printf "%${_done}s") 27 | _empty=$(printf "%${_left}s") 28 | 29 | printf "\rProgress : [${_fill// /\#}${_empty// /-}] ${_progress}%%" | tee progressbar 30 | progressbarString=$(cat progressbar) 31 | 32 | # Update progress % on html output file 33 | sed -i "s/Progress.*/${progressbarString}/g" $htmlOutput 34 | 35 | } 36 | 37 | ######################################################### 38 | # Check command line args 39 | ######################################################### 40 | 41 | if [ "$#" -eq 4 ]; then 42 | if [ -s "$4" ]; then 43 | listgiven=1 44 | cp "$4" level0.links 45 | depth=$((depth+1)) 46 | else 47 | echo "Error: url list file does not exist" >&2; exit 1 48 | fi 49 | fi 50 | 51 | if [ "$#" -gt 3 ]; then 52 | searchquery=$3 53 | #searchquery=$(echo $3 | sed -e "s/'/'\\\\''/g; 1s/^/'/; \$s/\$/'/") 54 | if [ "$searchquery" = '' ]; then 55 | echo "Error: search query is invalid" >&2; exit 1 56 | fi 57 | fi 58 | 59 | if [ "$#" -gt 2 ]; then 60 | re='^[0-9]+$' 61 | if ! [[ $2 =~ $re ]]; then 62 | echo "Error: Depth is not a number" >&2; exit 1 63 | else 64 | depth=$2 65 | fi 66 | fi 67 | 68 | if [ "$#" -eq 0 ]; then 69 | echo "Usage: websitesearch url depth searchquery urllist" >&2; exit 1 70 | fi 71 | 72 | if [ "$#" -gt 1 ]; then 73 | re='(https?|ftp|file)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]' 74 | if [[ $1 =~ $re ]]; then 75 | 76 | #Make a folder for the project files 77 | project=$(echo $1 | sed -e 's/https:\/\///g' -e 's/\///g' -e 's/http:\/\///g') 78 | mkdir -p "$project" 79 | cd "$project" 80 | touch pages.txt 81 | printf "\r\n" 82 | 83 | ## Add in HTML Output feature 84 | ## Modify the htmlOutput variable to change the location of the output file, will add a command line parameter for this in future 85 | htmlName=$(echo $project | sed -e 's/\/\(.*\)./\1/') 86 | htmlOutput="/var/www/html/$htmlName.html" 87 | echo "html file is called: $htmlName" 88 | dateStarted=$(date +"%m-%d-%Y") 89 | 90 | ## Add in HTML Header and styles 91 | printf '\r\n\n' > $htmlOutput 92 | echo "Websitesearch $htmlName $dateStarted" >> $htmlOutput 93 | # Auto-refresh page 94 | printf '\n' >> $htmlOutput 95 | # CSS 96 | 97 | # Google maps 98 | printf '' >> $htmlOutput 99 | 100 | # Header menu 101 | printf '' >> $htmlOutput 103 | 104 | # Footer menu for recent projects 105 | printf '\r\n' >> $htmlOutput 106 | 107 | # Buttons for pages 108 | printf '\r\n' >> $htmlOutput 109 | 110 | # CSS for filename heading 111 | printf '\r\n' >> $htmlOutput 112 | 113 | # Google fonts 114 | printf '\n\n' >> $htmlOutput 115 | printf '' >> $htmlOutput 116 | 117 | # d3 Graph for link structure (emails, 404s) 118 | printf '' >> $htmlOutput 119 | printf '\r\n\r\n' >> $htmlOutput 120 | structurePages="pages.txt" 121 | 122 | # Edge point graph for link structure 123 | #printf '\r\n' >> $htmlOutput 124 | 125 | # Menu items (header) 126 | printf '\n\n\r\r\n\r\n
' $dateStarted $2 >> $htmlOutput 127 | 128 | # Menu items (footer) 129 | printf '' >> $htmlOutput 130 | # Insert current project as active list element 131 | printf '
  • %s
  • \r\n' $project $project >> $htmlOutput 132 | # Get files in webroot 133 | 134 | ls /var/www/html/*.html > projectlist 135 | sed -i -e 's/\/var\/www\/html\///g' projectlist 136 | # remove specific files from projectlist 137 | sed -i -e 's/pages.html//g' projectlist 138 | 139 | while read line; do 140 | if [[ "${line}" != "${project}.html" ]]; then 141 | label=$(echo $line | sed -e 's/\.html//g') 142 | printf '
  • %s
  • ' $line $label >> $htmlOutput 143 | fi 144 | done < projectlist 145 | 146 | printf '
    ' >> $htmlOutput 147 | 148 | # Get the starting (root) file 149 | wget -o wgetlog -O startHere $1 150 | indexfile="startHere" 151 | grep "failed" wgetlog > errors 152 | if [ -s errors ]; then 153 | rm wgetlog errors 154 | cd .. && rm -Rf "$project" 155 | echo "Error: URL is invalid or down, please check" >&2; exit 1 156 | elif [[ $3 != '' ]]; then 157 | echo "Checking" $1 "for references to" $3 "at a depth of" $2 158 | else 159 | echo "Checking" $1 160 | fi 161 | else 162 | echo "Error: URL is invalid" >&2; exit 1 163 | fi 164 | fi 165 | 166 | ######################################################### 167 | # Geo-location of server 168 | ######################################################### 169 | geodata="geodata" 170 | printf "
    \r\n

    " > geodatadiv 171 | printf "

    \r\nServer Geo Information

    " >> geodatadiv 172 | server=$(echo ${1} | sed -e 's/^.*\:\/\///g') 173 | #echo "Server:" $server 174 | serverip=$(nslookup $server | grep "Address:" | tail -1 | sed -e 's/Address\://g' | sed -e 's/\#.*$//g' | sed -e 's/\s//g') 175 | ipinfo=`curl -s https://ipinfo.io/${serverip}` 176 | echo "ServerIP:" $serverip 177 | 178 | # Get geo data from IP 179 | curl -s https://ipvigilante.com/${serverip} | jq '.data.latitude, .data.longitude, .data.city_name, .data.country_name' | \ 180 | while read -r LATITUDE; do 181 | read -r LONGITUDE 182 | read -r CITY 183 | read -r COUNTRY 184 | echo "${LATITUDE},${LONGITUDE},${CITY},${COUNTRY}" | tr --delete \" > ${geodata} 185 | done 186 | cat geodata >> geodatadiv 187 | lat=$(cat geodata | awk -F, '{print $1}') 188 | long=$(cat geodata | awk -F, '{print $2}') 189 | rm geodata 190 | echo "

    " >> $htmlOutput 191 | echo geodatadiv 192 | cat geodatadiv >> $htmlOutput 193 | echo "

    " >> $htmlOutput 194 | 195 | # Put marker on google map 196 | ## NOTE!!! See google maps API key at the end of this line, you'll need to add yours in here for this to work. 197 | printf "
    \r\n \r\n \r\n" >> $htmlOutput 198 | 199 | # Close the div 200 | echo "

    " >> $htmlOutput 201 | 202 | ######################################################### 203 | # Scrape level 0 204 | ######################################################### 205 | 206 | sizeoffile=$(wc -l "$indexfile") 207 | 208 | _start=1 209 | _end=$sizeoffile 210 | n=1 211 | 212 | startTime=`date +%s` 213 | 214 | if [ $listgiven!=1 ]; then 215 | while read line; do 216 | ProgressBar ${n} ${_end} 217 | echo $line | grep -o '' | sed -e 's/ linkline 218 | cat linkline | grep $1 > isFullPath 219 | if [ -s isFullPath ]; then 220 | cat linkline >> level0.links 221 | linkline=$(cat linkline) 222 | if [[ ! -z "$linkline" ]]; then echo $1 "references:" $linkline >> pages.txt; fi 223 | rm isFullPath 224 | else 225 | sed -i -e "s#^#${1}#" linkline 226 | cat linkline >> level0.links 227 | linkline=$(cat linkline) 228 | if [[ ! -z "$linkline" ]]; then echo $1 "references:" $linkline >> pages.txt; fi 229 | fi 230 | rm linkline 231 | n=$((n+1)) 232 | done < $indexfile 233 | fi 234 | 235 | sort level0.links > level0sorted 236 | uniq level0sorted > level0.links 237 | rm level0sorted 238 | 239 | endTime=`date +%s` 240 | runTime=$((endTime-startTime)) 241 | 242 | printf "\r\n" 243 | echo " Completed in ${runTime} seconds" 244 | 245 | ######################################################### 246 | # Scrape deeper levels 247 | ######################################################### 248 | n=0 249 | touch checkedpages 250 | touch level1.links 251 | touch level2.links 252 | touch level3.links 253 | 254 | printf '