├── tests
│   ├── urls_redirect_4.txt
│   ├── test.sh
│   └── urls_valid_4.txt
├── .travis.yml
├── LICENSE.md
├── Makefile
├── README.md
└── linkcheck

/tests/urls_redirect_4.txt:
--------------------------------------------------------------------------------
1 | # Redirect URLs
2 | https://google.com
3 | https://yahoo.com
4 | https://bing.com
5 | https://www.duckduckgo.com
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 |
 3 | ###
 4 | ### Enable sudo (required for docker service)
 5 | ###
 6 | sudo: required
 7 |
 8 |
 9 | ###
10 | ### Language
11 | ###
12 | language: minimal
13 |
14 |
15 | ###
16 | ### Lint
17 | ###
18 | before_script:
19 |   - make lint
20 |
21 |
22 | ###
23 | ### Test
24 | ###
25 | script:
26 |   - make test
27 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 |
 3 | Copyright (c) 2018 cytopia
 4 |
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | ifneq (,)
 2 | .error This Makefile requires GNU Make.
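# Note: GNU Make evaluates 'ifneq (,)' (two empty strings) as false and skips
# the .error directive above; any other make implementation fails to parse
# this GNU-specific conditional and aborts, which is the intended guard.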
 3 | endif
 4 |
 5 | # -------------------------------------------------------------------------------------------------
 6 | # Default configuration
 7 | # -------------------------------------------------------------------------------------------------
 8 | .PHONY: help test lint
 9 |
10 |
11 | # -------------------------------------------------------------------------------------------------
12 | # Default Target
13 | # -------------------------------------------------------------------------------------------------
14 | help:
15 | 	@printf "%s\n" "make test      Test linkcheck"
16 | 	@printf "%s\n" "make lint      Lint source files with shellcheck"
17 | 	@printf "%s\n" "make help      Show help"
18 |
19 |
20 | # -------------------------------------------------------------------------------------------------
21 | # Targets
22 | # -------------------------------------------------------------------------------------------------
23 | lint:
24 | 	docker run --rm -v $(PWD):/mnt koalaman/shellcheck:stable --shell=bash linkcheck
25 |
26 | test:
27 | 	./tests/test.sh
28 |
--------------------------------------------------------------------------------
/tests/test.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 |
 3 | # Be strict
 4 | set -e
 5 | set -u
 6 | set -o pipefail
 7 |
 8 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 9 |
10 |
11 | ###
12 | ### Find exactly 4 unique URLs
13 | ###
14 | "${DIR}/../linkcheck" -r 10 -t 30 "${DIR}/urls_valid_4.txt"
15 | echo
16 |
17 | if [ "$( "${DIR}/../linkcheck" -r 10 -t 30 "${DIR}/urls_valid_4.txt" | grep -cF '[OK]' )" -eq "4" ]; then
18 | 	echo "[OK]"
19 | 	echo
20 | else
21 | 	>&2 echo "FAILED: Not exactly 4 successful URLs found."
22 | 	exit 1
23 | fi
24 |
25 |
26 | ###
27 | ### Find exactly 4 redirecting URLs and fail
28 | ###
29 | "${DIR}/../linkcheck" -r 10 -t 30 "${DIR}/urls_redirect_4.txt" || true
30 | echo
31 |
32 | if [ "$( "${DIR}/../linkcheck" -r 10 -t 30 "${DIR}/urls_redirect_4.txt" | grep -cF '[ERR]' )" -eq "4" ]; then
33 | 	echo "[OK]"
34 | 	echo
35 | else
36 | 	>&2 echo "FAILED: Not exactly 4 redirecting URLs found."
37 | 	exit 1
38 | fi
39 |
40 |
41 | ###
42 | ### Find exactly 4 redirecting URLs and allow
43 | ###
44 | "${DIR}/../linkcheck" -r 10 -t 30 -c '200,301,302' "${DIR}/urls_redirect_4.txt" || true
45 | echo
46 |
47 | if [ "$( "${DIR}/../linkcheck" -r 10 -t 30 -c '200,301,302' "${DIR}/urls_redirect_4.txt" | grep -cF '[OK]' )" -eq "4" ]; then
48 | 	echo "[OK]"
49 | 	echo
50 | else
51 | 	>&2 echo "FAILED: Not exactly 4 redirecting URLs found."
52 | 	exit 1
53 | fi
54 |
55 |
56 | ###
57 | ### Find exactly 4 redirecting URLs and follow them
58 | ###
59 | "${DIR}/../linkcheck" -l -r 10 -t 30 "${DIR}/urls_redirect_4.txt"
60 | echo
61 |
62 | if [ "$( "${DIR}/../linkcheck" -l -r 10 -t 30 "${DIR}/urls_redirect_4.txt" | grep -cF '[OK]' )" -eq "4" ]; then
63 | 	echo "[OK]"
64 | 	echo
65 | else
66 | 	>&2 echo "FAILED: Not exactly 4 redirecting and successfully followed URLs found."
67 | 	exit 1
68 | fi
69 |
--------------------------------------------------------------------------------
/tests/urls_valid_4.txt:
--------------------------------------------------------------------------------
 1 | # Plain URLs
 2 | https://www.baka-tsuki.org/project/?title=Ore_no_Im%C5%8Dto_ga_Konna_ni_Kawaii_Wake_ga_Nai
 3 | https://www.phoronix.com/scan.php?page=news_item&px=U2-NVMe-PCI-E-SSD-Adapter
 4 | https://en.wikipedia.org/w/index.php?title=Erlang_(programming_language)&diff=847224766&oldid=847107459
 5 | https://en.wiktionary.org/wiki/%E8%8C%B6
 6 |
 7 | # Single Quoted URLs
 8 | 'https://www.phoronix.com/scan.php?page=news_item&px=U2-NVMe-PCI-E-SSD-Adapter'
 9 | 'https://en.wikipedia.org/w/index.php?title=Erlang_(programming_language)&diff=847224766&oldid=847107459'
10 | 'https://en.wiktionary.org/wiki/%E8%8C%B6'
11 |
12 | # Double Quoted URLs
13 | "https://www.phoronix.com/scan.php?page=news_item&px=U2-NVMe-PCI-E-SSD-Adapter"
14 | "https://en.wikipedia.org/w/index.php?title=Erlang_(programming_language)&diff=847224766&oldid=847107459"
15 | "https://en.wiktionary.org/wiki/%E8%8C%B6"
16 |
17 | # Bracketed URLs
18 | <https://www.phoronix.com/scan.php?page=news_item&px=U2-NVMe-PCI-E-SSD-Adapter>
19 | <https://en.wikipedia.org/w/index.php?title=Erlang_(programming_language)&diff=847224766&oldid=847107459>
20 | <https://en.wiktionary.org/wiki/%E8%8C%B6>
21 |
22 | # Round Bracketed URLs
23 | (https://www.phoronix.com/scan.php?page=news_item&px=U2-NVMe-PCI-E-SSD-Adapter)
24 | (https://en.wikipedia.org/w/index.php?title=Erlang_(programming_language)&diff=847224766&oldid=847107459)
25 | (https://en.wiktionary.org/wiki/%E8%8C%B6)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 74 | Usage: linkcheck [-e <ext> -i <regex> -t <sec> -r <num> -c <codes> -k -l -v] [<path>]
 75 |        linkcheck --version
 76 |        linkcheck --help
 77 |
 78 |
 79 | Options:
 80 |
 81 | -e <ext>      Limit search to those file extensions.
 82 |               Defaults to limiting on non-binary files.
 83 |               Accepts comma separated string of extensions:
 84 |                 -e txt
 85 |                 -e txt,rst
 86 |                 -e sh,py,c,h
 87 |
 88 | -i <regex>    Ignore all URLs matching the specified regex.
 89 |               Defaults to: ^http(s)?:\/\/(127\.0\.0\.1)|(localhost).*$
 90 |               Accepts a single regex string:
 91 |                 -i '^http(s)?:\/\/my-company\.com.*$'
 92 |
 93 | -t <sec>      Specify curl timeout in seconds, after which probing stops for a single URL.
 94 |               Defaults to 10 seconds.
 95 |               Accepts a positive integer:
 96 |                 -t 5
 97 |                 -t 10
 98 |
 99 | -r <num>      Specify how many times to retry probing a single URL before giving up.
100 |               Defaults to 3 times.
101 |               Accepts a positive integer:
102 |                 -r 5
103 |                 -r 10
104 |
105 | -c <codes>    Specify HTTP status codes that are valid for success.
106 |               Any code not specified here will produce an error for the given URL.
107 |               Defaults to '200'.
108 |               Accepts comma separated string of HTTP status codes:
109 |                 -c '200'
110 |                 -c '200,301'
111 |                 -c '200,301,302'
112 |
113 | -k            Ignore invalid SSL certificates for HTTPS connections.
114 |               Defaults to erroring on invalid SSL certificates.
115 |               This is just a single flag with no other arguments.
116 |
117 | -l            Specify whether to follow redirect URLs or not.
118 |               This argument does not accept parameters.
119 |               Defaults to not following redirects.
120 |
121 | -v            Be verbose and also show affected files.
122 |
123 | --version     Show version and exit.
124 | --help        Show this help screen.
125 |
126 |
127 | Optional arguments:
128 |
129 | <path>        Specify which directory to scan for URLs.
130 |               Defaults to the current directory.
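
# Examples (illustrative; the paths and extension lists below are
# placeholders, not part of the original help text):
#
#   linkcheck                          Scan the current directory, expect HTTP 200.
#   linkcheck -e md,rst docs/          Only scan *.md and *.rst files in docs/.
#   linkcheck -c '200,301,302' -l .    Accept redirect status codes and follow them.
#   linkcheck -t 5 -r 2 -v             Shorter timeout, fewer retries, verbose output.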
131 | ```
132 |
133 |
134 | ## License
135 |
136 | [MIT License](LICENSE.md)
137 |
138 | Copyright (c) 2018 [cytopia](https://github.com/cytopia)
--------------------------------------------------------------------------------
/linkcheck:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 |
  3 | # Be strict
  4 | set -e
  5 | set -u
  6 | set -o pipefail
  7 |
  8 |
  9 | ############################################################
 10 | # Overridable global variables
 11 | ############################################################
 12 |
 13 |
 14 | ###
 15 | ### In what path to look for files
 16 | ###
 17 | SEARCH_PATH="."
 18 |
 19 |
 20 | ###
 21 | ### Comma separated list of file extensions to scan for URLs
 22 | ###
 23 | EXTENSIONS=""
 24 |
 25 |
 26 | ###
 27 | ### Regex to exclude URLs from being tested
 28 | ###
 29 | URL_REGEX_EXCLUDE="^http(s)?:\\/\\/(127\\.0\\.0\\.1)|(localhost).*$"
 30 |
 31 |
 32 | ###
 33 | ### Timeout in seconds to see if a site is alive
 34 | ###
 35 | TIMEOUT=10
 36 |
 37 |
 38 | ###
 39 | ### How many times to probe one URL to see if it is alive
 40 | ###
 41 | RETRIES=3
 42 |
 43 |
 44 | ###
 45 | ### Comma separated list of acceptable HTTP status codes
 46 | ### to define that the URL is alive
 47 | ###
 48 | STATUS_CODES=200
 49 |
 50 |
 51 | ###
 52 | ### Allow insecure SSL connections if chosen
 53 | ### This is exactly: curl -k
 54 | ###
 55 | INSECURE_SSL=""
 56 |
 57 |
 58 | ###
 59 | ### Follow redirects
 60 | ### This is exactly: curl -L
 61 | ###
 62 | FOLLOW_REDIRECT=""
 63 |
 64 |
 65 | ###
 66 | ### Be verbose
 67 | ###
 68 | VERBOSE=0
 69 |
 70 |
 71 | ############################################################
 72 | # Fixed global variables
 73 | ############################################################
 74 |
 75 | ###
 76 | ### Regex to scan for URLs
 77 | ###
 78 | URL_REGEX="http(s)?:\\/\\/[-+%=?&():,._/#0-9a-zA-Z]+"
 79 |
 80 | MY_VERSION="v0.14"
 81 |
 82 |
 83 | ###
 84 | ### Curl defaults
 85 | ###
 86 | ### Some sites are very picky about returning the correct status code if they
 87 | ### think you are not human enough.
 88 | ### This adds some sane defaults to all curl requests.
 89 | ###
 90 | ### Note: Additionally 'Host' will be added dynamically
 91 | ###       Host: FQDN of URL
 92 | ###
 93 | CURL_DEFAULTS=""
 94 | CURL_DEFAULTS="${CURL_DEFAULTS} -H 'Cache-Control: max-age=0'"
 95 | #CURL_DEFAULTS="${CURL_DEFAULTS} -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'"
 96 | CURL_DEFAULTS="${CURL_DEFAULTS} -H 'Accept-Language: en-US,en;q=0.8,en-GB;q=0.6,es;q=0.4'"
 97 | CURL_DEFAULTS="${CURL_DEFAULTS} -H 'DNT: 1'"
 98 | CURL_DEFAULTS="${CURL_DEFAULTS} -H 'Referer: https://www.google.com'"
 99 |
100 |
101 | ############################################################
102 | # Functions
103 | ############################################################
104 |
105 | ###
106 | ### Usage
107 | ###
108 | print_usage() {
109 | 	echo "Usage: linkcheck [-e <ext> -i <regex> -t <sec> -r <num> -c <codes> -k -l -v] [<path>]"
110 | 	echo "       linkcheck --version"
111 | 	echo "       linkcheck --help"
112 | 	echo
113 | 	echo
114 | 	echo "Options:"
115 | 	echo
116 | 	echo "-e <ext>      Limit search to those file extensions."
117 | 	echo "              Defaults to limiting on non-binary files."
118 | 	echo "              Accepts comma separated string of extensions:"
119 | 	echo "                -e txt"
120 | 	echo "                -e txt,rst"
121 | 	echo "                -e sh,py,c,h"
122 | 	echo
123 | 	echo "-i <regex>    Ignore all URLs matching the specified regex."
124 | 	echo '              Defaults to: ^http(s)?:\/\/(127\.0\.0\.1)|(localhost).*$'
125 | 	echo "              Accepts a single regex string:"
126 | 	echo "                -i '^http(s)?:\\/\\/my-company\\.com.*$'"
127 | 	echo
128 | 	echo "-t <sec>      Specify curl timeout in seconds, after which probing stops for a single URL."
129 | 	echo "              Defaults to 10 seconds."
130 | 	echo "              Accepts a positive integer:"
131 | 	echo "                -t 5"
132 | 	echo "                -t 10"
133 | 	echo
134 | 	echo "-r <num>      Specify how many times to retry probing a single URL before giving up."
135 | 	echo "              Defaults to 3 times."
136 | 	echo "              Accepts a positive integer:"
137 | 	echo "                -r 5"
138 | 	echo "                -r 10"
139 | 	echo
140 | 	echo "-c <codes>    Specify HTTP status codes that are valid for success."
141 | 	echo "              Any code not specified here will produce an error for the given URL."
142 | 	echo "              Defaults to '200'."
143 | 	echo "              Accepts comma separated string of HTTP status codes:"
144 | 	echo "                -c '200'"
145 | 	echo "                -c '200,301'"
146 | 	echo "                -c '200,301,302'"
147 | 	echo
148 | 	echo "-k            Ignore invalid SSL certificates for HTTPS connections."
149 | 	echo "              This argument does not accept any parameters."
150 | 	echo "              Defaults to erroring on invalid SSL certificates."
151 | 	echo
152 | 	echo "-l            Specify whether to follow redirect URLs or not."
153 | 	echo "              This argument does not accept any parameters."
154 | 	echo "              Defaults to not following redirects."
155 | 	echo
156 | 	echo "-v            Be verbose and also show affected files."
157 | 	echo
158 | 	echo "--version     Show version and exit."
159 | 	echo "--help        Show this help screen."
160 | 	echo
161 | 	echo
162 | 	echo "Optional arguments:"
163 | 	echo
164 | 	echo "<path>        Specify which directory to scan for URLs."
165 | 	echo "              Defaults to the current directory."
166 | 	echo
167 | 	echo
168 | }
169 |
170 |
171 | ###
172 | ### Version
173 | ###
174 | print_version() {
175 | 	echo "linkcheck ${MY_VERSION} by cytopia"
176 | 	echo "https://github.com/cytopia/linkcheck"
177 | }
178 |
179 |
180 | ###
181 | ### Set value (used to store stdout and stderr in two different variables)
182 | ###
183 | setval() {
184 | 	printf -v "$1" "%s" "$(cat)";
185 | 	declare -p "$1";
186 | }
187 |
188 |
189 | ###
190 | ### Sanitize URL
191 | ###
192 | sanitize_url() {
193 | 	local url="${1}"
194 | 	local invalid="[,.!\\)]\$"
195 |
196 | 	# Remove any trailing garbage: ',' '.' '!' or ')' grabbed along with the URL
197 | 	while [[ ${url} =~ ${invalid} ]]; do
198 | 		url="${url::-1}"
199 | 	done
200 |
201 | 	echo "${url}"
202 | }
203 |
204 |
205 | ###
206 | ### Gather URLs from files
207 | ###
208 | gather_urls() {
209 | 	local path="${1}"
210 | 	local extensions="${2}"
211 | 	local reg_include="${3}"
212 | 	local reg_exclude="${4}"
213 |
214 | 	local find_ext=
215 | 	local find_cmd=
216 |
217 | 	if [ -n "${extensions}" ]; then
218 | 		find_ext=" \\( -iname \\*.${extensions//,/ -o -iname \\*.} \\)"
219 | 	fi
220 |
221 | 	find_cmd="find ${path}${find_ext} -type f -exec grep -IEo '${reg_include}' '{}' \\; | sort -u"
222 | 	>&2 echo "\$ ${find_cmd}"
223 |
224 | 	# Loop through uniqued URLs
225 | 	for url in $(eval "${find_cmd}" 2>/dev/null); do
226 | 		# Ignore any 'Binary file...' results
227 | 		if echo "${url}" | grep -Eq '^htt'; then
228 | 			# Remove any trailing garbage
229 | 			url="$( sanitize_url "${url}" )"
230 |
231 | 			# Ignore URLs excluded by regex
232 | 			if ! echo "${url}" | grep -qE "${reg_exclude}"; then
echo "${url}" | grep -qE "${reg_exclude}"; then 233 | echo "${url}" 234 | fi 235 | fi 236 | done 237 | } 238 | 239 | 240 | ### 241 | ### Gather files by URL 242 | ### 243 | gather_files_by_url() { 244 | local url="${1}" 245 | local path="${2}" 246 | local extensions="${3}" 247 | 248 | local find_ext= 249 | local find_cmd= 250 | 251 | if [ -n "${extensions}" ]; then 252 | find_ext=" \\( -iname \\*.${extensions//,/ -o -iname \\*.} \\)" 253 | fi 254 | 255 | find "${path}${find_ext}" -type f -exec grep -FHIn "${url}" '{}' \; | sort -u 256 | } 257 | 258 | 259 | ### 260 | ### Probe URLs for availability 261 | ### 262 | probe_urls() { 263 | local urls="${1}" 264 | local timeout="${2}" 265 | local retries="${3}" 266 | local status_codes="${4}" 267 | local insecure_ssl="${5}" 268 | local follow_redirect="${6}" 269 | 270 | local clr_test="\\033[0;33m" # Yellow 271 | local clr_fail="\\033[0;31m" # Red 272 | local clr_err="\\033[0;31m" # Red 273 | local clr_ok="\\033[0;32m" # Green 274 | local clr_rst="\\033[m" # Reset to normal 275 | 276 | local host= 277 | local ret_code=0 278 | 279 | status_codes="${status_codes//,/|}" # comma to | 280 | status_codes="${status_codes//[[:space:]]/}" # remove whitespace 281 | 282 | # Remove duplicates 283 | urls="$( echo "${urls}" | sort -u )" 284 | 285 | # Probe each url 286 | for url in ${urls}; do 287 | 288 | # Determine hostname for Host header 289 | host="$( echo "${url}" | sed 's|^http\(s\)*://||g' | sed 's|/.*$||g' )" 290 | 291 | opts="-SsI" 292 | opts="${opts} --retry-delay 2" 293 | opts="${opts} --retry ${retries}" 294 | opts="${opts} --connect-timeout ${timeout}" 295 | opts="${opts} ${insecure_ssl}" 296 | opts="${opts} -H 'Host: ${host}'" 297 | opts="${opts} ${CURL_DEFAULTS}" 298 | opts="${opts} ${follow_redirect}" 299 | #echo "curl ${opts} ${url}" 300 | 301 | printf "${clr_test}[TEST]${clr_rst} %s ..." "${url}" 302 | 303 | # Get header from URL 304 | eval "$(eval "curl ${opts} \"${url}\"" 2> >(setval errval) > >(setval header); <<<$? setval retval)"; 305 | 306 | # Curl request failed 307 | # shellcheck disable=SC2154 308 | if [ "${retval}" != "0" ]; then 309 | # shellcheck disable=SC2154 310 | printf "\\r${clr_fail}[FAIL]${clr_rst} %s %s\\n" "${url}" "${errval}" 311 | ret_code=1 312 | 313 | # Curl request succeeded 314 | else 315 | # shellcheck disable=SC2154 316 | line="$( echo "${header}" | grep -E '^HTTP/(1|2)' )" 317 | stat="$( echo "${line}" | awk '{print $2}' )" 318 | 319 | if ! 
echo "${stat}" | tail -1 | grep -qE "${status_codes}"; then 320 | # Fix error line for multiline (in case of redirects via -l option) 321 | line="$( echo "${line}" | paste -sd "," | sed 's/,/ -> /g' | head -1 | tr -d '\n' | tr -d '\r' | sed 's/\s*$//g' )" 322 | printf "\\r${clr_err}[ERR]${clr_rst} %s ${clr_err}%s${clr_rst}\\n" "${url}" "(${line})" 323 | if [ "${VERBOSE}" -eq "1" ]; then 324 | gather_files_by_url "${url}" "${SEARCH_PATH}" "${EXTENSIONS}" 325 | fi 326 | 327 | ret_code=1 328 | else 329 | # Fix status code for multiline (in case of redirects via -l option) 330 | stat="$( echo "${stat}" | paste -sd "," | sed 's/,/ -> /g' )" 331 | printf "\\r${clr_ok}[OK]${clr_rst} %s ${clr_ok}%s${clr_rst}\\n" "${url}" "(${stat})" 332 | fi 333 | fi 334 | done 335 | return ${ret_code} 336 | } 337 | 338 | 339 | ############################################################ 340 | # Entrypoint: arguments 341 | ############################################################ 342 | #-e -i -t -r -c 343 | while [ $# -gt 0 ]; do 344 | case "${1}" in 345 | 346 | # ---------------------------------------- 347 | -e) 348 | shift 349 | if [ "${#}" -gt "0" ]; then 350 | EXTENSIONS="${1}" 351 | else 352 | >&2 echo "Error, -e requires an argument." 353 | exit 1 354 | fi 355 | ;; 356 | 357 | # ---------------------------------------- 358 | -i) 359 | shift 360 | if [ "${#}" -gt "0" ]; then 361 | URL_REGEX_EXCLUDE="${1}" 362 | else 363 | >&2 echo "Error, -i requires an argument." 364 | exit 1 365 | fi 366 | ;; 367 | 368 | # ---------------------------------------- 369 | -t) 370 | shift 371 | if [ "${#}" -gt "0" ]; then 372 | TIMEOUT="${1}" 373 | else 374 | >&2 echo "Error, -t requires an argument." 375 | exit 1 376 | fi 377 | ;; 378 | 379 | # ---------------------------------------- 380 | -r) 381 | shift 382 | if [ "${#}" -gt "0" ]; then 383 | RETRIES="${1}" 384 | else 385 | >&2 echo "Error, -r requires an argument." 386 | exit 1 387 | fi 388 | ;; 389 | # ---------------------------------------- 390 | -c) 391 | shift 392 | if [ "${#}" -gt "0" ]; then 393 | STATUS_CODES="${1}" 394 | else 395 | >&2 echo "Error, -c requires an argument." 396 | exit 1 397 | fi 398 | ;; 399 | 400 | # ---------------------------------------- 401 | -k) 402 | INSECURE_SSL="-k" 403 | ;; 404 | 405 | # ---------------------------------------- 406 | -l) 407 | FOLLOW_REDIRECT="-L" 408 | ;; 409 | 410 | # ---------------------------------------- 411 | -v) 412 | VERBOSE=1 413 | ;; 414 | 415 | # ---------------------------------------- 416 | --help) 417 | print_usage 418 | exit 0 419 | ;; 420 | 421 | # ---------------------------------------- 422 | --version) 423 | print_version 424 | exit 0 425 | ;; 426 | 427 | # ---------------------------------------- 428 | *) 429 | # If it is the last argument, its the path 430 | if [ "${#}" = "1" ]; then 431 | SEARCH_PATH="${1}" 432 | else 433 | echo "Invalid argument: ${1}" 434 | echo "Type 'linkcheck --help' for available options." 435 | exit 1 436 | fi 437 | ;; 438 | esac 439 | shift 440 | done 441 | 442 | 443 | 444 | MY_URLS="$( gather_urls "${SEARCH_PATH}" "${EXTENSIONS}" "${URL_REGEX}" "${URL_REGEX_EXCLUDE}" )" 445 | 446 | probe_urls "${MY_URLS}" "${TIMEOUT}" "${RETRIES}" "${STATUS_CODES}" "${INSECURE_SSL}" "${FOLLOW_REDIRECT}" 447 | --------------------------------------------------------------------------------