├── .gitignore
├── .travis.yml
├── README.md
├── LICENSE
├── crawler
├── functions.sh
└── crawler.bats

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
bats/
*.log
index.html

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: bash
install:
  - git clone https://github.com/sstephenson/bats.git
  - cd bats
  - sudo ./install.sh /usr/local
  - cd ..
script:
  - bats --tap crawler.bats

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# bash-crawler

> Get a site's links with bash

## Requirements

1. Get all the links from a website
2. For each of those links, get all the links on its page (see the sketch below)
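In essence, both steps come down to fetching a page and pulling the `href`
values out of it with bash parameter expansion. A minimal sketch (the real
`crawler` script below also dedups the links and filters their protocols):

```bash
#!/bin/bash
# print every href value found on the page given as $1
page=$(wget -qO - "$1")
while [[ $page == *href=\"* ]]; do
  page=${page#*href=\"}  # drop everything up to and including the next href="
  echo "${page%%\"*}"    # the link runs up to the closing quote
done
```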
## Usage

```bash
$ crawler

Usage
 $ crawler [options] <site>

Options
 --depth, -d Depth of the links searched 5
 --help, -h Prints this help false
 --list, -l Print a list formatted output false
 --verbose, -v Verbose output false

Examples
 $ crawler -d 5 www.github.com
```

## Tests

Tests are written with the [bats][bats] framework. To run them:

```bash
$ bats --tap crawler.bats
```

## License

[MIT](/LICENSE)

[bats]: https://github.com/sstephenson/bats

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Yerko Palma

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/crawler:
--------------------------------------------------------------------------------
#!/bin/bash

version=0.1.0

__dirname=$(dirname "$(readlink -f "$0")")
. "$(readlink -f "$__dirname/functions.sh")"

if [ $# -lt 1 ]; then
  print_help
  exit 1
fi

# parse valid options
parse_options "$@"

if [[ "${verbose:-false}" == true ]]; then
  echo "$CRAWLER_DEPTH"
fi

# fetch the page when a target was given, otherwise leave content empty
[[ -n $target ]] && content=$(wget -qO - "$target") || content=''

if [[ -z $content ]]; then
  echo "The site $target content could not be reached"
  exit 1
fi

keep_reading=1
i=0

# remove the head before processing
content=${content##*</head>}

links=()
while [[ $keep_reading -eq 1 ]]; do
  if [[ $content == *href=\"* ]]; then
    # drop everything up to and including the next href="
    content=${content#*href=\"}
    # the link runs up to the closing quote
    links[i]=${content%%\"*}
    (( i++ ))
  else
    # stop when there is no more 'href' in the given string
    keep_reading=0
  fi
done
unique_links=($(echo "${links[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' '))
final_links=($(filter_protocol "${unique_links[@]}"))

# bash cannot nest arrays, so keep one space-separated string per page
declare -A result
result[$target]="${final_links[*]}"

if [[ $CRAWLER_DEPTH -gt 0 ]]; then
  (( CRAWLER_DEPTH-- ))
  for flink in "${final_links[@]}"; do
    "$0" -d "$CRAWLER_DEPTH" "$flink"
  done
else
  print_result
fi

--------------------------------------------------------------------------------
/functions.sh:
--------------------------------------------------------------------------------
print_help () {
  echo "Usage"
  echo " $ crawler [options] <site>"
  echo ""
  echo "Options"
  echo " --depth, -d Depth of the links searched 5"
  echo " --list, -l Print a list formatted output false"
  echo ""
  echo "Examples"
  echo " $ crawler -d 5 www.github.com"
}

parse_options () {
  # set defaults
  CRAWLER_DEPTH=5
  while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
      -h|--help)
        print_help
        exit 0
        ;;
      -v|--verbose)
        verbose=true
        ;;
      -l|--list)
        list=true
        ;;
      -d|--depth)
        CRAWLER_DEPTH="$2"
        shift # past argument
        ;;
      --depth=*)
        CRAWLER_DEPTH="${1#*=}"
        ;;
      *)
        if [ -n "$1" ]; then
          target=$1
        else
          print_help
        fi
        ;;
    esac
    shift # past argument or value
  done
}

filter_protocol () {
  allowed_protocols=(http https / \#)
  _links=("$@")

  for i in "${!_links[@]}"; do
    keep=0
    # links starting with an allowed protocol are fine
    for protocol in "${allowed_protocols[@]}"; do
      if [[ ${_links[i]} == "$protocol"* ]]; then
        keep=1
        break
      fi
    done
    # relative links are fine too: no ':' unless it introduces a port number
    if [[ $keep -eq 0 && ${_links[i]} != *:[^0-9]* ]]; then
      keep=1
    fi
    if [[ $keep -eq 0 ]]; then
      unset '_links[i]'
    fi
  done
  # bash functions cannot return arrays; print the survivors so the
  # caller can capture them with $(...)
  echo "${_links[@]}"
}
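# Example (hypothetical values):
#   filter_protocol "https://a.com" "mailto:x@y.z" "/docs" "#top"
# prints "https://a.com /docs #top" -- the mailto: link is dropped because
# its ':' is not followed by a port digit and it has no allowed prefix.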
"${lines[1]}" = " $ crawler [options] " ] 20 | [ "${lines[2]}" = "Options" ] 21 | [ "${lines[3]}" = " --depth, -d Depth of the links searched 5" ] 22 | [ "${lines[4]}" = " --list, -l Print a list formatted output false" ] 23 | [ "${lines[5]}" = "Examples" ] 24 | [ "${lines[6]}" = " $ crawler -d 5 www.github.com" ] 25 | 26 | run ./crawler --help 27 | 28 | [ "${lines[0]}" = "Usage" ] 29 | [ "${lines[1]}" = " $ crawler [options] " ] 30 | [ "${lines[2]}" = "Options" ] 31 | [ "${lines[3]}" = " --depth, -d Depth of the links searched 5" ] 32 | [ "${lines[4]}" = " --list, -l Print a list formatted output false" ] 33 | [ "${lines[5]}" = "Examples" ] 34 | [ "${lines[6]}" = " $ crawler -d 5 www.github.com" ] 35 | } 36 | 37 | @test "if wget fails, should print 'not reacheble site' error" { 38 | run ./crawler fakesite 39 | 40 | # [ "$output" = "The site fakesite content could not be reached" ] 41 | [ "${lines[0]}" = "The site fakesite content could not be reached" ] 42 | } 43 | 44 | @test "set depth option if defined" { 45 | run ./crawler -v -d 5 46 | [ "${lines[0]}" = "5" ] 47 | 48 | run ./crawler --verbose --depth=4 49 | [ "${lines[0]}" = "4" ] 50 | } 51 | 52 | @test "set list option if defined" { 53 | skip 54 | run ./crawler -vl -d 2 55 | } 56 | 57 | @test "throw unknown option when option not recognized" { 58 | skip 59 | } 60 | 61 | @test "depth default to 10 and list default to false" { 62 | skip 63 | } 64 | --------------------------------------------------------------------------------