├── README.md
└── theripper.sh

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Datahoarder

Scripts for grabbing files off the Internet

## theripper.sh

***Description:*** Indexes an open directory with wget's spider mode, then downloads the files in parallel with aria2c

***Usage:*** ./theripper.sh "opendirlink" "opendirsubstring"

***Example:*** ./theripper.sh "http://link.com/blabla/doraemon/" "http://link.com/blabla"

--------------------------------------------------------------------------------
/theripper.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

####################################################################
# Description: Uses wget's spider mode with aria2c's parallel downloading
# Usage:       ./theripper.sh "opendirlink" "opendirsubstring"
# Example:     ./theripper.sh "http://link.com/blabla/doraemon/" "http://link.com/blabla"
####################################################################

set -e

URL=$1
ROOT_PATH=$2
LIST=./list-$$.txt
MAX_CONNECTIONS_PER_SERVER=16
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"

usage() {
    cat <<EOF
Usage:   ./theripper.sh "opendirlink" "opendirsubstring"
Example: ./theripper.sh "http://link.com/blabla/doraemon/" "http://link.com/blabla"
EOF
}

spider() {
    logfile=opendir-$$.log
    # Crawl the open directory recursively without downloading anything;
    # wget logs every URL it visits to $logfile. (Assumed reconstruction:
    # the original wget line was lost from this copy. "|| true" keeps
    # set -e from aborting when wget returns non-zero on broken links.)
    wget --spider -r -np --user-agent="$USER_AGENT" -o "$logfile" "$URL" || true
    # Assumed reconstruction (also lost from this copy): collect the URLs
    # wget flagged as broken so they can be purged from the log below.
    grep -B2 -i 'broken link' "$logfile" | grep -oE 'https?://[^ ]+' > "$logfile.tmp" || true
    while read line; do
        sed -i "\|$line|d" "$logfile"
    done < "$logfile.tmp"
    # Keep only fetched URLs (lines stamped "--YYYY-MM-DD HH:MM:SS--"),
    # drop directory URLs (ending in "/"), strip the timestamp, and write
    # the result to $LIST.
    grep -i '^--[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]--' "$logfile" | \
        grep '[^/]$' | \
        sed -e 's/^--[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]-- //g' > $LIST
    # Delete the folder made by wget. Caveat: this deletes ALL empty
    # directories under the current directory, not just the one wget
    # created. If you have a fix for this, contact me.
    find . -type d -empty -delete
}

download() {
    # Percent-decode the root once; it is the same for every link
    # (sed turns "+" into spaces and "%XX" into "\xXX", then printf %b
    # expands the \x escapes).
    DECODED_ROOT_PATH=$(printf '%b\n' "$(sed 's/+/ /g; s/%\([0-9a-fA-F][0-9a-fA-F]\)/\\x\1/g' <<< "$ROOT_PATH")")
    while read link; do
        # urldecode the link the same way
        DECODED_LINK=$(printf '%b\n' "$(sed 's/+/ /g; s/%\([0-9a-fA-F][0-9a-fA-F]\)/\\x\1/g' <<< "$link")")
        # Remove text after the last / to get the link's directory...
        FULL_PATH=$(echo "$DECODED_LINK" | sed 's%/[^/]*$%/%')
        # ...then strip the root prefix to get the directory to recreate locally
        FILE_PATH=${FULL_PATH#${DECODED_ROOT_PATH}/}
        # One stanza per file in aria2c's input-file format: the URI on its
        # own line, then indented per-download options
        {
            echo "${link}"
            echo " dir=$FILE_PATH"
            echo " continue=true"
            echo " max-connection-per-server=$MAX_CONNECTIONS_PER_SERVER"
            echo " split=16"
            echo " user-agent=$USER_AGENT"
            echo " header=Accept: text/html"
            echo -e " min-split-size=1M\n"
        } >> link-$$.down
    done < $LIST
    # Download the queued links, at most 10 at a time
    aria2c -i link-$$.down -j 10
}

if [[ $# -ne 2 || -z $1 || -z $2 ]]; then
    usage
    exit 1
fi

echo "Creating list of urls..."
spider
echo "Index created!"
download

# Cleanup
rm opendir-$$.log opendir-$$.log.tmp list-$$.txt link-$$.down
--------------------------------------------------------------------------------
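
A note on the indexing step: `spider()` depends on wget's log format, where every visited URL appears on a line stamped `--YYYY-MM-DD HH:MM:SS--  URL`. The sketch below runs the same extraction pipeline against a fabricated sample log (the host, file names, and timestamps are placeholders, not output from a real run):

```bash
#!/usr/bin/env bash
# Fabricated sample of a "wget -o" log; a real log also carries status lines.
cat > sample.log <<'EOF'
--2017-07-01 12:00:00--  http://link.com/blabla/doraemon/
--2017-07-01 12:00:01--  http://link.com/blabla/doraemon/ep01.mkv
--2017-07-01 12:00:02--  http://link.com/blabla/doraemon/ep02.mkv
EOF

# Same pipeline as spider(): keep timestamped lines, drop directory URLs
# (those ending in "/"), strip the timestamp prefix. wget puts two spaces
# after the stamp, so each result keeps one leading space; the
# "while read link" loop in download() trims it.
grep -i '^--[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]--' sample.log |
    grep '[^/]$' |
    sed 's/^--[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]-- //'
# prints:
#  http://link.com/blabla/doraemon/ep01.mkv
#  http://link.com/blabla/doraemon/ep02.mkv

rm sample.log
```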
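
The decode step in `download()` is dense, so here is the same trick in isolation: sed rewrites `+` to a space and each `%XX` escape to `\xXX`, and bash's `printf '%b'` then expands the `\x` escapes. The `urldecode` helper name and the sample URL are introduced here for illustration only:

```bash
#!/usr/bin/env bash
# Minimal sketch of the percent-decoding used in download().
urldecode() {
    printf '%b\n' "$(sed 's/+/ /g; s/%\([0-9a-fA-F][0-9a-fA-F]\)/\\x\1/g' <<< "$1")"
}

urldecode "http://link.com/blabla/doraemon/episode%2001.mkv"
# prints: http://link.com/blabla/doraemon/episode 01.mkv
```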
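
Finally, `download()` queues everything through aria2c's input-file format: a URI on its own line, followed by indented per-download options. Against the example URLs from the header, the generated `link-$$.down` would contain stanzas shaped like this (the file name is hypothetical, not output from a real run):

```
http://link.com/blabla/doraemon/episode%2001.mkv
 dir=doraemon/
 continue=true
 max-connection-per-server=16
 split=16
 user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36
 header=Accept: text/html
 min-split-size=1M
```

`aria2c -i link-$$.down -j 10` then processes up to 10 of these entries at once, each split across 16 connections to the server.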