├── LICENCE
├── README.md
└── prsync


/LICENCE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Nathan S. Watson-Haigh
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | This script is a simple drop-in replacement for `rsync` for parallelising your data transfer.
 2 | 
 3 | Rsync is the tool of choice for copying/syncing data between locations.
 4 | It is capable of transferring only the files which have changed and of resuming interrupted uploads/downloads.
 5 | However, the transfer speed of a single `rsync` can be somewhat slow.
 6 | This is a problem when transferring a large amount of data, as it will take some time to complete.
 7 | 
 8 | If your `rsync` transfer involves lots of files, you can benefit from transferring files in parallel.
 9 | This makes more effective use of your available network bandwidth and gets the job done faster.
10 | 
11 | # Usage
12 | 
13 | If your `rsync` command looks like this:
14 | 
15 | ```bash
16 | rsync \
17 |   --times --recursive --progress \
18 |   --exclude "raw_reads" --exclude ".snakemake" \
19 |   user@example.com:/my_remote_dir/ /my_local_dir/
20 | ```
21 | 
22 | Simply replace the `rsync` executable with this script:
23 | 
24 | ```bash
25 | ./prsync \
26 |   --times --recursive --progress \
27 |   --exclude "raw_reads" --exclude ".snakemake" \
28 |   user@example.com:/my_remote_dir/ /my_local_dir/
29 | ```
30 | 
31 | ## Number of Parallel Jobs
32 | 
33 | By default, the script will use 1 parallel job for each processor on the machine.
34 | This is determined by `nproc` and, if this fails, we fall back to `10` parallel jobs for transferring files.
35 | This behaviour can be overridden by using `--parallel` as the first command line argument to the script:
36 | 
37 | ```bash
38 | ./prsync \
39 |   --parallel=20 \
40 |   --times --recursive --progress \
41 |   --exclude "raw_reads" --exclude ".snakemake" \
42 |   user@example.com:/my_remote_dir/ /my_local_dir/
43 | ```
44 | 
45 | # Implementation
46 | 
47 | The list of files to be transferred is calculated by first running `rsync` in dry-run mode.
48 | It is then split into `N` chunks based on the value of `--parallel` (by default the number of processors reported by `nproc`, or 10 if that fails).
49 | Each "chunk" of files is then passed to its own `rsync` process, and these processes run in parallel.
50 | 
51 | To ensure a more balanced distribution of files among chunks, files are sorted by decreasing filesize and then assigned to the chunk with the least data to process.
52 | This ensures that chunks hold approximately the same amount of data, although the number of files in each chunk may differ.
53 | Thus parallel `rsync` processes will complete at around the same time.
54 | 


--------------------------------------------------------------------------------
/prsync:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | # Inspired by https://gist.github.com/akorn/644855ddaa8065f564be
  4 | 
  5 | # Usage:
  6 | #   rsync_parallel.sh [--parallel=N] [rsync args...]
  7 | # 
  8 | # Options:
  9 | #   --parallel=N	Use N parallel processes for transfer. Default is to use all available processors (`nproc`) or fail back to 10.
 10 | #
 11 | # Notes:
 12 | #   * Requires GNU Parallel
 13 | #   * Use with ssh-keys. Lots of password prompts will get very annoying.
 14 | #   * Does an itemize-changes first, then chunks the resulting file list and launches N parallel
 15 | #     rsyncs to transfer a chunk each.
 16 | #   * be a little careful with the options you pass through to rsync. Normal ones will work, you 
 17 | #     might want to test weird options upfront.
 18 | #
 19 | 
 20 | # Define colours for STDERR text
 21 | RED='\033[0;31m'
 22 | ORANGE='\033[0;33m'
 23 | GREEN='\033[0;32m'
 24 | NC='\033[0m' # No Color
 25 | 
 26 | if ! command -v parallel &> /dev/null
 27 | then
 28 |   echo -e "${RED}parallel could not be found${NC}"
 29 |   exit
 30 | fi
 31 | 
 32 | if [[ "$1" == --parallel=* ]]; then
 33 |   PARALLEL_RSYNC="${1##*=}"
 34 |   shift
 35 | else
 36 |   PARALLEL_RSYNC=$(nproc 2> /dev/null || echo 10)
 37 | fi
 38 | echo -e "${GREEN}INFO: Using up to ${PARALLEL_RSYNC} processes for transfer ...${NC}"
 39 | 
 40 | TMPDIR=$(mktemp -d)
 41 | trap 'rm -rf "${TMPDIR}"' EXIT
 42 | 
 43 | echo -e "${GREEN}INFO: Determining file list for transfer ...${NC}"
 44 | # sorted by size (descending)
 45 | rsync "$@" --out-format="%l %n" --no-v --dry-run 2> /dev/null \
 46 |   | grep -v "sending incremental file list" \
 47 |   | sort --numeric-sort --reverse \
 48 |   > "${TMPDIR}/files.all"
 49 | 
 50 | # check for nothing-to-do
 51 | TOTAL_FILES=$(wc -l < "${TMPDIR}/files.all")
 52 | TOTAL_SIZE=$(awk '{ts+=$1}END{printf "%.0f", ts}' < "${TMPDIR}/files.all")
 53 | echo -e "${GREEN}INFO: ${TOTAL_FILES} ($(( TOTAL_SIZE/1024**2 )) MB) files to transfer.${NC}"
 54 | if [ "${TOTAL_FILES}" -eq "0" ]; then
 55 |   echo -e "${ORANGE}WARN: Nothing to transfer :)${NC}"
 56 |   exit 0
 57 | fi
 58 | 
 59 | function array_min {
 60 |   ARR=("$@")
 61 | 
 62 |   # Default index for min value
 63 |   min_i=0
 64 | 
 65 |   # Default min value
 66 |   min_v=${ARR[$min_i]}
 67 | 
 68 |   for i in "${!ARR[@]}"; do
 69 |     v="${ARR[$i]}"
 70 | 
 71 |     (( v < min_v )) && min_v=${v} && min_i=${i}
 72 |   done
 73 | 
 74 |   MIN_I="${min_i}"
 75 |   #echo "${min_i}"
 76 | }
 77 | 
 78 | echo -e "${GREEN}INFO: Distributing files among chunks ...${NC}"
 79 | # declare chunk-size array
 80 | for ((I = 0 ; I < PARALLEL_RSYNC ; I++ )); do
 81 |   CHUNKS["${I}"]=0 
 82 | done
 83 | 
 84 | # add each file to the emptiest chunk, so they're as balanced by size as possible
 85 | PROGRESS=0
 86 | SECONDS=0
 87 | while read -r FSIZE FPATH; do
 88 |   PROGRESS=$((PROGRESS+1))
 89 | 
 90 |   array_min "${CHUNKS[@]}"
 91 |   #MIN_I=$(array_min ${CHUNKS[@]})
 92 | 
 93 |   CHUNKS[MIN_I]=$(( CHUNKS[MIN_I] + FSIZE ))
 94 |   echo "${FPATH}" >> "${TMPDIR}/chunk.${MIN_I}"
 95 | 
 96 |   if ! ((PROGRESS % 25000)); then
 97 |     >&2 echo -e "${GREEN}INFO: ${PROGRESS} of ${TOTAL_FILES} (${SECONDS}s)${NC}"
 98 |   fi
 99 | done < "${TMPDIR}/files.all"
100 | # Reverse the list of files in every other chunk so some are sorted largest->smallest and others smallest->largest
101 | # This will aid transfer efficiency, so some "queues" are processing large files first, while others are processing small files first.
102 | for ((I = 1 ; I < PARALLEL_RSYNC ; I+=2 )); do
103 |   # If the list of files to transfer is small, we may not have any files to work with
104 |   if [ ! -f "${TMPDIR}/chunk.${I}" ]; then
105 |     continue
106 |   fi
107 |   
108 |   tac "${TMPDIR}/chunk.${I}" > "${TMPDIR}/chunk.${I}.r" && mv "${TMPDIR}/chunk.${I}.r" "${TMPDIR}/chunk.${I}"
109 | done
110 | 
111 | echo -e "${GREEN}DONE (${SECONDS}s)${NC}"
112 | 
113 | #find "${TMPDIR}" -type f -name "chunk.*" -exec cat {} \;
114 | 
115 | echo -e "${GREEN}INFO: Starting transfers ...${NC}"
116 | find "${TMPDIR}" -type f -name "chunk.*" | parallel -j "${PARALLEL_RSYNC}" -t --verbose --progress rsync --files-from={} "$@"
117 | echo -e "${GREEN}DONE (${SECONDS}s)${NC}"
118 | 


--------------------------------------------------------------------------------