├── README.md
├── scripts_from_the_past
│   ├── named-md5
│   ├── check-topmed-jobs
│   ├── topmed_md5_prep.sh
│   ├── submit-md5-jobs
│   ├── generate-topmed-md5-worklist
│   ├── generate-topmed-copy-script
│   └── wh_lib.py
├── original_steps
│   ├── md5-steps.sh
│   ├── validation-steps.sh
│   └── steps.sh
├── docs
│   ├── alternate_login_with_sug-app1.md
│   └── topmed_automation.md
├── LICENSE
├── multiplex_scripts
│   ├── topmed_md5_prep_shep.sh
│   ├── submit-md5-jobs
│   ├── submit-cram-validation
│   ├── generate-topmed-md5-worklist-tsv
│   ├── generate-topmed-copy-script-tsv
│   ├── wh_lib.py.bk01
│   └── wh_lib.py
├── topmed.sh
└── topmed_automation.sh
/README.md:
--------------------------------------------------------------------------------
1 | # topmed-tools
2 | *For use by the Submissions Team*
3 | A collection of scripts and steps for sharing data (initially used as part of the TOPMed submissions pipeline)
4 | 
5 | **Requires lots of love**
6 | 
--------------------------------------------------------------------------------
/scripts_from_the_past/named-md5:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # Create a .md5 file with the specified name for the source file.
4 | 
5 | SRC="$1"
6 | DST="$2"
7 | 
8 | DST_DIR=$(dirname "$DST")
9 | DST_NAME=$(basename "$DST")
10 | 
11 | mkdir -p "$DST_DIR"
12 | md5sum <"$SRC" | sed "s/-/$DST_NAME/" >"$DST".md5
--------------------------------------------------------------------------------
/scripts_from_the_past/check-topmed-jobs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | batch_name="$1"
4 | 
5 | cut -f2 "$batch_name"_md5_jobs.txt |
6 |     xargs -n1 checkjob -v >"$batch_name".checkjob
7 | 
8 | (
9 |     echo ''
10 |     cut -f2 "$batch_name"_md5_jobs.txt | xargs -n1 checkjob -v --xml
11 |     echo
12 |     echo ''
13 | ) >"$batch_name".checkjob.xml
--------------------------------------------------------------------------------
/original_steps/md5-steps.sh:
--------------------------------------------------------------------------------
1 | cd /stornext/snfs1/submissions/topmed
2 | 
3 | # basename $(ls -d topmed-shared/batches/*_batch??_????-??-?? | tail -n 1)
4 | batch_name=$(ls -t topmed-shared/batches/globus/ | grep '_batch.._....-..-..' | head -n1)
5 | echo $batch_name
6 | mkdir md5-batches/"$batch_name"
7 | ll md5-batches/
8 | echo submit-md5-jobs topmed-shared/batches/globus/"$batch_name"/"$batch_name"_md5 md5-batches/"$batch_name"
9 | submit-md5-jobs topmed-shared/batches/globus/"$batch_name"/"$batch_name"_md5 md5-batches/"$batch_name"
--------------------------------------------------------------------------------
/docs/alternate_login_with_sug-app1.md:
--------------------------------------------------------------------------------
1 | # Using the App node for the cram-processing pipeline
2 | 
3 | **The current practice is to use the login nodes to carry out the steps in the cram-processing pipeline.
4 | Automation of cram-processing-tools reflects this practice.**
5 | 
6 | ## Demonstration of how to use an app-node as an alternative to the login-node.
7 | 
8 | - Login nodes are usually used for data transfer to and from a laptop
9 |   - Example: `rsync my_directory sug-login1.hgsc.bcm.edu:/groups/submissions/users/person`
10 | 
11 | - On the laptop, this script...:
12 | ```
13 | (main) bin$ cat sa1
14 | #!/usr/bin/env bash
15 | 
16 | login_host=${DEFAULT_SUG_LOGIN:-$s4}
17 | 
18 | ssh -t $login_host ssh sug-app1 "$@"
19 | ```
20 | - ...is equivalent to:
21 | ```
22 | ssh -t sug-login4.hgsc.bcm.edu ssh sug-app1
23 | ```
24 | 
25 | - The `-t` on the outer ssh says:
26 |   - Force pseudo-terminal allocation even though a remote command (the inner ssh) is given, so that the interactive session on sug-app1 has a terminal to attach to
--------------------------------------------------------------------------------
/original_steps/validation-steps.sh:
--------------------------------------------------------------------------------
1 | cd /stornext/snfs1/submissions/topmed
2 | 
3 | # batch_name=$(
4 | #     basename $(ls -d topmed-shared/batches/*_batch??_????-??-?? | tail -n 1)
5 | # )
6 | batch_name=$(ls -t topmed-shared/batches/globus/ | grep '_batch.._....-..-..' | head -n1)
7 | echo $batch_name
8 | 
9 | mkdir -p validation-batches/${batch_name}/input
10 | cat topmed-shared/batches/globus/${batch_name}/${batch_name}_md5 |
11 |     sed 's/^msub-md5/msub-val/' |
12 |     tee validation-batches/${batch_name}/${batch_name}_val |
13 |     tail
14 | 
15 | pushd validation-batches/${batch_name}/input/
16 | 
17 | # Creates symlinks to all the BAMs in the input directory
18 | ECHO=echo
19 | msub-val() { $ECHO ln -s "$2" "$1"; }
20 | . ../${batch_name}_val
21 | unset ECHO
22 | . ../${batch_name}_val
23 | 
24 | cd ..
25 | ls input/NWD* | head -n5 | xargs -n1 echo submit-cram-validation run_a
26 | # ls input/NWD* | head -n5 | xargs -n1 submit-validation run_a
27 | # ls input/NWD* | tail -n+6 | xargs -n1 submit-validation run_b
28 | ls input/NWD* | xargs -n1 submit-cram-validation run_a proj-dm0019
29 | 
30 | popd
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 BCM-HGSC Submissions
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/multiplex_scripts/topmed_md5_prep_shep.sh:
--------------------------------------------------------------------------------
1 | batch_name="$1"
2 | samples="$2"
3 | batch_type="$3"
4 | if ((`find . -name '*.out' -size 0 | xargs ls | wc -l` == $samples)); then
5 |     echo "$samples MD5s were generated"
6 |     echo `find . -name '*.out' -size 0 | xargs rm`
7 | fi
8 | if (( `(find . -name '*.out' | wc -l )` == 0 )); then
9 |     echo "All outs have been erased; the Manifest should now contain the correct number of samples"
10 | fi
11 | if (( `cat *md5 | tee Manifest.txt | wc -l ` == $samples)); then
12 |     echo "Manifest has been created successfully, will now copy to designated area"
13 |     echo `cat *md5 | tee Manifest.txt | wc -l - ../${batch_name}_md5`
14 |     #echo `cp -p Manifest.txt /stornext/snfs1/submissions/topmed/topmed-shared/batches/globus/$batch_name `
15 |     echo cp Manifest.txt ../
16 |     echo `scp Manifest.txt christis@hgsc-aspera1.hgsc.bcm.edu:/share/share/globusupload/submissions/$batch_type/$batch_name/`
17 |     echo "Manifest has been copied, have a good day"
18 | else
19 |     echo " ERROR: there are fewer MD5s than expected ($samples)..."
20 | 
21 | fi
22 | # Known issue: the Manifest file is created, but the copy is not placed where it is supposed to go.
--------------------------------------------------------------------------------
/scripts_from_the_past/topmed_md5_prep.sh:
--------------------------------------------------------------------------------
1 | batch_name="$1"
2 | samples="$2"
3 | batch_type="$3"
4 | if ((`find . -name '*.out' -size 0 | xargs ls | wc -l` == $samples)); then
5 |     echo "$samples MD5s were generated"
6 |     echo `find . -name '*.out' -size 0 | xargs rm`
7 | fi
8 | if (( `(find . -name '*.out' | wc -l )` == 0 )); then
9 |     echo "All outs have been erased; the Manifest should now contain the correct number of samples"
10 | fi
11 | if (( `cat *md5 | tee Manifest.txt | wc -l ` == $samples)); then
12 |     echo "Manifest has been created successfully, will now copy to designated area"
13 |     echo `cat *md5 | tee Manifest.txt | wc -l - /stornext/snfs1/submissions/topmed/topmed-shared/batches/globus/$batch_name/${batch_name}_md5`
14 |     echo `cp -p Manifest.txt /stornext/snfs1/submissions/topmed/topmed-shared/batches/globus/$batch_name `
15 |     echo `scp Manifest.txt christis@hgsc-aspera1.hgsc.bcm.edu:/share/share/globusupload/submissions/$batch_type/$batch_name/`
16 |     echo "Manifest has been copied, have a good day"
17 | else
18 |     echo " ERROR: there are fewer MD5s than expected ($samples)..."
19 | 
20 | fi
21 | # Known issue: the Manifest file is created, but the copy is not placed where it is supposed to go.
--------------------------------------------------------------------------------
/multiplex_scripts/submit-md5-jobs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | BATCH_FILE_PATH="$1" # Example: Sarcoidosis_batch02_2016-02-05_md5_a
4 | DST="$2" # Directory where all results will go
5 | PROJECT_CODE="$3"
6 | # Example batch file:
7 | # msub-md5 'NWD414772-HK2HCCCXX-1.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-1/HK2HCCCXX-1.hgv.bam'
8 | # msub-md5 'NWD520478-HK2HCCCXX-2.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-2/HK2HCCCXX-2.hgv.bam'
9 | 
10 | # Can override in $(pwd)/topmed_msub_init.sh or prepend to batch file:
11 | JOB_BATCH_NAME=$(basename "$BATCH_FILE_PATH")
12 | BATCH_SUFFIX=$(echo "$JOB_BATCH_NAME"_ | cut -d_ -f5 | cut -c1)
13 | #PROJECT_CODE=proj-dm0019
14 | QUEUE=normal # or "analysis" for long jobs `three_hours` for short jobs
15 | RESOURCES=nodes=1:ppn=1,mem=10mb,walltime=35:00:00
16 | 
17 | err() { echo "$@" >&2; }
18 | 
19 | if [ "$#" -ne 3 ]; then
20 |     err 'Usage:'
21 |     err "$0 BATCH_FILE DESTINATION_DIR PROJECT_CODE"
22 |     err
23 |     exit 1
24 | fi
25 | 
26 | msub-md5() {
27 |     NEW_NAME="$1"
28 |     SRC="$2"
29 |     command="/hgsc_software/submissions/bin/named-md5 '$SRC' '$NEW_NAME'"
30 |     JOB_NAME=md5"$BATCH_SUFFIX"-"$NEW_NAME"
31 |     MSUB_OPTS="-q $QUEUE -A $PROJECT_CODE -l $RESOURCES -j oe"
32 |     echo $NEW_NAME | tr \\n \\t
33 |     echo $command |
34 |         msub $MSUB_OPTS -d "$DST" -o "$NEW_NAME".out -N $JOB_NAME |
35 |         sed '1s/.sug-moab$//'
36 | }
37 | 
38 | # Allow for customization, such as during testing.
39 | if [ -f topmed_msub_init.sh ]; then
40 |     err 'loading topmed_msub_init.sh'
41 |     . 
topmed_msub_init.sh 42 | fi 43 | 44 | ################################################## 45 | # Now this script starts doing things to the OS... 46 | ################################################## 47 | 48 | mkdir -p "$DST" 49 | DST="$(cd "$DST"; pwd)" 50 | err DST: $DST 51 | 52 | (. "$BATCH_FILE_PATH") | tee "$DST/$JOB_BATCH_NAME"_jobs.txt | cat -n 53 | -------------------------------------------------------------------------------- /scripts_from_the_past/submit-md5-jobs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | BATCH_FILE_PATH="$1" # Example: Sarcoidosis_batch02_2016-02-05_md5_a 4 | DST="$2" # Directory where all results will go 5 | PROJECT_CODE="$3" #proj-dm0019 6 | # Example batch file: 7 | # msub-md5 'NWD414772-HK2HCCCXX-1.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-1/HK2HCCCXX-1.hgv.bam' 8 | # msub-md5 'NWD520478-HK2HCCCXX-2.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-2/HK2HCCCXX-2.hgv.bam' 9 | 10 | # Can override in $(pwd)/topmed_msub_init.sh or prepend to batch file: 11 | JOB_BATCH_NAME=$(basename "$BATCH_FILE_PATH") 12 | BATCH_SUFFIX=$(echo "$JOB_BATCH_NAME"_ | cut -d_ -f5 | cut -c1) 13 | #PROJECT_CODE=proj-dm0019 14 | QUEUE=normal # or "analysis" for long jobs `three_hours` for short jobs 15 | RESOURCES=nodes=1:ppn=1,mem=10mb,walltime=35:00:00 16 | 17 | err() { echo "$@" >&2; } 18 | 19 | if [ "$#" -ne 3 ]; then 20 | err 'Usage:' 21 | err "$0 BATCH_FILE DESTINATION_DIR proj_code" 22 | err 23 | exit 1 24 | fi 25 | 26 | msub-md5() { 27 | NEW_NAME="$1" 28 | SRC="$2" 29 | command="/hgsc_software/submissions/bin/named-md5 '$SRC' '$NEW_NAME'" 30 | JOB_NAME=md5"$BATCH_SUFFIX"-"$NEW_NAME" 31 | MSUB_OPTS="-q $QUEUE -A $PROJECT_CODE -l $RESOURCES -j oe" 32 | echo $NEW_NAME | tr \\n \\t 33 | echo $command | 34 | msub $MSUB_OPTS -d "$DST" -o "$NEW_NAME".out -N $JOB_NAME | 35 | sed '1s/.sug-moab$//' 36 | } 37 | 38 | # Allow for customization, such as during testing. 39 | if [ -f topmed_msub_init.sh ]; then 40 | err 'loading topmed_msub_init.sh' 41 | . topmed_msub_init.sh 42 | fi 43 | 44 | ################################################## 45 | # Now this script starts doing things to the OS... 46 | ################################################## 47 | 48 | mkdir -p "$DST" 49 | DST="$(cd "$DST"; pwd)" 50 | err DST: $DST 51 | 52 | (. "$BATCH_FILE_PATH") | tee "$DST/$JOB_BATCH_NAME"_jobs.txt | cat -n 53 | -------------------------------------------------------------------------------- /multiplex_scripts/submit-cram-validation: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DEFAULT_INIT_SCRIPT='submit-validation-init.sh' 4 | DEFAULT_MEM=15gb 5 | DEFAULT_WALLTIME=24:00:00 6 | 7 | usage() { 8 | err "usage: $(basename "$0") DST_DIR BAM_PATH [PROJECT_CODE] [QUEUE]" 9 | err " Submit validation job." 
10 | err " DST_DIR: output & working directory" 11 | err 12 | err "Environment:" 13 | err " SUBMIT_VALIDATION_INIT: default ($DEFAULT_INIT_SCRIPT) sourced at" 14 | err " end of variable initialization" 15 | err " MEM: default ($DEFAULT_MEM)" 16 | err " WALLTIME: default ($DEFAULT_WALLTIME)" 17 | exit 1 18 | } >&2 19 | 20 | err() { echo "$@" >&2; } 21 | 22 | if [[ $# -lt 2 ]] || [[ $# -gt 4 ]]; then 23 | usage 24 | fi 25 | 26 | dst_dir=$(readlink -m "$1") 27 | cram_path=$(python3 -c "import os; print(os.path.abspath('$2'))") 28 | project_code_code="$3") 29 | #project_code=${3:-proj-dm0019} # TODO: handle default outside of script 30 | queue=${4:-analysis} # for long jobs 31 | 32 | MEM=${MEM:-$DEFAULT_MEM} 33 | WALLTIME=${WALLTIME:-$DEFAULT_WALLTIME} 34 | 35 | JAVA='/hgsc_software/java/jdk1.8.0_74/bin/java' 36 | JAR='/hgsc_software/picard/picard-tools-2.6.0/picard.jar' 37 | VALIDATE_ARGS='ValidateSamFile' 38 | TIME_FMT='cmd: %C\nerr: %x\nsecs: %e\nproc: %P\nkb: %M' 39 | REFERENCE='/stornext/snfs1/submissions/resources/referneces/GRCh38.fa' 40 | resources=nodes=1:ppn=1,mem=$MEM,walltime=$WALLTIME 41 | msub_opts="-q $queue -A $project_code -l $resources" 42 | out_name=$(basename "$cram_path") 43 | job_name="$out_name.val" 44 | 45 | # Allow for customization, such as during testing. 46 | INIT_SCRIPT=${SUBMIT_VALIDATION_INIT:-"$DEFAULT_INIT_SCRIPT"} 47 | if [[ -f "$INIT_SCRIPT" ]]; then 48 | err "loading $INIT_SCRIPT" 49 | . "$INIT_SCRIPT" 50 | fi 51 | 52 | mkdir -p "$dst_dir" 53 | 54 | command="/usr/bin/time -f'$TIME_FMT' -o '$job_name.time' '$JAVA' -jar '$JAR' $VALIDATE_ARGS I='$cram_path' R='$REFERENCE' IGNORE=MISSING_TAG_NM " 55 | echo $command | 56 | msub $msub_opts -N "$job_name" -d "$dst_dir" -o "$job_name".out -e "$job_name".err | 57 | sed '1s/.sug-moab$//' | 58 | tee -a "$dst_dir"/"$job_name".job 59 | -------------------------------------------------------------------------------- /docs/topmed_automation.md: -------------------------------------------------------------------------------- 1 | # TOPmed Automation 2 | 3 | `topmed_automation` is a bash script that runs a majority of steps for cram processing. 4 | 5 | **As of now, the user is still responsible for:** 6 | - Checking if aspera has sufficient space. 7 | - Checking validation after it is done. 8 | - Copying the crams over to aspera with the generated copy script. 9 | - Creating the manifest and copying it to aspera with `topmed_md5_prep_shep.sh` 10 | 11 | ## Setup 12 | 13 | #### Passwordless Entry 14 | 15 | You must have passwordless entry when using `ssh` to a copy node. 16 | 17 | - ssh into the login node. `ssh USERNAME@sug-login#.hgsc.bcm.edu` 18 | - `cd .ssh`. You should have a `id_rsa.pub` file and a `authorized_keys` file. 19 | - Append your public key to the authorized_keys file. `cat id_rsa.pub >> autorized_keys` 20 | 21 | You should no longer need to enter your password when login to any of the other nodes. 22 | 23 | 24 | ## How to Run 25 | To run the script type: `./topmed_automation PM_CODE PM_PATH` 26 | 27 | - `PM_CODE` is the project code that is given in the RT. An example code would be `proj-dm0021`. 28 | - `PM_PATH` is the path given to you by the project manager. An example path would look like this: `/hgsc_software/groups/project-managers/tech/metadata/v1/topmed/YR3/cardiomyopathy/01/03a` 29 | 30 | 31 | ## Workflow 32 | 33 | - Check if `shepherd.yaml` exist in the users `.config` directory. 34 | - If it does, proceed. 35 | - It it does not, the `shepherd.yaml` file will be created and then proceeds. 
36 | - Check if previously created tmux sessions `topmed-copy` and `topmed-login` exist.
37 |   - If they do, the user will need to close them manually, in case an important task is still running.
38 |   - If they do not, proceed.
39 | - Run shepherd's `accept_batch` script.
40 | - Generate the copy script with `generate-topmed-copy-script-tsv`.
41 | - Generate the md5_worklist script with `generate-topmed-md5-worklist-tsv`.
42 | - Create the input directory under `validation/`.
43 | - Create symlinks to all the crams in the input directory.
44 | - In the `topmed-login` session, submit md5 jobs to the cluster with the `submit-md5-jobs` script.
45 | - In the `topmed-login` session, run validation with the `submit-cram-validation_phase5` script.
--------------------------------------------------------------------------------
/scripts_from_the_past/generate-topmed-md5-worklist:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """Generate the worklist for submitting a bunch of jobs that compute MD5s.
4 | Execute "submit-md5-jobs worklist_file_path" to then submit all the jobs."""
5 | 
6 | import argparse
7 | from functools import partial
8 | import logging
9 | import sys
10 | 
11 | import wh_lib
12 | 
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | 
17 | def main():
18 |     args = parse_args()
19 |     config_logging(args)
20 |     run(args)
21 |     logging.shutdown()
22 | 
23 | 
24 | def parse_args():
25 |     parser = argparse.ArgumentParser(description=__doc__)
26 |     parser.add_argument('worklist_file_path')
27 |     parser.add_argument('-v', '--verbose', action='store_true')
28 |     args = parser.parse_args()
29 |     return args
30 | 
31 | 
32 | def config_logging(args):
33 |     global logger
34 |     level = logging.DEBUG if args.verbose else logging.INFO
35 |     logging.basicConfig(level=level)
36 |     logger = logging.getLogger('gen-topmed-md5')
37 | 
38 | 
39 | def run(args):
40 |     logger.debug('args: %r', args)
41 |     worklist = wh_lib.parse_excel_records(args.worklist_file_path)
42 |     batch_dir, project_dir = extract_batch_info(worklist)
43 |     worklist_name = '{}_md5'.format(batch_dir)
44 |     new_dir = '{0}/{1}'.format(project_dir, batch_dir)
45 |     with open(worklist_name, 'w') as fout:
46 |         output_copy_script(fout, new_dir, worklist)
47 | 
48 | 
49 | def extract_batch_info(worklist):
50 |     """Return batch_dir & project_dir, which must be the same for all rows."""
51 |     batch_dirs = set(r.batch for r in worklist)
52 |     project_dirs = set(r.hgsc_xfer_subdir for r in worklist)
53 |     assert len(batch_dirs) == 1, (
54 |         'batch must be the same for all BAMs in worklist'
55 |     )
56 |     assert len(project_dirs) == 1, (
57 |         'hgsc_xfer_subdir must be the same for all BAMs in worklist'
58 |     )
59 |     batch_dir = list(batch_dirs)[0]
60 |     project_dir = list(project_dirs)[0]
61 |     return batch_dir, project_dir
62 | 
63 | 
64 | def output_copy_script(fout, new_dir, worklist):
65 |     pr = partial(print, file=fout)
66 |     for r in worklist:
67 |         pr("msub-md5 '{}' '{}'".format(r.new_bam_name, r.bam_path))
68 | 
69 | 
70 | if __name__ == '__main__':
71 |     main()
72 | 
--------------------------------------------------------------------------------
/multiplex_scripts/generate-topmed-md5-worklist-tsv:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """Generate the worklist for submitting a bunch of jobs that compute MD5s.
4 | Execute "submit-md5-jobs worklist_file_path" to then submit all the jobs.""" 5 | 6 | import argparse 7 | from functools import partial 8 | import logging 9 | import sys 10 | 11 | import wh_lib 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def main(): 18 | args = parse_args() 19 | config_logging(args) 20 | run(args) 21 | logging.shutdown() 22 | 23 | 24 | def parse_args(): 25 | parser = argparse.ArgumentParser(description=__doc__) 26 | parser.add_argument('worklist_file_path') 27 | parser.add_argument('-v', '--verbose', action='store_true') 28 | args = parser.parse_args() 29 | return args 30 | 31 | 32 | def config_logging(args): 33 | global logger 34 | level = logging.DEBUG if args.verbose else logging.INFO 35 | logging.basicConfig(level=level) 36 | logger = logging.getLogger('gen-topmed-md5') 37 | 38 | 39 | def run(args): 40 | logger.debug('args: %r', args) 41 | worklist = wh_lib.parse_tsv_file(args.worklist_file_path) 42 | batch_dir, project_dir = extract_batch_info(worklist) 43 | worklist_name = '{}_md5'.format(batch_dir) 44 | new_dir = '{0}/{1}'.format(project_dir, batch_dir) 45 | with open(worklist_name, 'w') as fout: 46 | output_copy_script(fout, new_dir, worklist) 47 | 48 | 49 | def extract_batch_info(worklist): 50 | """Return batch_dir & project_dir, which must be the same for all rows.""" 51 | batch_dirs = set(r.batch for r in worklist) 52 | project_dirs = set(r.hgsc_xfer_subdir for r in worklist) 53 | assert len(batch_dirs) == 1, ( 54 | 'batch must be the same for all BAMs in worklist' 55 | ) 56 | assert len(project_dirs) == 1, ( 57 | 'hgsc_xfer_subdir must be the same for all BAMs in worklist' 58 | ) 59 | batch_dir = list(batch_dirs)[0] 60 | project_dir = list(project_dirs)[0] 61 | return batch_dir, project_dir 62 | 63 | 64 | def output_copy_script(fout, new_dir, worklist): 65 | pr = partial(print, file=fout) 66 | for r in worklist: 67 | pr("msub-md5 '{}' '{}'".format(r.new_cram_name, r.cram_path)) #need to change based on how Jennifer sets up 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /multiplex_scripts/generate-topmed-copy-script-tsv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Generate the copy-with-rename script that stages TOPMed files.""" 4 | 5 | import argparse 6 | from functools import partial 7 | import logging 8 | import sys 9 | 10 | import wh_lib 11 | 12 | 13 | PROLOG = '''#/usr/bin/env bash 14 | 15 | DST="$1" 16 | 17 | if [ -f topmed_cvcp_init.sh ]; then 18 | . 
19 | fi
20 | '''
21 | 
22 | logger = logging.getLogger(__name__)
23 | 
24 | 
25 | def main():
26 |     args = parse_args()
27 |     config_logging(args)
28 |     run(args)
29 |     logging.shutdown()
30 | 
31 | 
32 | def parse_args():
33 |     parser = argparse.ArgumentParser(description=__doc__)
34 |     parser.add_argument('worklist_file_path')
35 |     parser.add_argument('-v', '--verbose', action='store_true')
36 |     args = parser.parse_args()
37 |     return args
38 | 
39 | 
40 | def config_logging(args):
41 |     global logger
42 |     level = logging.DEBUG if args.verbose else logging.INFO
43 |     logging.basicConfig(level=level)
44 |     logger = logging.getLogger('gen-topmed-copy')
45 | 
46 | 
47 | def run(args):
48 |     logger.info('args: %r', args)
49 |     worklist = wh_lib.parse_tsv_file(args.worklist_file_path)
50 |     batch_dir, project_dir = extract_batch_info(worklist)
51 |     script_name = '{}.sh'.format(batch_dir)
52 |     new_dir = '{0}/{1}'.format(project_dir, batch_dir)
53 |     with open(script_name, 'w') as fout:
54 |         output_copy_script(fout, new_dir, worklist)
55 | 
56 | 
57 | def extract_batch_info(worklist):
58 |     """Return batch_dir & project_dir, which must be the same for all rows."""
59 |     batch_dirs = set(r.batch for r in worklist)
60 |     project_dirs = set(r.hgsc_xfer_subdir for r in worklist)
61 |     assert len(batch_dirs) == 1, (
62 |         'batch must be the same for all CRAMs in worklist'
63 |     )
64 |     assert len(project_dirs) == 1, (
65 |         'hgsc_xfer_subdir must be the same for all CRAMs in worklist'
66 |     )
67 |     batch_dir = list(batch_dirs)[0]
68 |     project_dir = list(project_dirs)[0]
69 |     return batch_dir, project_dir
70 | 
71 | 
72 | def output_copy_script(fout, new_dir, worklist):
73 |     pr = partial(print, file=fout)
74 |     pr(PROLOG)
75 |     pr('mkdir -p "$DST/{}"'.format(new_dir))
76 |     pr()
77 |     for r in worklist:
78 |         pr(
79 |             'cvcp $TOPMED_CVCP_OPTS "{}" "$DST/{}/{}"'.format(
80 |                 r.cram_path, new_dir, r.new_cram_name
81 |             )
82 |         )
83 | 
84 | 
85 | if __name__ == '__main__':
86 |     main()
87 | 
--------------------------------------------------------------------------------
/scripts_from_the_past/generate-topmed-copy-script:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """Generate the copy-with-rename script that stages TOPMed files."""
4 | 
5 | import argparse
6 | from functools import partial
7 | import logging
8 | import sys
9 | 
10 | import wh_lib
11 | 
12 | 
13 | PROLOG = '''#!/usr/bin/env bash
14 | 
15 | DST="$1"
16 | 
17 | if [ -f topmed_cvcp_init.sh ]; then
18 |     . 
topmed_cvcp_init.sh 19 | fi 20 | ''' 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def main(): 26 | args = parse_args() 27 | config_logging(args) 28 | run(args) 29 | logging.shutdown() 30 | 31 | 32 | def parse_args(): 33 | parser = argparse.ArgumentParser(description=__doc__) 34 | parser.add_argument('worklist_file_path') 35 | parser.add_argument('-v', '--verbose', action='store_true') 36 | args = parser.parse_args() 37 | return args 38 | 39 | 40 | def config_logging(args): 41 | global logger 42 | level = logging.DEBUG if args.verbose else logging.INFO 43 | logging.basicConfig(level=level) 44 | logger = logging.getLogger('gen-topmed-copy') 45 | 46 | 47 | def run(args): 48 | logger.info('args: %r', args) 49 | worklist = wh_lib.parse_excel_records(args.worklist_file_path) 50 | batch_dir, project_dir = extract_batch_info(worklist) 51 | script_name = '{}.sh'.format(batch_dir) 52 | new_dir = '{0}/{1}'.format(project_dir, batch_dir) 53 | with open(script_name, 'w') as fout: 54 | output_copy_script(fout, new_dir, worklist) 55 | 56 | 57 | def extract_batch_info(worklist): 58 | """Return batch_dir & project_dir, which must be the same for all rows.""" 59 | batch_dirs = set(r.batch for r in worklist) 60 | project_dirs = set(r.hgsc_xfer_subdir for r in worklist) 61 | assert len(batch_dirs) == 1, ( 62 | 'batch must be the same for all BAMs in worklist' 63 | ) 64 | assert len(project_dirs) == 1, ( 65 | 'hgsc_xfer_subdir must be the same for all BAMs in worklist' 66 | ) 67 | batch_dir = list(batch_dirs)[0] 68 | project_dir = list(project_dirs)[0] 69 | return batch_dir, project_dir 70 | 71 | 72 | def output_copy_script(fout, new_dir, worklist): 73 | pr = partial(print, file=fout) 74 | pr(PROLOG) 75 | pr('mkdir -p "$DST/{}"'.format(new_dir)) 76 | pr() 77 | for r in worklist: 78 | pr( 79 | 'cvcp $TOPMED_CVCP_OPTS "{}" "$DST/{}/{}"'.format( 80 | r.bam_path, new_dir, r.new_bam_name 81 | ) 82 | ) 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /topmed.sh: -------------------------------------------------------------------------------- 1 | TOPMED Steps: Initializing: 2 | 3 | under /users/rajendra/.config make shepherd.yaml 4 | cat > 5 | sub_root: /groups/submissions/metadata/v1/topmed/ 6 | asp_root: /aspera/share/globusupload/submissions/ 7 | 8 | 9 | ------In the Copy Node (tmux):------- 10 | (in the home directory) 11 | 1) Check space on Aspera: df -h /aspera/share/globusupload 12 | 2) Run 'accept_batch' and the path jennifer gives you 13 | 3) In groups submissions directory (/groups/submissions/metadata/v1/topmed/topmed/YR3/harvard/01/{created directory}), copy the tsv from the project manager location 14 | 4) Run the workbook function: 15 | 16 | workbook=$(ls -t *_batch???_mplx.tsv | head -n1) 17 | if [[ $workbook =~ ' ' ]]; then 18 | echo 'ERROR: There is a space in the name of the workbook!!!' 19 | echo 'These steps will FAIL!' 
20 | fi
21 | 
22 | batch_name=$(echo $workbook | sed -e s/_mplx.tsv$//)
23 | echo $batch_name
24 | 
25 | 5) Run the scripts used to generate: (i) copy script, (ii) md5_worklist
26 | /stornext/snfs1/submissions/topmed/topmed-code/topmed_multiplex_code/generate-topmed-copy-script-tsv $workbook
27 | /stornext/snfs1/submissions/topmed/topmed-code/topmed_multiplex_code/generate-topmed-md5-worklist-tsv $workbook
28 | 
29 | 
30 | 6) Run the following validation prep steps:
31 | mkdir -p validation/input
32 | cat ${batch_name}_md5 | sed 's/^msub-md5/msub-val/' | tee validation/${batch_name}_val | tail
33 | 
34 | pushd validation/input/
35 | 
36 | # Creates symlinks to all the crams in the input directory
37 | ECHO=echo
38 | msub-val() { $ECHO ln -s "$2" "$1"; }
39 | . ../${batch_name}_val
40 | unset ECHO
41 | . ../${batch_name}_val
42 | 
43 | 11) Copy the crams
44 | ### run the copy script (be sure to include the first part of the destination path)
45 | ./{copy_script}.sh /aspera/share/globusupload/submissions
46 | 
47 | 13) Convert the md5s into a manifest and copy it to Aspera with topmed_md5_prep_shep.sh ### you may need to copy the script into your working directory
48 | cp /stornext/snfs1/submissions/topmed/topmed-code/topmed_md5_prep_shep.sh .
49 | cd md5/
50 | ../topmed_md5_prep_shep.sh $batch_name {no. of samples} {Cohort_name}
51 | 
52 | 
53 | ------In the Login Node (tmux):---------
54 | 7) cd into working directory (/groups/submissions/metadata/v1/topmed/topmed/YR3/harvard/01/{created directory})
55 | 8) 'md5' directory should already exist
56 | ### redefine batch name as necessary
57 | 
58 | 9) Submit the md5 jobs to the cluster:
59 | echo submit-md5-jobs "$batch_name"_md5 md5
60 | submit-md5-jobs "$batch_name"_md5 md5/ proj-dm0021
61 | 
62 | 10) Run the validation
63 | cd .. ### into the validation directory; make sure you have a copy of the submit-cram-validation_phase5 script ### run_a is the directory name
64 | ls input/NWD* | head -n5 | xargs -n1 echo ../submit-cram-validation_phase5 run_a
65 | ls input/NWD* | xargs -n1 submit-cram-validation_phase5 run_a
66 | 
67 | 12) Check the validation, once complete
--------------------------------------------------------------------------------
/original_steps/steps.sh:
--------------------------------------------------------------------------------
1 | # Connect to:
2 | # cifs://hgsc-naf01-b.hgsc.bcm.edu/tcga/other-submissions/topmed-shared
3 | 
4 | # make new directory
5 | 
6 | # Drag attachment (e.g. TOPMed_THRV_batch03_2016-04-04.xlsx) into the newly created directory.
7 | 
8 | 
9 | 
10 | 
11 | 
12 | cd /stornext/snfs1/submissions/topmed
13 | pushd topmed-shared
14 | 
15 | cd batches/globus/"filename"
16 | 
17 | 
18 | workbook=$(ls -t TOPMed_*_batch??_????-??-??.xlsx | head -n1)
19 | if [[ $workbook =~ ' ' ]]; then
20 |     echo 'ERROR: There is a space in the name of the workbook!!!'
21 |     echo 'These steps will FAIL!'
22 | fi
23 | 
24 | batch_name=$(echo $workbook | sed -e s/^TOPMed_// -e s/.xlsx$//)
25 | echo $batch_name
26 | #mkdir batches/$batch_name
27 | # made the directory earlier
28 | #pasted it into the directory earlier
29 | #mv $workbook batches/$batch_name/
30 | #chmod -wx batches/$batch_name/$workbook
31 | 
32 | pushd batches/globus/$batch_name/
33 | generate-topmed-copy-script $workbook
34 | generate-topmed-md5-worklist $workbook
35 | #chmod -wx $batch_name*
36 | 
37 | popd
38 | popd
39 | 
40 | # Upload copy/rename script to ticket.
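# For reference, the copy script emitted by generate-topmed-copy-script consists of
# one mkdir followed by one cvcp line per file, shaped like this (the sample name,
# source path, and project subdirectory below are hypothetical):
#
#   mkdir -p "$DST/project_subdir/THRV_batch03_2016-04-04"
#   cvcp $TOPMED_CVCP_OPTS "/stornext/snfs120/.../HK2HCCCXX-1.hgv.bam" "$DST/project_subdir/THRV_batch03_2016-04-04/NWD123456.bam"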
41 | 
42 | (
43 |     echo 'Update the RT ticket with the following:'
44 |     echo
45 |     echo "Attached is the copy script. You can also find it here:"
46 |     echo "/data/tcga/other-submissions/topmed-shared/batches/globus/$batch_name/$batch_name.sh"
47 |     echo "Moving on to MD5 and validation..."
48 | )
49 | 
50 | # Execute md5-steps.
51 | # need to do this once jobs are complete
52 | #jobs completed
53 | 
54 | pushd md5-batches/$batch_name
55 | #copy to md5
56 | cp /stornext/snfs1/submissions/topmed/topmed-code/topmed_md5_prep.sh .
57 | #then run, but specify the batch
58 | ./topmed_md5_prep.sh $batch_name 100 THRV
59 | 
60 | 
61 | 
62 | """ #OLD WAY OF DOING IT#
63 | (
64 | pushd md5-batches/$batch_name
65 | find . -name '*.out' -size 0 | xargs ls
66 | find . -name '*.out' -size 0 | xargs rm
67 | find . -name '*.out'
68 | cat *md5 | tee Manifest.txt | wc -l
69 | cat *md5 | tee Manifest.txt | wc -l - ../../topmed-shared/batches/globus/$batch_name/${batch_name}_md5
70 | # CHECK: the first two numbers generated should match.
71 | chmod -w Manifest.txt
72 | 
73 | cp -p Manifest.txt ../../topmed-shared/batches/globus/$batch_name/
74 | )
75 | """
76 | 
77 | 
78 | scp Manifest.txt christis@hgsc-aspera1.hgsc.bcm.edu:/share/share/globusupload/submissions/AFIB/$batch_name
79 | 
80 | 
81 | # Upload Manifest file to ticket.
82 | 
83 | (
84 |     echo 'Update the RT ticket with the following:'
85 |     echo
86 |     echo "Attached is the Manifest.txt file. You can also find it here:"
87 |     echo "/data/tcga/other-submissions/topmed-shared/batches/globus/$batch_name/Manifest.txt"
88 |     echo "Moving on to validation..."
89 | )
90 | 
91 | # Execute validation-steps.
92 | 
93 | pushd validation-batches/$batch_name/
94 | (
95 |     echo $(ls run*/*.job | wc -l) job
96 |     echo $(ls run*/*.err | wc -l) err
97 |     echo $(ls run*/*.out | wc -l) out
98 |     echo $(ls run*/*.time | wc -l) time
99 |     cat run*/*.out | uniq -c
100 | )
101 | # Should all be the same number.
102 | popd
103 | 
--------------------------------------------------------------------------------
/topmed_automation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | PM_CODE=$1
4 | PM_PATH=$2
5 | 
6 | SUB_ROOT=/groups/submissions/metadata/v1/topmed
7 | SUB_PATH=${PM_PATH:56} # strip the PM-side prefix from PM_PATH (assumes the prefix is exactly 56 characters)
8 | 
9 | ASP_ROOT=/aspera/share/globusupload/submissions
10 | 
11 | #########
12 | # Usage #
13 | #########
14 | usage() {
15 |     err "usage: $(basename "$0") PM_CODE PM_PATH"
16 |     err " Run TOPmed automation for validation and copy steps."
17 |     err " PM_CODE: code provided by the project manager"
18 |     err " PM_PATH: path provided by the project manager"
19 |     err
20 |     exit 1
21 | } >&2
22 | 
23 | err() { echo "$@" >&2; }
24 | 
25 | if [[ $# -ne 2 ]]; then
26 |     usage
27 | fi
28 | 
29 | ###############
30 | # Function(s) #
31 | ###############
32 | tmux_session_exist () {
33 |     if [ $? != 0 ]; then
34 |         echo "ERROR: tmux session already exists."
35 |         echo "Make sure to exit previous sessions before continuing"
36 |         exit 1
37 |     fi
38 | }
39 | 
40 | 
41 | #########
42 | # Setup #
43 | #########
44 | 
45 | # TODO: Aspera space issue.
46 | # Remind user?
47 | 
48 | # Check if shepherd config file exists
49 | 
50 | CONFIG_FILE=/users/$USER/.config/shepherd.yaml
51 | 
52 | if [ ! -f $CONFIG_FILE ]; then
53 | cat << EOF > $CONFIG_FILE
54 | sub_root: $SUB_ROOT
55 | asp_root: $ASP_ROOT
56 | EOF
57 | fi
58 | 
59 | # Check if tmux sessions exist
60 | # tmux new -s topmed-copy -d
61 | # tmux_session_exist
62 | tmux new -s topmed-login -d
63 | tmux_session_exist
64 | echo "Created topmed-login session"
65 | 
66 | 
67 | #########################################
68 | # Run accept_batch script from Shepherd #
69 | #########################################
70 | 
71 | ACCEPT_BATCH_PATH=/hgsc_software/submissions/bin/accept_batch
72 | 
73 | $ACCEPT_BATCH_PATH $PM_PATH
74 | if [ $? != 0 ]; then
75 |     echo "WARNING: accept_batch encountered errors."
76 |     echo "ERROR: Stopping the pipeline."
77 |     exit 1
78 | fi
79 | 
80 | # Change to working directory
81 | cd $SUB_ROOT/$SUB_PATH
82 | 
83 | # Assign workbook
84 | workbook=$(ls -t *_batch???_mplx.tsv | head -n1)
85 | 
86 | # Assign batch name
87 | batch_name=$(echo $workbook | sed -e s/_mplx.tsv$//)
88 | 
89 | # Run scripts to generate the copy script and md5_worklist
90 | stornext_path=/stornext/snfs1/submissions/topmed/topmed-code/topmed_multiplex_code
91 | $stornext_path/generate-topmed-copy-script-tsv $workbook
92 | echo "Generated copy script"
93 | $stornext_path/generate-topmed-md5-worklist-tsv $workbook
94 | echo "Generated md5_worklist script"
95 | 
96 | # Run validation prep steps
97 | # Create input directory
98 | mkdir -p validation/input
99 | echo "Created input directory under validation directory"
100 | sed 's/^msub-md5/msub-val/' ${batch_name}_md5 > validation/${batch_name}_val
101 | echo "Created ${batch_name}_val under validation/"
102 | 
103 | # Create symlinks to all the crams in the input directory
104 | cd validation/input
105 | ECHO=echo
106 | msub-val() { $ECHO ln -s "$2" "$1"; }
107 | . ../${batch_name}_val
108 | unset ECHO
109 | . ../${batch_name}_val
110 | echo "Successfully created symlinks"
111 | 
112 | ##################################
113 | # Submit md5 jobs to the cluster #
114 | ##################################
115 | 
116 | submit_md5_jobs=/hgsc_software/submissions/noarch/apps/topmed-code/submit-md5-jobs
117 | md5_file_path="$batch_name"_md5
118 | tmux send-keys -t topmed-login "ssh sug-login4" C-m
119 | tmux send-keys -t topmed-login "cd $SUB_ROOT/$SUB_PATH" C-m
120 | # This assumes the user does not have to enter a password
121 | # (see the passwordless-entry setup in docs/topmed_automation.md)
122 | tmux send-keys -t topmed-login "$submit_md5_jobs $md5_file_path md5/ $PM_CODE" C-m
123 | 
124 | # Run the validation (note: bash variable names cannot contain hyphens)
125 | submit_cram_validation_phase5=/hgsc_software/groups/submissions/metadata/v1/topmed/topmed/YR3/scripts_mr/submit-cram-validation_phase5
126 | tmux send-keys -t topmed-login "cd validation/" C-m
127 | tmux send-keys -t topmed-login "ls input/NWD* | xargs -n1 $submit_cram_validation_phase5 run_a" C-m
128 | 
129 | echo "Validation is running, check validation in tmux session topmed-login once completed!"
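# Example invocation, using the sample values from docs/topmed_automation.md
# (the path is illustrative; use the code and path provided by the project manager):
#
#   ./topmed_automation.sh proj-dm0021 \
#       /hgsc_software/groups/project-managers/tech/metadata/v1/topmed/YR3/cardiomyopathy/01/03a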
-------------------------------------------------------------------------------- /multiplex_scripts/wh_lib.py.bk01: -------------------------------------------------------------------------------- 1 | import collections 2 | import csv 3 | import glob 4 | import itertools 5 | import locale 6 | import operator 7 | import os 8 | import pprint 9 | import re 10 | import sys 11 | 12 | try: 13 | import openpyxl 14 | except ImportError: 15 | pass # optional dependency, only used when parsing .xlsx files 16 | try: 17 | import xlrd 18 | except ImportError: 19 | pass # optional dependency, only used when parsing .xls files 20 | import yaml 21 | 22 | 23 | locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') 24 | 25 | 26 | def select(collection, *items): 27 | return operator.itemgetter(*items)(collection) 28 | 29 | 30 | def print_table(iterable): 31 | """Print out the finite input as a table. Each item in iterable must be an 32 | iterable with roughly the same number of items.""" 33 | # Slurp the entire iterable. 34 | rows = list(iterable) 35 | # Compute column widths. 36 | col_widths = [] 37 | for row in rows: 38 | for col_num, col_val in enumerate(row): 39 | col_len = len(str(col_val)) 40 | if col_num < len(col_widths): 41 | col_widths[col_num] = max(col_widths[col_num], col_len) 42 | else: 43 | col_widths.append(col_len) 44 | # Format output. 45 | for row in rows: 46 | # Output all but last column in padded format. 47 | for col_val, col_width in list(zip(row, col_widths))[:-1]: 48 | col_str = str(col_val) 49 | if isinstance(col_val, int) and not isinstance(col_val, bool): 50 | sys.stdout.write(col_str.rjust(col_width)) 51 | else: 52 | sys.stdout.write(col_str.ljust(col_width)) 53 | sys.stdout.write(' | ') 54 | # Output the last column as-is. 55 | sys.stdout.write(str(row[-1])) 56 | # Add the newline. 
57 |         sys.stdout.write('\n')
58 | 
59 | 
60 | def yp(data, stream=sys.stdout):
61 |     """Pretty print as YAML."""
62 |     yd(data, stream)
63 | 
64 | 
65 | def yf(data):
66 |     """Format as pretty YAML"""
67 |     return yd(data)
68 | 
69 | 
70 | def yd(data, stream=None):
71 |     return yaml.safe_dump(devolve(data), stream, default_flow_style=False)
72 |     # TODO: devolve is not self-referential safe
73 | 
74 | 
75 | def fetch_tsv_header(file_path):
76 |     with open(file_path) as fin:
77 |         reader = csv.reader(fin, delimiter='\t')
78 |         return next(reader)
79 | 
80 | 
81 | def parse_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
82 |     """cls_or_fcn must construct an object from a dict."""
83 |     return list(iterate_tsv_file(file_path, cls_or_fcn, fieldnames))
84 | 
85 | 
86 | def iterate_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
87 |     """cls_or_fcn must construct an object from a dict."""
88 |     with open(file_path) as fin:
89 |         for r in iterate_tsv_stream(fin, cls_or_fcn, fieldnames):
90 |             yield r
91 | 
92 | 
93 | def iterate_tsv_stream(stream, cls_or_fcn=None, fieldnames=None):
94 |     """cls_or_fcn must construct an object from a dict."""
95 |     cls_or_fcn = cls_or_fcn or Record
96 |     reader = csv.DictReader(stream, fieldnames=fieldnames, delimiter='\t')
97 |     for it in reader:
98 |         yield cls_or_fcn(it)
99 | 
100 | 
101 | def default_filter(record):
102 |     return any(record.values())
103 | 
104 | 
105 | def parse_excel_records(file_path,
106 |                         cls_or_fcn=None,
107 |                         sheet_name=None,
108 |                         filter=default_filter):
109 |     extension = os.path.splitext(file_path)[1]
110 |     if extension == '.xlsx':
111 |         parse_result = parse_xlsx_records(file_path, cls_or_fcn, sheet_name)
112 |     elif extension == '.xls':
113 |         parse_result = parse_xls_records(file_path, cls_or_fcn, sheet_name)
114 |     else:
115 |         raise NotImplementedError(file_path)
116 |     return [record for record in parse_result if filter(record)]
117 | 
118 | 
119 | def parse_xlsx_records(file_path, cls_or_fcn, sheet_name):
120 |     fcn = get_pair_constructor(cls_or_fcn)
121 |     wb = openpyxl.load_workbook(file_path, data_only=True)
122 |     if sheet_name:
123 |         ws = wb.get_sheet_by_name(sheet_name)
124 |     else:
125 |         ws = wb.worksheets[0]
126 |     rows = ws.rows
127 |     header = tuple(normalize_field_name(c.value) for c in rows[0])
128 |     return [fcn(header, (c.value for c in row)) for row in rows[1:]]
129 | 
130 | 
131 | def parse_xls_records(file_path, cls_or_fcn, sheet_name):
132 |     # TODO: use sheet_name
133 |     fcn = get_pair_constructor(cls_or_fcn)
134 |     wb = xlrd.open_workbook(file_path)
135 |     ws = wb.sheet_by_index(0)
136 |     header = tuple(normalize_field_name(c) for c in ws.row_values(0))
137 |     return [fcn(header, ws.row_values(i)) for i in range(1, ws.nrows)]
138 | 
139 | 
140 | def get_pair_constructor(cls_or_fcn):
141 |     """Return a callable that constructs an object from a header, data pair."""
142 |     if not cls_or_fcn:
143 |         return Record.from_pair
144 |     elif isinstance(cls_or_fcn, type):
145 |         return cls_or_fcn.from_pair
146 |     else:
147 |         return cls_or_fcn
148 | 
149 | 
150 | class Record(collections.MutableMapping):
151 |     def __init__(self, mapping=None, **kwds):
152 |         if mapping:
153 |             if isinstance(mapping, collections.Mapping):
154 |                 gen = mapping.items()
155 |             else:
156 |                 gen = mapping
157 |             for k, v in gen:
158 |                 self.__dict__[normalize_field_name(k)] = normalize_value(v)
159 |         for k, v in kwds.items():
160 |             self.__dict__[normalize_field_name(k)] = normalize_value(v)
161 | 
162 |     @classmethod
163 |     def from_pair(cls, header, data):
164 |         """Alternate constructor"""
165 |         return 
cls(zip(header, data)) 166 | 167 | def __repr__(self): 168 | return '%s(%r)' % (self.__class__.__name__, self.__dict__) 169 | # TODO: Should I use self.mapping here? 170 | 171 | def __getitem__(self, key): 172 | return self.mapping[key] 173 | 174 | def __setitem__(self, key, value): 175 | self.mapping[key] = value 176 | 177 | def __delitem__(self, key): 178 | del self.mapping[key] 179 | 180 | # TODO: Is this inherited from collections.MutableMapping? 181 | def __iter__(self): 182 | return iter(self.mapping) 183 | 184 | # TODO: Is this inherited from collections.MutableMapping? 185 | def __len__(self): 186 | return len(self.mapping) 187 | 188 | @property 189 | def mapping(self): 190 | return self.__dict__ 191 | 192 | @property 193 | def attributes(self): 194 | return self.mapping.keys() 195 | 196 | def pp(self): 197 | pprint.pprint(self.to_dict()) 198 | 199 | def to_dict(self): 200 | return devolve(self) 201 | 202 | 203 | def normalize_field_name(field_name): 204 | """lowercase with underscores, etc""" 205 | result = field_name 206 | if result.endswith('?'): 207 | result = result[:-1] 208 | if not result.startswith('is_'): 209 | result = 'is_' + result 210 | result = result.strip().lower().replace(' ', '_').replace( 211 | '-', '_').replace('/', '_').replace('?', '_').replace('%', 'pct') 212 | return result 213 | 214 | 215 | def normalize_value(value): 216 | """Convert empty string to None""" 217 | if value == '': 218 | value = None 219 | return value 220 | 221 | 222 | def devolve(data): 223 | """Recursively convert to just JSON-compatible types.""" 224 | # TODO: possible infinite recursion 225 | is_string = isinstance(data, str) 226 | is_iterable = isinstance(data, collections.Iterable) 227 | is_mapping = isinstance(data, collections.Mapping) 228 | is_record = isinstance(data, Record) 229 | if is_record: 230 | result = devolve(data.__dict__) 231 | elif is_mapping: 232 | result = {k: devolve(v) for k, v in data.items()} 233 | elif is_iterable and not is_string: 234 | result = [devolve(it) for it in data] 235 | elif hasattr(data, '__dict__'): 236 | result = data.__dict__ 237 | else: 238 | result = data 239 | return result 240 | 241 | 242 | def multiplicities(iterable): 243 | """Count the number of singletons, the number of duplicates, etc. 244 | Returns a collections.Counter instance.""" 245 | return collections.Counter(collections.Counter(iterable).values()) 246 | 247 | 248 | def val_or_none_key(getter_fcn): 249 | """Wraps getter_fcn, returning a key that is a tuple of (0 or 1, val) where 250 | val=getter_fcn(obj), and the int is 0 if val is None.""" 251 | def result_key_fcn(obj): 252 | val = getter_fcn(obj) 253 | n = 0 if val is None else 1 254 | return n, val 255 | return result_key_fcn 256 | 257 | 258 | def count(iterable, n=None, 259 | primary_reverse=True, 260 | secondary_reverse=False, 261 | primary_key=operator.itemgetter(1), 262 | secondary_key=val_or_none_key(operator.itemgetter(0)) 263 | ): 264 | """Wraps collections.Counter. Counts, sorts the result, and takes the 265 | first n. The primary sorting criteria is the count; the secondary sorting 266 | criteria is the value. 
The default sort is descending by count and 267 | ascending by value.""" 268 | result = sorted(collections.Counter(iterable).items(), 269 | key=secondary_key, reverse=secondary_reverse) 270 | result.sort(key=primary_key, reverse=primary_reverse) 271 | return result[:n] 272 | 273 | 274 | class MinMax(object): 275 | def __init__(self, min_start=None, max_start=None, count_start=0): 276 | self.count = count_start 277 | self.min = min_start 278 | self.max = max_start 279 | 280 | def add(self, value): 281 | self.count += 1 282 | if self.min is None or self.min > value: 283 | self.min = value 284 | if self.max is None or self.max < value: 285 | self.max = value 286 | 287 | def __repr__(self): 288 | return '%s(%r, %r)' % (self.__class__.__name__, self.min, self.max) 289 | 290 | 291 | def slice_by_value(sequence, start=None, end=None, step=1): 292 | """Returns the earliest slice of the sequence bounded by the 293 | start and end values. Omitted optional parameters work as expected 294 | for slicing. 295 | 296 | slice_by_value('hello there world', 'o', 'w', 2) -> 'otee' 297 | """ 298 | i_start = i_end = None 299 | if start is not None: 300 | i_start = sequence.index(start) 301 | if end is not None: 302 | i_end = sequence.index(end) 303 | return sequence[i_start:i_end:step] 304 | 305 | 306 | def update_subset(record, fields, *source_records, **kwds): 307 | """Given a destination record, a sequence of fields, and source 308 | for each field, copy over the first value found in the source records. 309 | The argument for fields must be an iterable where each item is either a 310 | string or a pair of strings. If it is a pair of strings, they name 311 | the destination and source field names. If keyword argument "required" 312 | is True and any of the fields are missing from the source records, 313 | then a KeyError is raised.""" 314 | required = kwds.pop('required', True) 315 | assert not kwds, 'Only "required" keyword supported' 316 | for field in fields: 317 | if isinstance(field, str): 318 | dst_name = src_name = field 319 | else: 320 | dst_name, src_name = field 321 | assert isinstance(dst_name, str) 322 | assert isinstance(src_name, str) 323 | value = fetch(src_name, *source_records, required=required) 324 | # TODO: assert value? 325 | if value is not None: 326 | setattr(record, dst_name, value) 327 | 328 | 329 | def fetch(field, *source_records, **kwds): 330 | """Return the value from the first record in the arguments that 331 | contains the specified field. If no record in the chain contains 332 | that field, return the default value. The default value is specified 333 | by the "default" keyword argument or None. If keyword argument 334 | "required" is True and any of the fields are missing from the source 335 | records, then a KeyError is raised.""" 336 | default = kwds.pop('default', None) 337 | required = kwds.pop('required', False) 338 | assert not kwds, 'Only "default" and "required" keyword supported' 339 | for record in source_records: 340 | if hasattr(record, field): 341 | return getattr(record, field) 342 | # Must use default. 343 | if required: 344 | raise KeyError(field) 345 | return default 346 | 347 | 348 | def replace_fields(field_list, *pairs): 349 | """Given a list of field names and one or more pairs, 350 | replace each item named in a pair by the pair. 
351 | 352 | fl = 'one two three'.split() 353 | replace_fields(fl, ('two', 'spam')) 354 | # ['one', ('two', 'spam'), 'three'] 355 | """ 356 | result = list(field_list) 357 | for field_name, source in pairs: 358 | index = field_list.index(field_name) 359 | result[index] = field_name, source 360 | return result 361 | 362 | 363 | def rekey_map(mapping, replacements): 364 | """Given an iterable of destination/source pairs in replacements, 365 | create a new dict that is the same as the original except for the 366 | new key names.""" 367 | result = dict(mapping) 368 | for dst, src in replacements: 369 | value = result[src] 370 | result[dst] = value 371 | del result[src] 372 | return result 373 | 374 | 375 | class TsvDialect(csv.Dialect): 376 | """Standard Unix-style TSV format. 377 | Also compatible with MAGE-TAB spec v1.0. 378 | See MAGE-TABv1.0.pdf section 3.1.6 379 | http://www.mged.org/mage-tab/MAGE-TABv1.0.pdf 380 | http://www.mged.org/mage-tab/""" 381 | delimiter = '\t' 382 | doublequote = False 383 | escapechar = '\\' 384 | lineterminator = '\n' 385 | quotechar = '"' 386 | quoting = csv.QUOTE_MINIMAL 387 | skipinitialspace = False 388 | -------------------------------------------------------------------------------- /scripts_from_the_past/wh_lib.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import csv 3 | import glob 4 | import itertools 5 | import locale 6 | import operator 7 | import os 8 | import pprint 9 | import re 10 | import sys 11 | 12 | try: 13 | import openpyxl 14 | except ImportError: 15 | pass # optional dependency, only used when parsing .xlsx files 16 | try: 17 | import xlrd 18 | except ImportError: 19 | pass # optional dependency, only used when parsing .xls files 20 | import yaml 21 | 22 | 23 | locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') 24 | 25 | 26 | def select(collection, *items): 27 | return operator.itemgetter(*items)(collection) 28 | 29 | 30 | def print_table(iterable): 31 | """Print out the finite input as a table. Each item in iterable must be an 32 | iterable with roughly the same number of items.""" 33 | # Slurp the entire iterable. 34 | rows = list(iterable) 35 | # Compute column widths. 36 | col_widths = [] 37 | for row in rows: 38 | for col_num, col_val in enumerate(row): 39 | col_len = len(str(col_val)) 40 | if col_num < len(col_widths): 41 | col_widths[col_num] = max(col_widths[col_num], col_len) 42 | else: 43 | col_widths.append(col_len) 44 | # Format output. 45 | for row in rows: 46 | # Output all but last column in padded format. 47 | for col_val, col_width in list(zip(row, col_widths))[:-1]: 48 | col_str = str(col_val) 49 | if isinstance(col_val, int) and not isinstance(col_val, bool): 50 | sys.stdout.write(col_str.rjust(col_width)) 51 | else: 52 | sys.stdout.write(col_str.ljust(col_width)) 53 | sys.stdout.write(' | ') 54 | # Output the last column as-is. 55 | sys.stdout.write(str(row[-1])) 56 | # Add the newline. 
57 |         sys.stdout.write('\n')
58 | 
59 | 
60 | def yp(data, stream=sys.stdout):
61 |     """Pretty print as YAML."""
62 |     yd(data, stream)
63 | 
64 | 
65 | def yf(data):
66 |     """Format as pretty YAML"""
67 |     return yd(data)
68 | 
69 | 
70 | def yd(data, stream=None):
71 |     return yaml.safe_dump(devolve(data), stream, default_flow_style=False)
72 |     # TODO: devolve is not self-referential safe
73 | 
74 | 
75 | def fetch_tsv_header(file_path):
76 |     with open(file_path) as fin:
77 |         reader = csv.reader(fin, delimiter='\t')
78 |         return next(reader)
79 | 
80 | 
81 | def parse_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
82 |     """cls_or_fcn must construct an object from a dict."""
83 |     return list(iterate_tsv_file(file_path, cls_or_fcn, fieldnames))
84 | 
85 | 
86 | def iterate_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
87 |     """cls_or_fcn must construct an object from a dict."""
88 |     with open(file_path) as fin:
89 |         for r in iterate_tsv_stream(fin, cls_or_fcn, fieldnames):
90 |             yield r
91 | 
92 | 
93 | def iterate_tsv_stream(stream, cls_or_fcn=None, fieldnames=None):
94 |     """cls_or_fcn must construct an object from a dict."""
95 |     cls_or_fcn = cls_or_fcn or Record
96 |     reader = csv.DictReader(stream, fieldnames=fieldnames, delimiter='\t')
97 |     for it in reader:
98 |         yield cls_or_fcn(it)
99 | 
100 | 
101 | def default_filter(record):
102 |     return any(record.values())
103 | 
104 | 
105 | def parse_excel_records(file_path,
106 |                         cls_or_fcn=None,
107 |                         sheet_name=None,
108 |                         filter=default_filter):
109 |     extension = os.path.splitext(file_path)[1]
110 |     if extension == '.xlsx':
111 |         parse_result = parse_xlsx_records(file_path, cls_or_fcn, sheet_name)
112 |     elif extension == '.xls':
113 |         parse_result = parse_xls_records(file_path, cls_or_fcn, sheet_name)
114 |     else:
115 |         raise NotImplementedError(file_path)
116 |     return [record for record in parse_result if filter(record)]
117 | 
118 | 
119 | def parse_xlsx_records(file_path, cls_or_fcn, sheet_name):
120 |     fcn = get_pair_constructor(cls_or_fcn)
121 |     wb = openpyxl.load_workbook(file_path, data_only=True)
122 |     if sheet_name:
123 |         ws = wb.get_sheet_by_name(sheet_name)
124 |     else:
125 |         ws = wb.worksheets[0]
126 |     rows = ws.rows
127 |     header = tuple(normalize_field_name(c.value) for c in rows[0])
128 |     return [fcn(header, (c.value for c in row)) for row in rows[1:]]
129 | 
130 | 
131 | def parse_xls_records(file_path, cls_or_fcn, sheet_name):
132 |     # TODO: use sheet_name
133 |     fcn = get_pair_constructor(cls_or_fcn)
134 |     wb = xlrd.open_workbook(file_path)
135 |     ws = wb.sheet_by_index(0)
136 |     header = tuple(normalize_field_name(c) for c in ws.row_values(0))
137 |     return [fcn(header, ws.row_values(i)) for i in range(1, ws.nrows)]
138 | 
139 | 
140 | def get_pair_constructor(cls_or_fcn):
141 |     """Return a callable that constructs an object from a header, data pair."""
142 |     if not cls_or_fcn:
143 |         return Record.from_pair
144 |     elif isinstance(cls_or_fcn, type):
145 |         return cls_or_fcn.from_pair
146 |     else:
147 |         return cls_or_fcn
148 | 
149 | 
150 | class Record(collections.MutableMapping):
151 |     def __init__(self, mapping=None, **kwds):
152 |         if mapping:
153 |             if isinstance(mapping, collections.Mapping):
154 |                 gen = mapping.items()
155 |             else:
156 |                 gen = mapping
157 |             for k, v in gen:
158 |                 self.__dict__[normalize_field_name(k)] = normalize_value(v)
159 |         for k, v in kwds.items():
160 |             self.__dict__[normalize_field_name(k)] = normalize_value(v)
161 | 
162 |     @classmethod
163 |     def from_pair(cls, header, data):
164 |         """Alternate constructor"""
165 |         return 
166 | 
167 |     def __repr__(self):
168 |         return '%s(%r)' % (self.__class__.__name__, self.__dict__)
169 |         # TODO: Should I use self.mapping here?
170 | 
171 |     def __getitem__(self, key):
172 |         return self.mapping[key]
173 | 
174 |     def __setitem__(self, key, value):
175 |         self.mapping[key] = value
176 | 
177 |     def __delitem__(self, key):
178 |         del self.mapping[key]
179 | 
180 |     # TODO: Is this inherited from collections.MutableMapping?
181 |     def __iter__(self):
182 |         return iter(self.mapping)
183 | 
184 |     # TODO: Is this inherited from collections.MutableMapping?
185 |     def __len__(self):
186 |         return len(self.mapping)
187 | 
188 |     @property
189 |     def mapping(self):
190 |         return self.__dict__
191 | 
192 |     @property
193 |     def attributes(self):
194 |         return self.mapping.keys()
195 | 
196 |     def pp(self):
197 |         pprint.pprint(self.to_dict())
198 | 
199 |     def to_dict(self):
200 |         return devolve(self)
201 | 
202 | 
203 | def normalize_field_name(field_name):
204 |     """Lowercase with underscores, etc."""
205 |     result = field_name
206 |     if result.endswith('?'):
207 |         result = result[:-1]
208 |         if not result.startswith('is_'):
209 |             result = 'is_' + result
210 |     result = result.strip().lower().replace(' ', '_').replace(
211 |         '-', '_').replace('/', '_').replace('?', '_').replace('%', 'pct')
212 |     return result
213 | 
214 | 
215 | def normalize_value(value):
216 |     """Convert empty string to None"""
217 |     if value == '':
218 |         value = None
219 |     return value
220 | 
221 | 
222 | def devolve(data):
223 |     """Recursively convert to just JSON-compatible types."""
224 |     # TODO: possible infinite recursion
225 |     is_string = isinstance(data, str)
226 |     is_iterable = isinstance(data, collections.Iterable)
227 |     is_mapping = isinstance(data, collections.Mapping)
228 |     is_record = isinstance(data, Record)
229 |     if is_record:
230 |         result = devolve(data.__dict__)
231 |     elif is_mapping:
232 |         result = {k: devolve(v) for k, v in data.items()}
233 |     elif is_iterable and not is_string:
234 |         result = [devolve(it) for it in data]
235 |     elif hasattr(data, '__dict__'):
236 |         result = data.__dict__
237 |     else:
238 |         result = data
239 |     return result
240 | 
241 | 
242 | def multiplicities(iterable):
243 |     """Count the number of singletons, the number of duplicates, etc.
244 |     Returns a collections.Counter instance."""
245 |     return collections.Counter(collections.Counter(iterable).values())
246 | 
247 | 
248 | def val_or_none_key(getter_fcn):
249 |     """Wraps getter_fcn, returning a key that is a tuple of (0 or 1, val) where
250 |     val=getter_fcn(obj), and the int is 0 if val is None."""
251 |     def result_key_fcn(obj):
252 |         val = getter_fcn(obj)
253 |         n = 0 if val is None else 1
254 |         return n, val
255 |     return result_key_fcn
256 | 
257 | 
258 | def count(iterable, n=None,
259 |           primary_reverse=True,
260 |           secondary_reverse=False,
261 |           primary_key=operator.itemgetter(1),
262 |           secondary_key=val_or_none_key(operator.itemgetter(0))
263 |           ):
264 |     """Wraps collections.Counter. Counts, sorts the result, and takes the
265 |     first n. The primary sorting criterion is the count; the secondary sorting
266 |     criterion is the value. The default sort is descending by count and
267 |     ascending by value."""
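    # For example, with the default keys (most frequent first, ties broken
    # by ascending value):
    #   count('abracadabra') -> [('a', 5), ('b', 2), ('r', 2), ('c', 1), ('d', 1)]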
268 |     result = sorted(collections.Counter(iterable).items(),
269 |                     key=secondary_key, reverse=secondary_reverse)
270 |     result.sort(key=primary_key, reverse=primary_reverse)
271 |     return result[:n]
272 | 
273 | 
274 | class MinMax(object):
275 |     def __init__(self, min_start=None, max_start=None, count_start=0):
276 |         self.count = count_start
277 |         self.min = min_start
278 |         self.max = max_start
279 | 
280 |     def add(self, value):
281 |         self.count += 1
282 |         if self.min is None or self.min > value:
283 |             self.min = value
284 |         if self.max is None or self.max < value:
285 |             self.max = value
286 | 
287 |     def __repr__(self):
288 |         return '%s(%r, %r)' % (self.__class__.__name__, self.min, self.max)
289 | 
290 | 
291 | def slice_by_value(sequence, start=None, end=None, step=1):
292 |     """Returns the earliest slice of the sequence bounded by the
293 |     start and end values. Omitted optional parameters work as expected
294 |     for slicing.
295 | 
296 |     slice_by_value('hello there world', 'o', 'w', 2) -> 'otee'
297 |     """
298 |     i_start = i_end = None
299 |     if start is not None:
300 |         i_start = sequence.index(start)
301 |     if end is not None:
302 |         i_end = sequence.index(end)
303 |     return sequence[i_start:i_end:step]
304 | 
305 | 
306 | def update_subset(record, fields, *source_records, **kwds):
307 |     """Given a destination record, a sequence of fields, and one or more
308 |     source records, copy over the first value found in the source records.
309 |     The argument for fields must be an iterable where each item is either a
310 |     string or a pair of strings. If it is a pair of strings, they name
311 |     the destination and source field names. If keyword argument "required"
312 |     is True and any of the fields are missing from the source records,
313 |     then a KeyError is raised."""
314 |     required = kwds.pop('required', True)
315 |     assert not kwds, 'Only "required" keyword supported'
316 |     for field in fields:
317 |         if isinstance(field, str):
318 |             dst_name = src_name = field
319 |         else:
320 |             dst_name, src_name = field
321 |         assert isinstance(dst_name, str)
322 |         assert isinstance(src_name, str)
323 |         value = fetch(src_name, *source_records, required=required)
324 |         # TODO: assert value?
325 |         if value is not None:
326 |             setattr(record, dst_name, value)
327 | 
328 | 
329 | def fetch(field, *source_records, **kwds):
330 |     """Return the value from the first record in the arguments that
331 |     contains the specified field. If no record in the chain contains
332 |     that field, return the default value. The default value is specified
333 |     by the "default" keyword argument or None. If keyword argument
334 |     "required" is True and the field is missing from all of the source
335 |     records, then a KeyError is raised."""
336 |     default = kwds.pop('default', None)
337 |     required = kwds.pop('required', False)
338 |     assert not kwds, 'Only "default" and "required" keywords supported'
339 |     for record in source_records:
340 |         if hasattr(record, field):
341 |             return getattr(record, field)
342 |     # Must use default.
343 |     if required:
344 |         raise KeyError(field)
345 |     return default
346 | 
347 | 
348 | def replace_fields(field_list, *pairs):
349 |     """Given a list of field names and one or more pairs,
350 |     replace each item named in a pair by the pair.
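    Raises ValueError (from list.index) if a pair names a field that is
    not present in field_list.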
351 | 
352 |     fl = 'one two three'.split()
353 |     replace_fields(fl, ('two', 'spam'))
354 |     # ['one', ('two', 'spam'), 'three']
355 |     """
356 |     result = list(field_list)
357 |     for field_name, source in pairs:
358 |         index = field_list.index(field_name)
359 |         result[index] = field_name, source
360 |     return result
361 | 
362 | 
363 | def rekey_map(mapping, replacements):
364 |     """Given an iterable of destination/source pairs in replacements,
365 |     create a new dict that is the same as the original except for the
366 |     new key names."""
367 |     result = dict(mapping)
368 |     for dst, src in replacements:
369 |         value = result[src]
370 |         result[dst] = value
371 |         del result[src]
372 |     return result
373 | 
374 | 
375 | class TsvDialect(csv.Dialect):
376 |     """Standard Unix-style TSV format.
377 |     Also compatible with MAGE-TAB spec v1.0.
378 |     See MAGE-TABv1.0.pdf section 3.1.6
379 |     http://www.mged.org/mage-tab/MAGE-TABv1.0.pdf
380 |     http://www.mged.org/mage-tab/"""
381 |     delimiter = '\t'
382 |     doublequote = False
383 |     escapechar = '\\'
384 |     lineterminator = '\n'
385 |     quotechar = '"'
386 |     quoting = csv.QUOTE_MINIMAL
387 |     skipinitialspace = False
388 | 
--------------------------------------------------------------------------------
/multiplex_scripts/wh_lib.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import csv
3 | import glob
4 | import itertools
5 | import locale
6 | import operator
7 | import os
8 | import pprint
9 | import re
10 | import sys
11 | 
12 | try:
13 |     import openpyxl
14 | except ImportError:
15 |     pass # optional dependency, only used when parsing .xlsx files
16 | try:
17 |     import xlrd
18 | except ImportError:
19 |     pass # optional dependency, only used when parsing .xls files
20 | import yaml
21 | 
22 | 
23 | locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
24 | 
25 | 
26 | def select(collection, *items):
27 |     return operator.itemgetter(*items)(collection)
28 | 
29 | 
30 | def print_table(iterable):
31 |     """Print out the finite input as a table. Each item in iterable must be an
32 |     iterable with roughly the same number of items."""
33 |     # Slurp the entire iterable.
34 |     rows = list(iterable)
35 |     # Compute column widths.
36 |     col_widths = []
37 |     for row in rows:
38 |         for col_num, col_val in enumerate(row):
39 |             col_len = len(str(col_val))
40 |             if col_num < len(col_widths):
41 |                 col_widths[col_num] = max(col_widths[col_num], col_len)
42 |             else:
43 |                 col_widths.append(col_len)
44 |     # Format output.
45 |     for row in rows:
46 |         # Output all but last column in padded format.
47 |         for col_val, col_width in list(zip(row, col_widths))[:-1]:
48 |             col_str = str(col_val)
49 |             if isinstance(col_val, int) and not isinstance(col_val, bool):
50 |                 sys.stdout.write(col_str.rjust(col_width))
51 |             else:
52 |                 sys.stdout.write(col_str.ljust(col_width))
53 |             sys.stdout.write(' | ')
54 |         # Output the last column as-is.
55 |         sys.stdout.write(str(row[-1]))
56 |         # Add the newline.
57 |         sys.stdout.write('\n')
58 | 
59 | 
60 | def yp(data, stream=sys.stdout):
61 |     """Pretty print as YAML."""
62 |     yd(data, stream)
63 | 
64 | 
65 | def yf(data):
66 |     """Format as pretty YAML."""
67 |     return yd(data)
68 | 
69 | 
70 | def yd(data, stream=None):
71 |     return yaml.safe_dump(devolve(data), stream, default_flow_style=False)
72 |     # TODO: devolve is not self-referential safe
73 | 
74 | 
75 | def fetch_tsv_header(file_path):
76 |     with open(file_path) as fin:
77 |         reader = csv.reader(fin, delimiter='\t')
78 |         return next(reader)
79 | 
80 | 
81 | def parse_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
82 |     """cls_or_fcn must construct an object from a dict."""
83 |     return list(iterate_tsv_file(file_path, cls_or_fcn, fieldnames))
84 | 
85 | 
86 | def iterate_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
87 |     """cls_or_fcn must construct an object from a dict."""
88 |     with open(file_path) as fin:
89 |         for r in iterate_tsv_stream(fin, cls_or_fcn, fieldnames):
90 |             yield r
91 | 
92 | 
93 | def iterate_tsv_stream(stream, cls_or_fcn=None, fieldnames=None):
94 |     """cls_or_fcn must construct an object from a dict."""
95 |     cls_or_fcn = cls_or_fcn or Record
96 |     reader = csv.DictReader(stream, fieldnames=fieldnames, delimiter='\t')
97 |     for it in reader:
98 |         yield cls_or_fcn(it)
99 | 
100 | 
101 | def default_filter(record):
102 |     return any(record.values())
103 | 
104 | 
105 | def parse_excel_records(file_path,
106 |                         cls_or_fcn=None,
107 |                         sheet_name=None,
108 |                         filter=default_filter):
109 |     extension = os.path.splitext(file_path)[1]
110 |     if extension == '.xlsx':
111 |         parse_result = parse_xlsx_records(file_path, cls_or_fcn, sheet_name)
112 |     elif extension == '.xls':
113 |         parse_result = parse_xls_records(file_path, cls_or_fcn, sheet_name)
114 |     #elif extension == '.tsv':
115 |     #    parse_result = parse_tsv_file(file_path, cls_or_fcn,fieldnames)
116 |     else:
117 |         raise NotImplementedError(file_path)
118 |     return [record for record in parse_result if filter(record)]
119 | 
120 | 
121 | def parse_xlsx_records(file_path, cls_or_fcn, sheet_name):
122 |     fcn = get_pair_constructor(cls_or_fcn)
123 |     wb = openpyxl.load_workbook(file_path, data_only=True)
124 |     if sheet_name:
125 |         ws = wb.get_sheet_by_name(sheet_name)
126 |     else:
127 |         ws = wb.worksheets[0]
128 |     rows = iter(ws.rows)
129 |     header = tuple(normalize_field_name(c.value) for c in next(rows))
130 |     return [fcn(header, (c.value for c in row)) for row in rows]
131 | 
132 | 
133 | def parse_xls_records(file_path, cls_or_fcn, sheet_name):
134 |     # TODO: use sheet_name
135 |     fcn = get_pair_constructor(cls_or_fcn)
136 |     wb = xlrd.open_workbook(file_path)
137 |     ws = wb.sheet_by_index(0)
138 |     header = tuple(normalize_field_name(c) for c in ws.row_values(0))
139 |     return [fcn(header, ws.row_values(i)) for i in range(1, ws.nrows)]
140 | 
141 | 
142 | def get_pair_constructor(cls_or_fcn):
143 |     """Return a callable that constructs an object from a header, data pair."""
144 |     if not cls_or_fcn:
145 |         return Record.from_pair
146 |     elif isinstance(cls_or_fcn, type):
147 |         return cls_or_fcn.from_pair
148 |     else:
149 |         return cls_or_fcn
150 | 
151 | 
152 | class Record(collections.MutableMapping):
153 |     def __init__(self, mapping=None, **kwds):
154 |         if mapping:
155 |             if isinstance(mapping, collections.Mapping):
156 |                 gen = mapping.items()
157 |             else:
158 |                 gen = mapping
159 |             for k, v in gen:
160 |                 self.__dict__[normalize_field_name(k)] = normalize_value(v)
161 |         for k, v in kwds.items():
162 |             self.__dict__[normalize_field_name(k)] = normalize_value(v)
163 | 
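    # Example with illustrative values:
    #   Record.from_pair(('Sample ID', 'Flowcell'), ('NA12878', 'H7YRLADXX'))
    # builds a Record with attributes .sample_id and .flowcell: header names
    # are normalized by normalize_field_name, and empty-string values become
    # None via normalize_value.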
164 |     @classmethod
165 |     def from_pair(cls, header, data):
166 |         """Alternate constructor"""
167 |         return cls(zip(header, data))
168 | 
169 |     def __repr__(self):
170 |         return '%s(%r)' % (self.__class__.__name__, self.__dict__)
171 |         # TODO: Should I use self.mapping here?
172 | 
173 |     def __getitem__(self, key):
174 |         return self.mapping[key]
175 | 
176 |     def __setitem__(self, key, value):
177 |         self.mapping[key] = value
178 | 
179 |     def __delitem__(self, key):
180 |         del self.mapping[key]
181 | 
182 |     # TODO: Is this inherited from collections.MutableMapping?
183 |     def __iter__(self):
184 |         return iter(self.mapping)
185 | 
186 |     # TODO: Is this inherited from collections.MutableMapping?
187 |     def __len__(self):
188 |         return len(self.mapping)
189 | 
190 |     @property
191 |     def mapping(self):
192 |         return self.__dict__
193 | 
194 |     @property
195 |     def attributes(self):
196 |         return self.mapping.keys()
197 | 
198 |     def pp(self):
199 |         pprint.pprint(self.to_dict())
200 | 
201 |     def to_dict(self):
202 |         return devolve(self)
203 | 
204 | 
205 | def normalize_field_name(field_name):
206 |     """Lowercase with underscores, etc."""
207 |     result = field_name
208 |     if result.endswith('?'):
209 |         result = result[:-1]
210 |         if not result.startswith('is_'):
211 |             result = 'is_' + result
212 |     result = result.strip().lower().replace(' ', '_').replace(
213 |         '-', '_').replace('/', '_').replace('?', '_').replace('%', 'pct')
214 |     return result
215 | 
216 | 
217 | def normalize_value(value):
218 |     """Convert empty string to None"""
219 |     if value == '':
220 |         value = None
221 |     return value
222 | 
223 | 
224 | def devolve(data):
225 |     """Recursively convert to just JSON-compatible types."""
226 |     # TODO: possible infinite recursion
227 |     is_string = isinstance(data, str)
228 |     is_iterable = isinstance(data, collections.Iterable)
229 |     is_mapping = isinstance(data, collections.Mapping)
230 |     is_record = isinstance(data, Record)
231 |     if is_record:
232 |         result = devolve(data.__dict__)
233 |     elif is_mapping:
234 |         result = {k: devolve(v) for k, v in data.items()}
235 |     elif is_iterable and not is_string:
236 |         result = [devolve(it) for it in data]
237 |     elif hasattr(data, '__dict__'):
238 |         result = data.__dict__
239 |     else:
240 |         result = data
241 |     return result
242 | 
243 | 
244 | def multiplicities(iterable):
245 |     """Count the number of singletons, the number of duplicates, etc.
246 |     Returns a collections.Counter instance."""
247 |     return collections.Counter(collections.Counter(iterable).values())
248 | 
249 | 
250 | def val_or_none_key(getter_fcn):
251 |     """Wraps getter_fcn, returning a key that is a tuple of (0 or 1, val) where
252 |     val=getter_fcn(obj), and the int is 0 if val is None."""
253 |     def result_key_fcn(obj):
254 |         val = getter_fcn(obj)
255 |         n = 0 if val is None else 1
256 |         return n, val
257 |     return result_key_fcn
258 | 
259 | 
260 | def count(iterable, n=None,
261 |           primary_reverse=True,
262 |           secondary_reverse=False,
263 |           primary_key=operator.itemgetter(1),
264 |           secondary_key=val_or_none_key(operator.itemgetter(0))
265 |           ):
266 |     """Wraps collections.Counter. Counts, sorts the result, and takes the
267 |     first n. The primary sorting criterion is the count; the secondary sorting
268 |     criterion is the value. The default sort is descending by count and
269 |     ascending by value."""
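    # For example, with the default keys:
    #   count('mississippi') -> [('i', 4), ('s', 4), ('p', 2), ('m', 1)]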
270 |     result = sorted(collections.Counter(iterable).items(),
271 |                     key=secondary_key, reverse=secondary_reverse)
272 |     result.sort(key=primary_key, reverse=primary_reverse)
273 |     return result[:n]
274 | 
275 | 
276 | class MinMax(object):
277 |     def __init__(self, min_start=None, max_start=None, count_start=0):
278 |         self.count = count_start
279 |         self.min = min_start
280 |         self.max = max_start
281 | 
282 |     def add(self, value):
283 |         self.count += 1
284 |         if self.min is None or self.min > value:
285 |             self.min = value
286 |         if self.max is None or self.max < value:
287 |             self.max = value
288 | 
289 |     def __repr__(self):
290 |         return '%s(%r, %r)' % (self.__class__.__name__, self.min, self.max)
291 | 
292 | 
293 | def slice_by_value(sequence, start=None, end=None, step=1):
294 |     """Returns the earliest slice of the sequence bounded by the
295 |     start and end values. Omitted optional parameters work as expected
296 |     for slicing.
297 | 
298 |     slice_by_value('hello there world', 'o', 'w', 2) -> 'otee'
299 |     """
300 |     i_start = i_end = None
301 |     if start is not None:
302 |         i_start = sequence.index(start)
303 |     if end is not None:
304 |         i_end = sequence.index(end)
305 |     return sequence[i_start:i_end:step]
306 | 
307 | 
308 | def update_subset(record, fields, *source_records, **kwds):
309 |     """Given a destination record, a sequence of fields, and one or more
310 |     source records, copy over the first value found in the source records.
311 |     The argument for fields must be an iterable where each item is either a
312 |     string or a pair of strings. If it is a pair of strings, they name
313 |     the destination and source field names. If keyword argument "required"
314 |     is True and any of the fields are missing from the source records,
315 |     then a KeyError is raised."""
316 |     required = kwds.pop('required', True)
317 |     assert not kwds, 'Only "required" keyword supported'
318 |     for field in fields:
319 |         if isinstance(field, str):
320 |             dst_name = src_name = field
321 |         else:
322 |             dst_name, src_name = field
323 |         assert isinstance(dst_name, str)
324 |         assert isinstance(src_name, str)
325 |         value = fetch(src_name, *source_records, required=required)
326 |         # TODO: assert value?
327 |         if value is not None:
328 |             setattr(record, dst_name, value)
329 | 
330 | 
331 | def fetch(field, *source_records, **kwds):
332 |     """Return the value from the first record in the arguments that
333 |     contains the specified field. If no record in the chain contains
334 |     that field, return the default value. The default value is specified
335 |     by the "default" keyword argument or None. If keyword argument
336 |     "required" is True and the field is missing from all of the source
337 |     records, then a KeyError is raised."""
338 |     default = kwds.pop('default', None)
339 |     required = kwds.pop('required', False)
340 |     assert not kwds, 'Only "default" and "required" keywords supported'
341 |     for record in source_records:
342 |         if hasattr(record, field):
343 |             return getattr(record, field)
344 |     # Must use default.
345 |     if required:
346 |         raise KeyError(field)
347 |     return default
348 | 
349 | 
350 | def replace_fields(field_list, *pairs):
351 |     """Given a list of field names and one or more pairs,
352 |     replace each item named in a pair by the pair.
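    Raises ValueError (from list.index) if a pair names a field that is
    not present in field_list.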
353 | 
354 |     fl = 'one two three'.split()
355 |     replace_fields(fl, ('two', 'spam'))
356 |     # ['one', ('two', 'spam'), 'three']
357 |     """
358 |     result = list(field_list)
359 |     for field_name, source in pairs:
360 |         index = field_list.index(field_name)
361 |         result[index] = field_name, source
362 |     return result
363 | 
364 | 
365 | def rekey_map(mapping, replacements):
366 |     """Given an iterable of destination/source pairs in replacements,
367 |     create a new dict that is the same as the original except for the
368 |     new key names."""
369 |     result = dict(mapping)
370 |     for dst, src in replacements:
371 |         value = result[src]
372 |         result[dst] = value
373 |         del result[src]
374 |     return result
375 | 
376 | 
377 | class TsvDialect(csv.Dialect):
378 |     """Standard Unix-style TSV format.
379 |     Also compatible with MAGE-TAB spec v1.0.
380 |     See MAGE-TABv1.0.pdf section 3.1.6
381 |     http://www.mged.org/mage-tab/MAGE-TABv1.0.pdf
382 |     http://www.mged.org/mage-tab/"""
383 |     delimiter = '\t'
384 |     doublequote = False
385 |     escapechar = '\\'
386 |     lineterminator = '\n'
387 |     quotechar = '"'
388 |     quoting = csv.QUOTE_MINIMAL
389 |     skipinitialspace = False
390 | 
--------------------------------------------------------------------------------
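A minimal usage sketch for wh_lib (not part of the repo; the worklist file name and its sample_id column are illustrative assumptions):

```python
from wh_lib import count, parse_tsv_file, print_table

# Each TSV row becomes a Record; header names are normalized, so a
# "Sample ID" column is exposed as the attribute sample_id.
records = parse_tsv_file('batch01_worklist.tsv')  # hypothetical file

# Tabulate the ten most frequent sample IDs, most duplicated first.
print_table(count((r.sample_id for r in records), 10))
```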