├── README.md
├── scripts_from_the_past
│   ├── named-md5
│   ├── check-topmed-jobs
│   ├── topmed_md5_prep.sh
│   ├── submit-md5-jobs
│   ├── generate-topmed-md5-worklist
│   ├── generate-topmed-copy-script
│   └── wh_lib.py
├── original_steps
│   ├── md5-steps.sh
│   ├── validation-steps.sh
│   └── steps.sh
├── docs
│   ├── alternate_login_with_sug-app1.md
│   └── topmed_automation.md
├── LICENSE
├── multiplex_scripts
│   ├── topmed_md5_prep_shep.sh
│   ├── submit-md5-jobs
│   ├── submit-cram-validation
│   ├── generate-topmed-md5-worklist-tsv
│   ├── generate-topmed-copy-script-tsv
│   ├── wh_lib.py.bk01
│   └── wh_lib.py
├── topmed.sh
└── topmed_automation.sh
/README.md:
--------------------------------------------------------------------------------
1 | # topmed-tools
2 | *For use by the Submissions Team*
3 | A collection of scripts and steps for sharing data (initially used as part of the TOPMed submissions pipeline).
4 |
5 | **Requires lots of love**
6 |
--------------------------------------------------------------------------------
/scripts_from_the_past/named-md5:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Create a .md5 file with the specified name for the source file.
4 |
5 | SRC="$1"
6 | DST="$2"
7 |
8 | DST_DIR=$(dirname "$DST")
9 | DST_NAME=$(basename "$DST")
10 |
11 | mkdir -p "$DST_DIR"
12 | md5sum <"$SRC" | sed "s/-/$DST_NAME/" >"$DST".md5
13 |
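14 | # Usage sketch (hypothetical paths, borrowed from the example batch files elsewhere
15 | # in this repo):
16 | #   named-md5 /data/HK2HCCCXX-1.hgv.bam /dest/NWD414772-HK2HCCCXX-1.hgv.bam
17 | # writes /dest/NWD414772-HK2HCCCXX-1.hgv.bam.md5 containing
18 | # "<md5>  NWD414772-HK2HCCCXX-1.hgv.bam".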
--------------------------------------------------------------------------------
/scripts_from_the_past/check-topmed-jobs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | batch_name="$1"
4 |
5 | cut -f2 "$batch_name"_md5_jobs.txt |
6 | xargs -n1 checkjob -v >"$batch_name".checkjob
7 |
8 | (
9 | echo ''
10 | cut -f2 "$batch_name"_md5_jobs.txt | xargs -n1 checkjob -v --xml
11 | echo
12 | echo ''
13 | ) >"$batch_name".checkjob.xml
14 |
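15 | # Usage sketch (hypothetical batch): check-topmed-jobs Sarcoidosis_batch02_2016-02-05
16 | # Expects Sarcoidosis_batch02_2016-02-05_md5_jobs.txt (written by submit-md5-jobs) in the cwd.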
--------------------------------------------------------------------------------
/original_steps/md5-steps.sh:
--------------------------------------------------------------------------------
1 | cd /stornext/snfs1/submissions/topmed
2 |
3 | # basename $(ls -d topmed-shared/batches/*_batch??_????-??-?? | tail -n 1)
4 | batch_name=$(ls -t topmed-shared/batches/globus/ | grep '_batch.._....-..-..' | head -n1)
5 | echo $batch_name
6 | mkdir -p md5-batches/"$batch_name"
7 | ls -l md5-batches/
8 | echo submit-md5-jobs topmed-shared/batches/globus/"$batch_name"/"$batch_name"_md5 md5-batches/"$batch_name"
9 | submit-md5-jobs topmed-shared/batches/globus/"$batch_name"/"$batch_name"_md5 md5-batches/"$batch_name"
10 |
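11 | # Example (hypothetical): batch_name=Sarcoidosis_batch02_2016-02-05, taken from the
12 | # newest directory matching *_batch??_????-??-?? under topmed-shared/batches/globus/.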
--------------------------------------------------------------------------------
/docs/alternate_login_with_sug-app1.md:
--------------------------------------------------------------------------------
1 | # Using the App node for the cram-processing pipeline
2 |
3 | **The current practice is to use the login nodes to carry out the steps in the cram-processing pipeline.
4 | Automation of cram-processing-tools reflects this practice.**
5 |
6 | ## Demonstration of how to use an app-node as an alternative to the login-node.
7 |
8 | - Login nodes are usually used for data transfer to and from a laptop
9 | - Example: `rsync my_directory sug-login1.hgsc.bcm.edu:/groups/submissions/users/person`
10 |
11 | - On the laptop, this script...:
12 | ```
13 | (main) bin$ cat sa1
14 | #!/usr/bin/env bash
15 |
16 | login_host=${DEFAULT_SUG_LOGIN:-$s4}
17 |
18 | ssh -t $login_host ssh sug-app1 "$@"
19 | ```
20 | - ...is equivalent to:
21 | ```
22 | ssh -t sug-login4.hgsc.bcm.edu ssh sug-app1
23 | ```
24 |
25 | - The -t on the outer ssh says:
26 | - Force pseudo-terminal allocation on the connection to the login host, so that the inner (interactive) ssh runs with a terminal and can allocate the one pseudo-terminal on sug-app1
27 |
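28 | - Example invocation from the laptop (hypothetical output):
29 | ```
30 | $ sa1 hostname
31 | sug-app1
32 | ```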
--------------------------------------------------------------------------------
/original_steps/validation-steps.sh:
--------------------------------------------------------------------------------
1 | cd /stornext/snfs1/submissions/topmed
2 |
3 | # batch_name=$(
4 | # basename $(ls -d topmed-shared/batches/*_batch??_????-??-?? | tail -n 1)
5 | # )
6 | batch_name=$(ls -t topmed-shared/batches/globus/ | grep '_batch.._....-..-..' | head -n1)
7 | echo $batch_name
8 |
9 | mkdir -p validation-batches/${batch_name}/input
10 | cat topmed-shared/batches/globus/${batch_name}/${batch_name}_md5 |
11 | sed 's/^msub-md5/msub-val/' |
12 | tee validation-batches/${batch_name}/${batch_name}_val |
13 | tail
14 |
15 | pushd validation-batches/${batch_name}/input/
16 |
17 | # Creates symlinks to all the bams in the input directory: the first pass (ECHO set) only echoes the ln commands; the second pass actually runs them
18 | ECHO=echo
19 | msub-val() { $ECHO ln -s "$2" "$1"; }
20 | . ../${batch_name}_val
21 | unset ECHO
22 | . ../${batch_name}_val
23 |
24 | cd ..
25 | ls input/NWD* | head -n5 | xargs -n1 echo submit-cram-validation run_a
26 | # ls input/NWD* | head -n5 | xargs -n1 submit-validation run_a
27 | # ls input/NWD* | tail -n+6 | xargs -n1 submit-validation run_b
28 | ls input/NWD* | xargs -n1 submit-cram-validation run_a proj-dm0019
29 |
30 | popd
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 BCM-HGSC Submissions
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/multiplex_scripts/topmed_md5_prep_shep.sh:
--------------------------------------------------------------------------------
1 | batch_name="$1"
2 | samples="$2"
3 | batch_type="$3"
4 | if ((`find . -name '*.out' -size 0 | xargs ls | wc -l` == $samples)); then
5 | echo "$samples MD5_s were generated"
6 | echo `find . -name '*.out' -size 0 | xargs rm`
7 | fi
8 | if (( `(find . -name '*.out' | wc -l )` == 0 )); then
9 | echo "All outs have been erased will ensure Manifest contains correct number of samples"
10 | fi
11 | if (( `cat *md5 | tee Manifest.txt | wc -l ` == $samples)); then
12 | echo "Manifest has been created succeffully, will now copy to designated area"
13 | echo `cat *md5 | tee Manifest.txt | wc -l - ../${batch_name}_md5`
14 | #echo `cp -p Manifest.txt /stornext/snfs1/submissions/topmed/topmed-shared/batches/globus/$batch_name `
15 | echo cp Manifest.txt ../
16 | echo `scp Manifest.txt christis@hgsc-aspera1.hgsc.bcm.edu:/share/share/globusupload/submissions/$batch_type/$batch_name/`
17 | echo "Manifest has been copied, have a good day"
18 | else
19 | echo " ERROR. There are below 100 MD5s..."
20 |
21 | fi
22 | # Known issue: the Manifest is created, but the copy does not land in the intended location
23 |
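24 | # Usage sketch (run inside the md5/ output directory; arguments as in topmed.sh step 13):
25 | #   ../topmed_md5_prep_shep.sh <batch_name> <no. of samples> <Cohort_name>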
--------------------------------------------------------------------------------
/scripts_from_the_past/topmed_md5_prep.sh:
--------------------------------------------------------------------------------
1 | batch_name="$1"
2 | samples="$2"
3 | batch_type="$3"
4 | if ((`find . -name '*.out' -size 0 | xargs ls | wc -l` == $samples)); then
5 | echo "$samples MD5_s were generated"
6 | echo `find . -name '*.out' -size 0 | xargs rm`
7 | fi
8 | if (( `(find . -name '*.out' | wc -l )` == 0 )); then
9 | echo "All outs have been erased will ensure Manifest contains correct number of samples"
10 | fi
11 | if (( `cat *md5 | tee Manifest.txt | wc -l ` == $samples)); then
12 | echo "Manifest has been created succeffully, will now copy to designated area"
13 | echo `cat *md5 | tee Manifest.txt | wc -l - /stornext/snfs1/submissions/topmed/topmed-shared/batches/globus/$batch_name/${batch_name}_md5`
14 | echo `cp -p Manifest.txt /stornext/snfs1/submissions/topmed/topmed-shared/batches/globus/$batch_name `
15 | echo `scp Manifest.txt christis@hgsc-aspera1.hgsc.bcm.edu:/share/share/globusupload/submissions/$batch_type/$batch_name/`
16 | echo "Manifest has been copied, have a good day"
17 | else
18 | echo " ERROR. There are below 100 MD5s..."
19 |
20 | fi
21 | # Known issue: the Manifest is created, but the copy does not land in the intended location
22 |
--------------------------------------------------------------------------------
/multiplex_scripts/submit-md5-jobs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | BATCH_FILE_PATH="$1" # Example: Sarcoidosis_batch02_2016-02-05_md5_a
4 | DST="$2" # Directory where all results will go
5 | PROJECT_CODE="$3"
6 | # Example batch file:
7 | # msub-md5 'NWD414772-HK2HCCCXX-1.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-1/HK2HCCCXX-1.hgv.bam'
8 | # msub-md5 'NWD520478-HK2HCCCXX-2.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-2/HK2HCCCXX-2.hgv.bam'
9 |
10 | # Can override in $(pwd)/topmed_msub_init.sh or prepend to batch file:
11 | JOB_BATCH_NAME=$(basename "$BATCH_FILE_PATH")
12 | BATCH_SUFFIX=$(echo "$JOB_BATCH_NAME"_ | cut -d_ -f5 | cut -c1)
13 | #PROJECT_CODE=proj-dm0019
14 | QUEUE=normal # or "analysis" for long jobs, "three_hours" for short jobs
15 | RESOURCES=nodes=1:ppn=1,mem=10mb,walltime=35:00:00
16 |
17 | err() { echo "$@" >&2; }
18 |
19 | if [ "$#" -ne 3 ]; then
20 | err 'Usage:'
21 | err "$0 BATCH_FILE DESTINATION_DIR"
22 | err
23 | exit 1
24 | fi
25 |
26 | msub-md5() {
27 | NEW_NAME="$1"
28 | SRC="$2"
29 | command="/hgsc_software/submissions/bin/named-md5 '$SRC' '$NEW_NAME'"
30 | JOB_NAME=md5"$BATCH_SUFFIX"-"$NEW_NAME"
31 | MSUB_OPTS="-q $QUEUE -A $PROJECT_CODE -l $RESOURCES -j oe"
32 | echo $NEW_NAME | tr \\n \\t
33 | echo $command |
34 | msub $MSUB_OPTS -d "$DST" -o "$NEW_NAME".out -N $JOB_NAME |
35 | sed '1s/.sug-moab$//'
36 | }
37 |
38 | # Allow for customization, such as during testing.
39 | if [ -f topmed_msub_init.sh ]; then
40 | err 'loading topmed_msub_init.sh'
41 | . topmed_msub_init.sh
42 | fi
43 |
44 | ##################################################
45 | # Now this script starts doing things to the OS...
46 | ##################################################
47 |
48 | mkdir -p "$DST"
49 | DST="$(cd "$DST"; pwd)"
50 | err DST: $DST
51 |
52 | (. "$BATCH_FILE_PATH") | tee "$DST/$JOB_BATCH_NAME"_jobs.txt | cat -n
53 |
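54 | # Usage sketch (hypothetical names, matching the example batch file above):
55 | #   submit-md5-jobs Sarcoidosis_batch02_2016-02-05_md5_a md5-results proj-dm0019
56 | # Writes md5-results/Sarcoidosis_batch02_2016-02-05_md5_a_jobs.txt listing the job IDs.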
--------------------------------------------------------------------------------
/scripts_from_the_past/submit-md5-jobs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | BATCH_FILE_PATH="$1" # Example: Sarcoidosis_batch02_2016-02-05_md5_a
4 | DST="$2" # Directory where all results will go
5 | PROJECT_CODE="$3" #proj-dm0019
6 | # Example batch file:
7 | # msub-md5 'NWD414772-HK2HCCCXX-1.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-1/HK2HCCCXX-1.hgv.bam'
8 | # msub-md5 'NWD520478-HK2HCCCXX-2.hgv.bam' '/stornext/snfs120/next-gen/Illumina/Instruments/E00212/160109_ST-E00212_0182_AHK2HCCCXX/Results/Project_160109_ST-E00212_0182_AHK2HCCCXX/Sample_HK2HCCCXX-2/HK2HCCCXX-2.hgv.bam'
9 |
10 | # Can override in $(pwd)/topmed_msub_init.sh or prepend to batch file:
11 | JOB_BATCH_NAME=$(basename "$BATCH_FILE_PATH")
12 | BATCH_SUFFIX=$(echo "$JOB_BATCH_NAME"_ | cut -d_ -f5 | cut -c1)
13 | #PROJECT_CODE=proj-dm0019
14 | QUEUE=normal # or "analysis" for long jobs, "three_hours" for short jobs
15 | RESOURCES=nodes=1:ppn=1,mem=10mb,walltime=35:00:00
16 |
17 | err() { echo "$@" >&2; }
18 |
19 | if [ "$#" -ne 3 ]; then
20 | err 'Usage:'
21 | err "$0 BATCH_FILE DESTINATION_DIR proj_code"
22 | err
23 | exit 1
24 | fi
25 |
26 | msub-md5() {
27 | NEW_NAME="$1"
28 | SRC="$2"
29 | command="/hgsc_software/submissions/bin/named-md5 '$SRC' '$NEW_NAME'"
30 | JOB_NAME=md5"$BATCH_SUFFIX"-"$NEW_NAME"
31 | MSUB_OPTS="-q $QUEUE -A $PROJECT_CODE -l $RESOURCES -j oe"
32 | echo $NEW_NAME | tr \\n \\t
33 | echo $command |
34 | msub $MSUB_OPTS -d "$DST" -o "$NEW_NAME".out -N $JOB_NAME |
35 | sed '1s/.sug-moab$//'
36 | }
37 |
38 | # Allow for customization, such as during testing.
39 | if [ -f topmed_msub_init.sh ]; then
40 | err 'loading topmed_msub_init.sh'
41 | . topmed_msub_init.sh
42 | fi
43 |
44 | ##################################################
45 | # Now this script starts doing things to the OS...
46 | ##################################################
47 |
48 | mkdir -p "$DST"
49 | DST="$(cd "$DST"; pwd)"
50 | err DST: $DST
51 |
52 | (. "$BATCH_FILE_PATH") | tee "$DST/$JOB_BATCH_NAME"_jobs.txt | cat -n
53 |
--------------------------------------------------------------------------------
/multiplex_scripts/submit-cram-validation:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | DEFAULT_INIT_SCRIPT='submit-validation-init.sh'
4 | DEFAULT_MEM=15gb
5 | DEFAULT_WALLTIME=24:00:00
6 |
7 | usage() {
8 | err "usage: $(basename "$0") DST_DIR BAM_PATH [PROJECT_CODE] [QUEUE]"
9 | err " Submit validation job."
10 | err " DST_DIR: output & working directory"
11 | err
12 | err "Environment:"
13 | err " SUBMIT_VALIDATION_INIT: default ($DEFAULT_INIT_SCRIPT) sourced at"
14 | err " end of variable initialization"
15 | err " MEM: default ($DEFAULT_MEM)"
16 | err " WALLTIME: default ($DEFAULT_WALLTIME)"
17 | exit 1
18 | } >&2
19 |
20 | err() { echo "$@" >&2; }
21 |
22 | if [[ $# -lt 2 ]] || [[ $# -gt 4 ]]; then
23 | usage
24 | fi
25 |
26 | dst_dir=$(readlink -m "$1")
27 | cram_path=$(python3 -c "import os; print(os.path.abspath('$2'))")
28 | project_code="$3"
29 | #project_code=${3:-proj-dm0019} # TODO: handle default outside of script
30 | queue=${4:-analysis} # for long jobs
31 |
32 | MEM=${MEM:-$DEFAULT_MEM}
33 | WALLTIME=${WALLTIME:-$DEFAULT_WALLTIME}
34 |
35 | JAVA='/hgsc_software/java/jdk1.8.0_74/bin/java'
36 | JAR='/hgsc_software/picard/picard-tools-2.6.0/picard.jar'
37 | VALIDATE_ARGS='ValidateSamFile'
38 | TIME_FMT='cmd: %C\nerr: %x\nsecs: %e\nproc: %P\nkb: %M'
39 | REFERENCE='/stornext/snfs1/submissions/resources/referneces/GRCh38.fa'
40 | resources=nodes=1:ppn=1,mem=$MEM,walltime=$WALLTIME
41 | msub_opts="-q $queue -A $project_code -l $resources"
42 | out_name=$(basename "$cram_path")
43 | job_name="$out_name.val"
44 |
45 | # Allow for customization, such as during testing.
46 | INIT_SCRIPT=${SUBMIT_VALIDATION_INIT:-"$DEFAULT_INIT_SCRIPT"}
47 | if [[ -f "$INIT_SCRIPT" ]]; then
48 | err "loading $INIT_SCRIPT"
49 | . "$INIT_SCRIPT"
50 | fi
51 |
52 | mkdir -p "$dst_dir"
53 |
54 | command="/usr/bin/time -f'$TIME_FMT' -o '$job_name.time' '$JAVA' -jar '$JAR' $VALIDATE_ARGS I='$cram_path' R='$REFERENCE' IGNORE=MISSING_TAG_NM "
55 | echo $command |
56 | msub $msub_opts -N "$job_name" -d "$dst_dir" -o "$job_name".out -e "$job_name".err |
57 | sed '1s/.sug-moab$//' |
58 | tee -a "$dst_dir"/"$job_name".job
59 |
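60 | # Usage sketch (hypothetical paths; MEM and WALLTIME may be overridden via the environment):
61 | #   MEM=20gb submit-cram-validation run_a input/NWD123456.cram proj-dm0019 analysis
62 | # Appends the submitted job ID to run_a/NWD123456.cram.val.job.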
--------------------------------------------------------------------------------
/docs/topmed_automation.md:
--------------------------------------------------------------------------------
1 | # TOPMed Automation
2 |
3 | `topmed_automation` is a bash script that runs the majority of the steps for cram processing.
4 |
5 | **As of now, the user is still responsible for:**
6 | - Checking if aspera has sufficient space.
7 | - Checking validation after it is done.
8 | - Copying the crams over to aspera with the generated copy script.
9 | - Creating the manifest and copying it to aspera with `topmed_md5_prep_shep.sh`.
10 |
11 | ## Setup
12 |
13 | #### Passwordless Entry
14 |
15 | You must have passwordless entry when using `ssh` to a copy node.
16 |
17 | - ssh into the login node. `ssh USERNAME@sug-login#.hgsc.bcm.edu`
18 | - `cd .ssh`. You should have an `id_rsa.pub` file and an `authorized_keys` file.
19 | - Append your public key to the authorized_keys file. `cat id_rsa.pub >> authorized_keys`
20 |
21 | You should no longer need to enter your password when logging in to any of the other nodes.
22 |
23 |
24 | ## How to Run
25 | To run the script type: `./topmed_automation PM_CODE PM_PATH`
26 |
27 | - `PM_CODE` is the project code that is given in the RT. An example code would be `proj-dm0021`.
28 | - `PM_PATH` is the path given to you by the project manager. An example path would look like this: `/hgsc_software/groups/project-managers/tech/metadata/v1/topmed/YR3/cardiomyopathy/01/03a`
29 |
30 |
31 | ## Workflow
32 |
33 | - Check if `shepherd.yaml` exists in the user's `.config` directory (a sample config is shown after this list).
34 | - If it does, proceed.
35 | - If it does not, the `shepherd.yaml` file is created, and then the script proceeds.
36 | - Check if the named tmux sessions `topmed-copy` and `topmed-login` already exist.
37 | - If one does, the user must close it manually, in case an important task is still running.
38 | - If they do not, proceed.
39 | - Run shepherd's `accept_batch` script.
40 | - Generate the copy script with `generate-topmed-copy-script-tsv`
41 | - Generate the md5_worklist script with `generate-topmed-md5-worklist-tsv`
42 | - Create the input directory under `validation/`
43 | - Create symlinks to all the crams in the input directory
44 | - In the `topmed-login` session, submit md5 jobs to the cluster with the `submit-md5-jobs` script.
45 | - In the `topmed-login` session, run validation with the `submit-cram-validation_phase5` script.
46 |
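47 | A minimal `shepherd.yaml`, using the roots hard-coded in `topmed_automation.sh` (adjust for your site):
48 | ```
49 | sub_root: /groups/submissions/metadata/v1/topmed
50 | asp_root: /aspera/share/globusupload/submissions
51 | ```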
--------------------------------------------------------------------------------
/scripts_from_the_past/generate-topmed-md5-worklist:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """Generate the worklist for submitting a bunch of jobs that compute MD5s.
4 | Execute "submit-md5-jobs worklist_file_path" to then submit all the jobs."""
5 |
6 | import argparse
7 | from functools import partial
8 | import logging
9 | import sys
10 |
11 | import wh_lib
12 |
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | def main():
18 | args = parse_args()
19 | config_logging(args)
20 | run(args)
21 | logging.shutdown()
22 |
23 |
24 | def parse_args():
25 | parser = argparse.ArgumentParser(description=__doc__)
26 | parser.add_argument('worklist_file_path')
27 | parser.add_argument('-v', '--verbose', action='store_true')
28 | args = parser.parse_args()
29 | return args
30 |
31 |
32 | def config_logging(args):
33 | global logger
34 | level = logging.DEBUG if args.verbose else logging.INFO
35 | logging.basicConfig(level=level)
36 | logger = logging.getLogger('gen-topmed-md5')
37 |
38 |
39 | def run(args):
40 | logger.debug('args: %r', args)
41 | worklist = wh_lib.parse_excel_records(args.worklist_file_path)
42 | batch_dir, project_dir = extract_batch_info(worklist)
43 | worklist_name = '{}_md5'.format(batch_dir)
44 | new_dir = '{0}/{1}'.format(project_dir, batch_dir)
45 | with open(worklist_name, 'w') as fout:
46 | output_copy_script(fout, new_dir, worklist)
47 |
48 |
49 | def extract_batch_info(worklist):
50 | """Return batch_dir & project_dir, which must be the same for all rows."""
51 | batch_dirs = set(r.batch for r in worklist)
52 | project_dirs = set(r.hgsc_xfer_subdir for r in worklist)
53 | assert len(batch_dirs) == 1, (
54 | 'batch must be the same for all BAMs in worklist'
55 | )
56 | assert len(project_dirs) == 1, (
57 | 'hgsc_xfer_subdir must be the same for all BAMs in worklist'
58 | )
59 | batch_dir = list(batch_dirs)[0]
60 | project_dir = list(project_dirs)[0]
61 | return batch_dir, project_dir
62 |
63 |
64 | def output_copy_script(fout, new_dir, worklist):
65 | pr = partial(print, file=fout)
66 | for r in worklist:
67 | pr("msub-md5 '{}' '{}'".format(r.new_bam_name, r.bam_path))
68 |
69 |
70 | if __name__ == '__main__':
71 | main()
72 |
--------------------------------------------------------------------------------
/multiplex_scripts/generate-topmed-md5-worklist-tsv:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """Generate the worklist for submitting a bunch of jobs that compute MD5s.
4 | Execute "submit-md5-jobs worklist_file_path" to then submit all the jobs."""
5 |
6 | import argparse
7 | from functools import partial
8 | import logging
9 | import sys
10 |
11 | import wh_lib
12 |
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | def main():
18 | args = parse_args()
19 | config_logging(args)
20 | run(args)
21 | logging.shutdown()
22 |
23 |
24 | def parse_args():
25 | parser = argparse.ArgumentParser(description=__doc__)
26 | parser.add_argument('worklist_file_path')
27 | parser.add_argument('-v', '--verbose', action='store_true')
28 | args = parser.parse_args()
29 | return args
30 |
31 |
32 | def config_logging(args):
33 | global logger
34 | level = logging.DEBUG if args.verbose else logging.INFO
35 | logging.basicConfig(level=level)
36 | logger = logging.getLogger('gen-topmed-md5')
37 |
38 |
39 | def run(args):
40 | logger.debug('args: %r', args)
41 | worklist = wh_lib.parse_tsv_file(args.worklist_file_path)
42 | batch_dir, project_dir = extract_batch_info(worklist)
43 | worklist_name = '{}_md5'.format(batch_dir)
44 | new_dir = '{0}/{1}'.format(project_dir, batch_dir)
45 | with open(worklist_name, 'w') as fout:
46 | output_copy_script(fout, new_dir, worklist)
47 |
48 |
49 | def extract_batch_info(worklist):
50 | """Return batch_dir & project_dir, which must be the same for all rows."""
51 | batch_dirs = set(r.batch for r in worklist)
52 | project_dirs = set(r.hgsc_xfer_subdir for r in worklist)
53 | assert len(batch_dirs) == 1, (
54 | 'batch must be the same for all BAMs in worklist'
55 | )
56 | assert len(project_dirs) == 1, (
57 | 'hgsc_xfer_subdir must be the same for all BAMs in worklist'
58 | )
59 | batch_dir = list(batch_dirs)[0]
60 | project_dir = list(project_dirs)[0]
61 | return batch_dir, project_dir
62 |
63 |
64 | def output_copy_script(fout, new_dir, worklist):
65 | pr = partial(print, file=fout)
66 | for r in worklist:
67 | pr("msub-md5 '{}' '{}'".format(r.new_cram_name, r.cram_path)) #need to change based on how Jennifer sets up
68 |
69 |
70 | if __name__ == '__main__':
71 | main()
72 |
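73 | # Usage sketch (hypothetical workbook name, matching the *_batch???_mplx.tsv pattern):
74 | #   ./generate-topmed-md5-worklist-tsv TOPMed_THRV_batch001_mplx.tsv
75 | # Writes <batch>_md5 with one "msub-md5 '<new_cram_name>' '<cram_path>'" line per row.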
--------------------------------------------------------------------------------
/multiplex_scripts/generate-topmed-copy-script-tsv:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """Generate the copy-with-rename script that stages TOPMed files."""
4 |
5 | import argparse
6 | from functools import partial
7 | import logging
8 | import sys
9 |
10 | import wh_lib
11 |
12 |
13 | PROLOG = '''#!/usr/bin/env bash
14 |
15 | DST="$1"
16 |
17 | if [ -f topmed_cvcp_init.sh ]; then
18 | . topmed_cvcp_init.sh
19 | fi
20 | '''
21 |
22 | logger = logging.getLogger(__name__)
23 |
24 |
25 | def main():
26 | args = parse_args()
27 | config_logging(args)
28 | run(args)
29 | logging.shutdown()
30 |
31 |
32 | def parse_args():
33 | parser = argparse.ArgumentParser(description=__doc__)
34 | parser.add_argument('worklist_file_path')
35 | parser.add_argument('-v', '--verbose', action='store_true')
36 | args = parser.parse_args()
37 | return args
38 |
39 |
40 | def config_logging(args):
41 | global logger
42 | level = logging.DEBUG if args.verbose else logging.INFO
43 | logging.basicConfig(level=level)
44 | logger = logging.getLogger('gen-topmed-copy')
45 |
46 |
47 | def run(args):
48 | logger.info('args: %r', args)
49 | worklist = wh_lib.parse_tsv_file(args.worklist_file_path)
50 | batch_dir, project_dir = extract_batch_info(worklist)
51 | script_name = '{}.sh'.format(batch_dir)
52 | new_dir = '{0}/{1}'.format(project_dir, batch_dir)
53 | with open(script_name, 'w') as fout:
54 | output_copy_script(fout, new_dir, worklist)
55 |
56 |
57 | def extract_batch_info(worklist):
58 | """Return batch_dir & project_dir, which must be the same for all rows."""
59 | batch_dirs = set(r.batch for r in worklist)
60 | project_dirs = set(r.hgsc_xfer_subdir for r in worklist)
61 | assert len(batch_dirs) == 1, (
62 | 'batch must be the same for all CRAMs in worklist'
63 | )
64 | assert len(project_dirs) == 1, (
65 | 'hgsc_xfer_subdir must be the same for all BAMs in worklist'
66 | )
67 | batch_dir = list(batch_dirs)[0]
68 | project_dir = list(project_dirs)[0]
69 | return batch_dir, project_dir
70 |
71 |
72 | def output_copy_script(fout, new_dir, worklist):
73 | pr = partial(print, file=fout)
74 | pr(PROLOG)
75 | pr('mkdir -p "$DST/{}"'.format(new_dir))
76 | pr()
77 | for r in worklist:
78 | pr(
79 | 'cvcp $TOPMED_CVCP_OPTS "{}" "$DST/{}/{}"'.format(
80 | r.cram_path, new_dir, r.new_cram_name
81 | )
82 | )
83 |
84 |
85 | if __name__ == '__main__':
86 | main()
87 |
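88 | # Usage sketch (hypothetical workbook name):
89 | #   ./generate-topmed-copy-script-tsv TOPMed_THRV_batch001_mplx.tsv
90 | # Writes <batch>.sh; run it later as: ./<batch>.sh /aspera/share/globusupload/submissions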
--------------------------------------------------------------------------------
/scripts_from_the_past/generate-topmed-copy-script:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """Generate the copy-with-rename script that stages TOPMed files."""
4 |
5 | import argparse
6 | from functools import partial
7 | import logging
8 | import sys
9 |
10 | import wh_lib
11 |
12 |
13 | PROLOG = '''#!/usr/bin/env bash
14 |
15 | DST="$1"
16 |
17 | if [ -f topmed_cvcp_init.sh ]; then
18 | . topmed_cvcp_init.sh
19 | fi
20 | '''
21 |
22 | logger = logging.getLogger(__name__)
23 |
24 |
25 | def main():
26 | args = parse_args()
27 | config_logging(args)
28 | run(args)
29 | logging.shutdown()
30 |
31 |
32 | def parse_args():
33 | parser = argparse.ArgumentParser(description=__doc__)
34 | parser.add_argument('worklist_file_path')
35 | parser.add_argument('-v', '--verbose', action='store_true')
36 | args = parser.parse_args()
37 | return args
38 |
39 |
40 | def config_logging(args):
41 | global logger
42 | level = logging.DEBUG if args.verbose else logging.INFO
43 | logging.basicConfig(level=level)
44 | logger = logging.getLogger('gen-topmed-copy')
45 |
46 |
47 | def run(args):
48 | logger.info('args: %r', args)
49 | worklist = wh_lib.parse_excel_records(args.worklist_file_path)
50 | batch_dir, project_dir = extract_batch_info(worklist)
51 | script_name = '{}.sh'.format(batch_dir)
52 | new_dir = '{0}/{1}'.format(project_dir, batch_dir)
53 | with open(script_name, 'w') as fout:
54 | output_copy_script(fout, new_dir, worklist)
55 |
56 |
57 | def extract_batch_info(worklist):
58 | """Return batch_dir & project_dir, which must be the same for all rows."""
59 | batch_dirs = set(r.batch for r in worklist)
60 | project_dirs = set(r.hgsc_xfer_subdir for r in worklist)
61 | assert len(batch_dirs) == 1, (
62 | 'batch must be the same for all BAMs in worklist'
63 | )
64 | assert len(project_dirs) == 1, (
65 | 'hgsc_xfer_subdir must be the same for all BAMs in worklist'
66 | )
67 | batch_dir = list(batch_dirs)[0]
68 | project_dir = list(project_dirs)[0]
69 | return batch_dir, project_dir
70 |
71 |
72 | def output_copy_script(fout, new_dir, worklist):
73 | pr = partial(print, file=fout)
74 | pr(PROLOG)
75 | pr('mkdir -p "$DST/{}"'.format(new_dir))
76 | pr()
77 | for r in worklist:
78 | pr(
79 | 'cvcp $TOPMED_CVCP_OPTS "{}" "$DST/{}/{}"'.format(
80 | r.bam_path, new_dir, r.new_bam_name
81 | )
82 | )
83 |
84 |
85 | if __name__ == '__main__':
86 | main()
87 |
--------------------------------------------------------------------------------
/topmed.sh:
--------------------------------------------------------------------------------
1 | TOPMed Steps: Initializing:
2 |
3 | Under ~/.config (e.g. /users/$USER/.config) create shepherd.yaml:
4 | cat > ~/.config/shepherd.yaml <<EOF
5 | sub_root: /groups/submissions/metadata/v1/topmed/
6 | asp_root: /aspera/share/globusupload/submissions/
7 | EOF
8 |
9 | ------In the Copy Node (tmux):-------
10 | (in the home directory)
11 | 1) Check space on Aspera: df -h /aspera/share/globusupload
12 | 2) Run 'accept_batch' with the path Jennifer gives you
13 | 3) In the groups submissions directory (/groups/submissions/metadata/v1/topmed/topmed/YR3/harvard/01/{created directory}), copy the tsv from the project manager's location
14 | 4) Set the workbook and batch_name variables:
15 |
16 | workbook=$(ls -t *_batch???_mplx.tsv | head -n1)
17 | if [[ $workbook =~ ' ' ]]; then
18 | echo 'ERROR: There is a space in the name of the workbook!!!'
19 | echo 'These steps will FAIL!'
20 | fi
21 |
22 | batch_name=$(echo $workbook | sed -e s/_mplx.tsv$//)
23 | echo $batch_name
24 |
25 | 5) Run the scripts used to generate: (i) copy script, (ii) md5_worklist
26 | /stornext/snfs1/submissions/topmed/topmed-code/topmed_multiplex_code/generate-topmed-copy-script-tsv $workbook
27 | /stornext/snfs1/submissions/topmed/topmed-code/topmed_multiplex_code/generate-topmed-md5-worklist-tsv $workbook
28 |
29 |
30 | 6) Run the following validation prep steps:
31 | mkdir -p validation/input
32 | cat ${batch_name}_md5 | sed 's/^msub-md5/msub-val/' | tee validation/${batch_name}_val | tail
33 |
34 | pushd validation/input/
35 |
36 | # Creates symlinks to all the crams in the input directory: the first pass (ECHO set) only echoes the ln commands; the second pass actually runs them
37 | ECHO=echo
38 | msub-val() { $ECHO ln -s "$2" "$1"; }
39 | . ../${batch_name}_val
40 | unset ECHO
41 | . ../${batch_name}_val
42 |
43 | 11) Copy the crams
44 | ### run the copy script (be sure to include the first part of the destination path)
45 | ./{copy_script}.sh /aspera/share/globusupload/submissions
46 |
47 | 13) Convert the md5s into a manifest and copy it to aspera using the md5 prep script ### you may need to copy the script into your working directory
48 | cp /stornext/snfs1/submissions/topmed/topmed-code/topmed_md5_prep_shep.sh .
49 | cd md5/
50 | ../topmed_md5_prep_shep.sh $batch_name {no. of samples} {Cohort_name}
51 |
52 |
53 | ------In the Login Node (tmux):---------
54 | 7) cd into working directory (/groups/submissions/metadata/v1/topmed/topmed/YR3/harvard/01/{created directory})
55 | 8) 'md5' directory should already exist
56 | ### redefine batch name as necessary
57 |
58 | 9) Submit the md5 jobs to the cluster:
59 | echo submit-md5-jobs "$batch_name"_md5 md5
60 | submit-md5-jobs "$batch_name"_md5 md5/ proj-dm0021
61 |
62 | 10) Run the validation
63 | cd .. ### into the validation directory; make sure you have a copy of the submit-cram-validation_phase5 script ### run_a is the output directory name
64 | ls input/NWD* | head -n5 | xargs -n1 echo ../submit-cram-validation_phase5 run_a
65 | ls input/NWD* | xargs -n1 submit-cram-validation_phase5 run_a
66 |
67 | 12) Check the validation, once complete
68 |
--------------------------------------------------------------------------------
/original_steps/steps.sh:
--------------------------------------------------------------------------------
1 | # Connect to:
2 | # cifs://hgsc-naf01-b.hgsc.bcm.edu/tcga/other-submissions/topmed-shared
3 |
4 | # Make a new directory
5 |
6 | # Drag attachment (eg. TOPMed_THRV_batch03_2016-04-04.xlsx) into new created directory.
7 |
8 |
9 |
10 |
11 |
12 | cd /stornext/snfs1/submissions/topmed
13 | pushd topmed-shared
14 |
15 | cd batches/globus/"filename"
16 |
17 |
18 | workbook=$(ls -t TOPMed_*_batch??_????-??-??.xlsx | head -n1)
19 | if [[ $workbook =~ ' ' ]]; then
20 | echo 'ERROR: There is a space in the name of the workbook!!!'
21 | echo 'These steps will FAIL!'
22 | fi
23 |
24 | batch_name=$(echo $workbook | sed -e s/^TOPMed_// -e s/.xlsx$//)
25 | echo $batch_name
26 | #mkdir batches/$batch_name
27 | # made the directory earlier
28 | #pasted it into the directory earlier
29 | #mv $workbook batches/$batch_name/
30 | #chmod -wx batches/$batch_name/$workbook
31 |
32 | pushd batches/globus/$batch_name/
33 | generate-topmed-copy-script $workbook
34 | generate-topmed-md5-worklist $workbook
35 | #chmod -wx $batch_name*
36 |
37 | popd
38 | popd
39 |
40 | # Upload copy/rename script to ticket.
41 |
42 | (
43 | echo 'Update the RT ticket with the following:'
44 | echo
45 | echo "Attached is the copy script. You can also find it here:"
46 | echo "/data/tcga/other-submissions/topmed-shared/batches/globus/$batch_name/$batch_name.sh"
47 | echo "Moving on to MD5 and validation..."
48 | )
49 |
50 | # Execute md5-steps.
51 | # need to do this once jobs are complete
52 | #jobs completed
53 |
54 | pushd md5-batches/$batch_name
55 | #copy to md5
56 | cp /stornext/snfs1/submissions/topmed/topmed-code/topmed_md5_prep.sh .
57 | # then run, but specify the batch
58 | ./topmed_md5_prep.sh $batch_name 100 THRV
59 |
60 |
61 |
62 | : <<'OLD_WAY' # OLD WAY OF DOING IT (the quoted heredoc makes this block a no-op)
63 | (
64 | pushd md5-batches/$batch_name
65 | find . -name '*.out' -size 0 | xargs ls
66 | find . -name '*.out' -size 0 | xargs rm
67 | find . -name '*.out'
68 | cat *md5 | tee Manifest.txt | wc -l
69 | cat *md5 | tee Manifest.txt | wc -l - ../../topmed-shared/batches/globus/$batch_name/${batch_name}_md5
70 | # CHECK: the first two numbers generated should match.
71 | chmod -w Manifest.txt
72 |
73 | cp -p Manifest.txt ../../topmed-shared/batches/globus/$batch_name/
74 | )
75 | OLD_WAY
76 |
77 |
78 | scp Manifest.txt christis@hgsc-aspera1.hgsc.bcm.edu:/share/share/globusupload/submissions/AFIB/$batch_name
79 |
80 |
81 | # Upload Manifest file to ticket.
82 |
83 | (
84 | echo 'Update the RT ticket with the following:'
85 | echo
86 | echo "Attached is the Manifest.txt file. You can also find it here:"
87 | echo "/data/tcga/other-submissions/topmed-shared/batches/globus/$batch_name/Manifest.txt"
88 | echo "Moving on to validation..."
89 | )
90 |
91 | # Execute validation-steps.
92 |
93 | pushd validation-batches/$batch_name/
94 | (
95 | echo $(ls run*/*.job | wc -l) job
96 | echo $(ls run*/*.err | wc -l) err
97 | echo $(ls run*/*.out | wc -l) out
98 | echo $(ls run*/*.time | wc -l) time
99 | cat run*/*.out | uniq -c
100 | )
101 | # Should all be the same number.
102 | popd
103 |
--------------------------------------------------------------------------------
/topmed_automation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PM_CODE=$1
4 | PM_PATH=$2
5 |
6 | SUB_ROOT=/groups/submissions/metadata/v1/topmed
7 | SUB_PATH=${PM_PATH:56}  # drop the first 56 chars (the PM-side prefix ending in ".../metadata/v1/"); the remainder is relative to SUB_ROOT
8 |
9 | ASP_ROOT=/aspera/share/globusupload/submissions
10 |
11 | #########
12 | # Usage #
13 | #########
14 | usage() {
15 | err "usage: $(basename "$0") PM_CODE PM_PATH"
16 | err " Run TOPmed automation for validation and copy steps."
17 | err " PM_CODE: code provided by the project manager"
18 | err " PM_PATH: path provided by the project manager"
19 | err
20 | exit 1
21 | } >&2
22 |
23 | err() { echo "$@" >&2; }
24 |
25 | if [[ $# -ne 2 ]]; then
26 | usage
27 | fi
28 |
29 | ###############
30 | # Function(s) #
31 | ###############
32 | tmux_session_exist () {  # call immediately after "tmux new"; inspects its exit status
33 | if [ $? != 0 ]; then
34 | echo "ERROR: tmux session already exists."
35 | echo "Make sure to exit previous sessions before continuing"
36 | exit 1
37 | fi
38 | }
39 |
40 |
41 | #########
42 | # Setup #
43 | #########
44 |
45 | # TODO: Aspera space issue.
46 | # Remind user?
47 |
48 | # Check if shepherd config file exist
49 |
50 | CONFIG_FILE=/users/$USER/.config/shepherd.yaml
51 |
52 | if [ ! -f $CONFIG_FILE ]; then
53 | cat << EOF > $CONFIG_FILE
54 | sub_root: $SUB_ROOT
55 | asp_root: $ASP_ROOT
56 | EOF
57 | fi
58 |
59 | # Check if tmux sessions exist
60 | # tmux new -s topmed-copy -d
61 | # tmux_session_exist
62 | tmux new -s topmed-login -d
63 | tmux_session_exist
64 | echo "Created topmed-login session"
65 |
66 |
67 | #########################################
68 | # Run accept_batch script from Shepherd #
69 | #########################################
70 |
71 | ACCEPT_BATCH_PATH=/hgsc_software/submissions/bin/accept_batch
72 |
73 | $ACCEPT_BATCH_PATH $PM_PATH
74 | if [ $? != 0 ]; then
75 | echo "WARNING: accept_batch encountered errors."
76 | echo "ERROR: Stopping the pipeline."
77 | exit 1
78 | fi
79 |
80 | # Change to working directory
81 | cd $SUB_ROOT/$SUB_PATH
82 |
83 | # Assign workbook
84 | workbook=$(ls -t *_batch???_mplx.tsv | head -n1)
85 |
86 | # Assign batch name
87 | batch_name=$(echo $workbook | sed -e s/_mplx.tsv$//)
88 |
89 | # Run scripts to generate the copy script and md5_worklist
90 | stornext_path=/stornext/snfs1/submissions/topmed/topmed-code/topmed_multiplex_code
91 | $stornext_path/generate-topmed-copy-script-tsv $workbook
92 | echo "Generated copy script"
93 | $stornext_path/generate-topmed-md5-worklist-tsv $workbook
94 | echo "Generated md5_worklist script"
95 |
96 | # Run validation prep steps
97 | # Create input directory
98 | mkdir -p validation/input
99 | echo "Created input directory under validation directory"
100 | sed 's/^msub-md5/msub-val/' ${batch_name}_md5 > validation/${batch_name}_val
101 | echo "Created ${batch_name}_val under validation/"
102 |
103 | # Create symlinks to all the crams in the input directory (first pass echoes the ln commands; second pass runs them)
104 | cd validation/input
105 | ECHO=echo
106 | msub-val() { $ECHO ln -s "$2" "$1"; }
107 | . ../${batch_name}_val
108 | unset ECHO
109 | . ../${batch_name}_val
110 | echo "Successfully created symlinks"
111 |
112 | ##################################
113 | # Submit md5 jobs to the cluster #
114 | ##################################
115 |
116 | submit_md5_jobs=/hgsc_software/submissions/noarch/apps/topmed-code/submit-md5-jobs
117 | md5_file_path="$batch_name"_md5
118 | tmux send-keys -t topmed-login "ssh sug-login4" C-m
119 | tmux send-keys -t topmed-login "cd $SUB_ROOT/$SUB_PATH" C-m
120 | # This assumes the user does not have to enter a password;
121 | # see docs/topmed_automation.md for the passwordless-entry setup.
122 | tmux send-keys -t topmed-login "$submit_md5_jobs $md5_file_path md5/ $PM_CODE" C-m
123 |
124 | # Run the validation
125 | submit_cram_validation_phase5=/hgsc_software/groups/submissions/metadata/v1/topmed/topmed/YR3/scripts_mr/submit-cram-validation_phase5
126 | tmux send-keys -t topmed-login "cd validation/" C-m
127 | tmux send-keys -t topmed-login "ls input/NWD* | xargs -n1 $submit_cram_validation_phase5 run_a" C-m
128 |
129 | echo "Validation is running, check validation in tmux session topmed-login once completed!"
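130 |
131 | # Usage sketch (example values from docs/topmed_automation.md):
132 | #   ./topmed_automation.sh proj-dm0021 \
133 | #       /hgsc_software/groups/project-managers/tech/metadata/v1/topmed/YR3/cardiomyopathy/01/03a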
--------------------------------------------------------------------------------
/multiplex_scripts/wh_lib.py.bk01:
--------------------------------------------------------------------------------
1 | import collections.abc  # also binds "collections"; the abc classes are used below
2 | import csv
3 | import glob
4 | import itertools
5 | import locale
6 | import operator
7 | import os
8 | import pprint
9 | import re
10 | import sys
11 |
12 | try:
13 | import openpyxl
14 | except ImportError:
15 | pass # optional dependency, only used when parsing .xlsx files
16 | try:
17 | import xlrd
18 | except ImportError:
19 | pass # optional dependency, only used when parsing .xls files
20 | import yaml
21 |
22 |
23 | locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
24 |
25 |
26 | def select(collection, *items):
27 | return operator.itemgetter(*items)(collection)
28 |
29 |
30 | def print_table(iterable):
31 | """Print out the finite input as a table. Each item in iterable must be an
32 | iterable with roughly the same number of items."""
33 | # Slurp the entire iterable.
34 | rows = list(iterable)
35 | # Compute column widths.
36 | col_widths = []
37 | for row in rows:
38 | for col_num, col_val in enumerate(row):
39 | col_len = len(str(col_val))
40 | if col_num < len(col_widths):
41 | col_widths[col_num] = max(col_widths[col_num], col_len)
42 | else:
43 | col_widths.append(col_len)
44 | # Format output.
45 | for row in rows:
46 | # Output all but last column in padded format.
47 | for col_val, col_width in list(zip(row, col_widths))[:-1]:
48 | col_str = str(col_val)
49 | if isinstance(col_val, int) and not isinstance(col_val, bool):
50 | sys.stdout.write(col_str.rjust(col_width))
51 | else:
52 | sys.stdout.write(col_str.ljust(col_width))
53 | sys.stdout.write(' | ')
54 | # Output the last column as-is.
55 | sys.stdout.write(str(row[-1]))
56 | # Add the newline.
57 | sys.stdout.write('\n')
58 |
59 |
60 | def yp(data, stream=sys.stdout):
61 | """Pretty print as YAML."""
62 | yd(data, stream)
63 |
64 |
65 | def yf(data):
66 | """Format as pretty YAML"""
67 | return yd(data)
68 |
69 |
70 | def yd(data, stream=None):
71 | return yaml.safe_dump(devolve(data), stream, default_flow_style=False)
72 | # TODO: devolve is not self-referential safe
73 |
74 |
75 | def fetch_tsv_header(file_path):
76 | with open(file_path) as fin:
77 | reader = csv.reader(fin, delimiter='\t')
78 | return next(reader)
79 |
80 |
81 | def parse_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
82 | """cls_or_fcn must construct an object from a dict."""
83 | return list(iterate_tsv_file(file_path, cls_or_fcn, fieldnames))
84 |
85 |
86 | def iterate_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
87 | """cls_or_fcn must construct an object from a dict."""
88 | with open(file_path) as fin:
89 | for r in iterate_tsv_stream(fin, cls_or_fcn, fieldnames):
90 | yield r
91 |
92 |
93 | def iterate_tsv_stream(stream, cls_or_fcn=None, fieldnames=None):
94 | """cls_or_fcn must construct an object from a dict."""
95 | cls_or_fcn = cls_or_fcn or Record
96 | reader = csv.DictReader(stream, fieldnames=fieldnames, delimiter='\t')
97 | for it in reader:
98 | yield cls_or_fcn(it)
99 |
100 |
101 | def default_filter(record):
102 | return any(record.values())
103 |
104 |
105 | def parse_excel_records(file_path,
106 | cls_or_fcn=None,
107 | sheet_name=None,
108 | filter=default_filter):
109 | extension = os.path.splitext(file_path)[1]
110 | if extension == '.xlsx':
111 | parse_result = parse_xlsx_records(file_path, cls_or_fcn, sheet_name)
112 | elif extension == '.xls':
113 | parse_result = parse_xls_records(file_path, cls_or_fcn, sheet_name)
114 | else:
115 | raise NotImplementedError(file_path)
116 | return [record for record in parse_result if filter(record)]
117 |
118 |
119 | def parse_xlsx_records(file_path, cls_or_fcn, sheet_name):
120 | fcn = get_pair_constructor(cls_or_fcn)
121 | wb = openpyxl.load_workbook(file_path, data_only=True)
122 | if sheet_name:
123 | ws = wb[sheet_name]
124 | else:
125 | ws = wb.worksheets[0]
126 | rows = list(ws.rows)  # materialize; ws.rows may be a generator
127 | header = tuple(normalize_field_name(c.value) for c in rows[0])
128 | return [fcn(header, (c.value for c in row)) for row in rows[1:]]
129 |
130 |
131 | def parse_xls_records(file_path, cls_or_fcn, sheet_name):
132 | # TODO: use sheet_name
133 | fcn = get_pair_constructor(cls_or_fcn)
134 | wb = xlrd.open_workbook(file_path)
135 | ws = wb.sheet_by_index(0)
136 | header = tuple(normalize_field_name(c) for c in ws.row_values(0))
137 | return [fcn(header, ws.row_values(i)) for i in range(1, ws.nrows)]
138 |
139 |
140 | def get_pair_constructor(cls_or_fcn):
141 | """Return a callable that constructs an object from a header, data pair."""
142 | if not cls_or_fcn:
143 | return Record.from_pair
144 | elif isinstance(cls_or_fcn, type):
145 | return cls_or_fcn.from_pair
146 | else:
147 | return cls_or_fcn
148 |
149 |
150 | class Record(collections.abc.MutableMapping):
151 | def __init__(self, mapping=None, **kwds):
152 | if mapping:
153 | if isinstance(mapping, collections.abc.Mapping):
154 | gen = mapping.items()
155 | else:
156 | gen = mapping
157 | for k, v in gen:
158 | self.__dict__[normalize_field_name(k)] = normalize_value(v)
159 | for k, v in kwds.items():
160 | self.__dict__[normalize_field_name(k)] = normalize_value(v)
161 |
162 | @classmethod
163 | def from_pair(cls, header, data):
164 | """Alternate constructor"""
165 | return cls(zip(header, data))
166 |
167 | def __repr__(self):
168 | return '%s(%r)' % (self.__class__.__name__, self.__dict__)
169 | # TODO: Should I use self.mapping here?
170 |
171 | def __getitem__(self, key):
172 | return self.mapping[key]
173 |
174 | def __setitem__(self, key, value):
175 | self.mapping[key] = value
176 |
177 | def __delitem__(self, key):
178 | del self.mapping[key]
179 |
180 | # TODO: Is this inherited from collections.MutableMapping?
181 | def __iter__(self):
182 | return iter(self.mapping)
183 |
184 | # TODO: Is this inherited from collections.MutableMapping?
185 | def __len__(self):
186 | return len(self.mapping)
187 |
188 | @property
189 | def mapping(self):
190 | return self.__dict__
191 |
192 | @property
193 | def attributes(self):
194 | return self.mapping.keys()
195 |
196 | def pp(self):
197 | pprint.pprint(self.to_dict())
198 |
199 | def to_dict(self):
200 | return devolve(self)
201 |
202 |
203 | def normalize_field_name(field_name):
204 | """lowercase with underscores, etc"""
205 | result = field_name
206 | if result.endswith('?'):
207 | result = result[:-1]
208 | if not result.startswith('is_'):
209 | result = 'is_' + result
210 | result = result.strip().lower().replace(' ', '_').replace(
211 | '-', '_').replace('/', '_').replace('?', '_').replace('%', 'pct')
212 | return result
213 |
214 |
215 | def normalize_value(value):
216 | """Convert empty string to None"""
217 | if value == '':
218 | value = None
219 | return value
220 |
221 |
222 | def devolve(data):
223 | """Recursively convert to just JSON-compatible types."""
224 | # TODO: possible infinite recursion
225 | is_string = isinstance(data, str)
226 | is_iterable = isinstance(data, collections.abc.Iterable)
227 | is_mapping = isinstance(data, collections.abc.Mapping)
228 | is_record = isinstance(data, Record)
229 | if is_record:
230 | result = devolve(data.__dict__)
231 | elif is_mapping:
232 | result = {k: devolve(v) for k, v in data.items()}
233 | elif is_iterable and not is_string:
234 | result = [devolve(it) for it in data]
235 | elif hasattr(data, '__dict__'):
236 | result = data.__dict__
237 | else:
238 | result = data
239 | return result
240 |
241 |
242 | def multiplicities(iterable):
243 | """Count the number of singletons, the number of duplicates, etc.
244 | Returns a collections.Counter instance."""
245 | return collections.Counter(collections.Counter(iterable).values())
246 |
247 |
248 | def val_or_none_key(getter_fcn):
249 | """Wraps getter_fcn, returning a key that is a tuple of (0 or 1, val) where
250 | val=getter_fcn(obj), and the int is 0 if val is None."""
251 | def result_key_fcn(obj):
252 | val = getter_fcn(obj)
253 | n = 0 if val is None else 1
254 | return n, val
255 | return result_key_fcn
256 |
257 |
258 | def count(iterable, n=None,
259 | primary_reverse=True,
260 | secondary_reverse=False,
261 | primary_key=operator.itemgetter(1),
262 | secondary_key=val_or_none_key(operator.itemgetter(0))
263 | ):
264 | """Wraps collections.Counter. Counts, sorts the result, and takes the
265 | first n. The primary sorting criteria is the count; the secondary sorting
266 | criteria is the value. The default sort is descending by count and
267 | ascending by value."""
268 | result = sorted(collections.Counter(iterable).items(),
269 | key=secondary_key, reverse=secondary_reverse)
270 | result.sort(key=primary_key, reverse=primary_reverse)
271 | return result[:n]
272 |
273 |
274 | class MinMax(object):
275 | def __init__(self, min_start=None, max_start=None, count_start=0):
276 | self.count = count_start
277 | self.min = min_start
278 | self.max = max_start
279 |
280 | def add(self, value):
281 | self.count += 1
282 | if self.min is None or self.min > value:
283 | self.min = value
284 | if self.max is None or self.max < value:
285 | self.max = value
286 |
287 | def __repr__(self):
288 | return '%s(%r, %r)' % (self.__class__.__name__, self.min, self.max)
289 |
290 |
291 | def slice_by_value(sequence, start=None, end=None, step=1):
292 | """Returns the earliest slice of the sequence bounded by the
293 | start and end values. Omitted optional parameters work as expected
294 | for slicing.
295 |
296 | slice_by_value('hello there world', 'o', 'w', 2) -> 'otee'
297 | """
298 | i_start = i_end = None
299 | if start is not None:
300 | i_start = sequence.index(start)
301 | if end is not None:
302 | i_end = sequence.index(end)
303 | return sequence[i_start:i_end:step]
304 |
305 |
306 | def update_subset(record, fields, *source_records, **kwds):
307 | """Given a destination record, a sequence of fields, and source
308 | for each field, copy over the first value found in the source records.
309 | The argument for fields must be an iterable where each item is either a
310 | string or a pair of strings. If it is a pair of strings, they name
311 | the destination and source field names. If keyword argument "required"
312 | is True and any of the fields are missing from the source records,
313 | then a KeyError is raised."""
314 | required = kwds.pop('required', True)
315 | assert not kwds, 'Only "required" keyword supported'
316 | for field in fields:
317 | if isinstance(field, str):
318 | dst_name = src_name = field
319 | else:
320 | dst_name, src_name = field
321 | assert isinstance(dst_name, str)
322 | assert isinstance(src_name, str)
323 | value = fetch(src_name, *source_records, required=required)
324 | # TODO: assert value?
325 | if value is not None:
326 | setattr(record, dst_name, value)
327 |
328 |
329 | def fetch(field, *source_records, **kwds):
330 | """Return the value from the first record in the arguments that
331 | contains the specified field. If no record in the chain contains
332 | that field, return the default value. The default value is specified
333 | by the "default" keyword argument or None. If keyword argument
334 | "required" is True and any of the fields are missing from the source
335 | records, then a KeyError is raised."""
336 | default = kwds.pop('default', None)
337 | required = kwds.pop('required', False)
338 | assert not kwds, 'Only "default" and "required" keyword supported'
339 | for record in source_records:
340 | if hasattr(record, field):
341 | return getattr(record, field)
342 | # Must use default.
343 | if required:
344 | raise KeyError(field)
345 | return default
346 |
347 |
348 | def replace_fields(field_list, *pairs):
349 | """Given a list of field names and one or more pairs,
350 | replace each item named in a pair by the pair.
351 |
352 | fl = 'one two three'.split()
353 | replace_fields(fl, ('two', 'spam'))
354 | # ['one', ('two', 'spam'), 'three']
355 | """
356 | result = list(field_list)
357 | for field_name, source in pairs:
358 | index = field_list.index(field_name)
359 | result[index] = field_name, source
360 | return result
361 |
362 |
363 | def rekey_map(mapping, replacements):
364 | """Given an iterable of destination/source pairs in replacements,
365 | create a new dict that is the same as the original except for the
366 | new key names."""
367 | result = dict(mapping)
368 | for dst, src in replacements:
369 | value = result[src]
370 | result[dst] = value
371 | del result[src]
372 | return result
373 |
374 |
375 | class TsvDialect(csv.Dialect):
376 | """Standard Unix-style TSV format.
377 | Also compatible with MAGE-TAB spec v1.0.
378 | See MAGE-TABv1.0.pdf section 3.1.6
379 | http://www.mged.org/mage-tab/MAGE-TABv1.0.pdf
380 | http://www.mged.org/mage-tab/"""
381 | delimiter = '\t'
382 | doublequote = False
383 | escapechar = '\\'
384 | lineterminator = '\n'
385 | quotechar = '"'
386 | quoting = csv.QUOTE_MINIMAL
387 | skipinitialspace = False
388 |
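389 | # Usage sketch (hypothetical TSV with "batch" and "cram_path" columns):
390 | #   records = parse_tsv_file('TOPMed_THRV_batch001_mplx.tsv')
391 | #   records[0].batch          # Record supports attribute access...
392 | #   records[0]['cram_path']   # ...and mapping-style access.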
--------------------------------------------------------------------------------
/scripts_from_the_past/wh_lib.py:
--------------------------------------------------------------------------------
1 | import collections.abc  # also binds "collections"; the abc classes are used below
2 | import csv
3 | import glob
4 | import itertools
5 | import locale
6 | import operator
7 | import os
8 | import pprint
9 | import re
10 | import sys
11 |
12 | try:
13 | import openpyxl
14 | except ImportError:
15 | pass # optional dependency, only used when parsing .xlsx files
16 | try:
17 | import xlrd
18 | except ImportError:
19 | pass # optional dependency, only used when parsing .xls files
20 | import yaml
21 |
22 |
23 | locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
24 |
25 |
26 | def select(collection, *items):
27 | return operator.itemgetter(*items)(collection)
28 |
29 |
30 | def print_table(iterable):
31 | """Print out the finite input as a table. Each item in iterable must be an
32 | iterable with roughly the same number of items."""
33 | # Slurp the entire iterable.
34 | rows = list(iterable)
35 | # Compute column widths.
36 | col_widths = []
37 | for row in rows:
38 | for col_num, col_val in enumerate(row):
39 | col_len = len(str(col_val))
40 | if col_num < len(col_widths):
41 | col_widths[col_num] = max(col_widths[col_num], col_len)
42 | else:
43 | col_widths.append(col_len)
44 | # Format output.
45 | for row in rows:
46 | # Output all but last column in padded format.
47 | for col_val, col_width in list(zip(row, col_widths))[:-1]:
48 | col_str = str(col_val)
49 | if isinstance(col_val, int) and not isinstance(col_val, bool):
50 | sys.stdout.write(col_str.rjust(col_width))
51 | else:
52 | sys.stdout.write(col_str.ljust(col_width))
53 | sys.stdout.write(' | ')
54 | # Output the last column as-is.
55 | sys.stdout.write(str(row[-1]))
56 | # Add the newline.
57 | sys.stdout.write('\n')
58 |
59 |
60 | def yp(data, stream=sys.stdout):
61 | """Pretty print as YAML."""
62 | yd(data, stream)
63 |
64 |
65 | def yf(data):
66 | """Format as pretty YAML"""
67 | return yd(data)
68 |
69 |
70 | def yd(data, stream=None):
71 | return yaml.safe_dump(devolve(data), stream, default_flow_style=False)
72 | # TODO: devolve is not self-referential safe
73 |
74 |
75 | def fetch_tsv_header(file_path):
76 | with open(file_path) as fin:
77 | reader = csv.reader(fin, delimiter='\t')
78 | return next(reader)
79 |
80 |
81 | def parse_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
82 | """cls_or_fcn must construct an object from a dict."""
83 | return list(iterate_tsv_file(file_path, cls_or_fcn, fieldnames))
84 |
85 |
86 | def iterate_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
87 | """cls_or_fcn must construct an object from a dict."""
88 | with open(file_path) as fin:
89 | for r in iterate_tsv_stream(fin, cls_or_fcn, fieldnames):
90 | yield r
91 |
92 |
93 | def iterate_tsv_stream(stream, cls_or_fcn=None, fieldnames=None):
94 | """cls_or_fcn must construct an object from a dict."""
95 | cls_or_fcn = cls_or_fcn or Record
96 | reader = csv.DictReader(stream, fieldnames=fieldnames, delimiter='\t')
97 | for it in reader:
98 | yield cls_or_fcn(it)
99 |
100 |
101 | def default_filter(record):
102 | return any(record.values())
103 |
104 |
105 | def parse_excel_records(file_path,
106 | cls_or_fcn=None,
107 | sheet_name=None,
108 | filter=default_filter):
109 | extension = os.path.splitext(file_path)[1]
110 | if extension == '.xlsx':
111 | parse_result = parse_xlsx_records(file_path, cls_or_fcn, sheet_name)
112 | elif extension == '.xls':
113 | parse_result = parse_xls_records(file_path, cls_or_fcn, sheet_name)
114 | else:
115 | raise NotImplementedError(file_path)
116 | return [record for record in parse_result if filter(record)]
117 |
118 |
119 | def parse_xlsx_records(file_path, cls_or_fcn, sheet_name):
120 | fcn = get_pair_constructor(cls_or_fcn)
121 | wb = openpyxl.load_workbook(file_path, data_only=True)
122 | if sheet_name:
123 | ws = wb[sheet_name]
124 | else:
125 | ws = wb.worksheets[0]
126 | rows = list(ws.rows)  # materialize; ws.rows may be a generator
127 | header = tuple(normalize_field_name(c.value) for c in rows[0])
128 | return [fcn(header, (c.value for c in row)) for row in rows[1:]]
129 |
130 |
131 | def parse_xls_records(file_path, cls_or_fcn, sheet_name):
132 | # TODO: use sheet_name
133 | fcn = get_pair_constructor(cls_or_fcn)
134 | wb = xlrd.open_workbook(file_path)
135 | ws = wb.sheet_by_index(0)
136 | header = tuple(normalize_field_name(c) for c in ws.row_values(0))
137 | return [fcn(header, ws.row_values(i)) for i in range(1, ws.nrows)]
138 |
139 |
140 | def get_pair_constructor(cls_or_fcn):
141 | """Return a callable that constructs an object from a header, data pair."""
142 | if not cls_or_fcn:
143 | return Record.from_pair
144 | elif isinstance(cls_or_fcn, type):
145 | return cls_or_fcn.from_pair
146 | else:
147 | return cls_or_fcn
148 |
149 |
150 | class Record(collections.abc.MutableMapping):
151 | def __init__(self, mapping=None, **kwds):
152 | if mapping:
153 | if isinstance(mapping, collections.abc.Mapping):
154 | gen = mapping.items()
155 | else:
156 | gen = mapping
157 | for k, v in gen:
158 | self.__dict__[normalize_field_name(k)] = normalize_value(v)
159 | for k, v in kwds.items():
160 | self.__dict__[normalize_field_name(k)] = normalize_value(v)
161 |
162 | @classmethod
163 | def from_pair(cls, header, data):
164 | """Alternate constructor"""
165 | return cls(zip(header, data))
166 |
167 | def __repr__(self):
168 | return '%s(%r)' % (self.__class__.__name__, self.__dict__)
169 | # TODO: Should I use self.mapping here?
170 |
171 | def __getitem__(self, key):
172 | return self.mapping[key]
173 |
174 | def __setitem__(self, key, value):
175 | self.mapping[key] = value
176 |
177 | def __delitem__(self, key):
178 | del self.mapping[key]
179 |
180 |     # __iter__ is abstract in collections.abc.MutableMapping, so it must be defined here.
181 | def __iter__(self):
182 | return iter(self.mapping)
183 |
184 |     # __len__ is abstract in collections.abc.MutableMapping, so it must be defined here.
185 | def __len__(self):
186 | return len(self.mapping)
187 |
188 | @property
189 | def mapping(self):
190 | return self.__dict__
191 |
192 | @property
193 | def attributes(self):
194 | return self.mapping.keys()
195 |
196 | def pp(self):
197 | pprint.pprint(self.to_dict())
198 |
199 | def to_dict(self):
200 | return devolve(self)
201 |
202 |
203 | def normalize_field_name(field_name):
204 |     """Normalize to lowercase_with_underscores; a trailing '?' becomes an is_ prefix."""
205 | result = field_name
206 | if result.endswith('?'):
207 | result = result[:-1]
208 | if not result.startswith('is_'):
209 | result = 'is_' + result
210 | result = result.strip().lower().replace(' ', '_').replace(
211 | '-', '_').replace('/', '_').replace('?', '_').replace('%', 'pct')
212 | return result
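213 |     # For example: 'Sample ID' -> 'sample_id', '% Aligned' -> 'pct_aligned',
214 |     # 'Valid?' -> 'is_valid'.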
213 |
214 |
215 | def normalize_value(value):
216 | """Convert empty string to None"""
217 | if value == '':
218 | value = None
219 | return value
220 |
221 |
222 | def devolve(data):
223 | """Recursively convert to just JSON-compatible types."""
224 | # TODO: possible infinite recursion
225 | is_string = isinstance(data, str)
226 |     is_iterable = isinstance(data, collections.abc.Iterable)
227 |     is_mapping = isinstance(data, collections.abc.Mapping)
228 | is_record = isinstance(data, Record)
229 | if is_record:
230 | result = devolve(data.__dict__)
231 | elif is_mapping:
232 | result = {k: devolve(v) for k, v in data.items()}
233 | elif is_iterable and not is_string:
234 | result = [devolve(it) for it in data]
235 | elif hasattr(data, '__dict__'):
236 | result = data.__dict__
237 | else:
238 | result = data
239 | return result
240 |
241 |
242 | def multiplicities(iterable):
243 | """Count the number of singletons, the number of duplicates, etc.
244 | Returns a collections.Counter instance."""
245 | return collections.Counter(collections.Counter(iterable).values())
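246 |     # For example, multiplicities('abracadabra') reports one letter seen
247 |     # five times, two seen twice, and two seen once.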
246 |
247 |
248 | def val_or_none_key(getter_fcn):
249 |     """Wraps getter_fcn, returning a key function that maps obj to the tuple
250 |     (0 or 1, val), where val = getter_fcn(obj) and the int is 0 if val is None."""
251 | def result_key_fcn(obj):
252 | val = getter_fcn(obj)
253 | n = 0 if val is None else 1
254 | return n, val
255 | return result_key_fcn
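256 |     # For example, sorted([2, None, 1], key=val_or_none_key(lambda x: x))
257 |     # yields [None, 1, 2] without a None-vs-int comparison error.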
256 |
257 |
258 | def count(iterable, n=None,
259 | primary_reverse=True,
260 | secondary_reverse=False,
261 | primary_key=operator.itemgetter(1),
262 | secondary_key=val_or_none_key(operator.itemgetter(0))
263 | ):
264 | """Wraps collections.Counter. Counts, sorts the result, and takes the
265 |     first n. The primary sorting criterion is the count; the secondary sorting
266 |     criterion is the value. The default sort is descending by count and
267 | ascending by value."""
268 | result = sorted(collections.Counter(iterable).items(),
269 | key=secondary_key, reverse=secondary_reverse)
270 | result.sort(key=primary_key, reverse=primary_reverse)
271 | return result[:n]
272 |
273 |
274 | class MinMax(object):
275 | def __init__(self, min_start=None, max_start=None, count_start=0):
276 | self.count = count_start
277 | self.min = min_start
278 | self.max = max_start
279 |
280 | def add(self, value):
281 | self.count += 1
282 | if self.min is None or self.min > value:
283 | self.min = value
284 | if self.max is None or self.max < value:
285 | self.max = value
286 |
287 | def __repr__(self):
288 | return '%s(%r, %r)' % (self.__class__.__name__, self.min, self.max)
289 |
290 |
291 | def slice_by_value(sequence, start=None, end=None, step=1):
292 | """Returns the earliest slice of the sequence bounded by the
293 | start and end values. Omitted optional parameters work as expected
294 | for slicing.
295 |
296 | slice_by_value('hello there world', 'o', 'w', 2) -> 'otee'
297 | """
298 | i_start = i_end = None
299 | if start is not None:
300 | i_start = sequence.index(start)
301 | if end is not None:
302 | i_end = sequence.index(end)
303 | return sequence[i_start:i_end:step]
304 |
305 |
306 | def update_subset(record, fields, *source_records, **kwds):
307 |     """Given a destination record, a sequence of fields, and one or more source
308 |     records, copy the first value found in the sources for each field.
309 | The argument for fields must be an iterable where each item is either a
310 | string or a pair of strings. If it is a pair of strings, they name
311 | the destination and source field names. If keyword argument "required"
312 | is True and any of the fields are missing from the source records,
313 | then a KeyError is raised."""
314 | required = kwds.pop('required', True)
315 | assert not kwds, 'Only "required" keyword supported'
316 | for field in fields:
317 | if isinstance(field, str):
318 | dst_name = src_name = field
319 | else:
320 | dst_name, src_name = field
321 | assert isinstance(dst_name, str)
322 | assert isinstance(src_name, str)
323 | value = fetch(src_name, *source_records, required=required)
324 | # TODO: assert value?
325 | if value is not None:
326 | setattr(record, dst_name, value)
327 |
328 |
329 | def fetch(field, *source_records, **kwds):
330 | """Return the value from the first record in the arguments that
331 | contains the specified field. If no record in the chain contains
332 | that field, return the default value. The default value is specified
333 | by the "default" keyword argument or None. If keyword argument
334 |     "required" is True and the field is missing from all of the source
335 |     records, then a KeyError is raised."""
336 | default = kwds.pop('default', None)
337 | required = kwds.pop('required', False)
338 |     assert not kwds, 'Only "default" and "required" keywords supported'
339 | for record in source_records:
340 | if hasattr(record, field):
341 | return getattr(record, field)
342 | # Must use default.
343 | if required:
344 | raise KeyError(field)
345 | return default
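346 |     # For example, with hypothetical records a and b,
347 |     # fetch('flowcell', a, b, default='unknown') returns a.flowcell if
348 |     # present, else b.flowcell, else 'unknown'.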
346 |
347 |
348 | def replace_fields(field_list, *pairs):
349 | """Given a list of field names and one or more pairs,
350 | replace each item named in a pair by the pair.
351 |
352 | fl = 'one two three'.split()
353 | replace_fields(fl, ('two', 'spam'))
354 | # ['one', ('two', 'spam'), 'three']
355 | """
356 | result = list(field_list)
357 | for field_name, source in pairs:
358 | index = field_list.index(field_name)
359 | result[index] = field_name, source
360 | return result
361 |
362 |
363 | def rekey_map(mapping, replacements):
364 | """Given an iterable of destination/source pairs in replacements,
365 | create a new dict that is the same as the original except for the
366 | new key names."""
367 | result = dict(mapping)
368 | for dst, src in replacements:
369 | value = result[src]
370 | result[dst] = value
371 | del result[src]
372 | return result
373 |
374 |
375 | class TsvDialect(csv.Dialect):
376 | """Standard Unix-style TSV format.
377 | Also compatible with MAGE-TAB spec v1.0.
378 | See MAGE-TABv1.0.pdf section 3.1.6
379 | http://www.mged.org/mage-tab/MAGE-TABv1.0.pdf
380 | http://www.mged.org/mage-tab/"""
381 | delimiter = '\t'
382 | doublequote = False
383 | escapechar = '\\'
384 | lineterminator = '\n'
385 | quotechar = '"'
386 | quoting = csv.QUOTE_MINIMAL
387 | skipinitialspace = False
388 |
--------------------------------------------------------------------------------
/multiplex_scripts/wh_lib.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import collections.abc
2 | import csv
3 | import glob
4 | import itertools
5 | import locale
6 | import operator
7 | import os
8 | import pprint
9 | import re
10 | import sys
11 |
12 | try:
13 | import openpyxl
14 | except ImportError:
15 | pass # optional dependency, only used when parsing .xlsx files
16 | try:
17 | import xlrd
18 | except ImportError:
19 | pass # optional dependency, only used when parsing .xls files
20 | import yaml
21 |
22 |
23 | try:
24 |     locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
25 | except locale.Error:
26 |     pass  # locale unavailable on this host; fall back to the default
24 |
25 |
26 | def select(collection, *items):
27 | return operator.itemgetter(*items)(collection)
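28 |     # For example, select(['a', 'b', 'c'], 0, 2) returns ('a', 'c');
29 |     # a single index returns just that item.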
28 |
29 |
30 | def print_table(iterable):
31 | """Print out the finite input as a table. Each item in iterable must be an
32 | iterable with roughly the same number of items."""
33 | # Slurp the entire iterable.
34 | rows = list(iterable)
35 | # Compute column widths.
36 | col_widths = []
37 | for row in rows:
38 | for col_num, col_val in enumerate(row):
39 | col_len = len(str(col_val))
40 | if col_num < len(col_widths):
41 | col_widths[col_num] = max(col_widths[col_num], col_len)
42 | else:
43 | col_widths.append(col_len)
44 | # Format output.
45 | for row in rows:
46 | # Output all but last column in padded format.
47 | for col_val, col_width in list(zip(row, col_widths))[:-1]:
48 | col_str = str(col_val)
49 | if isinstance(col_val, int) and not isinstance(col_val, bool):
50 | sys.stdout.write(col_str.rjust(col_width))
51 | else:
52 | sys.stdout.write(col_str.ljust(col_width))
53 | sys.stdout.write(' | ')
54 | # Output the last column as-is.
55 | sys.stdout.write(str(row[-1]))
56 | # Add the newline.
57 | sys.stdout.write('\n')
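58 |     # For example, print_table([('sample', 'reads'), ('NWD1', 123456)])
59 |     # (hypothetical values) prints:
60 |     #   sample | reads
61 |     #   NWD1   | 123456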
58 |
59 |
60 | def yp(data, stream=sys.stdout):
61 | """Pretty print as YAML."""
62 | yd(data, stream)
63 |
64 |
65 | def yf(data):
66 | """Format as pretty YAML"""
67 | return yd(data)
68 |
69 |
70 | def yd(data, stream=None):
71 | return yaml.safe_dump(devolve(data), stream, default_flow_style=False)
72 | # TODO: devolve is not self-referential safe
73 |
74 |
75 | def fetch_tsv_header(file_path):
76 | with open(file_path) as fin:
77 |         reader = csv.reader(fin, delimiter='\t')
78 | return next(reader)
79 |
80 |
81 | def parse_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
82 | """cls_or_fcn must construct an object from a dict."""
83 | return list(iterate_tsv_file(file_path, cls_or_fcn, fieldnames))
84 |
85 |
86 | def iterate_tsv_file(file_path, cls_or_fcn=None, fieldnames=None):
87 | """cls_or_fcn must construct an object from a dict."""
88 | with open(file_path) as fin:
89 | for r in iterate_tsv_stream(fin, cls_or_fcn, fieldnames):
90 | yield r
91 |
92 |
93 | def iterate_tsv_stream(stream, cls_or_fcn=None, fieldnames=None):
94 | """cls_or_fcn must construct an object from a dict."""
95 | cls_or_fcn = cls_or_fcn or Record
96 | reader = csv.DictReader(stream, fieldnames=fieldnames, delimiter='\t')
97 | for it in reader:
98 | yield cls_or_fcn(it)
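99 |     # For example, assuming a headered TSV with a sample_id column:
100 |     #   for rec in iterate_tsv_stream(stream):
101 |     #       print(rec.sample_id)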
99 |
100 |
101 | def default_filter(record):
102 | return any(record.values())
103 |
104 |
105 | def parse_excel_records(file_path,
106 | cls_or_fcn=None,
107 | sheet_name=None,
108 | filter=default_filter):
109 | extension = os.path.splitext(file_path)[1]
110 | if extension == '.xlsx':
111 | parse_result = parse_xlsx_records(file_path, cls_or_fcn, sheet_name)
112 | elif extension == '.xls':
113 | parse_result = parse_xls_records(file_path, cls_or_fcn, sheet_name)
114 |     # elif extension == '.tsv':
115 |     #     parse_result = parse_tsv_file(file_path, cls_or_fcn)
116 | else:
117 | raise NotImplementedError(file_path)
118 | return [record for record in parse_result if filter(record)]
119 |
120 |
121 | def parse_xlsx_records(file_path, cls_or_fcn, sheet_name):
122 | fcn = get_pair_constructor(cls_or_fcn)
123 | wb = openpyxl.load_workbook(file_path, data_only=True)
124 | if sheet_name:
125 |         ws = wb[sheet_name]
126 | else:
127 | ws = wb.worksheets[0]
128 | rows = iter(ws.rows)
129 | header = tuple(normalize_field_name(c.value) for c in next(rows))
130 | return [fcn(header, (c.value for c in row)) for row in rows]
131 |
132 |
133 | def parse_xls_records(file_path, cls_or_fcn, sheet_name):
134 |     fcn = get_pair_constructor(cls_or_fcn)
135 |     wb = xlrd.open_workbook(file_path)
136 |     ws = (wb.sheet_by_name(sheet_name) if sheet_name
137 |           else wb.sheet_by_index(0))
138 | header = tuple(normalize_field_name(c) for c in ws.row_values(0))
139 | return [fcn(header, ws.row_values(i)) for i in range(1, ws.nrows)]
140 |
141 |
142 | def get_pair_constructor(cls_or_fcn):
143 | """Return a callable that constructs an object from a header, data pair."""
144 | if not cls_or_fcn:
145 | return Record.from_pair
146 | elif isinstance(cls_or_fcn, type):
147 | return cls_or_fcn.from_pair
148 | else:
149 | return cls_or_fcn
150 |
151 |
152 | class Record(collections.abc.MutableMapping):
153 | def __init__(self, mapping=None, **kwds):
154 | if mapping:
155 |         if isinstance(mapping, collections.abc.Mapping):
156 | gen = mapping.items()
157 | else:
158 | gen = mapping
159 | for k, v in gen:
160 | self.__dict__[normalize_field_name(k)] = normalize_value(v)
161 | for k, v in kwds.items():
162 | self.__dict__[normalize_field_name(k)] = normalize_value(v)
163 |
164 | @classmethod
165 | def from_pair(cls, header, data):
166 | """Alternate constructor"""
167 | return cls(zip(header, data))
168 |
169 | def __repr__(self):
170 | return '%s(%r)' % (self.__class__.__name__, self.__dict__)
171 | # TODO: Should I use self.mapping here?
172 |
173 | def __getitem__(self, key):
174 | return self.mapping[key]
175 |
176 | def __setitem__(self, key, value):
177 | self.mapping[key] = value
178 |
179 | def __delitem__(self, key):
180 | del self.mapping[key]
181 |
182 |     # __iter__ is abstract in collections.abc.MutableMapping, so it must be defined here.
183 | def __iter__(self):
184 | return iter(self.mapping)
185 |
186 |     # __len__ is abstract in collections.abc.MutableMapping, so it must be defined here.
187 | def __len__(self):
188 | return len(self.mapping)
189 |
190 | @property
191 | def mapping(self):
192 | return self.__dict__
193 |
194 | @property
195 | def attributes(self):
196 | return self.mapping.keys()
197 |
198 | def pp(self):
199 | pprint.pprint(self.to_dict())
200 |
201 | def to_dict(self):
202 | return devolve(self)
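203 |     # For example, Record({'Sample ID': 'NWD1', 'Valid?': ''}) has
204 |     # attributes sample_id == 'NWD1' and is_valid is None.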
203 |
204 |
205 | def normalize_field_name(field_name):
206 |     """Normalize to lowercase_with_underscores; a trailing '?' becomes an is_ prefix."""
207 | result = field_name
208 | if result.endswith('?'):
209 | result = result[:-1]
210 | if not result.startswith('is_'):
211 | result = 'is_' + result
212 | result = result.strip().lower().replace(' ', '_').replace(
213 | '-', '_').replace('/', '_').replace('?', '_').replace('%', 'pct')
214 | return result
215 |
216 |
217 | def normalize_value(value):
218 | """Convert empty string to None"""
219 | if value == '':
220 | value = None
221 | return value
222 |
223 |
224 | def devolve(data):
225 | """Recursively convert to just JSON-compatible types."""
226 | # TODO: possible infinite recursion
227 | is_string = isinstance(data, str)
228 |     is_iterable = isinstance(data, collections.abc.Iterable)
229 |     is_mapping = isinstance(data, collections.abc.Mapping)
230 | is_record = isinstance(data, Record)
231 | if is_record:
232 | result = devolve(data.__dict__)
233 | elif is_mapping:
234 | result = {k: devolve(v) for k, v in data.items()}
235 | elif is_iterable and not is_string:
236 | result = [devolve(it) for it in data]
237 | elif hasattr(data, '__dict__'):
238 | result = data.__dict__
239 | else:
240 | result = data
241 | return result
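242 |     # For example, devolve(Record(sample='NWD1')) -> {'sample': 'NWD1'}.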
242 |
243 |
244 | def multiplicities(iterable):
245 | """Count the number of singletons, the number of duplicates, etc.
246 | Returns a collections.Counter instance."""
247 | return collections.Counter(collections.Counter(iterable).values())
248 |
249 |
250 | def val_or_none_key(getter_fcn):
251 |     """Wraps getter_fcn, returning a key function that maps obj to the tuple
252 |     (0 or 1, val), where val = getter_fcn(obj) and the int is 0 if val is None."""
253 | def result_key_fcn(obj):
254 | val = getter_fcn(obj)
255 | n = 0 if val is None else 1
256 | return n, val
257 | return result_key_fcn
258 |
259 |
260 | def count(iterable, n=None,
261 | primary_reverse=True,
262 | secondary_reverse=False,
263 | primary_key=operator.itemgetter(1),
264 | secondary_key=val_or_none_key(operator.itemgetter(0))
265 | ):
266 | """Wraps collections.Counter. Counts, sorts the result, and takes the
267 |     first n. The primary sorting criterion is the count; the secondary sorting
268 |     criterion is the value. The default sort is descending by count and
269 | ascending by value."""
270 | result = sorted(collections.Counter(iterable).items(),
271 | key=secondary_key, reverse=secondary_reverse)
272 | result.sort(key=primary_key, reverse=primary_reverse)
273 | return result[:n]
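274 |     # For example, count('abracadabra', 2) -> [('a', 5), ('b', 2)]:
275 |     # 'a' occurs most; 'b' beats 'r' on the ascending value tiebreak.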
274 |
275 |
276 | class MinMax(object):
277 | def __init__(self, min_start=None, max_start=None, count_start=0):
278 | self.count = count_start
279 | self.min = min_start
280 | self.max = max_start
281 |
282 | def add(self, value):
283 | self.count += 1
284 | if self.min is None or self.min > value:
285 | self.min = value
286 | if self.max is None or self.max < value:
287 | self.max = value
288 |
289 | def __repr__(self):
290 | return '%s(%r, %r)' % (self.__class__.__name__, self.min, self.max)
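291 |     # For example: mm = MinMax(); mm.add(3); mm.add(1); mm.add(2)
292 |     # leaves mm.min == 1, mm.max == 3, mm.count == 3.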
291 |
292 |
293 | def slice_by_value(sequence, start=None, end=None, step=1):
294 | """Returns the earliest slice of the sequence bounded by the
295 | start and end values. Omitted optional parameters work as expected
296 | for slicing.
297 |
298 | slice_by_value('hello there world', 'o', 'w', 2) -> 'otee'
299 | """
300 | i_start = i_end = None
301 | if start is not None:
302 | i_start = sequence.index(start)
303 | if end is not None:
304 | i_end = sequence.index(end)
305 | return sequence[i_start:i_end:step]
306 |
307 |
308 | def update_subset(record, fields, *source_records, **kwds):
309 |     """Given a destination record, a sequence of fields, and one or more source
310 |     records, copy the first value found in the sources for each field.
311 | The argument for fields must be an iterable where each item is either a
312 | string or a pair of strings. If it is a pair of strings, they name
313 | the destination and source field names. If keyword argument "required"
314 | is True and any of the fields are missing from the source records,
315 | then a KeyError is raised."""
316 | required = kwds.pop('required', True)
317 | assert not kwds, 'Only "required" keyword supported'
318 | for field in fields:
319 | if isinstance(field, str):
320 | dst_name = src_name = field
321 | else:
322 | dst_name, src_name = field
323 | assert isinstance(dst_name, str)
324 | assert isinstance(src_name, str)
325 | value = fetch(src_name, *source_records, required=required)
326 | # TODO: assert value?
327 | if value is not None:
328 | setattr(record, dst_name, value)
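329 |     # For example, with hypothetical records dst and src,
330 |     #   update_subset(dst, ['sample_id', ('path', 'file_path')], src)
331 |     # copies src.sample_id to dst.sample_id and src.file_path to dst.path.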
329 |
330 |
331 | def fetch(field, *source_records, **kwds):
332 | """Return the value from the first record in the arguments that
333 | contains the specified field. If no record in the chain contains
334 | that field, return the default value. The default value is specified
335 | by the "default" keyword argument or None. If keyword argument
336 |     "required" is True and the field is missing from all of the source
337 |     records, then a KeyError is raised."""
338 | default = kwds.pop('default', None)
339 | required = kwds.pop('required', False)
340 |     assert not kwds, 'Only "default" and "required" keywords supported'
341 | for record in source_records:
342 | if hasattr(record, field):
343 | return getattr(record, field)
344 | # Must use default.
345 | if required:
346 | raise KeyError(field)
347 | return default
348 |
349 |
350 | def replace_fields(field_list, *pairs):
351 | """Given a list of field names and one or more pairs,
352 | replace each item named in a pair by the pair.
353 |
354 | fl = 'one two three'.split()
355 | replace_fields(fl, ('two', 'spam'))
356 | # ['one', ('two', 'spam'), 'three']
357 | """
358 | result = list(field_list)
359 | for field_name, source in pairs:
360 | index = field_list.index(field_name)
361 | result[index] = field_name, source
362 | return result
363 |
364 |
365 | def rekey_map(mapping, replacements):
366 | """Given an iterable of destination/source pairs in replacements,
367 | create a new dict that is the same as the original except for the
368 | new key names."""
369 | result = dict(mapping)
370 | for dst, src in replacements:
371 | value = result[src]
372 | result[dst] = value
373 | del result[src]
374 | return result
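375 |     # For example, rekey_map({'old': 1, 'keep': 2}, [('new', 'old')])
376 |     # returns {'new': 1, 'keep': 2}.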
375 |
376 |
377 | class TsvDialect(csv.Dialect):
378 | """Standard Unix-style TSV format.
379 | Also compatible with MAGE-TAB spec v1.0.
380 | See MAGE-TABv1.0.pdf section 3.1.6
381 | http://www.mged.org/mage-tab/MAGE-TABv1.0.pdf
382 | http://www.mged.org/mage-tab/"""
383 | delimiter = '\t'
384 | doublequote = False
385 | escapechar = '\\'
386 | lineterminator = '\n'
387 | quotechar = '"'
388 | quoting = csv.QUOTE_MINIMAL
389 | skipinitialspace = False
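390 |     # For example, csv.writer(fout, dialect=TsvDialect) (for some open
391 |     # file fout) writes tab-separated, backslash-escaped rows.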
390 |
--------------------------------------------------------------------------------