├── README.md
├── generate_bam_header_from_tsv.sh
├── create_worklist.sh
├── create_dir_and_links_on_nexus.sh
├── extract_rg_pg_lines.sh
├── download_md5s.sh
├── upload_bam_headers_to_nexus.sh
├── LICENSE
├── check_jobs_status.sh
├── generate_xmls.sh
├── fix_insert_sizes.py
├── remove_failed_samples_from_tsv.py
├── check_if_aspera_completed.sh
├── .gitignore
├── generate_merged_bam_header.py
├── process_bams_and_submit_from_nexus.sh
├── generate_bam_header.sh
├── replace_bam_paths_with_dna_nexus_file_ids.groovy
└── dna-nexus-steps.sh


/README.md:
--------------------------------------------------------------------------------
1 | # scripts-for-nexus-pipeline
2 | 
3 | version 1.0.0+dev
4 | 
5 | Internal scripts for working with DNAnexus
6 | 
--------------------------------------------------------------------------------
/generate_bam_header_from_tsv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Dry run over a metadata TSV: for every row, report whether the sample's
4 | # directory exists under <data_dir>, or print the mkdir command that would
5 | # create it. Nothing is created on disk.
6 | #
7 | # Usage: generate_bam_header_from_tsv.sh <tsv_file> <data_dir> <project_name> <platform>
8 | 
9 | tsv_file=$1
10 | data_dir=$2
11 | project_name=$3  # accepted but not used yet
12 | platform=$4      # accepted but not used yet
13 | 
14 | # IFS= and -r hand the raw row to cut (leading tabs and backslashes kept).
15 | while IFS= read -r line; do
16 |   sample=$(cut -f1 <<<"$line")
17 |   sample_dir="$data_dir/$sample"
18 | 
19 |   if [[ -d "$sample_dir" ]]; then
20 |     echo "$sample_dir exists.."
21 |   else
22 |     echo "mkdir $sample_dir"
23 |   fi
24 | 
25 |   # Parsed but not used yet (TSV columns 2 and 6, plus the fixed center).
26 |   library=$(cut -f2 <<<"$line")
27 |   run=$(cut -f6 <<<"$line")
28 |   center="BCM"
29 | done < "$tsv_file"
--------------------------------------------------------------------------------
/create_worklist.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Print one tab-separated worklist row per TSV row:
4 | #   sample, nexus dir, bam file id, bam file name, header file name,
5 | #   output bam file name
6 | # TSV column 5 is split on "=" into <bam file id>=<bam file name>.
7 | #
8 | # Usage: create_worklist.sh <tsv_file> <nexus_dir> <extension>
9 | 
10 | tsv_file=$1
11 | nexus_dir=$2
12 | extension=$3
13 | 
14 | while IFS= read -r line; do
15 |   sample_name=$(cut -f1 <<<"$line")
16 |   bam_field=$(cut -f5 <<<"$line")
17 |   bam_file_id=$(cut -d'=' -f1 <<<"$bam_field")
18 |   bam_file_name=$(cut -d'=' -f2 <<<"$bam_field")
19 | 
20 |   final_header_file_name="$sample_name.header.sam"
21 |   output_bam_file_name="${sample_name}_${extension}_Illumina.bam"
22 | 
23 |   # printf emits literal tabs and never reinterprets backslashes in the data.
24 |   printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
25 |     "$sample_name" "$nexus_dir" "$bam_file_id" "$bam_file_name" \
26 |     "$final_header_file_name" "$output_bam_file_name"
27 | done < "$tsv_file"
--------------------------------------------------------------------------------
/create_dir_and_links_on_nexus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # For every worklist row, create <nexus_path>/working/data/<sample> on
4 | # DNAnexus and copy the sample's bam (worklist column 3) into it.
5 | # Stops with exit status 1 at the first row whose data dir is missing.
6 | #
7 | # need to be logged into dna nexus
8 | # need to be in the correct project
9 | #
10 | # Usage: create_dir_and_links_on_nexus.sh <worklist_file>
11 | 
12 | worklist_file=$1
13 | 
14 | # The worklist is read on fd 3 so dx can never consume rows from stdin, and
15 | # the loop runs in this shell so "exit 1" really ends the script.
16 | while IFS= read -r line <&3; do
17 |   sample_name=$(cut -f1 <<<"$line")
18 |   nexus_path=$(cut -f2 <<<"$line")
19 |   nexus_data_dir="$nexus_path/working/data"
20 | 
21 |   if ! dx ls "$nexus_data_dir" > /dev/null; then
22 |     echo "nexus dir doesn't exist:" "$nexus_data_dir"
23 |     exit 1
24 |   fi
25 | 
26 |   sample_dir="$nexus_data_dir/$sample_name"
27 |   bam_file_id=$(cut -f3 <<<"$line")
28 | 
29 |   echo "dx mkdir -p $sample_dir"
30 |   dx mkdir -p "$sample_dir"
31 | 
32 |   echo "dx cp $bam_file_id $sample_dir/"
33 |   dx cp "$bam_file_id" "$sample_dir/"
34 | done 3< "$worklist_file"
--------------------------------------------------------------------------------
/extract_rg_pg_lines.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Append the @RG and @PG lines of each sample's bam header to
4 | # <data_dir>/<sample>/<sample>.incomplete.header.sam. The header file is
5 | # looked up as <wxs_headers_dir>/<barcode>*header (barcode = TSV column 4)
6 | # and is only used when exactly one file matches.
7 | #
8 | # Usage: extract_rg_pg_lines.sh <tsv_file> <data_dir>
9 | 
10 | tsv_file=$1
11 | data_dir=$2
12 | 
13 | wxs_headers_dir="/groups/submissions/projects/charges/dna-nexus-downloads/wes_headers"
14 | 
15 | # With nullglob an unmatched pattern expands to nothing, so the array length
16 | # is the number of matching header files (no need to parse ls output).
17 | shopt -s nullglob
18 | 
19 | while IFS= read -r line; do
20 |   sample=$(cut -f1 <<<"$line")
21 |   sample_dir="$data_dir/$sample"
22 |   barcode=$(cut -f4 <<<"$line")
23 | 
24 |   bam_headers=("$wxs_headers_dir/$barcode"*header)
25 |   incomplete_header_path="$sample_dir/$sample.incomplete.header.sam"
26 | 
27 |   if (( ${#bam_headers[@]} == 1 )); then
28 |     echo "extracting rg and pg lines from:" "${bam_headers[0]}"
29 |     # Anchored: an @PG line may mention "@RG" inside its command line.
30 |     grep -e '^@RG' -e '^@PG' "${bam_headers[0]}" >> "$incomplete_header_path"
31 |   elif (( ${#bam_headers[@]} == 0 )); then
32 |     echo "no bam header exists for:" "$sample"
33 |   else
34 |     echo "more than one bam header exists for:" "$sample"
35 |   fi
36 | done < "$tsv_file"
--------------------------------------------------------------------------------
/download_md5s.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Download <submission bam>.md5 for every worklist sample from
4 | # <nexus_path>/working/data/<sample> on DNAnexus into data/<sample>/.
5 | #
6 | # Usage: download_md5s.sh <worklist_file>
7 | 
8 | worklist_file=$1
9 | 
10 | # fd 3 keeps the worklist away from anything dx might read on stdin.
11 | while IFS= read -r line <&3; do
12 |   sample=$(cut -f1 <<<"$line")
13 |   nexus_path=$(cut -f2 <<<"$line")
14 |   submission_bam_filename=$(cut -f6 <<<"$line")
15 | 
16 |   nexus_sample_dir="$nexus_path/working/data/$sample"
17 |   md5_file_path="$nexus_sample_dir/$submission_bam_filename.md5"
18 | 
19 |   if ! dx ls "$nexus_sample_dir" > /dev/null; then
20 |     echo "sample dir missing for sample:" "$sample"
21 |     continue
22 |   fi
23 | 
24 |   if ! dx ls "$md5_file_path" > /dev/null; then
25 |     echo "md5file missing for sample:" "$sample"
26 |     continue
27 |   fi
28 | 
29 |   echo "dx download $md5_file_path -o data/$sample/ --no-progress"
30 |   dx download "$md5_file_path" -o "data/$sample/" --no-progress
31 | done 3< "$worklist_file"
--------------------------------------------------------------------------------
/upload_bam_headers_to_nexus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Upload each sample's header file (<data_dir>/<sample>/<worklist column 5>)
4 | # to <nexus_path>/working/data/<sample> with the DNAnexus upload agent.
5 | #
6 | # Requires: dnanexus-upload-agent/ua
7 | #
8 | # Usage: upload_bam_headers_to_nexus.sh <worklist_file> <data_dir>
9 | 
10 | worklist_file=$1
11 | data_dir=$2
12 | 
13 | # Fail early: with an empty agent path the upload line below would try to
14 | # execute the header file itself.
15 | if ! upload_agent=$(command -v ua); then
16 |   echo "upload agent (ua) not found on PATH" >&2
17 |   exit 1
18 | fi
19 | 
20 | while IFS= read -r line <&3; do
21 |   sample=$(cut -f1 <<<"$line")
22 |   nexus_path=$(cut -f2 <<<"$line")
23 |   nexus_sample_dir="$nexus_path/working/data/$sample"
24 | 
25 |   # The listing is intentionally left on stdout, as before.
26 |   if ! dx ls "$nexus_sample_dir"; then
27 |     echo "nexus sample dir does not exist for sample:" "$sample"
28 |     continue
29 |   fi
30 | 
31 |   header_filename=$(cut -f5 <<<"$line")
32 |   header_file_path="$data_dir/$sample/$header_filename"
33 | 
34 |   if [[ -f "$header_file_path" ]]; then
35 |     echo "$upload_agent $header_file_path --do-not-compress -f $nexus_sample_dir"
36 |     "$upload_agent" "$header_file_path" --do-not-compress -f "$nexus_sample_dir"
37 |   else
38 |     echo "header file missing for sample:" "$sample"
39 |   fi
40 | done 3< "$worklist_file"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /check_jobs_status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | log_file=$1 4 | 5 | num_of_jobs=`cat $log_file | grep job | wc -l` 6 | echo "number of jobs submitted:" $num_of_jobs 7 | 8 | num_of_done=0 9 | num_of_running=0 10 | num_of_failed=0 11 | num_of_waiting=0 12 | 13 | for id in `cat $log_file | grep job` 14 | do 15 | state=`dx describe $id | grep -e "State"` 16 | 17 | if [[ $state == *done ]] 18 | then 19 | num_of_done=`expr $num_of_done + 1` 20 | elif [[ $state == *running ]] 21 | then 22 | num_of_running=`expr $num_of_running + 1` 23 | elif [[ $state == *failed ]] 24 | then 25 | num_of_failed=`expr $num_of_failed + 1` 26 | elif [[ $state == *runnable ]] 27 | then 28 | num_of_waiting=`expr $num_of_waiting + 1` 29 | else 30 | echo "unknown status:" $state 31 | fi 32 | 33 | done 34 | 35 | state_known=`expr $num_of_done + $num_of_running + $num_of_failed + $num_of_waiting` 36 | state_unknown=`expr $num_of_jobs - $state_known` 37 | 38 | echo "Done:" $num_of_done 39 | echo "Running:" $num_of_running 40 | echo "Failed:" $num_of_failed 41 | echo "Waiting:" $num_of_waiting 42 | echo "State unknown:" $state_unknown 43 | -------------------------------------------------------------------------------- /generate_xmls.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | tsv_file="$1" # chs_wgs_f1_b100_20141218.final.tsv 4 | biosample_file="$2" # chs_wgs_f1_b100_20141218.biosamples 5 | data_dir="$3" # data/ 6 | extension="$4" # "Illumina" 7 | 8 | date=`date +%Y%m%d` 9 | 10 | cat $tsv_file | while read line 11 | do 12 | sample=`echo "$line" | cut -f1` 13 | sample_dir=`ls -d $data_dir/$sample` 14 | 15 | metadata_dirname="bcm-sra-$sample"_"$extension"_"$date" 16 | metadata_dir=$sample_dir/$metadata_dirname 17 | 18 | if [[ -a $metadata_dir ]] 19 | then 20 | echo 
"$metadata_dir exists" 21 | else 22 | echo mkdir $metadata_dir 23 | mkdir $metadata_dir 24 | fi 25 | 26 | scripts_home="/groups/submissions/software/noarch/apps/bcm-hgsc-nexgen-submission-pipeline/4-package-metadata" 27 | 28 | # experiment xml 29 | echo python $scripts_home/generate_experiment_xml.py $sample $tsv_file $biosample_file $metadata_dir 30 | python $scripts_home/generate_experiment_xml.py $sample $tsv_file $biosample_file $metadata_dir 31 | # run xml 32 | echo python $scripts_home/generate_run_xml.py $sample_dir $metadata_dir 33 | python $scripts_home/generate_run_xml.py $sample_dir $metadata_dir 34 | mv $metadata_dir/runs.yaml $sample_dir 35 | done 36 | -------------------------------------------------------------------------------- /fix_insert_sizes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | tsv_file = sys.argv[1] 7 | info_file = sys.argv[2] 8 | barcode_pos = int(sys.argv[3]) 9 | insert_size_pos = int(sys.argv[4]) 10 | 11 | output_filename = os.path.splitext(tsv_file)[0] + '_correct_insert_size' + os.path.splitext(tsv_file)[1] 12 | 13 | info_map = {} 14 | with open(info_file) as f: 15 | for raw_line in f: 16 | line = raw_line.rstrip() 17 | barcode = line.split('\t')[barcode_pos] 18 | insert_size = line.split('\t')[insert_size_pos] 19 | info_map[barcode] = insert_size 20 | 21 | 22 | with open(tsv_file) as f: 23 | for raw_line in f: 24 | line = raw_line.rstrip() 25 | barcode = line.split('\t')[3] 26 | tsv_insert_size = line.split('\t')[15] 27 | final_insert_size = None 28 | 29 | if 'ERROR' in tsv_insert_size: 30 | final_insert_size = info_map[barcode] 31 | elif 'MANUAL' in tsv_insert_size: 32 | final_insert_size = info_map[barcode] 33 | else: 34 | final_insert_size = tsv_insert_size 35 | 36 | final_line = line.replace(tsv_insert_size, final_insert_size) 37 | 38 | with open(output_filename, 'a') as of: 39 | of.write(final_line + '\n') 40 | 
-------------------------------------------------------------------------------- /remove_failed_samples_from_tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import collections 4 | import os 5 | import sys 6 | 7 | qc_log_file = sys.argv[1] 8 | tsv_file = sys.argv[2] 9 | 10 | file_prefix = os.path.splitext(tsv_file)[0] 11 | file_suffix = os.path.splitext(tsv_file)[1] 12 | 13 | passed_qc_tsv = os.path.join(file_prefix + '_passed_qc' + file_suffix) 14 | failed_qc_tsv = os.path.join(file_prefix + '_failed_qc' + file_suffix) 15 | 16 | print qc_log_file, tsv_file, passed_qc_tsv, failed_qc_tsv 17 | 18 | samples = set() 19 | tsv_map = collections.defaultdict(list) 20 | with open(tsv_file) as f: 21 | for raw_line in f: 22 | line = raw_line.rstrip() 23 | sample = line.split('\t')[0] 24 | samples.add(sample) 25 | tsv_map[sample].append(line) 26 | 27 | qc_failed_samples = [] 28 | with open(qc_log_file) as f: 29 | for raw_line in f: 30 | line = raw_line.rstrip() 31 | if line.startswith('ERROR'): 32 | qc_failed_samples.append(line.split()[1]) 33 | 34 | print len(tsv_map), len(samples), len(qc_failed_samples) 35 | 36 | for sample in samples: 37 | if sample in qc_failed_samples: 38 | for value in tsv_map[sample]: 39 | with open(failed_qc_tsv, 'a') as f: 40 | f.write(value + '\n') 41 | else: 42 | for value in tsv_map[sample]: 43 | with open(passed_qc_tsv, 'a') as f: 44 | f.write(value + '\n') 45 | -------------------------------------------------------------------------------- /check_if_aspera_completed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # needs to be run from working OR ARIC/ARIC-B200-SUB/BAM-I-WXS dir 4 | 5 | worklist_file=$1 6 | 7 | num_of_samples=`cat $worklist_file | wc -l` 8 | echo "number of samples in worklist:" $num_of_samples 9 | 10 | cat $worklist_file | while read line 11 | do 12 | sample=`echo "$line" | cut -f1` 13 | 
nexus_path=`echo "$line" | cut -f2` 14 | submission_bam_filename=`echo "$line" | cut -f6` 15 | 16 | nexus_data_dir="$nexus_path/working/data" 17 | nexus_sample_dir="$nexus_data_dir/$sample" 18 | nexus_submitted_dir="$nexus_path/submitted" 19 | 20 | bam_file_path="$nexus_sample_dir/$submission_bam_filename" 21 | 22 | ( dx ls $nexus_sample_dir && dx ls $nexus_submitted_dir ) > /dev/null 23 | 24 | if [ $? -eq 0 ] 25 | then 26 | aspera_log_file="$nexus_sample_dir/$submission_bam_filename".aspera.log 27 | 28 | dx ls $aspera_log_file > /dev/null 29 | 30 | if [ $? -eq 0 ] 31 | then 32 | bams_completed=`dx cat $aspera_log_file | grep -v -e uploaded -v -e started -v -e finished | grep -c Completed` 33 | 34 | if [ $bams_completed -eq 1 ] 35 | then 36 | echo "bam upload success:" $bam_file_path 37 | echo "dx mv $nexus_sample_dir $nexus_submitted_dir/" 38 | dx mv $nexus_sample_dir $nexus_submitted_dir/ 39 | echo "mv data/$sample ../ready" 40 | mv data/$sample ../ready 41 | else 42 | echo "bam upload failed:" $bam_file_path 43 | fi 44 | else 45 | echo "bam not queued up for upload:" $bam_file_path 46 | fi 47 | fi 48 | done 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /generate_merged_bam_header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | 7 | sample = sys.argv[1] 8 | tsv_file = sys.argv[2] 9 | rg_incomplete_file = sys.argv[3] 10 | project = sys.argv[4] 11 | 12 | 13 | tsv_header_info = {} 14 | with open(tsv_file) as f: 15 | for raw_line in f: 16 | line = raw_line.rstrip() 17 | if line.startswith(sample): 18 | tsv_header_info[line.split('\t')[3]] = line.split('\t')[1], line.split('\t')[5] 19 | 20 | 21 | with open(rg_incomplete_file) as f: 22 | for raw_line in f: 23 | line = raw_line.rstrip() 24 | id = pu = lb = sm = ds = cn = None 25 | for rg_value in line.split('\t'): 26 | if rg_value == "@RG": 27 | pass 28 | else: 29 | key, value = rg_value.split(":") 30 | if 'ID' in key: 31 | id = key 
+ ":" + value 32 | elif 'PU' in key: 33 | pu_barcode = value.split('_')[2] 34 | if tsv_header_info[pu_barcode]: 35 | lb_value, pu_value = tsv_header_info[pu_barcode] 36 | pu = key + ":" + pu_value 37 | else: 38 | print "ERROR: %s doesn't exist match the value in the tsv" %(pu_barcode) 39 | elif 'LB' in key: 40 | assert lb_value == value # assumption : one library name exists for both runs 41 | lb = key + ':' + lb_value 42 | elif 'SM' in key: 43 | sm = key + ":" + sample 44 | else: 45 | pass 46 | ds = "DS:" + project 47 | cn = "CN:BCM" 48 | print "@RG\t%s\t%s\t%s\t%s\t%s\t%s" %(id, lb, pu, sm, ds, cn) 49 | 50 | 51 | -------------------------------------------------------------------------------- /process_bams_and_submit_from_nexus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | worklist_file=$1 4 | 5 | # head -1 $worklist_file | while read line 6 | # tail -n +2 $worklist_file | while read line 7 | cat $worklist_file | while read line 8 | do 9 | sample=`echo "$line" | cut -f1` 10 | nexus_path=`echo "$line" | cut -f2` 11 | 12 | nexus_data_dir="$nexus_path/working/data" 13 | nexus_sample_dir="$nexus_data_dir/$sample" 14 | 15 | dx ls --folders $nexus_sample_dir 16 | 17 | if [ $? -eq 0 ] 18 | then 19 | bam_filename=`echo "$line" | cut -f4` 20 | bam_file_path=$nexus_sample_dir/$bam_filename 21 | 22 | header_filename=`echo "$line" | cut -f5` 23 | header_file_path=$nexus_sample_dir/$header_filename 24 | 25 | ( dx ls $bam_file_path && dx ls $header_file_path ) > /dev/null 26 | 27 | if [ $? 
-eq 0 ] 28 | then 29 | output_bam_filename=`echo "$line" | cut -f6` 30 | 31 | aspera_key_file="project-BX7XGq00YKX3X12J59PVZ98Z:file-BX7XJJ00YKXKX2x400670B69" 32 | 33 | # 600m; server name - gap-upload 34 | # app_id="project-BX7XGq00YKX3X12J59PVZ98Z:applet-BXVBVz80x83xZvQkj1Qbxx7v" 35 | 36 | # 200m; server name - gap-upload 37 | # app_id="project-BX7XGq00YKX3X12J59PVZ98Z:applet-BXbFGg80yGGQJ5kxp1zB0xy5" 38 | 39 | # set transfer speed; server name - gap-submit 40 | app_id="project-BX7XGq00YKX3X12J59PVZ98Z:applet-BbXFBb00KP03F3pJ1VF01kQF" 41 | 42 | 43 | echo "dx run $app_id -i input_bam_file=$bam_file_path -i input_header_file=$header_file_path -i output_bam_filename=$output_bam_filename -i aspera_key_file=$aspera_key_file -i transfer_speed=200 --destination=$nexus_sample_dir --brief" 44 | dx run $app_id -i input_bam_file=$bam_file_path -i input_header_file=$header_file_path -i output_bam_filename=$output_bam_filename -i aspera_key_file=$aspera_key_file -i transfer_speed=200 --destination=$nexus_sample_dir --brief 45 | else 46 | echo "bam/header do not exist for sample:" $sample 47 | fi 48 | else 49 | echo "nexus sample dir does not exist for sample:" $sample 50 | fi 51 | done 52 | -------------------------------------------------------------------------------- /generate_bam_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | tsv_file=$1 4 | project_name=$2 5 | data_dir=$3 6 | sq_file=$4 7 | 8 | cat $tsv_file | while read line 9 | do 10 | sample_name=`echo "$line" | cut -f1` 11 | library_name=`echo "$line" | cut -f2` 12 | run_name=`echo "$line" | cut -f6` 13 | 14 | sample_dir="$data_dir/$sample_name" 15 | 16 | ls $sample_dir > /dev/null 17 | 18 | if [ $? 
-eq 0 ] 19 | then 20 | incomplete_header_file=$(ls $sample_dir/*incomplete.header.sam) 21 | 22 | # check if the downloaded file is 0 bytes or not 23 | if [ -f $incomplete_header_file ] && [ -s $incomplete_header_file ] 24 | then 25 | incorrect_rg_file="$incomplete_header_file".RG 26 | 27 | header_file=$sample_dir/$sample_name.header.sam 28 | 29 | rg_file="$header_file".RG 30 | pg_file="$header_file".PG 31 | 32 | cat $incomplete_header_file | grep "@RG" > $incorrect_rg_file 33 | cat $incomplete_header_file | grep "@PG" > $pg_file 34 | 35 | # check if rg file is 0 bytes or not 36 | if [ -f $incorrect_rg_file ] && [ -s $incorrect_rg_file ] 37 | then 38 | number_of_rgs=`cat $incorrect_rg_file | wc -l` 39 | 40 | if [ $number_of_rgs -eq 0 ] 41 | then 42 | echo "missing rg tags for sample:" $sample_name 43 | echo "assuming one @RG for this bam" 44 | id="ID:0" 45 | number_of_rgs=1 46 | elif [ $number_of_rgs -gt 1 ] 47 | then 48 | echo "multiple rgs exist for sample:" $sample_name 49 | fi 50 | fi 51 | 52 | if [ $number_of_rgs -eq 1 ] 53 | then 54 | id=`cat $incorrect_rg_file | cut -f2` 55 | echo -e "@RG\t"$id"\tPL:Illumina\tPU:"$run_name"\tLB:"$library_name"\tDS:"$project_name"\tSM:"$sample_name"\tCN:BCM" > $rg_file 56 | cat $sq_file $rg_file $pg_file > $header_file 57 | else 58 | echo "multiple RG exist for sample:" $sample_name 59 | fi 60 | else 61 | echo "extracted header file does not exist:" $incomplete_header_file 62 | fi 63 | else 64 | echo "missing sample dir for sample:" $sample_name 65 | fi 66 | done 67 | -------------------------------------------------------------------------------- /replace_bam_paths_with_dna_nexus_file_ids.groovy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env groovy 2 | 3 | if(args.size() != 6) 4 | { 5 | println "usage: replace_bam_paths_with_dna_nexus_file_ids.groovy " 6 | println "column in the info file starts at 0th position" 7 | System.exit(-1) 8 | } 9 | 10 | // generated from 
metadata-lookup; has fixed columns 11 | def tsv_file = new File(args[0]) 12 | 13 | // can contain random columns 14 | def bam_paths_file = new File(args[1]) 15 | def project_id = args[2] 16 | def barcode_position = args[3].toInteger() 17 | def file_id_position = args[4].toInteger() 18 | def bam_position = args[5].toInteger() 19 | 20 | println "tsv file: ${tsv_file.name}" 21 | println "bam paths file: ${bam_paths_file.name}" 22 | println "nexus project id: ${project_id}" 23 | println "barcode position: ${barcode_position}" 24 | println "bam position: ${bam_position}" 25 | 26 | barcode_path_map = get_bam_paths(bam_paths_file, project_id, barcode_position, file_id_position, bam_position) 27 | 28 | replace_bam_paths(tsv_file, barcode_path_map) 29 | 30 | def get_bam_paths(bam_paths_file, project_id, barcode_pos, file_id_pos, bam_pos) 31 | { 32 | def path_map = [:] 33 | bam_paths_file.eachLine 34 | { 35 | line-> 36 | barcode = line.split('\t')[barcode_pos] 37 | file_id = line.split('\t')[file_id_pos] 38 | bam_file_name = line.split('\t')[bam_pos] 39 | path = "${project_id}:${file_id}=${bam_file_name}" 40 | path_map[barcode] = path 41 | } 42 | return path_map 43 | } 44 | 45 | def replace_bam_paths(tsv_file, barcode_path_map) 46 | { 47 | def out_filename = "${tsv_file.name}".replace('.tsv', '_correct_bam_file_ids.tsv') 48 | def out_file_handle = new File(out_filename) 49 | println "\n************************************" 50 | println "writing out new bam paths to ${out_filename}" 51 | 52 | tsv_file.eachLine 53 | { 54 | line-> 55 | tsv_barcode = line.split('\t')[3] 56 | path_1 = line.split('\t')[4] 57 | path_2 = line.split('\t')[6] 58 | 59 | if(barcode_path_map[tsv_barcode]) 60 | { 61 | if(path_1 == path_2) 62 | { 63 | path = path_1 64 | line = line.replace(path, barcode_path_map[tsv_barcode]) 65 | } 66 | else 67 | { 68 | temp_line = line.replace(path_1, barcode_path_map[tsv_barcode]) 69 | line = temp_line.replace(path_2, barcode_path_map[tsv_barcode]) 70 | } 71 | 
out_file_handle << line + '\n' 72 | } 73 | else 74 | { 75 | println "\nERROR: path missing for barcode: ${tsv_barcode}" 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /dna-nexus-steps.sh: -------------------------------------------------------------------------------- 1 | # create a project on dna-nexus by logging in through their website 2 | submissions_test_20141030 3 | 4 | # login using the command line 5 | dx login 6 | 7 | # asks you to select a project on dna-nexus 8 | Available projects (CONTRIBUTE or higher): 9 | 0) submissions_test_20141030 (ADMINISTER) 10 | 1) submissions_test_20140918 (ADMINISTER) 11 | 12 | # select 1) 13 | # create test-bam dir under that project 14 | dx mkdir test-bam 15 | dx cd test-bam 16 | 17 | # upload a bam file to test-bam/ 18 | ua path/to/sample-1.bam --do-not-compress 19 | # => DID NOT upload to test-bam/ went into the project dir despite the cd 20 | 21 | ua path/to/sample-1.bam --do-not-compress -f /test-bam 22 | # => DID upload to test-bam/ ; all dirs in the project dir start with '/' as root 23 | 24 | # to change to a different project 25 | dx select 26 | 27 | # asks you to select a project on dna-nexus 28 | Available projects (CONTRIBUTE or higher): 29 | 0) submissions_test_20141030 (ADMINISTER) 30 | 1) submissions_test_20140918 (ADMINISTER) 31 | 32 | # OR 33 | 34 | # you can specify the project name 35 | dx select submissions_test_20141030 36 | 37 | # will try to ls the sample-1.bam uploaded to a different project 38 | dx ls submissions_test_20140918:/test-bam 39 | 40 | # remove recently copied file just to test the rm command 41 | dx rm /test-bam/sample-1.bam 42 | 43 | # copy it again since we need it 44 | dx cp submissions_test_20140918:/test-bam/sample-1.bam submissions_test_20141030:/test-bam 45 | 46 | ## possible commands for the pipeline 47 | # create dir structure and copy the bam for submission 48 | 49 | # do not need an app to do this 50 | # 
dx-run-app-locally generate-dir-structure/ -iworklist=submission_20141210.worklist 51 | 52 | cat submission_20141210.worklist | cut -f1,2 | while read line; do nexus_path=`echo "$line" | cut -f1`; source_bam=`echo "$line" | cut -f2`; dx mkdir -p $nexus_path; dx cp $source_bam $nexus_path/; done 53 | 54 | # extract rg and pg lines from the bams at nexus 55 | cat submission_20141210.worklist | while read line; do nexus_path=`echo "$line" | cut -f2`; bam_file=`echo "$line" | cut -f3`; bam_file_path=$nexus_path/$bam_file; sample_name=`echo "$line" | cut -f1`; output_rg_pg_file=`echo "$line" | cut -f4`; echo dx run extract-rg-pg-from-bam -i input_bam_file=$bam_file_path -i output_rg_pg_filename=$output_rg_pg_file; dx run extract-rg-pg-from-bam -i input_bam_file=$bam_file_path -i output_rg_pg_filename=$output_rg_pg_file --destination=$nexus_path --brief; done 56 | 57 | # download rg and pg lines from the bams at nexus 58 | cd project-1/batch-1/working/data 59 | cat ../../../../submission_20141210.worklist | while read line; do sample_name=`echo "$line" | cut -f1`; nexus_path=`echo "$line" | cut -f2`; incomplete_header_file=`echo "$line" | cut -f4`; incomplete_header_file_path=$nexus_path/$incomplete_header_file; echo dx download $incomplete_header_file_path -o $sample_name/ --no-progress; dx download $incomplete_header_file_path -o $sample_name/ --no-progress; done 60 | 61 | # generate bam header locally 62 | cd project-1/batch-1/working 63 | sh generate_bam_header.sh test.final.tsv "CHARGE-S ARIC" data /users/pipeline/p-submit/svn-installed/resource/header/GRCh37-lite/SQHeader.txt 64 | 65 | # upload bam headers to nexus 66 | cat ../../../submission_20141210.worklist | while read line; do sample_name=`echo "$line" | cut -f1`; nexus_path=`echo "$line" | cut -f2`; header_file=`echo "$line" | cut -f5`; header_file_path=$(ls data/$sample_name/$header_file); echo ua $header_file_path --do-not-compress -f $nexus_path; ua $header_file_path --do-not-compress -f $nexus_path; done 
67 | 68 | #### NOT NEEDED ANYMORE ###### 69 | # run the generate bam header 70 | # cat submission_20141210.worklist | while read line; do nexus_path=`echo "$line" | cut -f2`; bam_file=`echo "$line" | cut -f3`; bam_file_path=$nexus_path/$bam_file; incomplete_header_file=`echo "$line" | cut -f4`; incomplete_header_file_path=$nexus_path/$incomplete_header_file; output_header_file=`echo "$line" | cut -f5`; echo dx run generate-bam-header -i input_bam_file=$bam_file_path -i input_sq_rg_file=$incomplete_header_file_path -i output_header_filename=$output_header_file --destination=$nexus_path --brief; dx run generate-bam-header -i input_bam_file=$bam_file_path -i input_sq_rg_file=$incomplete_header_file_path -i output_header_filename=$output_header_file --destination=$nexus_path --brief; done 71 | ############################## 72 | 73 | # run process bam 74 | cat submission_20141210.worklist | while read line; do nexus_path=`echo "$line" | cut -f2`; bam_file=`echo "$line" | cut -f3`; bam_file_path=$nexus_path/$bam_file; input_header_file=`echo "$line" | cut -f5`; input_header_file_path=$nexus_path/$input_header_file; output_bam_filename=`echo "$line" | cut -f6`; echo dx run process-bam -i input_bam_file=$bam_file_path -i input_header_file=$input_header_file_path -i output_bam_filename=$output_bam_filename --destination=$nexus_path --brief; dx run process-bam -i input_bam_file=$bam_file_path -i input_header_file=$input_header_file_path -i output_bam_filename=$output_bam_filename --destination=$nexus_path --brief; done 75 | 76 | --------------------------------------------------------------------------------