├── README.md
├── generate_bam_header_from_tsv.sh
├── create_worklist.sh
├── create_dir_and_links_on_nexus.sh
├── extract_rg_pg_lines.sh
├── download_md5s.sh
├── upload_bam_headers_to_nexus.sh
├── LICENSE
├── check_jobs_status.sh
├── generate_xmls.sh
├── fix_insert_sizes.py
├── remove_failed_samples_from_tsv.py
├── check_if_aspera_completed.sh
├── .gitignore
├── generate_merged_bam_header.py
├── process_bams_and_submit_from_nexus.sh
├── generate_bam_header.sh
├── replace_bam_paths_with_dna_nexus_file_ids.groovy
└── dna-nexus-steps.sh


/README.md:
--------------------------------------------------------------------------------
1 | # scripts-for-nexus-pipeline
2 | 
3 | version 1.0.0+dev
4 | 
5 | Internal scripts for working with DNAnexus
6 | 


--------------------------------------------------------------------------------
/generate_bam_header_from_tsv.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | tsv_file=$1
 4 | data_dir=$2
 5 | project_name=$3
 6 | platform=$4
 7 | 
 8 | cat $tsv_file | while read line
 9 | do
10 |   sample=`echo "$line" | cut -f1`
11 |   sample_dir="$data_dir/$sample"
12 | 
13 |   if [ -d $sample_dir ]
14 |   then
15 |       echo "$sample_dir exists.."
16 |   else
17 |       echo mkdir $sample_dir
18 |   fi
19 | 
20 |   library=`echo "$line" | cut -f2`
21 |   run=`echo "$line" | cut -f6`
22 |   center="BCM"
23 | 
24 | done


--------------------------------------------------------------------------------
/create_worklist.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | tsv_file=$1
 4 | nexus_dir=$2
 5 | extension=$3
 6 | 
 7 | cat $tsv_file | while read line
 8 | do
 9 |   sample_name=`echo "$line" | cut -f1`
10 |   bam_file_id=`echo "$line" | cut -f5 | cut -d"=" -f1`
11 |   bam_file_name=`echo "$line" | cut -f5 | cut -d"=" -f2`
12 | 
13 |   final_header_file_name="$sample_name.header.sam"
14 | 
15 |   output_bam_file_name="$sample_name"_"$extension"_Illumina.bam
16 |   
17 |   echo -e $sample_name"\t"$nexus_dir"\t"$bam_file_id"\t"$bam_file_name"\t"$final_header_file_name"\t"$output_bam_file_name
18 | done


--------------------------------------------------------------------------------
/create_dir_and_links_on_nexus.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # need to be logged into dna nexus
 4 | # need to be in the correct project
 5 | 
 6 | worklist_file=$1
 7 | 
 8 | cat $worklist_file | while read line
 9 | do
10 |   sample_name=`echo "$line" | cut -f1`
11 | 
12 |   nexus_path=`echo "$line" | cut -f2`
13 |   nexus_data_dir="$nexus_path/working/data"
14 | 
15 |   dx ls $nexus_data_dir > /dev/null
16 | 
17 |   if [ $? -eq 0 ]
18 |   then
19 |       sample_dir="$nexus_data_dir/$sample_name"
20 |       bam_file_id=`echo "$line" | cut -f3`
21 | 
22 |       echo "dx mkdir -p $sample_dir"
23 |       dx mkdir -p $sample_dir
24 |       
25 |       echo "dx cp $bam_file_id $sample_dir/"
26 |       dx cp $bam_file_id $sample_dir/
27 |   else
28 |       echo "nexus dir doesn't exist:" $nexus_data_dir
29 |       exit 1
30 |   fi
31 | done
32 | 


--------------------------------------------------------------------------------
/extract_rg_pg_lines.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | tsv_file=$1
 4 | data_dir=$2
 5 | 
 6 | wxs_headers_dir="/groups/submissions/projects/charges/dna-nexus-downloads/wes_headers"
 7 | 
 8 | cat $tsv_file | while read line
 9 | do
10 |     sample=`echo "$line" | cut -f1`
11 |     sample_dir=$data_dir/$sample
12 | 
13 |     barcode=`echo "$line" | cut -f4`
14 |     number_of_bam_headers=`ls $wxs_headers_dir/$barcode*header | wc -l`
15 |   
16 |     incomplete_header_path=$sample_dir/$sample".incomplete.header.sam"
17 | 
18 |     if [ $number_of_bam_headers -eq 1 ]
19 |     then
20 | 	bam_header=`ls $wxs_headers_dir/$barcode*header`
21 | 	echo "extracting rg and pg lines from:" $bam_header
22 | 	grep -e "@RG" -e "@PG" $bam_header >> $incomplete_header_path
23 |     elif [ $number_of_bam_headers -eq 0 ]
24 |     then
25 | 	echo "no bam header exists for:" $sample
26 |     else
27 | 	echo "more than one bam header exists for:" $sample
28 |     fi
29 | done
30 | 


--------------------------------------------------------------------------------
/download_md5s.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | worklist_file=$1
 4 | 
 5 | cat $worklist_file | while read line
 6 | do
 7 |   sample=`echo "$line" | cut -f1`
 8 |   nexus_path=`echo "$line" | cut -f2`
 9 |   submission_bam_filename=`echo "$line" | cut -f6`
10 | 
11 |   nexus_data_dir="$nexus_path/working/data"
12 |   nexus_sample_dir="$nexus_data_dir/$sample"
13 |   submission_bam_path="$nexus_sample_dir/$submission_bam_filename"
14 |   md5_file_path="$nexus_sample_dir/$submission_bam_filename".md5
15 | 
16 |   dx ls $nexus_sample_dir > /dev/null
17 | 
18 |   if [ $? -eq 0 ]
19 |   then
20 |       dx ls $md5_file_path > /dev/null
21 |       
22 |       if [ $? -eq 0 ]
23 |       then
24 | 	  echo dx download $md5_file_path -o data/$sample/ --no-progress
25 | 	  dx download $md5_file_path -o data/$sample/ --no-progress
26 |       else
27 | 	  echo "md5file missing for sample:" $sample
28 |       fi
29 |   else
30 |       echo "sample dir missing for sample:" $sample
31 |   fi
32 | done


--------------------------------------------------------------------------------
/upload_bam_headers_to_nexus.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | worklist_file=$1
 4 | data_dir=$2
 5 | 
 6 | # Requires: dnanexus-upload-agent/ua
 7 | upload_agent=$(which ua)
 8 | 
 9 | cat $worklist_file | while read line
10 | do
11 |   sample=`echo "$line" | cut -f1`
12 |   nexus_path=`echo "$line" | cut -f2`
13 | 
14 |   nexus_data_dir="$nexus_path/working/data"
15 |   nexus_sample_dir="$nexus_data_dir/$sample"
16 | 
17 |   dx ls $nexus_sample_dir
18 | 
19 |   if [ $? -eq 0 ]
20 |   then
21 |       header_filename=`echo "$line" | cut -f5`
22 |       header_file_path=$data_dir/$sample/$header_filename
23 | 
24 |       if [ -f $header_file_path ]
25 |       then
26 | 	  echo $upload_agent $header_file_path --do-not-compress -f $nexus_sample_dir
27 | 	  $upload_agent $header_file_path --do-not-compress -f $nexus_sample_dir
28 |       else
29 | 	  echo "header file missing for sample:" $sample
30 |       fi
31 |   else
32 |       echo "nexus sample dir does not exist for sample:" $sample
33 |   fi
34 | done
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/check_jobs_status.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | log_file=$1
 4 | 
 5 | num_of_jobs=`cat $log_file | grep job | wc -l`
 6 | echo "number of jobs submitted:" $num_of_jobs
 7 | 
 8 | num_of_done=0
 9 | num_of_running=0
10 | num_of_failed=0
11 | num_of_waiting=0
12 | 
13 | for id in `cat $log_file | grep job`
14 | do 
15 |    state=`dx describe $id | grep -e "State"`
16 | 
17 |    if [[ $state == *done ]]
18 |    then
19 |        num_of_done=`expr $num_of_done + 1`
20 |    elif [[ $state == *running ]]
21 |    then
22 |        num_of_running=`expr $num_of_running + 1`
23 |    elif [[ $state == *failed ]]
24 |    then
25 |        num_of_failed=`expr $num_of_failed + 1`
26 |    elif [[ $state == *runnable ]]
27 |    then
28 |        num_of_waiting=`expr $num_of_waiting + 1`
29 |    else
30 |        echo "unknown status:" $state
31 |    fi
32 | 
33 | done
34 | 
35 | state_known=`expr $num_of_done + $num_of_running + $num_of_failed + $num_of_waiting`
36 | state_unknown=`expr $num_of_jobs - $state_known`
37 | 
38 | echo "Done:" $num_of_done
39 | echo "Running:" $num_of_running
40 | echo "Failed:" $num_of_failed
41 | echo "Waiting:" $num_of_waiting
42 | echo "State unknown:" $state_unknown
43 | 


--------------------------------------------------------------------------------
/generate_xmls.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | tsv_file="$1"               # chs_wgs_f1_b100_20141218.final.tsv
 4 | biosample_file="$2"         # chs_wgs_f1_b100_20141218.biosamples
 5 | data_dir="$3"               # data/
 6 | extension="$4"              # "Illumina"
 7 | 
 8 | date=`date +%Y%m%d`
 9 | 
10 | cat $tsv_file | while read line
11 | do
12 |     sample=`echo "$line" | cut -f1`
13 |     sample_dir=`ls -d $data_dir/$sample`
14 | 
15 |     metadata_dirname="bcm-sra-$sample"_"$extension"_"$date"
16 |     metadata_dir=$sample_dir/$metadata_dirname
17 | 
18 |     if [[ -a $metadata_dir ]]
19 |     then
20 | 	echo "$metadata_dir exists"
21 |     else
22 | 	echo mkdir $metadata_dir
23 | 	mkdir $metadata_dir
24 |     fi
25 | 
26 |     scripts_home="/groups/submissions/software/noarch/apps/bcm-hgsc-nexgen-submission-pipeline/4-package-metadata"
27 | 
28 |     # experiment xml
29 |     echo python $scripts_home/generate_experiment_xml.py $sample $tsv_file $biosample_file $metadata_dir
30 |     python $scripts_home/generate_experiment_xml.py $sample $tsv_file $biosample_file $metadata_dir
31 |     # run xml
32 |     echo python $scripts_home/generate_run_xml.py $sample_dir $metadata_dir
33 |     python $scripts_home/generate_run_xml.py $sample_dir $metadata_dir
34 |     mv $metadata_dir/runs.yaml $sample_dir
35 | done
36 | 


--------------------------------------------------------------------------------
/fix_insert_sizes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import sys
 5 | 
 6 | tsv_file = sys.argv[1]
 7 | info_file = sys.argv[2]
 8 | barcode_pos = int(sys.argv[3])
 9 | insert_size_pos = int(sys.argv[4])
10 | 
11 | output_filename = os.path.splitext(tsv_file)[0] + '_correct_insert_size' + os.path.splitext(tsv_file)[1]
12 | 
13 | info_map = {}
14 | with open(info_file) as f:
15 |     for raw_line in f:
16 |         line = raw_line.rstrip()
17 |         barcode = line.split('\t')[barcode_pos]
18 |         insert_size = line.split('\t')[insert_size_pos]
19 |         info_map[barcode] = insert_size
20 | 
21 | 
22 | with open(tsv_file) as f:
23 |     for raw_line in f:
24 |         line = raw_line.rstrip()
25 |         barcode = line.split('\t')[3]
26 |         tsv_insert_size = line.split('\t')[15]
27 |         final_insert_size = None
28 | 
29 |         if 'ERROR' in tsv_insert_size:
30 |             final_insert_size = info_map[barcode]
31 |         elif 'MANUAL' in tsv_insert_size:
32 |             final_insert_size = info_map[barcode]
33 |         else:
34 |             final_insert_size = tsv_insert_size
35 |             
36 |         final_line = line.replace(tsv_insert_size, final_insert_size)
37 |         
38 |         with open(output_filename, 'a') as of:
39 |             of.write(final_line + '\n')
40 | 


--------------------------------------------------------------------------------
/remove_failed_samples_from_tsv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import collections
 4 | import os
 5 | import sys
 6 | 
 7 | qc_log_file = sys.argv[1]
 8 | tsv_file = sys.argv[2]
 9 | 
10 | file_prefix = os.path.splitext(tsv_file)[0]
11 | file_suffix = os.path.splitext(tsv_file)[1]
12 | 
13 | passed_qc_tsv = os.path.join(file_prefix + '_passed_qc' + file_suffix)
14 | failed_qc_tsv = os.path.join(file_prefix + '_failed_qc' + file_suffix)
15 | 
16 | print qc_log_file, tsv_file, passed_qc_tsv, failed_qc_tsv
17 | 
18 | samples = set()
19 | tsv_map = collections.defaultdict(list)
20 | with open(tsv_file) as f:
21 |     for raw_line in f:
22 |         line = raw_line.rstrip()
23 |         sample = line.split('\t')[0]
24 |         samples.add(sample)
25 |         tsv_map[sample].append(line)
26 |         
27 | qc_failed_samples = []
28 | with open(qc_log_file) as f:
29 |     for raw_line in f:
30 |         line = raw_line.rstrip()
31 |         if line.startswith('ERROR'):
32 |             qc_failed_samples.append(line.split()[1])
33 | 
34 | print len(tsv_map), len(samples), len(qc_failed_samples)
35 | 
36 | for sample in samples:
37 |     if sample in qc_failed_samples:
38 |         for value in tsv_map[sample]:
39 |             with open(failed_qc_tsv, 'a') as f:
40 |                 f.write(value + '\n')
41 |     else:
42 |         for value in tsv_map[sample]:
43 |             with open(passed_qc_tsv, 'a') as f:
44 |                 f.write(value + '\n')
45 | 


--------------------------------------------------------------------------------
/check_if_aspera_completed.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # needs to be run from working OR ARIC/ARIC-B200-SUB/BAM-I-WXS dir
 4 | 
 5 | worklist_file=$1
 6 | 
 7 | num_of_samples=`cat $worklist_file | wc -l`
 8 | echo "number of samples in worklist:" $num_of_samples 
 9 | 
10 | cat $worklist_file | while read line
11 | do 
12 |   sample=`echo "$line" | cut -f1`
13 |   nexus_path=`echo "$line" | cut -f2`
14 |   submission_bam_filename=`echo "$line" | cut -f6`
15 | 
16 |   nexus_data_dir="$nexus_path/working/data"
17 |   nexus_sample_dir="$nexus_data_dir/$sample"
18 |   nexus_submitted_dir="$nexus_path/submitted"
19 | 
20 |   bam_file_path="$nexus_sample_dir/$submission_bam_filename"
21 | 
22 |   ( dx ls $nexus_sample_dir && dx ls $nexus_submitted_dir ) > /dev/null
23 | 
24 |   if [ $? -eq 0 ]
25 |   then
26 |       aspera_log_file="$nexus_sample_dir/$submission_bam_filename".aspera.log
27 | 
28 |       dx ls $aspera_log_file > /dev/null
29 | 
30 |       if [ $? -eq 0 ]
31 |       then
32 | 	  bams_completed=`dx cat $aspera_log_file | grep -v -e uploaded -v -e started -v -e finished | grep -c Completed`
33 | 	
34 | 	  if [ $bams_completed -eq 1 ]
35 |  	  then
36 |  	      echo "bam upload success:" $bam_file_path
37 |  	      echo "dx mv $nexus_sample_dir $nexus_submitted_dir/"
38 | 	      dx mv $nexus_sample_dir $nexus_submitted_dir/
39 | 	      echo "mv data/$sample ../ready"
40 | 	      mv data/$sample ../ready
41 |  	  else
42 |  	      echo "bam upload failed:" $bam_file_path
43 |  	  fi
44 |       else
45 |  	  echo "bam not queued up for upload:" $bam_file_path
46 |       fi
47 |   fi
48 | done
49 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/generate_merged_bam_header.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import sys
 5 | 
 6 | 
 7 | sample = sys.argv[1]
 8 | tsv_file = sys.argv[2]
 9 | rg_incomplete_file = sys.argv[3]
10 | project = sys.argv[4]
11 | 
12 | 
13 | tsv_header_info = {}
14 | with open(tsv_file) as f:
15 |     for raw_line in f:
16 |         line = raw_line.rstrip()
17 |         if line.startswith(sample):
18 |             tsv_header_info[line.split('\t')[3]] = line.split('\t')[1], line.split('\t')[5]
19 | 
20 | 
21 | with open(rg_incomplete_file) as f:
22 |     for raw_line in f:
23 |         line = raw_line.rstrip()
24 |         id = pu = lb = sm = ds = cn = None
25 |         for rg_value in line.split('\t'):
26 |             if rg_value == "@RG":
27 |                 pass
28 |             else:
29 |                 key, value = rg_value.split(":")
30 |                 if 'ID' in key:
31 |                     id = key + ":" + value
32 |                 elif 'PU' in key:
33 |                     pu_barcode = value.split('_')[2]
34 |                     if tsv_header_info[pu_barcode]:
35 |                         lb_value, pu_value = tsv_header_info[pu_barcode]
36 |                         pu = key + ":" + pu_value
37 |                     else:
38 |                         print "ERROR: %s doesn't exist match the value in the tsv" %(pu_barcode)
39 |                 elif 'LB' in key:
40 |                     assert lb_value == value         # assumption : one library name exists for both runs
41 |                     lb = key + ':' + lb_value
42 |                 elif 'SM' in key:
43 |                     sm = key + ":" + sample
44 |                 else:
45 |                     pass
46 |             ds = "DS:" + project
47 |             cn = "CN:BCM"
48 |         print "@RG\t%s\t%s\t%s\t%s\t%s\t%s" %(id, lb, pu, sm, ds, cn)
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/process_bams_and_submit_from_nexus.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | worklist_file=$1
 4 | 
 5 | # head -1 $worklist_file | while read line
 6 | # tail -n +2 $worklist_file | while read line
 7 | cat $worklist_file | while read line
 8 | do
 9 |   sample=`echo "$line" | cut -f1`
10 |   nexus_path=`echo "$line" | cut -f2`
11 | 
12 |   nexus_data_dir="$nexus_path/working/data"
13 |   nexus_sample_dir="$nexus_data_dir/$sample"
14 | 
15 |   dx ls --folders $nexus_sample_dir
16 | 
17 |   if [ $? -eq 0 ]
18 |   then
19 |       bam_filename=`echo "$line" | cut -f4`
20 |       bam_file_path=$nexus_sample_dir/$bam_filename
21 | 
22 |       header_filename=`echo "$line" | cut -f5`
23 |       header_file_path=$nexus_sample_dir/$header_filename
24 | 
25 |       ( dx ls $bam_file_path && dx ls $header_file_path ) > /dev/null
26 | 
27 |       if [ $? -eq 0 ]
28 |       then
29 | 	  output_bam_filename=`echo "$line" | cut -f6`
30 |       
31 | 	  aspera_key_file="project-BX7XGq00YKX3X12J59PVZ98Z:file-BX7XJJ00YKXKX2x400670B69"
32 | 
33 | 	  # 600m; server name - gap-upload
34 | 	  # app_id="project-BX7XGq00YKX3X12J59PVZ98Z:applet-BXVBVz80x83xZvQkj1Qbxx7v"
35 | 
36 | 	  # 200m; server name - gap-upload
37 | 	  # app_id="project-BX7XGq00YKX3X12J59PVZ98Z:applet-BXbFGg80yGGQJ5kxp1zB0xy5"
38 | 
39 |           # set transfer speed; server name - gap-submit
40 |           app_id="project-BX7XGq00YKX3X12J59PVZ98Z:applet-BbXFBb00KP03F3pJ1VF01kQF"
41 | 
42 | 
43 | 	  echo "dx run $app_id -i input_bam_file=$bam_file_path -i input_header_file=$header_file_path -i output_bam_filename=$output_bam_filename -i aspera_key_file=$aspera_key_file -i transfer_speed=200 --destination=$nexus_sample_dir --brief"
44 | 	  dx run $app_id -i input_bam_file=$bam_file_path -i input_header_file=$header_file_path -i output_bam_filename=$output_bam_filename -i aspera_key_file=$aspera_key_file -i transfer_speed=200 --destination=$nexus_sample_dir --brief
45 |       else
46 | 	  echo "bam/header do not exist for sample:" $sample
47 |       fi
48 |   else
49 |       echo "nexus sample dir does not exist for sample:" $sample
50 |   fi
51 | done
52 | 


--------------------------------------------------------------------------------
/generate_bam_header.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | tsv_file=$1
 4 | project_name=$2
 5 | data_dir=$3
 6 | sq_file=$4
 7 | 
 8 | cat $tsv_file | while read line
 9 | do
10 |   sample_name=`echo "$line" | cut -f1`
11 |   library_name=`echo "$line" | cut -f2`
12 |   run_name=`echo "$line" | cut -f6`
13 | 
14 |   sample_dir="$data_dir/$sample_name"
15 | 
16 |   ls $sample_dir > /dev/null
17 | 
18 |   if [ $? -eq 0 ]
19 |   then
20 |       incomplete_header_file=$(ls $sample_dir/*incomplete.header.sam)
21 |   
22 |       # check if the downloaded file is 0 bytes or not
23 |       if [ -f $incomplete_header_file ] && [ -s $incomplete_header_file ]
24 |       then
25 | 	  incorrect_rg_file="$incomplete_header_file".RG
26 | 
27 | 	  header_file=$sample_dir/$sample_name.header.sam
28 | 
29 | 	  rg_file="$header_file".RG
30 | 	  pg_file="$header_file".PG
31 | 
32 | 	  cat $incomplete_header_file | grep "@RG" > $incorrect_rg_file
33 | 	  cat $incomplete_header_file | grep "@PG" > $pg_file
34 | 
35 |           # check if rg file is 0 bytes or not
36 | 	  if [ -f $incorrect_rg_file ] && [ -s $incorrect_rg_file ]
37 | 	  then
38 | 	      number_of_rgs=`cat $incorrect_rg_file | wc -l`
39 | 	  
40 | 	      if [ $number_of_rgs -eq 0 ]
41 | 	      then
42 | 		  echo "missing rg tags for sample:" $sample_name
43 | 		  echo "assuming one @RG for this bam"
44 | 		  id="ID:0"
45 | 		  number_of_rgs=1
46 | 	      elif [ $number_of_rgs -gt 1 ]
47 | 	      then
48 | 		  echo "multiple rgs exist for sample:" $sample_name
49 | 	      fi
50 | 	  fi
51 | 
52 | 	  if [ $number_of_rgs -eq 1 ]
53 | 	  then
54 | 	      id=`cat $incorrect_rg_file | cut -f2`
55 | 	      echo -e "@RG\t"$id"\tPL:Illumina\tPU:"$run_name"\tLB:"$library_name"\tDS:"$project_name"\tSM:"$sample_name"\tCN:BCM" > $rg_file
56 | 	      cat $sq_file $rg_file $pg_file > $header_file
57 | 	  else
58 | 	      echo "multiple RG exist for sample:" $sample_name
59 | 	  fi
60 |       else
61 | 	  echo "extracted header file does not exist:" $incomplete_header_file
62 |       fi
63 |   else
64 |       echo "missing sample dir for sample:" $sample_name
65 |   fi
66 | done
67 | 


--------------------------------------------------------------------------------
/replace_bam_paths_with_dna_nexus_file_ids.groovy:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env groovy
 2 | 
 3 | if(args.size() != 6)
 4 | {
 5 |     println "usage: replace_bam_paths_with_dna_nexus_file_ids.groovy <tsv file> <info file> <nexus project id> <position of barcode in the info file> <position of nexus file id in the info file> <position of bam file name in the info file>"
 6 |     println "column in the info file starts at 0th position"
 7 |     System.exit(-1)
 8 | }
 9 | 
10 | // generated from metadata-lookup; has fixed columns
11 | def tsv_file = new File(args[0])
12 | 
13 | // can contain random columns
14 | def bam_paths_file = new File(args[1])
15 | def project_id = args[2]
16 | def barcode_position = args[3].toInteger()
17 | def file_id_position = args[4].toInteger()
18 | def bam_position = args[5].toInteger()
19 | 
20 | println "tsv file: ${tsv_file.name}"
21 | println "bam paths file: ${bam_paths_file.name}"
22 | println "nexus project id: ${project_id}"
23 | println "barcode position: ${barcode_position}"
24 | println "bam position: ${bam_position}"
25 | 
26 | barcode_path_map = get_bam_paths(bam_paths_file, project_id, barcode_position, file_id_position, bam_position)
27 | 
28 | replace_bam_paths(tsv_file, barcode_path_map)
29 | 
/**
 * Build a map from barcode to "<project_id>:<file_id>=<bam_file_name>".
 *
 * @param bam_paths_file tab-separated info file (java.io.File)
 * @param project_id     nexus project id used as prefix of every value
 * @param barcode_pos    0-based column of the barcode
 * @param file_id_pos    0-based column of the nexus file id
 * @param bam_pos        0-based column of the bam file name
 * @return map of barcode to path string; when a barcode appears more than
 *         once the last line wins
 */
def get_bam_paths(bam_paths_file, project_id, barcode_pos, file_id_pos, bam_pos)
{
    def path_map = [:]
    def required_columns = [barcode_pos, file_id_pos, bam_pos].max() + 1

    bam_paths_file.eachLine
    {
        line ->
        // limit -1 keeps trailing empty columns; split once per line
        def columns = line.split('\t', -1)

        if(columns.size() < required_columns)
        {
            // blank lines are skipped silently, short lines with a warning
            if(line.trim())
            {
                println "WARNING: skipping info line with fewer than ${required_columns} columns: ${line}"
            }
            return
        }

        path_map[columns[barcode_pos]] = "${project_id}:${columns[file_id_pos]}=${columns[bam_pos]}".toString()
    }
    return path_map
}
44 | 
/**
 * Write a copy of the tsv in which the two bam path columns (0-based columns
 * 4 and 6) are replaced by the nexus path registered for the row's barcode
 * (0-based column 3).
 *
 * The output goes to the current directory as
 * "<tsv name without .tsv>_correct_bam_file_ids.tsv". Rows are appended, so
 * remove a stale output file before re-running. Rows whose barcode has no
 * entry in the map, or that have fewer than 7 columns, are reported and left
 * out of the output.
 *
 * @param tsv_file         metadata tsv (java.io.File)
 * @param barcode_path_map map of barcode to replacement path
 */
def replace_bam_paths(tsv_file, barcode_path_map)
{
    def out_filename = "${tsv_file.name}".replace('.tsv', '_correct_bam_file_ids.tsv')
    if(out_filename == tsv_file.name)
    {
        // no ".tsv" in the name: never write to a file named like the input
        out_filename = out_filename + '_correct_bam_file_ids'
    }
    def out_file_handle = new File(out_filename)
    println "\n************************************"
    println "writing out new bam paths to ${out_filename}"

    tsv_file.eachLine
    {
        line ->
        // limit -1 keeps trailing empty columns so the row can be rebuilt
        def columns = line.split('\t', -1)

        if(columns.size() < 7)
        {
            println "\nERROR: expected at least 7 columns but found ${columns.size()}: ${line}"
            return
        }

        def tsv_barcode = columns[3]
        def new_path = barcode_path_map[tsv_barcode]

        if(new_path)
        {
            // Replace by column, not by text: a String.replace over the whole
            // line would also rewrite other columns containing the same text
            // and would corrupt the row when a path column is empty.
            columns[4] = new_path.toString()
            columns[6] = new_path.toString()
            out_file_handle << columns.join('\t') + '\n'
        }
        else
        {
            println "\nERROR: path missing for barcode: ${tsv_barcode}"
        }
    }
}
79 | 


--------------------------------------------------------------------------------
/dna-nexus-steps.sh:
--------------------------------------------------------------------------------
 1 | # create a project on dna-nexus by logging in through their website
 2 | submissions_test_20141030
 3 | 
 4 | # login using the command line
 5 | dx login
 6 | 
 7 | # asks you to select a project on dna-nexus
 8 | Available projects (CONTRIBUTE or higher):
 9 | 0) submissions_test_20141030 (ADMINISTER)
10 | 1) submissions_test_20140918 (ADMINISTER)
11 | 
12 | # select 1)
13 | # create test-bam dir under that project
14 | dx mkdir test-bam
15 | dx cd test-bam
16 | 
17 | # upload a bam file to test-bam/
18 | ua path/to/sample-1.bam --do-not-compress 
 19 | # => DID NOT upload to test-bam/; the file went into the project root dir despite the cd
20 | 
21 | ua path/to/sample-1.bam --do-not-compress -f /test-bam
22 | # => DID upload to test-bam/ ; all dirs in the project dir start with '/' as root
23 | 
24 | # to change to a different project
25 | dx select 
26 | 
27 | # asks you to select a project on dna-nexus
28 | Available projects (CONTRIBUTE or higher):
29 | 0) submissions_test_20141030 (ADMINISTER)
30 | 1) submissions_test_20140918 (ADMINISTER)
31 | 
32 | # OR
33 | 
34 | # you can specify the project name
35 | dx select submissions_test_20141030
36 | 
 37 | # list the sample-1.bam that was uploaded to a different project
38 | dx ls submissions_test_20140918:/test-bam
39 | 
40 | # remove recently copied file just to test the rm command
41 | dx rm /test-bam/sample-1.bam
42 | 
43 | # copy it again since we need it
44 | dx cp submissions_test_20140918:/test-bam/sample-1.bam submissions_test_20141030:/test-bam
45 | 
46 | ## possible commands for the pipeline
47 | # create dir structure and copy the bam for submission
48 | 
49 | # do not need an app to do this
50 | # dx-run-app-locally generate-dir-structure/ -iworklist=submission_20141210.worklist
51 | 
52 | cat submission_20141210.worklist | cut -f1,2 | while read line; do nexus_path=`echo "$line" | cut -f1`; source_bam=`echo "$line" | cut -f2`; dx mkdir -p $nexus_path; dx cp $source_bam $nexus_path/; done
53 | 
54 | # extract rg and pg lines from the bams at nexus
55 | cat submission_20141210.worklist | while read line; do nexus_path=`echo "$line" | cut -f2`; bam_file=`echo "$line" | cut -f3`; bam_file_path=$nexus_path/$bam_file; sample_name=`echo "$line" | cut -f1`; output_rg_pg_file=`echo "$line" | cut -f4`; echo dx run extract-rg-pg-from-bam -i input_bam_file=$bam_file_path -i output_rg_pg_filename=$output_rg_pg_file; dx run extract-rg-pg-from-bam -i input_bam_file=$bam_file_path -i output_rg_pg_filename=$output_rg_pg_file --destination=$nexus_path --brief; done
56 | 
57 | # download rg and pg lines from the bams at nexus
58 | cd project-1/batch-1/working/data
59 | cat ../../../../submission_20141210.worklist | while read line; do sample_name=`echo "$line" | cut -f1`; nexus_path=`echo "$line" | cut -f2`; incomplete_header_file=`echo "$line" | cut -f4`; incomplete_header_file_path=$nexus_path/$incomplete_header_file; echo dx download $incomplete_header_file_path -o $sample_name/ --no-progress; dx download $incomplete_header_file_path -o $sample_name/ --no-progress; done
60 | 
61 | # generate bam header locally
62 | cd project-1/batch-1/working
63 | sh generate_bam_header.sh test.final.tsv "CHARGE-S ARIC" data /users/pipeline/p-submit/svn-installed/resource/header/GRCh37-lite/SQHeader.txt
64 | 
65 | # upload bam headers to nexus
66 | cat ../../../submission_20141210.worklist | while read line; do sample_name=`echo "$line" | cut -f1`; nexus_path=`echo "$line" | cut -f2`; header_file=`echo "$line" | cut -f5`; header_file_path=$(ls data/$sample_name/$header_file); echo ua $header_file_path --do-not-compress -f $nexus_path; ua $header_file_path --do-not-compress -f $nexus_path; done
67 | 
68 | #### NOT NEEDED ANYMORE ######
69 | # run the generate bam header
70 | # cat submission_20141210.worklist | while read line; do nexus_path=`echo "$line" | cut -f2`; bam_file=`echo "$line" | cut -f3`; bam_file_path=$nexus_path/$bam_file; incomplete_header_file=`echo "$line" | cut -f4`; incomplete_header_file_path=$nexus_path/$incomplete_header_file; output_header_file=`echo "$line" | cut -f5`; echo dx run generate-bam-header -i input_bam_file=$bam_file_path -i input_sq_rg_file=$incomplete_header_file_path -i output_header_filename=$output_header_file --destination=$nexus_path --brief; dx run generate-bam-header -i input_bam_file=$bam_file_path -i input_sq_rg_file=$incomplete_header_file_path -i output_header_filename=$output_header_file --destination=$nexus_path --brief; done
71 | ##############################
72 | 
73 | # run process bam
74 | cat submission_20141210.worklist | while read line; do nexus_path=`echo "$line" | cut -f2`; bam_file=`echo "$line" | cut -f3`; bam_file_path=$nexus_path/$bam_file; input_header_file=`echo "$line" | cut -f5`; input_header_file_path=$nexus_path/$input_header_file; output_bam_filename=`echo "$line" | cut -f6`; echo dx run process-bam -i input_bam_file=$bam_file_path -i input_header_file=$input_header_file_path -i output_bam_filename=$output_bam_filename --destination=$nexus_path --brief; dx run process-bam -i input_bam_file=$bam_file_path -i input_header_file=$input_header_file_path -i output_bam_filename=$output_bam_filename --destination=$nexus_path --brief; done
75 | 
76 | 


--------------------------------------------------------------------------------