├── filter_bam_file_for_popscle_dsc_pileup.sh
├── filter_vcf_file_for_popscle.sh
├── sort_vcf_same_as_bam.sh
├── README.md
└── popscle_dsc_pileup_merge_splitted.py


/filter_bam_file_for_popscle_dsc_pileup.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Copyright (C): 2020-2021 - Gert Hulselmans
  4 | #
  5 | # Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads:
  6 | #           - which overlap with SNPs in the VCF file
  7 | #           - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list
  8 | #         Keeping only relevant reads for popscle dsc-pileup can speed it up quite significantly
  9 | #         (depending on the reduction of the number of reads in the filtered BAM file vs original).
 10 | 
 11 | 
 12 | 
 13 | # Function to check if any of the programs in a pipe failed.
 14 | check_exit_codes () {
 15 |     local GET_PIPESTATUS="${PIPESTATUS[@]}";
 16 |     local exit_code;
 17 | 
 18 |     for exit_code in ${GET_PIPESTATUS} ; do
 19 |         if [ ${exit_code} -ne 0 ] ; then
 20 |              return ${exit_code};
 21 |         fi
 22 |     done
 23 | 
 24 |     return 0;
 25 | }
 26 | 
 27 | 
 28 | 
 29 | # Check if necessary programs are installed.
 30 | check_if_programs_exists () {
 31 |     local exit_code=0;
 32 | 
 33 |     # Check if bedtools is installed.
 34 |     if ! type bedtools > /dev/null 2>&1 ; then
 35 |         printf 'Error: "bedtools" could not be found in PATH.\n' > /dev/stderr;
 36 |         exit_code=2;
 37 |     fi
 38 | 
 39 |     # Check if samtools is installed.
 40 |     if ! type samtools > /dev/null 2>&1 ; then
 41 |         printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr;
 42 |         exit_code=2;
 43 |     fi
 44 | 
 45 |     if [ ${exit_code} -eq 2 ] ; then
 46 |         return ${exit_code};
 47 |     fi
 48 | 
 49 |     # Check if samtools 1.10 or higher is installed (needs to have "-D STR:FILE" or "-D, --tag-file STR:FILE" option).
 50 |     if ! samtools view --help 2>&1 | grep -q -- '-D.*STR:FILE' ; then
 51 |         printf 'Error: The version of "samtools" (%s) should be 1.10 or higher (%s found).\n' \
 52 |             "$(type samtools)" \
 53 |             "$(samtools --version | head -n 1)" \
 54 |           > /dev/stderr;
 55 |         exit_code=2;
 56 |     fi
 57 | 
 58 |     return ${exit_code};
 59 | }
 60 | 
 61 | 
 62 | 
 63 | filter_bam_file_for_popscle_dsc_pileup () {
 64 |     local input_bam_filename="${1}";
 65 |     local barcodes_tsv_filename="${2}";
 66 |     local vcf_filename="${3}";
 67 |     local output_bam_filename="${4}";
 68 |     local barcode_tag="${5:-CB}";
 69 | 
 70 |     local exit_code=0;
 71 | 
 72 |     if [ ${#@} -lt 4 ] ; then
 73 |         printf 'Usage:   filter_bam_file_for_popscle_dsc_pileup input_bam_filename barcodes_tsv_filename vcf_filename output_bam_filename [barcode_tag]\n\n';
 74 |         printf 'Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads:\n';
 75 |         printf '           - which overlap with SNPs in the VCF file\n';
 76 |         printf '           - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list\n';
 77 |         printf '         Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly\n';
 78 |         printf '         (depending on the reduction of the number of reads in the filtered BAM file vs original).\n\n';
 79 | 
 80 |         return 1;
 81 |     fi
 82 | 
 83 |     if [ ! -f  "${input_bam_filename}" ] ; then
 84 |         printf 'Error: Input (CellRanger) BAM file "%s" could not be found.\n' "${input_bam_filename}" > /dev/stderr;
 85 |         return 2;
 86 |     fi
 87 | 
 88 |     if [ ! -f  "${barcodes_tsv_filename}" ] ; then
 89 |         printf 'Error: File with barcodes "%s" could not be found.\n' "${barcodes_tsv_filename}" > /dev/stderr;
 90 |         return 2;
 91 |     fi
 92 | 
 93 |     if [ ! -f  "${vcf_filename}" ] ; then
 94 |         printf 'Error: File with unique SNPs per sample "%s" could not be found.\n' "${vcf_filename}" > /dev/stderr;
 95 |         return 2;
 96 |     fi
 97 | 
 98 |     if [ ${#barcode_tag} -ne 2 ] ; then
 99 |         printf 'Error: Barcode tag "%s" should be 2 characters.\n' "${barcode_tag}" > /dev/stderr;
100 |         return 2;
101 |     fi
102 | 
103 |     # Check if bedtools and samtools are in PATH.
104 |     if ! check_if_programs_exists ; then
105 |         return 2;
106 |     fi
107 | 
108 |     # Create much smaller BAM file for dsc-pileup of popscle:
109 |     #   - Convert VCF file with unique SNPs for each sample
110 |     #     to a BED file and merge adjacent SNP regions to one.
111 |     #   - Only include reads that contain a SNP position
112 |     #     and which contain a cell barcode of interest.
113 |     if [ "${barcodes_tsv_filename%.gz}".gz = "${barcodes_tsv_filename}" ] ; then
114 |         # Barcodes file is compressed with gzip.
115 |         bedtools merge -i "${vcf_filename}" \
116 |           | samtools view\
117 |                 -@ 8 \
118 |                 --write-index \
119 |                 -L - \
120 |                 -D "${barcode_tag}":<(zcat "${barcodes_tsv_filename}") \
121 |                 -o "${output_bam_filename}" \
122 |                 "${input_bam_filename}";
123 | 
124 |         # Check if any of the previous commands failed.
125 |         check_exit_codes;
126 | 
127 |         exit_code=$?;
128 |     else
129 |         # Barcodes file is uncompressed.
130 |         bedtools merge -i "${vcf_filename}" \
131 |           | samtools view\
132 |                 -@ 8 \
133 |                 --write-index \
134 |                 -L - \
135 |                 -D "${barcode_tag}":"${barcodes_tsv_filename}" \
136 |                 -o "${output_bam_filename}" \
137 |                 "${input_bam_filename}";
138 | 
139 |         # Check if any of the previous commands failed.
140 |         check_exit_codes;
141 | 
142 |         exit_code=$?;
143 |     fi
144 | 
145 | 
146 |     return ${exit_code};
147 | }
148 | 
149 | 
150 | 
# Run the filter with the arguments given on the command line.
filter_bam_file_for_popscle_dsc_pileup "$@";
152 | 


--------------------------------------------------------------------------------
/filter_vcf_file_for_popscle.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Copyright (C): 2020-2021 - Gert Hulselmans
  4 | #
  5 | # Purpose: Functions for filtering VCF files for usage with popscle by removing mutations which are not informative.
  6 | #
  7 | #
  8 | # BCFtools filtering expressions manual:
  9 | #   https://www.htslib.org/doc/bcftools.html#expressions
 10 | 
 11 | 
 12 | 
 13 | # Function to check if any of the programs in a pipe failed.
 14 | check_exit_codes () {
 15 |     local GET_PIPESTATUS="${PIPESTATUS[@]}";
 16 |     local exit_code;
 17 | 
 18 |     for exit_code in ${GET_PIPESTATUS} ; do
 19 |         if [ ${exit_code} -ne 0 ] ; then
 20 |              return ${exit_code};
 21 |         fi
 22 |     done
 23 | 
 24 |     return 0;
 25 | }
 26 | 
 27 | 
 28 | 
 29 | # Check if necessary programs are installed.
 30 | check_if_programs_exists () {
 31 |     local exit_code=0;
 32 | 
 33 |     # Check if awk is installed.
 34 |     if ! type awk > /dev/null 2>&1 ; then
 35 |         printf 'Error: "awk" could not be found in PATH.\n' > /dev/stderr;
 36 |         exit_code=2;
 37 |     fi
 38 | 
 39 |     # Check if bcftools is installed.
 40 |     if ! type bcftools > /dev/null 2>&1 ; then
 41 |         printf 'Error: "bcftools" could not be found in PATH.\n' > /dev/stderr;
 42 |         exit_code=2;
 43 |     fi
 44 | 
 45 |     return ${exit_code};
 46 | }
 47 | 
 48 | 
 49 | 
 50 | get_number_of_samples_in_vcf () {
 51 |     # VCF input file to use or stdin when no VCF input file is given.
 52 |     local vcf_input_file="${1:-/dev/stdin}";
 53 | 
 54 |     # Only look at the VCF header and count the number of samples in the "#CHROM" line.
 55 |     bcftools view -h "${vcf_input_file}" \
 56 |         | awk \
 57 |             -F '\t' \
 58 |             '
 59 |             {
 60 |                 if ( $1 == "#CHROM" ) {
 61 |                     if ( NF > 9 ) {
 62 |                         nbr_samples = NF - 9;
 63 |                         print nbr_samples;
 64 |                     } else {
 65 |                         print "0";
 66 |                     }
 67 |                 }
 68 |             }'
 69 | 
 70 |     check_exit_codes;
 71 | 
 72 |     return $?;
 73 | }
 74 | 
 75 | 
 76 | 
 77 | get_samples_names_in_vcf () {
 78 |     # VCF input file to use or stdin when no VCF input file is given.
 79 |     local vcf_input_file="${1:-/dev/stdin}";
 80 | 
 81 |     # Only look at the VCF header and print the sample names listed in the "#CHROM" line.
 82 |     bcftools view -h "${vcf_input_file}" \
 83 |         | awk \
 84 |             -F '\t' \
 85 |             -v "vcf_input_file=${vcf_input_file}" \
 86 |             '
 87 |             {
 88 |                 if ( $1 == "#CHROM" ) {
 89 |                     if  ( NF > 9 ) {
 90 |                         # Print all sample names.
 91 |                         for (sample_column_idx=10 ; sample_column_idx <= NF; sample_column_idx++) {
 92 |                             print $sample_column_idx;
 93 |                         }
 94 | 
 95 |                         exit(0);
 96 |                     } else {
 97 |                         printf "Error: No sample names found in VCF file \"%s\".\n", vcf_input_file > "/dev/stderr";
 98 | 
 99 |                         exit(1);
100 |                     }
101 |                 }
102 |             }'
103 | 
104 |     check_exit_codes;
105 | 
106 |     return $?;
107 | }
108 | 
109 | 
110 | 
# Extract specific samples from a VCF/BCF file.
#
# Arguments:
#   $1 - comma separated list of samples to extract.
#   $2 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --samples" on stdout; usage on stdout
#          when called without arguments.
# Returns: 1 when the usage was printed, else the exit code of bcftools.
subset_samples_from_vcf () {
    if (( $# < 1 )) ; then
        printf 'Usage: subset_samples_from_vcf comma_separated_samples_names [VCF_file]\n';
        return 1;
    fi

    local samples="${1}";
    local vcf_source="${2:-/dev/stdin}";

    # Last command: its exit code is the return value of the function.
    bcftools view --samples "${samples}" "${vcf_source}";
}
128 | 
129 | 
130 | 
# Filter out all non SNP mutations from a VCF/BCF file.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --types snps" on stdout.
# Returns: exit code of bcftools.
only_keep_snps () {
    local vcf_source="${1:-/dev/stdin}";
    local -a view_args=(--types 'snps');

    # Last command: its exit code is the return value of the function.
    bcftools view "${view_args[@]}" "${vcf_source}";
}
140 | 
141 | 
142 | 
# Filter out mutations which have missing genotypes for one or more samples,
# as those mutations are not very informative.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --genotype ^miss" on stdout.
# Returns: exit code of bcftools.
filter_out_mutations_missing_genotype_for_one_or_more_samples () {
    local vcf_source="${1:-/dev/stdin}";
    local -a view_args=(--genotype '^miss');

    # Last command: its exit code is the return value of the function.
    bcftools view "${view_args[@]}" "${vcf_source}";
}
153 | 
154 | 
155 | 
# Filter out mutations which are heterozygous for one or more samples.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --genotype ^het" on stdout.
# Returns: exit code of bcftools.
filter_out_mutations_heterozygous_for_one_or_more_samples () {
    local vcf_source="${1:-/dev/stdin}";
    local -a view_args=(--genotype '^het');

    # Last command: its exit code is the return value of the function.
    bcftools view "${view_args[@]}" "${vcf_source}";
}
165 | 
166 | 
167 | 
# Filter out mutations which are homozygous reference in all samples.
#
# Records are excluded based on the "AC" value (AC=0).
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --exclude" on stdout.
# Returns: exit code of bcftools.
filter_out_mutations_homozygous_reference_in_all_samples () {
    local vcf_source="${1:-/dev/stdin}";
    local exclude_expression='AC=0';

    # Last command: its exit code is the return value of the function.
    bcftools view --exclude "${exclude_expression}" "${vcf_source}";
}
177 | 
178 | 
179 | 
# Filter out mutations which are heterozygous in all samples.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --exclude" on stdout.
# Returns: exit code of bcftools.
filter_out_mutations_heterozygous_in_all_samples () {
    local vcf_source="${1:-/dev/stdin}";

    # Exclude records for which the number of heterozygous genotypes
    # equals the number of samples.
    local exclude_expression='COUNT(GT="het")=N_SAMPLES';

    # Last command: its exit code is the return value of the function.
    bcftools view --exclude "${exclude_expression}" "${vcf_source}";
}
189 | 
190 | 
191 | 
# Filter out mutations which are homozygous in all samples.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --exclude" on stdout.
# Returns: exit code of bcftools.
filter_out_mutations_homozygous_in_all_samples () {
    local vcf_source="${1:-/dev/stdin}";

    # Exclude records for which the number of GT="AA" genotypes
    # equals the number of samples.
    local exclude_expression='COUNT(GT="AA") = N_SAMPLES';

    # Last command: its exit code is the return value of the function.
    bcftools view --exclude "${exclude_expression}" "${vcf_source}";
}
203 | 
204 | 
205 | 
# Only keep mutations (homozygous) which are found only in one sample,
# but not at all (heterozygous/homozygous) in other samples.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --include" on stdout.
# Returns: exit code of bcftools.
only_keep_mutations_homozygous_in_one_sample () {
    local vcf_source="${1:-/dev/stdin}";

    # Exactly one GT="AA" genotype and GT="RR" for all other samples.
    # Previously used expression, kept for reference:
    #   'AC=2 && ( GT = "1|1" | GT = "1/1")'
    local include_expression='COUNT(GT="AA") = 1 && COUNT(GT="RR") = (N_SAMPLES - 1)';

    # Last command: its exit code is the return value of the function.
    bcftools view --include "${include_expression}" "${vcf_source}";
}
219 | 
220 | 
221 | 
# Only keep mutations (heterozygous/homozygous) which are found only in
# one sample, but not at all (heterozygous/homozygous) in other samples.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of "bcftools view --include" on stdout.
# Returns: exit code of bcftools.
only_keep_mutations_heterozygous_or_homozygous_in_one_sample () {
    # VCF input file to use or stdin when no VCF input file is given.
    local vcf_input_file="${1:-/dev/stdin}";

    # Pass exactly one "--include" expression: "bcftools view" takes a single
    # filter expression, so the AC based expression that used to be passed
    # as a second "--include" is only kept here for reference:
    #   '( AC=2 && ( GT = "1|1" | GT = "1/1") || AC=1 && ( GT = "0|1" | GT = "1|0" | GT = "0/1" | GT = "1/0") )'
    bcftools view \
        --include '( COUNT(GT="AA") = 1 || COUNT(GT="AR") = 1 ) && COUNT(GT="RR") = (N_SAMPLES - 1)' \
        "${vcf_input_file}";

    return $?;
}
235 | 
236 | 
237 | 
# (Re)calculate AF, AC, AN values based on the genotype info provided for
# each sample.
#
# Arguments:
#   $1 - VCF/BCF input file (default: read from stdin).
#
# Outputs: output of the bcftools "fill-tags" plugin on stdout.
# Returns: exit code of bcftools.
calculate_AF_AC_AN_values_based_on_genotype_info () {
    local vcf_source="${1:-/dev/stdin}";

    # Options after "--" are passed to the plugin itself.
    local -a plugin_args=(--tags 'AF,AC,AN');

    # Last command: its exit code is the return value of the function.
    bcftools plugin fill-tags "${vcf_source}" -- "${plugin_args[@]}";
}
247 | 
248 | 


--------------------------------------------------------------------------------
/sort_vcf_same_as_bam.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Copyright (C): 2020-2021 - Gert Hulselmans
  4 | #
  5 | # Purpose: Sort VCF file in the same order as the BAM file, so it can be used with popscle.
  6 | 
  7 | 
  8 | 
  9 | # Function to check if any of the programs in a pipe failed.
 10 | check_exit_codes () {
 11 |     local GET_PIPESTATUS="${PIPESTATUS[@]}";
 12 |     local exit_code;
 13 | 
 14 |     for exit_code in ${GET_PIPESTATUS} ; do
 15 |         if [ ${exit_code} -ne 0 ] ; then
 16 |              return ${exit_code};
 17 |         fi
 18 |     done
 19 | 
 20 |     return 0;
 21 | }
 22 | 
 23 | 
 24 | 
 25 | # Check if necessary programs are installed.
 26 | check_if_programs_exists () {
 27 |     local exit_code=0;
 28 | 
 29 |     # Check if awk is installed.
 30 |     if ! type awk > /dev/null 2>&1 ; then
 31 |         printf 'Error: "awk" could not be found in PATH.\n' > /dev/stderr;
 32 |         exit_code=2;
 33 |     fi
 34 | 
 35 |     # Check if bcftools is installed.
 36 |     if ! type bcftools > /dev/null 2>&1 ; then
 37 |         printf 'Error: "bcftools" could not be found in PATH.\n' > /dev/stderr;
 38 |         exit_code=2;
 39 |     fi
 40 | 
 41 |     # Check if samtools is installed.
 42 |     if ! type samtools > /dev/null 2>&1 ; then
 43 |         printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr;
 44 |         exit_code=2;
 45 |     fi
 46 | 
 47 |     return ${exit_code};
 48 | }
 49 | 
 50 | 
 51 | 
 52 | # Get order of the contigs (chromosomes) and their length from the BAM header.
 53 | get_contig_order_from_bam () {
 54 |     local bam_input_file="${1}";
 55 |     local output_type="${2}";
 56 | 
 57 |     if [ ${#@} -ne 2 ] ; then
 58 |         printf 'Usage: get_contig_order_from_bam BAM_file output_type\n\n';
 59 |         printf 'Arguments:\n';
 60 |         printf '  - BAM_file: BAM file from which to get the contig order and contig lengths.\n';
 61 |         printf '  - output_type:\n';
 62 |         printf '      - "names":        Return contig names.\n';
 63 |         printf '      - "chrom_sizes":  Return contig names and contig lengths.\n';
 64 |         printf '      - "vcf":          Return VCF header section for contigs.\n\n';
 65 |         return 1;
 66 |     fi
 67 | 
 68 |     case "${output_type}" in
 69 |         'names')
 70 |             ;;
 71 |         'chrom_sizes')
 72 |             ;;
 73 |         'vcf')
 74 |             ;;
 75 |         *)
 76 |             printf 'Error: output_type "%s" is not supported.\n' "${output_type}" > /dev/stderr;
 77 |             return 1;
 78 |             ;;
 79 |     esac
 80 | 
 81 |     check_if_programs_exists || return $?;
 82 | 
 83 |     # Get the order of the contigs from the BAM header.
 84 |     samtools view -H "${bam_input_file}" \
 85 |       | awk \
 86 |             -F '\t' \
 87 |             -v output_type="${output_type}" \
 88 |             '
 89 |             {
 90 |                 # Only look at sequence header fields.
 91 |                 if ($1 == "@SQ") {
 92 |                     contig_idx += 1;
 93 |                     contig_name = "";
 94 |                     contig_length = "";
 95 | 
 96 |                     # Extract contig (chromosome) name and contig (chromosome) length.
 97 |                     for (i = 2; i <= NF; i++) {
 98 |                         if ($i ~ /^SN:/) {
 99 |                             contig_name = substr($i, 4);
100 |                         }
101 | 
102 |                         if ($i ~ /^LN:/) {
103 |                             contig_length = substr($i, 4);
104 |                         }
105 | 
106 |                         # Create contig order to name and contig order to length and vcf contig appings.
107 |                         contig_idx_to_name[contig_idx] = contig_name;
108 |                         contig_idx_to_length[contig_idx] = contig_length;
109 |                         contig_idx_to_vcf_contig[contig_idx] = sprintf("##contig=<ID=%s,length=%s>", contig_name, contig_length);
110 |                     }
111 |                 }
112 |             } END {
113 |                 if (contig_idx == 0) {
114 |                     printf "Error: No \"@SQ\" header line found in BAM file.\n" > "/dev/stderr";
115 |                     exit(1);
116 |                 } else if (output_type == "names") {
117 |                     contig_names = "";
118 | 
119 |                     for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) {
120 |                         contig_names = contig_names " " contig_idx_to_name[contig_idx];
121 |                     }
122 | 
123 |                     # Print all contig names (without leading space).
124 |                     print substr(contig_names, 2);
125 |                 } else if (output_type == "chrom_sizes") {
126 |                     # Print all contig names with their length in a TAB separated fashion.
127 |                     for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) {
128 |                         print contig_idx_to_name[contig_idx] "\t" contig_idx_to_length[contig_idx];
129 |                     }
130 |                 } else if (output_type == "vcf") {
131 |                     # Print VCF header section for contigs.
132 |                     for (contig_idx = 1; contig_idx <= length(contig_idx_to_vcf_contig); contig_idx++) {
133 |                         print contig_idx_to_vcf_contig[contig_idx];
134 |                     }
135 |                 }
136 |             }'
137 | 
138 |       check_exit_codes;
139 | 
140 |       return $?;
141 | }
142 | 
143 | 
144 | 
# Sort VCF file in the same order as the BAM file, so it can be used with popscle.
#
# Arguments:
#   $1 - BAM_file: BAM file from which to get the contig order.
#   $2 - VCF_file: VCF/BCF file to sort by that contig order.
#   $3 - VCF_type: output type passed to "bcftools sort -O". When omitted,
#                  "z" is used for filenames ending in ".vcf.gz", "b" for
#                  filenames ending in ".bcf" and "v" otherwise.
#
# Outputs: sorted VCF/BCF on stdout; usage on stdout when called with less
#          than 2 arguments.
# Returns: 1 when the usage was printed, exit code of
#          check_if_programs_exists when that check fails, else the result
#          of check_exit_codes for the final "cat | bcftools sort" pipeline
#          (commands running inside the process substitutions are not part
#          of that pipeline status).
sort_vcf_same_as_bam () {
    local bam_input_file="${1}";
    local vcf_input_file="${2}";
    local vcf_type="${3:-v}";

    if (( $# < 2 )) ; then
        printf 'Usage: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type]\n\n';
        printf 'Arguments:\n';
        printf '  - BAM_file: BAM file from which to get the contig order to sort the VCF file.\n';
        printf '  - VCF_file: VCF file to sort by contig order as defined in the BAM file.\n';
        printf '  - VCF_type: VCF ouput file type (default: same as input VCF file type):\n';
        printf '              v: uncompressed VCF, z: compressed VCF,\n';
        printf '              u: uncompressed BCF, b: compressed BCF\n\n';
        printf 'Purpose:\n';
        printf '  Sort VCF file in the same order as the BAM file, so it can be used with popscle.\n\n';
        return 1;
    fi

    check_if_programs_exists || return $?;

    # If VCF type is not specified, try to guess it from the filename extension.
    if (( $# == 2 )) ; then
        case "${vcf_input_file}" in
            *.vcf.gz)
                vcf_type='z';
                ;;
            *.bcf)
                vcf_type='b';
                ;;
        esac
    fi

    # Sort VCF file by same chromosome order as BAM file.
    #
    # The first process substitution creates the new VCF header:
    #   - Get VCF header of VCF input file.
    #   - Remove all contig header lines and "#CHROM" line from the VCF header.
    #   - Append contig headers in the order they appear in the input BAM file.
    #   - Add "#CHROM" line as last line of the new VCF header.
    # The second process substitution provides the VCF records without header.
    cat \
        <(
          bcftools view -h "${vcf_input_file}" \
            | awk '$1 !~ /^##contig=/ && $1 !~ /^#CHROM/' \
            | cat \
                - \
                <(get_contig_order_from_bam "${bam_input_file}" 'vcf') \
                <(bcftools view -h "${vcf_input_file}" | tail -n 1)
        ) \
        <(bcftools view -H -O v "${vcf_input_file}") \
      | bcftools sort -O "${vcf_type}";

    # Has to directly follow the pipeline; its status is the return value.
    check_exit_codes;
}
203 | 
204 | 
205 | 
# Run the sort with the arguments given on the command line.
sort_vcf_same_as_bam "$@";
207 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Helper tools for popscle
  2 | 
  3 | Collection of tools to make [popscle](https://github.com/statgen/popscle) easier to use.
  4 | 
  5 | 
  6 | 
  7 | ## Filter BAM file for usage with popscle dsc-pileup
  8 | 
  9 | Filter BAM file for usage with popscle dsc-pileup by keeping reads:
 10 |   - which overlap with SNPs in the VCF file
 11 |   - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list
 12 | Keeping only relevant reads for popscle dsc-pileup can speed it up quite significantly
 13 | (depending on the reduction of the number of reads in the filtered BAM file vs original).
 14 | 
 15 | ```
 16 | $ ./filter_bam_file_for_popscle_dsc_pileup.sh
 17 | Usage:   filter_bam_file_for_popscle_dsc_pileup input_bam_filename barcodes_tsv_filename vcf_filename output_bam_filename [barcode_tag]
 18 | 
 19 | Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads:
 20 |            - which overlap with SNPs in the VCF file
 21 |            - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list
 22 |          Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly
 23 |          (depending on the reduction of the number of reads in the filtered BAM file vs original).
 24 | 
 25 | ```
 26 | 
 27 | ###  Example
 28 | 
 29 | ```bash
 30 | # Create filtered BAM with only the reads dsc-pileup needs.
 31 | ./filter_bam_file_for_popscle_dsc_pileup.sh \
 32 |     ./samples_to_demultiplex/outs/possorted_genome_bam.bam \
 33 |     ./samples_to_demultiplex/outs/filtered_feature_bc_matrix/barcodes.tsv \
 34 |     samples.vcf \
 35 |     /tmp/samples_to_demultiplex.filter_bam_file_for_popscle_dsc_pileup.bam
 36 | 
 37 | # Use filtered BAM file for dsc-pileup.
 38 | popscle dsc-pileup \
 39 |     --sam /tmp/samples_to_demultiplex.filter_bam_file_for_popscle_dsc_pileup.bam \
 40 |     --vcf samples.vcf \
 41 |     --group-list ./samples_to_demultiplex/outs/filtered_feature_bc_matrix/barcodes.tsv \
 42 |     --out samples_to_demultiplex.pileup
 43 | ```
 44 | 
 45 | 
 46 | 
 47 | ## Sort VCF file in the same order as the BAM file
 48 | 
 49 | Sort VCF file in the same order as the BAM file so the following `popscle dsc-pileup`
 50 | error can be solved easily:
 51 | 
 52 | ```
 53 | [E:%s] Your VCF/BCF files and SAM/BAM/CRAM files have different ordering of chromosomes. SAM/BAM/CRAM file has %s before %s, but VCF/BCF file has %s after %s"
 54 | ```
 55 | 
 56 | ```
 57 | $ ./sort_vcf_same_as_bam.sh
 58 | Usage: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type]
 59 | 
 60 | Arguments:
 61 |   - BAM_file: BAM file from which to get the contig order to sort the VCF file.
 62 |   - VCF_file: VCF file to sort by contig order as defined in the BAM file.
 63 |   - VCF_type: VCF ouput file type (default: same as input VCF file type):
 64 |               v: uncompressed VCF, z: compressed VCF,
 65 |               u: uncompressed BCF, b: compressed BCF
 66 | 
 67 | Purpose:
 68 |   Sort VCF file in the same order as the BAM file, so it can be used with popscle.
 69 | ```
 70 | 
 71 | ### Examples
 72 | 
 73 | ```bash
 74 | # Sort VCF file in the same order as the BAM file, so it can be used with popscle.
 75 | ./sort_vcf_same_as_bam.sh \
 76 |     ./samples_to_demultiplex/outs/possorted_genome_bam.bam \
 77 |     samples.vcf \
 78 |   > /tmp/samples.sorted_as_in_bam.vcf
 79 | 
 80 | # Sort gzipped VCF file in the same order as the BAM file and write compressed VCF file.
 81 | ./sort_vcf_same_as_bam.sh \
 82 |     ./samples_to_demultiplex/outs/possorted_genome_bam.bam \
 83 |     samples.vcf.gz \
 84 |   > /tmp/samples.sorted_as_in_bam.vcf.gz
 85 | 
 86 | # Sort gzipped VCF file in the same order as the BAM file and write uncompressed VCF file.
 87 | ./sort_vcf_same_as_bam.sh \
 88 |     ./samples_to_demultiplex/outs/possorted_genome_bam.bam \
 89 |     samples.vcf.gz \
 90 |     v \
 91 |   > /tmp/samples.sorted_as_in_bam.vcf
 92 | ```
 93 | 
 94 | 
 95 | 
 96 | ## Create filtered VCF files.
 97 | 
 98 | [BCFtools](https://www.htslib.org) can be used for filtering VCF files.
 99 | 
100 | Looking at the [BCFtools filtering expressions manual](https://www.htslib.org/doc/bcftools.html#expressions)
101 | gives an idea how to create your own filters for mutations.
102 | 
103 | 
104 | 
105 | ### Import functions
106 | 
107 | Import functions in current shell.
108 | 
109 | ```bash
110 | # Import functions.
111 | source filter_vcf_file_for_popscle.sh
112 | 
113 | # Check if all needed programs are installed.
114 | check_if_programs_exists
115 | ```
116 | 
117 | 
118 | 
119 | ### Get number of samples in BCF/VCF file.
120 | 
121 | ```bash
122 | get_number_of_samples_in_vcf [VCF_file]
123 | ```
124 | 
125 | Example:
126 | 
127 | ```
128 | $ get_number_of_samples_in_vcf DGRP2.source_NCSU.dm6.final.bcf
129 | 205
130 | ```
131 | 
132 | 
133 | 
134 | ### Get samples names in BCF/VCF file.
135 | 
136 | Get all the sample names available in the BCF/VCF file (after the `FORMAT` column).
137 | 
138 | ```
139 | get_samples_names_in_vcf [VCF_file]
140 | ```
141 | 
142 | Example:
143 | 
144 | ```
145 | $ get_samples_names_in_vcf DGRP2.source_NCSU.dm6.final.bcf | head
146 | DGRP-021
147 | DGRP-026
148 | DGRP-028
149 | DGRP-031
150 | DGRP-032
151 | DGRP-038
152 | DGRP-040
153 | DGRP-041
154 | DGRP-042
155 | DGRP-045
156 | ```
157 | 
158 | 
159 | 
160 | ### Subset samples from BCF/VCF file.
161 | 
162 | Extract only certain samples from a VCF file with multiple samples.
163 | 
164 | ```bash
165 | subset_samples_from_vcf comma_separated_samples_names [VCF_file]
166 | ```
167 | 
168 | Example:
169 | 
170 | ```
171 | subset_samples_from_vcf DGRP-032,DGRP-026,DGRP-042 DGRP2.source_NCSU.dm6.final.bcf | get_samples_names_in_vcf
172 | DGRP-032
173 | DGRP-026
174 | DGRP-042
175 | ```
176 | 
177 | 
178 | 
179 | ### Only keep SNPs from BCF/VCF file.
180 | 
181 | Only keep SNPs from VCF file (filter out INDELs and other mutations).
182 | 
183 | ```bash
184 | only_keep_snps [VCF_file]
185 | ```
186 | 
187 | 
188 | 
189 | ### Filter out mutations missing genotype info for one or more samples.
190 | 
191 | Filter out mutations missing genotype (`./.`) info for one or more samples.
192 | 
193 | For those mutations no info is available if the sample has the reference and/or mutations,
194 | so it might be better to skip this mutation in `popscle dsc-pileup`.
195 | 
196 | ```bash
197 | filter_out_mutations_missing_genotype_for_one_or_more_samples [VCF_file]
198 | ```
199 | 
200 | 
201 | 
202 | ### Filter out mutations heterozygous for one or more samples.
203 | 
204 | Filter out mutations that are heterozygous for one or more samples.
205 | 
206 | This can be useful to reduce the number of mutations for `popscle dsc-pileup` when working with inbred lines
207 | (all mutations are supposed to be homozygous).
208 | In combination with `only_keep_mutations_homozygous_in_one_sample`, the number of mutations can be reduced
209 | even further.
210 | 
211 | ```bash
212 | filter_out_mutations_heterozygous_for_one_or_more_samples [VCF_file]
213 | ```
214 | 
215 | 
216 | 
217 | ### Filter out mutations homozygous reference in all samples.
218 | 
219 | Filter out mutations that have homozygous reference calls in all samples.
220 | 
221 | If the mutation position contains the reference for both alleles in all samples,
222 | the mutation is not informative and can be skipped for `popscle dsc-pileup`.
223 | 
224 | ```bash
225 | filter_out_mutations_homozygous_reference_in_all_samples [VCF_file]
226 | ```
227 | 
228 | 
229 | 
230 | ### Filter out mutations heterozygous in all samples.
231 | 
232 | Filter out mutations that are heterozygous in all samples.
233 | 
234 | If all samples are inbred lines, you might want to remove all non-homozygous SNPs.
235 | 
236 | ```bash
237 | filter_out_mutations_heterozygous_in_all_samples [VCF_file]
238 | ```
239 | 
240 | 
241 | 
242 | ### Filter out mutations homozygous in all samples.
243 | 
244 | Filter out mutations that are homozygous in all samples.
245 | 
246 | If the mutation position contains the mutation for both alleles in all samples,
247 | the mutation is not informative and can be skipped for `popscle dsc-pileup`.
248 | 
249 | ```bash
250 | filter_out_mutations_homozygous_in_all_samples [VCF_file]
251 | ```
252 | 
253 | 
254 | 
255 | ### Only keep mutations heterozygous or homozygous in one sample.
256 | 
257 | Only keep mutations (heterozygous/homozygous) which are found only in
258 | one sample, but not at all (heterozygous/homozygous) in other samples.
259 | 
260 | ```bash
261 | only_keep_mutations_heterozygous_or_homozygous_in_one_sample [VCF_file]
262 | ```
263 | 
264 | 
265 | 
266 | ### Calculate allele frequency, allele count and total number of alleles.
267 | 
268 | Calculate allele frequency (`AF`), allele count (`AC`) and total number of alleles (`AN`) from genotype info of each sample.
269 | 
270 | This will add `AF`, `AC` and `AN` info fields or update those fields based on the genotype info of each sample in case they
271 | were set incorrectly.
272 | 
273 | It is recommended to run this function before running:
274 |   - `filter_out_mutations_homozygous_reference_in_all_samples`: needs correct value for `AC`.
275 |   - `filter_out_mutations_homozygous_in_all_samples`: needs correct value for `AC` and `AN`.
276 |   - `only_keep_mutations_homozygous_in_one_sample`: needs correct value for `AC`.
277 |   - `only_keep_mutations_heterozygous_in_one_sample`: needs correct value for `AC`.
278 |   - `only_keep_mutations_heterozygous_or_homozygous_in_one_sample`: needs correct value for `AC`.
279 | 
280 | Running it after `subset_samples_from_vcf` is also recommended as that function only updates `AF` but not `AC` and `AN`.
281 | 
282 | ```bash
283 | calculate_AF_AC_AN_values_based_on_genotype_info [VCF_file]
284 | ```
285 | 
286 | 
287 | 
288 | ### Examples
289 | 
290 | Create (minimal) VCF file for `popscle dsc-pileup` for 3 inbred lines (homozygous genotype SNPs are very common):
291 |   - Only keep mutations for 3 selected samples
292 |   - Only keep SNPs
293 |   - (Re)calculate allele frequency (`AF`), allele count (`AC`), total number of alleles (`AN`).
294 |   - Remove all SNPs which are missing genotype information for at least one sample (not useful to call those positions in `popscle dsc-pileup`).
295 |   - Remove all SNPs which are homozygous reference in all samples (not useful to call those positions in `popscle dsc-pileup`).
296 |   - Remove all SNPs which are homozygous in all samples (not useful to call those positions in `popscle dsc-pileup`).
297 |   - Remove all SNPs which are heterozygous in at least one sample (those mutations shouldn't exist in inbred lines).
298 |   - Only keep SNPs heterozygous or homozygous in one sample (but as heterozygous mutations are already filtered out, only keep homozygous ones).
299 | 
300 | ```bash
301 | subset_samples_from_vcf DGRP-032,DGRP-026,DGRP-042 DGRP2.source_BCM-HGSC.dm6.final.bcf \
302 |   | only_keep_snps \
303 |   | calculate_AF_AC_AN_values_based_on_genotype_info \
304 |   | filter_out_mutations_missing_genotype_for_one_or_more_samples \
305 |   | filter_out_mutations_homozygous_reference_in_all_samples \
306 |   | filter_out_mutations_homozygous_in_all_samples \
307 |   | filter_out_mutations_heterozygous_for_one_or_more_samples \
308 |   | only_keep_mutations_heterozygous_or_homozygous_in_one_sample \
309 |   > output.vcf
310 | ```
311 | 
312 | Create (minimal) VCF file for `popscle dsc-pileup` (heterozygous genotype SNPs are very common):
313 |   - Only keep SNPs.
314 |   - (Re)calculate allele frequency (`AF`), allele count (`AC`), total number of alleles (`AN`).
315 |   - Remove all SNPs which are missing genotype information for at least one sample (not useful to call those positions in `popscle dsc-pileup`).
316 |   - Remove all SNPs which are homozygous reference in all samples (not useful to call those positions in `popscle dsc-pileup`).
317 |   - Remove all SNPs which are homozygous in all samples (not useful to call those positions in `popscle dsc-pileup`).
318 | 
319 | ```bash
320 | only_keep_snps input.vcf \
321 |   | calculate_AF_AC_AN_values_based_on_genotype_info \
322 |   | filter_out_mutations_missing_genotype_for_one_or_more_samples \
323 |   | filter_out_mutations_homozygous_reference_in_all_samples \
324 |   | filter_out_mutations_homozygous_in_all_samples \
325 |   > output.vcf
326 | ```
327 | 
328 | 


--------------------------------------------------------------------------------
/popscle_dsc_pileup_merge_splitted.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | from __future__ import annotations
  4 | 
  5 | import argparse
  6 | import glob
  7 | import io
  8 | import os.path
  9 | import shutil
 10 | import sys
 11 | 
 12 | import polars as pl
 13 | 
 14 | try:
 15 |     from isal import igzip as gzip_mod  # type: ignore[import]
 16 | except ImportError:
 17 |     import gzip as gzip_mod
 18 | 
 19 | 
 20 | def write_popscle_pileup_cel_full_filename(
 21 |     popscle_dsc_output_prefix: str,
 22 |     popscle_dsc_output_full_prefix: str,
 23 | ) -> pl.DataFrame | None:
 24 |     popscle_pileup_cel_dfs = []
 25 | 
 26 |     popscle_pileup_cel_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.cel.gz"
 27 | 
 28 |     if os.path.exists(popscle_pileup_cel_full_filename):
 29 |         print(
 30 |             f'Error: popscle pileup CEL full file "{popscle_pileup_cel_full_filename}" already exists.',
 31 |             file=sys.stderr,
 32 |         )
 33 |         return None
 34 | 
 35 |     for i, popscle_pileup_cel_part_filename in enumerate(
 36 |         sorted(glob.glob(f"{popscle_dsc_output_prefix}.*.pileup.cel.gz"))
 37 |     ):
 38 |         print(
 39 |             f'Reading partial popscle pileup CEL file "{popscle_pileup_cel_part_filename}" ...',
 40 |             file=sys.stderr,
 41 |         )
 42 | 
 43 |         popscle_pileup_cel_dfs.append(
 44 |             # Read partial popscle pileup CEL file.
 45 |             pl.read_csv(
 46 |                 popscle_pileup_cel_part_filename,
 47 |                 separator="\t",
 48 |                 has_header=True,
 49 |                 dtypes={
 50 |                     "#DROPLET_ID": pl.Int64,
 51 |                     "BARCODE": pl.Utf8,
 52 |                     "NUM.READ": pl.Int64,
 53 |                     "NUM.UMI": pl.Int64,
 54 |                     "NUM.UMIwSNP": pl.Int64,
 55 |                     "NUM.SNP": pl.Int64,
 56 |                 },
 57 |             )
 58 |             .rename({"#DROPLET_ID": "DROPLET_ID_PARTITIONED"})
 59 |             .with_columns(
 60 |                 # Add current partition as a column.
 61 |                 pl.lit(i).alias("PARTITION")
 62 |             )
 63 |         )
 64 | 
 65 |     # Combine partial popscle pileup CEL files and add real "DROPLET_ID" for the full
 66 |     # dataset.
 67 |     popscle_pileup_cel_df = pl.concat(popscle_pileup_cel_dfs).with_row_count(
 68 |         name="DROPLET_ID",
 69 |         offset=0,
 70 |     )
 71 | 
 72 |     with gzip_mod.open(popscle_pileup_cel_full_filename, "w") as fh_full:
 73 |         print(
 74 |             f'Writing popscle pileup PLP full file "{popscle_pileup_cel_full_filename}" ...',
 75 |             file=sys.stderr,
 76 |         )
 77 | 
 78 |         # Create BytesIO object to temporarily write the corrected popscle pileup CEL
 79 |         # file to.
 80 |         bytes_io_tsv = io.BytesIO()
 81 | 
 82 |         # Remove "DROPLET_ID_PARTITIONED" and "PARTITION" columns before writing
 83 |         # corrected popscle pileup CEL file.
 84 |         popscle_pileup_cel_df.select(
 85 |             [
 86 |                 pl.col("DROPLET_ID").alias("#DROPLET_ID"),
 87 |                 pl.col("BARCODE"),
 88 |                 pl.col("NUM.READ"),
 89 |                 pl.col("NUM.UMI"),
 90 |                 pl.col("NUM.UMIwSNP"),
 91 |                 pl.col("NUM.SNP"),
 92 |             ],
 93 |         ).write_csv(
 94 |             bytes_io_tsv,
 95 |             has_header=True,
 96 |             separator="\t",
 97 |         )
 98 | 
 99 |         # Write BytesIO object with corrected popscle pileup CEL output
100 |         # to full popscle pileup CEL file.
101 |         fh_full.write(bytes_io_tsv.getbuffer())
102 | 
103 |     # Return corrected popscle pileup CEL output with "DROPLET_ID_PARTITIONED" and
104 |     # "PARTITION" columns as Polars DataFrame.
105 |     return popscle_pileup_cel_df
106 | 
107 | 
def write_popscle_pileup_plp_full_filename(
    popscle_pileup_cel_df: pl.DataFrame,
    popscle_dsc_output_prefix: str,
    popscle_dsc_output_full_prefix: str,
) -> bool:
    """Merge all partial popscle pileup PLP files into one full PLP file.

    Partial files are those matching ``<popscle_dsc_output_prefix>.*.pileup.plp.gz``,
    processed in sorted filename order. The position of a file in that order is
    used as its ``PARTITION`` number, and each per-partition droplet ID is
    replaced by the dataset-wide ``DROPLET_ID`` looked up in
    ``popscle_pileup_cel_df`` (inner join on ``PARTITION`` and
    ``DROPLET_ID_PARTITIONED``).

    NOTE(review): the lookup presumably relies on the CEL and PLP partial files
    sorting into the same partition order - confirm against how the partial
    runs are named.

    Args:
        popscle_pileup_cel_df: Merged CEL table with ``DROPLET_ID``,
            ``DROPLET_ID_PARTITIONED`` and ``PARTITION`` columns.
        popscle_dsc_output_prefix: Prefix shared by the partial pileup outputs.
        popscle_dsc_output_full_prefix: Prefix for the merged output; the merged
            file is written to ``<prefix>.pileup.plp.gz``.

    Returns:
        ``True`` when the merged file was written, ``False`` when the output
        file already exists or no partial PLP file was found.
    """
    popscle_pileup_plp_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.plp.gz"

    if os.path.exists(popscle_pileup_plp_full_filename):
        print(
            f'Error: popscle pileup PLP full file "{popscle_pileup_plp_full_filename}" already exists.',
            file=sys.stderr,
        )
        return False

    # Resolve the partial files BEFORE creating the output file: when the full
    # prefix looks like "<prefix>.<something>", the output filename itself matches
    # the pattern and would otherwise be read as an (empty) partial file and shift
    # the partition numbering. The prefix is escaped so glob metacharacters in a
    # path are matched literally.
    popscle_pileup_plp_part_filenames = sorted(
        glob.glob(f"{glob.escape(popscle_dsc_output_prefix)}.*.pileup.plp.gz")
    )

    # Do not create an empty output file when there is nothing to merge.
    if not popscle_pileup_plp_part_filenames:
        print(
            f'Error: No partial popscle pileup PLP files found for prefix "{popscle_dsc_output_prefix}".',
            file=sys.stderr,
        )
        return False

    with gzip_mod.open(popscle_pileup_plp_full_filename, "w") as fh_full:
        print(
            f'Writing popscle pileup PLP full file "{popscle_pileup_plp_full_filename}" ...',
            file=sys.stderr,
        )

        for i, popscle_pileup_plp_part_filename in enumerate(
            popscle_pileup_plp_part_filenames
        ):
            print(
                f'Reading partial popscle pileup PLP file "{popscle_pileup_plp_part_filename}" ...',
                file=sys.stderr,
            )

            # Create BytesIO object to temporarily write the corrected popscle pileup
            # PLP file to.
            bytes_io_tsv = io.BytesIO()

            (
                # Read partial popscle pileup PLP file.
                pl.read_csv(
                    popscle_pileup_plp_part_filename,
                    separator="\t",
                    has_header=True,
                    dtypes={
                        "#DROPLET_ID": pl.Int64,
                        "SNP_ID": pl.Int64,
                        "ALLELES": pl.Utf8,
                        "BASEQS": pl.Utf8,
                    },
                )
                .lazy()
                .rename({"#DROPLET_ID": "DROPLET_ID_PARTITIONED"})
                .with_columns(
                    # Add current partition as a column.
                    pl.lit(i).alias("PARTITION")
                )
                # Correct "DROPLET_ID" column from partial popscle pileup
                # PLP file with real "DROPLET_ID" for the full dataset.
                .join(
                    popscle_pileup_cel_df.lazy(),
                    on=["PARTITION", "DROPLET_ID_PARTITIONED"],
                    how="inner",
                )
                .select(
                    pl.col("DROPLET_ID").alias("#DROPLET_ID"),
                    pl.col("SNP_ID"),
                    pl.col("ALLELES"),
                    pl.col("BASEQS"),
                )
                .collect(streaming=True)
                .write_csv(
                    bytes_io_tsv,
                    # Write header only for first partial popscle pileup PLP file.
                    has_header=i == 0,
                    separator="\t",
                )
            )

            # Write/append BytesIO object with corrected popscle pileup PLP output
            # to full popscle pileup PLP file.
            fh_full.write(bytes_io_tsv.getbuffer())

    return True
186 | 
187 | 
def write_popscle_pileup_umi_full_filename(
    popscle_dsc_output_prefix: str,
    popscle_dsc_output_full_prefix: str,
) -> bool:
    """Merge all partial popscle pileup UMI files into one full UMI file.

    Partial files are those matching ``<popscle_dsc_output_prefix>.*.pileup.umi.gz``,
    processed in sorted filename order. The first tab-separated field of every
    line is replaced by a running counter over all partial files (starting at
    0); the rest of the line is copied unchanged.

    Args:
        popscle_dsc_output_prefix: Prefix shared by the partial pileup outputs.
        popscle_dsc_output_full_prefix: Prefix for the merged output; the merged
            file is written to ``<prefix>.pileup.umi.gz``.

    Returns:
        ``True`` when the merged file was written, ``False`` when the output
        file already exists or no partial UMI file was found.

    Raises:
        ValueError: If a non-blank line of a partial file contains no tab.
    """
    popscle_pileup_umi_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.umi.gz"

    if os.path.exists(popscle_pileup_umi_full_filename):
        print(
            f'Error: popscle pileup UMI full file "{popscle_pileup_umi_full_filename}" already exists.',
            file=sys.stderr,
        )
        return False

    # Resolve the partial files BEFORE creating the output file: when the full
    # prefix looks like "<prefix>.<something>", the output filename itself matches
    # the pattern and would otherwise be read back while it is still being
    # written. The prefix is escaped so glob metacharacters in a path are
    # matched literally.
    popscle_pileup_umi_part_filenames = sorted(
        glob.glob(f"{glob.escape(popscle_dsc_output_prefix)}.*.pileup.umi.gz")
    )

    # Do not create an empty output file when there is nothing to merge.
    if not popscle_pileup_umi_part_filenames:
        print(
            f'Error: No partial popscle pileup UMI files found for prefix "{popscle_dsc_output_prefix}".',
            file=sys.stderr,
        )
        return False

    line_count = 0

    with gzip_mod.open(popscle_pileup_umi_full_filename, "wt") as fh_full:
        print(
            f'Writing popscle pileup UMI full file "{popscle_pileup_umi_full_filename}" ...',
            file=sys.stderr,
        )

        for popscle_pileup_umi_part_filename in popscle_pileup_umi_part_filenames:
            print(
                f'Reading partial popscle pileup UMI file "{popscle_pileup_umi_part_filename}" ...',
                file=sys.stderr,
            )
            with gzip_mod.open(popscle_pileup_umi_part_filename, "rt") as fh:
                for line in fh:
                    _, tab, remainder = line.partition("\t")

                    if not tab:
                        if not line.strip():
                            # Blank lines carry no record: skip them instead of
                            # failing with an IndexError.
                            continue
                        raise ValueError(
                            f'Malformed line without tab in "{popscle_pileup_umi_part_filename}": {line!r}'
                        )

                    # A partial file without trailing newline must not get its
                    # last record fused with the first record of the next file.
                    if not remainder.endswith("\n"):
                        remainder += "\n"

                    fh_full.write(f"{line_count}\t{remainder}")
                    line_count += 1

    return True
227 | 
228 | 
def write_popscle_pileup_var_full_filename(
    popscle_dsc_output_prefix: str,
    popscle_dsc_output_full_prefix: str,
) -> bool:
    """Create the full popscle pileup VAR file from the partial VAR files.

    All partial files matching ``<popscle_dsc_output_prefix>.*.pileup.var.gz``
    are read in sorted filename order and the one with the most rows is copied
    unchanged to ``<popscle_dsc_output_full_prefix>.pileup.var.gz``. Whenever a
    larger file replaces the current candidate, the last row of the smaller
    file is compared with the row at the same position of the larger one.

    Args:
        popscle_dsc_output_prefix: Prefix shared by the partial pileup outputs.
        popscle_dsc_output_full_prefix: Prefix for the merged output.

    Returns:
        ``True`` when the full VAR file was written, ``False`` when the output
        file already exists or no partial VAR file was found.

    Raises:
        AssertionError: If the compared rows of two partial VAR files differ.
    """
    popscle_pileup_var_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.var.gz"
    popscle_pileup_var_most_complete_df = None
    popscle_pileup_var_most_complete_filename = None

    if os.path.exists(popscle_pileup_var_full_filename):
        print(
            f'Error: popscle pileup VAR full file "{popscle_pileup_var_full_filename}" already exists.',
            file=sys.stderr,
        )
        return False

    # Read each partial popscle dsc pileup var file and take the one that contains the
    # most mutations as it seems that popscle will always include all mutations with
    # the same SNP_ID.
    # The prefix is escaped so glob metacharacters in a path are matched literally.
    for popscle_pileup_var_part_filename in sorted(
        glob.glob(f"{glob.escape(popscle_dsc_output_prefix)}.*.pileup.var.gz")
    ):
        print(
            f'Reading partial popscle pileup VAR file "{popscle_pileup_var_part_filename}" ...',
            file=sys.stderr,
        )

        popscle_pileup_var_df = pl.read_csv(
            popscle_pileup_var_part_filename,
            separator="\t",
            has_header=True,
            dtypes={
                "#SNP_ID": pl.Int64,
                "CHROM": pl.Utf8,
                "POS": pl.Int64,
                "REF": pl.Utf8,
                "ALT": pl.Utf8,
                "AF": pl.Utf8,
            },
        ).rename({"#SNP_ID": "SNP_ID"})

        if popscle_pileup_var_most_complete_df is None:
            popscle_pileup_var_most_complete_df = popscle_pileup_var_df
            popscle_pileup_var_most_complete_filename = popscle_pileup_var_part_filename
        elif popscle_pileup_var_df.height > popscle_pileup_var_most_complete_df.height:
            # An empty candidate has no last row to compare (index -1 would
            # address the last row of the bigger file instead).
            if popscle_pileup_var_most_complete_df.height > 0:
                # Check if the last element of the smallest file is found exactly in
                # the bigger. If this check fails, our assumption is wrong.
                last_snp_matches = (
                    popscle_pileup_var_most_complete_df[
                        popscle_pileup_var_most_complete_df.height - 1
                    ].to_struct("LAST_SNP")
                    == popscle_pileup_var_df[
                        popscle_pileup_var_most_complete_df.height - 1
                    ].to_struct("LAST_SNP")
                ).item()

                # Raise explicitly instead of using "assert", so the check is not
                # stripped when Python runs with -O.
                if not last_snp_matches:
                    raise AssertionError(
                        "SNP_IDs do not match between different popscle dsc pileup var files."
                    )

            popscle_pileup_var_most_complete_df = popscle_pileup_var_df
            popscle_pileup_var_most_complete_filename = popscle_pileup_var_part_filename

    if popscle_pileup_var_most_complete_filename is None:
        # Tell the user why nothing was written instead of failing silently.
        print(
            f'Error: No partial popscle pileup VAR files found for prefix "{popscle_dsc_output_prefix}".',
            file=sys.stderr,
        )
        return False

    print(
        f'Writing popscle pileup VAR full file "{popscle_pileup_var_full_filename}" ...',
        file=sys.stderr,
    )
    shutil.copy2(
        popscle_pileup_var_most_complete_filename, popscle_pileup_var_full_filename
    )

    return True
301 | 
302 | 
def main() -> None:
    """Parse the command line and merge partial popscle dsc pileup outputs.

    Exits with status 1 when any of the full output files (CEL, PLP, UMI, VAR)
    already exists, or when any of the merge steps reports a failure.
    """
    parser = argparse.ArgumentParser(
        description="Merge popscle dsc pileup outputs from multiple popscle dsc "
        "pileup runs with the same VCF file as input, but with a different list "
        "of cell barcodes."
    )

    parser.add_argument(
        "-i",
        "--input",
        dest="popscle_dsc_output_prefix",
        action="store",
        type=str,
        required=True,
        help="popscle pileup dsc output prefix for partial popscle dsc pileup output.",
    )

    parser.add_argument(
        "-o",
        "--output",
        dest="popscle_dsc_output_full_prefix",
        action="store",
        type=str,
        required=True,
        help="popscle pileup dsc output prefix for full popscle dsc pileup output.",
    )

    args = parser.parse_args()

    # Check all output files up front and report every one that already exists,
    # so nothing is written when the merge cannot be completed.
    output_exists = False

    for file_type in ("cel", "plp", "umi", "var"):
        popscle_pileup_full_filename = (
            f"{args.popscle_dsc_output_full_prefix}.pileup.{file_type}.gz"
        )

        if os.path.exists(popscle_pileup_full_filename):
            print(
                f'Error: popscle pileup {file_type.upper()} full file "{popscle_pileup_full_filename}" already exists.',
                file=sys.stderr,
            )
            output_exists = True

    if output_exists:
        sys.exit(1)

    # The merged CEL table provides the mapping from per-partition droplet IDs
    # to dataset-wide droplet IDs needed by the PLP step.
    popscle_pileup_cel_df = write_popscle_pileup_cel_full_filename(
        popscle_dsc_output_prefix=args.popscle_dsc_output_prefix,
        popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix,
    )

    if popscle_pileup_cel_df is None:
        sys.exit(1)

    plp_written = write_popscle_pileup_plp_full_filename(
        popscle_pileup_cel_df=popscle_pileup_cel_df,
        popscle_dsc_output_prefix=args.popscle_dsc_output_prefix,
        popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix,
    )

    umi_written = write_popscle_pileup_umi_full_filename(
        popscle_dsc_output_prefix=args.popscle_dsc_output_prefix,
        popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix,
    )
    var_written = write_popscle_pileup_var_full_filename(
        popscle_dsc_output_prefix=args.popscle_dsc_output_prefix,
        popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix,
    )

    # Every step is still attempted, but a failed step must not be reported as
    # success to the calling shell or pipeline.
    if not (plp_written and umi_written and var_written):
        sys.exit(1)
400 | 
401 | 
# Run the merge only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
404 | 


--------------------------------------------------------------------------------