├── filter_bam_file_for_popscle_dsc_pileup.sh ├── filter_vcf_file_for_popscle.sh ├── sort_vcf_same_as_bam.sh ├── README.md └── popscle_dsc_pileup_merge_splitted.py /filter_bam_file_for_popscle_dsc_pileup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (C): 2020-2021 - Gert Hulselmans 4 | # 5 | # Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads: 6 | # - which overlap with SNPs in the VCF file 7 | # - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list 8 | # Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly 9 | # (depending on the reduction of the number of reads in the filtered BAM file vs original). 10 | 11 | 12 | 13 | # Function to check if any of the programs in a pipe failed. 14 | check_exit_codes () { 15 | local GET_PIPESTATUS="${PIPESTATUS[@]}"; 16 | local exit_code; 17 | 18 | for exit_code in ${GET_PIPESTATUS} ; do 19 | if [ ${exit_code} -ne 0 ] ; then 20 | return ${exit_code}; 21 | fi 22 | done 23 | 24 | return 0; 25 | } 26 | 27 | 28 | 29 | # Check if necessary programs are installed. 30 | check_if_programs_exists () { 31 | local exit_code=0; 32 | 33 | # Check if bedtools is installed. 34 | if ! type bedtools > /dev/null 2>&1 ; then 35 | printf 'Error: "bedtools" could not be found in PATH.\n' > /dev/stderr; 36 | exit_code=2; 37 | fi 38 | 39 | # Check if samtools is installed. 40 | if ! type samtools > /dev/null 2>&1 ; then 41 | printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr; 42 | exit_code=2; 43 | fi 44 | 45 | if [ ${exit_code} -eq 2 ] ; then 46 | return ${exit_code}; 47 | fi 48 | 49 | # Check if samtools 1.10 or higher is installed (needs to have "-D STR:FILE" or "-D, --tag-file STR:FILE" option). 50 | if ! samtools view --help 2>&1 | grep -q -- '-D.*STR:FILE' ; then 51 | printf 'Error: The version of "samtools" (%s) should be 1.10 or higher (%s found).\n' \ 52 | "$(type samtools)" \ 53 | "$(samtools --version | head -n 1)" \ 54 | > /dev/stderr; 55 | exit_code=2; 56 | fi 57 | 58 | return ${exit_code}; 59 | } 60 | 61 | 62 | 63 | filter_bam_file_for_popscle_dsc_pileup () { 64 | local input_bam_filename="${1}"; 65 | local barcodes_tsv_filename="${2}"; 66 | local vcf_filename="${3}"; 67 | local output_bam_filename="${4}"; 68 | local barcode_tag="${5:-CB}"; 69 | 70 | local exit_code=0; 71 | 72 | if [ ${#@} -lt 4 ] ; then 73 | printf 'Usage: filter_bam_file_for_popscle_dsc_pileup input_bam_filename barcodes_tsv_filename vcf_filename output_bam_filename [barcode_tag]\n\n'; 74 | printf 'Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads:\n'; 75 | printf ' - which overlap with SNPs in the VCF file\n'; 76 | printf ' - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list\n'; 77 | printf ' Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly\n'; 78 | printf ' (depending on the reduction of the number of reads in the filtered BAM file vs original).\n\n'; 79 | 80 | return 1; 81 | fi 82 | 83 | if [ ! -f "${input_bam_filename}" ] ; then 84 | printf 'Error: Input (CellRanger) BAM file "%s" could not be found.\n' "${input_bam_filename}" > /dev/stderr; 85 | return 2; 86 | fi 87 | 88 | if [ ! -f "${barcodes_tsv_filename}" ] ; then 89 | printf 'Error: File with barcodes "%s" could not be found.\n' "${barcodes_tsv_filename}" > /dev/stderr; 90 | return 2; 91 | fi 92 | 93 | if [ ! -f "${vcf_filename}" ] ; then 94 | printf 'Error: File with unique SNPs per sample "%s" could not be found.\n' "${vcf_filename}" > /dev/stderr; 95 | return 2; 96 | fi 97 | 98 | if [ ${#barcode_tag} -ne 2 ] ; then 99 | printf 'Error: Barcode tag "%s" should be 2 characters.\n' "${barcode_tag}" > /dev/stderr; 100 | return 2; 101 | fi 102 | 103 | # Check if bedtools and samtools are in PATH. 104 | if ! check_if_programs_exists ; then 105 | return 2; 106 | fi 107 | 108 | # Create much smaller BAM file for dsc-pileup of popscle: 109 | # - Convert VCF file with unique SNPs for each sample 110 | # to a BED file and merge adjacent SNP regions to one. 111 | # - Only include reads that contain a SNP position 112 | # and which contain a cell barcode of interest. 113 | if [ "${barcodes_tsv_filename%.gz}".gz = "${barcodes_tsv_filename}" ] ; then 114 | # Barcodes file is compressed with gzip. 115 | bedtools merge -i "${vcf_filename}" \ 116 | | samtools view\ 117 | -@ 8 \ 118 | --write-index \ 119 | -L - \ 120 | -D "${barcode_tag}":<(zcat "${barcodes_tsv_filename}") \ 121 | -o "${output_bam_filename}" \ 122 | "${input_bam_filename}"; 123 | 124 | # Check if any of the previous commands failed. 125 | check_exit_codes; 126 | 127 | exit_code=$?; 128 | else 129 | # Barcodes file is uncompressed. 130 | bedtools merge -i "${vcf_filename}" \ 131 | | samtools view\ 132 | -@ 8 \ 133 | --write-index \ 134 | -L - \ 135 | -D "${barcode_tag}":"${barcodes_tsv_filename}" \ 136 | -o "${output_bam_filename}" \ 137 | "${input_bam_filename}"; 138 | 139 | # Check if any of the previous commands failed. 140 | check_exit_codes; 141 | 142 | exit_code=$?; 143 | fi 144 | 145 | 146 | return ${exit_code}; 147 | } 148 | 149 | 150 | 151 | filter_bam_file_for_popscle_dsc_pileup "${@}"; 152 | -------------------------------------------------------------------------------- /filter_vcf_file_for_popscle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (C): 2020-2021 - Gert Hulselmans 4 | # 5 | # Purpose: Functions for filtering VCF files for usage with popscle by removing mutations which are not informative. 6 | # 7 | # 8 | # BCFtools filtering expressions manual: 9 | # https://www.htslib.org/doc/bcftools.html#expressions 10 | 11 | 12 | 13 | # Function to check if any of the programs in a pipe failed. 14 | check_exit_codes () { 15 | local GET_PIPESTATUS="${PIPESTATUS[@]}"; 16 | local exit_code; 17 | 18 | for exit_code in ${GET_PIPESTATUS} ; do 19 | if [ ${exit_code} -ne 0 ] ; then 20 | return ${exit_code}; 21 | fi 22 | done 23 | 24 | return 0; 25 | } 26 | 27 | 28 | 29 | # Check if necessary programs are installed. 30 | check_if_programs_exists () { 31 | local exit_code=0; 32 | 33 | # Check if awk is installed. 34 | if ! type awk > /dev/null 2>&1 ; then 35 | printf 'Error: "awk" could not be found in PATH.\n' > /dev/stderr; 36 | exit_code=2; 37 | fi 38 | 39 | # Check if bcftools is installed. 40 | if ! type bcftools > /dev/null 2>&1 ; then 41 | printf 'Error: "bcftools" could not be found in PATH.\n' > /dev/stderr; 42 | exit_code=2; 43 | fi 44 | 45 | return ${exit_code}; 46 | } 47 | 48 | 49 | 50 | get_number_of_samples_in_vcf () { 51 | # VCF input file to use or stdin when no VCF input file is given. 52 | local vcf_input_file="${1:-/dev/stdin}"; 53 | 54 | # Only look at the VCF header and count the number of samples in the "#CHROM" line. 55 | bcftools view -h "${vcf_input_file}" \ 56 | | awk \ 57 | -F '\t' \ 58 | ' 59 | { 60 | if ( $1 == "#CHROM" ) { 61 | if ( NF > 9 ) { 62 | nbr_samples = NF - 9; 63 | print nbr_samples; 64 | } else { 65 | print "0"; 66 | } 67 | } 68 | }' 69 | 70 | check_exit_codes; 71 | 72 | return $?; 73 | } 74 | 75 | 76 | 77 | get_samples_names_in_vcf () { 78 | # VCF input file to use or stdin when no VCF input file is given. 79 | local vcf_input_file="${1:-/dev/stdin}"; 80 | 81 | # Only look at the VCF header and print the sample names listed in the "#CHROM" line. 82 | bcftools view -h "${vcf_input_file}" \ 83 | | awk \ 84 | -F '\t' \ 85 | -v "vcf_input_file=${vcf_input_file}" \ 86 | ' 87 | { 88 | if ( $1 == "#CHROM" ) { 89 | if ( NF > 9 ) { 90 | # Print all sample names. 91 | for (sample_column_idx=10 ; sample_column_idx <= NF; sample_column_idx++) { 92 | print $sample_column_idx; 93 | } 94 | 95 | exit(0); 96 | } else { 97 | printf "Error: No sample names found in VCF file \"%s\".\n", vcf_input_file > "/dev/stderr"; 98 | 99 | exit(1); 100 | } 101 | } 102 | }' 103 | 104 | check_exit_codes; 105 | 106 | return $?; 107 | } 108 | 109 | 110 | 111 | subset_samples_from_vcf () { 112 | # Comma separated list of samples to extract from VCF file. 113 | local samples="${1}"; 114 | 115 | # VCF input file to use or stdin when no VCF input file is given. 116 | local vcf_input_file="${2:-/dev/stdin}"; 117 | 118 | if [ ${#@} -lt 1 ] ; then 119 | printf 'Usage: subset_samples_from_vcf comma_separated_samples_names [VCF_file]\n'; 120 | return 1; 121 | fi 122 | 123 | # Extract specific samples from VCF file. 124 | bcftools view --samples "${samples}" "${vcf_input_file}"; 125 | 126 | return $?; 127 | } 128 | 129 | 130 | 131 | only_keep_snps () { 132 | # VCF input file to use or stdin when no VCF input file is given. 133 | local vcf_input_file="${1:-/dev/stdin}"; 134 | 135 | # Filter out all non SNPs mutations. 136 | bcftools view --types 'snps' "${vcf_input_file}"; 137 | 138 | return $?; 139 | } 140 | 141 | 142 | 143 | filter_out_mutations_missing_genotype_for_one_or_more_samples () { 144 | # VCF input file to use or stdin when no VCF input file is given. 145 | local vcf_input_file="${1:-/dev/stdin}"; 146 | 147 | # Filter out mutations which have missing genotypes for one or more samples 148 | # as those mutations are not very informative. 149 | bcftools view --genotype '^miss' "${vcf_input_file}"; 150 | 151 | return $?; 152 | } 153 | 154 | 155 | 156 | filter_out_mutations_heterozygous_for_one_or_more_samples () { 157 | # VCF input file to use or stdin when no VCF input file is given. 158 | local vcf_input_file="${1:-/dev/stdin}"; 159 | 160 | # Filter out mutations which are heterozygous for one or more samples. 161 | bcftools view --genotype '^het' "${vcf_input_file}"; 162 | 163 | return $?; 164 | } 165 | 166 | 167 | 168 | filter_out_mutations_homozygous_reference_in_all_samples () { 169 | # VCF input file to use or stdin when no VCF input file is given. 170 | local vcf_input_file="${1:-/dev/stdin}"; 171 | 172 | # Filter out mutation which are homozygous reference in all samples. 173 | bcftools view --exclude 'AC=0' "${vcf_input_file}"; 174 | 175 | return $?; 176 | } 177 | 178 | 179 | 180 | filter_out_mutations_heterozygous_in_all_samples () { 181 | # VCF input file to use or stdin when no VCF input file is given. 182 | local vcf_input_file="${1:-/dev/stdin}"; 183 | 184 | # Filter out mutation which are heterozygous in all samples. 185 | bcftools view --exclude 'COUNT(GT="het")=N_SAMPLES' "${vcf_input_file}"; 186 | 187 | return $?; 188 | } 189 | 190 | 191 | 192 | filter_out_mutations_homozygous_in_all_samples () { 193 | # VCF input file to use or stdin when no VCF input file is given. 194 | local vcf_input_file="${1:-/dev/stdin}"; 195 | 196 | # Filter out mutations which are homozygous in all samples. 197 | bcftools view \ 198 | --exclude 'COUNT(GT="AA") = N_SAMPLES' \ 199 | "${vcf_input_file}"; 200 | 201 | return $?; 202 | } 203 | 204 | 205 | 206 | only_keep_mutations_homozygous_in_one_sample () { 207 | # VCF input file to use or stdin when no VCF input file is given. 208 | local vcf_input_file="${1:-/dev/stdin}"; 209 | 210 | # Only keep mutations (homozygous) which are found only in one sample, 211 | # but not at all (heterozygous/homozygous) in other samples. 212 | #bcftools view --include 'AC=2 && ( GT = "1|1" | GT = "1/1")' "${vcf_input_file}"; 213 | bcftools view \ 214 | --include 'COUNT(GT="AA") = 1 && COUNT(GT="RR") = (N_SAMPLES - 1)' \ 215 | "${vcf_input_file}"; 216 | 217 | return $?; 218 | } 219 | 220 | 221 | 222 | only_keep_mutations_heterozygous_or_homozygous_in_one_sample () { 223 | # VCF input file to use or stdin when no VCF input file is given. 224 | local vcf_input_file="${1:-/dev/stdin}"; 225 | 226 | # Only keep mutations (heterozygous/homozygous) which are found only in 227 | # one sample, but not at all (heterozygous/homozygous) in other samples. 228 | bcftools view \ 229 | --include '( COUNT(GT="AA") = 1 || COUNT(GT="AR") = 1 ) && COUNT(GT="RR") = (N_SAMPLES - 1)' \ 230 | --include '( AC=2 && ( GT = "1|1" | GT = "1/1") || AC=1 && ( GT = "0|1" | GT = "1|0" | GT = "0/1" | GT = "0/1") )' \ 231 | "${vcf_input_file}"; 232 | 233 | return $?; 234 | } 235 | 236 | 237 | 238 | calculate_AF_AC_AN_values_based_on_genotype_info () { 239 | # VCF input file to use or stdin when no VCF input file is given. 240 | local vcf_input_file="${1:-/dev/stdin}"; 241 | 242 | # (Re)calculate AF, AC, AN values bases on the genotype info provided for each sample. 243 | bcftools plugin fill-tags "${vcf_input_file}" -- --tags 'AF,AC,AN'; 244 | 245 | return $?; 246 | } 247 | 248 | -------------------------------------------------------------------------------- /sort_vcf_same_as_bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (C): 2020-2021 - Gert Hulselmans 4 | # 5 | # Purpose: Sort VCF file in the same order as the BAM file, so it can be used with popscle. 6 | 7 | 8 | 9 | # Function to check if any of the programs in a pipe failed. 10 | check_exit_codes () { 11 | local GET_PIPESTATUS="${PIPESTATUS[@]}"; 12 | local exit_code; 13 | 14 | for exit_code in ${GET_PIPESTATUS} ; do 15 | if [ ${exit_code} -ne 0 ] ; then 16 | return ${exit_code}; 17 | fi 18 | done 19 | 20 | return 0; 21 | } 22 | 23 | 24 | 25 | # Check if necessary programs are installed. 26 | check_if_programs_exists () { 27 | local exit_code=0; 28 | 29 | # Check if awk is installed. 30 | if ! type awk > /dev/null 2>&1 ; then 31 | printf 'Error: "awk" could not be found in PATH.\n' > /dev/stderr; 32 | exit_code=2; 33 | fi 34 | 35 | # Check if bcftools is installed. 36 | if ! type bcftools > /dev/null 2>&1 ; then 37 | printf 'Error: "bcftools" could not be found in PATH.\n' > /dev/stderr; 38 | exit_code=2; 39 | fi 40 | 41 | # Check if samtools is installed. 42 | if ! type samtools > /dev/null 2>&1 ; then 43 | printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr; 44 | exit_code=2; 45 | fi 46 | 47 | return ${exit_code}; 48 | } 49 | 50 | 51 | 52 | # Get order of the contigs (chromosomes) and their length from the BAM header. 53 | get_contig_order_from_bam () { 54 | local bam_input_file="${1}"; 55 | local output_type="${2}"; 56 | 57 | if [ ${#@} -ne 2 ] ; then 58 | printf 'Usage: get_contig_order_from_bam BAM_file output_type\n\n'; 59 | printf 'Arguments:\n'; 60 | printf ' - BAM_file: BAM file from which to get the contig order and contig lengths.\n'; 61 | printf ' - output_type:\n'; 62 | printf ' - "names": Return contig names.\n'; 63 | printf ' - "chrom_sizes": Return contig names and contig lengths.\n'; 64 | printf ' - "vcf": Return VCF header section for contigs.\n\n'; 65 | return 1; 66 | fi 67 | 68 | case "${output_type}" in 69 | 'names') 70 | ;; 71 | 'chrom_sizes') 72 | ;; 73 | 'vcf') 74 | ;; 75 | *) 76 | printf 'Error: output_type "%s" is not supported.\n' "${output_type}" > /dev/stderr; 77 | return 1; 78 | ;; 79 | esac 80 | 81 | check_if_programs_exists || return $?; 82 | 83 | # Get the order of the contigs from the BAM header. 84 | samtools view -H "${bam_input_file}" \ 85 | | awk \ 86 | -F '\t' \ 87 | -v output_type="${output_type}" \ 88 | ' 89 | { 90 | # Only look at sequence header fields. 91 | if ($1 == "@SQ") { 92 | contig_idx += 1; 93 | contig_name = ""; 94 | contig_length = ""; 95 | 96 | # Extract contig (chromosome) name and contig (chromosome) length. 97 | for (i = 2; i <= NF; i++) { 98 | if ($i ~ /^SN:/) { 99 | contig_name = substr($i, 4); 100 | } 101 | 102 | if ($i ~ /^LN:/) { 103 | contig_length = substr($i, 4); 104 | } 105 | 106 | # Create contig order to name and contig order to length and vcf contig appings. 107 | contig_idx_to_name[contig_idx] = contig_name; 108 | contig_idx_to_length[contig_idx] = contig_length; 109 | contig_idx_to_vcf_contig[contig_idx] = sprintf("##contig=", contig_name, contig_length); 110 | } 111 | } 112 | } END { 113 | if (contig_idx == 0) { 114 | printf "Error: No \"@SQ\" header line found in BAM file.\n" > "/dev/stderr"; 115 | exit(1); 116 | } else if (output_type == "names") { 117 | contig_names = ""; 118 | 119 | for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) { 120 | contig_names = contig_names " " contig_idx_to_name[contig_idx]; 121 | } 122 | 123 | # Print all contig names (without leading space). 124 | print substr(contig_names, 2); 125 | } else if (output_type == "chrom_sizes") { 126 | # Print all contig names with their length in a TAB separated fashion. 127 | for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) { 128 | print contig_idx_to_name[contig_idx] "\t" contig_idx_to_length[contig_idx]; 129 | } 130 | } else if (output_type == "vcf") { 131 | # Print VCF header section for contigs. 132 | for (contig_idx = 1; contig_idx <= length(contig_idx_to_vcf_contig); contig_idx++) { 133 | print contig_idx_to_vcf_contig[contig_idx]; 134 | } 135 | } 136 | }' 137 | 138 | check_exit_codes; 139 | 140 | return $?; 141 | } 142 | 143 | 144 | 145 | # Sort VCF file in the same order as the BAM file, so it can be used with popscle. 146 | sort_vcf_same_as_bam () { 147 | local bam_input_file="${1}"; 148 | local vcf_input_file="${2}"; 149 | local vcf_type="${3:-v}"; 150 | 151 | if [ ${#@} -lt 2 ] ; then 152 | printf 'Usage: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type]\n\n'; 153 | printf 'Arguments:\n'; 154 | printf ' - BAM_file: BAM file from which to get the contig order to sort the VCF file.\n'; 155 | printf ' - VCF_file: VCF file to sort by contig order as defined in the BAM file.\n'; 156 | printf ' - VCF_type: VCF ouput file type (default: same as input VCF file type):\n'; 157 | printf ' v: uncompressed VCF, z: compressed VCF,\n'; 158 | printf ' u: uncompressed BCF, b: compressed BCF\n\n'; 159 | printf 'Purpose:\n'; 160 | printf ' Sort VCF file in the same order as the BAM file, so it can be used with popscle.\n\n'; 161 | return 1; 162 | fi 163 | 164 | check_if_programs_exists || return $?; 165 | 166 | # If VCF type is not specified, try to guess it from the filename extension. 167 | if [ ${#@} -eq 2 ] ; then 168 | if [ "${vcf_input_file%.vcf.gz}" != "${vcf_input_file}" ] ; then 169 | vcf_type='z'; 170 | elif [ "${vcf_input_file%.bcf}" != "${vcf_input_file}" ] ; then 171 | vcf_type='b'; 172 | fi 173 | fi 174 | 175 | # Sort VCF file by same chromosome order as BAM file. 176 | cat <( 177 | # Create new VCF header: 178 | # - Get VCF header of VCF input file. 179 | # - Remove all contig header lines and "#CHROM" line from the VCF header. 180 | # - Append contig headers in the order they appear in the input BAM file. 181 | # - Add "#CHROM" line as last line of the new VCF header. 182 | bcftools view -h "${vcf_input_file}" \ 183 | | awk \ 184 | ' 185 | { 186 | if ($1 !~ /^##contig=/ && $1 !~ /^#CHROM/) { 187 | # Remove all contig header lines and "#CHROM" line. 188 | print $0; 189 | } 190 | }' \ 191 | | cat \ 192 | - \ 193 | <(get_contig_order_from_bam "${bam_input_file}" 'vcf') \ 194 | <(bcftools view -h "${vcf_input_file}" | tail -n 1) \ 195 | ) \ 196 | <(bcftools view -H -O v "${vcf_input_file}") \ 197 | | bcftools sort -O "${vcf_type}"; 198 | 199 | check_exit_codes; 200 | 201 | return $?; 202 | } 203 | 204 | 205 | 206 | sort_vcf_same_as_bam "${@}"; 207 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Helper tools for popscle 2 | 3 | Collection of tools to make [popscle](https://github.com/statgen/popscle) easier to use. 4 | 5 | 6 | 7 | ## Filter BAM file for usage with popscle dsc-pileup 8 | 9 | Filter BAM file for usage with popscle dsc-pileup by keeping reads: 10 | - which overlap with SNPs in the VCF file 11 | - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list 12 | Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly 13 | (depending on the reduction of the number of reads in the filtered BAM file vs original). 14 | 15 | ``` 16 | $ ./filter_bam_file_for_popscle_dsc_pileup.sh 17 | Usage: filter_bam_file_for_popscle_dsc_pileup input_bam_filename barcodes_tsv_filename vcf_filename output_bam_filename [barcode_tag] 18 | 19 | Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads: 20 | - which overlap with SNPs in the VCF file 21 | - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list 22 | Keeping only relevant reads for popscle dsc-pileup can speedup it up quite significantly 23 | (depending on the reduction of the number of reads in the filtered BAM file vs original). 24 | 25 | ``` 26 | 27 | ### Example 28 | 29 | ```bash 30 | # Create filtered BAM with only the reads dsc-pileup needs. 31 | ./filter_bam_file_for_popscle_dsc_pileup.sh \ 32 | ./samples_to_demultiplex/outs/possorted_genome_bam.bam \ 33 | ./samples_to_demultiplex/outs/filtered_feature_bc_matrix/barcodes.tsv \ 34 | samples.vcf \ 35 | /tmp/samples_to_demultiplex.filter_bam_file_for_popscle_dsc_pileup.bam 36 | 37 | # Use filtered BAM file for dsc-pileup. 38 | popscle dsc-pileup \ 39 | --sam /tmp/samples_to_demultiplex.filter_bam_file_for_popscle_dsc_pileup.bam \ 40 | --vcf samples.vcf \ 41 | --group-list ./samples_to_demultiplex/outs/filtered_feature_bc_matrix/barcodes.tsv \ 42 | --out samples_to_demultiplex.pileup 43 | ``` 44 | 45 | 46 | 47 | ## Sort VCF file in the same order as the BAM file 48 | 49 | Sort VCF file in the same order as the BAM file so the following `popscle dsc-pileup` 50 | error can be solved easily: 51 | 52 | ``` 53 | [E:%s] Your VCF/BCF files and SAM/BAM/CRAM files have different ordering of chromosomes. SAM/BAM/CRAM file has %s before %s, but VCF/BCF file has %s after %s" 54 | ``` 55 | 56 | ``` 57 | $ ./sort_vcf_same_as_bam.sh 58 | Usage: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type] 59 | 60 | Arguments: 61 | - BAM_file: BAM file from which to get the contig order to sort the VCF file. 62 | - VCF_file: VCF file to sort by contig order as defined in the BAM file. 63 | - VCF_type: VCF ouput file type (default: same as input VCF file type): 64 | v: uncompressed VCF, z: compressed VCF, 65 | u: uncompressed BCF, b: compressed BCF 66 | 67 | Purpose: 68 | Sort VCF file in the same order as the BAM file, so it can be used with popscle. 69 | ``` 70 | 71 | ### Examples 72 | 73 | ```bash 74 | # Sort VCF file in the same order as the BAM file, so it can be used with popscle. 75 | ./sort_vcf_same_as_bam.sh \ 76 | ./samples_to_demultiplex/outs/possorted_genome_bam.bam \ 77 | samples.vcf \ 78 | > /tmp/samples.sorted_as_in_bam.vcf 79 | 80 | # Sort gzipped VCF file in the same order as the BAM file and write compressed VCF file. 81 | ./sort_vcf_same_as_bam.sh \ 82 | ./samples_to_demultiplex/outs/possorted_genome_bam.bam \ 83 | samples.vcf.gz \ 84 | > /tmp/samples.sorted_as_in_bam.vcf.gz 85 | 86 | # Sort gzipped VCF file in the same order as the BAM file and write uncompressed VCF file. 87 | ./sort_vcf_same_as_bam.sh \ 88 | ./samples_to_demultiplex/outs/possorted_genome_bam.bam \ 89 | samples.vcfi.gz \ 90 | v \ 91 | > /tmp/samples.sorted_as_in_bam.vcf 92 | ``` 93 | 94 | 95 | 96 | ## Create filtered VCF files. 97 | 98 | [BCFtools](https://www.htslib.org) can be used for filtering VCF files. 99 | 100 | Looking at the [BCFtools filtering expressions manual](https://www.htslib.org/doc/bcftools.html#expressions) 101 | gives an idea how to create your own filters for mutations. 102 | 103 | 104 | 105 | ### Import functions 106 | 107 | Import functions in current shell. 108 | 109 | ```bash 110 | # Import functions. 111 | source filter_vcf_file_for_popscle.sh 112 | 113 | # Check if all needed programs are installed. 114 | check_if_programs_exists 115 | ``` 116 | 117 | 118 | 119 | ### Get number of samples in BCF/VCF file. 120 | 121 | ```bash 122 | get_number_of_samples_in_vcf [VCF_file] 123 | ``` 124 | 125 | Example: 126 | 127 | ``` 128 | $ get_number_of_samples_in_vcf DGRP2.source_NCSU.dm6.final.bcf 129 | 205 130 | ``` 131 | 132 | 133 | 134 | ### Get samples names in BCF/VCF file. 135 | 136 | Get all the sample names available in the BCF/VCF file (after the `FORMAT` column). 137 | 138 | ``` 139 | get_samples_names_in_vcf [VCF_file] 140 | ``` 141 | 142 | Example: 143 | 144 | ``` 145 | $ get_samples_names_in_vcf DGRP2.source_NCSU.dm6.final.bcf | head 146 | DGRP-021 147 | DGRP-026 148 | DGRP-028 149 | DGRP-031 150 | DGRP-032 151 | DGRP-038 152 | DGRP-040 153 | DGRP-041 154 | DGRP-042 155 | DGRP-045 156 | ``` 157 | 158 | 159 | 160 | ### Subset samples from BCF/VCF file. 161 | 162 | Extract only certain samples from a VCF file with multiple samples. 163 | 164 | ```bash 165 | subset_samples_from_vcf comma_separated_samples_names [VCF_file] 166 | ``` 167 | 168 | Example: 169 | 170 | ``` 171 | subset_samples_from_vcf DGRP-032,DGRP-026,DGRP-042 DGRP2.source_NCSU.dm6.final.bcf | get_samples_names_in_vcf 172 | DGRP-032 173 | DGRP-026 174 | DGRP-042 175 | ``` 176 | 177 | 178 | 179 | ### Only keep SNPs from BCF/VCF file. 180 | 181 | Only keep SNPs from VCF file (filter out INDELs and other mutations). 182 | 183 | ```bash 184 | only_keep_snps [VCF_file] 185 | ``` 186 | 187 | 188 | 189 | ### Filter out mutations missing genotype info for one or more samples. 190 | 191 | Filter out mutations missing genotype (`./.`) info for one or more samples. 192 | 193 | For those mutations no info is available if the sample has the reference and/or mutations, 194 | so it might be better to skip this mutation in `popscle dsc-pileup`. 195 | 196 | ```bash 197 | filter_out_mutations_missing_genotype_for_one_or_more_samples [VCF_file] 198 | ``` 199 | 200 | 201 | 202 | ### Filter out mutations heterozygous for one or more samples. 203 | 204 | Filter out mutations that are heterozygous for one or more samples. 205 | 206 | This can be useful to reduce the number of mutations for `popscle dsc-pileup` when working with inbred lines 207 | (all mutations are supposed to be homozygous). 208 | In combination with `filter_out_mutations_not_unique_for_one_sample`, the number of mutations can be reduced 209 | even further. 210 | 211 | ```bash 212 | filter_out_mutations_heterozygous_for_one_or_more_samples [VCF_file] 213 | ``` 214 | 215 | 216 | 217 | ### Filter out mutations homozygous reference in all samples. 218 | 219 | Filter out mutations that have homozygous reference calls in all samples. 220 | 221 | If the mutation position contains the reference for both alleles in all samples, 222 | the mutation is not informative and can be skipped for `popscle dsc-pileup`. 223 | 224 | ```bash 225 | filter_out_mutations_homozygous_reference_in_all_samples [VCF_file] 226 | ``` 227 | 228 | 229 | 230 | ### Filter out mutations heterozygous in all samples. 231 | 232 | Filter out mutations that are heterozygous in all samples. 233 | 234 | If all samples are inbred lines, you might want to remove all non-homozygous SNPs. 235 | 236 | ```bash 237 | filter_out_mutations_heterozygous_in_all_samples [VCF_file] 238 | ``` 239 | 240 | 241 | 242 | ### Filter out mutations homozygous in all samples. 243 | 244 | Filter out mutations that are homozygous in all samples. 245 | 246 | If the mutation position contains the mutation for both alleles in all samples, 247 | the mutation is not informative and can be skipped for `popscle dsc-pileup`. 248 | 249 | ```bash 250 | filter_out_mutations_homozygous_in_all_samples [VCF_file] 251 | ``` 252 | 253 | 254 | 255 | ### Only keep mutations heterozygous or homozygous in one sample. 256 | 257 | Only keep mutations (heterozygous/homozygous) which are found only in 258 | one sample, but not at all (heterozygous/homozygous) in other samples. 259 | 260 | ```bash 261 | only_keep_mutations_heterozygous_or_homozygous_in_one_sample [VCF_file] 262 | ``` 263 | 264 | 265 | 266 | ### Calculate allele frequency, allele count and total number of alleles. 267 | 268 | Calculate allele frequency (`AF`), allele count (`AC`) and total number of alleles (`AN`) from genotype info of each sample. 269 | 270 | This will add `AF`, `AC` and `AN` info fields or update those fields based on the genotype info of each sample in case they 271 | were set incorrectly. 272 | 273 | It is recommended to run this function before running: 274 | - `filter_out_mutations_homozygous_reference_in_all_samples`: needs correct value for `AC`. 275 | - `filter_out_mutations_homozygous_in_all_samples`: needs correct value for `AC` and `AN`. 276 | - `only_keep_mutations_homozygous_in_one_sample`: needs correct value for `AC`. 277 | - `only_keep_mutations_heterozygous_in_one_sample`: needs correct value for `AC`. 278 | - `only_keep_mutations_heterozygous_or_homozygous_in_one_sample`: needs correct value for `AC`. 279 | 280 | Running it after `subset_samples_from_vcf` is also recommended as that function only updates 'AF' but not `AC` and `AN`. 281 | 282 | ```bash 283 | calculate_AF_AC_AN_values_based_on_genotype_info [VCF_file] 284 | ``` 285 | 286 | 287 | 288 | ### Examples 289 | 290 | Create (minimal) VCF file for `popscle dsc-pileup` for 3 inbread lines (homozygous genotype SNPs are very common): 291 | - Only keep mutations for 3 selected samples 292 | - Only keep SNPs 293 | - (Re)calculate allele frequency (`AF`), allele count (`AC`), total number of alleles (`AN`). 294 | - Remove all SNPs which are missing genotype information for at least one sample (not useful to call those positions in `popscle dsc-pileup`). 295 | - Remove all SNPs which are homozygous reference in all samples (not useful to call those positions in `popscle dsc-pileup`). 296 | - Remove all SNPs which are homozygous in all samples (not useful to call those positions in `popscle dsc-pileup`). 297 | - Remove all SNPs which are heterozygous in at least one sample (those mutations shouldn't exist in inbred lines). 298 | - Only keep SNPs heterozygous or homozygous in one sample (but as heterozygous mutations are already filtered out, only keep homozygous ones). 299 | 300 | ```bash 301 | subset_samples_from_vcf DGRP-032,DGRP-026,DGRP-042 DGRP2.source_BCM-HGSC.dm6.final.bcf \ 302 | | only_keep_snps \ 303 | | calculate_AF_AC_AN_values_based_on_genotype_info \ 304 | | filter_out_mutations_missing_genotype_for_one_or_more_samples \ 305 | | filter_out_mutations_homozygous_reference_in_all_samples \ 306 | | filter_out_mutations_homozygous_in_all_samples \ 307 | | filter_out_mutations_heterozygous_for_one_or_more_samples \ 308 | | only_keep_mutations_heterozygous_or_homozygous_in_one_sample \ 309 | > output.vcf 310 | ``` 311 | 312 | Create (minimal) VCF file for `popscle dsc-pileup` (heterozygous genotype SNPs are very common): 313 | - Only keep SNPs. 314 | - (Re)calculate allele frequency (`AF`), allele count (`AC`), total number of alleles (`AN`). 315 | - Remove all SNPs which are missing genotype information for at least one sample (not useful to call those positions in `popscle dsc-pileup`). 316 | - Remove all SNPs which are homozygous reference in all samples (not useful to call those positions in `popscle dsc-pileup`). 317 | - Remove all SNPs which are homozygous in all samples (not useful to call those positions in `popscle dsc-pileup`). 318 | 319 | ```bash 320 | only_keep_snps input.vcf \ 321 | | calculate_AF_AC_AN_values_based_on_genotype_info \ 322 | | filter_out_mutations_missing_genotype_for_one_or_more_samples \ 323 | | filter_out_mutations_homozygous_reference_in_all_samples \ 324 | | filter_out_mutations_homozygous_in_all_samples \ 325 | > output.vcf 326 | ``` 327 | 328 | -------------------------------------------------------------------------------- /popscle_dsc_pileup_merge_splitted.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from __future__ import annotations 4 | 5 | import argparse 6 | import glob 7 | import io 8 | import os.path 9 | import shutil 10 | import sys 11 | 12 | import polars as pl 13 | 14 | try: 15 | from isal import igzip as gzip_mod # type: ignore[import] 16 | except ImportError: 17 | import gzip as gzip_mod 18 | 19 | 20 | def write_popscle_pileup_cel_full_filename( 21 | popscle_dsc_output_prefix: str, 22 | popscle_dsc_output_full_prefix: str, 23 | ) -> pl.DataFrame | None: 24 | popscle_pileup_cel_dfs = [] 25 | 26 | popscle_pileup_cel_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.cel.gz" 27 | 28 | if os.path.exists(popscle_pileup_cel_full_filename): 29 | print( 30 | f'Error: popscle pileup CEL full file "{popscle_pileup_cel_full_filename}" already exists.', 31 | file=sys.stderr, 32 | ) 33 | return None 34 | 35 | for i, popscle_pileup_cel_part_filename in enumerate( 36 | sorted(glob.glob(f"{popscle_dsc_output_prefix}.*.pileup.cel.gz")) 37 | ): 38 | print( 39 | f'Reading partial popscle pileup CEL file "{popscle_pileup_cel_part_filename}" ...', 40 | file=sys.stderr, 41 | ) 42 | 43 | popscle_pileup_cel_dfs.append( 44 | # Read partial popscle pileup CEL file. 45 | pl.read_csv( 46 | popscle_pileup_cel_part_filename, 47 | separator="\t", 48 | has_header=True, 49 | dtypes={ 50 | "#DROPLET_ID": pl.Int64, 51 | "BARCODE": pl.Utf8, 52 | "NUM.READ": pl.Int64, 53 | "NUM.UMI": pl.Int64, 54 | "NUM.UMIwSNP": pl.Int64, 55 | "NUM.SNP": pl.Int64, 56 | }, 57 | ) 58 | .rename({"#DROPLET_ID": "DROPLET_ID_PARTITIONED"}) 59 | .with_columns( 60 | # Add current partition as a column. 61 | pl.lit(i).alias("PARTITION") 62 | ) 63 | ) 64 | 65 | # Combine partial popscle pileup CEL files and add real "DROPLET_ID" for the full 66 | # dataset. 67 | popscle_pileup_cel_df = pl.concat(popscle_pileup_cel_dfs).with_row_count( 68 | name="DROPLET_ID", 69 | offset=0, 70 | ) 71 | 72 | with gzip_mod.open(popscle_pileup_cel_full_filename, "w") as fh_full: 73 | print( 74 | f'Writing popscle pileup PLP full file "{popscle_pileup_cel_full_filename}" ...', 75 | file=sys.stderr, 76 | ) 77 | 78 | # Create BytesIO object to temporarily write the corrected popscle pileup CEL 79 | # file to. 80 | bytes_io_tsv = io.BytesIO() 81 | 82 | # Remove "DROPLET_ID_PARTITIONED" and "PARTITION" columns before writing 83 | # corrected popscle pileup CEL file. 84 | popscle_pileup_cel_df.select( 85 | [ 86 | pl.col("DROPLET_ID").alias("#DROPLET_ID"), 87 | pl.col("BARCODE"), 88 | pl.col("NUM.READ"), 89 | pl.col("NUM.UMI"), 90 | pl.col("NUM.UMIwSNP"), 91 | pl.col("NUM.SNP"), 92 | ], 93 | ).write_csv( 94 | bytes_io_tsv, 95 | has_header=True, 96 | separator="\t", 97 | ) 98 | 99 | # Write BytesIO object with corrected popscle pileup CEL output 100 | # to full popscle pileup CEL file. 101 | fh_full.write(bytes_io_tsv.getbuffer()) 102 | 103 | # Return corrected popscle pileup CEL output with "DROPLET_ID_PARTITIONED" and 104 | # "PARTITION" columns as Polars DataFrame. 105 | return popscle_pileup_cel_df 106 | 107 | 108 | def write_popscle_pileup_plp_full_filename( 109 | popscle_pileup_cel_df: pl.DataFrame, 110 | popscle_dsc_output_prefix: str, 111 | popscle_dsc_output_full_prefix: str, 112 | ) -> bool: 113 | popscle_pileup_plp_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.plp.gz" 114 | 115 | if os.path.exists(popscle_pileup_plp_full_filename): 116 | print( 117 | f'Error: popscle pileup PLP full file "{popscle_pileup_plp_full_filename}" already exists.', 118 | file=sys.stderr, 119 | ) 120 | return False 121 | 122 | with gzip_mod.open(popscle_pileup_plp_full_filename, "w") as fh_full: 123 | print( 124 | f'Writing popscle pileup PLP full file "{popscle_pileup_plp_full_filename}" ...', 125 | file=sys.stderr, 126 | ) 127 | 128 | for i, popscle_pileup_plp_part_filename in enumerate( 129 | sorted(glob.glob(f"{popscle_dsc_output_prefix}.*.pileup.plp.gz")) 130 | ): 131 | print( 132 | f'Reading partial popscle pileup PLP file "{popscle_pileup_plp_part_filename}" ...', 133 | file=sys.stderr, 134 | ) 135 | 136 | # Create BytesIO object to temporarily write the corrected popscle pileup 137 | # PLP file to. 138 | bytes_io_tsv = io.BytesIO() 139 | 140 | ( 141 | # Read partial popscle pileup PLP file. 142 | pl.read_csv( 143 | popscle_pileup_plp_part_filename, 144 | separator="\t", 145 | has_header=True, 146 | dtypes={ 147 | "#DROPLET_ID": pl.Int64, 148 | "SNP_ID": pl.Int64, 149 | "ALLELES": pl.Utf8, 150 | "BASEQS": pl.Utf8, 151 | }, 152 | ) 153 | .lazy() 154 | .rename({"#DROPLET_ID": "DROPLET_ID_PARTITIONED"}) 155 | .with_columns( 156 | # Add current partition as a column. 157 | pl.lit(i).alias("PARTITION") 158 | ) 159 | # Correct "DROPLET_ID" column from partial popscle pileup 160 | # PLP file with real "DROPLET_ID" for the full dataset. 161 | .join( 162 | popscle_pileup_cel_df.lazy(), 163 | on=["PARTITION", "DROPLET_ID_PARTITIONED"], 164 | how="inner", 165 | ) 166 | .select( 167 | pl.col("DROPLET_ID").alias("#DROPLET_ID"), 168 | pl.col("SNP_ID"), 169 | pl.col("ALLELES"), 170 | pl.col("BASEQS"), 171 | ) 172 | .collect(streaming=True) 173 | .write_csv( 174 | bytes_io_tsv, 175 | # Write header only for first partial popscle pileup PLP file. 176 | has_header=i == 0, 177 | separator="\t", 178 | ) 179 | ) 180 | 181 | # Write/append BytesIO object with corrected popscle pileup PLP output 182 | # to full popscle pileup PLP file. 183 | fh_full.write(bytes_io_tsv.getbuffer()) 184 | 185 | return True 186 | 187 | 188 | def write_popscle_pileup_umi_full_filename( 189 | popscle_dsc_output_prefix: str, 190 | popscle_dsc_output_full_prefix: str, 191 | ) -> bool: 192 | line_count = 0 193 | popscle_pileup_umi_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.umi.gz" 194 | 195 | if os.path.exists(popscle_pileup_umi_full_filename): 196 | print( 197 | f'Error: popscle pileup UMI full file "{popscle_pileup_umi_full_filename}" already exists.', 198 | file=sys.stderr, 199 | ) 200 | return False 201 | 202 | with gzip_mod.open(popscle_pileup_umi_full_filename, "wt") as fh_full: 203 | print( 204 | f'Writing popscle pileup UMI full file "{popscle_pileup_umi_full_filename}" ...', 205 | file=sys.stderr, 206 | ) 207 | 208 | for popscle_pileup_umi_part_filename in sorted( 209 | glob.glob(f"{popscle_dsc_output_prefix}.*.pileup.umi.gz") 210 | ): 211 | print( 212 | f'Reading partial popscle pileup UMI file "{popscle_pileup_umi_part_filename}" ...', 213 | file=sys.stderr, 214 | ) 215 | with gzip_mod.open(popscle_pileup_umi_part_filename, "rt") as fh: 216 | for line in fh: 217 | print( 218 | str(line_count), 219 | line.split("\t", 1)[1], 220 | sep="\t", 221 | end="", 222 | file=fh_full, 223 | ) 224 | line_count += 1 225 | 226 | return True 227 | 228 | 229 | def write_popscle_pileup_var_full_filename( 230 | popscle_dsc_output_prefix: str, 231 | popscle_dsc_output_full_prefix: str, 232 | ) -> bool: 233 | popscle_pileup_var_full_filename = f"{popscle_dsc_output_full_prefix}.pileup.var.gz" 234 | popscle_pileup_var_most_complete_df = None 235 | popscle_pileup_var_most_complete_filename = None 236 | 237 | if os.path.exists(popscle_pileup_var_full_filename): 238 | print( 239 | f'Error: popscle pileup VAR full file "{popscle_pileup_var_full_filename}" already exists.', 240 | file=sys.stderr, 241 | ) 242 | return False 243 | 244 | # Read each partial popscle dsc pileup var file and take the one that contains the 245 | # most mutations as it seems that popscle will always include all mutations with 246 | # the same SNP_ID. 247 | for popscle_pileup_var_part_filename in sorted( 248 | glob.glob(f"{popscle_dsc_output_prefix}.*.pileup.var.gz") 249 | ): 250 | print( 251 | f'Reading partial popscle pileup VAR file "{popscle_pileup_var_part_filename}" ...', 252 | file=sys.stderr, 253 | ) 254 | 255 | popscle_pileup_var_df = pl.read_csv( 256 | popscle_pileup_var_part_filename, 257 | separator="\t", 258 | has_header=True, 259 | dtypes={ 260 | "#SNP_ID": pl.Int64, 261 | "CHROM": pl.Utf8, 262 | "POS": pl.Int64, 263 | "REF": pl.Utf8, 264 | "ALT": pl.Utf8, 265 | "AF": pl.Utf8, 266 | }, 267 | ).rename({"#SNP_ID": "SNP_ID"}) 268 | 269 | if popscle_pileup_var_most_complete_df is None: 270 | popscle_pileup_var_most_complete_df = popscle_pileup_var_df 271 | popscle_pileup_var_most_complete_filename = popscle_pileup_var_part_filename 272 | elif popscle_pileup_var_df.height > popscle_pileup_var_most_complete_df.height: 273 | # Check if the last element of the smallest file is found exactly in the 274 | # bigger. If this assert fails, our assumption is wrong. 275 | assert ( 276 | popscle_pileup_var_most_complete_df[ 277 | popscle_pileup_var_most_complete_df.height - 1 278 | ].to_struct("LAST_SNP") 279 | == popscle_pileup_var_df[ 280 | popscle_pileup_var_most_complete_df.height - 1 281 | ].to_struct("LAST_SNP") 282 | ).item(), ( 283 | "SNP_IDs do not match between different popscle dsc pileup var files." 284 | ) 285 | 286 | popscle_pileup_var_most_complete_df = popscle_pileup_var_df 287 | popscle_pileup_var_most_complete_filename = popscle_pileup_var_part_filename 288 | 289 | if popscle_pileup_var_most_complete_filename: 290 | print( 291 | f'Writing popscle pileup VAR full file "{popscle_pileup_var_full_filename}" ...', 292 | file=sys.stderr, 293 | ) 294 | shutil.copy2( 295 | popscle_pileup_var_most_complete_filename, popscle_pileup_var_full_filename 296 | ) 297 | 298 | return True 299 | 300 | return False 301 | 302 | 303 | def main() -> None: 304 | parser = argparse.ArgumentParser( 305 | description="Merge popscle dsc pileup outputs from multiple popscle dsc " 306 | "pileup runs with the same VCF file as input, but with a different list " 307 | "of cell barcodes." 308 | ) 309 | 310 | parser.add_argument( 311 | "-i", 312 | "--input", 313 | dest="popscle_dsc_output_prefix", 314 | action="store", 315 | type=str, 316 | required=True, 317 | help="popscle pileup dsc output prefix for partial popscle dsc pileup output.", 318 | ) 319 | 320 | parser.add_argument( 321 | "-o", 322 | "--output", 323 | dest="popscle_dsc_output_full_prefix", 324 | action="store", 325 | type=str, 326 | required=True, 327 | help="popscle pileup dsc output prefix for full popscle dsc pileup output.", 328 | ) 329 | 330 | args = parser.parse_args() 331 | 332 | popscle_pileup_cel_full_filename = ( 333 | f"{args.popscle_dsc_output_full_prefix}.pileup.cel.gz" 334 | ) 335 | popscle_pileup_plp_full_filename = ( 336 | f"{args.popscle_dsc_output_full_prefix}.pileup.plp.gz" 337 | ) 338 | popscle_pileup_umi_full_filename = ( 339 | f"{args.popscle_dsc_output_full_prefix}.pileup.umi.gz" 340 | ) 341 | popscle_pileup_var_full_filename = ( 342 | f"{args.popscle_dsc_output_full_prefix}.pileup.var.gz" 343 | ) 344 | 345 | output_exists = False 346 | 347 | if os.path.exists(popscle_pileup_cel_full_filename): 348 | print( 349 | f'Error: popscle pileup CEL full file "{popscle_pileup_cel_full_filename}" already exists.', 350 | file=sys.stderr, 351 | ) 352 | output_exists = True 353 | 354 | if os.path.exists(popscle_pileup_plp_full_filename): 355 | print( 356 | f'Error: popscle pileup PLP full file "{popscle_pileup_plp_full_filename}" already exists.', 357 | file=sys.stderr, 358 | ) 359 | output_exists = True 360 | 361 | if os.path.exists(popscle_pileup_umi_full_filename): 362 | print( 363 | f'Error: popscle pileup UMI full file "{popscle_pileup_umi_full_filename}" already exists.', 364 | file=sys.stderr, 365 | ) 366 | output_exists = True 367 | 368 | if os.path.exists(popscle_pileup_var_full_filename): 369 | print( 370 | f'Error: popscle pileup VAR full file "{popscle_pileup_var_full_filename}" already exists.', 371 | file=sys.stderr, 372 | ) 373 | output_exists = True 374 | 375 | if output_exists: 376 | sys.exit(1) 377 | 378 | popscle_pileup_cel_df = write_popscle_pileup_cel_full_filename( 379 | popscle_dsc_output_prefix=args.popscle_dsc_output_prefix, 380 | popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix, 381 | ) 382 | 383 | if popscle_pileup_cel_df is None: 384 | sys.exit(1) 385 | 386 | write_popscle_pileup_plp_full_filename( 387 | popscle_pileup_cel_df=popscle_pileup_cel_df, 388 | popscle_dsc_output_prefix=args.popscle_dsc_output_prefix, 389 | popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix, 390 | ) 391 | 392 | write_popscle_pileup_umi_full_filename( 393 | popscle_dsc_output_prefix=args.popscle_dsc_output_prefix, 394 | popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix, 395 | ) 396 | write_popscle_pileup_var_full_filename( 397 | popscle_dsc_output_prefix=args.popscle_dsc_output_prefix, 398 | popscle_dsc_output_full_prefix=args.popscle_dsc_output_full_prefix, 399 | ) 400 | 401 | 402 | if __name__ == "__main__": 403 | main() 404 | --------------------------------------------------------------------------------