├── CohortDataQC_final.sh
├── Merge_Phase_RFmix.sh
├── PGC_LAI_pipeline1.png
├── README.md
├── find_cg_at_snps.py
├── match_against_1000g_v2.py
└── update_rsID_bim_arg.py

/CohortDataQC_final.sh:
--------------------------------------------------------------------------------
#!/bin/sh

## A script for processing cohort data files for use in LAI with 1000 Genomes reference individuals
## Written by Elizabeth Atkinson. 1/23/18
## Sub-scripts for processing steps courtesy of Meng Lin and Chris Gignoux

## Required parameters:
## 1. Binary plink files for the cohort data in question. Supply the bed stem (not including the file extension).
## 2. plink installed and on the path
## 3. R installed and on the path
## 4. python installed and on the path
## Result: a new set of binary plink files in which all allele rsIDs are renamed to dbSNP 144 and sites are oriented to 1000 Genomes, with non-matching sites, indels, duplicates, sex chromosomes, and triallelic sites removed. Output plink files will be the input DATA name suffixed with .QCed.

## Usage is: sh CohortDataQC_final.sh <data-stem> <dbSNP-bed-file> <1000G-legend-file>


## Unpack the parameters into labelled variables
DATA=$1
DBSNP=$2
LEG=$3

## Keep only the autosomes in the data file
plink --bfile $DATA --chr 1-22 --make-bed --out $DATA.auto

## Find and get rid of duplicate loci in the bim file,
## then keep the good SNPs in the plink file
cut -f2,4 $DATA.auto.bim | uniq -f1 > $DATA.NonDupSNPs
cut -f2,4 $DATA.auto.bim | uniq -D -f1 > $DATA.DuplicateSNPs
cat $DATA.DuplicateSNPs | uniq -f1 > $DATA.FirstDup
cat $DATA.NonDupSNPs $DATA.FirstDup > $DATA.SNPstoKeep

## Extract the retained SNPs with plink
plink --bfile $DATA.auto --extract $DATA.SNPstoKeep --make-bed --out $DATA.auto.nodup


## Update SNP IDs to dbSNP 144
python update_rsID_bim_arg.py --bim $DATA.auto.nodup.bim --bed $DBSNP --format T --codechr F --out $DATA.auto.nodup.dbsnp.bim

## Copy the other files over to this name
cp $DATA.auto.nodup.bed $DATA.auto.nodup.dbsnp.bed
cp $DATA.auto.nodup.fam $DATA.auto.nodup.dbsnp.fam

## Orient to 1000G
python match_against_1000g_v2.py --bim $DATA.auto.nodup.dbsnp.bim --legend $LEG --out $DATA.1kg
## This script has three outputs (suffixed to the --out stem so that full paths can be input):
## 1) [outfile].Indel.txt: a bim file of indels
## 2) [outfile].NonMatching.txt: a bim file of loci not found in 1000 Genomes, or with different coding alleles than 1000 Genomes (tri-allelic, for example). These should be removed.
## 3) [outfile].FlipStrand.txt: a bim file of loci to flip.

## Combine the lists of indels and triallelic/non-matching sites into one list of bad SNPs to remove
cat $DATA.1kg.Indel.txt $DATA.1kg.NonMatching.txt > $DATA.1kg.badsites.txt

## Flip strands for flipped sites and remove non-matching loci using plink
plink --bfile $DATA.auto.nodup.dbsnp --exclude $DATA.1kg.badsites.txt --make-bed --out $DATA.auto.nodup.dbsnp.1ksites
plink --bfile $DATA.auto.nodup.dbsnp.1ksites --flip $DATA.1kg.FlipStrand.txt --make-bed --out $DATA.auto.nodup.dbsnp.1ksites.flip

## Export a warning flag if there are too many mismatched sites compared to 1000 Genomes
wc -l $DATA.1kg.NonMatching.txt > nonCount
wc -l $DATA.1kg.FlipStrand.txt > flipcount
wc -l $DATA.auto.nodup.dbsnp.bim > totalsites
paste nonCount flipcount totalsites > SiteCounts

awk '{if ($1/$5 > 0.01) print "WARNING: "$1/$5*100"% of sites are problematic when compared to 1000G. This could be indicative of a different reference build or other data file incompatibility." }' SiteCounts > Warnings.out

## Find and remove A/T, C/G loci
python find_cg_at_snps.py $DATA.auto.nodup.dbsnp.1ksites.flip.bim > $DATA.ATCGsites

plink --bfile $DATA.auto.nodup.dbsnp.1ksites.flip --exclude $DATA.ATCGsites --make-bed --out $DATA.QCed

## Cohort data is now formatted to merge properly with the 1000G reference panel
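
## A hypothetical example invocation (the file names below are placeholders, not files shipped with
## this repo). The three arguments map to DATA, DBSNP, and LEG as unpacked above:
##   sh CohortDataQC_final.sh mycohort dbsnp144_GRCh37.bed 1000GP_Phase3_GRCh37_combined.legend
## where mycohort.bed/.bim/.fam are the cohort binary plink files; output would be mycohort.QCed.*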
--------------------------------------------------------------------------------
/Merge_Phase_RFmix.sh:
--------------------------------------------------------------------------------
#!/bin/sh
### Elizabeth G. Atkinson
### 2/22/18
## Post-processing shapeit haps/sample files to be input into RFmix for cohort local ancestry inference

## Usage is: sh Merge_Phase_RFmix.sh <cohort-data-stem> <reference-panel-bed-stem> <genetic-map> <ancestry-reference-map>
## The script will only consider the autosomes unless modified
## The data stem is the core filename before "*chr{1-22}.haps/sample"
## shapeit2, RFMix v2, plink, and vcftools are expected to be in the path
## GEN is expected to be in the format of the HapMap recombination map
## The ancestry reference map assigns reference individuals to their ancestral populations of origin, as described in the RFMix manual

## Unpack the parameters into labelled variables
DATA=$1
REF=$2
GEN=$3
MAP=$4

## Cohort data is now formatted to merge properly with the 1000G reference panel from step 1 - suffixed .QCed
## Merge 1000G and cohort data
plink --bfile $REF --bmerge $DATA.QCed --make-bed --out $DATA.QCed.1kmerge

## Filter the merged dataset to only well-genotyped sites present on both the cohort and 1000G platforms - >=90% genotyping rate and MAF >= 0.5%
plink --bfile $DATA.QCed.1kmerge --allow-no-sex --make-bed --geno 0.1 --maf 0.005 --out $DATA.QCed.1kmerge.filt

## Separate out the chromosomes for phasing
for i in {1..22}; do plink --bfile $DATA.QCed.1kmerge.filt --allow-no-sex --chr ${i} --make-bed --out $DATA.QCed.1kmerge.filt.chr${i} ;done

## Then phase them with SHAPEIT2,
## assuming all chroms are present in the same genetic map file linked to in the initial command
### NOTE - this conducts joint phasing with the reference panel. In many cases you'll want to phase the cohort data using the reference panel as a separate flag.
### That can instead be implemented in SHAPEIT2 with a flag similar to:
###   --input-ref reference.haplotypes.gz reference.legend.gz reference.sample

for i in {1..22}; do \
shapeit --input-bed $DATA.QCed.1kmerge.filt.chr${i} -M $GEN -O $DATA.QCed.1kmerge.filt.phased.chr${i} --thread 8 ;done

## Also make a list of the cohort individuals
cut -d' ' -f2 $DATA.fam > $DATA.indivs.txt

## Convert the shapeit output into VCF format to put into RFmix v2
for i in {1..22}; do shapeit -convert --input-haps $DATA.QCed.1kmerge.filt.phased.chr${i} --output-vcf $DATA.chr${i}.vcf ;done

## Make a vcf file of just the cohort individuals
for i in {1..22}; do vcftools --vcf $DATA.chr${i}.vcf --keep $DATA.indivs.txt --recode --out $DATA.cohort.chr${i} ;done

## Make a vcf file of just the ref individuals, assuming they're everyone who wasn't in the cohort
for i in {1..22}; do vcftools --vcf $DATA.chr${i}.vcf --remove $DATA.indivs.txt --recode --out $DATA.ref.chr${i} ;done

## bgzip these
for i in {1..22}; do bgzip $DATA.ref.chr${i}.recode.vcf ;done
for i in {1..22}; do bgzip $DATA.cohort.chr${i}.recode.vcf ;done

## And run RFmix, split for each chromosome separately.
## The recombination map might need to be further processed to make RFMix happy depending on the format
for i in {1..22}; do \
rfmix -f $DATA.cohort.chr${i}.recode.vcf.gz -r $DATA.ref.chr${i}.recode.vcf.gz --chromosome=${i} -m $MAP -g $GEN -n 5 -e 1 --reanalyze-reference --num-threads 8 -o $DATA.rfmix.chr${i} ;done
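
## A hypothetical example invocation (file names are placeholders):
##   sh Merge_Phase_RFmix.sh mycohort 1000G_refpanel genetic_map_b37_combined.txt ancestry_ref_map.txt
## where mycohort.QCed.* are the step-1 outputs, 1000G_refpanel.bed/.bim/.fam is the reference panel,
## and the last two arguments are the HapMap-format recombination map and the RFMix sample map.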
--------------------------------------------------------------------------------
/PGC_LAI_pipeline1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atkinson-Lab/Post-QC/5933243cb5640f2485842a5dfe02972df8510043/PGC_LAI_pipeline1.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Post-QC
Post variant-calling QC pipeline for orienting cohort data to a reference file. An additional step for merging, phasing jointly with a reference panel, and running local ancestry inference is also included. Both steps are implemented as automated scripts that can be called with bash. Subscripts are in python.


Dependencies:
Programs need to be on the path.

Step 1 (cohort data post-QC and/or merging):
python, plus the modules pandas, numpy, and argparse
PLINK

Step 2 (phasing):
PLINK
SHAPEIT2
VCFtools
RFmix


Detailed description of STEP 1: Cohort data post-QC and harmonization with a reference panel

Post-variant-calling QC to clean and consistently prepare the data for downstream analysis. The steps conducted, in order, are as follows:
1. Extract only the autosomes from the data file
2. Find and remove duplicate loci
3. Update SNP IDs to dbSNP 144 (Sherry et al., 2001)
4. Orient the data to the 1000 Genomes reference. This involves 3 substeps:
   a. Find and remove indels
   b. Find and remove loci not found in 1000 Genomes, or that have different coding alleles than 1000 Genomes (tri-allelic, for example)
   c. Flip alleles that are on the wrong strand
5. Remove A/T, G/C loci

This script also outputs a warning if more than 1% of sites are incongruous between the input dataset and 1000G locations. A large discrepancy in SNP physical locations can occur if the datasets are on different reference builds. A/T and G/C loci cannot be strand-resolved and are therefore routinely removed.

Though this has been tested with 1000G, any reference panel in legend format should work.

Usage:

Launch step 1, data QC and harmonization:
```sh CohortDataQC_final.sh <data-stem> <dbSNP-bed-file> <1000G-legend-file>```

Output is suffixed with .QCed

STEP 2: Merging and phasing.

Post-QC'ed cohort data is then intersected and jointly phased with a user-specified reference panel of individuals. When merging, the script documents and removes any remaining conflicting and multi-allelic sites. The merged dataset is then filtered to include only informative SNPs present in both the cohort data and the reference panel, using a minor allele frequency filter of 0.5% and a genotyping rate cutoff of 90%. The program SHAPEIT2 (O'Connell et al., 2014) is used to phase each chromosome separately, informed by a recombination map that is expected to be in the format of the HapMap combined b37 recombination map (The International HapMap Consortium 2005). This merged, filtered, phased data is then fed into RFMix with a user-specified reference individual map file, as required by RFMix. A detailed description of this file is in the RFMix v2 manual (https://github.com/slowkoni/rfmix/blob/master/MANUAL.md). Some manual processing of recombination map files may be required depending on the original format of the file used.

Usage:
Launch step 2:
```sh Merge_Phase_RFmix.sh <cohort-data-stem> <reference-panel-stem> <genetic-map> <ancestry-reference-map>```

Output will be in haps/sample format from SHAPEIT2, followed by local ancestry calls from RFMix.
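
For example, with hypothetical file names (placeholders, not files shipped with this repo), a full run of both steps might look like:

```sh CohortDataQC_final.sh mycohort dbsnp144_GRCh37.bed 1000GP_Phase3_GRCh37_combined.legend```

```sh Merge_Phase_RFmix.sh mycohort 1000G_refpanel genetic_map_b37_combined.txt ancestry_ref_map.txt```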
--------------------------------------------------------------------------------
/find_cg_at_snps.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
## Print the IDs of A/T and C/G SNPs in a PLINK bim file, one per line
from sys import argv

bimfile = argv[-1]
assert bimfile.endswith('.bim')

with open(bimfile) as f:
    for line in f:
        fields = line.strip().split()
        alleles = fields[-2:]  # A1 and A2 are the last two bim columns
        if sorted(alleles) in (['C', 'G'], ['A', 'T']):
            print(fields[1])  # SNP ID
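
## Hypothetical usage (placeholder file names), redirecting the printed IDs to an exclusion list:
##   python find_cg_at_snps.py mycohort.bim > mycohort.ATCGsites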
--------------------------------------------------------------------------------
/match_against_1000g_v2.py:
--------------------------------------------------------------------------------
__author__ = "mlin"
from optparse import OptionParser
import pandas as pd

USAGE = """
match_against_1000g.py --bim <PLINK bim file>
                       --legend <1000 genomes legend file>
                       --out <output stem>
"""

parser = OptionParser(USAGE)
parser.add_option("--bim", help="PLINK bim file")
parser.add_option("-l", "--legend", help="1000 genomes legend", default="/vault/public/1000g/LegendFiles/autosome_1000GP_Phase3_GRCh37_combined.legend")
parser.add_option("-o", "--out", default="NA")

(options, args) = parser.parse_args()

bimfile = options.bim
legendfile = options.legend
if options.out == "NA":
    out = bimfile
else:
    out = options.out

## functions ##

def StrFlip(letter):
    # flip an allele to the opposite strand
    if letter == '0':
        return '0'
    elif letter == 'G':
        return 'C'
    elif letter == 'C':
        return 'G'
    elif letter == 'T':
        return 'A'
    elif letter == 'A':
        return 'T'
    else:
        return letter  # D, I, other indels etc.

def gFlip(gtype):
    # e.g. flip A:G to T:C
    g1, g2 = gtype.split(":")
    return StrFlip(g1) + ":" + StrFlip(g2)


# read in
bim = pd.read_csv(bimfile, header=None, sep=r'\s+', dtype=str)
legend = pd.read_csv(legendfile, header=0, sep=r'\s+', dtype=str)

# create an index according to chr and bp, then the A1 and A2 alleles
bim['position'] = bim[0].astype(str).str.cat(bim[3].astype(str), sep=":")  # key in the form of chr:bp
bim['gtype1'] = bim[4].astype(str).str.cat(bim[5].astype(str), sep=":")
bim['gtype2'] = bim[5].astype(str).str.cat(bim[4].astype(str), sep=":")

legend['index'] = legend['chr'].str.cat(legend['position'], sep=":")
legend['gtype'] = legend['a0'].str.cat(legend['a1'], sep=":")

# dictionary on legend
legend_dict = dict(zip(legend['index'], legend['gtype']))

# mark indels
bim['indel'] = 0
for i in range(0, bim[0].size):
    bim.loc[i, 'indel'] = int(bim[4][i] in ('D', 'I') or bim[5][i] in ('D', 'I') or len(bim['gtype1'][i]) > 3)

# marks of snps to remove or flip
bim['remove'] = 0
bim['flip'] = 0


# start matching
for i, snp in enumerate(bim['position']):
    if i % 1000 == 0:
        print("On line:", i)
    if bim['indel'][i] == 0:  # not an indel
        gtype = legend_dict.get(snp, 'NA')  # 'NA' if the locus is not found in 1000g
        if gtype == 'NA':
            bim.loc[i, 'remove'] = 1
        elif gtype == bim['gtype1'][i] or gtype == bim['gtype2'][i]:  # matches 1000g alt and ref
            continue
        elif gtype == gFlip(bim['gtype1'][i]) or gtype == gFlip(bim['gtype2'][i]):  # need to flip strand to match
            bim.loc[i, 'flip'] = 1
        elif bim['gtype1'][i].split(":")[0] == '0' or bim['gtype1'][i].split(":")[1] == '0':  # locus fixed
            bim_allele = bim['gtype1'][i].strip('0').strip(':')
            legend_allele = gtype.split(":")
            if bim_allele in legend_allele:  # fixed but matches 1000g
                continue
            elif StrFlip(bim_allele) in legend_allele:  # fixed but needs to be flipped
                bim.loc[i, 'flip'] = 1
            else:
                bim.loc[i, 'remove'] = 1
        else:
            bim.loc[i, 'remove'] = 1


out_indel = open(out + ".Indel.txt", 'w')
out_remove = open(out + ".NonMatching.txt", 'w')
out_flip = open(out + ".FlipStrand.txt", 'w')

for i in range(0, bim[0].size):
    if bim['indel'][i] == 1:
        out_indel.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim[1][i], bim[2][i], bim[3][i], bim[4][i], bim[5][i]))
    if bim['remove'][i] == 1:
        out_remove.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim[1][i], bim[2][i], bim[3][i], bim[4][i], bim[5][i]))
    if bim['flip'][i] == 1:
        out_flip.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim[1][i], bim[2][i], bim[3][i], bim[4][i], bim[5][i]))

out_indel.close()
out_remove.close()
out_flip.close()
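
## Hypothetical usage (placeholder file names):
##   python match_against_1000g_v2.py --bim mycohort.bim --legend 1000GP_Phase3.legend --out mycohort.1kg
## This writes mycohort.1kg.Indel.txt, mycohort.1kg.NonMatching.txt, and mycohort.1kg.FlipStrand.txt.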
--------------------------------------------------------------------------------
/update_rsID_bim_arg.py:
--------------------------------------------------------------------------------
__author__ = "mlin, adapted for specific use by egatkinson"
import argparse
import numpy as np
import pandas as pd

USAGE = """
update_rsID_bim_arg.py --bim <PLINK bim file to update>
                       --bed <dbSNP build bed file>
                       --format <T or F, format unmatched IDs as chr:bp, default F>
                       --codechr <T or F, use X, Y, XY, M codes in the bed file, default T>
                       --out <output file>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--bim", help="PLINK bim file to update SNP IDs in")
parser.add_argument("--bed", help="e.g. a dbSNP build bed file")
parser.add_argument("-f", "--format", default="F", help="T or F, format unmatched IDs as chr:bp")
parser.add_argument("-c", "--codechr", default="T", help="T or F")
parser.add_argument("-o", "--out", default="out")

args = parser.parse_args()

bimfile = args.bim
bedfile = args.bed
out = open(args.out, 'w')

# read in
bim = pd.read_csv(bimfile, header=None, sep=r'\s+', dtype=str)
bed = pd.read_csv(bedfile, header=None, sep=r'\s+', dtype=str)

bim['chr'] = bim[0]  # temporarily store the original chr values
if args.codechr == 'T':
    for i in range(0, bim[0].size):
        if bim[0][i] == "23":
            bim.loc[i, 0] = "X"
        if bim[0][i] == "24":
            bim.loc[i, 0] = "Y"
        if bim[0][i] == "25":
            bim.loc[i, 0] = "XY"
        if bim[0][i] == "26":
            bim.loc[i, 0] = "M"

# create an index according to chr and bp
bim['position'] = bim[0].astype(str).str.cat(bim[3].astype(str), sep=":")  # key in the form of chr:bp
if 'chr' in bed[0][0]:
    bed['position'] = bed[0].astype(str).str.lstrip('chr').str.cat(bed[2].astype(str), sep=":")
else:
    bed['position'] = bed[0].astype(str).str.cat(bed[2].astype(str), sep=":")

# build a dictionary mapping the position index to the rsID of the reference
bed_dict = dict(zip(bed['position'], bed[3]))

# mark indels
bim['indel'] = 0
for i in range(0, bim[0].size):
    bim.loc[i, 'indel'] = int(bim[4][i] in ('D', 'I') or bim[5][i] in ('D', 'I') or len(bim[4][i]) > 1 or len(bim[5][i]) > 1)

# update the bim with new rsIDs and write it out row by row
bim['new'] = ""
bim[0] = bim['chr']  # convert back to the original chr values
for i, snp in enumerate(bim['position']):
    if args.format == 'T':
        if bim['indel'][i] == 0:  # not an indel
            bim.loc[i, 'new'] = bed_dict.get(snp, snp)  # if not matched, use "chr:bp" as the new ID of the locus
        else:  # indels are renamed 'chr:bp_indel'
            bim.loc[i, 'new'] = snp + '_indel'
    else:
        if bim['indel'][i] == 0:
            bim.loc[i, 'new'] = bed_dict.get(snp, bim[1][i])
        else:  # indel, don't change the name
            bim.loc[i, 'new'] = bim[1][i]
    out.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim['new'][i], str(bim[2][i]), str(int(bim[3][i])), bim[4][i], bim[5][i]))
out.close()
--------------------------------------------------------------------------------