├── CohortDataQC_final.sh
├── Merge_Phase_RFmix.sh
├── PGC_LAI_pipeline1.png
├── README.md
├── find_cg_at_snps.py
├── match_against_1000g_v2.py
└── update_rsID_bim_arg.py

/CohortDataQC_final.sh:
--------------------------------------------------------------------------------
#!/bin/sh

## A script for processing cohort data files for use in LAI with 1000 Genomes reference individuals
## Written by Elizabeth Atkinson. 1/23/18
## Sub-scripts for processing steps courtesy of Meng Lin and Chris Gignoux

## Required parameters:
## 1. Binary plink files for the cohort data in question. Supply the bed stem (not including the file extension).
## 2. plink installed and on the path
## 3. R installed and on the path
## 4. python installed and on the path
## Result: a new set of binary plink files in which all allele rsIDs are renamed to dbSNP 144 and sites are oriented to 1000 Genomes, with non-matching sites, indels, duplicates, sex chromosomes, and triallelic sites removed. Output plink files will be the input DATA name suffixed with .QCed.

## Usage is: sh CohortDataQC_final.sh <data-stem> <dbSNP-bed-file> <1000G-legend-file>


## Unpack the parameters into labelled variables
DATA=$1
DBSNP=$2
LEG=$3

## Keep only the autosomes in the data file
plink --bfile $DATA --chr 1-22 --make-bed --out $DATA.auto

## Find and get rid of duplicate loci in the bim file,
## then keep the good SNPs in the plink file
cut -f2,4 $DATA.auto.bim | uniq -f1 > $DATA.NonDupSNPs
cut -f2,4 $DATA.auto.bim | uniq -D -f1 > $DATA.DuplicateSNPs
cat $DATA.DuplicateSNPs | uniq -f1 > $DATA.FirstDup
cat $DATA.NonDupSNPs $DATA.FirstDup > $DATA.SNPstoKeep

## Extract the retained SNPs with plink
plink --bfile $DATA.auto --extract $DATA.SNPstoKeep --make-bed --out $DATA.auto.nodup


## Update SNP IDs to dbSNP 144
python update_rsID_bim_arg.py --bim $DATA.auto.nodup.bim --bed $DBSNP --format T --codechr F --out $DATA.auto.nodup.dbsnp.bim

## Copy the other files over to this name
cp $DATA.auto.nodup.bed $DATA.auto.nodup.dbsnp.bed
cp $DATA.auto.nodup.fam $DATA.auto.nodup.dbsnp.fam

## Orient to 1000G
python match_against_1000g_v2.py --bim $DATA.auto.nodup.dbsnp.bim --legend $LEG --out $DATA.1kg
## This script has three outputs (suffixed to the --out stem so that full paths can be input):
## 1) [outfile].Indel.txt: a bim file of indels
## 2) [outfile].NonMatching.txt: a bim file of loci not found in 1000 Genomes, or with different coding alleles than 1000 Genomes (tri-allelic, for example). These should be removed.
## 3) [outfile].FlipStrand.txt: a bim file of loci to flip.

## Combine the lists of indels and triallelic/non-matching sites into one list of bad SNPs to remove
cat $DATA.1kg.Indel.txt $DATA.1kg.NonMatching.txt > $DATA.1kg.badsites.txt

## Flip strands for flipped sites and remove non-matching loci using plink
plink --bfile $DATA.auto.nodup.dbsnp --exclude $DATA.1kg.badsites.txt --make-bed --out $DATA.auto.nodup.dbsnp.1ksites
plink --bfile $DATA.auto.nodup.dbsnp.1ksites --flip $DATA.1kg.FlipStrand.txt --make-bed --out $DATA.auto.nodup.dbsnp.1ksites.flip

## Export a warning flag if there are too many mismatched sites compared to 1000 Genomes
wc -l $DATA.1kg.NonMatching.txt > nonCount
wc -l $DATA.1kg.FlipStrand.txt > flipcount
wc -l $DATA.auto.nodup.dbsnp.bim > totalsites
paste nonCount flipcount totalsites > SiteCounts

awk '{if ($1/$5 > 0.01) print "WARNING: "$1/$5*100"% of sites are problematic when compared to 1000G. This could be indicative of a different reference build or other data file incompatibility." }' SiteCounts > Warnings.out

## Find and remove A/T, C/G loci
python find_cg_at_snps.py $DATA.auto.nodup.dbsnp.1ksites.flip.bim > $DATA.ATCGsites

plink --bfile $DATA.auto.nodup.dbsnp.1ksites.flip --exclude $DATA.ATCGsites --make-bed --out $DATA.QCed

## Cohort data is now formatted to merge properly with the 1000G reference panel
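
## A hypothetical example invocation (the file names below are placeholders, not files shipped with
## this repo). The three arguments map to DATA, DBSNP, and LEG as unpacked above:
##   sh CohortDataQC_final.sh mycohort dbsnp144_GRCh37.bed 1000GP_Phase3_GRCh37_combined.legend
## where mycohort.bed/.bim/.fam are the cohort binary plink files; output would be mycohort.QCed.*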
--------------------------------------------------------------------------------
/Merge_Phase_RFmix.sh:
--------------------------------------------------------------------------------
#!/bin/sh
### Elizabeth G. Atkinson
### 2/22/18
## Post-processing shapeit haps/sample files to be input into RFmix for cohort local ancestry inference

## Usage is: sh Merge_Phase_RFmix.sh <cohort-data-stem> <reference-panel-bed-stem> <genetic-map> <ancestry-reference-map>
## The script will only consider the autosomes unless modified
## The data stem is the core filename before "*chr{1-22}.haps/sample"
## shapeit2, RFMix v2, plink, and vcftools are expected to be in the path
## GEN is expected to be in the format of the HapMap recombination map
## The ancestry reference map assigns reference individuals to their ancestral populations of origin, as described in the RFMix manual

## Unpack the parameters into labelled variables
DATA=$1
REF=$2
GEN=$3
MAP=$4

## Cohort data is now formatted to merge properly with the 1000G reference panel from step 1 - suffixed .QCed
## Merge 1000G and cohort data
plink --bfile $REF --bmerge $DATA.QCed --make-bed --out $DATA.QCed.1kmerge

## Filter the merged dataset to only well-genotyped sites present on both the cohort and 1000G platforms - >=90% genotyping rate and MAF >= 0.5%
plink --bfile $DATA.QCed.1kmerge --allow-no-sex --make-bed --geno 0.1 --maf 0.005 --out $DATA.QCed.1kmerge.filt

## Separate out the chromosomes for phasing
for i in {1..22}; do plink --bfile $DATA.QCed.1kmerge.filt --allow-no-sex --chr ${i} --make-bed --out $DATA.QCed.1kmerge.filt.chr${i} ;done

## Then phase them with SHAPEIT2,
## assuming all chroms are present in the same genetic map file linked to in the initial command
### NOTE - this conducts joint phasing with the reference panel. In many cases you'll want to phase the cohort data using the reference panel as a separate flag.
### That can instead be implemented in SHAPEIT2 with a flag similar to:
###   --input-ref reference.haplotypes.gz reference.legend.gz reference.sample

for i in {1..22}; do \
shapeit --input-bed $DATA.QCed.1kmerge.filt.chr${i} -M $GEN -O $DATA.QCed.1kmerge.filt.phased.chr${i} --thread 8 ;done

## Also make a list of the cohort individuals
cut -d' ' -f2 $DATA.fam > $DATA.indivs.txt

## Convert the shapeit output into VCF format to put into RFmix v2
for i in {1..22}; do shapeit -convert --input-haps $DATA.QCed.1kmerge.filt.phased.chr${i} --output-vcf $DATA.chr${i}.vcf ;done

## Make a vcf file of just the cohort individuals
for i in {1..22}; do vcftools --vcf $DATA.chr${i}.vcf --keep $DATA.indivs.txt --recode --out $DATA.cohort.chr${i} ;done

## Make a vcf file of just the ref individuals, assuming they're everyone who wasn't in the cohort
for i in {1..22}; do vcftools --vcf $DATA.chr${i}.vcf --remove $DATA.indivs.txt --recode --out $DATA.ref.chr${i} ;done

## bgzip these
for i in {1..22}; do bgzip $DATA.ref.chr${i}.recode.vcf ;done
for i in {1..22}; do bgzip $DATA.cohort.chr${i}.recode.vcf ;done

## And run RFmix, split for each chromosome separately.
## The recombination map might need to be further processed to make RFMix happy depending on the format
for i in {1..22}; do \
rfmix -f $DATA.cohort.chr${i}.recode.vcf.gz -r $DATA.ref.chr${i}.recode.vcf.gz --chromosome=${i} -m $MAP -g $GEN -n 5 -e 1 --reanalyze-reference --num-threads 8 -o $DATA.rfmix.chr${i} ;done
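
## A hypothetical example invocation (file names are placeholders):
##   sh Merge_Phase_RFmix.sh mycohort 1000G_refpanel genetic_map_b37_combined.txt ancestry_ref_map.txt
## where mycohort.QCed.* are the step-1 outputs, 1000G_refpanel.bed/.bim/.fam is the reference panel,
## and the last two arguments are the HapMap-format recombination map and the RFMix sample map.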
--------------------------------------------------------------------------------
/PGC_LAI_pipeline1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atkinson-Lab/Post-QC/5933243cb5640f2485842a5dfe02972df8510043/PGC_LAI_pipeline1.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Post-QC
Post variant-calling QC pipeline for orienting cohort data to a reference file. An additional step for merging, phasing jointly with a reference panel, and running local ancestry inference is also included. Both steps are implemented as automated scripts that can be called with bash. Subscripts are in python.


Dependencies:
Programs need to be on the path.

Step 1 (cohort data post-QC and/or merging):
python, plus the modules pandas, numpy, and argparse
PLINK

Step 2 (phasing):
PLINK
SHAPEIT2
VCFtools
RFmix


Detailed description of STEP 1: Cohort data post-QC and harmonization with a reference panel

Post-variant-calling QC to clean and consistently prepare the data for downstream analysis. The steps conducted, in order, are as follows:
1. Extract only the autosomes from the data file
2. Find and remove duplicate loci
3. Update SNP IDs to dbSNP 144 (Sherry et al., 2001)
4. Orient the data to the 1000 Genomes reference. This involves 3 substeps:
   a. Find and remove indels
   b. Find and remove loci not found in 1000 Genomes, or that have different coding alleles than 1000 Genomes (tri-allelic, for example)
   c. Flip alleles that are on the wrong strand
5. Remove A/T, G/C loci

This script also outputs a warning if more than 1% of sites are incongruous between the input dataset and 1000G locations. A large discrepancy in SNP physical locations can occur if the datasets are on different reference builds. A/T and G/C loci cannot be strand-resolved and are therefore routinely removed.

Though this has been tested with 1000G, any reference panel in legend format should work.

Usage:

Launch step 1, data QC and harmonization:
```sh CohortDataQC_final.sh <data-stem> <dbSNP-bed-file> <1000G-legend-file>```

Output is suffixed with .QCed

STEP 2: Merging and phasing.

Post-QC'ed cohort data is then intersected and jointly phased with a user-specified reference panel of individuals. When merging, the script documents and removes any remaining conflicting and multi-allelic sites. The merged dataset is then filtered to include only informative SNPs present in both the cohort data and the reference panel, using a minor allele frequency filter of 0.5% and a genotyping rate cutoff of 90%. The program SHAPEIT2 (O'Connell et al., 2014) is used to phase each chromosome separately, informed by a recombination map that is expected to be in the format of the HapMap combined b37 recombination map (The International HapMap Consortium 2005). This merged, filtered, phased data is then fed into RFMix with a user-specified reference individual map file, as required by RFMix. A detailed description of this file is in the RFMix v2 manual (https://github.com/slowkoni/rfmix/blob/master/MANUAL.md). Some manual processing of recombination map files may be required depending on the original format of the file used.

Usage:
Launch step 2:
```sh Merge_Phase_RFmix.sh <cohort-data-stem> <reference-panel-stem> <genetic-map> <ancestry-reference-map>```

Output will be in haps/sample format from SHAPEIT2, followed by local ancestry calls from RFMix.
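
For example, with hypothetical file names (placeholders, not files shipped with this repo), a full run of both steps might look like:

```sh CohortDataQC_final.sh mycohort dbsnp144_GRCh37.bed 1000GP_Phase3_GRCh37_combined.legend```

```sh Merge_Phase_RFmix.sh mycohort 1000G_refpanel genetic_map_b37_combined.txt ancestry_ref_map.txt```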
--------------------------------------------------------------------------------
/find_cg_at_snps.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
## Print the IDs of A/T and C/G SNPs in a PLINK bim file, one per line
from sys import argv

bimfile = argv[-1]
assert bimfile.endswith('.bim')

with open(bimfile) as f:
    for line in f:
        fields = line.strip().split()
        alleles = fields[-2:]  # A1 and A2 are the last two bim columns
        if sorted(alleles) in (['C', 'G'], ['A', 'T']):
            print(fields[1])  # SNP ID
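
## Hypothetical usage (placeholder file names), redirecting the printed IDs to an exclusion list:
##   python find_cg_at_snps.py mycohort.bim > mycohort.ATCGsites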
--------------------------------------------------------------------------------
/match_against_1000g_v2.py:
--------------------------------------------------------------------------------
__author__ = "mlin"
from optparse import OptionParser
import pandas as pd

USAGE = """
match_against_1000g.py --bim <PLINK bim file>
                       --legend <1000 genomes legend file>
                       --out <output stem>
"""

parser = OptionParser(USAGE)
parser.add_option("--bim", help="PLINK bim file")
parser.add_option("-l", "--legend", help="1000 genomes legend", default="/vault/public/1000g/LegendFiles/autosome_1000GP_Phase3_GRCh37_combined.legend")
parser.add_option("-o", "--out", default="NA")

(options, args) = parser.parse_args()

bimfile = options.bim
legendfile = options.legend
if options.out == "NA":
    out = bimfile
else:
    out = options.out

## functions ##

def StrFlip(letter):
    # flip an allele to the opposite strand
    if letter == '0':
        return '0'
    elif letter == 'G':
        return 'C'
    elif letter == 'C':
        return 'G'
    elif letter == 'T':
        return 'A'
    elif letter == 'A':
        return 'T'
    else:
        return letter  # D, I, other indels etc.

def gFlip(gtype):
    # e.g. flip A:G to T:C
    g1, g2 = gtype.split(":")
    return StrFlip(g1) + ":" + StrFlip(g2)


# read in
bim = pd.read_csv(bimfile, header=None, sep=r'\s+', dtype=str)
legend = pd.read_csv(legendfile, header=0, sep=r'\s+', dtype=str)

# create an index according to chr and bp, then the A1 and A2 alleles
bim['position'] = bim[0].astype(str).str.cat(bim[3].astype(str), sep=":")  # key in the form of chr:bp
bim['gtype1'] = bim[4].astype(str).str.cat(bim[5].astype(str), sep=":")
bim['gtype2'] = bim[5].astype(str).str.cat(bim[4].astype(str), sep=":")

legend['index'] = legend['chr'].str.cat(legend['position'], sep=":")
legend['gtype'] = legend['a0'].str.cat(legend['a1'], sep=":")

# dictionary on legend
legend_dict = dict(zip(legend['index'], legend['gtype']))

# mark indels
bim['indel'] = 0
for i in range(0, bim[0].size):
    bim.loc[i, 'indel'] = int(bim[4][i] in ('D', 'I') or bim[5][i] in ('D', 'I') or len(bim['gtype1'][i]) > 3)

# marks of snps to remove or flip
bim['remove'] = 0
bim['flip'] = 0


# start matching
for i, snp in enumerate(bim['position']):
    if i % 1000 == 0:
        print("On line:", i)
    if bim['indel'][i] == 0:  # not an indel
        gtype = legend_dict.get(snp, 'NA')  # 'NA' if the locus is not found in 1000g
        if gtype == 'NA':
            bim.loc[i, 'remove'] = 1
        elif gtype == bim['gtype1'][i] or gtype == bim['gtype2'][i]:  # matches 1000g alt and ref
            continue
        elif gtype == gFlip(bim['gtype1'][i]) or gtype == gFlip(bim['gtype2'][i]):  # need to flip strand to match
            bim.loc[i, 'flip'] = 1
        elif bim['gtype1'][i].split(":")[0] == '0' or bim['gtype1'][i].split(":")[1] == '0':  # locus fixed
            bim_allele = bim['gtype1'][i].strip('0').strip(':')
            legend_allele = gtype.split(":")
            if bim_allele in legend_allele:  # fixed but matches 1000g
                continue
            elif StrFlip(bim_allele) in legend_allele:  # fixed but needs to be flipped
                bim.loc[i, 'flip'] = 1
            else:
                bim.loc[i, 'remove'] = 1
        else:
            bim.loc[i, 'remove'] = 1


out_indel = open(out + ".Indel.txt", 'w')
out_remove = open(out + ".NonMatching.txt", 'w')
out_flip = open(out + ".FlipStrand.txt", 'w')

for i in range(0, bim[0].size):
    if bim['indel'][i] == 1:
        out_indel.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim[1][i], bim[2][i], bim[3][i], bim[4][i], bim[5][i]))
    if bim['remove'][i] == 1:
        out_remove.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim[1][i], bim[2][i], bim[3][i], bim[4][i], bim[5][i]))
    if bim['flip'][i] == 1:
        out_flip.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim[1][i], bim[2][i], bim[3][i], bim[4][i], bim[5][i]))

out_indel.close()
out_remove.close()
out_flip.close()
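
## Hypothetical usage (placeholder file names):
##   python match_against_1000g_v2.py --bim mycohort.bim --legend 1000GP_Phase3.legend --out mycohort.1kg
## This writes mycohort.1kg.Indel.txt, mycohort.1kg.NonMatching.txt, and mycohort.1kg.FlipStrand.txt.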
--------------------------------------------------------------------------------
/update_rsID_bim_arg.py:
--------------------------------------------------------------------------------
__author__ = "mlin, adapted for specific use by egatkinson"
import argparse
import numpy as np
import pandas as pd

USAGE = """
update_rsID_bim_arg.py --bim <PLINK bim file to update>
                       --bed <dbSNP build bed file>
                       --format <T or F, format unmatched IDs as chr:bp, default F>
                       --codechr <T or F, use X, Y, XY, M codes in the bed file, default T>
                       --out <output file>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--bim", help="PLINK bim file to update SNP IDs in")
parser.add_argument("--bed", help="e.g. a dbSNP build bed file")
parser.add_argument("-f", "--format", default="F", help="T or F, format unmatched IDs as chr:bp")
parser.add_argument("-c", "--codechr", default="T", help="T or F")
parser.add_argument("-o", "--out", default="out")

args = parser.parse_args()

bimfile = args.bim
bedfile = args.bed
out = open(args.out, 'w')

# read in
bim = pd.read_csv(bimfile, header=None, sep=r'\s+', dtype=str)
bed = pd.read_csv(bedfile, header=None, sep=r'\s+', dtype=str)

bim['chr'] = bim[0]  # temporarily store the original chr values
if args.codechr == 'T':
    for i in range(0, bim[0].size):
        if bim[0][i] == "23":
            bim.loc[i, 0] = "X"
        if bim[0][i] == "24":
            bim.loc[i, 0] = "Y"
        if bim[0][i] == "25":
            bim.loc[i, 0] = "XY"
        if bim[0][i] == "26":
            bim.loc[i, 0] = "M"

# create an index according to chr and bp
bim['position'] = bim[0].astype(str).str.cat(bim[3].astype(str), sep=":")  # key in the form of chr:bp
if 'chr' in bed[0][0]:
    bed['position'] = bed[0].astype(str).str.lstrip('chr').str.cat(bed[2].astype(str), sep=":")
else:
    bed['position'] = bed[0].astype(str).str.cat(bed[2].astype(str), sep=":")

# build a dictionary mapping the position index to the rsID of the reference
bed_dict = dict(zip(bed['position'], bed[3]))

# mark indels
bim['indel'] = 0
for i in range(0, bim[0].size):
    bim.loc[i, 'indel'] = int(bim[4][i] in ('D', 'I') or bim[5][i] in ('D', 'I') or len(bim[4][i]) > 1 or len(bim[5][i]) > 1)

# update the bim with new rsIDs and write it out row by row
bim['new'] = ""
bim[0] = bim['chr']  # convert back to the original chr values
for i, snp in enumerate(bim['position']):
    if args.format == 'T':
        if bim['indel'][i] == 0:  # not an indel
            bim.loc[i, 'new'] = bed_dict.get(snp, snp)  # if not matched, use "chr:bp" as the new ID of the locus
        else:  # indels are renamed 'chr:bp_indel'
            bim.loc[i, 'new'] = snp + '_indel'
    else:
        if bim['indel'][i] == 0:
            bim.loc[i, 'new'] = bed_dict.get(snp, bim[1][i])
        else:  # indel, don't change the name
            bim.loc[i, 'new'] = bim[1][i]
    out.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (bim[0][i], bim['new'][i], str(bim[2][i]), str(int(bim[3][i])), bim[4][i], bim[5][i]))
out.close()
--------------------------------------------------------------------------------