├── resources ├── hla_bwa.kit.fna.gz ├── Notes_on_DRB5_gen.txt ├── DRB5_gen.txt └── HLA_decoys.fa ├── src ├── Val.java ├── HLASequence.java ├── PathBaseErrorProb.java ├── MergeStatus.java ├── Score.java ├── LogHandler.java ├── Group.java ├── ScoreRecord.java ├── NomG.java ├── TmpPath.java ├── Result.java ├── Node.java ├── DNAString.java ├── Needle.java ├── Base.java ├── CustomHashMap.java ├── BubblePathLikelihoodScores.java ├── AllelePath.java ├── SuperAllelePath.java ├── CustomWeightedEdge.java ├── FormatIMGT.java └── HLA.java ├── LICENSE ├── scripts ├── download_panel.sh ├── download_grch38.sh ├── formatIMGT.sh ├── alignAndExtract_hs38DH_NoAlt.sh ├── alignAndExtract_hs38.sh └── alignAndExtract_hs38Alt.sh ├── pom.xml ├── README.md └── preprocessing.md /resources/hla_bwa.kit.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kingsford-Group/kourami/HEAD/resources/hla_bwa.kit.fna.gz -------------------------------------------------------------------------------- /resources/Notes_on_DRB5_gen.txt: -------------------------------------------------------------------------------- 1 | # DRB5_gen.txt is missing in IMGT/HLA releases. 2 | # There is only one known full-length allele, DRB5*01:01:01. 3 | # The "msf" directory contains DRB5_gen.msf. However, the alignments directory doesn't contain DRB5_gen.txt. 4 | # DRB5_gen.txt was originally created from the intron/exon region annotation for DRB5*01:01:01 5 | # found in the hla.xml file in the IMGT/HLA 3.24.0 release. 6 | # The DRB5*01:01:01 sequence was added initially in the 3.22.0 release and was updated 7 | # (single base appended at the end - doesn't affect Kourami at all) in 3.26.0. 
8 | -------------------------------------------------------------------------------- /src/Val.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | public class Val{ 7 | 8 | private int whichH;//-1 if same score (split), 0 for H1, 1 for H2 9 | 10 | public Val(){ 11 | whichH=-1; 12 | } 13 | 14 | public Val(int n){ 15 | this.whichH = n; 16 | } 17 | 18 | public void set(int n){ 19 | this.whichH = n; 20 | } 21 | 22 | public int getWhichH(){ 23 | return this.whichH; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/HLASequence.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | /* 7 | * Simple class storing allele name and its typing sequence 8 | * 9 | */ 10 | public class HLASequence{ 11 | 12 | private Group grp; 13 | private String sequence; 14 | 15 | public HLASequence(Group g, Sequence seq){ 16 | this.grp = g; 17 | this.sequence = seq.getTypingSequence(); 18 | } 19 | 20 | public Group getGroup(){ 21 | return this.grp; 22 | } 23 | 24 | public String toString(){ 25 | return ">" + this.grp.getGroupString() + "\n" + this.sequence + "\n"; 26 | } 27 | 28 | public String getSequence(){ 29 | return this.sequence; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/PathBaseErrorProb.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 
// 5 | */
/**
 * Stores the bases of one path together with a per-read, per-position
 * error-probability table of size numReads x len.
 */
public class PathBaseErrorProb{

    private final char[] pathBases;
    private final double[][] errorProb;

    /**
     * @param numReads number of rows of the error-probability table
     * @param len      number of positions (columns) and length of the base array
     */
    public PathBaseErrorProb(int numReads, int len){
        pathBases = new char[len];
        errorProb = new double[numReads][len];
    }

    /** Stores base {@code b} at position {@code pos} of the path. */
    public void addPathBases(char b, int pos){
        pathBases[pos] = b;
    }

    /** Stores error probability {@code e} for read {@code readIndex} at position {@code pos}. */
    public void add(double e, int readIndex, int pos){
        errorProb[readIndex][pos] = e;
    }

    /** Number of reads, i.e. rows of the error-probability table. */
    public int numReads(){
        return errorProb.length;
    }

    /** Returns the internal row for read {@code n} (not a copy). */
    public double[] getNthReadErrorProb(int n){
        return errorProb[n];
    }

    /** Returns the internal base array (not a copy). */
    public char[] getBases(){
        return pathBases;
    }

    /** Returns the internal error-probability table (not a copy). */
    public double[][] getErrorProb(){
        return errorProb;
    }
}
// --- repository-dump text that shared the original line, kept verbatim ---
// 46 | -------------------------------------------------------------------------------- /src/MergeStatus.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing.
5 | */ 6 | public class MergeStatus{ 7 | 8 | private boolean split; 9 | private boolean segregating; 10 | private int lastSegregationColumnIndex; 11 | 12 | public MergeStatus(){ 13 | this.split = false; 14 | this.segregating = false; 15 | this.lastSegregationColumnIndex = -1; 16 | } 17 | 18 | public MergeStatus(boolean segregating, int lastSegCI){ 19 | this(); 20 | this.segregating = segregating; 21 | this.lastSegregationColumnIndex = lastSegCI; 22 | } 23 | 24 | public int getLastSegregationColumnIndex(){ 25 | return this.lastSegregationColumnIndex; 26 | } 27 | 28 | public boolean isSplit(){ 29 | return this.split; 30 | } 31 | 32 | public boolean isSegregating(){ 33 | return this.segregating; 34 | } 35 | 36 | public void setSplit(boolean b){ 37 | this.split = b; 38 | } 39 | public void setSegregating(boolean b){ 40 | this.segregating = b; 41 | } 42 | 43 | public void setLastSegregationColumnIndex(int ci){ 44 | this.lastSegregationColumnIndex = ci; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/Score.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 
// 5 | */
/**
 * Scores for one pair of alleles, together with the (i, j) indices of that pair.
 *
 * Ordering is DESCENDING on the score selected by the static {@link #sortIndex},
 * which is shared by every instance; set it before sorting.
 *
 * Implements {@code Comparable<Score>} (not the raw type) so that
 * {@code compareTo(Score)} actually implements the interface method and
 * {@code Collections.sort} can be used on a {@code List<Score>}.
 */
public class Score implements Comparable<Score>{

    private double[] scores; //scores for each pair of alleles (n+1)*n/2 pairings
    private int[] pairIndicies; //size 2 int array holding i,j pair for alleles index
    public static int sortIndex;

    /**
     * @param s scores, one entry per scoring scheme (stored by reference)
     * @param i first allele index of the pair
     * @param j second allele index of the pair
     */
    public Score(double[] s, int i , int j){
        this(s, new int[]{i, j});
    }

    /**
     * @param s scores, one entry per scoring scheme (stored by reference)
     * @param p index pair (stored by reference)
     */
    public Score(double[] s, int[] p){
        this.scores = s;
        this.pairIndicies = p;
    }

    /**
     * Descending order on the score at {@link #sortIndex}: returns -1 when this
     * score is larger, 1 when it is smaller, 0 otherwise (including when either
     * value is NaN, exactly as the plain {@code >} / {@code <} comparison yields).
     */
    @Override
    public int compareTo(Score os){
        double mine = this.getNthScore(Score.sortIndex);
        double theirs = os.getNthScore(Score.sortIndex);
        if(mine > theirs)
            return -1;
        if(mine < theirs)
            return 1;
        return 0;
    }

    /** Returns the score stored for the given scoring scheme index. */
    public double getNthScore(int scoringScheme){
        return this.scores[scoringScheme];
    }

    /** Returns the internal (i, j) index pair (not a copy). */
    public int[] getIndicies(){
        return this.pairIndicies;
    }
}
// --- repository-dump text that shared the original line, kept verbatim ---
// 42 | -------------------------------------------------------------------------------- /src/LogHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing.
5 | */ 6 | import java.io.*; 7 | 8 | public class LogHandler{ 9 | 10 | public static boolean debug = true; 11 | 12 | public StringBuffer bf; 13 | 14 | public BufferedWriter bw; 15 | 16 | public LogHandler(){ 17 | this.bf = new StringBuffer(); 18 | this.bw = null; 19 | } 20 | 21 | public void flush(){ 22 | try{ 23 | if(this.bw == null){ 24 | this.bw = new BufferedWriter(new FileWriter(HLA.OUTPREFIX + ".log")); 25 | } 26 | this.bw.write(this.bf.toString()); 27 | this.bw.flush(); 28 | this.bf = new StringBuffer(); 29 | }catch(IOException ioe){ 30 | ioe.printStackTrace(); 31 | } 32 | } 33 | 34 | public void appendln(int i){ 35 | bf.append(i + "\n"); 36 | } 37 | 38 | public void appendln(String line){ 39 | bf.append(line + "\n"); 40 | } 41 | 42 | public void appendln(char c){ 43 | bf.append(c + "\n"); 44 | } 45 | 46 | public void appendln(){ 47 | bf.append("\n"); 48 | } 49 | 50 | public void append(int i){ 51 | bf.append(i + ""); 52 | } 53 | 54 | public void append(String line){ 55 | bf.append(line); 56 | } 57 | 58 | public void append(char c){ 59 | bf.append(c + ""); 60 | } 61 | 62 | 63 | public void outToFile(){ 64 | 65 | try{ 66 | if(bw == null) 67 | this.flush(); 68 | bw.write(bf.toString()); 69 | bw.close(); 70 | }catch(IOException ioe){ 71 | ioe.printStackTrace(); 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Heewook Lee, Carl Kingsford, and Carnegie Mellon University 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /src/Group.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 
5 | */ 6 | import java.util.*; 7 | 8 | /* 9 | * G Group 10 | */ 11 | public class Group{ 12 | 13 | private String hlaGeneName; //A B C DQA1 DQB1 DRB1 etc 14 | 15 | private String groupname; //representatitve name: ex) 01:01:01G 16 | 17 | private HashSet set; 18 | 19 | public String getGroupString(){ 20 | return hlaGeneName + "*" + groupname; 21 | } 22 | 23 | public String getHLAGeneName(){ 24 | return hlaGeneName; 25 | } 26 | 27 | public String getGroupName(){ 28 | return this.hlaGeneName + "*" + this.groupname; 29 | } 30 | 31 | public String getFirstAllele(){ 32 | return this.hlaGeneName + "*" + set.iterator().next(); 33 | } 34 | 35 | public Group(String line, NomG nomG){ 36 | this.set = new HashSet(); 37 | this.process(line, nomG); 38 | } 39 | 40 | public Group(String alleleName){ 41 | this.set = new HashSet(); 42 | this.hlaGeneName = alleleName.substring(0,alleleName.indexOf("*")); 43 | this.groupname = alleleName.substring(alleleName.indexOf("*")+1); 44 | this.set.add(groupname); 45 | } 46 | 47 | public void process(String line, NomG nomG){ 48 | String[] tokens = line.split(";"); 49 | this.hlaGeneName = tokens[0].substring(0,tokens[0].indexOf("*")); 50 | String gName = null; 51 | if(tokens.length == 2 && line.endsWith(";")) 52 | this.groupname = tokens[1]; 53 | else 54 | this.groupname = tokens[2]; 55 | 56 | String[] elements = tokens[1].split("/"); 57 | for(String e : elements){ 58 | this.set.add(e); 59 | nomG.addToAllele2Group(e, this); 60 | } 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/ScoreRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 
5 | */ 6 | import java.util.*; 7 | 8 | public class ScoreRecord{ 9 | 10 | private ArrayList listOfScores; 11 | private int currentBestScore = 0; 12 | 13 | public ScoreRecord(){ 14 | this.listOfScores = new ArrayList(); 15 | } 16 | 17 | public int getCurrentBestPairScore(){ 18 | return this.currentBestScore; 19 | } 20 | 21 | public void sort(int sIndex){ 22 | Score.sortIndex = sIndex; 23 | Collections.sort(listOfScores); 24 | } 25 | 26 | public void addScore(double[] s, int i, int j){ 27 | this.listOfScores.add(new Score(s, i, j)); 28 | } 29 | 30 | public void printBest(int sortIndex){ 31 | this.sort(sortIndex); 32 | double best = Double.NEGATIVE_INFINITY; 33 | for(Score s : this.listOfScores){ 34 | if(s.getNthScore(sortIndex) >= best){ 35 | best = s.getNthScore(sortIndex); 36 | int[] bestIJ = s.getIndicies(); 37 | HLA.log.appendln(">>>>>>>>> BEST PAIR[" + bestIJ[0] + ":" + bestIJ[1] + "]:\t" + s.getNthScore(sortIndex)); 38 | }else 39 | break; 40 | } 41 | } 42 | 43 | public ArrayList getBestPairs(int sortIndex){ 44 | ArrayList bestPairs = new ArrayList(); 45 | this.sort(sortIndex); 46 | double best = Double.NEGATIVE_INFINITY; 47 | for(Score s : this.listOfScores){ 48 | if(s.getNthScore(sortIndex) >= best){ 49 | best = s.getNthScore(sortIndex); 50 | int[] bestIJ = s.getIndicies(); 51 | bestPairs.add(bestIJ); 52 | HLA.log.appendln(">>>>>>>>> BEST PAIR[" + bestIJ[0] + ":" + bestIJ[1] + "]:\t" + s.getNthScore(sortIndex)); 53 | }else 54 | break; 55 | } 56 | return bestPairs; 57 | } 58 | } 59 | 60 | -------------------------------------------------------------------------------- /src/NomG.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 
5 | */ 6 | import java.util.*; 7 | import java.io.*; 8 | 9 | public class NomG{ 10 | 11 | private HashMap> hlagene2groups; 12 | 13 | //private ArrayList groups; 14 | private HashMap allele2Group; 15 | 16 | public NomG(){ 17 | this.hlagene2groups = new HashMap>(); 18 | //this.groups = new ArrayList(); 19 | this.allele2Group = new HashMap(); 20 | } 21 | 22 | 23 | //returns the list of groups belonging to HLA gene 24 | public ArrayList getGroups(String hgn){ 25 | return this.hlagene2groups.get(hgn); 26 | } 27 | 28 | public void loadHlaGene2Groups(String nomGFile){ 29 | BufferedReader br = null; 30 | String curline = null; 31 | try{ 32 | br = new BufferedReader(new FileReader(nomGFile)); 33 | while( (curline=br.readLine()) != null){ 34 | if(curline.charAt(0) != '#'){ 35 | //System.err.println(curline); 36 | Group curgrp = new Group(curline, this); 37 | String curhlagn = curgrp.getHLAGeneName(); 38 | ArrayList groups = this.hlagene2groups.get(curhlagn); 39 | if(groups == null){ 40 | groups = new ArrayList(); 41 | this.hlagene2groups.put(curhlagn, groups); 42 | } 43 | groups.add(new Group(curline, this)); 44 | } 45 | } 46 | br.close(); 47 | }catch(IOException ioe){ 48 | ioe.printStackTrace(); 49 | } 50 | } 51 | 52 | public void addToAllele2Group(String allele, Group g){ 53 | this.allele2Group.put(allele, g); 54 | } 55 | 56 | public String getGroupNameForAllele(String a){ 57 | return this.allele2Group.get(a).getGroupName(); 58 | } 59 | 60 | } 61 | 62 | 63 | -------------------------------------------------------------------------------- /scripts/download_panel.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Part of Kourami HLA typer/assembler 3 | # (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | # See LICENSE for licensing. 5 | # 6 | 7 | #!/bin/bash 8 | 9 | function print_usage { 10 | echo "Usage: $0" 11 | echo "" 12 | echo "Note: This script downloads Kourami reference panel sequences." 
13 | echo " The panel is downloaded in '/db' " 14 | exit 1; 15 | } 16 | 17 | echo "#---------------------------------------------------------------------" 18 | echo "# Run this script to download Kourami panel reference." 19 | echo "#---------------------------------------------------------------------" 20 | echo 21 | 22 | pushd `dirname $0` > /dev/null 23 | SCRIPTD=`pwd` 24 | popd > /dev/null 25 | kourami_home=$SCRIPTD/.. 26 | 27 | urlkouramipanel="https://github.com/Kingsford-Group/kourami/releases/download/v0.9/kouramiDB_3.24.0.tar.gz" 28 | 29 | bwa_bin=`(which bwa)` 30 | 31 | if [ ! -x "$bwa_bin" ];then 32 | echo "Please make sure bwa is installed." 33 | exit 1 34 | fi 35 | 36 | if [ $# -eq 0 ]; then 37 | echo "-----------------------------------------------" 38 | echo "| Downloading Kourami panel ... |" 39 | echo "-----------------------------------------------" 40 | echo 41 | wget -O- $urlkouramipanel | tar xzf - -C $kourami_home 42 | OUT=$? 43 | if [ ! $OUT -eq 0 ];then 44 | echo 45 | echo "-----------------------------------------------" 46 | echo "| Could NOT download Kourami reference panel! |" 47 | echo "-----------------------------------------------" 48 | exit 1 49 | fi 50 | echo "-----------------------------------------------" 51 | echo "| Indexing the Kourami panel ... |" 52 | echo "-----------------------------------------------" 53 | echo 54 | $bwa_bin index $kourami_home/db/All_FINAL_with_Decoy.fa.gz 55 | OUT=$? 56 | if [ ! $OUT -eq 0 ];then 57 | echo 58 | echo "-----------------------------------------------" 59 | echo "| There was a problem indexing Kourami panel |" 60 | echo "-----------------------------------------------" 61 | exit 1 62 | fi 63 | echo 64 | echo "----------------------------------" 65 | echo "| Kourami panel is installed at: |" 66 | echo "----------------------------------" 67 | readlink_bin=`(which readlink)` 68 | if [ ! 
-x "$readlink_bin" ];then 69 | echo "$kourami_home/db" 70 | else 71 | $readlink_bin -f $kourami_home/db 72 | fi 73 | else 74 | print_usage 75 | fi 76 | 77 | -------------------------------------------------------------------------------- /src/TmpPath.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import java.util.ArrayList; 7 | 8 | import org.jgrapht.*; 9 | import org.jgrapht.graph.*; 10 | 11 | 12 | public class TmpPath{ 13 | 14 | private ArrayList orderedNodeList; 15 | 16 | public TmpPath(){ 17 | this.orderedNodeList = new ArrayList(); 18 | } 19 | 20 | public TmpPath(ArrayList l){ 21 | this.orderedNodeList = l; 22 | } 23 | 24 | public TmpPath clone(){ 25 | return new TmpPath(new ArrayList(this.orderedNodeList)); 26 | } 27 | 28 | public boolean merge(TmpPath p, SimpleDirectedWeightedGraph g){ 29 | Node thisTail = this.orderedNodeList.get(this.orderedNodeList.size()-1); 30 | Node pHead = p.getNthNode(0); 31 | 32 | if(g.getEdge(thisTail, pHead) !=null){ 33 | this.orderedNodeList.addAll(p.getOrderedNodeList()); 34 | return true; 35 | } 36 | return false; 37 | } 38 | 39 | public ArrayList getOrderedNodeList(){ 40 | return this.orderedNodeList; 41 | } 42 | 43 | public Path toPath(SimpleDirectedWeightedGraph g){ 44 | Path p = new Path(); 45 | Node cur = null; 46 | Node pre = this.orderedNodeList.get(0); 47 | for(int i=1;i 0) 68 | HLA.log.append("NOT CONTIGUOUS --> ( " + this.orderedNodeList.get(0).toString()); 69 | for(int i=1; i=s2l ? 
s1l : s2l); // updated identity using the lenght of longer sequence as its denominator 27 | this.hit = hit; 28 | this.gGroupName = ggn; 29 | this.perfectMatch = ip; 30 | } 31 | //imperfect match 32 | public Result(int fs, int al, int s1l, int s2l, int il, double id, StringBuffer sb, String hit, String ggn){ 33 | this(fs, al, s1l, s2l, il, id 34 | , sb, hit, ggn, false); 35 | } 36 | //perfect result 37 | public Result(int len, String hit, String ggn){ 38 | this(len, len, len, len, len, 1.0d 39 | , new StringBuffer(""), hit, ggn, true); 40 | } 41 | //perfect result 42 | public Result(int len, HLASequence hs2){ 43 | this(len, len, len, len, len, 1.0d 44 | , new StringBuffer(""), hs2.getSequence(), hs2.getGroup().getGroupString(), true); 45 | } 46 | 47 | public double getPairIdentity(Result other){ 48 | return (this.identicalLen + other.getIdenticalLen())*1.0d / (this.getMaxLen() + other.getMaxLen())*1.0d; 49 | } 50 | 51 | public int getMaxLen(){ 52 | if(this.s1len >= this.s2len) 53 | return this.s1len; 54 | return this.s2len; 55 | } 56 | 57 | public int getS1Len(){ 58 | return this.s1len; 59 | } 60 | 61 | public int getS2Len(){ 62 | return this.s2len; 63 | } 64 | 65 | public String getGGroupName(){ 66 | return this.gGroupName; 67 | } 68 | 69 | public boolean isPerfect(){ 70 | return this.isPerfect(); 71 | } 72 | 73 | public String getHit(){ 74 | return this.hit; 75 | } 76 | 77 | public String toAlignmentString(){ 78 | return this.outputBuffer.toString(); 79 | } 80 | 81 | public String toString(){ 82 | return finaScore + "\t" + alignedLen + "\t"+ s1len + "\t"+ s2len + "\t"+ identicalLen + "\t"+ identity; 83 | } 84 | 85 | public double getIdentity(){ 86 | return this.identity; 87 | } 88 | 89 | public int getIdenticalLen(){ 90 | return this.identicalLen; 91 | } 92 | 93 | public int getAlignedLen(){ 94 | return this.alignedLen; 95 | } 96 | 97 | public int getScore(){ 98 | return this.finaScore; 99 | } 100 | } 101 | 
#!/bin/bash
# -------------------------------------------------------------------------------- /scripts/download_grch38.sh: --------------------------------------------------------------------------------
#
# Part of Kourami HLA typer/assembler
# (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University.
# See LICENSE for licensing.
#
# Downloads one GRCh38 analysis set from the NCBI FTP server, decompresses it
# into ../resources/<set>.fa (relative to this script) and, for the "H" sets,
# appends the HLA sequences from resources/hla_bwa.kit.fna.gz.
#
# The shebang must be the very first line of the file: the script relies on
# bash-only features (function keyword, pushd/popd, echo -e).

echo "#---------------------------------------------------------------------"
echo "# This script is adopted from run-gen-ref in bwa.kit 0.7.12 by Heng Li"
echo "# available from https://github.com/lh3/bwa/tree/master/bwakit"
echo "#"
echo "# run this to download GRCh38 reference."
echo "#"
echo "#---------------------------------------------------------------------"
echo

# Resolve the directory holding this script, so the resources path does not
# depend on the caller's working directory.
pushd "$(dirname "$0")" > /dev/null
SCRIPTD=$(pwd)
popd > /dev/null
resources_dir="$SCRIPTD/../resources"

url38NoAltDecoy="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna.gz"
url38d="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set.fna.gz"
url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz"

# Prints the help text and terminates the script with exit status 1.
function print_usage {
    echo "Usage: $0 <hs38|hs38D|hs38DH|hs38NoAltDH>"
    echo ""
    echo " The file containting the HLA sequences is located at resources/hla_bwa.kit.fna.gz"
    echo ""
    echo "Analysis sets:"
    echo " hs38 primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV"
    echo " hs38D hs38 + decoy contigs"
    echo " hs38DH hs38 + ALT contigs + decoy contigs + HLA genes (recommended for GRCh38 mapping)"
    echo " hs38NoAltDH hs38 + decoy contigs + HLA alleles from [bwa.kit] (recommended for HLA typing)"
    echo ""
    echo "Note: This script downloads human reference genomes from NCBI ftp server"
    exit 1;
}
#if [ $# -eq 2 ]; then
#    if [ ! -e $2 ]; then
#	echo "ERROR: $2 NOT FOUND!"
#	print_usage
#    fi
#fi

# Exactly one argument (the analysis set name) is required.
if [ $# -ne 1 ]; then
    print_usage
fi

case "$1" in
    hs38DH)
        # Only this set refuses to overwrite an existing FASTA: if the file is
        # already present the usage text is printed and the script exits 1.
        if [ ! -e "$resources_dir/$1.fa" ]; then
            (wget -O- "$url38d" | gzip -dc; gzip -dc "$resources_dir/hla_bwa.kit.fna.gz") > "$resources_dir/$1.fa"
        else
            print_usage
        fi
        ;;
    hs38)
        (wget -O- "$url38" | gzip -dc) > "$resources_dir/$1.fa"
        ;;
    hs38D)
        # NOTE(review): this fetches the "full_plus_hs38d1" set (same URL as
        # hs38DH), while the usage text describes hs38D as "hs38 + decoy
        # contigs"; confirm whether the no_alt_plus_hs38d1 URL was intended.
        (wget -O- "$url38d" | gzip -dc) > "$resources_dir/$1.fa"
        ;;
    hs38NoAltDH)
        (wget -O- "$url38NoAltDecoy" | gzip -dc; gzip -dc "$resources_dir/hla_bwa.kit.fna.gz") > "$resources_dir/$1.fa"
        ;;
    *)
        echo "ERROR: unknown genome build"
        echo
        print_usage
        ;;
esac

# Remind the user to index the reference. Written as an if-statement so the
# script exits 0 when the index already exists.
if [ ! -f "$resources_dir/$1.fa.bwt" ]; then
    echo -e "\nPlease run 'bwa index $resources_dir/$1.fa'...\n"
fi
# --- repository-dump text that shared the original line, kept verbatim ---
# 85 | -------------------------------------------------------------------------------- /src/Node.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing.
5 | */ 6 | //import java.util.HashSet; 7 | 8 | public class Node{ 9 | 10 | public Node(char b, int ci){ 11 | this.base = Character.toUpperCase(b); 12 | this.colIndex = ci; 13 | this.iBase = this.base2Index(); 14 | //this.rHash = new HashSet(); 15 | //moved rHash to CustomWeightedEdge 16 | } 17 | 18 | public Node(int ib, int ci){ 19 | 20 | } 21 | 22 | //moved rHash to CustomWeightedEdge 23 | /* 24 | public void addAllReadsFrom(HashSet otherRHash){ 25 | this.rHash.addAll(otherRHash); 26 | } 27 | 28 | public HashSet getReadHashSet(){ 29 | return this.rHash; 30 | }*/ 31 | 32 | public Node(Base b){ 33 | this(b.getBase(), b.getColPos()); 34 | } 35 | 36 | public int getIBase(){ 37 | return this.iBase; 38 | } 39 | 40 | public char getBase(){ 41 | return this.base; 42 | } 43 | 44 | public Character getBaseObj(){ 45 | return new Character(this.base); 46 | } 47 | 48 | public int getColIndex(){ 49 | return this.colIndex; 50 | } 51 | 52 | public void setColIndex(int ni){ 53 | this.colIndex = ni; 54 | } 55 | 56 | public boolean equals(Node other){ 57 | if(this.base == other.getBase()){ 58 | if(this.colIndex == other.getColIndex()) 59 | return true; 60 | } 61 | return false; 62 | } 63 | 64 | public int base2Index(){ 65 | if(this.base == 'A' || this.base == 'a') 66 | return 0; 67 | else if(this.base == 'C' || this.base == 'c') 68 | return 1; 69 | else if(this.base == 'G' || this.base == 'g') 70 | return 2; 71 | else if(this.base == 'T' || this.base == 't') 72 | return 3; 73 | else if(this.base == '-' || this.base == '.') 74 | return 4; 75 | else 76 | return 5; 77 | } 78 | 79 | public String toString(){ 80 | return "[" + base + "," + colIndex + "]"; 81 | } 82 | 83 | /* 84 | public void incrementNumPathInBubbleFwd(int inc){ 85 | this.numPathInBubbleFwd += inc; 86 | } 87 | 88 | public void incrementNumPathInBubbleRev(int inc){ 89 | this.numPathInBubbleFwd += inc; 90 | } 91 | 92 | public void setNumInBubbleFwd(int n){ 93 | this.numPathInBubbleFwd = n; 94 | } 95 | 96 | public void 
setNumInBubbleRev(int n){ 97 | this.numPathInBubbleRev = n; 98 | } 99 | 100 | public int getNumInBubbleFwd(){ 101 | return this.numPathInBubbleFwd; 102 | } 103 | 104 | public int getNumInBubbleRev(){ 105 | return this.numPathInBubbleRev; 106 | } 107 | 108 | public void initBubblePathsCounters(){ 109 | this.numPathInBubbleFwd = 0; 110 | this.numPathInBubbleRev = 0; 111 | } 112 | */ 113 | private char base; 114 | private int iBase; 115 | private int colIndex; 116 | 117 | private int numPathInBubbleFwd; 118 | private int numPathInBubbleRev; 119 | 120 | //moved rHash to CustomWeightedEdge 121 | /* 122 | public void addRead(int readNum){ 123 | this.rHash.add(new Integer(readNum)); 124 | } 125 | 126 | private HashSet rHash; 127 | */ 128 | } 129 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 5 | 4.0.0 6 | 7 | kingsford-group 8 | Kourami 9 | 0.9.4 10 | 11 | 12 | org.jgrapht 13 | jgrapht-core 14 | 0.9.1 15 | 16 | 17 | com.github.samtools 18 | htsjdk 19 | 2.1.1 20 | 21 | 22 | commons-cli 23 | commons-cli 24 | 1.4 25 | 26 | 27 | it.unimi.dsi 28 | fastutil 29 | 7.0.13 30 | 31 | 32 | org.jgrapht 33 | jgrapht-ext 34 | 0.9.1 35 | 36 | 37 | org.jgrapht 38 | jgrapht-demo 39 | 0.9.1 40 | 41 | 42 | 43 | install 44 | ${basedir}/target 45 | ${project.artifactId} 46 | ${basedir}/src/ 47 | ${basedir}/target/classes 48 | 49 | 50 | org.apache.maven.plugins 51 | maven-jar-plugin 52 | 2.4 53 | 54 | 55 | 56 | true 57 | HLA 58 | dependency-jars/ 59 | 60 | 61 | 62 | 63 | 64 | org.apache.maven.plugins 65 | maven-dependency-plugin 66 | 2.5.1 67 | 68 | 69 | copy-dependencies 70 | package 71 | 72 | copy-dependencies 73 | 74 | 75 | 76 | ${project.build.directory}/dependency-jars/ 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | . 
86 | 87 | README.md 88 | LICENSE 89 | 90 | 91 | 92 | 93 | 94 | UTF-8 95 | 1.8 96 | 1.8 97 | 98 | 99 | Kingsford Group 100 | http://kingsfordlab.cbd.cmu.edu 101 | 102 | 103 | 104 | heewookl 105 | Heewook Lee 106 | heewookl@cs.cmu.edu 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /src/DNAString.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import java.util.*; 7 | 8 | public class DNAString{ 9 | 10 | private String hlaGeneName; 11 | private int sbnum; //superbubble number 12 | private ArrayList pathnums; // path number 13 | private double weightedIntersectionSum; 14 | private double probability; 15 | 16 | private StringBuffer sequence; 17 | 18 | public DNAString deepCopy(){ 19 | return new DNAString(this.hlaGeneName, this.sbnum, this.pathnums 20 | , this.weightedIntersectionSum, this.probability 21 | , this.sequence.toString()); 22 | } 23 | 24 | public DNAString mergeDeep(DNAString other){ 25 | DNAString td = this.deepCopy(); 26 | return td.merge(other); 27 | } 28 | 29 | private DNAString merge(DNAString other){ 30 | if(this.hlaGeneName.equals(other.getHlaGeneName())){ 31 | for(Integer i: other.getPathnums()) 32 | this.pathnums.add(i); 33 | this.weightedIntersectionSum += other.getWeightedIntersectionSum(); 34 | this.probability = this.probability * other.getProbability(); 35 | this.sequence.append(other.getSequence()); 36 | }else{ 37 | System.err.println("[DNAString.merge(DNAString other)]HLA Gene Names don't match:\t" 38 | +this.hlaGeneName + "\t" + other.getHlaGeneName()); 39 | } 40 | return this; 41 | } 42 | 43 | public String pathnums2String(){ 44 | StringBuffer bf = new StringBuffer(); 45 | int count = 0; 46 | for(Integer i: this.pathnums){ 47 | if(count > 0) 48 | bf.append(":"); 49 | 
bf.append(i.intValue()); 50 | count++; 51 | } 52 | return bf.toString(); 53 | } 54 | 55 | public StringBuffer toFasta(){ 56 | StringBuffer bf = new StringBuffer(">"+ this.hlaGeneName + "_" + sbnum + "_" 57 | + pathnums2String() + "\t" 58 | + this.weightedIntersectionSum + "\t" + this.probability + "\n"); 59 | bf.append(sequence.toString() + "\n"); 60 | return bf; 61 | } 62 | 63 | public DNAString(String hgn, int sbn, int pn, double wis, double p){ 64 | this.hlaGeneName = hgn; 65 | this.sbnum = sbn; 66 | this.pathnums = new ArrayList(); 67 | this.pathnums.add(pn); 68 | this.weightedIntersectionSum = wis; 69 | this.probability = p; 70 | this.sequence = new StringBuffer(); 71 | } 72 | 73 | public DNAString(String hgn, int sbn, int pn, double wis, double p, String seq){ 74 | this.hlaGeneName = hgn; 75 | this.sbnum = sbn; 76 | this.pathnums = new ArrayList(); 77 | this.pathnums.add(pn); 78 | this.weightedIntersectionSum = wis; 79 | this.probability = p; 80 | this.sequence = new StringBuffer(); 81 | this.sequence.append(seq); 82 | } 83 | 84 | public DNAString(String hgn, int sbn, ArrayList pns, double wis, double p, String seq){ 85 | this.hlaGeneName = hgn; 86 | this.sbnum = sbn; 87 | this.pathnums = new ArrayList(); 88 | for(Integer i : pns) 89 | this.pathnums.add(i); 90 | this.weightedIntersectionSum = wis; 91 | this.probability = p; 92 | this.sequence = new StringBuffer(); 93 | this.sequence.append(seq); 94 | } 95 | 96 | public void append(String seg){ 97 | this.sequence.append(seg); 98 | } 99 | 100 | public String getHlaGeneName(){ 101 | return this.hlaGeneName; 102 | } 103 | 104 | public int getSbnum(){ 105 | return this.sbnum; 106 | } 107 | 108 | public ArrayList getPathnums(){ 109 | return this.pathnums; 110 | } 111 | 112 | public int getFirstPathnum(){ 113 | return this.pathnums.get(0).intValue(); 114 | } 115 | 116 | public double getWeightedIntersectionSum(){ 117 | return this.weightedIntersectionSum; 118 | } 119 | 120 | public double getProbability(){ 121 | 
return this.probability; 122 | } 123 | 124 | public String getSequence(){ 125 | return this.sequence.toString(); 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/Needle.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import java.io.*; 7 | 8 | /* simple implementation of Needleman-Wunsch alignment*/ 9 | public class Needle{ 10 | 11 | public static Result run(String s1, HLASequence hs2){ 12 | String s2 = hs2.getSequence(); 13 | if(s1.equals(s2)){ 14 | return new Result(s1.length(), hs2); 15 | } 16 | return runNeedle(s1, s2, hs2); 17 | } 18 | 19 | private static Result runNeedle(String s1, String s2, HLASequence hs2){ 20 | int[][] dptable = new int[s1.length() + 1][s2.length() + 1]; 21 | StringBuffer output = new StringBuffer(); 22 | /* init dp table*/ 23 | for(int i=1; i<=s1.length(); i++){ 24 | dptable[i][0] = dptable[i-0][0] - 1; 25 | if(printtable) 26 | output.append(dptable[i][0] + " "); 27 | } 28 | if(printtable) 29 | output.append("\n"); 30 | for(int i=1; i<=s2.length(); i++) 31 | dptable[0][i] = dptable[0][i-0] - 1; 32 | 33 | /* fill table */ 34 | for(int i=1; i<=s1.length(); i++){ 35 | for(int j=1; j<=s2.length(); j++){ 36 | if(printtable) 37 | output.append(dptable[0][j] + " "); 38 | char s1char = s1.charAt(i-1); 39 | char s2char = s2.charAt(j-1); 40 | int match = dptable[i-1][j-1] + (s1char == s2char ? 
sMatch : sMismatch); 41 | int ins = dptable[i][j-1] + sGap; 42 | int del = dptable[i-1][j] + sGap; 43 | dptable[i][j] = Math.max(match, Math.max(ins, del)); 44 | if(printtable) 45 | output.append(dptable[i][j] + " "); 46 | } 47 | if(printtable) 48 | output.append("\n"); 49 | } 50 | 51 | /* traceback */ 52 | StringBuffer sa1 = new StringBuffer(); 53 | StringBuffer sa2 = new StringBuffer(); 54 | int i = s1.length(); 55 | int j = s2.length(); 56 | int stepcost = 0; 57 | int identityLen = 0; 58 | int alignLen = 0; 59 | while( i > 0 || j > 0){ 60 | char s1char = '0'; 61 | char s2char = '0'; 62 | if(i > 0) 63 | s1char = s1.charAt(i-1); 64 | if(j > 0) 65 | s2char = s2.charAt(j-1); 66 | boolean match = (s1char == s2char); 67 | 68 | //if match 69 | if( i > 0 && j > 0 && 70 | dptable[i][j] == dptable[i-1][j-1] + (match ? sMatch : sMismatch)){ 71 | if(match) 72 | identityLen++; 73 | sa1.append(s1char); 74 | sa2.append(s2char); 75 | i--; 76 | j--; 77 | } 78 | //del 79 | else if( i > 0 && 80 | dptable[i][j] == dptable[i-1][j] + sGap){ 81 | sa1.append(s1char); 82 | sa2.append("-"); 83 | i--; 84 | } 85 | //ins 86 | else if( j > 0 && 87 | dptable[i][j] == dptable[i][j-1] + sGap){ 88 | sa1.append("-"); 89 | sa2.append(s2char); 90 | j--; 91 | } 92 | } 93 | output.append("s1: " + sa1.reverse().toString() + "\n"); 94 | output.append("s2: " + sa2.reverse().toString() + "\n"); 95 | int score = dptable[s1.length()][s2.length()]; 96 | int alignedLen = sa1.length(); 97 | double identity = identityLen*1.0d / (s2.length()); 98 | //double identityWithLongerDenom = identityLen*1.9d/(s2.length() >= s1.length() ? 
/**
 * One cell of an aligned allele sequence: the character itself, its integer
 * code, its base/column coordinates and its intron/exon annotation.
 */
public class Base{

    public char base; // character as given (upper or lower case A/C/G/T, gap symbol, or other)
    public int iBase; // integer code: 0=A 1=C 2=G 3=T 4=gap, 5 for others
    public int basePos; //1-based basePosition
    public int colPos; //1-based columnPosition
    public int base2colOffset;
    public boolean exon;
    public int intronExonNumber; //1-based --> if exon, and intronExonNumber is 1 : Exon1
    public int frame;

    /** Fully explicit constructor: every field is taken from the arguments. */
    public Base(char b, int i, int bp, int cp, int b2co, boolean e, int ien, int f){
        this.base = b;
        this.iBase = i;
        this.basePos = bp;
        this.colPos = cp;
        this.base2colOffset = b2co;
        this.exon = e;
        this.intronExonNumber = ien;
        this.frame = f;
    }

    /** Derives the integer code from {@code b} and sets the frame to -1. */
    public Base(char b, int bp, int cp, int b2co, boolean e, int ien){
        this(b, Base.char2ibase(b), bp, cp, b2co, e, ien, -1);
    }

    /** @return a field-by-field copy of this Base */
    public Base deepCopy(){
        return new Base(this.base, this.iBase, this.basePos, this.colPos
                        , this.base2colOffset, this.exon, this.intronExonNumber, this.frame);
    }

    /**
     * @return a copy whose base position is lowered by {@code offsetDifference}
     *         and whose base-to-column offset is raised by the same amount
     */
    public Base deepCopyWithOffset(int offsetDifference){
        int shiftedBasePos = this.basePos - offsetDifference;
        int shiftedOffset = this.base2colOffset + offsetDifference;
        return new Base(this.base, this.iBase, shiftedBasePos, this.colPos
                        , shiftedOffset, this.exon, this.intronExonNumber, this.frame);
    }

    public String toString(){
        StringBuilder sb = new StringBuilder();
        sb.append("b:").append(this.base);
        sb.append("|ib:").append(this.iBase);
        sb.append("|bp:").append(this.basePos);
        sb.append("|cp:").append(this.colPos);
        sb.append("|b2co:").append(this.base2colOffset);
        sb.append("|exon?:").append(this.exon);
        sb.append("|ieNum:").append(this.intronExonNumber);
        sb.append("|frame:").append(this.frame);
        return sb.toString();
    }

    /** Integer code to character: 0..3 give A,C,G,T; 4 gives '.'; anything else 'N'. */
    public static char ibase2char(int i){
        switch(i){
        case 0:  return 'A';
        case 1:  return 'C';
        case 2:  return 'G';
        case 3:  return 'T';
        case 4:  return '.';
        default: return 'N';
        }
    }

    /** Character to integer code (case-insensitive); '.' and '-' give 4, anything else 5. */
    public static int char2ibase(char c){
        switch(c){
        case 'A': case 'a': return 0;
        case 'C': case 'c': return 1;
        case 'G': case 'g': return 2;
        case 'T': case 't': return 3;
        case '.': case '-': return 4;
        default:            return 5;
        }
    }

    /** @return true when {@code ib} equals this base's integer code */
    public boolean isMatch(int ib){
        return ib == this.iBase;
    }

    /**
     * @return true when the stored character equals {@code b} or the
     *         upper-cased {@code b}
     */
    public boolean isMatch(char b){
        return this.base == b || this.base == Character.toUpperCase(b);
    }

    /** @return true for codes 0..3 and 5 (i.e. everything except the gap code) */
    public static boolean isBase(int ib){
        return (ib > -1 && ib < 4) || ib == 5;
    }

    /** Same test as {@link #isBase(int)} applied to this base's own code. */
    public boolean isBase(){
        return (this.iBase > -1 && this.iBase < 4) || this.iBase == 5;
    }

    /** @return true for A, C, G, T or N in either case */
    public static boolean isBase(char base){
        switch(base){
        case 'A': case 'C': case 'G': case 'T':
        case 'a': case 'c': case 'g': case 't':
        case 'N': case 'n':
            return true;
        default:
            return false;
        }
    }

    /** @return true when {@code ib} is the gap code (4) */
    public static boolean isGap(int ib){
        return ib == 4;
    }

    /** @return true only for '.' */
    public static boolean isGap(char base){
        return base == '.';
    }

    public char getBase(){
        return this.base;
    }

    public int getIBase(){
        return this.iBase;
    }

    /** @return the stored character, upper-cased and boxed */
    public Character getBaseUpperObj(){
        char upper = Character.toUpperCase(this.base);
        return Character.valueOf(upper);
    }

    public int getBasePos(){
        return this.basePos;
    }

    public int getColPos(){
        return this.colPos;
    }

    public int base2colOffset(){
        return this.base2colOffset;
    }

    public int getFrame(){
        return this.frame;
    }

    public int getIntronExonNumber(){
        return this.intronExonNumber;
    }

    public boolean isExon(){
        return this.exon;
    }
}
return this.exon; 165 | } 166 | } 167 | 168 | -------------------------------------------------------------------------------- /src/CustomHashMap.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import it.unimi.dsi.fastutil.ints.Int2IntMap; 7 | import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; 8 | import it.unimi.dsi.fastutil.objects.ObjectIterator; 9 | import it.unimi.dsi.fastutil.ints.IntIterator; 10 | 11 | public class CustomHashMap extends Int2IntOpenHashMap{ 12 | 13 | 14 | public CustomHashMap(){ 15 | super(); 16 | this.defaultReturnValue(-1);//default phred value of -1 to indicate no entry in the hash. 17 | } 18 | 19 | public CustomHashMap(int expected){ 20 | super(expected); 21 | } 22 | 23 | public CustomHashMap(int[] k, int[] v){ 24 | super(k, v); 25 | } 26 | 27 | public CustomHashMap(int[] k, int[] v, float f){ 28 | super(k, v, f); 29 | } 30 | 31 | public CustomHashMap(Int2IntMap m){ 32 | super(m); 33 | } 34 | 35 | public CustomHashMap(Int2IntMap m, float f){ 36 | super(m, f); 37 | } 38 | 39 | public CustomHashMap(int expected, float f){ 40 | super(expected, f); 41 | } 42 | 43 | public void printReads(){ 44 | IntIterator itr = this.keySet().iterator(); 45 | HLA.log.append("{"); 46 | while(itr.hasNext()) 47 | HLA.log.append(" (" +itr.nextInt() + ") "); 48 | HLA.log.appendln("}"); 49 | 50 | } 51 | 52 | /* performs union of two HashMap based on Keys */ 53 | public boolean union(CustomHashMap other){ 54 | boolean modified = false; 55 | for(Int2IntMap.Entry otherEntry : other.int2IntEntrySet()){ 56 | if(!this.containsKey(otherEntry.getIntKey())){ 57 | this.put(otherEntry.getIntKey(), otherEntry.getIntValue()); 58 | modified = true; 59 | } 60 | } 61 | return modified; 62 | } 63 | 64 | /* performs intersection of two HashMap based on Keys */ 65 | public 
boolean intersection(CustomHashMap other){ 66 | boolean modified = false; 67 | IntIterator itr = this.keySet().iterator(); 68 | while(itr.hasNext()){ 69 | int curInt = itr.nextInt(); 70 | if(!other.containsKey(curInt)){ 71 | itr.remove(); 72 | modified = false; 73 | } 74 | } 75 | return modified; 76 | } 77 | 78 | /* 79 | * Performs a special operation for paired-end 80 | * Keys of pairing reads are additive inverse to each other. 81 | * Elements are kepts as long as one additive inverse is in each set. 82 | * 83 | */ 84 | public boolean intersectionPE(CustomHashMap other){ 85 | boolean modified = false; 86 | 87 | IntIterator itr = this.keySet().iterator(); 88 | while(itr.hasNext()){ 89 | int key = itr.nextInt(); 90 | int keyInv = 0 - key; 91 | if(!other.containsKey(key) && !other.containsKey(keyInv)){ 92 | itr.remove(); 93 | modified = true; 94 | } 95 | } 96 | 97 | for(int otherKey : other.keySet()){ 98 | int otherKeyInv = 0-otherKey; 99 | boolean intst = this.containsKey(otherKey); //does this contain otherKey? 100 | boolean intstPrime = this.containsKey(otherKeyInv); //does this contain -(otherkey)? 
101 | 102 | if(!intst && intstPrime){ 103 | this.put(otherKey, other.get(otherKey)); 104 | modified = true; 105 | } 106 | } 107 | return modified; 108 | } 109 | 110 | 111 | //union of this and (intersectionPE of this and other) 112 | //this just add PE reads of other that is missing in this readset 113 | public boolean addPEReads(CustomHashMap other){ 114 | boolean modified = false; 115 | for(int otherKey : other.keySet()){ 116 | int otherKeyInv = 0-otherKey; 117 | boolean intst = this.containsKey(otherKey); 118 | boolean intstPrime = this.containsKey(otherKeyInv); 119 | if(!intst && intstPrime){ 120 | this.put(otherKey, other.get(otherKey)); 121 | modified = true; 122 | } 123 | } 124 | return modified; 125 | } 126 | 127 | 128 | public boolean removeAll(CustomHashMap other){ 129 | boolean modified = false; 130 | IntIterator itr = other.keySet().iterator(); 131 | int curKey; 132 | while(itr.hasNext()){ 133 | curKey = itr.next(); 134 | if(this.containsKey(curKey)){ 135 | this.remove(curKey); 136 | modified = true; 137 | } 138 | } 139 | return modified; 140 | } 141 | 142 | public boolean addAll(CustomHashMap other){ 143 | return this.union(other); 144 | } 145 | 146 | public boolean retainAll(CustomHashMap other){ 147 | return this.intersection(other); 148 | } 149 | 150 | /* public CustomHashMap deepCopy(){ 151 | CustomHashMap tmp = new CustomHashMap(); 152 | for(Int2IntMap.Entry thisEntry : this.int2IntEntrySet()) 153 | tmp.put(thisEntry.getIntKey(), thisEntry.getIntValue()); 154 | return tmp; 155 | }*/ 156 | 157 | public CustomHashMap clone(){ 158 | return (CustomHashMap)super.clone(); 159 | } 160 | 161 | public void printKeys(){ 162 | IntIterator itr = this.keySet().iterator(); 163 | HLA.log.append("{"); 164 | if(this.size() > 0) 165 | HLA.log.append(itr.nextInt()); 166 | while(itr.hasNext()) 167 | HLA.log.append("," + itr.nextInt()); 168 | HLA.log.appendln("}"); 169 | } 170 | } 171 | -------------------------------------------------------------------------------- 
/src/BubblePathLikelihoodScores.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import it.unimi.dsi.fastutil.ints.IntArrayList; 7 | 8 | public class BubblePathLikelihoodScores{ 9 | 10 | private double[][] logFractionScores; // this is just based on the number of reads and the total numeber of reads for this bubble. 11 | 12 | private double[][] logScores; //only the upper triangle is used 13 | //private int[] maxHomoLogScoresIndicies; 14 | private int maxHomoGenotypeIndex; 15 | private int maxHeteroGenotypeIndex1; 16 | private int maxHeteroGenotypeIndex2; 17 | 18 | private int doubleCountH1; //2 x #reads assigned to H1 19 | private int doubleCountH2; //2 x #reads assigned to H2 20 | 21 | // must be i<=j (upper triangle only) 22 | public double getLogScore(int i, int j){ 23 | return this.logScores[i][j]; 24 | } 25 | 26 | // must be i<=j (upper triangle only) 27 | public double getLogFractionScore(int i, int j){ 28 | return this.logFractionScores[i][j]; 29 | } 30 | 31 | public double[][] getLogScores(){ 32 | return this.logScores; 33 | } 34 | 35 | public double[][] getLogFractionScores(){ 36 | return this.logFractionScores; 37 | } 38 | 39 | public int getMaxHomoGenotypeIndex(){ 40 | return this.maxHomoGenotypeIndex; 41 | } 42 | 43 | public double getMaxHomoScore(){ 44 | return this.logScores[this.maxHomoGenotypeIndex][this.maxHomoGenotypeIndex]; 45 | } 46 | 47 | public int[] getMaxHeteroIndex(){ 48 | int[] tmp = new int[2]; 49 | tmp[0] = this.maxHeteroGenotypeIndex1; 50 | tmp[1] = this.maxHeteroGenotypeIndex2; 51 | return tmp; 52 | } 53 | 54 | public int getMaxHeteroGenotypeIndex1(){ 55 | return this.maxHeteroGenotypeIndex1; 56 | } 57 | 58 | public int getMaxHeteroGenotypeIndex2(){ 59 | return this.maxHeteroGenotypeIndex2; 60 | } 61 | 62 | public double 
getMaxHeteroScore(){ 63 | return this.logScores[this.maxHeteroGenotypeIndex1][this.maxHeteroGenotypeIndex2]; 64 | } 65 | 66 | public int getDoubleCountH1(){ 67 | return this.doubleCountH1; 68 | } 69 | 70 | public int getDoubleCountH2(){ 71 | return this.doubleCountH2; 72 | } 73 | 74 | public BubblePathLikelihoodScores(int numPaths){ 75 | this.logFractionScores = new double[numPaths][numPaths]; 76 | this.logScores = new double[numPaths][numPaths]; 77 | for(int i=0;i 1) 87 | this.maxHeteroGenotypeIndex2 = 1; 88 | else 89 | this.maxHeteroGenotypeIndex2 = 0; 90 | } 91 | 92 | // since we are using UPPER triangle only j >= i 93 | public void updateMax(int i, int j, double logScore, double readFractionScore, int dcH1, int dcH2){ 94 | if(j this.logScores[this.maxHeteroGenotypeIndex1][this.maxHeteroGenotypeIndex2]){ 109 | this.maxHeteroGenotypeIndex1 = i; 110 | this.maxHeteroGenotypeIndex2 = j; 111 | this.doubleCountH1 = dcH1; 112 | this.doubleCountH2 = dcH2; 113 | 114 | if(HLA.DEBUG){ 115 | HLA.log.appendln("Updating Max Hetero Counts: curMax:" + this.logScores[this.maxHeteroGenotypeIndex1][this.maxHeteroGenotypeIndex2]); 116 | HLA.log.appendln("\t|H1|x2=" + doubleCountH1 + "\t|H2|x2=" + doubleCountH2); 117 | } 118 | } 119 | }else{/* Homozygous */ 120 | if(HLA.DEBUG) 121 | HLA.log.appendln("[HOMO]"); 122 | if(logScore > this.logScores[this.maxHomoGenotypeIndex][this.maxHomoGenotypeIndex]) 123 | this.maxHomoGenotypeIndex = i; 124 | 125 | } 126 | this.logScores[i][j] = logScore; 127 | this.logFractionScores[i][j] = readFractionScore; 128 | 129 | } 130 | 131 | public void applyRemoval(IntArrayList sortedRemovalList){ 132 | for(int i=sortedRemovalList.size() -1; i>=0; i--) 133 | this.remove(sortedRemovalList.getInt(i)); 134 | } 135 | 136 | //removes a path so we update the logScores and all indicies 137 | private void remove(int n){ 138 | double[][] newLogScores = new double[this.logScores.length][this.logScores.length]; 139 | for(int i=0;i n) 155 | this.maxHomoGenotypeIndex = 
#!/bin/bash
#
# Part of Kourami HLA typer/assembler
# (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University.
# See LICENSE for licensing.
#
# Formats an IMGT/HLA alignments directory into a Kourami database:
#   1. runs FormatIMGT from Kourami.jar on the alignments directory
#   2. concatenates the merged per-gene FASTA files with the decoy sequences
#      into All_FINAL_with_Decoy.fa.gz
#   3. bwa-indexes that panel (if bwa is on PATH)
#
# The shebang must stay on the first line; bash is required ([[ ]], arrays,
# here-strings).

SCRIPTD="$(cd "$(dirname "$0")" && pwd)"

jvm_memory=4G
resource_dir="$SCRIPTD/../resources"
db_base="$SCRIPTD/../custom_db"
imgt_ver_num=0
me=$(basename "$0")

# Locate Kourami.jar: target/ first, then build/, then the installation root.
kourami=""
for candidate in "$SCRIPTD/../target/Kourami.jar" \
                 "$SCRIPTD/../build/Kourami.jar" \
                 "$SCRIPTD/../Kourami.jar"; do
    if [ -e "$candidate" ]; then
        kourami=$candidate
        break
    fi
done
if [ -z "$kourami" ]; then
    echo "Could not find Kourami.jar. The jar must be located under target, build or kourami installation directory."
    exit 1
fi

function usage {
    echo "IMGT/HLA DB formatter for Kourami"
    echo "Note: Run this script to generate Kourami-formatted IMGT/HLA gene-wise MSA"
    echo "      and the reference panel sequences for read alignment"
    echo
    echo "USAGE: /$me -i [IMGT/HLA alignments dir] "
    echo
    echo "------------------ Required Parameters -----------------"
    echo " -i [input_dir]   : path to IMGT/HLA alignments directory "
    echo
    echo "------------------ Optional Parameters -----------------"
    echo " -v [ver_number]  : version number is automatically taken from "
    echo "                    IMGT/HLA alignment files. If specified, it "
    echo "                    overrides the version in IMGT/HLA alignment"
    echo "                    files."
    echo " -o [output_dir]  : name of the directory the output will be "
    echo "                  : written to. Output is written to [ver_number]"
    echo "                    directory under [output_dir]. "
    echo "                    (default : custom_db under Kourami installation.)"
    echo " -h               : print this message."
    echo
    exit 1
}

# print usage when no argument is given
if [ $# -lt 1 ]; then
    usage
fi

# Leading ':' puts getopts in silent mode so that $OPTARG holds the offending
# option letter in the '?' and ':' branches.
while getopts ":i:v:o:h" FLAG; do
    case $FLAG in
        i)
            input_msa=$OPTARG
            nomg=$input_msa/hla_nom_g.txt
            ;;
        v)
            # 0 is reserved internally for "detect the version automatically"
            if [ "$OPTARG" = "0" ]; then
                echo "Invalid version number: $OPTARG"
                usage
            fi
            imgt_ver_num=$OPTARG
            ;;
        o)
            db_base=$OPTARG
            ;;
        h)
            usage
            ;;
        :)
            echo "Option -$OPTARG requires an argument. See usage:"
            usage
            ;;
        \?)
            echo "Unrecognized option -$OPTARG. See usage:"
            usage
            ;;
    esac
done

if [[ -z "$input_msa" ]];then
    echo
    echo "ERROR: IMGT/HLA alignments directory must be provided."
    echo
    usage
fi


if [ ! -e "$nomg" ];then
    echo
    echo "ERROR: Missing $nomg. hla_nom_g.txt must be downloaded from the same IMGT/HLA release and be placed in the alignments directory [input]"
    echo
    usage
fi

shift $((OPTIND-1))

if [ $# -gt 0 ]; then
    echo "Unrecognized paramester(s) $@"
    usage
fi

if [ ! -e "$resource_dir/HLA_decoys.fa" ];then
    echo "Missing decoy sequence for Kourami panel in the resrouce directory. Please git pull or clone"
    exit 1
fi

# DRB5_gen.txt is not shipped in IMGT/HLA releases; copy our own into the input directory
if [ ! -e "$resource_dir/DRB5_gen.txt" ];then
    echo "Missing DRB5_gen.txt in the resource directory. Please git pull or git clone"
    exit 1
else
    cp "$resource_dir/DRB5_gen.txt" "$input_msa/."
fi

mkdir -p "$db_base"
OUT=$?
if [ ! $OUT -eq 0 ];then
    echo "   Cannot create $db_base."
    exit 1
fi

logfilen=$(date +kourami_formatIMGT.%H%M%S%m%d%y.log)
logfile=$db_base/$logfilen

echo ">>>>>>>>>>>>>> IMGT/HLA DB --> Kourami formatted DB/panel"
# stderr of the formatter is the log; its first line is read back below
java -Xmx$jvm_memory -cp "$kourami" FormatIMGT "$input_msa" "$imgt_ver_num" "$db_base" 2> "$logfile"

OUT=$?

#### if something has gone wrong during formatting
if [ ! $OUT -eq 0 ];then
    echo "   An error has occurred while formating IMGT/HLA DB."
    echo "   See log file: $logfile"
    exit 1
else
    if [ "$imgt_ver_num" = "0" ];then
        echo "Getting IMGT/HLA DB release number automatically"
        verline=$(head -n1 "$logfile")
        if [[ $verline == IMGTver* ]];then
            # second whitespace-separated token of the first log line is the release number
            read -r -a tokens <<< "${verline}"
            echo "IMGT/HLA Release: ${tokens[1]}"
            imgt_ver_num=${tokens[1]}
        else
            echo "   Could not determine the IMGT/HLA release number from $logfile."
            echo "   Please specify it with -v [ver_number]."
            exit 1
        fi
    else
        echo "Using the user-input version: $imgt_ver_num"
    fi
    finalLogFile=$db_base/$imgt_ver_num/$logfilen
    if [ -e "$db_base/$imgt_ver_num" ];then
        mv "$logfile" "$finalLogFile"
        echo "Formatting finished. (logfile: $finalLogFile)"
    else
        echo "Formatting finished. (logfile: $logfile)"
    fi
    echo
    outdir=$(readlink -e "$db_base/$imgt_ver_num/")
    echo "-------------------------------------------------"
    echo "  Kourami Formatted db written to: "
    echo "  $outdir"
    echo "-------------------------------------------------"
    echo
fi


#### putting the panel sequence together
# touch first so that readlink -e (which requires an existing file) can resolve the path
touch "$db_base/$imgt_ver_num/All_FINAL_with_Decoy.fa.gz"
panelseq=$(readlink -e "$db_base/$imgt_ver_num/All_FINAL_with_Decoy.fa.gz")
cat "$db_base/$imgt_ver_num"/*.merged.fa "$resource_dir/HLA_decoys.fa" | gzip > "$panelseq"
cp "$nomg" "$db_base/$imgt_ver_num/."

#### bwa indexing panel sequence
bwa_bin=$(command -v bwa)

if [ ! -x "$bwa_bin" ];then
    echo "[ERROR]: bwa NOT found! Kourami formatted reference panel MUST be indexed before using it."
    echo "Please install the latest copy of bwa and index the panel by running: "
    echo "bwa index $panelseq"
    echo
    exit 1
else
    echo ">>>>>>>>>>>>>> BWA indexing : Kourami reference panel"
    "$bwa_bin" index "$db_base/$imgt_ver_num/All_FINAL_with_Decoy.fa.gz"
    echo "-------------------------------------------------"
    echo "  Indexed Kourami panel sequence :"
    echo "  $panelseq"
    echo "-------------------------------------------------"
    echo
fi
(logfile: $logFile)" 156 | fi 157 | echo 158 | outdir=`readlink -e $db_base/$imgt_ver_num/` 159 | echo "-------------------------------------------------" 160 | echo " Kourami Formatted db written to: " 161 | echo " $outdir" 162 | echo "-------------------------------------------------" 163 | echo 164 | fi 165 | 166 | 167 | #### putting the panel sequence together 168 | touch $db_base/$imgt_ver_num/All_FINAL_with_Decoy.fa.gz 169 | panelseq=`readlink -e $db_base/$imgt_ver_num/All_FINAL_with_Decoy.fa.gz` 170 | cat $db_base/$imgt_ver_num/*.merged.fa $resource_dir/HLA_decoys.fa | gzip > $panelseq 171 | cp $nomg $db_base/$imgt_ver_num/. 172 | 173 | #### bwa indexing panel sequence 174 | bwa_bin=`(which bwa)` 175 | 176 | if [ ! -x "$bwa_bin" ];then 177 | echo "[ERROR]: bwa NOT found! Kourami formatted reference panel MUST be indexed before using it." 178 | echo "Please install the latest copy of bwa and index the panel by running: " 179 | echo "bwa index $panelseq" 180 | echo 181 | exit 1 182 | else 183 | echo ">>>>>>>>>>>>>> BWA indexing : Kourami reference panel" 184 | $bwa_bin index $db_base/$imgt_ver_num/All_FINAL_with_Decoy.fa.gz 185 | echo "-------------------------------------------------" 186 | echo " Indexed Kourami panel sequence :" 187 | echo " $panelseq" 188 | echo "-------------------------------------------------" 189 | echo 190 | fi 191 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
  2 | -hhy+.                o o       o o       o o o o       o o
  3 | .`           -syss:---.`        o     o o o     o o o         o o o     o o o
  4 | :+:`     .:/o+++++///ommy+`    o       _  __                               _
  5 | `yhs/..:osssooooo++++dmNNNdo`   o     | |/ /___  _   _ _ __ __ _ _ __ ___ (_)
  6 |  /syy///++++ooooooooodNMdNdmh: o      | ' // _ \| | | | '__/ _` | '_ ` _ \| |
  7 |  -do/` .://++++++++oodmmmmmmd-        | . \ (_) | |_| | | | (_| | | | | | | |
  8 |  .+:     `.://///+///ommmmdy-         |_|\_\___/ \__,_|_|  \__,_|_| |_| |_|_|
  9 |   .          -syo----..``          
 10 |             +y+.                
 11 | 
12 | 13 | # Overview 14 | 15 | Kourami is a graph-guided assembler for HLA haplotypes covering typing exons (exons 2 and 3 for Class I and exon 3 for Class II) 16 | using high-coverage whole genome sequencing data. Kourami constructs highly accurate haplotype sequences at 1-bp resolution by 17 | first encoding currently available HLA allelic sequences from IPD-IMGT/HLA Database ( http://www.ebi.ac.uk/ipd/imgt/hla/ ) 18 | as partial-ordered graphs. Each database allele is naturally encoded as a path through the graph and any detectable genetic 19 | variations (SNPs or indels) not captured by the known sequences are added to the graph by graph-modification based on read alignment 20 | to capture differences novel alleles have compared to known sequences. Unlike previously available WGS-based HLA typing methods 21 | (database-matching techniques), Kourami directly assembles both haplotypes for each HLA gene (HLA-A, -B, -C, -DQA1, -DQB1, -DRB1). 22 | From version 0.9.4 onward, Kourami supports additional HLA loci. It also provides the typing result (6-digit 'G' resolution) by 23 | outputting the best matching alleles among the known sequences whenever 'G' grouping information is available. 24 | 25 | 26 | # Release 27 | 28 | The latest release, including both jar and source code, can be downloaded from [here](https://github.com/Kingsford-Group/kourami/releases/latest). 29 | 30 | # Support 31 | 32 | Kourami is, and will continue to be, [freely and actively supported on a best-effort basis](https://oceangenomics.com/about/#open). 33 | 34 | If you need industrial-grade technical support, please consider the options at [oceangenomics.com/support](http://oceangenomics.com/support).
35 | 36 | 37 | # Installation 38 | 39 | To install Kourami, you must have following installed on your system: 40 | 41 | - JDK 1.8+ 42 | 43 | - Apache Maven (3.3+) or Apache Ant (1.9+) is required (we **recommend Maven** for easy dependency downloads) 44 | - OR you must have dependencies downloaded and added to your CLASSPATH. Then you can compile using javac. 45 | - To use Ant, you must have dependencies downloaded and place jars under 'exjars' directory. 'exjars' directory must be created. 46 | 47 | -A copy of the preformatted IMGT-HLA database (Kourami panel) can be obtained using a script. The panel sequence file needs to be bwa indexed before using and this is NOW done by the script when it downloads the database. The script will download and install the database under ```db``` directory under the Kourami installation directory. The download and index script can be run from the kourami installation directory: 48 | 49 | ``` 50 | scripts/download_panel.sh 51 | ``` 52 | 53 | [MAVEN USERS] To compile and generate a jar file run the following command from the kourami directory where pom.xml is located. 54 | ``` 55 | mvn install 56 | ``` 57 | 58 | [ANT USERS] To compile and generate a jar file run the following command from the kourami directory where build.xml is located. 59 | ``` 60 | ant compile jar 61 | ``` 62 | 63 | This will create a "target" directory and place a packaged jar file in it. 64 | 65 | # Usage 66 | ``` 67 | java -jar /Kourami.jar [options] ... 68 | ``` 69 | NOTE: kourami jar takes a **bam aligned to Kourami reference panel built from IMGT/HLA db** (included in the preformatted IMGT-HLA database). 70 | Detailed notes on how to generate input bam consisting of HLA loci reads aligned to known alleles is explained in [How to prepare input bam and HLA panel for Kourami](https://github.com/Kingsford-Group/kourami/blob/master/preprocessing.md). 
71 | 72 | Option Tag | Description 73 | ----------------------- | ----------------------------- 74 | -h,--help | print this message 75 | -d,--msaDirectory \<directory\> | build HLAGraph from gen and nuc MSAs provided by IMGT/HLA DB from given directory (required). Can be downloaded by running ```scripts/download_panel.sh```. 76 | -o,--outfilePrefix \<outfile\> | use given outfile prefix for all output files (required) 77 | -a,--additionalLoci | type additional loci (optional) 78 | # Output 79 | 80 | \<outfilePrefix\>.result contains the typing result and the columns are: 81 | 1: Allele 82 | 2: #BasesMatched 83 | 3: Identity (#BasesMatched/MaxLen(query, db_allele)) 84 | 4: Length of the assembled allele 85 | 5: Length of the matched allele from IMGT/HLA DB 86 | 6: Combined bottleneck weights of both paths at a position. This is not necessarily the same as the sum of columns 7 and 8. 87 | 7: Weight of the bottleneck edge in path 1 88 | 8: Weight of the bottleneck edge in path 2 89 | 90 | Note: Given a path, a bottleneck edge is an edge with the minimal weight. For an allele, there are always two entries (lines) reported in the result file. Path 1 is reported first, and path 2 is reported in the following line. Columns 6, 7, and 8 are redundant (identical) for both lines. 91 | 92 | \<outfilePrefix\>.log contains the program log 93 | 94 | Assembled allele sequences are output in files ending with .typed.fa (multi-FASTA format) 95 | 96 | 97 | # Dependencies 98 | Dependencies can be easily downloaded by using the Maven install command.
99 | 100 | In each release, the pre-compiled jar is distributed with all necessary jars for dependencies, and they are: 101 | 102 | - JGraphT 0.9.1 ( http://jgrapht.org/ ) 103 | - Apache Commons CLI 1.4 ( https://commons.apache.org/proper/commons-cli/ ) 104 | - fastutil 7.0.13 : Fast & compact type-specific collections for Java ( http://fastutil.di.unimi.it/ ) 105 | 106 | # How to cite Kourami 107 | Please cite our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1388-2) available on Genome Biology: 108 | 109 | Lee, H., & Kingsford, C. Kourami: graph-guided assembly for novel human leukocyte antigen allele discovery. *Genome Biology* 19(16), 2018 110 | -------------------------------------------------------------------------------- /preprocessing.md: -------------------------------------------------------------------------------- 1 | # How to prepare input bam and HLA panel for Kourami 2 | 3 | ## (1) Input bam to Kourami [read alignments to HLA panel] 4 | The input to Kourami is a bam file containing read alignments of a subset of reads from HLA loci to known HLA reference panel generated by incorporating gene-length MSA and exon-only MSA provided by IMGT/HLA database. Currently, Kourami uses 3.24.0 release of the database. Different versions of the database can be used easily and how to use other IMGT releases or your own db is explained in the second part of this document. 5 | 6 | ### What you need 7 | 0. [Kourami] 8 | 1. [bamUtil](https://github.com/statgen/bamUtil/releases/tag/v1.0.14) 1.0.14 or later 9 | 2. [samtools](https://github.com/samtools/samtools) 1.3.1 or later 10 | 3. [bwa 0.7.15-r1140](https://github.com/lh3/bwa) or later 11 | 4. [bwa-kit 0.7.12](https://github.com/lh3/bwa/tree/master/bwakit) if you don't already have GRCh38 alignment 12 | 13 | 14 | ### Download a suitable flavor of GRCh38 15 | The current version (GRCh38) of the human genome comes in multiple flavors because they are published as multiple components. 
The components are: 16 | 17 | Ⓐ. Primary assembly : chromosome, unplaced, and unlocalized contigs + EBV (195 contigs) 18 | 19 | Ⓑ. Decoy (2386 contigs) 20 | 21 | Ⓒ. ALT : ALT haplotype (261 contigs) 22 | 23 | Ⓓ. HLA alleles packaged in hs38DH in [bwa.kit](https://github.com/lh3/bwa/tree/master/bwakit) (525 contigs) 24 | 25 | We find that using either hs38NoAltDH ( Ⓐ + Ⓑ + Ⓓ ) or hs38DH ( Ⓐ + Ⓑ + Ⓒ + Ⓓ ) is most effective for extracting reads from HLA loci. The hs38DH flavor is used by the 1000 Genomes Project (see [here](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/README.1000genomes.GRCh38DH.alignment)) 26 | 27 | You need to download hs38NoAltDH by running the `download_grch38.sh` script located under the `scripts` directory 28 | ``` 29 | kourami@kourami:~/kourami$./scripts/download_grch38.sh hs38NoAltDH 30 | ``` 31 | This will generate the reference fasta file `hs38NoAltDH.fa` under the `resources` directory. Then you need to `bwa index` the downloaded reference by running: 32 | ``` 33 | kourami@kourami:~/kourami$bwa index resources/hs38NoAltDH.fa 34 | ``` 35 | 36 | ### Read Extraction and input bam generation from GRCh38 bam 37 | If you have WGS data aligned to the GRCh38 reference, we first need to extract reads that are likely coming from HLA loci. If not, see the section "When aligned bam files to GRCh38 are not available" below. Depending on the GRCh38 flavor your bam is aligned to, you need to use the correct script to extract reads (see the table below). The WGS data aligned to GRCh38 should be in either bam or cram format (and must be sorted and indexed) prior to running an extraction script.
38 | 39 | GRCh38 flavor | Use 40 | ------------------|-------------- 41 | hs38DH | alignAndExtract_hs38DH.sh 42 | hs38NoAltDH | alignAndExtract_hs38DH_NoAlt.sh 43 | Ⓐ + (Ⓑ optional) + Ⓒ | alignAndExtract_hs38Alt.sh 44 | Ⓐ + (Ⓑ optional) [NOT recommended] | alignAndExtract_hs38.sh 45 | 46 | #### Running the extraction (pre-processing) script 47 | An example is shown for a bam aligned to hs38DH below: 48 | ```` 49 | kourami@kourami:~/kourami$mkdir test 50 | kourami@kourami:~/kourami$cd test 51 | kourami@kourami:~/kourami/test$../scripts/alignAndExtract_hs38DH.sh NA12878 /mnt/data/NA12878.hs38DH.bam 52 | ```` 53 | This will generate `NA12878_on_KouramiPanel.bam` and this can be fed into Kourami. 54 | 55 | #### Specifying another version of IMGT/HLA DB other than the default 56 | The Kourami-formatted IMGT/HLA DB as well as the reference panel sequences are normally located under the `db` directory under the Kourami installation. In case you want to use another version of IMGT/HLA DB, you can specify the path to the desired database (by using the `-d [path-to-db]` option) when running one of the extraction scripts. Before using another IMGT/HLA release, you must convert it to a Kourami-compatible format. This is explained in the second part of this document (see **[Using another version of IMGT/HLA DB or a custom version]** below). 57 | ```` 58 | kourami@kourami:~/kourami/test$../scripts/alignAndExtract_hs38DH.sh -d ~/kourami/customDB NA12878 /mnt/data/NA12878.hs38DH.bam 59 | ```` 60 | 61 | ### When aligned bam files to GRCh38 are not available: 62 | 63 | When an aligned bam file (to the human genome) is not available, you must first align high coverage WGS data ( >30X coverage ) to the reference human genome. We recommend using the hs38NoAltDH or hs38DH flavor (see the **[Download a suitable flavor of GRCh38]** section above).
We recommend that you follow the 1000 Genomes GRCh38 pipeline explained [here](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/README.1000genomes.GRCh38DH.alignment). The generated bam or cram file should be sorted and indexed. 64 | 65 | ## (2) Creating Kourami HLA panel and merged MSAs from another version (release) of IMGT/HLA DB or a custom version 66 | The default version (Kourami formatted - IMGT/HLA release 3.24.0) can be automatically downloaded and bwa-indexed by running ```scripts/download_panel.sh``` (see the Installation section in [README](https://github.com/Kingsford-Group/kourami/blob/master/README.md)). 67 | 68 | Given a set of MSAs prepared in each release of IMGT/HLA DB, you will need to reformat them to be used with Kourami. 69 | You can use the script `formatIMGT.sh` under the `scripts` directory. 70 | 71 | ### Dependencies 72 | - [Kourami] v0.9.3 or higher 73 | - [bwa 0.7.15-r1140](https://github.com/lh3/bwa) or higher 74 | 75 | ### Input to the script 76 | - All multiple sequence alignment (MSA) flat files of the `alignments` directory in an IMGT/HLA DB release. 77 | - In each release, the `alignments` directory can be separately downloaded. The `alignments` directory is distributed as a zip file (Alignments_Rel_XXXX.zip where XXXX is the release number). The text format of the MSA is explained under [File Formats]-[Sequence Alignments] [here](http://www.ebi.ac.uk/ipd/imgt/hla/download.html). The zipped alignments directory can be downloaded either from https://github.com/ANHIG/IMGTHLA or ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/ . The ftp site only provides the latest release. 78 | - The `hla_nom_g.txt` file is included in each IMGT/HLA release and is a required file for Kourami. When downloading the `alignments` directory, `hla_nom_g.txt` from the same release of IMGT/HLA must be downloaded as well. Place this file in the downloaded `alignments` directory.
79 | 80 | ### Usage 81 | ```` 82 | /formatIMGT.sh -i [IMGT/HLA alignments dir] 83 | ```` 84 | Option Tag | Description 85 | ------------------------ | --------------------- 86 | -i | path to IMGT/HLA alignments directory [required] 87 | -v | IMGT/HLA release number is automatically set from IMGT/HLA MSA files. If specified, it overrides. [optional] 88 | -o | name of the directory the output will be written to. Output files will be written to `/`. If not provided, `custom_db/` under Kourami installation directory will be used. [optional] 89 | -h | print this message [optional] 90 | -------------------------------------------------------------------------------- /src/AllelePath.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import java.util.ArrayList; 7 | import java.util.HashSet; 8 | import java.util.Iterator; 9 | 10 | import org.jgrapht.*; 11 | import org.jgrapht.graph.*; 12 | 13 | 14 | //single path through a superbubble. 15 | //each superbubble can have multiple AllelePath 16 | public class AllelePath{ 17 | 18 | private Path bubblePath;//this is the original bubblePath before adding interBubble paths; 19 | 20 | private ArrayList orderedEdgeList; 21 | 22 | //this keeps track of indices where disconnect happens due to exonic boundary. 
23 | private ArrayList fractureEndIndex; 24 | 25 | //private ArrayList mergedOpIndicies; 26 | 27 | private double probability; 28 | 29 | private double weightedIntersectionSum; 30 | 31 | private int mergedNums; 32 | 33 | private String sequence; 34 | 35 | private String sequenceName; 36 | 37 | public ArrayList getOrderedEdgeList(){ 38 | return this.orderedEdgeList; 39 | } 40 | 41 | public double[] jointTraverse(AllelePath other, SimpleDirectedWeightedGraph g){ 42 | double[] results = new double[4];//new double[2]; 43 | double weightSum = 0.0d; 44 | double maxFlow = Double.MAX_VALUE; 45 | double minDepth1 = Double.MAX_VALUE; 46 | double minDepth2 = Double.MAX_VALUE; 47 | for(int i=0; i< this.orderedEdgeList.size(); i++){ 48 | CustomWeightedEdge te = this.orderedEdgeList.get(i); 49 | CustomWeightedEdge oe = other.getOrderedEdgeList().get(i); 50 | double w = 0.0d; 51 | double d1 = 0.0d; 52 | double d2 = 0.0d; 53 | //TO DO: should we not count for '-'?? CHECK THIS 54 | if(te.equals(oe)){//we count once if homozygous 55 | w = g.getEdgeWeight(te); 56 | int tmpw = (int) w; 57 | if(tmpw%2 == 0){ //even 58 | d1 = tmpw/2; 59 | d2 = tmpw/2; 60 | }else{ //odd, we give extra weight to d1 61 | d1 = tmpw/2 + 1; 62 | d2 = tmpw/2; 63 | } 64 | } 65 | else{//otherwise we count only 66 | w = g.getEdgeWeight(te) + g.getEdgeWeight(oe); 67 | d1 = g.getEdgeWeight(te); 68 | d2 = g.getEdgeWeight(oe); 69 | } 70 | weightSum += w; 71 | if(w < maxFlow) 72 | maxFlow = w; 73 | if(d1 < minDepth1) 74 | minDepth1 = d1; 75 | if(d2 < minDepth2) 76 | minDepth2 = d2; 77 | } 78 | results[0] = weightSum; 79 | results[1] = maxFlow; 80 | results[2] = minDepth1; 81 | results[3] = minDepth2; 82 | return results; 83 | } 84 | 85 | 86 | public double[] traverse(SimpleDirectedWeightedGraph g){ 87 | double[] results = new double[2]; 88 | double weightSum = 0.0d; 89 | double maxFlow = Double.MAX_VALUE; 90 | for(CustomWeightedEdge e : orderedEdgeList){ 91 | double w = g.getEdgeWeight(e); 92 | weightSum += w; 93 | 
if(w < maxFlow) 94 | maxFlow = w; 95 | } 96 | results[0] = weightSum; 97 | results[1] = maxFlow; 98 | return results; 99 | } 100 | 101 | 102 | public AllelePath(){ 103 | this.bubblePath = null; 104 | this.orderedEdgeList = new ArrayList(); 105 | this.fractureEndIndex = new ArrayList(); 106 | //this.mergedOpIndicies = new ArrayList(); 107 | this.weightedIntersectionSum = 0.0d; 108 | this.mergedNums = 0; 109 | this.probability = 0.0d; 110 | this.sequence = null; 111 | this.sequenceName = null; 112 | } 113 | 114 | public AllelePath(double p, double wis, int mn, Path bp){//ArrayList moi, Path bp){ 115 | this(); 116 | this.bubblePath = bp; 117 | this.weightedIntersectionSum = wis; 118 | this.mergedNums = mn; 119 | this.probability = p; 120 | //this.mergedOpIndicies = moi; 121 | } 122 | 123 | public Path getBubblePath(){ 124 | return this.bubblePath; 125 | } 126 | 127 | public double[] getJointProbability(AllelePath other, Bubble superBubble){ 128 | return this.bubblePath.getJointProbability(other.getBubblePath(), superBubble); 129 | } 130 | 131 | private void printFractureEndIndex(){ 132 | HLA.log.append("FractureEndIndex:["); 133 | for(Integer i: this.fractureEndIndex) 134 | HLA.log.append(" " + i.intValue() +" "); 135 | HLA.log.appendln("]"); 136 | } 137 | 138 | public void setFractureEndIndex(){ 139 | this.fractureEndIndex.add(new Integer(this.orderedEdgeList.size())); 140 | } 141 | 142 | public void setFractureEndIndexForNoSB(){ 143 | this.fractureEndIndex.add(new Integer(0)); 144 | } 145 | 146 | public void appendEdge(CustomWeightedEdge e){ 147 | this.orderedEdgeList.add(e); 148 | } 149 | 150 | public void appendAllEdges(Path other){ 151 | this.orderedEdgeList.addAll(other.getOrderedEdgeList()); 152 | } 153 | 154 | public void appendAllEdges(ArrayList anotherOrderedEdgeList){ 155 | this.orderedEdgeList.addAll(anotherOrderedEdgeList); 156 | } 157 | 158 | public void printPath(SimpleDirectedWeightedGraph g, int superBubbleNum, int n){ 159 | 
this.printFractureEndIndex(); 160 | HLA.log.appendln("IntersectionScore:\t" + this.weightedIntersectionSum + "\t" + this.probability); 161 | HLA.log.appendln(this.toFasta().toString()); 162 | //HLA.log.appendln(">candidate_" + superBubbleNum + "-" + n + "\n" + this.sequence);//this.toString(g, superBubbleNum, n)); 163 | } 164 | 165 | public StringBuffer toFasta(){ 166 | return new StringBuffer(">" + this.sequenceName + "\n" + this.sequence + "\n"); 167 | } 168 | 169 | public String getSequence(){ 170 | return this.sequence; 171 | } 172 | 173 | public double getProbability(){ 174 | return this.probability; 175 | } 176 | 177 | public double getIntersectionSum(){ 178 | return this.weightedIntersectionSum; 179 | } 180 | 181 | public int getMergedNums(){ 182 | return this.mergedNums; 183 | } 184 | 185 | public void generateSequence(SimpleDirectedWeightedGraph g){ 186 | 187 | } 188 | 189 | public void setSequenceString(SimpleDirectedWeightedGraph g, int superBubbleNum, int n){ 190 | 191 | StringBuffer bf = new StringBuffer(); 192 | CustomWeightedEdge pre = null; 193 | int disconnectCount = 0; 194 | int curfi = 0; 195 | for(int i=0; i curfi) 224 | && (i == this.fractureEndIndex.get(curfi).intValue()) ){ 225 | curChar = g.getEdgeSource(cur).getBase(); 226 | if(curChar != '.') 227 | bf.append(curChar); 228 | curfi++; 229 | } 230 | 231 | curChar = g.getEdgeTarget(cur).getBase(); 232 | if(curChar != '.') 233 | bf.append(curChar); 234 | pre = cur; 235 | } 236 | //String finalStr = bf.toString(); 237 | this.sequence = bf.toString(); 238 | this.sequenceName = "candidate_" + superBubbleNum + "-" + n; 239 | //HLA.log.appendln(">candidate_" + superBubbleNum + "-" + n + "\n" + finalStr); 240 | //return finalStr; 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /src/SuperAllelePath.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by 
Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import java.util.ArrayList; 7 | import java.util.Hashtable; 8 | 9 | import org.jgrapht.*; 10 | import org.jgrapht.graph.*; 11 | 12 | public class SuperAllelePath{ 13 | 14 | private ArrayList orderedAllelePaths; 15 | private ArrayList pathNums; 16 | private String hlaGeneName; 17 | 18 | public SuperAllelePath(String genename){ 19 | this.orderedAllelePaths = new ArrayList(); 20 | this.pathNums = new ArrayList(); 21 | this.hlaGeneName = genename; 22 | } 23 | 24 | public SuperAllelePath(ArrayList aps, ArrayList pns, String hgn){ 25 | this.orderedAllelePaths = aps; 26 | this.pathNums = pns; 27 | this.hlaGeneName = hgn; 28 | } 29 | 30 | public double[] jointTraverse(SuperAllelePath other, SimpleDirectedWeightedGraph g){ 31 | double[] weightFlow = new double[4];//new double[2]; 32 | weightFlow[1] = Double.MAX_VALUE; 33 | weightFlow[2] = Double.MAX_VALUE; 34 | weightFlow[3] = Double.MAX_VALUE; 35 | for(int i=0; i g){ 53 | double[] weightFlow = new double[2]; 54 | for(AllelePath ap : orderedAllelePaths){ 55 | double[] tmp = ap.traverse(g); 56 | weightFlow[0] += tmp[0]; 57 | weightFlow[1] += tmp[1]; 58 | } 59 | return weightFlow; 60 | } 61 | 62 | public int numAllelePaths(){ 63 | return this.orderedAllelePaths.size(); 64 | } 65 | 66 | //return 3 scores (TP/OP and intersection score) + 4 scores (allProduct, jointProduct, allProduct2, maxFlow) 67 | public double[] getJointProbability(SuperAllelePath other, ArrayList superBubbles){ 68 | 69 | double[] jp = new double[14]; 70 | if(this.numAllelePaths() != other.numAllelePaths() && this.numAllelePaths() != superBubbles.size()){ 71 | HLA.log.appendln("Incompatible SuperAllelePath. 
The number of fractured allelePath in superPath does not match"); 72 | return null; 73 | } 74 | 75 | for(int i=0; i apjp[6]) 85 | jp[6] = apjp[6]; 86 | }*/ 87 | } 88 | return jp; 89 | } 90 | 91 | 92 | public double getJointInterSuperBubbleLinkProb(SuperAllelePath other, Hashtable> hhl){ 93 | double logP = 0.0d; 94 | for(int i=0; i getOrderedAllelePaths(){ 130 | return this.orderedAllelePaths; 131 | } 132 | 133 | 134 | public String getHlaGeneName(){ 135 | return this.hlaGeneName; 136 | } 137 | 138 | public SuperAllelePath clone(){ 139 | return new SuperAllelePath(new ArrayList(this.orderedAllelePaths) 140 | , new ArrayList(this.pathNums) 141 | , this.hlaGeneName); 142 | } 143 | 144 | public void addAllelePath(AllelePath ap, int n){ 145 | this.orderedAllelePaths.add(ap); 146 | this.pathNums.add(new Integer(n)); 147 | } 148 | 149 | public String pathnums2String(){ 150 | StringBuffer bf = new StringBuffer(); 151 | int count = 0; 152 | for(Integer i : this.pathNums){ 153 | if(count > 0) 154 | bf.append(":"); 155 | bf.append(i.intValue()); 156 | count++; 157 | } 158 | return bf.toString(); 159 | } 160 | 161 | //return a list of db alleles that has maxIdenticalLength 162 | public ArrayList findMatchFrom(ArrayList typingSequences){ 163 | String curseq = this.getSequenceBuffer().toString(); 164 | ArrayList maxR = new ArrayList(); 165 | for(HLASequence subjscan: typingSequences){ 166 | if(curseq.equals(subjscan.getSequence())){ 167 | maxR.add(new Result(curseq.length(), subjscan)); 168 | return maxR; 169 | } 170 | } 171 | 172 | int maxIdenticalLen = 0; 173 | //since not perfect align to DB 174 | for(HLASequence subj : typingSequences){ 175 | Result curR = Needle.run(curseq, subj); 176 | if(curR.getIdenticalLen() >= maxIdenticalLen){ 177 | if(curR.getIdenticalLen() > maxIdenticalLen){ 178 | maxIdenticalLen = curR.getIdenticalLen(); 179 | maxR = new ArrayList(); 180 | } 181 | maxR.add(curR); 182 | } 183 | } 184 | return maxR; 185 | } 186 | 187 | public StringBuffer 
getSequenceBuffer(){ 188 | StringBuffer bf = new StringBuffer(); 189 | for(AllelePath ap : this.orderedAllelePaths) 190 | bf.append(ap.getSequence()); 191 | return bf; 192 | } 193 | 194 | public String toSimpleString(){ 195 | return this.pathnums2String() + "\t" + this.getWeightedIntersectionSumScore() + "\t" + this.getProbability(); 196 | } 197 | 198 | public StringBuffer toFasta(){ 199 | StringBuffer bf = new StringBuffer(">" + this.hlaGeneName + "_" + pathnums2String() + "\t" 200 | + this.getWeightedIntersectionSumScore() + "\t" + this.getProbability() + "\n"); 201 | bf.append(this.getSequenceBuffer()); 202 | bf.append("\n"); 203 | return bf; 204 | } 205 | 206 | public double getProbability(){ 207 | double logP = 0.0d; 208 | for(AllelePath ap : this.orderedAllelePaths){ 209 | logP += ap.getProbability(); 210 | } 211 | return logP; 212 | } 213 | 214 | /* 215 | public double getProbability(){ 216 | double p = 1.0d; 217 | for(AllelePath ap : this.orderedAllelePaths) 218 | p = p * ap.getProbability(); 219 | return p; 220 | } 221 | */ 222 | 223 | public double getWeightedIntersectionSumScore(){ 224 | double s = 0.0d; 225 | int n = 0; 226 | for(AllelePath ap : this.orderedAllelePaths){ 227 | s = s + ap.getIntersectionSum(); 228 | n = n + ap.getMergedNums(); 229 | } 230 | return s / n; 231 | } 232 | 233 | } 234 | -------------------------------------------------------------------------------- /src/CustomWeightedEdge.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 
5 | */ 6 | import htsjdk.samtools.util.QualityUtil; 7 | 8 | import org.jgrapht.*; 9 | import org.jgrapht.graph.*; 10 | 11 | import java.util.ArrayList; 12 | import java.util.Iterator; 13 | import java.util.Collections; 14 | import java.util.HashSet; 15 | 16 | /* 17 | * Added in order to assign confidence to the weights/baescall of the edge. 18 | * Current scoring is taken from MIRA. (stranded) 19 | * 20 | */ 21 | public class CustomWeightedEdge extends DefaultWeightedEdge{ 22 | 23 | //here you get list of quality scores --> forward and reverse 24 | //no way to trace back to readIDs for this. 25 | private ArrayList fScore; 26 | private ArrayList rScore; 27 | private double groupErrorProb; //consensus score 28 | 29 | private int numActivePath; 30 | 31 | //private HashSet rHash; 32 | private CustomHashMap rHash;//key is readId and value is quality value 33 | 34 | private HashSet pathset; 35 | 36 | private int edgeID; 37 | private static int nextID = 0; 38 | 39 | //public HashSet getReadHashSet(){ 40 | public CustomHashMap getReadHashSet(){ 41 | return this.rHash; 42 | } 43 | 44 | public HashSet getPathset(){ 45 | return this.pathset; 46 | } 47 | 48 | public int getEdgeId(){ 49 | return this.edgeID; 50 | } 51 | 52 | public void subtractSet(CustomHashMap removalSet){//HashSet removalSet){ 53 | this.rHash.removeAll(removalSet); 54 | } 55 | 56 | public void removePath(Path p){ 57 | this.pathset.remove(p); 58 | } 59 | 60 | public void removeRead(int r){//Integer r){ 61 | rHash.remove(r); 62 | } 63 | 64 | public HashSet getPathsetDeepCopy(){ 65 | HashSet tmp = new HashSet(); 66 | Iterator itr = this.pathset.iterator(); 67 | while(itr.hasNext()){ 68 | tmp.add(itr.next()); 69 | } 70 | return tmp; 71 | } 72 | 73 | //NO LONGER USED. SHOULD USE clone() in CustomHashMap class. 
74 | /* 75 | public HashSet getReadHashSetDeepCopy(){ 76 | HashSet tmp = new HashSet(); 77 | Iterator itr = this.rHash.iterator(); 78 | while(itr.hasNext()){ 79 | tmp.add(itr.next()); 80 | } 81 | return tmp; 82 | } 83 | */ 84 | 85 | public void addAllReadsFrom(CustomHashMap otherSet){//HashSet otherSet){ 86 | this.rHash.addAll(otherSet); 87 | } 88 | 89 | public void addAllPathsFrom(HashSet otherSet){ 90 | this.pathset.addAll(otherSet); 91 | } 92 | 93 | /* THIS NEEDS TO BE REMOVED. CURRENTLY QUAL SET TO 0 to compile*/ 94 | /* public void addRead(int readNum){ 95 | this.addRead(readNum, 0); 96 | }*/ 97 | 98 | public void addRead(int readNum, int qual){ 99 | this.rHash.put(readNum, qual); 100 | } 101 | 102 | public void addPath(Path p){ 103 | this.pathset.add(p); 104 | } 105 | 106 | public static int numMaxLowestProbEntries = 10; 107 | 108 | public CustomWeightedEdge(){ 109 | super(); 110 | this.edgeID = CustomWeightedEdge.nextID; 111 | CustomWeightedEdge.nextID++; 112 | this.fScore = new ArrayList(); 113 | this.rScore = new ArrayList(); 114 | this.groupErrorProb = 0.0d; 115 | this.initNumActivePath(); 116 | this.rHash = new CustomHashMap();//new HashSet(); 117 | this.pathset = new HashSet(); 118 | } 119 | 120 | 121 | //returns union of reads if intersection of reads is non-empty. 
122 | //public HashSet getUnionAfterCheckingIntersection(CustomWeightedEdge other){ 123 | public CustomHashMap getUnionAfterCheckingIntersection(CustomWeightedEdge other){ 124 | //HashSet ts = this.getReadHashSetDeepCopy(); 125 | //HashSet os = other.getReadHashSetDeepCopy(); 126 | CustomHashMap ts = this.rHash.clone(); 127 | CustomHashMap os = other.getReadHashSet().clone(); 128 | //ts.retainAll(os); 129 | ts.intersectionPE(os); 130 | //if(ts.retainAll(os)){ 131 | if(ts.size() > 0)//intersection is NOT empty 132 | ts = this.rHash.clone();//getReadHashSetDeepCopy(); 133 | else 134 | return null; 135 | //} 136 | ts.addAll(os); 137 | return ts; 138 | } 139 | 140 | //checks if there is intersection between this edge's readset and prevSet 141 | //if interesection is NOT empty, returns union (updates prevSet) 142 | //public HashSet unionAfterCheckingIntersection(HashSet prevSet){ 143 | public CustomHashMap unionAfterCheckingIntersection(CustomHashMap prevSet){//HashSet prevSet){ 144 | //HashSet ts = this.getReadHashSetDeepCopy(); 145 | CustomHashMap ts = this.rHash.clone(); 146 | // ts.retainAll(prevSet); 147 | ts.intersectionPE(prevSet); 148 | if(ts.size() > 0){ 149 | ts = this.rHash.clone(); 150 | //prevSet.addAll(this.rHash); 151 | //return prevSet; 152 | }else 153 | return null; 154 | ts.addAll(prevSet); 155 | return ts; 156 | } 157 | 158 | //return insertions of two sets. 
159 | //returns null if intersection is an empty 160 | //public HashSet getIntersection(HashSet prevSet){ 161 | public CustomHashMap getIntersection(CustomHashMap prevSet){ 162 | //HashSet ts = this.getReadHashSetDeepCopy(); 163 | CustomHashMap ts = this.rHash.clone(); 164 | ts.retainAll(prevSet); 165 | if(ts.size() > 0) 166 | return ts; 167 | else 168 | return null; 169 | } 170 | 171 | 172 | public CustomHashMap getIntersectionPE(CustomHashMap prevSet){ 173 | //HashSet ts = this.getReadHashSetDeepCopy(); 174 | CustomHashMap ts = this.rHash.clone(); 175 | ts.intersectionPE(prevSet); 176 | if(ts.size() > 0) 177 | return ts; 178 | else 179 | return null; 180 | } 181 | 182 | 183 | public int getNumActivePath(){ 184 | return this.numActivePath; 185 | } 186 | 187 | public boolean isUniqueEdge(){ 188 | if(this.numActivePath == 1) 189 | return true; 190 | return false; 191 | } 192 | 193 | public void initNumActivePath(){ 194 | this.numActivePath = 0; 195 | } 196 | 197 | public void includeEdge(){ 198 | this.numActivePath++; 199 | } 200 | 201 | public void includeEdgeNTimes(int n){ 202 | this.numActivePath += n; 203 | } 204 | 205 | public void excludeEdge(){ 206 | this.numActivePath--; 207 | } 208 | 209 | public ArrayList getFScores(){ 210 | return this.fScore; 211 | } 212 | 213 | public ArrayList getRScores(){ 214 | return this.rScore; 215 | } 216 | 217 | public void setFScores(ArrayList fs){ 218 | this.fScore = fs; 219 | } 220 | 221 | public void setRScores(ArrayList rs){ 222 | this.rScore = rs; 223 | } 224 | 225 | public void addAllFScores(ArrayList fs){ 226 | this.fScore.addAll(fs); 227 | } 228 | 229 | public void addAllRScores(ArrayList rs){ 230 | this.rScore.addAll(rs); 231 | } 232 | 233 | public String toString(){ 234 | return this.getWeight() + "\t" + this.groupErrorProb;// + "\tfScoreSize[" + this.fScore.size() +"]" + "\trScoreSize[" + this.rScore.size() +"]"; 235 | } 236 | 237 | public void incrementWeight(SimpleDirectedWeightedGraph g, boolean isRefStrand, byte 
score){ 238 | g.setEdgeWeight(this, g.getEdgeWeight(this)+1); 239 | if(isRefStrand) 240 | this.fScore.add(new Byte(score)); 241 | else 242 | this.rScore.add(new Byte(score)); 243 | } 244 | 245 | public double getGroupErrorProb(){ 246 | return this.groupErrorProb; 247 | } 248 | 249 | public double computeGroupErrorProb(){ 250 | if(this.getWeight() == Double.MAX_VALUE) 251 | this.groupErrorProb = 0.0d; 252 | else{ 253 | double f = this.computeStrandedGroupErrorProb(this.fScore); 254 | double r = this.computeStrandedGroupErrorProb(this.rScore); 255 | //System.err.println("f-prob:\t" + f); 256 | //System.err.println("r-prob:\t" + r); 257 | if(f == 0.0d) 258 | f = 1.0d; 259 | if(r == 0.0d) 260 | r = 1.0d; 261 | double score = f*r; 262 | if(score == 1.0d) 263 | score = 0.0d; 264 | this.groupErrorProb = score; 265 | } 266 | //System.err.println("GroupErrorProb:\t" + this.groupErrorProb); 267 | return this.groupErrorProb; 268 | } 269 | 270 | private double computeStrandedGroupErrorProb(ArrayList scores){ 271 | //sort the phred score in a descending order 272 | Collections.sort(scores, Collections.reverseOrder()); 273 | Iterator itr = scores.iterator(); 274 | int count = 0; 275 | int sum = 0; 276 | while(itr.hasNext() && count < CustomWeightedEdge.numMaxLowestProbEntries){ 277 | byte tmp = itr.next().byteValue(); 278 | if(tmp < 2){ 279 | tmp = 2; 280 | } 281 | //System.err.println("ErrorProb(" + tmp + "):\t" + QualityUtil.getErrorProbabilityFromPhredScore(tmp)); 282 | int invErrorProb = (int) (1.0d / QualityUtil.getErrorProbabilityFromPhredScore(tmp)); 283 | //int invErrorProb = (int) (1.0d / QualityUtil.getErrorProbabilityFromPhredScore(itr.next().byteValue())); 284 | //System.err.println("INVErroProb(" + tmp + "):\t" + invErrorProb); 285 | double fraction = (double) (CustomWeightedEdge.numMaxLowestProbEntries - count) / CustomWeightedEdge.numMaxLowestProbEntries; 286 | //System.err.println("Fraction(" + tmp + "):\t" + fraction); 287 | sum += (int) (fraction * 
invErrorProb); 288 | //System.err.println("DENOM_SUM(" + tmp + "):\t" + sum); 289 | count++; 290 | } 291 | if(sum == 0) 292 | return 0.0d; 293 | else 294 | return 1.0d/((double)sum); 295 | } 296 | 297 | /* 298 | public int getNumActivePath(){ 299 | return this.numActivePath; 300 | } 301 | 302 | public void initNumActivePath(){ 303 | this.numActivePath = 0; 304 | } 305 | 306 | public void setNumActivePath(int n){ 307 | this.numActivePath = n; 308 | } 309 | 310 | public void incrementNumActivePathByN(int n){ 311 | this.numActivePath += n; 312 | } 313 | 314 | public void incrementNumActivePath(){ 315 | this.incrementNumActivePathByN(1); 316 | } 317 | */ 318 | 319 | /* TESTING */ 320 | public static void main(String[] args){ 321 | CustomWeightedEdge e = new CustomWeightedEdge(); 322 | e.fScore.add(new Byte((byte) 10)); 323 | e.fScore.add(new Byte((byte) 20)); 324 | e.fScore.add(new Byte((byte) 30)); 325 | 326 | e.rScore.add(new Byte((byte) 10)); 327 | e.rScore.add(new Byte((byte) 20)); 328 | e.rScore.add(new Byte((byte) 30)); 329 | 330 | e.computeGroupErrorProb(); 331 | } 332 | } 333 | -------------------------------------------------------------------------------- /scripts/alignAndExtract_hs38DH_NoAlt.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Part of Kourami HLA typer/assembler 3 | # (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | # See LICENSE for licensing. 
5 | # 6 | 7 | #!/bin/bash 8 | 9 | pushd `dirname $0` > /dev/null 10 | SCRIPTD=`pwd` 11 | popd > /dev/null 12 | 13 | samtools_sort_memory_per_thread=2G 14 | num_processors=8 15 | kourami_db=$SCRIPTD/../db 16 | me=`basename $0` 17 | 18 | 19 | function usage { 20 | echo "HLA-related reads extractor for Kourami" 21 | echo "Note: Use this if you have bam file aligned to GRCh38 [NoAlt] (primary assembly + decoy + HLA from [bwa-kit] )" 22 | echo "USAGE: /$me -d [Kourami panel db] -r [refGenome] " 23 | echo 24 | echo " sample_id : desired sample name (ex: NA12878) [required]" 25 | echo 26 | echo " bamfile : sorted and indexed bam to hs38NoAltDH (ex: NA12878.bam) [required]" 27 | echo 28 | echo "------------------ Optional Parameters -----------------" 29 | echo " -d [panel DB] : Path to Kourami panel db. [Default: db directory under Kourami installation kourami/db]" 30 | echo 31 | echo " -r [Ref Gemome] : path to hs38NoAltDH (primary assembly + decoy + HLA [bwa-kit])" 32 | echo " USE download_grch38.sh script to obtain the reference." 33 | echo " MUST BE BWA INDEXED prior to running this script." 34 | echo " If not given, it assumes, hs38NoAltDH.fa is in resources dir." 35 | echo 36 | echo " -h : print this message." 37 | echo 38 | exit 1 39 | } 40 | 41 | # print usage when no argument is given 42 | if [ $# -lt 1 ]; then 43 | usage 44 | fi 45 | 46 | while getopts :d:r:h FLAG; do 47 | case $FLAG in 48 | d) 49 | kourami_db=$OPTARG 50 | ;; 51 | h) 52 | usage 53 | ;; 54 | \?) 55 | echo "Unrecognized option -$OPTARG. See usage:" 56 | usage 57 | ;; 58 | esac 59 | done 60 | 61 | shift $((OPTIND-1)) 62 | 63 | if [ $# -lt 2 ]; then 64 | echo "Missing one or more required arguments." 
65 | usage 66 | fi 67 | 68 | sampleid=$1 69 | bam_path=$2 70 | 71 | merged_hla_panel=$kourami_db/All_FINAL_with_Decoy.fa.gz 72 | bam_for_kourami=$sampleid\_on_KouramiPanel.bam 73 | samtools_bin=`(which samtools)` 74 | bwa_bin=`(which bwa)` 75 | bamUtil=`(which bam)` 76 | if [ -z "$bamUtil" ]; then 77 | echo "missing bamUtil"; 78 | echo "bamUtil available from https://github.com/statgen/bamUtil" 79 | exit 1; 80 | fi 81 | #bamUtil=$HOME/bamUtil_1.0.13/bamUtil-master/bin/bam 82 | 83 | if [ ! -x "$samtools_bin" ] || [ ! -x "$bwa_bin" ] || [ ! -x "$bamUtil" ];then 84 | echo "Please make sure samtools, bwa, and bamUtil are installed" 85 | exit 1 86 | fi 87 | 88 | if [ ! -e "$bam_path" ] || [ ! -e "$kourami_db" ] || [ ! -e "$merged_hla_panel" ];then 89 | echo "Missing one of the following files/directories (38DH):\n" 90 | echo "$bam_path" 91 | echo "$kourami_db" 92 | echo "$merged_hla_panel" 93 | exit 1 94 | fi 95 | 96 | echo ">>>>>>>>>>>>>>>> extracting reads mapping to HLA loci and ALT contigs (38DH_NoAlt)" 97 | $samtools_bin view -b $bam_path chr6:29723340-29727296 chr6:29726601-29749049 chr6:29826979-29831122 \ 98 | chr6:29887760-29891080 chr6:29942470-29945884 chr6:30005971-30009956 chr6:30259562-30266951 \ 99 | chr6:30489406-30494205 chr6:31268749-31272136 chr6:31353866-31357245 chr6:31399784-31415316 \ 100 | chr6:31494881-31511124 chr6:32439842-32445051 chr6:32517377-32530229 chr6:32552713-32560002 \ 101 | chr6:32578770-32589836 chr6:32637406-32643652 chr6:32659464-32666689 chr6:32741386-32746887 \ 102 | chr6:32756098-32763553 chr6:32812763-32817048 chr6:32821833-32838770 chr6:32845209-32852787 \ 103 | chr6:32934629-32941070 chr6:32948614-32953122 chr6:33004183-33009612 chr6:33064569-33080778 \ 104 | chr6:33075926-33089696 chr6:33112516-33129113 HLA-A*01:01:01:01: HLA-A*01:01:01:02N: HLA-A*01:01:38L: HLA-A*01:02: HLA-A*01:03: HLA-A*01:04N: HLA-A*01:09: HLA-A*01:11N: HLA-A*01:14: HLA-A*01:16N: HLA-A*01:20: HLA-A*02:01:01:01: HLA-A*02:01:01:02L: HLA-A*02:01:01:03: 
HLA-A*02:01:01:04: HLA-A*02:02:01: HLA-A*02:03:01: HLA-A*02:03:03: HLA-A*02:05:01: HLA-A*02:06:01: HLA-A*02:07:01: HLA-A*02:10: HLA-A*02:251: HLA-A*02:259: HLA-A*02:264: HLA-A*02:265: HLA-A*02:266: HLA-A*02:269: HLA-A*02:279: HLA-A*02:32N: HLA-A*02:376: HLA-A*02:43N: HLA-A*02:455: HLA-A*02:48: HLA-A*02:51: HLA-A*02:533: HLA-A*02:53N: HLA-A*02:57: HLA-A*02:60:01: HLA-A*02:65: HLA-A*02:68: HLA-A*02:77: HLA-A*02:81: HLA-A*02:89: HLA-A*02:95: HLA-A*03:01:01:01: HLA-A*03:01:01:02N: HLA-A*03:01:01:03: HLA-A*03:02:01: HLA-A*03:11N: HLA-A*03:21N: HLA-A*03:36N: HLA-A*11:01:01: HLA-A*11:01:18: HLA-A*11:02:01: HLA-A*11:05: HLA-A*11:110: HLA-A*11:25: HLA-A*11:50Q: HLA-A*11:60: HLA-A*11:69N: HLA-A*11:74: HLA-A*11:75: HLA-A*11:77: HLA-A*23:01:01: HLA-A*23:09: HLA-A*23:38N: HLA-A*24:02:01:01: HLA-A*24:02:01:02L: HLA-A*24:02:01:03: HLA-A*24:02:03Q: HLA-A*24:02:10: HLA-A*24:03:01: HLA-A*24:07:01: HLA-A*24:08: HLA-A*24:09N: HLA-A*24:10:01: HLA-A*24:11N: HLA-A*24:152: HLA-A*24:20: HLA-A*24:215: HLA-A*24:61: HLA-A*24:86N: HLA-A*25:01:01: HLA-A*26:01:01: HLA-A*26:11N: HLA-A*26:15: HLA-A*26:50: HLA-A*29:01:01:01: HLA-A*29:01:01:02N: HLA-A*29:02:01:01: HLA-A*29:02:01:02: HLA-A*29:46: HLA-A*30:01:01: HLA-A*30:02:01:01: HLA-A*30:02:01:02: HLA-A*30:04:01: HLA-A*30:89: HLA-A*31:01:02: HLA-A*31:01:23: HLA-A*31:04: HLA-A*31:14N: HLA-A*31:46: HLA-A*32:01:01: HLA-A*32:06: HLA-A*33:01:01: HLA-A*33:03:01: HLA-A*33:07: HLA-A*34:01:01: HLA-A*34:02:01: HLA-A*36:01: HLA-A*43:01: HLA-A*66:01:01: HLA-A*66:17: HLA-A*68:01:01:01: HLA-A*68:01:01:02: HLA-A*68:01:02:01: HLA-A*68:01:02:02: HLA-A*68:02:01:01: HLA-A*68:02:01:02: HLA-A*68:02:01:03: HLA-A*68:02:02: HLA-A*68:03:01: HLA-A*68:08:01: HLA-A*68:113: HLA-A*68:17: HLA-A*68:18N: HLA-A*68:22: HLA-A*68:71: HLA-A*69:01: HLA-A*74:01: HLA-A*74:02:01:01: HLA-A*74:02:01:02: HLA-A*80:01:01:01: HLA-A*80:01:01:02: HLA-B*07:02:01: HLA-B*07:05:01: HLA-B*07:06: HLA-B*07:156: HLA-B*07:33:01: HLA-B*07:41: HLA-B*07:44: HLA-B*07:50: HLA-B*08:01:01: HLA-B*08:08N: 
HLA-B*08:132: HLA-B*08:134: HLA-B*08:19N: HLA-B*08:20: HLA-B*08:33: HLA-B*08:79: HLA-B*13:01:01: HLA-B*13:02:01: HLA-B*13:02:03: HLA-B*13:02:09: HLA-B*13:08: HLA-B*13:15: HLA-B*13:25: HLA-B*14:01:01: HLA-B*14:02:01: HLA-B*14:07N: HLA-B*15:01:01:01: HLA-B*15:01:01:02N: HLA-B*15:01:01:03: HLA-B*15:02:01: HLA-B*15:03:01: HLA-B*15:04:01: HLA-B*15:07:01: HLA-B*15:108: HLA-B*15:10:01: HLA-B*15:11:01: HLA-B*15:13:01: HLA-B*15:16:01: HLA-B*15:17:01:01: HLA-B*15:17:01:02: HLA-B*15:18:01: HLA-B*15:220: HLA-B*15:25:01: HLA-B*15:27:01: HLA-B*15:32:01: HLA-B*15:42: HLA-B*15:58: HLA-B*15:66: HLA-B*15:77: HLA-B*15:83: HLA-B*18:01:01:01: HLA-B*18:01:01:02: HLA-B*18:02: HLA-B*18:03: HLA-B*18:17N: HLA-B*18:26: HLA-B*18:94N: HLA-B*27:04:01: HLA-B*27:05:02: HLA-B*27:05:18: HLA-B*27:06: HLA-B*27:07:01: HLA-B*27:131: HLA-B*27:24: HLA-B*27:25: HLA-B*27:32: HLA-B*35:01:01:01: HLA-B*35:01:01:02: HLA-B*35:01:22: HLA-B*35:02:01: HLA-B*35:03:01: HLA-B*35:05:01: HLA-B*35:08:01: HLA-B*35:14:02: HLA-B*35:241: HLA-B*35:41: HLA-B*37:01:01: HLA-B*37:01:05: HLA-B*38:01:01: HLA-B*38:02:01: HLA-B*38:14: HLA-B*39:01:01:01: HLA-B*39:01:01:02L: HLA-B*39:01:01:03: HLA-B*39:01:03: HLA-B*39:01:16: HLA-B*39:01:21: HLA-B*39:05:01: HLA-B*39:06:02: HLA-B*39:10:01: HLA-B*39:13:02: HLA-B*39:14: HLA-B*39:34: HLA-B*39:38Q: HLA-B*40:01:01: HLA-B*40:01:02: HLA-B*40:02:01: HLA-B*40:03: HLA-B*40:06:01:01: HLA-B*40:06:01:02: HLA-B*40:10:01: HLA-B*40:150: HLA-B*40:40: HLA-B*40:72:01: HLA-B*40:79: HLA-B*41:01:01: HLA-B*41:02:01: HLA-B*42:01:01: HLA-B*42:02: HLA-B*42:08: HLA-B*44:02:01:01: HLA-B*44:02:01:02S: HLA-B*44:02:01:03: HLA-B*44:02:17: HLA-B*44:02:27: HLA-B*44:03:01: HLA-B*44:03:02: HLA-B*44:04: HLA-B*44:09: HLA-B*44:138Q: HLA-B*44:150: HLA-B*44:23N: HLA-B*44:26: HLA-B*44:46: HLA-B*44:49: HLA-B*44:56N: HLA-B*45:01:01: HLA-B*45:04: HLA-B*46:01:01: HLA-B*46:01:05: HLA-B*47:01:01:01: HLA-B*47:01:01:02: HLA-B*48:01:01: HLA-B*48:03:01: HLA-B*48:04: HLA-B*48:08: HLA-B*49:01:01: HLA-B*49:32: HLA-B*50:01:01: 
HLA-B*51:01:01: HLA-B*51:01:02: HLA-B*51:02:01: HLA-B*51:07:01: HLA-B*51:42: HLA-B*52:01:01:01: HLA-B*52:01:01:02: HLA-B*52:01:01:03: HLA-B*52:01:02: HLA-B*53:01:01: HLA-B*53:11: HLA-B*54:01:01: HLA-B*54:18: HLA-B*55:01:01: HLA-B*55:01:03: HLA-B*55:02:01: HLA-B*55:12: HLA-B*55:24: HLA-B*55:48: HLA-B*56:01:01: HLA-B*56:03: HLA-B*56:04: HLA-B*57:01:01: HLA-B*57:03:01: HLA-B*57:06: HLA-B*57:11: HLA-B*57:29: HLA-B*58:01:01: HLA-B*58:31N: HLA-B*59:01:01:01: HLA-B*59:01:01:02: HLA-B*67:01:01: HLA-B*67:01:02: HLA-B*67:02: HLA-B*73:01: HLA-B*78:01:01: HLA-B*81:01: HLA-B*82:02:01: HLA-C*01:02:01: HLA-C*01:02:11: HLA-C*01:02:29: HLA-C*01:02:30: HLA-C*01:03: HLA-C*01:06: HLA-C*01:08: HLA-C*01:14: HLA-C*01:21: HLA-C*01:30: HLA-C*01:40: HLA-C*02:02:02:01: HLA-C*02:02:02:02: HLA-C*02:10: HLA-C*02:11: HLA-C*02:16:02: HLA-C*02:69: HLA-C*02:85: HLA-C*02:86: HLA-C*02:87: HLA-C*03:02:01: HLA-C*03:02:02:01: HLA-C*03:02:02:02: HLA-C*03:02:02:03: HLA-C*03:03:01: HLA-C*03:04:01:01: HLA-C*03:04:01:02: HLA-C*03:04:02: HLA-C*03:04:04: HLA-C*03:05: HLA-C*03:06: HLA-C*03:100: HLA-C*03:13:01: HLA-C*03:20N: HLA-C*03:219: HLA-C*03:261: HLA-C*03:40:01: HLA-C*03:41:02: HLA-C*03:46: HLA-C*03:61: HLA-C*04:01:01:01: HLA-C*04:01:01:02: HLA-C*04:01:01:03: HLA-C*04:01:01:04: HLA-C*04:01:01:05: HLA-C*04:01:62: HLA-C*04:03:01: HLA-C*04:06: HLA-C*04:09N: HLA-C*04:128: HLA-C*04:161: HLA-C*04:177: HLA-C*04:70: HLA-C*04:71: HLA-C*05:01:01:01: HLA-C*05:01:01:02: HLA-C*05:08: HLA-C*05:09:01: HLA-C*05:93: HLA-C*06:02:01:01: HLA-C*06:02:01:02: HLA-C*06:02:01:03: HLA-C*06:23: HLA-C*06:24: HLA-C*06:46N: HLA-C*07:01:01:01: HLA-C*07:01:01:02: HLA-C*07:01:02: HLA-C*07:01:19: HLA-C*07:01:27: HLA-C*07:01:45: HLA-C*07:02:01:01: HLA-C*07:02:01:02: HLA-C*07:02:01:03: HLA-C*07:02:01:04: HLA-C*07:02:01:05: HLA-C*07:02:05: HLA-C*07:02:06: HLA-C*07:02:64: HLA-C*07:04:01: HLA-C*07:04:02: HLA-C*07:06: HLA-C*07:149: HLA-C*07:18: HLA-C*07:19: HLA-C*07:26: HLA-C*07:30: HLA-C*07:32N: HLA-C*07:384: HLA-C*07:385: HLA-C*07:386: 
HLA-C*07:391: HLA-C*07:392: HLA-C*07:49: HLA-C*07:56:02: HLA-C*07:66: HLA-C*07:67: HLA-C*08:01:01: HLA-C*08:01:03: HLA-C*08:02:01:01: HLA-C*08:02:01:02: HLA-C*08:03:01: HLA-C*08:04:01: HLA-C*08:112: HLA-C*08:20: HLA-C*08:21: HLA-C*08:22: HLA-C*08:24: HLA-C*08:27: HLA-C*08:36N: HLA-C*08:40: HLA-C*08:41: HLA-C*08:62: HLA-C*12:02:02: HLA-C*12:03:01:01: HLA-C*12:03:01:02: HLA-C*12:08: HLA-C*12:13: HLA-C*12:19: HLA-C*12:22: HLA-C*12:99: HLA-C*14:02:01: HLA-C*14:03: HLA-C*14:21N: HLA-C*14:23: HLA-C*15:02:01: HLA-C*15:05:01: HLA-C*15:05:02: HLA-C*15:13: HLA-C*15:16: HLA-C*15:17: HLA-C*15:96Q: HLA-C*16:01:01: HLA-C*16:02:01: HLA-C*16:04:01: HLA-C*17:01:01:01: HLA-C*17:01:01:02: HLA-C*17:01:01:03: HLA-C*17:03: HLA-C*18:01: HLA-DQA1*01:01:02: HLA-DQA1*01:02:01:01: HLA-DQA1*01:02:01:02: HLA-DQA1*01:02:01:03: HLA-DQA1*01:02:01:04: HLA-DQA1*01:03:01:01: HLA-DQA1*01:03:01:02: HLA-DQA1*01:04:01:01: HLA-DQA1*01:04:01:02: HLA-DQA1*01:05:01: HLA-DQA1*01:07: HLA-DQA1*01:10: HLA-DQA1*01:11: HLA-DQA1*02:01: HLA-DQA1*03:01:01: HLA-DQA1*03:02: HLA-DQA1*03:03:01: HLA-DQA1*04:01:02:01: HLA-DQA1*04:01:02:02: HLA-DQA1*04:02: HLA-DQA1*05:01:01:01: HLA-DQA1*05:01:01:02: HLA-DQA1*05:03: HLA-DQA1*05:05:01:01: HLA-DQA1*05:05:01:02: HLA-DQA1*05:05:01:03: HLA-DQA1*05:11: HLA-DQA1*06:01:01: HLA-DQB1*02:01:01: HLA-DQB1*02:02:01: HLA-DQB1*03:01:01:01: HLA-DQB1*03:01:01:02: HLA-DQB1*03:01:01:03: HLA-DQB1*03:02:01: HLA-DQB1*03:03:02:01: HLA-DQB1*03:03:02:02: HLA-DQB1*03:03:02:03: HLA-DQB1*03:05:01: HLA-DQB1*05:01:01:01: HLA-DQB1*05:01:01:02: HLA-DQB1*05:03:01:01: HLA-DQB1*05:03:01:02: HLA-DQB1*06:01:01: HLA-DQB1*06:02:01: HLA-DQB1*06:03:01: HLA-DQB1*06:09:01: HLA-DRB1*01:01:01: HLA-DRB1*01:02:01: HLA-DRB1*03:01:01:01: HLA-DRB1*03:01:01:02: HLA-DRB1*04:03:01: HLA-DRB1*07:01:01:01: HLA-DRB1*07:01:01:02: HLA-DRB1*08:03:02: HLA-DRB1*09:21: HLA-DRB1*10:01:01: HLA-DRB1*11:01:01: HLA-DRB1*11:01:02: HLA-DRB1*11:04:01: HLA-DRB1*12:01:01: HLA-DRB1*12:17: HLA-DRB1*13:01:01: HLA-DRB1*13:02:01: HLA-DRB1*14:05:01: 
HLA-DRB1*14:54:01: HLA-DRB1*15:01:01:01: HLA-DRB1*15:01:01:02: HLA-DRB1*15:01:01:03: HLA-DRB1*15:01:01:04: HLA-DRB1*15:02:01: HLA-DRB1*15:03:01:01: HLA-DRB1*15:03:01:02: \ 105 | HLA-DRB1*16:02:01:| $samtools_bin sort --thread $num_processors -m $samtools_sort_memory_per_thread -O BAM - > $sampleid.extract.bam 106 | 107 | OUT=$? 108 | if [ ! $OUT -eq 0 ];then 109 | echo 'Something went wrong while running bwa/samtools to align extracted reads to 38DH_NoAlt (38DH_NoAlt)' 110 | exit 1 111 | fi 112 | 113 | #rm $sampleid.tmp.extract* 114 | 115 | echo ">>>>>>>>>>>>>> indexing extracted bam (38DH_NoAlt)" 116 | $samtools_bin index $sampleid.extract.bam 117 | 118 | echo ">>>>>>>>>>>>>> bamUtil fastq extraction (38DH_NoAlt)" 119 | $bamUtil bam2FastQ --in $sampleid.extract.bam --gzip --firstOut $sampleid\_extract_1.fq.gz --secondOut $sampleid\_extract_2.fq.gz --unpairedOut $sampleid\_extract.unpaired.fq.gz &> /dev/null 120 | 121 | OUT=$? 122 | if [ ! $OUT -eq 0 ];then 123 | echo '$bamUtil fastq extraction Failed! (38DH_NoAlt)' 124 | exit 1 125 | else 126 | rm $sampleid.extract.bam* $sampleid\_extract.unpaired.fq.gz 127 | fi 128 | 129 | echo ">>>>>>>>>>>>>> bwa mem to hla panel for Kourami " 130 | $bwa_bin mem -t $num_processors $merged_hla_panel $sampleid\_extract_1.fq.gz $sampleid\_extract_2.fq.gz | $samtools_bin view -Sb - > $bam_for_kourami 131 | OUT=$? 132 | if [ ! $OUT -eq 0 ];then 133 | echo 'bwa alignment of extracted reads to HLA panel faild...' 
134 | exit 1 135 | fi 136 | -------------------------------------------------------------------------------- /resources/DRB5_gen.txt: -------------------------------------------------------------------------------- 1 | HLA-DRB5 Genomic Sequence Alignments 2 | gDNA -313 3 | | 4 | DRB5*01:01:01 ACGATAGGGA CCCAGTTAAA GTGTTTTACG TGCAACTGGA TCAAATCTTT CAAGTACTAA TTTAAAACAA TCCTTTAAAT AAGGAAATTC TGTTTCAGAA GAGGACCTTC ATACAGCATC TCTGACCAGC AACTGATGAT ACTATTGAAC TCAGATGCTG ATTGGTTCTC CAACACGAGA GTACCCAAAC CAGGAGGAAG GAAATCAGTA ACTTCCTCCC CATAATTTGG AATGTGGGTG GAGGGGGATC ATAGTTCTCC CTGAGTGAGA CTTGCCTGCT CCTCTGGCCC CTGGTCCTGT CCTGTTCTCC AGC | ATGGTGT GTCTGAAGCT CCCTGGAGGT TCCTACATGG CAAAGCTGAC AGTGACACTG ATGGTGCTGA GCTCCCCACT GGCTTTGGCT GGGGACACCC GAC | GTAAGTG CACATTGTGG GTGCTGAGCT ACTATGGGGT GGGGAAAATA GGGAGTTTTG TTAACATTGT GCCCAGGCCA GGTGCCTTAA GAAATTGTGA CATTTTCTTC AGAGATTGCC CATCTTTATC ATGGGATCCC AAATTATTTC CTCCACAAAA GGAGCTTGAC TACTTGCCCT CTCCATGAGA CTGTGTAAGG GGCCTCCATA CAGGTCATTT CTTCTCAAAT CTCCACCAAT GAAACCTTTT CATCACATGT CCTCAGGGTC TTTAGAGGAT TTAGAAATAA GGATGCTAAA ATAAATTCCC CATACAGCAC TTCCCTTTAT TATGTTGACC TATGTCAGAC AAAACGAGTT TTTTTCTGAA AATTTTGTGG GAGTCAAGGG AATTCAAAGG GTCTCTCCTA GACAATCCTG TGTTATGCCC TTGACAGAAC CTGTGATATT GGCTCCTCTT CCTCATATGT GAGAATGGAC CCAGTGGCCT CCCCATTACC TCCTTTCTTT TCTTTCTGAA CTCCAATGTT TACCCTGTAA TGTATGCAAG GTCTCTGACA GAAGTTATGC TTAGTGCTTT TTCTTATGGG GAAAAATCCT TGGAGCTGAA GCTGAGATCT TTAGTATGTG GAGTCACCCT ACAGTTAAAG GGCATCTATG AGGTATTCTT TGGTGCCTAA AGGACTTAAG GCATCCTCTA AAAACCTGGC CCAGGTTAGA GTTTATTACA AATCTTTTTT AACCTTTCTA TACTATTTGC TCCTATATCT CCTACATGCT CTAACTAGAC ATGACAGGAA GAGATTCAAC TAACATAGGA CAAATTATAT GAAATTCTAT TTTTGTAAGT CAAAAATAGT CAAATATCAG AAATTTGATA AGGTTAAAAC TATATACTCT GTGTGGGGTT ACAGAGAGAA TGTGGACATT GTTCACATCT CATAGGGCTG AAAGTTAATG ATCAAGTCCT GGGAACTCAT TGTCTTACTG GGGTCTTGTC CTAAATTTCA TAGGTTCACC CGTCATGCCC TCAGCTTTCC TTAATTAGCC ATGTCTGCTT ACCTCTTCCT CCAGTTTCTC TCTATTTTTC CCCAGCTATG TTGTCATCAT ATCCAGAAAT CCCTAAAGCT 
TGCACAGATC CTTAGCACTA TGAGATCCAT TGAAAGAGAT AATTTTTTTC TTTTTGAGAT AGGGCCTGGC TCTGTCACCC AGGCTGTAAT GCAGTGGTCC GATCTAGGCT CACTGCAACC TCTGCCTCCA AGACTCAAGC TATCCTTCCT CCTCAGCCTC CAGAGCAGCG GAGACTACAA GCCCAACTAA TTTTGTTGGA GATGAGATTT CTGCCATGTT GCCCAGGGTG GTCTTAAACT CCTGGACTCA AACAATCCTC CTGCCTTGGC CTCCCAGCAT GCTAGGATTG TAGGTGTAAG CCACCGCACC CAGGCACGGT GAATTCATTG CCCAGGCAAT GAATCTCAAT GAAAAATTTC TTTTTTCTTA AATCACTGTT TCTTTATCTG TGAATTCTTC TTCCAATTAG AAGGAGGAGA AAGAAGAAAT TTGCCTGTAT TTCTCACCGG GAGGAGAAGG ATTCTAGTGT GACATCAAAT TGAAAGAGTG CTGGAGCTCG AGCCCCTTCT TGCTTTCCAG GATCCTTGCA GTGATCAGTT CCCAGAACCA TGGTTTATTA ATATAAACCA CAGTTATTTT TATCAGCAGC TACTCTGTAC TGGGCTCCCT TCTAGGTTCA AATCATTCTA TTTGAGTAAG ATAGAGAGGG TCCCGACTCT CAAGGTTACA CAAGAGTGGA GGAGACAGAC ACTAACCCAA TAAGCATTTA ACAAAGAAGA AAATTTCAGA GAGTCATAGT GCACTGAAGA AAAGACATCA GGTTTGTGGA AAAGAGAGAA ATGGATTCAC CTACTTTAGT TCATATGTCT AGAGAGCCCT ACTTGAGAAA GTGACATCCA GCTGAGAAAA CAAAATAAGT AGACAGTCGT GAAGATCTAA GGGACGAAAG TTCCAGGGAG ACCGAATTGG GGGGGAAGCC CTGGTGTGGG AAATTATGTG CAGGGACAGA AAGAAGGCTA GAGGGACTGA AGTGAGCAAG CAAGGAAATG CAGTGGCAGA AGATGAGGTA GGATGCAGAG AGGAAGTCGG GAGCCTCATA TCAGGCTCTG ATGTCCATGG TAAGAAACTT GAATTTTATT TTATTTTATT AATTTATTTA TTTATCTTAT TTATTTTTGA GAGGGAGTTT CATTCTTGTT GCCCAGGCTG GAGTGCAATG GCATGGTCTC AGCTCACTGC AACCTCTGCC TCCCGGGTTC AAGTGATTCT CTTGCCTCAG TCTCCCAAGT AGCTGGGATT ACAGGTACCT GCCACCATGC TCAGCTATTT TTTCTGTGTT TTTACTAGAG ACAGACTTTC ACCATGTTGG CCAGGCTGGT CTTCAACTCC TGACCTCAAG TAATCTGCCC GCCTGAGACT CCCAAAGTGC TGGGATTACA GGCGTGAGCC ACTGCGCATG GCCTGAATTT TATTTAAATA GATATGGGAA GCTACTGGAT GGTTACAAGG AAAGTCAATT TATATTCTTT TTTTTTAATT TTTTAATTTT TTTTTTTTTT GAGATATAGT CTTGCTCTGT CACCAGGCTG GAGTGCAGTG GCACTATCTC GGCTCACTGC AGCCTCTGCC TTCTGGGTTC AAGAGATTCT CCTGCCTCAG CCTCCCAAGT AGCTGGGATT ACAGGAGTGC ACCACCACGC CCAGCTAATT TTGGTATTTT TAGTAAAGAC AGGGTTTCAC CATGTTGGCC AGGATGGTCT TGATCTCTTG ACCTCGTGAT CCACCTGCCT CAGCCTCCCA GTGTGCTGGG ATTACAGGCG TGAGCCACCG TGCCCGGCCT ATATTGAATT TTTAAAACTA ATTCTAGCTA 
CTCTGTGGGG ATTGGATTGT TGGGGTTGAC AAGTGGTTAG GAAGACTATT TAGGAGCACA GCAGGGAATC CTCAGGGAAA ATAGGCTTGA GTTCACTAGT GATAAAGACA GTGAAAAAGA TAAAGTGGAT TTTGCTTACC TTGTTAATGG ATTACTGTAA AGGGGGTAGA AATATCAAGC TTATTCCTAA GGATTTTGTT TTGACAAATT AGTGAAAGGT GGTGGTGTTT ATTGAGATAG GGAAAACTGT GGGAGGAAAT GATTTGAAGT GGGTGGTTGG AAATAAAAGT TTTGTTTAAG TTTGAGATGA TTTATTGACA TTTGTGTGGA GCAATCAGAA GGTCAATGGC ATTTAAAAGA CTCATGGTGG GCCAGGCGCA GTGACTCATG CCTGTAATCC CAGCACTTTG GGAGGCCTAG ACAGGCGGAT CACCAGGTCA GGAGATCAAG ACCTTCCTGG CTAACACGGT GAAACCCCGT CTCTACTAAA AATACAAAAA ATTAGCTGGG CATGGTGGTG GGTGCCTGTA GTCCCAGCTA CTTGGGAGGC TGAAGCACGA GAATGGCGTG AACCAGGGAG GTGGAGCTTG CAGTGAGCTG AGATCACACC ACTTCACTCC AGGCTGGGTG ACAGAGCAAG ACTCCGTCTC AAAAAAAAAA AAAAAAAAAA GAGACTCATG GTGAGGCCAG GGCTGGAGGT ATTTATGTTG GTGGCATCAA TAGGTGTACT GTGTTAAATT CCAGGGAGTG GAAGAGGTTA CATAGGGAGA TTGTCTGGAG AAAAAAGAAA AGGGCACAGG CCAGCAAAGG GGACTAAGAA AGAGCCCAGG GATGTTGGGA AAAAACCAAG ATTACATCAT ATGTGTAAGT CAAGAAAAAT AGATTTTTTT TTTTCAAGGA GAAGGGAGAG GCCAATTGTG GTGAGTACCA CTAAGAGGAG GGTGAAGTGA GAATGTGACA GAGAAGCAAC GGCTGGGATT GGTGGAGCTG ATATTTGCAG TCAGTAGAGT ATCCAGGGAG GAAACTGGAT TGGACCATTT GAAGAGCGAG TAGAAGAGAG GACAAGGTTA AGGTTGACTG TTTTGAGTAG CGAGCTTCAG GGAAGGACTG TGCTCTGGGT TCAGGGAGCC AGCTGGATCA AAAGGAAAAG GCTAAAGAGG CTGAAGAGAA GCAGGAGGAC CTGTGAACCA GAGATGCTCG GTCATTATTA TCAAGGAAAT ATTAGAGGGC CCCTGTGTGC AGTGGTGACT GCTCATGAAG AGGTCACACA GCCAGTATTT CACACAGCCC ATATTTATTA GTGACTTAGA ATATACCAGT TACTACTCTA GGTCATGAGA ATGGAGTGAT GAATAAAATG AATCTGGTCT CCATCAGTAT ATGCCATGTA ACATTTTGCA GTGACTGTGT ACCAGGCCTA TGAATTTCAG TATGCAATTT CAATAATGAT CCTGCTGTAT CTGTGGTATA TAAAAACATG TACATCTCTG GAATCTAAAA TTGAAAGGTT ATAAGTAAAA CCCAGTATCA CAAATTTGGT GCTAGAAATC AGATTGCAGT TTAAATCTGA GCATGTAGAA AGTCCATTTC TTCTATGTCA GCAGGTGCCT TTTGTGTGAG GTTTAGGTAT ACTGCATTAT TAGACATAAA CCAGTGTTTC TGCCCTCTGT TTTCAGAATG ACAATTCTTT ATGAAACTAA TAGAAGAACA GAAGACAATT GCAAAATCAT GATGAAGATA CTAATTGCTT TAGAATCATG TAATAGAAAA AATAATGTGA GCTCCAGTTA TAGGGATCAA AAAATTAAAA 
TGGGAATACA TTTGAGTGTT TATTATGTGA TCACTGCTAA GAAGACTCTT TATTAAATTT CACACTTACA AAAATCCTGT GAGGATTATT ATAAAATGCA TTTGATAGAT TACAAATAGG CTTATGGTTG GCAAAAAGTG ACCCATTTAA AAGAGGTCAT ATTTCAATTC ATATTTTCTG ATTCTAGAAT ATGAGAGTTT TTCCATCATT AGTGAGTAGT GACTTGATTG TGTCTGAATT ATTGACAAAA TTTCTAATAT TCATATGTAC CAGGTTGTTT CTTAGAGTGG GGGCAGAGAT GCAAGGGCTT CTAGTTCCAA TGTATAAAAG AAACTTTCAT TCATTTTGCA TTTATCATTT TAAAAGTTCT ATAAGTCTAT CATAGGCATG TGTTGAAGAA CATAAAGAAG TATTAAATCA TTCCTTCTTC TGAGGTTTGA CTAGCAAGTC TGGCTAGGGT TGTCAGATAA CAAACAGGTT CCTAGTTAAA TCTGAATTTC AGACACACGA CCACAATTTA TTGGAAATCC AGATTTACCT GGGCATCCTC TGGTTTTATT TGTCAAATCT GTCAACCCTA AGTGGGACAC ATGAGCATGG ATTACAGTGC TAGCCATGCA AGCCACAGTG ACAGTGACTT TACAAATGTT TATTTTTTAA CATTCTGTCT GTAAAGAAAG TGCTTACATA ATTTAGGAAT AGAAAGATAG ACATTGTTTG ATCCAGGGTG CACACCTCTC TGCCATCATT TCTAAAGGGC AAACGGAGCT TTGTGAAGGT CTCTACACAA AGTCTGGGGA CCTGCTCATG TTTTGTAAAC TGTCTGTATG AGAACGTCAT TTTCTTGGTT TCTGACTTTC TGAGGGGACT TGACTACAAA ACCAAGAGTT CTATCTCTGG CCAAGGCTGG TAATTTGATG CCTGCTAGTA TTGTTGGGAG TGGGAGACTG AAAGACGTGA GTTAGTTGGG GCATTAAATG GGAATAAAAT AGCTGTGGTT GTGATTCACT ACTACAGATA ATTAGTGGAC AAGTGGCAGA GAAATTAAGA AAGAAGATGA TATGAAAGAC AAATGATATG ATTTGGTGAC TGGTTGGTAA TGCAAGGAAA TCAGTAAATC TTAGTTCTCA ACAAGTTCAT TTTCTGGAAA GATAGCACTG TATTGGGACC AGAATTCTAT AAAGCATTCG TTTTATGTAG GACCAAGATA TTCACAGATA TTTTTCAATA CAATTCTCAG CTGCTCCATA ACCACTAGTG GCTTGTTCAA CACAGGTTTT TTCAGATGGT TCACACCTGT GGTTCTTACC CAGGGATCGT TCACCACCGC TCCTTTCCCT CCCATCACCC TTGGGGAACT TGTGGCAATG TTTGAGCAAT TTTTGGTTGT CACCACAGGG GTTTCTTCTG ATATTTAATG AGTAGAAGCC AGGGACACTG CTAGAGAACC TACAATGTTC AGAACAGCCT CCTTTACCAA CAAAGAATTA TCTGGACCAA AATGTCAATA GTGCTGAGGC TGAGAGCACT GTTTCACACT GTGCTCTTTC TGAAAATTCT AGACTCACAT CTTTTATACA CTCAGCACCC AATTCAGTTC TTTATGGTTT ATTTTTGCTT GCTTCATTAT GCAAAAGTAG ACAGTTGCAT AAATTCAACC ACTTTCTTGT TGAATCCATT TAGTCAATGC AAGTTTAATA GCTTCATAGT TAGTTTTTGC CTTATGCAAT ATTTTTCAAC ATTTTCATGA GTTGTTGGTC AGCACTACCT CTATTAACTT TCAACAACTT GCCCTTATAA GTCACAAATA 
GTGATGCTCT GCAATTATTT TTCACTAACA TGCCTGAGAT TTCTGTAGTG ATTCTACATT TGGTATGATT CATAATGTAA AATGCTTTTA TTTATTCATT TCACTTTTAA CCAGAGGATT ATTTTTCAGT TATTTTTGTC ATTTTTACAC TTCAATCAAA CATAAAGACA AAAACATCTA AAATATATAG TATTTTACAT ATGTGTATAT TTACACACAT ATATGTATGT ATGTTTACAT GTATTGAAAC TACAGAAGCT CATGTCACCA ATAAGAGCTC TGAGACACTT TTGACCCCTT ACTCTTATCA GGTGAAATTT GCCAAATAAG TTTTATGAAC AAATTTCTTT TAACTGAATA TCTGAGCTTT GTGGATTTAG AAATGCAAAG GAAGGTTTGT GGACATTCAT GGGATCATGG TTTTATTCTC CTTAAATCTC TTTGATACTT TCCCACTGTC TTTAGCATAA ATCTCAGATC CTAACACCAC CCACAACGCT TTTCAATACC TGGCTTCTTG TGATGTCTCC AGTCTAATCT TTTACCCTCC TTCCCCTCAG CCTCTCTGCT TTAGTGAACT TTGTTCTAGT TTCTTGAAGA AGTTCATCAA TTCAAGCTTT TGTGCATGGT ATTTCCTAAA CCTGAAATGT GCTTCCCCCG TTTTGCCCAA ACAGACACAG AGTCTCCACT CCGCCCCCTG GTTCACACCT GCTTAACCTG CCAAGTCACA TCTGAACTGT CACCCTTCAG AAGGTCCTCC TCTGGCACCC TAATGTAATT GAGATCATCC TATTACTCTG TGTTCTGGAA CTCCACACTT CAGACATTTC TCATTCCTGT CTAAGCTCTT GTGTGTTTGG TTTTTGGCCT CACTTTTACT GCTCTTTAAG CTCCCCCAGC GGAGTGGAGA GGTCTGTTTT CCCGTGTTTG GATTCCCAGA GGCAGTGCAA GCCTGGCACA AGGTCATCAC TAAAGGAGTG TTCACAGGAT GAACGCGGTG GGAGTTAGGG AACGGGTAAA GCCTGTGGGA TGAGAGAAGG AGCAGAGATT GTCTTTGGGG TGCAGGCTCC CAGGAGGAGG CGGCGCGGGC TCCGGTGCTG GGCGGATCCT CCTCCAGCTC CTGCTTGGAG GTCTCCAGAA TAGGCTGGAG GCGGGGAGGG GGTCCCAAAA GCCTTGGGAT CAGAAGGGGT TTTCCTGCCT GGTCCCCCAG ACACCCCGTT CGCCTCAGGA AGACGGAGGA TGAGCCTCTG GGCTGCGCGT GGTCGGGGTT ATGGTTGGGA TCAGTTAAGG TTCCAGTGCC CGCACCCCGC CCAGGGAGCC CCGGATGGCG GCGTCTCTGT CAGTATCTTC CCCGGAGGCC GCCCCTGTGA CCGGATCCTT CGTGTCCCCA CAG | CACGTTT CTTGCAGCAG GATAAGTATG AGTGTCATTT CTTCAACGGG ACGGAGCGGG TGCGGTTCCT GCACAGAGAC ATCTATAACC AAGAGGAGGA CTTGCGCTTC GACAGCGACG TGGGGGAGTA CCGGGCGGTG ACGGAGCTGG GGCGGCCTGA CGCTGAGTAC TGGAACAGCC AGAAGGACTT CCTGGAAGAC AGGCGCGCCG CGGTGGACAC CTACTGCAGA CACAACTACG GGGTTGGTGA GAGCTTCACA GTGCAGCGGC GAG | GTGAGCA TGGTGGGGGG CGGGGCCTGG GTCCTTGTGA GCTGGGAATC TGAGTGTGTG TGTGTGTGTG TGTGTGTGTG TGTGTGTGTG TGTGTGAGAG AGAGAGGAAG AGAGAGACAG AGAGGGAGCG CGCCATCTGT GAGCATTTAG 
AATCCTCTCA ATCTTGAGCA AGGACTTCTG AGGACACAGG TGTGTGTGTA GAGTGTGGAT TTGTCTTTGT GGCCTTTGTG GGAGGGGAAG CAGGAGGGGG CTTCTTCTTA TCCTTGGAGG CCTCTGTGGG GAGGTGACAT GGGAGGTGGG TGCAGGGGGC TGGAGAGAGA GGAGACCTTG GTTGTCTCGT GTCCTTAGAG ATGCAGGGAA GGAAAGTGTA AGGTGTGTGT GGTTGGGGTG AAGGTTTAGG GGAGGAGAGC TGAGGGGTGT GGAAGATTTG TGATAATATG AGGAGGCCAG TTCCAGACTG TCCCTGGCAC CCACCCTTCA CGCAGTCTCT GAAATAAAAG TGTGTGGTGT TTGTTTGCAA AAGCATTAGA TTAATTTCTA GGGGAATTGA GGAGACCTCT GAGGCACTTC TGAAGCTTCT TTAGGTCTAA ATTTCTTGCT ATTTTTTCTG TTTGTTTTCT TAGTGTGTAT ATTTTTACAT AGTTGAAATG ACTGTGAAAC TAACTTTTTG ATTAAAATTT TAATACACTG TTACTATTTT ATTATAATGC TAATAGTTTT CTACTACTTA CCTATTATTC TTTTATATAT AATAGTTGTG ACACAACTTA TCTCACTTTC CCCTTTGTTG ACCTTTATTA TGACATTCAC CAAAAGTTGA AAATATATGT TTCTGGTTAT TATTTAATTT ATTTTTTTAT TTGGAATTCT TTTGAATTAT CTTGACCTAT TTATTGCCCA ATTATAATTA CTGCTCTAAG AATTCCCTAT TGTATTTGGT AGGTAATGGA CAATGATCTA TTTTCTGTTA TCTCCAGGGC TTAGTATTTT TCTCAGTGAT TTTGTGGGTT CTTTGTACTG TAAGATTATT AACACTTTAT TGATATTTGA TTCCACATTT TCTCCAGTTT TTGATTTGTA TGTTGATTTT GAAAATTCTA TGTTAAGAAT TTGAACATTT CTGTTTAATA AAATATATTC CAAAACTTTT ATTAATGGTT TACAAACCAT CTTAAATCTA TCATTTTGTG GTATTTTTGT CTCCAGGTTT CTCCTTCCTT CTTAAAAAAA ATGTATTTAT TGAGAGTCTG CTAGTGTTAG GGATTTTCCT AGGCATAAGC ACCCCAAGTA ATGAGTCCCA GACCCTGCCT GCCTTGATCC AAATGTCATT CTGGAAAGAA AAATTATTTT ACAATGATAG TATAATAATA GTTATGCTTG TTTGCATGGG AGATGCATTG ATCAGCTAAA TGTAAATGTA AGAATTTTCA AAACTAAAAT AACTTTCCTT AATCCTTCTC TCTGCTTTAT GACTCATGCT TTGCTGGGAA CTTAAAGATT GGAGAACCAT TTCTGTCTGT CCTACCTTCC CAGGGGCACA ACCATTTCTG TGTTGTTCTA AGGTGTGAGT GCATGGCAGT AGTATTCCTA AACATTCATA CTCAGTCTCC TTTTGTACCC TACTCGGTCC CTTTATCTAT CCACATTACT TTAAATCATA TTTTTCTCTC AAACTGTACA AGGATGATAA ATAGGTGGCA AGTGGAACAC CCAAGTGTGA TGAGCCCTCT CACAGTGGAA TGGAGTGAGA AGCTTTATGA CCTCATAAAT TAAAGGCTAT CTTCAGTCAT TGTTTTATAT ATTTTATGTG CATTAATCCT CATATAACCC CAAGAGGTAA ATTACTATAA TTATCCTCTA TTATGGGTGA GAAAGTTGAG ACACAGAAGA ATCGAAAAAC TCTTCCAGCA TCAACCAGTA AAAGGCAGAC CTTGGATTTG AGCCAGGCAA CCTGGCTCAG 
GTATCAGTTT TAATTACTAC ACTCTGTACT TTCAAAGACT TGTAAACACT TTGACAATGC ATCCCAATTT CAAGTGATGA AGAAACAAAC ACAATTTTTC ACATCTCTCA AATCTGATGA GCCCCCACTA TAAAGACTAA ATTCCAGGCT GATGACACTG TGAGGCCTCA TGGCCAGCTG TGCTGGAGGC CTGGTCAAGG CCAGAGCCTG GGTTTACAGA GAAGCAGACA AACAGCCAAA AAGGGAGATT CACTCTGTCT TCCTGAGTCA TTCCCTCTAC ATTTCCTTTC TCCTAG | TTGA GCCTAAGGTG ACTGTGTATC CTGCAAGGAC CCAGACCCTG CAGCACCACA ACCTCCTGGT CTGCTCTGTG AATGGTTTCT ATCCAGGCAG CATTGAAGTC AGGTGGTTCC GGAACAGCCA GGAAGAGAAG GCTGGGGTGG TGTCCACAGG CCTGATTCAG AATGGAGACT GGACCTTCCA GACCCTGGTG ATGCTGGAAA CAGTTCCTCG AAGTGGAGAG GTTTACACCT GCCAAGTGGA GCACCCAAGC GTGACGAGCC CTCTCACAGT GGAATGGA | GT GAGCAGCTTT CTGACTTCAT AAATTTCTCA ACCACCAAGA AGCAAACTTT ACTAATCCCT GAGTGTCAGG TTTCTCATCT CCCACATCCT ATTTTCATTT GCTCCATGTT CTCATCTCCA TTAGCACAGG TCACTGGGGG TAGCCCTGTA TAGTTTCTAG AAACACCTGT ATCCTCTGGG GAAGCAGTCA TTCCTGGCAG GAAGGAGAGG CTGTCCCTGT TTTGAACCTC CCCATGATGT CACAGGTCAG GGTCACCCCC TCTCCCAGGG CTCCAGACCC TGCCTCTGGG TCTGAGACTG TGTTTCTGGT GCTGTTGATC TGAGTTATTT GTTGTGATCT GGGAAGAGGA GAAGTGTAGG GGCCTTCCTG ACATGAGGGG AGTCCAATAT CAGCTCTGCC TTTTATTAGC TCTGTCACTC TAGACAAACT ATTTAACCTC ATTGAGTCTC AGGCTTTCTG TTTATCAGAT GTTGAAGCCG TGCCTTACAT CAGGGCTGTA ATATTAGAAT GAATTTGATC CCTGAAACTT GTAACTGTTC AGTGTGATTT GAAAACCTTT TTTTTCTCCA GAAATGGCTA GTTATTTTAG TTCTTACAGA GCAGCCTTCT TCCCCATTTT CAAAGCTATG AATATTGCAG GGTCTCAATT AAAAAGGTTC AATTTGGGAT AAAAATCACT AAACCTGGTT TCCTCTCCCA G | GAGCACAGT CTGAATCTGC ACAGAGCAAG ATGCTGAGTG GAGTCGGGGG CTTTGTGCTG GGCCTGCTCT TCCTTGGGGC CGGGCTATTC ATCTACTTCA AGAATCAGAA AG | GTGAGGAG CCTTTGGTAG CGGCTCTCTC CATAGACTTT TCCAGAGGAG GAAATAGGGC TTTGCTGAGG TTAGTTCTCA GTATATGAGT GGCCCTGGAT AAAGCCTTTC TTTCCACAAA TGACCTCCAA TGCCCTGCTA ATCCAGAAAT CATTAATGCA TGGTTACTAT GTCAAAAGCA TAATAGCTTG TGGCCTGCAG AGATAAGAGA AAGGTTAACT AGTTAAGGGT CCTTTGGTTT GAAATCCTGG AGCAAATTAA GGAAGAGCCA CTAAGGCTAA TGCAATTACA CTGGATCCTG TGACAGACAT GTCATGCTTC ATGGGTCACA TGGTCTGTTT CTGCTCCTCT CTGCCCTGGT TGGTGTGGGT TGTGGTGTTA GAGAAATCTC AGGTGGGAGA TCTGGGACTA 
GGACATTGTG TTGGGAGAAT AGATTTGCTT CCATACCTTT TAAGTGTATA TCTTTTCCTC TTTTTCCCAG | GGCACTCTGG ACTTCACCCA ACAG | GTAATA CCTTTTAATC CTCTTTTAGA AACAGATTCA GTTTTCCTAG AATGATGGCA GAGGTGATAA GGCATGAGAC AGAAATAGCA GGAAAGACTT TGGATCCAAA TTCCTGATCA GGCAATTTAT ACCAAAACTC CTCCTCTCCA CTTAGGCCTG TGCTCTGCAG GAGTATTGGT TCAGGGAGAC TTAGGAACTT GTTTTTCTTC TTCCTGCAGT GCTCTCATCT GAGTCCTTGA AAGAGAGGAA AAGAAGCTGT TAGTGGAACC AGGTCTGAAA ACAACACTTT CCTCTCTCTC TGCAG | GACTC GTGAGCTGA | A GTGCAGATGA CCACATTCAA GGGGGAACCT TCTGCCCCAG CTTTGCATGA TGAAAAGCTT TCCTGCTTGG CTCTTATTCT TCCACAAGAG AGGACTTTCT CAGGCCCTGG TTGCTACCGG TTCAGCAACT CTGCAGAAAA TGTCCATCCT TGTGGCTTCC TCAGCTCCTG CCCTTGGCCT GAAGTCCCAG CATTGATGGC AGTGCCTCAT CTTCAACTTT AGTGCTCCCC TTTACCTAAC CCTACGGCCT CCCATGCATC TGTACTCCCC CTGTGTGCCA CAAATGCACT ACGTTATTAA ATTTTTCTGA AGCCCAGAGT TAAAAATCAT CTGTCCACCT GGCTCCAAAG ACAAAAAATA AAAAGAAAAG AAAAAGGGAA GATTATTTCC TAATAGCATA ATGGTTTTTA TATGTATTTC ATAAGTATGT GAGGTAATGC ACATATTAAA TAGCTTGATT TAGTCATTCC ACACTATAGG CATATATCAG AACTTCATGC TGTACAACAT AAATATACTA TACAATTTTT ACTTGTCAAT AAAAGTAAAC CTAACATTTA AAAAGGCAAT GCATAAAAAT TGAGAACAGA TTATAACAAC TGAAACAACT TGGCCAACAT GAGATGAGAA ACCAGCTAGC AAGTC 5 | -------------------------------------------------------------------------------- /scripts/alignAndExtract_hs38.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Part of Kourami HLA typer/assembler 3 | # (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | # See LICENSE for licensing. 
#

#!/bin/bash
# NOTE(review): a shebang is only honoured on the very first line of a file.
# Sitting below the license header it is an ordinary comment, so invoke this
# script explicitly with bash (it uses bash-only syntax such as `function`
# and `&>`).

# Directory containing this script; the default db/ and resources/ locations
# are resolved relative to it. "$0" is quoted so an installation path that
# contains spaces still resolves.
pushd "$(dirname "$0")" > /dev/null
SCRIPTD=$(pwd)
popd > /dev/null

# Tunables for the samtools/bwa steps and default input locations
# (-d and -r below override the panel db and the reference).
samtools_sort_memory_per_thread=2G
num_processors=8
kourami_db=$SCRIPTD/../db
resources_dir=$SCRIPTD/../resources
grch38_HLA_NoAlt=$resources_dir/hs38NoAltDH.fa
grch38_HLA_NoAlt_index=$resources_dir/hs38NoAltDH.fa.bwt
me=$(basename "$0")

# Print the help text and terminate the script with exit status 1.
# Also used for -h, so asking for help exits non-zero as well.
function usage {
    echo "HLA-related reads extractor for Kourami"
    echo "Note: Use this if you have bam file aligned to GRCh38 (primary assembly + [decoy] : 38)"
    # The synopsis lists the two required positional arguments, which the
    # script reads as $1 and $2 once the options have been shifted away.
    echo "USAGE: <PATH-TO>/$me -d [Kourami panel db] -r [refGenome] <sample_id> <bamfile>"
    echo
    echo " sample_id : desired sample name (ex: NA12878) [required]"
    echo
    echo " bamfile : sorted and indexed bam to hs38_NoAlt or hs38D_NoAlt (ex: NA12878.bam) [required]"
    echo
    echo "------------------ Optional Parameters -----------------"
    echo
    echo " -d [panel DB] : Path to Kourami panel db. [Default: db directory under Kourami installation kourami/db]"
    echo
    echo " -r [Ref Genome] : path to hs38NoAltDH (primary assembly + decoy + HLA [bwa-kit])"
    echo " USE download_grch38.sh script to obtain the reference."
    echo " MUST BE BWA INDEXED prior to running this script."
    echo " If not given, it assumes, hs38NoAltDH.fa is in resources dir."
    echo
    echo " -h : print this message."
    echo
    exit 1
}

# print usage when no argument is given
if [ $# -lt 1 ]; then
    usage
fi

# The leading ':' puts getopts in silent mode: an unknown option is reported
# as '?' and an option that is missing its argument as ':', in both cases
# with the offending option letter in OPTARG.
while getopts :d:r:h FLAG; do
    case $FLAG in
        d)
            kourami_db=$OPTARG
            ;;
        r)
            grch38_HLA_NoAlt=$OPTARG
            grch38_HLA_NoAlt_index=$OPTARG.bwt
            ;;
        h)
            usage
            ;;
        :)
            # Without this branch "-d" or "-r" given without a value would be
            # dropped silently and the defaults used instead.
            echo "Option -$OPTARG requires an argument. See usage:"
            usage
            ;;
        \?)
            echo "Unrecognized option -$OPTARG. See usage:"
            usage
            ;;
    esac
done

# Drop the parsed options; what remains must be <sample_id> <bamfile>.
shift $((OPTIND-1))

if [ $# -lt 2 ]; then
    echo "Missing one or more required arguments."
usage
fi

# Positional arguments that remain after option parsing.
sampleid=$1
bam_path=$2

merged_hla_panel=$kourami_db/All_FINAL_with_Decoy.fa.gz
bam_for_kourami=${sampleid}_on_KouramiPanel.bam
samtools_bin=$(which samtools)
bwa_bin=$(which bwa)
bamUtil=$(which bam)
if [ -z "$bamUtil" ]; then
    echo "missing bamUtil"
    echo "bamUtil available from https://github.com/statgen/bamUtil"
    exit 1
fi
#bamUtil=$HOME/bamUtil_1.0.13/bamUtil-master/bin/bam

if [ ! -x "$samtools_bin" ] || [ ! -x "$bwa_bin" ] || [ ! -x "$bamUtil" ]; then
    echo "Please make sure samtools, bwa, and bamUtil are installed"
    exit 1
fi

if [ ! -e "$bam_path" ] || [ ! -e "$kourami_db" ] || [ ! -e "$merged_hla_panel" ] || [ ! -e "$grch38_HLA_NoAlt" ]; then
    # bash's builtin echo does not expand "\n"; print the blank line explicitly.
    echo "Missing one of the following files/directories (38):"
    echo
    echo "$bam_path"
    echo "$kourami_db"
    echo "$merged_hla_panel"
    echo "$grch38_HLA_NoAlt"
    exit 1
fi

if [ ! -f "$grch38_HLA_NoAlt_index" ]; then
    echo "hs38NoAltDH must be bwa-indexed first."
    # bwa index is run on the FASTA; the .bwt tested above is the file it creates.
    echo "Please run bwa index $grch38_HLA_NoAlt"
    exit 1
fi

# chr6 regions queried by both samtools view passes below.
hla_regions=(
    chr6:29723340-29727296 chr6:29726601-29749049 chr6:29826979-29831122
    chr6:29887760-29891080 chr6:29942470-29945884 chr6:30005971-30009956 chr6:30259562-30266951
    chr6:30489406-30494205 chr6:31268749-31272136 chr6:31353866-31357245 chr6:31399784-31415316
    chr6:31494881-31511124 chr6:32439842-32445051 chr6:32517377-32530229 chr6:32552713-32560002
    chr6:32578770-32589836 chr6:32637406-32643652 chr6:32659464-32666689 chr6:32741386-32746887
    chr6:32756098-32763553 chr6:32812763-32817048 chr6:32821833-32838770 chr6:32845209-32852787
    chr6:32934629-32941070 chr6:32948614-32953122 chr6:33004183-33009612 chr6:33064569-33080778
    chr6:33075926-33089696 chr6:33112516-33129113
)

echo ">>>>>>>>>>>>>>>> extracting reads mapping to HLA loci and ALT contigs (38)"
# pipefail makes $? report a failure in ANY stage of the pipeline rather than
# only the final sort; it is switched off again right after the status is
# captured so the rest of the script keeps its original shell options.
set -o pipefail
"$samtools_bin" view -b "$bam_path" "${hla_regions[@]}" \
    | "$samtools_bin" view -b -F 0x4 - \
    | "$samtools_bin" sort --thread $num_processors -m $samtools_sort_memory_per_thread -O BAM - \
    > "$sampleid.tmp.extract.bam"
OUT=$?
set +o pipefail

if [ "$OUT" -ne 0 ]; then
    echo 'Something went wrong while extracting HLA-related reads from hs38 mapping (38)'
    exit 1
fi

echo ">>>>>>>>>>>>>> indexing extracted bam (38)"
"$samtools_bin" index "$sampleid.tmp.extract.bam"

echo ">>>>>>>>>>>>>> bamUtil fastq extraction (38)"
"$bamUtil" bam2FastQ --in "$sampleid.tmp.extract.bam" --gzip \
    --firstOut "${sampleid}_tmp.extract_1.fq.gz" \
    --secondOut "${sampleid}_tmp.extract_2.fq.gz" \
    --unpairedOut "${sampleid}_tmp.extract.unpaired.fq.gz" &> /dev/null

OUT=$?
if [ "$OUT" -ne 0 ]; then
    # Previously single-quoted with a literal '$bamUtil' in the text.
    echo 'bamUtil fastq extraction Failed! (38)'
    exit 1
else
    # The extracted BAM and its index are no longer needed once the FASTQs
    # exist. This used to be "echo rm ...", which only printed the command.
    # -f keeps a later cleanup of the same files from failing.
    rm -f "$sampleid".tmp.extract.bam*
fi

echo ">>>>>>>>>>>>>> Mapping back to GRCh38_NoALT_wHLA (38DH_NoAlt)"
set -o pipefail
"$bwa_bin" mem -t $num_processors "$grch38_HLA_NoAlt" \
        "${sampleid}_tmp.extract_1.fq.gz" "${sampleid}_tmp.extract_2.fq.gz" \
    | "$samtools_bin" view -Sb - \
    | "$samtools_bin" sort --thread $num_processors -m $samtools_sort_memory_per_thread -O BAM - \
    > "${sampleid}_tmp.extract_on_grch38.bam"
OUT=$?
set +o pipefail

if [ "$OUT" -ne 0 ]; then
    # This step is the bwa re-mapping; the old message blamed bamUtil.
    echo 'bwa mem mapping of extracted reads to hs38NoAltDH Failed! (38DH)'
    exit 1
else
    rm "${sampleid}"_tmp.extract_?.fq.gz "${sampleid}_tmp.extract.unpaired.fq.gz"
fi

"$samtools_bin" index "${sampleid}_tmp.extract_on_grch38.bam"

# Second pass over the re-mapped BAM: the same chr6 regions followed by the
# HLA allele sequence names. The name list is kept exactly as written because
# it continues on the following lines of the file.
"$samtools_bin" view -b "${sampleid}_tmp.extract_on_grch38.bam" \
    "${hla_regions[@]}" HLA-A*01:01:01:01: HLA-A*01:01:01:02N: HLA-A*01:01:38L: HLA-A*01:02: HLA-A*01:03: HLA-A*01:04N: HLA-A*01:09: HLA-A*01:11N: HLA-A*01:14: HLA-A*01:16N: HLA-A*01:20: HLA-A*02:01:01:01: HLA-A*02:01:01:02L: HLA-A*02:01:01:03: HLA-A*02:01:01:04: HLA-A*02:02:01: HLA-A*02:03:01: HLA-A*02:03:03: HLA-A*02:05:01: HLA-A*02:06:01: HLA-A*02:07:01: HLA-A*02:10: HLA-A*02:251: HLA-A*02:259: HLA-A*02:264: HLA-A*02:265: HLA-A*02:266: HLA-A*02:269: HLA-A*02:279: HLA-A*02:32N: HLA-A*02:376:
HLA-A*02:43N: HLA-A*02:455: HLA-A*02:48: HLA-A*02:51: HLA-A*02:533: HLA-A*02:53N: HLA-A*02:57: HLA-A*02:60:01: HLA-A*02:65: HLA-A*02:68: HLA-A*02:77: HLA-A*02:81: HLA-A*02:89: HLA-A*02:95: HLA-A*03:01:01:01: HLA-A*03:01:01:02N: HLA-A*03:01:01:03: HLA-A*03:02:01: HLA-A*03:11N: HLA-A*03:21N: HLA-A*03:36N: HLA-A*11:01:01: HLA-A*11:01:18: HLA-A*11:02:01: HLA-A*11:05: HLA-A*11:110: HLA-A*11:25: HLA-A*11:50Q: HLA-A*11:60: HLA-A*11:69N: HLA-A*11:74: HLA-A*11:75: HLA-A*11:77: HLA-A*23:01:01: HLA-A*23:09: HLA-A*23:38N: HLA-A*24:02:01:01: HLA-A*24:02:01:02L: HLA-A*24:02:01:03: HLA-A*24:02:03Q: HLA-A*24:02:10: HLA-A*24:03:01: HLA-A*24:07:01: HLA-A*24:08: HLA-A*24:09N: HLA-A*24:10:01: HLA-A*24:11N: HLA-A*24:152: HLA-A*24:20: HLA-A*24:215: HLA-A*24:61: HLA-A*24:86N: HLA-A*25:01:01: HLA-A*26:01:01: HLA-A*26:11N: HLA-A*26:15: HLA-A*26:50: HLA-A*29:01:01:01: HLA-A*29:01:01:02N: HLA-A*29:02:01:01: HLA-A*29:02:01:02: HLA-A*29:46: HLA-A*30:01:01: HLA-A*30:02:01:01: HLA-A*30:02:01:02: HLA-A*30:04:01: HLA-A*30:89: HLA-A*31:01:02: HLA-A*31:01:23: HLA-A*31:04: HLA-A*31:14N: HLA-A*31:46: HLA-A*32:01:01: HLA-A*32:06: HLA-A*33:01:01: HLA-A*33:03:01: HLA-A*33:07: HLA-A*34:01:01: HLA-A*34:02:01: HLA-A*36:01: HLA-A*43:01: HLA-A*66:01:01: HLA-A*66:17: HLA-A*68:01:01:01: HLA-A*68:01:01:02: HLA-A*68:01:02:01: HLA-A*68:01:02:02: HLA-A*68:02:01:01: HLA-A*68:02:01:02: HLA-A*68:02:01:03: HLA-A*68:02:02: HLA-A*68:03:01: HLA-A*68:08:01: HLA-A*68:113: HLA-A*68:17: HLA-A*68:18N: HLA-A*68:22: HLA-A*68:71: HLA-A*69:01: HLA-A*74:01: HLA-A*74:02:01:01: HLA-A*74:02:01:02: HLA-A*80:01:01:01: HLA-A*80:01:01:02: HLA-B*07:02:01: HLA-B*07:05:01: HLA-B*07:06: HLA-B*07:156: HLA-B*07:33:01: HLA-B*07:41: HLA-B*07:44: HLA-B*07:50: HLA-B*08:01:01: HLA-B*08:08N: HLA-B*08:132: HLA-B*08:134: HLA-B*08:19N: HLA-B*08:20: HLA-B*08:33: HLA-B*08:79: HLA-B*13:01:01: HLA-B*13:02:01: HLA-B*13:02:03: HLA-B*13:02:09: HLA-B*13:08: HLA-B*13:15: HLA-B*13:25: HLA-B*14:01:01: HLA-B*14:02:01: HLA-B*14:07N: HLA-B*15:01:01:01: 
HLA-B*15:01:01:02N: HLA-B*15:01:01:03: HLA-B*15:02:01: HLA-B*15:03:01: HLA-B*15:04:01: HLA-B*15:07:01: HLA-B*15:108: HLA-B*15:10:01: HLA-B*15:11:01: HLA-B*15:13:01: HLA-B*15:16:01: HLA-B*15:17:01:01: HLA-B*15:17:01:02: HLA-B*15:18:01: HLA-B*15:220: HLA-B*15:25:01: HLA-B*15:27:01: HLA-B*15:32:01: HLA-B*15:42: HLA-B*15:58: HLA-B*15:66: HLA-B*15:77: HLA-B*15:83: HLA-B*18:01:01:01: HLA-B*18:01:01:02: HLA-B*18:02: HLA-B*18:03: HLA-B*18:17N: HLA-B*18:26: HLA-B*18:94N: HLA-B*27:04:01: HLA-B*27:05:02: HLA-B*27:05:18: HLA-B*27:06: HLA-B*27:07:01: HLA-B*27:131: HLA-B*27:24: HLA-B*27:25: HLA-B*27:32: HLA-B*35:01:01:01: HLA-B*35:01:01:02: HLA-B*35:01:22: HLA-B*35:02:01: HLA-B*35:03:01: HLA-B*35:05:01: HLA-B*35:08:01: HLA-B*35:14:02: HLA-B*35:241: HLA-B*35:41: HLA-B*37:01:01: HLA-B*37:01:05: HLA-B*38:01:01: HLA-B*38:02:01: HLA-B*38:14: HLA-B*39:01:01:01: HLA-B*39:01:01:02L: HLA-B*39:01:01:03: HLA-B*39:01:03: HLA-B*39:01:16: HLA-B*39:01:21: HLA-B*39:05:01: HLA-B*39:06:02: HLA-B*39:10:01: HLA-B*39:13:02: HLA-B*39:14: HLA-B*39:34: HLA-B*39:38Q: HLA-B*40:01:01: HLA-B*40:01:02: HLA-B*40:02:01: HLA-B*40:03: HLA-B*40:06:01:01: HLA-B*40:06:01:02: HLA-B*40:10:01: HLA-B*40:150: HLA-B*40:40: HLA-B*40:72:01: HLA-B*40:79: HLA-B*41:01:01: HLA-B*41:02:01: HLA-B*42:01:01: HLA-B*42:02: HLA-B*42:08: HLA-B*44:02:01:01: HLA-B*44:02:01:02S: HLA-B*44:02:01:03: HLA-B*44:02:17: HLA-B*44:02:27: HLA-B*44:03:01: HLA-B*44:03:02: HLA-B*44:04: HLA-B*44:09: HLA-B*44:138Q: HLA-B*44:150: HLA-B*44:23N: HLA-B*44:26: HLA-B*44:46: HLA-B*44:49: HLA-B*44:56N: HLA-B*45:01:01: HLA-B*45:04: HLA-B*46:01:01: HLA-B*46:01:05: HLA-B*47:01:01:01: HLA-B*47:01:01:02: HLA-B*48:01:01: HLA-B*48:03:01: HLA-B*48:04: HLA-B*48:08: HLA-B*49:01:01: HLA-B*49:32: HLA-B*50:01:01: HLA-B*51:01:01: HLA-B*51:01:02: HLA-B*51:02:01: HLA-B*51:07:01: HLA-B*51:42: HLA-B*52:01:01:01: HLA-B*52:01:01:02: HLA-B*52:01:01:03: HLA-B*52:01:02: HLA-B*53:01:01: HLA-B*53:11: HLA-B*54:01:01: HLA-B*54:18: HLA-B*55:01:01: HLA-B*55:01:03: HLA-B*55:02:01: 
HLA-B*55:12: HLA-B*55:24: HLA-B*55:48: HLA-B*56:01:01: HLA-B*56:03: HLA-B*56:04: HLA-B*57:01:01: HLA-B*57:03:01: HLA-B*57:06: HLA-B*57:11: HLA-B*57:29: HLA-B*58:01:01: HLA-B*58:31N: HLA-B*59:01:01:01: HLA-B*59:01:01:02: HLA-B*67:01:01: HLA-B*67:01:02: HLA-B*67:02: HLA-B*73:01: HLA-B*78:01:01: HLA-B*81:01: HLA-B*82:02:01: HLA-C*01:02:01: HLA-C*01:02:11: HLA-C*01:02:29: HLA-C*01:02:30: HLA-C*01:03: HLA-C*01:06: HLA-C*01:08: HLA-C*01:14: HLA-C*01:21: HLA-C*01:30: HLA-C*01:40: HLA-C*02:02:02:01: HLA-C*02:02:02:02: HLA-C*02:10: HLA-C*02:11: HLA-C*02:16:02: HLA-C*02:69: HLA-C*02:85: HLA-C*02:86: HLA-C*02:87: HLA-C*03:02:01: HLA-C*03:02:02:01: HLA-C*03:02:02:02: HLA-C*03:02:02:03: HLA-C*03:03:01: HLA-C*03:04:01:01: HLA-C*03:04:01:02: HLA-C*03:04:02: HLA-C*03:04:04: HLA-C*03:05: HLA-C*03:06: HLA-C*03:100: HLA-C*03:13:01: HLA-C*03:20N: HLA-C*03:219: HLA-C*03:261: HLA-C*03:40:01: HLA-C*03:41:02: HLA-C*03:46: HLA-C*03:61: HLA-C*04:01:01:01: HLA-C*04:01:01:02: HLA-C*04:01:01:03: HLA-C*04:01:01:04: HLA-C*04:01:01:05: HLA-C*04:01:62: HLA-C*04:03:01: HLA-C*04:06: HLA-C*04:09N: HLA-C*04:128: HLA-C*04:161: HLA-C*04:177: HLA-C*04:70: HLA-C*04:71: HLA-C*05:01:01:01: HLA-C*05:01:01:02: HLA-C*05:08: HLA-C*05:09:01: HLA-C*05:93: HLA-C*06:02:01:01: HLA-C*06:02:01:02: HLA-C*06:02:01:03: HLA-C*06:23: HLA-C*06:24: HLA-C*06:46N: HLA-C*07:01:01:01: HLA-C*07:01:01:02: HLA-C*07:01:02: HLA-C*07:01:19: HLA-C*07:01:27: HLA-C*07:01:45: HLA-C*07:02:01:01: HLA-C*07:02:01:02: HLA-C*07:02:01:03: HLA-C*07:02:01:04: HLA-C*07:02:01:05: HLA-C*07:02:05: HLA-C*07:02:06: HLA-C*07:02:64: HLA-C*07:04:01: HLA-C*07:04:02: HLA-C*07:06: HLA-C*07:149: HLA-C*07:18: HLA-C*07:19: HLA-C*07:26: HLA-C*07:30: HLA-C*07:32N: HLA-C*07:384: HLA-C*07:385: HLA-C*07:386: HLA-C*07:391: HLA-C*07:392: HLA-C*07:49: HLA-C*07:56:02: HLA-C*07:66: HLA-C*07:67: HLA-C*08:01:01: HLA-C*08:01:03: HLA-C*08:02:01:01: HLA-C*08:02:01:02: HLA-C*08:03:01: HLA-C*08:04:01: HLA-C*08:112: HLA-C*08:20: HLA-C*08:21: HLA-C*08:22: HLA-C*08:24: HLA-C*08:27: 
HLA-C*08:36N: HLA-C*08:40: HLA-C*08:41: HLA-C*08:62: HLA-C*12:02:02: HLA-C*12:03:01:01: HLA-C*12:03:01:02: HLA-C*12:08: HLA-C*12:13: HLA-C*12:19: HLA-C*12:22: HLA-C*12:99: HLA-C*14:02:01: HLA-C*14:03: HLA-C*14:21N: HLA-C*14:23: HLA-C*15:02:01: HLA-C*15:05:01: HLA-C*15:05:02: HLA-C*15:13: HLA-C*15:16: HLA-C*15:17: HLA-C*15:96Q: HLA-C*16:01:01: HLA-C*16:02:01: HLA-C*16:04:01: HLA-C*17:01:01:01: HLA-C*17:01:01:02: HLA-C*17:01:01:03: HLA-C*17:03: HLA-C*18:01: HLA-DQA1*01:01:02: HLA-DQA1*01:02:01:01: HLA-DQA1*01:02:01:02: HLA-DQA1*01:02:01:03: HLA-DQA1*01:02:01:04: HLA-DQA1*01:03:01:01: HLA-DQA1*01:03:01:02: HLA-DQA1*01:04:01:01: HLA-DQA1*01:04:01:02: HLA-DQA1*01:05:01: HLA-DQA1*01:07: HLA-DQA1*01:10: HLA-DQA1*01:11: HLA-DQA1*02:01: HLA-DQA1*03:01:01: HLA-DQA1*03:02: HLA-DQA1*03:03:01: HLA-DQA1*04:01:02:01: HLA-DQA1*04:01:02:02: HLA-DQA1*04:02: HLA-DQA1*05:01:01:01: HLA-DQA1*05:01:01:02: HLA-DQA1*05:03: HLA-DQA1*05:05:01:01: HLA-DQA1*05:05:01:02: HLA-DQA1*05:05:01:03: HLA-DQA1*05:11: HLA-DQA1*06:01:01: HLA-DQB1*02:01:01: HLA-DQB1*02:02:01: HLA-DQB1*03:01:01:01: HLA-DQB1*03:01:01:02: HLA-DQB1*03:01:01:03: HLA-DQB1*03:02:01: HLA-DQB1*03:03:02:01: HLA-DQB1*03:03:02:02: HLA-DQB1*03:03:02:03: HLA-DQB1*03:05:01: HLA-DQB1*05:01:01:01: HLA-DQB1*05:01:01:02: HLA-DQB1*05:03:01:01: HLA-DQB1*05:03:01:02: HLA-DQB1*06:01:01: HLA-DQB1*06:02:01: HLA-DQB1*06:03:01: HLA-DQB1*06:09:01: HLA-DRB1*01:01:01: HLA-DRB1*01:02:01: HLA-DRB1*03:01:01:01: HLA-DRB1*03:01:01:02: HLA-DRB1*04:03:01: HLA-DRB1*07:01:01:01: HLA-DRB1*07:01:01:02: HLA-DRB1*08:03:02: HLA-DRB1*09:21: HLA-DRB1*10:01:01: HLA-DRB1*11:01:01: HLA-DRB1*11:01:02: HLA-DRB1*11:04:01: HLA-DRB1*12:01:01: HLA-DRB1*12:17: HLA-DRB1*13:01:01: HLA-DRB1*13:02:01: HLA-DRB1*14:05:01: HLA-DRB1*14:54:01: HLA-DRB1*15:01:01:01: HLA-DRB1*15:01:01:02: HLA-DRB1*15:01:01:03: HLA-DRB1*15:01:01:04: HLA-DRB1*15:02:01: HLA-DRB1*15:03:01:01: HLA-DRB1*15:03:01:02: \ 162 | HLA-DRB1*16:02:01:| $samtools_bin sort --thread $num_processors -m 
$samtools_sort_memory_per_thread -O BAM - > $sampleid.extract.bam 163 | 164 | OUT=$? 165 | if [ ! $OUT -eq 0 ];then 166 | echo 'Something went wrong while running bwa/samtools to align extracted reads to 38DH_NoAlt (38DH_NoAlt)' 167 | exit 1 168 | else 169 | rm $sampleid\_tmp.extract_on_grch38.bam* 170 | fi 171 | 172 | #rm $sampleid.tmp.extract* 173 | 174 | echo ">>>>>>>>>>>>>> indexing extracted bam (38DH_NoAlt)" 175 | $samtools_bin index $sampleid.extract.bam 176 | 177 | echo ">>>>>>>>>>>>>> bamUtil fastq extraction (38DH_NoAlt)" 178 | $bamUtil bam2FastQ --in $sampleid.extract.bam --gzip --firstOut $sampleid\_extract_1.fq.gz --secondOut $sampleid\_extract_2.fq.gz --unpairedOut $sampleid\_extract.unpaired.fq.gz &> /dev/null 179 | 180 | OUT=$? 181 | if [ ! $OUT -eq 0 ];then 182 | echo '$bamUtil fastq extraction Failed! (38DH_NoAlt)' 183 | exit 1 184 | else 185 | rm $sampleid.extract.bam* $sampleid\_extract.unpaired.fq.gz 186 | fi 187 | 188 | echo ">>>>>>>>>>>>>> bwa mem to hla panel for Kourami " 189 | $bwa_bin mem -t $num_processors $merged_hla_panel $sampleid\_extract_1.fq.gz $sampleid\_extract_2.fq.gz | $samtools_bin view -Sb - > $bam_for_kourami 190 | OUT=$? 191 | if [ ! $OUT -eq 0 ];then 192 | echo 'bwa alignment of extracted reads to HLA panel faild...' 193 | exit 1 194 | fi 195 | -------------------------------------------------------------------------------- /scripts/alignAndExtract_hs38Alt.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Part of Kourami HLA typer/assembler 3 | # (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | # See LICENSE for licensing. 
5 | # 6 | # NOTE: the "#!/bin/bash" on line 7 is not the first line of the file, so it has no effect as an interpreter line; line 8 re-executes the script under bash when another shell is interpreting it (the script relies on bash-only pushd/popd, "function" and "&>"). 7 | #!/bin/bash 8 | [ -n "$BASH_VERSION" ] || exec bash "$0" "$@" 9 | pushd "$(dirname "$0")" > /dev/null 10 | SCRIPTD=$(pwd) 11 | popd > /dev/null 12 | # Defaults; the -d and -r options parsed below override the panel db and the reference genome. 13 | samtools_sort_memory_per_thread=2G 14 | num_processors=8 15 | kourami_db=$SCRIPTD/../db 16 | resources_dir=$SCRIPTD/../resources 17 | grch38_HLA_NoAlt=$resources_dir/hs38NoAltDH.fa 18 | grch38_HLA_NoAlt_index=$resources_dir/hs38NoAltDH.fa.bwt 19 | me=$(basename "$0") 20 | # usage: print the help text and exit with status 1. 21 | function usage { 22 | echo "HLA-related reads extractor for Kourami" 23 | echo "Note: Use this if you have bam file aligned to GRCh38 (primary assembly + [decoy] + ALT : 38Alt)" 24 | echo "USAGE: <PATH-TO>/$me -d [Kourami panel db] -r [refGenome] <sample_id> <bamfile>" 25 | echo 26 | echo " sample_id : desired sample name (ex: NA12878) [required]" 27 | echo 28 | echo " bamfile : sorted and indexed bam to hs38 or hs38D (both with ALT) (ex: NA12878.bam) [required]" 29 | echo 30 | echo "------------------ Optional Parameters -----------------" 31 | echo " -d [panel DB] : Path to Kourami panel db. [Default: db directory under Kourami installation kourami/db]" 32 | echo 33 | echo " -r [Ref Genome] : path to hs38NoAltDH (primary assembly + decoy + HLA [bwa-kit])" 34 | echo " USE download_grch38.sh script to obtain the reference." 35 | echo " MUST BE BWA INDEXED prior to running this script." 36 | echo " If not given, it assumes, hs38NoAltDH.fa is in resources dir." 37 | echo 38 | echo " -h : print this message." 39 | echo 40 | exit 1 41 | } 42 | 43 | # print usage when no argument is given 44 | if [ $# -lt 1 ]; then 45 | usage 46 | fi 47 | # The leading ':' puts getopts in silent mode: FLAG is '?' for an unknown option and ':' for an option that is missing its argument. 48 | while getopts :d:r:h FLAG; do 49 | case $FLAG in 50 | d) 51 | kourami_db=$OPTARG 52 | ;; 53 | r) 54 | grch38_HLA_NoAlt=$OPTARG 55 | grch38_HLA_NoAlt_index=$OPTARG.bwt 56 | ;; 57 | h) 58 | usage 59 | ;; 60 | \?|:) 61 | echo "Unrecognized option or missing argument: -$OPTARG. See usage:" 62 | usage 63 | ;; 64 | esac 65 | done 66 | # Drop the parsed options so that $1/$2 are the positional sample id and bam file. 67 | shift $((OPTIND-1)) 68 | 69 | if [ $# -lt 2 ]; then 70 | echo "Missing one or more required arguments."
71 | usage 72 | fi 73 | 74 | sampleid=$1 75 | bam_path=$2 76 | # Derived output/input paths, and the external tools resolved from PATH (command -v is a shell builtin, unlike which). 77 | merged_hla_panel=$kourami_db/All_FINAL_with_Decoy.fa.gz 78 | bam_for_kourami=$sampleid\_on_KouramiPanel.bam 79 | samtools_bin=$(command -v samtools) 80 | bwa_bin=$(command -v bwa) 81 | bamUtil=$(command -v bam) 82 | if [ -z "$bamUtil" ]; then 83 | echo "missing bamUtil"; 84 | echo "bamUtil available from https://github.com/statgen/bamUtil" 85 | exit 1; 86 | fi 87 | #bamUtil=$HOME/bamUtil_1.0.13/bamUtil-master/bin/bam 88 | # All three tools must resolve to executables before any work starts. 89 | if [ ! -x "$samtools_bin" ] || [ ! -x "$bwa_bin" ] || [ ! -x "$bamUtil" ];then 90 | echo "Please make sure samtools, bwa, and bamUtil are installed" 91 | exit 1 92 | fi 93 | # Every input (bam, panel db directory, merged panel, reference fasta) must exist; list them all so the user can spot the missing one. 94 | if [ ! -e "$bam_path" ] || [ ! -e "$kourami_db" ] || [ ! -e "$merged_hla_panel" ] || [ ! -e "$grch38_HLA_NoAlt" ];then 95 | printf 'Missing one of the following files/directories (38Alt):\n\n' 96 | echo "$bam_path" 97 | echo "$kourami_db" 98 | echo "$merged_hla_panel" 99 | echo "$grch38_HLA_NoAlt" 100 | exit 1 101 | fi 102 | # The .bwt file is used as the marker that the reference has been bwa-indexed. 103 | if [ ! -f "$grch38_HLA_NoAlt_index" ];then 104 | echo "hs38NoAltDH must be bwa-indexed first."
105 | echo "Please run bwa index $grch38_HLA_NoAlt" 106 | exit 1 107 | fi 108 | 109 | echo ">>>>>>>>>>>>>>>> extracting reads mapping to HLA loci and ALT contigs (38Alt)" 110 | # Removed a dead extraction pass here: it filtered with $resources_dir/hs38dh.initial.bed (not among the files listed under resources/) and wrote $sampleid.tmp.extract.ba, which no later step read or cleaned up. 111 | # Pull reads overlapping the chr6 HLA regions and the chr6 ALT contigs, drop unmapped reads (-F 0x4), and coordinate-sort them. 112 | $samtools_bin view -b "$bam_path" chr6:29723340-29727296 chr6:29726601-29749049 chr6:29826979-29831122 \ 113 | chr6:29887760-29891080 chr6:29942470-29945884 chr6:30005971-30009956 chr6:30259562-30266951 \ 114 | chr6:30489406-30494205 chr6:31268749-31272136 chr6:31353866-31357245 chr6:31399784-31415316 \ 115 | chr6:31494881-31511124 chr6:32439842-32445051 chr6:32517377-32530229 chr6:32552713-32560002 \ 116 | chr6:32578770-32589836 chr6:32637406-32643652 chr6:32659464-32666689 chr6:32741386-32746887 \ 117 | chr6:32756098-32763553 chr6:32812763-32817048 chr6:32821833-32838770 chr6:32845209-32852787 \ 118 | chr6:32934629-32941070 chr6:32948614-32953122 chr6:33004183-33009612 chr6:33064569-33080778 \ 119 | chr6:33075926-33089696 chr6:33112516-33129113 \ 120 | chr6_GL000250v2_alt: chr6_GL000251v2_alt: chr6_GL000252v2_alt: chr6_GL000253v2_alt: chr6_GL000254v2_alt: chr6_GL000255v2_alt: chr6_GL000256v2_alt: | $samtools_bin view -b -F 0x4 - | $samtools_bin sort --thread $num_processors -m $samtools_sort_memory_per_thread -O BAM - > "$sampleid.tmp.extract.bam" 121 | # $? below is the exit status of the last command of the pipeline (samtools sort). 122 | OUT=$? 123 | 124 | if [ !
$OUT -eq 0 ];then 125 | echo 'Something went wrong while extracting HLA-related reads from hs38Alt mapping (38Alt)' 126 | exit 1 127 | fi 128 | 129 | echo ">>>>>>>>>>>>>> indexing extracted bam (38Alt)" 130 | $samtools_bin index $sampleid.tmp.extract.bam 131 | # Convert the extracted BAM back to gzipped paired/unpaired FASTQ; bamUtil's own output is discarded. 132 | echo ">>>>>>>>>>>>>> bamUtil fastq extraction (38Alt)" 133 | $bamUtil bam2FastQ --in $sampleid.tmp.extract.bam --gzip --firstOut $sampleid\_tmp.extract_1.fq.gz --secondOut $sampleid\_tmp.extract_2.fq.gz --unpairedOut $sampleid\_tmp.extract.unpaired.fq.gz &> /dev/null 134 | 135 | OUT=$? 136 | if [ ! $OUT -eq 0 ];then 137 | echo 'bamUtil fastq extraction Failed! (38Alt)' 138 | exit 1 139 | else 140 | rm -f $sampleid.tmp.extract.bam* # was "echo rm ...": the intermediate BAM and its index were only listed, never removed 141 | fi 142 | # Re-align the extracted read pairs to the no-ALT reference and coordinate-sort the result. 143 | echo ">>>>>>>>>>>>>> Mapping back to GRCh38_NoALT_wHLA (38DH_NoAlt)" 144 | $bwa_bin mem -t $num_processors $grch38_HLA_NoAlt $sampleid\_tmp.extract_1.fq.gz $sampleid\_tmp.extract_2.fq.gz | $samtools_bin view -Sb - | $samtools_bin sort --thread $num_processors -m $samtools_sort_memory_per_thread -O BAM - > $sampleid\_tmp.extract_on_grch38.bam 145 | 146 | OUT=$? 147 | if [ ! $OUT -eq 0 ];then 148 | echo 'bwa mem/samtools realignment of extracted reads to hs38NoAltDH failed!
(38DH)' 149 | exit 1 150 | else 151 | rm $sampleid\_tmp.extract_?.fq.gz $sampleid\_tmp.extract.unpaired.fq.gz 152 | fi 153 | 154 | $samtools_bin index $sampleid\_tmp.extract_on_grch38.bam 155 | 156 | $samtools_bin view -b $sampleid\_tmp.extract_on_grch38.bam \ 157 | chr6:29723340-29727296 chr6:29726601-29749049 chr6:29826979-29831122 \ 158 | chr6:29887760-29891080 chr6:29942470-29945884 chr6:30005971-30009956 chr6:30259562-30266951 \ 159 | chr6:30489406-30494205 chr6:31268749-31272136 chr6:31353866-31357245 chr6:31399784-31415316 \ 160 | chr6:31494881-31511124 chr6:32439842-32445051 chr6:32517377-32530229 chr6:32552713-32560002 \ 161 | chr6:32578770-32589836 chr6:32637406-32643652 chr6:32659464-32666689 chr6:32741386-32746887 \ 162 | chr6:32756098-32763553 chr6:32812763-32817048 chr6:32821833-32838770 chr6:32845209-32852787 \ 163 | chr6:32934629-32941070 chr6:32948614-32953122 chr6:33004183-33009612 chr6:33064569-33080778 \ 164 | chr6:33075926-33089696 chr6:33112516-33129113 HLA-A*01:01:01:01: HLA-A*01:01:01:02N: HLA-A*01:01:38L: HLA-A*01:02: HLA-A*01:03: HLA-A*01:04N: HLA-A*01:09: HLA-A*01:11N: HLA-A*01:14: HLA-A*01:16N: HLA-A*01:20: HLA-A*02:01:01:01: HLA-A*02:01:01:02L: HLA-A*02:01:01:03: HLA-A*02:01:01:04: HLA-A*02:02:01: HLA-A*02:03:01: HLA-A*02:03:03: HLA-A*02:05:01: HLA-A*02:06:01: HLA-A*02:07:01: HLA-A*02:10: HLA-A*02:251: HLA-A*02:259: HLA-A*02:264: HLA-A*02:265: HLA-A*02:266: HLA-A*02:269: HLA-A*02:279: HLA-A*02:32N: HLA-A*02:376: HLA-A*02:43N: HLA-A*02:455: HLA-A*02:48: HLA-A*02:51: HLA-A*02:533: HLA-A*02:53N: HLA-A*02:57: HLA-A*02:60:01: HLA-A*02:65: HLA-A*02:68: HLA-A*02:77: HLA-A*02:81: HLA-A*02:89: HLA-A*02:95: HLA-A*03:01:01:01: HLA-A*03:01:01:02N: HLA-A*03:01:01:03: HLA-A*03:02:01: HLA-A*03:11N: HLA-A*03:21N: HLA-A*03:36N: HLA-A*11:01:01: HLA-A*11:01:18: HLA-A*11:02:01: HLA-A*11:05: HLA-A*11:110: HLA-A*11:25: HLA-A*11:50Q: HLA-A*11:60: HLA-A*11:69N: HLA-A*11:74: HLA-A*11:75: HLA-A*11:77: HLA-A*23:01:01: HLA-A*23:09: HLA-A*23:38N: HLA-A*24:02:01:01: 
HLA-A*24:02:01:02L: HLA-A*24:02:01:03: HLA-A*24:02:03Q: HLA-A*24:02:10: HLA-A*24:03:01: HLA-A*24:07:01: HLA-A*24:08: HLA-A*24:09N: HLA-A*24:10:01: HLA-A*24:11N: HLA-A*24:152: HLA-A*24:20: HLA-A*24:215: HLA-A*24:61: HLA-A*24:86N: HLA-A*25:01:01: HLA-A*26:01:01: HLA-A*26:11N: HLA-A*26:15: HLA-A*26:50: HLA-A*29:01:01:01: HLA-A*29:01:01:02N: HLA-A*29:02:01:01: HLA-A*29:02:01:02: HLA-A*29:46: HLA-A*30:01:01: HLA-A*30:02:01:01: HLA-A*30:02:01:02: HLA-A*30:04:01: HLA-A*30:89: HLA-A*31:01:02: HLA-A*31:01:23: HLA-A*31:04: HLA-A*31:14N: HLA-A*31:46: HLA-A*32:01:01: HLA-A*32:06: HLA-A*33:01:01: HLA-A*33:03:01: HLA-A*33:07: HLA-A*34:01:01: HLA-A*34:02:01: HLA-A*36:01: HLA-A*43:01: HLA-A*66:01:01: HLA-A*66:17: HLA-A*68:01:01:01: HLA-A*68:01:01:02: HLA-A*68:01:02:01: HLA-A*68:01:02:02: HLA-A*68:02:01:01: HLA-A*68:02:01:02: HLA-A*68:02:01:03: HLA-A*68:02:02: HLA-A*68:03:01: HLA-A*68:08:01: HLA-A*68:113: HLA-A*68:17: HLA-A*68:18N: HLA-A*68:22: HLA-A*68:71: HLA-A*69:01: HLA-A*74:01: HLA-A*74:02:01:01: HLA-A*74:02:01:02: HLA-A*80:01:01:01: HLA-A*80:01:01:02: HLA-B*07:02:01: HLA-B*07:05:01: HLA-B*07:06: HLA-B*07:156: HLA-B*07:33:01: HLA-B*07:41: HLA-B*07:44: HLA-B*07:50: HLA-B*08:01:01: HLA-B*08:08N: HLA-B*08:132: HLA-B*08:134: HLA-B*08:19N: HLA-B*08:20: HLA-B*08:33: HLA-B*08:79: HLA-B*13:01:01: HLA-B*13:02:01: HLA-B*13:02:03: HLA-B*13:02:09: HLA-B*13:08: HLA-B*13:15: HLA-B*13:25: HLA-B*14:01:01: HLA-B*14:02:01: HLA-B*14:07N: HLA-B*15:01:01:01: HLA-B*15:01:01:02N: HLA-B*15:01:01:03: HLA-B*15:02:01: HLA-B*15:03:01: HLA-B*15:04:01: HLA-B*15:07:01: HLA-B*15:108: HLA-B*15:10:01: HLA-B*15:11:01: HLA-B*15:13:01: HLA-B*15:16:01: HLA-B*15:17:01:01: HLA-B*15:17:01:02: HLA-B*15:18:01: HLA-B*15:220: HLA-B*15:25:01: HLA-B*15:27:01: HLA-B*15:32:01: HLA-B*15:42: HLA-B*15:58: HLA-B*15:66: HLA-B*15:77: HLA-B*15:83: HLA-B*18:01:01:01: HLA-B*18:01:01:02: HLA-B*18:02: HLA-B*18:03: HLA-B*18:17N: HLA-B*18:26: HLA-B*18:94N: HLA-B*27:04:01: HLA-B*27:05:02: HLA-B*27:05:18: HLA-B*27:06: HLA-B*27:07:01: 
HLA-B*27:131: HLA-B*27:24: HLA-B*27:25: HLA-B*27:32: HLA-B*35:01:01:01: HLA-B*35:01:01:02: HLA-B*35:01:22: HLA-B*35:02:01: HLA-B*35:03:01: HLA-B*35:05:01: HLA-B*35:08:01: HLA-B*35:14:02: HLA-B*35:241: HLA-B*35:41: HLA-B*37:01:01: HLA-B*37:01:05: HLA-B*38:01:01: HLA-B*38:02:01: HLA-B*38:14: HLA-B*39:01:01:01: HLA-B*39:01:01:02L: HLA-B*39:01:01:03: HLA-B*39:01:03: HLA-B*39:01:16: HLA-B*39:01:21: HLA-B*39:05:01: HLA-B*39:06:02: HLA-B*39:10:01: HLA-B*39:13:02: HLA-B*39:14: HLA-B*39:34: HLA-B*39:38Q: HLA-B*40:01:01: HLA-B*40:01:02: HLA-B*40:02:01: HLA-B*40:03: HLA-B*40:06:01:01: HLA-B*40:06:01:02: HLA-B*40:10:01: HLA-B*40:150: HLA-B*40:40: HLA-B*40:72:01: HLA-B*40:79: HLA-B*41:01:01: HLA-B*41:02:01: HLA-B*42:01:01: HLA-B*42:02: HLA-B*42:08: HLA-B*44:02:01:01: HLA-B*44:02:01:02S: HLA-B*44:02:01:03: HLA-B*44:02:17: HLA-B*44:02:27: HLA-B*44:03:01: HLA-B*44:03:02: HLA-B*44:04: HLA-B*44:09: HLA-B*44:138Q: HLA-B*44:150: HLA-B*44:23N: HLA-B*44:26: HLA-B*44:46: HLA-B*44:49: HLA-B*44:56N: HLA-B*45:01:01: HLA-B*45:04: HLA-B*46:01:01: HLA-B*46:01:05: HLA-B*47:01:01:01: HLA-B*47:01:01:02: HLA-B*48:01:01: HLA-B*48:03:01: HLA-B*48:04: HLA-B*48:08: HLA-B*49:01:01: HLA-B*49:32: HLA-B*50:01:01: HLA-B*51:01:01: HLA-B*51:01:02: HLA-B*51:02:01: HLA-B*51:07:01: HLA-B*51:42: HLA-B*52:01:01:01: HLA-B*52:01:01:02: HLA-B*52:01:01:03: HLA-B*52:01:02: HLA-B*53:01:01: HLA-B*53:11: HLA-B*54:01:01: HLA-B*54:18: HLA-B*55:01:01: HLA-B*55:01:03: HLA-B*55:02:01: HLA-B*55:12: HLA-B*55:24: HLA-B*55:48: HLA-B*56:01:01: HLA-B*56:03: HLA-B*56:04: HLA-B*57:01:01: HLA-B*57:03:01: HLA-B*57:06: HLA-B*57:11: HLA-B*57:29: HLA-B*58:01:01: HLA-B*58:31N: HLA-B*59:01:01:01: HLA-B*59:01:01:02: HLA-B*67:01:01: HLA-B*67:01:02: HLA-B*67:02: HLA-B*73:01: HLA-B*78:01:01: HLA-B*81:01: HLA-B*82:02:01: HLA-C*01:02:01: HLA-C*01:02:11: HLA-C*01:02:29: HLA-C*01:02:30: HLA-C*01:03: HLA-C*01:06: HLA-C*01:08: HLA-C*01:14: HLA-C*01:21: HLA-C*01:30: HLA-C*01:40: HLA-C*02:02:02:01: HLA-C*02:02:02:02: HLA-C*02:10: HLA-C*02:11: 
HLA-C*02:16:02: HLA-C*02:69: HLA-C*02:85: HLA-C*02:86: HLA-C*02:87: HLA-C*03:02:01: HLA-C*03:02:02:01: HLA-C*03:02:02:02: HLA-C*03:02:02:03: HLA-C*03:03:01: HLA-C*03:04:01:01: HLA-C*03:04:01:02: HLA-C*03:04:02: HLA-C*03:04:04: HLA-C*03:05: HLA-C*03:06: HLA-C*03:100: HLA-C*03:13:01: HLA-C*03:20N: HLA-C*03:219: HLA-C*03:261: HLA-C*03:40:01: HLA-C*03:41:02: HLA-C*03:46: HLA-C*03:61: HLA-C*04:01:01:01: HLA-C*04:01:01:02: HLA-C*04:01:01:03: HLA-C*04:01:01:04: HLA-C*04:01:01:05: HLA-C*04:01:62: HLA-C*04:03:01: HLA-C*04:06: HLA-C*04:09N: HLA-C*04:128: HLA-C*04:161: HLA-C*04:177: HLA-C*04:70: HLA-C*04:71: HLA-C*05:01:01:01: HLA-C*05:01:01:02: HLA-C*05:08: HLA-C*05:09:01: HLA-C*05:93: HLA-C*06:02:01:01: HLA-C*06:02:01:02: HLA-C*06:02:01:03: HLA-C*06:23: HLA-C*06:24: HLA-C*06:46N: HLA-C*07:01:01:01: HLA-C*07:01:01:02: HLA-C*07:01:02: HLA-C*07:01:19: HLA-C*07:01:27: HLA-C*07:01:45: HLA-C*07:02:01:01: HLA-C*07:02:01:02: HLA-C*07:02:01:03: HLA-C*07:02:01:04: HLA-C*07:02:01:05: HLA-C*07:02:05: HLA-C*07:02:06: HLA-C*07:02:64: HLA-C*07:04:01: HLA-C*07:04:02: HLA-C*07:06: HLA-C*07:149: HLA-C*07:18: HLA-C*07:19: HLA-C*07:26: HLA-C*07:30: HLA-C*07:32N: HLA-C*07:384: HLA-C*07:385: HLA-C*07:386: HLA-C*07:391: HLA-C*07:392: HLA-C*07:49: HLA-C*07:56:02: HLA-C*07:66: HLA-C*07:67: HLA-C*08:01:01: HLA-C*08:01:03: HLA-C*08:02:01:01: HLA-C*08:02:01:02: HLA-C*08:03:01: HLA-C*08:04:01: HLA-C*08:112: HLA-C*08:20: HLA-C*08:21: HLA-C*08:22: HLA-C*08:24: HLA-C*08:27: HLA-C*08:36N: HLA-C*08:40: HLA-C*08:41: HLA-C*08:62: HLA-C*12:02:02: HLA-C*12:03:01:01: HLA-C*12:03:01:02: HLA-C*12:08: HLA-C*12:13: HLA-C*12:19: HLA-C*12:22: HLA-C*12:99: HLA-C*14:02:01: HLA-C*14:03: HLA-C*14:21N: HLA-C*14:23: HLA-C*15:02:01: HLA-C*15:05:01: HLA-C*15:05:02: HLA-C*15:13: HLA-C*15:16: HLA-C*15:17: HLA-C*15:96Q: HLA-C*16:01:01: HLA-C*16:02:01: HLA-C*16:04:01: HLA-C*17:01:01:01: HLA-C*17:01:01:02: HLA-C*17:01:01:03: HLA-C*17:03: HLA-C*18:01: HLA-DQA1*01:01:02: HLA-DQA1*01:02:01:01: HLA-DQA1*01:02:01:02: 
HLA-DQA1*01:02:01:03: HLA-DQA1*01:02:01:04: HLA-DQA1*01:03:01:01: HLA-DQA1*01:03:01:02: HLA-DQA1*01:04:01:01: HLA-DQA1*01:04:01:02: HLA-DQA1*01:05:01: HLA-DQA1*01:07: HLA-DQA1*01:10: HLA-DQA1*01:11: HLA-DQA1*02:01: HLA-DQA1*03:01:01: HLA-DQA1*03:02: HLA-DQA1*03:03:01: HLA-DQA1*04:01:02:01: HLA-DQA1*04:01:02:02: HLA-DQA1*04:02: HLA-DQA1*05:01:01:01: HLA-DQA1*05:01:01:02: HLA-DQA1*05:03: HLA-DQA1*05:05:01:01: HLA-DQA1*05:05:01:02: HLA-DQA1*05:05:01:03: HLA-DQA1*05:11: HLA-DQA1*06:01:01: HLA-DQB1*02:01:01: HLA-DQB1*02:02:01: HLA-DQB1*03:01:01:01: HLA-DQB1*03:01:01:02: HLA-DQB1*03:01:01:03: HLA-DQB1*03:02:01: HLA-DQB1*03:03:02:01: HLA-DQB1*03:03:02:02: HLA-DQB1*03:03:02:03: HLA-DQB1*03:05:01: HLA-DQB1*05:01:01:01: HLA-DQB1*05:01:01:02: HLA-DQB1*05:03:01:01: HLA-DQB1*05:03:01:02: HLA-DQB1*06:01:01: HLA-DQB1*06:02:01: HLA-DQB1*06:03:01: HLA-DQB1*06:09:01: HLA-DRB1*01:01:01: HLA-DRB1*01:02:01: HLA-DRB1*03:01:01:01: HLA-DRB1*03:01:01:02: HLA-DRB1*04:03:01: HLA-DRB1*07:01:01:01: HLA-DRB1*07:01:01:02: HLA-DRB1*08:03:02: HLA-DRB1*09:21: HLA-DRB1*10:01:01: HLA-DRB1*11:01:01: HLA-DRB1*11:01:02: HLA-DRB1*11:04:01: HLA-DRB1*12:01:01: HLA-DRB1*12:17: HLA-DRB1*13:01:01: HLA-DRB1*13:02:01: HLA-DRB1*14:05:01: HLA-DRB1*14:54:01: HLA-DRB1*15:01:01:01: HLA-DRB1*15:01:01:02: HLA-DRB1*15:01:01:03: HLA-DRB1*15:01:01:04: HLA-DRB1*15:02:01: HLA-DRB1*15:03:01:01: HLA-DRB1*15:03:01:02: HLA-DRB1*16:02:01:| $samtools_bin sort --thread $num_processors -m $samtools_sort_memory_per_thread -O BAM - > $sampleid.extract.bam 165 | 166 | OUT=$? 167 | if [ ! 
$OUT -eq 0 ];then 168 | echo 'Something went wrong while running samtools to extract HLA reads from the 38DH_NoAlt realignment (38DH_NoAlt)' 169 | exit 1 170 | else 171 | rm $sampleid\_tmp.extract_on_grch38.bam* 172 | fi 173 | 174 | #rm $sampleid.tmp.extract* 175 | 176 | echo ">>>>>>>>>>>>>> indexing extracted bam (38DH_NoAlt)" 177 | $samtools_bin index $sampleid.extract.bam 178 | # Convert the final extracted BAM to gzipped FASTQ; only the paired files are used afterwards. 179 | echo ">>>>>>>>>>>>>> bamUtil fastq extraction (38DH_NoAlt)" 180 | $bamUtil bam2FastQ --in $sampleid.extract.bam --gzip --firstOut $sampleid\_extract_1.fq.gz --secondOut $sampleid\_extract_2.fq.gz --unpairedOut $sampleid\_extract.unpaired.fq.gz &> /dev/null 181 | 182 | OUT=$? 183 | if [ ! $OUT -eq 0 ];then 184 | echo 'bamUtil fastq extraction Failed! (38DH_NoAlt)' 185 | exit 1 186 | else 187 | rm $sampleid.extract.bam* $sampleid\_extract.unpaired.fq.gz 188 | fi 189 | # Align the extracted read pairs to the merged Kourami HLA panel; this BAM is the script's final output. 190 | echo ">>>>>>>>>>>>>> bwa mem to hla panel for Kourami " 191 | $bwa_bin mem -t $num_processors $merged_hla_panel $sampleid\_extract_1.fq.gz $sampleid\_extract_2.fq.gz | $samtools_bin view -Sb - > $bam_for_kourami 192 | OUT=$? 193 | if [ ! $OUT -eq 0 ];then 194 | echo 'bwa alignment of extracted reads to HLA panel failed...'
195 | exit 1 196 | fi 197 | -------------------------------------------------------------------------------- /resources/HLA_decoys.fa: -------------------------------------------------------------------------------- 1 | >DQB2_decoy 2 | TCCGGCAGGTACATAAGATCCATTAGGTTTGAGCTGTGTTGACTACCACT 3 | GCTTTTTCCTTGGTCTCACTTACGTCTTGGAAGATGGCTCTGCAGATCCC 4 | TGGAGGCTTTTGGGCAGCAGCTGTGACCGTGATGCTGGTGATGCTGAGCA 5 | CCCCAGTGGCTGAGGCCAGAGACTTTCCCAGTAAGTGCAGGGCAGCTGCT 6 | CTCGAGAGCCACCACTGTGGGAACAGGCTCTCCTTGGGTTGGAGTATGGG 7 | GGATGGTGATCTCCATGATCTCAGAACACAGTCTTTTATCACCATTTATT 8 | CTTTTTGGGAAATAGAGCTATGTTGCATTTTTATTTCCACCTTATAATGG 9 | GTGAGGTGAGGATAATCCAACCCCAATCCCACAGGTTTAAGCCTGAAGGA 10 | GGAGAGAGGAAAGAGGAGACAAAGTGTGCATTCACTACCTGTGACAGGAC 11 | AAAATGACCATGGCACTCCACGGTTATGCATTTCCCCAAAGATATGCATT 12 | TCCCCAAAGACACAGTAGGATTTTTCTGCACTGGGAAAATGTAAGGCAGC 13 | AATGGTGTCTGTAGTCTCTGTATTGGAGGTAAAGGAGTCTATACTACTGA 14 | CTCGAGTGGAGAGTTTGTGGAGGCAAACTCTTAGTACTGAGGGAAGGTGA 15 | CTGGATGACCACAGACAGGGAGTCTTACTTTGGGTTTCACTGATTTATGG 16 | GCAAAAGGTGACTTGAGTGGGATTCAGGGACCTGAGTTGATGGTGGACTG 17 | AATTTAGTATGATAGGAAGGAGGAAGTAAAGAAGGGAAATAATACATATT 18 | GAGAAACCACTCCATTCAGACACAGGACAGTACTTTCTATAAATCCTCTC 19 | TCACTCCTCCTAACATCCTATGTGTAGGTATCATGATTTTCCTTTTATGT 20 | AATTATACTTGTGATATGGATATTCTGTTAAGTAACCTGCCCAAGCTGGT 21 | GATTGACTCAGTTTAATTGGACCCTATAGAATTCAAAAGCTTGGGCTCTT 22 | TCCATGAATAAATGTTTCCTTCTAGGACTCCGGAGGTGTAGGTCCTTTCT 23 | AACACAGAAGTGAGTGAACCTCACAGGGCACTTGGGCGGGTATAGCAGAA 24 | AGAGAGTAAATCCAGGCATGGGTTTACTTGGTCTCTTGCCCAGGGACCAA 25 | GAGAATACTTACATCAGGATGAGAACAAGCTTAATTCCTGAACCTTTCTC 26 | GTTATTCCCTTGAACTCTCAAATTTATGTGGATAACTCTGTCTCCGAGAT 27 | TCCCAAGAGCTCCATGGAAAATGGGATTTCATACGAGAACGCCCTGATCT 28 | AAGAGCAGAGGTCAATGTTGAATCGGTCCGACTGCCCTCTTCACTTGGTT 29 | CACAGGCTCAGGCAGGGACTGGGCTTTCCCTCTTACCTCCCTAAAGGAAG 30 | GCAGATTCCCGAGGCCCTCAGAGAGGGCGGGCAGGGCTGGGGCAGAGATG 31 | CCTCGAGGATCCCAGGTCCGGAGCACGAGGCACGGGCCCAGCCAAGAACT 32 | CAATTTCGCGTGGACGGGTTTCGCAGCTGCTGGCCGGGTCAGGGCAGCGG 33 | 
CTGAAGGGTGCGGTCCGGCTGGGGGCTGGGGCTAGGGCCGTGCTGGGGCC 34 | TGACTGACCCGCCGTGATTCTCCGCAGAGGATTTCTTGGTCCAGTTTAAG 35 | GGCATGTGCTACTTCACCAACGGGACAGAGCGCGTGCGCGGTGTGGCCAG 36 | ATACATCTATAACCGCGAGGAGTACGGGCGCTTCGACAGCGACGTTGGGG 37 | AGTTCCAGGCGGTGACCGAGCTGGGGCGGAGCATCGAGGACTGGAACAAC 38 | TATAAGGACTTCTTGGAGCAGGAGCGGGCCGCGGTGGACAAGGTGTGCAG 39 | ACACAACTACGAGGCGGAGCTGCGCACGACCTTGCAGCGGCAAGGTGAGC 40 | GTCGTCGTCCTTCCGCGGGGCTCACCCTTGGCCGGGGCCCGAGTCTCTTG 41 | CGCACAGAGGGGCGAGGACGGCGCGGCCTCAAGGACCGAGCCCTGATCCA 42 | TCCCAGGGTACAGGAAGGTGGCGGGGATTTGGAGGCTGGGGTAGTATCGG 43 | AGGGGCGGGGATCTAGGGCAGAGCAGGGGGATGCACAAAAGCATCCCTTA 44 | GTTCCCTGCAGGGTTGGGTTAGGCTGCCCAGTGTGTCCCCAGCCTCCCCG 45 | TCCATCGGCCTTGTCCTCTGCTCTGCATGTTCTTGCCTTGTGCCTTATGC 46 | GTTTGCCTCCTCGTGCCTTACCTTCGCTAAGCAGTTCTTTCTGCCCGAAT 47 | GCCCGCCCTCTTCCCCTGCCCGTCCGCCCCACTAGCACTGCCCCACCCAG 48 | CAAGGCCCACTTGCACAGCTCGCGCCGCAGGAAGCTTCAGGCTTGGCCTG 49 | GTGGAGTTAGGGCTGCTCCACAACTGCGCGCAGGGCATCCAGCAATTACA 50 | GTTGTGAAATAAGATATTTTAACTTTTGGCTTCAAATTATTATTCATCGT 51 | AATTCTGTTTTCTTAAACGGCTCTCATTCATGGCGGAGCTCTTTGAGGTG 52 | AGAGTGTTTTAATCATTGCATGCCTAGTACCTGACTCGTGGACCGGCATG 53 | TGGTATGAGCTCAATGATCTTCTGTTAAATTAATGAATAAATGTACTCAG 54 | CTGCCCATCCACTTAGGCTCAAGGGAAAGCAGAGGATAAATAGAGCCTTA 55 | AAGATGGACTTTATCAATTATTTTCTATTATTTTGCTTAATGCTGTAAAC 56 | TCTTATTGACTTGGATCTTAGTAAGGTTTGTGAATGCAGTCTGGGGAAAA 57 | AGGTGTTTGCTGAAAATAAAAACAACGCTTGAATGGTGTTATAAGGCAGT 58 | TTTAATTTCTTAGAAAAGCTGAACAAATGGCACAATGAAAAGAGCAGAAG 59 | CTTTGGAATACATAGATTGAAGCCACTAAATTATTGAATAAAAATAGTTT 60 | CAGGTTGCTTTTGGAGTAGATTTTCTCCCTCCCCCCATCACTATCCACTT 61 | CAGGCATAAACATTCTGAACGTCAATTTTACCCACTTAGTGAGCACTTAT 62 | TTCTAGACAATTGCCTTAGCAAACACCATCTAAGTTATGTCATTTAATAG 63 | CACAGTTACCTGTGCATTAGAGATTAGCATTGCCACTTTATATATCGTAA 64 | TATTGGTACATGATAAACACTTTAAGTAATCAACCCACAGTTATGCACCA 65 | GGACCTGAAGCCTCCCCCAAATACACAGCATTCTTTTATGTTCTTCAATA 66 | CTCGTCTACACAGCCTAAGGGAAGTAAAGCCTTGTTAAAGCCAATTTTGA 67 | CAAGAAGCAGCAATGGGTCTATTCCTGCCTGTTTTCACTGTTAATGGGAC 68 | 
AAAATGATACTTTCAAGGCATTGAAAATTCACTGATTAATCAATCCCTAG 69 | TCTGACCCCAGTGTTATCTATGCAGGTTCACAAAACTTCCTTGCCTTCTT 70 | CTGACCCACATCCTAATGCTGTCAATTATTTATATTTTTGCCATTTCAAG 71 | TCTATTTCTATAAAAGTTATTCTATCATTTTTTTCTCATGAATTTGTGCC 72 | CTCTATTTTTACTTTCAGTCTTTTTAAGATGAACAAATCTTGTAAGTCCC 73 | CACATAGCTGACTGTTATTTCAGTCAGACTCCAGGAAGGAGGGCCTAAAG 74 | AAAAGTTCAAGTCCAAGCAGAAACCAAGATTCCTTCCAGACAATGGCTCA 75 | TGAGTGCCATTTAATTGGGGTGCTACCTGCTGACCTCAGCAAATCCCAGC 76 | TATATGTATATGTTTGCATTACAGGCACATTCACCCAGGCCAACCTCTGC 77 | ATGGATCTCAGAATATTTCCTATGGAGAACGTACATGATAATGTCTGATT 78 | TCAGAACAAGAAAGTAATTCTCAATAGCAAGGGGATGGAGTAGGGTAGGC 79 | AGCTAGTAATTACACTATCTTGAGGGTTAAAAGGAAATTAAGAAAAAGCA 80 | GGAAAATGAGAGAACATATTACCAAGTAAATAAAGCATACATTAAATATT 81 | TACTATAATTTTACACTAAAGAAATAAAGGAAATGCAGTAAAATGGCCAG 82 | AGAGGTAAAGGTTAAGATGTATAAAATATGCAGGGAAAGGTGTGTCATTT 83 | TTGACCATGAGCAGCGCTCTGAGAAGATAAAGGAATTGAGTTATGGGCAA 84 | ACATGATGTTTGATCAGTGTTAGTTTTTTTCAAGGCCTGCCTACTTTTCC 85 | TTCAAATATTACAAACTTTTGAAATAACATTCAATTTTTTGGTCTCTGTT 86 | ACTAGATTGCAAGTTCTATAAAGGCAGGAACCAGGGTTTGTTGTTTATTT 87 | TTGGATTCTCAGTGATTGTCAAATTTATATTTGTTGAAGGAACCTTAATC 88 | CAAGACTTGGACTCCAGGTATCTTTCTATTCTGGTTCCAAGGAGGGACCT 89 | TCCTCACAGCAGGCGTGCTGTGTGGTCTCACATCTCACTCCTATATCTTT 90 | CCCTGTCTGTTACTGCCCTCAGTGGAGCCCACAGTGACCATCTCCCCATC 91 | CAGGACAGAGGCCCTCAACCACCACAACCTGCTGGTCTGCTCGGTGACAG 92 | ATTTCTATCCAGCCCAGATCAAAGTCCGGTGGTTTCGGAATGACCAGGAG 93 | GAGACAGCCGGTGTTGTGTCCACCTCCCTCATTAGGAATGGTGACTGGAC 94 | CTTCCAGATTCTGGTGATGCTGGAAATAACTCCCCAGCGTGGAGACATCT 95 | ACACCTGCCAAGTGGAGCACCCCAGCCTCCAGAGCCCCATCACCGTGGAG 96 | TGGCGTAAGGGGAAACTGGTTTCCTTTTACTGTGGGCCCCACAAGACAAA 97 | GGGCAGAGCTCCCGCTGATCCTTCCCATCCCATCTCTTGTCCCTGACATC 98 | ACTACTGAGCTGGGAATCACAGGAGACTAGAGCACCTGTTGCCCCATGGC 99 | AAGCACATCAGATGAATCCTGATCTCTTTGTCTTTCCAGATACCAGGGAG 100 | ATCACTTTCCACATTTGTGTTAGTCCATTCTTGTACTGCTACAAAGAAAT 101 | CTCTGAGACTGAGTAATTTATAAAGAAAAGAGGTTTAATTGGCTCTTCTC 102 | ACTCCACTATAAAGAAATACCTGAGAATGGGTAATTTATAAAGAGAAGAG 103 | 
GTTTAATTGGCTTATGATTCTGAGGCTGTAGGGGAAGCATAGTGGCTTCT 104 | GCTTATGGGGAGACATATGGAAGCTCCTAATCATGGCAGAAGGAAAAGAG 105 | GGAGTGAGGTGTCTCACAGGGCAGGGGCAGGAGCATGAGAGAGAGGGGGT 106 | TGGTGCTACGCAGTTTTACATAACCAGATCTCATGAGAACTCACTATTGT 107 | AATGACAGTACTAAGGGAGATGGTGACAAGAATCTGGTCTAATGATCCAG 108 | TCACCTCCCACCAGGCTCTACCTCCAACATTGTTAATTACAATTGAACAT 109 | GAAATTTGGGTGGGGCCACAGAATCAAACCATATCAACACTACTAAAGCC 110 | CCAGAACCAGCTCTGACAGCTATGAGAGACTGACTTAGGGCTGGTGACTG 111 | GGGCCTTAGGGTTTAAGGTTATGGATGAAGTCCTGAGGGGCAGGGGTGTG 112 | CTTCTTCCTCTCCCTCACCCACCTATTGTGTCCAAAGACCTACTGGCTGG 113 | TCTTTCTCTTCCCTAGGGTGGTCAGACTGGAGAACTAGTGTCCCCTGACA 114 | TCTCCACCTCCTGTACCAAGGACATTATGGGGTGTGGGGACAAACACTCA 115 | CACTCAGTTCTGCTCCTTAGGGGCTCAGTCTGAATCTGCCCAGAGCAAGA 116 | TGCTGAGTGGCATTGGAGGCTTCGTGCTGGGGCTGATCTTCCTCGGGCTG 117 | GGCCTTATCATCCGTCACAGGGGTCAGAAAGGTGAGGAACCCAAGGGGGA 118 | AATGGGGAAGATGAGCTGTGACCCAGACCCTCTATTCAGAGAGGTTCTGT 119 | CTCTAGATGTAGCTCTTTCCTCCTTACCCTGAGAGGAAGTGCGAGGAGAC 120 | AGGACAAGATTGGAGGAGGCATTGGAATCTGATTTTACTGGGTGAATGGT 121 | AGCGCTGCCAGAGCTGACTGATAGAGCTTATTCCAGGGCGTCCTTACCGT 122 | TCATCATCGTCTCACTGGCTCCTTTCTAAAAGCTTCCTCCATTATGAGGG 123 | TCAGAGCCTCGGCCTCCTTGTCTTCTAGTGACAATTTCCTTTGTTTTGGG 124 | GGATTTTAACTTAGGGTGCTTAAGGACTTAAAGAACATGGGAGGGAAGAG 125 | GATATAACCCCAATTAAACTACATGTGTCATTTTCCTTTGGGGTAAGATA 126 | GTGGTTGTTTGTTTAACAAGACCTTTCTCTGTATAACTTCCTTTTGTAGG 127 | ACCTCGAGGGCCTCCACCAGCAGGTAATATTTCAGCCATGATCCAGTCAG 128 | GGGAGAGGGCACAGGCATAAGAGGGAAGAGCCATGGTGAAACCGCATCTC 129 | TACTAAAAATACAAAAATTAGCTGGACGTGGTGGTGTGCATCTGTAATCC 130 | CGGCTACTTGGGAGGTTGAGGCAAGAGAATCACTTGAACCCAGGAGGCAG 131 | AGGTTGCAGTGAGCCAAGATGGCGCCACTGCACTCCAGTCTGGGCGATAG 132 | AGCTAGAGTCTGTCTCAAAAAAAAAAAAGAAGAGCATGAGCGGAGTGTTC 133 | CAGGGCACAGTGGTCTCTGTTCATGGCCTGTTTGCTGCTATGAGGGTTAA 134 | GACTTAGGGGAAAAGTTTGCCAGTTTCTACGAATCTCCAGAGATTGTTTC 135 | CTAGAACCAGGCCTTAACTTTGGTGGCATCTTTTTGTGAAATGTGGGGAC 136 | AGAGCCACATCTTGAATGTGAGATAGTAGGGTGATGCCCACTTTGTGCCA 137 | CATTTTGTTAGCTACTGCCTGTAGGCATTTTCAGTGACTAAAAGAGGCTG 138 | 
CTAGTGGTGGAGATGAAGTGTCACCCAATTTACTAAAAAAATCAAACTCT 139 | TCATATTACCCAGAAGGGTAACTGCTGTTCCCCCACCTCCACATATCTGC 140 | ATCAAGCTGAAGTTCTGTGTCCTCATGAGCTGATTTTACCTTTACACAGA 141 | TATTGGGGAACGTGATGATGATATGCCCTGGACCTCAGCATCCTCTGTTT 142 | GATGCTACAGAGGGAACTGAGGACTAGGGGAGAGGGTGTGTCCCTCAGGG 143 | TACCCTGTGCTGATCATGCCTCGTCTCTCTTCTCCAGGACTCCTGCACTG 144 | ACTCCTGAGGACTTTTGTCTGGGATTGGTCATCACTCTTCTGTAATGCCC 145 | ACCTGCCCCTGCCCAGAATTCCTAGCTGCCTGTGTCACCCTGTCCCACTG 146 | AGGTCAGAGTCCTACAGTGGCTCATGCAGCCACAGGTCACCTTCTGTGAT 147 | CCCCATCCCAAGGCACTGGTGGTGACTCTGCTTCCTGCACTGACCCAGAG 148 | CCTCTGCCTGTGCACTGCAAGCTGTGTCTACTCAGGCCCCAAGGGGCATC 149 | TCTGTTTCCATTCTCCCCCCACAGACCTGTCAAGAGAAGCATGACAAACA 150 | AAATCATTTACCTGACTTTAGTGCTTTTTCCCATAATTAAACCTGATTCT 151 | GAGTTA 152 | >DQA2_decoy 153 | TCCTCACAATTGCTCTACAGCTCAGAGCAGCAACTGCTGAGGCTGCCTTG 154 | GGAAGAAGATGATCCTAAACAAAGCTCTGCTGCTGGGGGCCCTCGCCCTG 155 | ACTGCCGTGATGAGCCCCTGTGGAGGTGAAGACATTGTGGGTGAGTACAT 156 | GAGTGAGGAATGTTCTCTGGAGCTGAAAAACAGTAAATTAAAGGAAAAGA 157 | GAGAGTGTAATTTGCTAAGAAATAGTAGAAATTTCCCAAGGGTCTTTTCA 158 | ATATTAAGAAATTTTAAAATTATGGCAGTTCCTCCTTTAGGAAACCAGAG 159 | CTCCAACCGACTCTCTTTGCTACCTGTGCTATTGGAGTTTACCAAGGACG 160 | TTGTTCTGTTTATATTATATCCAGAGACTATAGCCTGGAGGTCTGTGTGG 161 | CATTCCATCATGATTGCCTCAAAGACTAGGGATGTTTCCATGAATGGAGT 162 | ATTTTTTTGTTATTAAAAATTTCTGAACTGTTACTCCCAAATTTCTCTGA 163 | ACAACTTTTGAAGCTTTTCATATGCCTCCTATAGCATATGTTGGGGTAGA 164 | TAGTTCCATGAAGTATGTACACTCTATAGATATAAAGAAAGAGGTTCTTT 165 | TCTTTCTCTCAGACTTACATTTCCACATGGGAATTGGCACAGGTGGGGAG 166 | TAGGTGAAAGAGCCCAGCAGGCTGAATGCCTTCAACAATCATTTTACCAC 167 | GTGGTAAATGTGGTACTTACTCTCTGCTACCTCATATATGTCACCTCGCT 168 | TATGATCAAATAAAATGGGCATGTAGATATGCTTTATGAATAGTAAAAAC 169 | ATGAATGTCAACTTTTTTTAACTTATTCCTATTACAGGTATAACTTCGTA 170 | TTTTTTCTTTAGCAAAGTAAGGAATATATTTTAAAACTGAGAACTTTATG 171 | ATAAAATGCTTGGTAAATTAAATTATTTTATTCTCAAATTGTCAACCCAA 172 | ATTACTAGTTCTTCACCTTATCTAATGAAGTCTTATAAAGAGAAAAATGG 173 | GCAGGCACAGATAATTATTTGGTCCCTTAGTCCCCTCTGCCTTTGTCGTC 174 | 
CATCTCTTCCCACCTCTCTTCATGCATCCCTTTCTCCCTCTTCCCTTTCA 175 | GGATCCATCTCTGACTCCCTGCTCCTTTACAGACATGGGCAGTGGGTTTG 176 | TAAAACAAAAGTTGGAAAGTCAAATAGTTAAAAGGGGAAGTGAACTGGAA 177 | GCTACTCTAAACTTTCACAACCTTATTAACCATGGCTGCTCCCATTCTGA 178 | TTTTGTTTGGCAGTGGAAGTTTCACCTGCTTCTCCAGAGCACTTGGCTTT 179 | TTTGTTTCAAATTTCCTTTCTTCAACCTCACACCAGAGTGCCCCGGTCAG 180 | GCTCGACTTATCCATTAGGAACAGTGTGGGCAGTGAAGGGGACCCTCCAA 181 | ACTGTAAAGCTACAAGAGAACGTTTTAACTCCTTTTAAAATTAGAAGAAA 182 | AATGAAGTTTTACAGTCTATGAAAATGTTTTAACTTTTTTTTTTTTTTTT 183 | GACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCACGATCTCG 184 | GCTCACTGCAAGCTCCGCCTCCCAGGTTCACGCCATTCTCCTGCCTCAGC 185 | CTCCCGAGTAGCTGGGACTACAGGCGCCCGCCACTGTGCACGGCTAATTT 186 | TTTGTATTTTTAGTAGAGACGGGGTTTCACCGTGTTAGCCAGGATGGTCT 187 | CGATCTCCTGACCTCATGATCCGCCCGTCTCAACCTCCCAAAGTGCTGGG 188 | ATTACAGGCGTGATCCACCGCGCCCGGCCTTAACTTTTAATGTAGCCTGG 189 | ATTGTATTTGTCTTTATACCAATACAATCAGAAGCTGTAATTTTCCGTAT 190 | TTTTATGGAGGAAGGCGCCCACAAAAGCAACAGTGCTCGGGGCTCACAAG 191 | TCAGAATTCAGCCCTGGGCATCCCTGATCCTGGGCTTTGCGTGGTTCTGC 192 | TACCTGGGTGCCTGTCAGTCTTCCCCAAAATCTATGTAATTGTCAAAAAT 193 | TGCAATTGTCATTCAATACACATGTTTGAGCACACAATGAGCTAACTTTT 194 | GGGAATTCAAAGATAAAAAATCATGCTGTCTGCCTTGCAGAGGGTGCACA 195 | AACCAGTGATGGAAACAGTATGGGGCACAGGAAAGCAGAAGGCCCTGCTG 196 | AGCAGGACACTGGCCCAGCAGAGGCTGAAACTATAAAAATGACTTGGTTC 197 | CAGCTGGGCCAGTAGAGTGATGTCCTCCAGCAACACTTAGCACCCAGGAC 198 | AAGTACCAGATGAAAAGAAGGATTGCATGTATTCCACATATATTCATGTT 199 | TGAACAAGGAGTCAAAGTTTATTGTAAGGATAAGGAGTCTTTGTTGGTGG 200 | CCTGTTAAGTAACCAACCAGGGCAGTCATGCTGGGTAGGGAAGAAGGTGA 201 | GCTGGAGGAGGAAAAGACAAACTTGGAGAGCCAGACATTGAGATTCCATT 202 | GAGGCGTTGGAGGTCACAACGCGGTCAAAAACATGTTGAGAGGACTTAGC 203 | TGCAAAGTTGTTAACTAAGTAGAAACCTCAAGGATGAATTTTAGGATTTC 204 | TCCAGGAAATCCTAAAAGATAACTTCTTTCAGGGAGAAAAAACAGACCCT 205 | TGCAAAGACATGAAAGGAAATGTAGTTTGGTTTGATTGGCAGATAGTTGT 206 | GAAGAATGTCGGACTGTAAGGCTGTCGATATCCTCCTCACAGAATTCCCC 207 | AAAGTACATTGTATTTGCTCCCTTACCGACCTGATTCTCCCACTATTCAG 208 | TTCATTCCTTGATGCTGTTTTAAGCAACCCCTGCTCTGTCTGACACTTTT 209 | 
GGATGCTCAGTAAATGAGGAAGGAAGGAAGGAAAGATAAAATGGTAAAGG 210 | GCTCACACATGTCTTAACAAAAATGTCCAGTTCGGCTCATTTGGCTATAC 211 | TTCATGGCTGCTGCTCTGCCCTGGCATCCTCGGATAAGCTCGCTGCCCAT 212 | TAGAGGAAAAAGGGTTTAATTTACCTGAGTCCTCGAGTGAATGTAATTGT 213 | TGAATCAGAACACTATAGATATTTAGTAACCTCCTTCAGAGGAAAAAAAA 214 | AAGTGGGGGCAATGACAGAAATTAAAAAACCAGTTGAGCTTCCACTTTTC 215 | ATTTCAGAAGAAATCAGGTGCTCTCCTCTAAGGACCACTACTATTAACAA 216 | AACAGAGACCTTAGAAGAATTGTTTATTTGTTATAAATGTATAATGTTGC 217 | TATTCTTGTAATAGTCTTTCTTGTACCCTATAATTGTTAGAAGAAATTAT 218 | TTTAAGTTAATACGTTCCTACATGCTTTTCTTTGGTTTAAAAAAAAAAAA 219 | AAAGGAAACTCTGTGTAGAAAGTGTCCTGTTCTGATCTAGTCCTGACAGG 220 | AAACGAAGTATAATCAACTTGTTATTAACTGAGAGAGAAAACTTAGGAAG 221 | CAGAGGGAAATAAACTGAATCTCTGAGTAAGAAAACTAAATCCTATGATA 222 | ACTCATTCATTCCTTCCTTTGTTTATTGCAATATTCATCATAAGCTTATG 223 | ATGCGCCAGGCACTAAGTAGGCACTCAGGAAATAACAGACGTGTGACGTT 224 | CTGCCTTTGTGGAGCATATGTTATAGTGAGAAAGACAGAATCAGTTCTAA 225 | CCTGATGACTACCAACGTTAGGCAAGGAGGAAGCAGGTGTTAGGAAGATT 226 | GTTCAGGGACTGTGCCAAAGATGAAGCCCATAATATTTGAAAGTGAGTTT 227 | CTTCAATCACTTTCTGTATTAAGGTTCTTTCTCCCTGTGTTCCACCCTCC 228 | TGCTTGTCACCTTCACTCGTCAGCTGACCATGTTGCCTCCTATGGTGTGA 229 | ACTTCTACCAGTCTCACGGTCCCTCTGGCCAGTACACCCATGAATTTGAT 230 | GGAGACGAGGAGTTCTATGTGGACCTGGAGACGAAAGAGACTGTCTGGCA 231 | GTTGCCTATGTTTAGCAAATTTATAAGTTTTGACCCGCAGAGTGCACTGA 232 | GAAATATGGCTGTGGGAAAACACACCTTGGAATTCATGATGAGACAGTCC 233 | AACTCTACCGCTGCCACCAATGGTACGTGTCCACCATTCCGCCTCTCTTT 234 | ACTGAAACTAATCTTTCATACCAAGTTTTACTCCCTTCTTCTCAAGAGAT 235 | TTCCAGATCTTCTCATGGTAATTGCTGAAATTTTATCATCTCCCATCTCT 236 | AAAATCACATATTCCCATGTAATACAAGGGTCTTTCCATTATGTATTAAT 237 | TCCTACTTTATTAAACATGCCCACAGAGAGAAGGGCACAGGAATAAAGCA 238 | GAGGCAATGTGTCGTTGCTCCCAAGCAGAAGGTAAATAAGACCTCTTTGA 239 | CTATCAGGTGGTGAAATGCTGGTAGGAGGGCTCTTCCAGGATGTAATGCA 240 | GAAGCTCATGGCAGAGCTATTCACACTTCACATCAGTGCTGTTTCCTCAC 241 | CACAGAGGTTCCTGAGGTCACAGTGTTTTCCAAGTTTCCTGTGACGCTGG 242 | GTCAGCCCAACACCCTCATCTGTCTTGTGGACAACATCTTTCCTCCTGTG 243 | GTCAACATCACCTGGCTGAGCAATGGGCACTCAGTCACAGAAGGTGTTTC 244 | 
TGAGACCAGCTTCCTCTCCAAGAGTGATCATTCCTTCTTCAAGATCAGTT 245 | ACCTCACCTTCCTCCCTTCTGCTGATGAGATTTATGACTGCAAGGTGGAG 246 | CACTGGGGCCTGGACGAGCCTCTTCTGAAACACTGGGGTAAGGATGAGTT 247 | CCACCACTTCATGGGTTTCTAATAATAGACTTCACTCTTCTCCCTAAGCC 248 | TGGGGCCTTGAGTCTTGCAGAGCCAGCCCTCCACCCCATCCCATCCCACA 249 | CACATGCACATGAGCACACTGCACATTCTGACCTCAACAGCTCCACTTTC 250 | ACAGAGCCTGAGATTCCAGCCCCTATGTCAGAGCTCACAGAGACTTTGGT 251 | CTGCGCCCTGGGGTTGTCTGTGGGCCTCATGGGCATTGTGGTGGGCACTG 252 | TCTTCATCATCCAAGGCCTGCGTTCAGTTGGTGCTTCCAGACACCAAGGG 253 | CTCTTATGAATCCCATCCTGAAAAGGAAGGTAAGATTGAGATTTGTTGGA 254 | GCTGAAACCTCAGTATGAGAGGGAGGAAAGTGGGAGGGGGTTGTGGACAT 255 | GAATGTGGTTGAAAGTTGTAGGCGAATTGGGAAGTGGCATGATGATCACA 256 | CAGGAGGCCCCTCAGACCCATCGATCTCATGTCTGTCCTGTTGCAGGTGC 257 | ATCACCATCTACAGGAGAAGAAGAATGGACTTGCTAAATGACCTAGCACT 258 | ATTCTCTGGCCTGATTTATCATATCCCTTTTCTCCTCCAAATGTTTCTTC 259 | TCTCACCTCTTCTCTGGGACTTAAGGTGCTATATTCCCTCAGAGCTCACA 260 | AATGCCTTTCAATTCTTTCCCTGACCTCCTTTCCTGAATTTTTTTATTTT 261 | CTCAAATGTTACCTACTAAGGGATGCCTGGGTAAGCCACTCAGCTACCTA 262 | ATTCCTCAATGACCTTTATCTAAAATCTCCATGGAAGCAATAAATTCCCT 263 | TT 264 | >Decoy_A1 265 | CACTGGGTCCATAAATCCTGTCCTGGTTATCTCCCCATTCTCTGTAAAAA 266 | GGATTCTCTGTAAAAAGATTACATCGCCCTAAAGGAGGACCTGAGCTCTT 267 | GGACCGCGGCGGCCATGGCGGCTCAGATTACCCAGCGCAAGTGGGAGGCG 268 | GCCCATGAGGCGGAGCAGCAGAGAGCCTACCTGGAGGGCACGTGCGTGGA 269 | GTGGCTCCGCAGATACCTGGAGAACAGGAAGGAGATGCTGCAGCGCACTG 270 | GTACCAGGGGCCACGGGGCGCCTCCCTGATCGCCTGTAGATCTCCCGGGC 271 | TGGCCTCCCACAAGGAGGGGAGACAGATGGGACCAACACTAGAATATCAC 272 | CCTCCCTCTGGTCCTGAGGGAGAAGAATCCTCCTGGGTTTCCAGATCCTG 273 | TACCAGAGAGTGACTCTGAGGTTCCACCCTGCTCTCTGACACAATTAAGG 274 | GATAAAATCTCTGAGGCAATGACGGGAAGACGCAATTAAGGGATAAAATC 275 | TCTGAGGGAATGACTGGAAGACGATCCCTCATTTAGTGATCCCAAGTCAC 276 | >Decoy_A2 277 | CACTGGGTCCATAAATCCTGTCCTGGTTATCTCCCCATTCTCTGTAAAAA 278 | CGATTCTCTGTAAAAAGATTACATCGCCCTAAACGAGGACCTGAGCTCTT 279 | GGACCGCGGCGGCCATGGCGGCTCAGATTACCCAGCGCAAGTGGGAGGCG 280 | GCCCATGAGGCGGAGCAGCAGAGAGCCTACCTGGAGGGCACGTGCGTGGA 281 | 
GTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACTG 282 | GTACCAGGGGCCACGGGGCGCCTCCCTGATCGCCTGTAGATCTCCCAGGC 283 | TGGCCTCCCACAAGGAGAGGAGACAGATGGGACCAACACTAGAATATCAC 284 | CCTCCCTCTGGTCCTGAGGGAGAAGAATCCTCCTGGGTTTCCAGATCCTG 285 | TACCAGAGAGTGACTCTGAGGTTCCACCCTGCTCTCTGACACAATTAAGG 286 | GATAAAATCTCTGAGGCAATGACGGGAAGACGCAATTAAGGGATAAAATC 287 | TCTGAGGGAATGACGGGAAGAAGATCCCTCATTTAGTGATCCCAAGTCAC 288 | >Decoy_A3 289 | CACTGGGTCCATAAATCCTGTCCTGGTTATCTCCCCATTCTCTGTAAAAA 290 | CGATTCTCTGTAAAAAGATTACATCGCCCTAAACGAGGACCTGAGCTCTT 291 | GGACCGCGGCGGCCATGGCGGCTCAGATTACCCAGCGCAAGTGGGAGGCG 292 | GCCCATGAGGCGGAGCAGCAGAGAGCCTACCTAGAGGGCACGTGCGAGGA 293 | GTGGCTCCGCAGATACCTGGAGAACAGGAAGGAGACGCTGCAGCGCACTG 294 | TACCAGGGGCCACGGGGCGCCTCCCTGATCGCCTGTAGATCTCGCAGGCT 295 | GGCCTCCCACAAGGAGAGGAGACAGATGGGACCAACACTAGAATATCACC 296 | CTCCCTCTGGTCCTGAGGGAGAAGAATCCTCCTGGGTTTCCAGATCCTGT 297 | ACCAGAGAGTGACTCTGAGGTTCCACCCTGCTCTCTGACACAATTAAGGG 298 | ATAAAATCTCTGAGGCAATGACGGGAAGACGCAATTAAGGGATAAAATCT 299 | CTGAGGGAATGACGGGAAGACGATCCCTCATTTAGTGATCCCAAGTCAC 300 | >Decoy_A4 301 | CACTGGGTCCATAAATCCTGTCCTGGTTATCTCCCCATTCTCTGTAAAAA 302 | CGATTCTCTGTAAAAAGATTACATCGCCCTAAACGAGGACCTGAGCTCTT 303 | GGGCCGCGGCGGCCATGGCGGCTCAGATTACCCAGCGCAAGTGGGAGGCG 304 | GCCCATGAGGCGGAGCAGCAGAGAGCCTACCTGGAGGGCAGGTGCGTGGA 305 | GTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACTG 306 | GTACCAGGGGCCACGGGGCGCCTCCCTGATCGCCTGTAGATCTCCCAGGC 307 | TGGCCTCCCACAAGGAGAGGAGACAGATGGGACCAACACTAGAATATCAC 308 | CCTCCCTCTGGTCCTGAGGGAGAAGAATCCTCCTGGGTTTCCAGATCCTG 309 | TACCAGAGAGTGACTCTGAGGTTCCACCCCGCTCTCTGACACAATTAAGG 310 | GATAAAATCTCTGAGGCAATGACGGGAAGACGCAATTAAGGGATAAAATC 311 | TCTGAGGGAATGACGGGAAGACGATCCCTCATTTAGTGATCCCAAGTCAC 312 | >Decoy_A5 313 | CACTGGGTCCATAAATCCTGTCCTGGTTATCTCCCCATTCTCTGTAAAAA 314 | GGATTCTCTGTAAAAAGATTACATCGCCCTAAACCAGGACCTGAGCTCTT 315 | GGACTGCGGCGGCCATGGCGGCTCAGATTACCCAGCGCAAGTGGGAGGCG 316 | GCCCATGAGGCGGAGCAGCAGACAGCCTACCTGGAGGGCAGGTGCGTGGA 317 | GTGGCTCCGCAGATACCTGGAGAACAGGAAGGAGACACTGCAGCACACCT 318 | 
GTACCAGGGACCACGGGCGCCTCCCTGATCGCCTGTAATCTCCCGGGCTG 319 | GCCTCCCACAAGGATGGGAGACAGATGGGACCAACACTAGAATATCACCC 320 | TCCCTCTGGTCCTGAGGGAGAAGAATCCTCCTCAGTTTCCAGATCCTGTA 321 | CCAGAGAGTGACTCTGAGGTTCCACACTGCTCTCTGACACAATTAAGGGA 322 | TAAAATCTCTGAGGCAATGACGGGAAGACGCAATTAAGGGATAAAATCTC 323 | TGAGGGAATGACGGGAACACGATCCCTCATTTAGTGATCCCAAGTCAC 324 | -------------------------------------------------------------------------------- /src/FormatIMGT.java: -------------------------------------------------------------------------------- 1 | /* 2 | Part of Kourami HLA typer/assembler 3 | (c) 2017 by Heewook Lee, Carl Kingsford, and Carnegie Mellon University. 4 | See LICENSE for licensing. 5 | */ 6 | import java.util.*; 7 | import java.io.*; 8 | 9 | public class FormatIMGT{ 10 | 11 | public static void main(String[] args){ 12 | if(args.length != 3){ 13 | System.err.println("USAGE: java FormatIMGT "); 14 | System.exit(1); 15 | } 16 | 17 | String imgtpath = null; 18 | String imgtVer = null; 19 | String outbase = null; 20 | if(args[0].endsWith(File.separator)) 21 | imgtpath = args[0].substring(0, args[0].length() - 1); 22 | else 23 | imgtpath = args[0]; 24 | 25 | if(args[1].equals("0")){ 26 | imgtVer = getVersionNum(imgtpath + File.separator + "A_gen.txt"); 27 | if(imgtVer != null) 28 | System.err.println("IMGTver " + imgtVer); 29 | else{ 30 | System.err.println("IMGTver could not be found. 
Please consider reruning the script with user-input ver_number"); 31 | System.exit(1); 32 | } 33 | }else 34 | imgtVer = args[1]; 35 | 36 | if(args[2].endsWith(File.separator)) 37 | outbase = args[2].substring(0, args[2].length() - 1); 38 | else 39 | outbase = args[2]; 40 | 41 | 42 | //String outpath = args[0] + File.separator + "kouramiFormatted"; 43 | String outpath = outbase + File.separator + imgtVer; 44 | 45 | File outdir = null; 46 | try{ 47 | File imgtdir = new File(imgtpath); 48 | if(!imgtdir.exists() || !imgtdir.isDirectory()){ 49 | System.err.println(imgtpath + " either doesn't exist or it is not a directory."); 50 | System.exit(1); 51 | } 52 | outdir = new File(outpath); 53 | if(outdir.exists()){ 54 | if(!outdir.isDirectory()){ 55 | System.err.println(outpath + " exists but it is NOT a writable directory."); 56 | System.exit(1); 57 | } 58 | }else 59 | outdir.mkdirs(); 60 | 61 | System.err.println("#Input IMGT/HLA MSA Allignments:\t" + imgtpath); 62 | System.err.println("#Output (Kourami panel/db) :\t" + outpath); 63 | if(!args[1].equals("0")) 64 | System.err.println("#Version number (user-input) :\t" + imgtVer); 65 | else 66 | System.err.println("#Version number (detected) :\t" + imgtVer); 67 | 68 | boolean missingFiles = false; 69 | for(int i=0; i>>>>>>>> Processing\t[" + geneName + "] <<<<<<<<<<"); 100 | FormatIMGT.processGene(imgtpath, outpath, geneName); 101 | } 102 | }catch(Exception e){ 103 | e.printStackTrace(); 104 | System.exit(1); 105 | } 106 | 107 | } 108 | 109 | public static String getVersionNum(String agenfile){ 110 | BufferedReader br = null; 111 | String ver = null; 112 | try{ 113 | br = new BufferedReader(new FileReader(agenfile)); 114 | String curline = null; 115 | while((curline=br.readLine()) != null){ 116 | if(curline.startsWith("IPD-IMGT/HLA Release:") || curline.startsWith("IMGT/HLA Release:") || curline.startsWith("# version:")){ 117 | if(curline.charAt(0) == '#') 118 | ver = curline.split("HLA")[1].trim(); 119 | else 120 | ver = 
curline.split(":")[1].trim(); 121 | break; 122 | } 123 | } 124 | br.close(); 125 | }catch(IOException ioe){ 126 | ioe.printStackTrace(); 127 | } 128 | return ver; 129 | } 130 | 131 | public static void processGene(String imgtpath, String outpath, String geneName){ 132 | String genfile = imgtpath + File.separator + geneName + "_gen.txt"; 133 | String nucfile = imgtpath + File.separator + geneName + "_nuc.txt"; 134 | String genoutfile = outpath + File.separator + geneName + "_gen.txt"; 135 | String nucoutfile = outpath + File.separator + geneName + "_nuc.txt"; 136 | if(geneName.startsWith("DRB")){ 137 | nucfile = imgtpath + File.separator + "DRB_nuc.txt"; 138 | if(FormatIMGT.isExtraDRB(geneName)) 139 | genfile = imgtpath + File.separator + "DRB1_gen.txt"; 140 | } 141 | IMGTReformatter nuc = null; 142 | IMGTReformatter gen = null; 143 | if(new File(nucfile).exists()) 144 | nuc = new IMGTReformatter(nucfile, geneName); 145 | else 146 | System.err.println("[WARNING]: Missing < " + nucfile + " >. 
Proceeding without.\nHowever, it is STRONGLY recommended to run FormatIMGT\nwith both nuc and gen files for each gene."); 147 | 148 | if(new File(genfile).exists()) 149 | gen = new IMGTReformatter(genfile, geneName); 150 | else{ 151 | System.err.println("[ERROR]: Missing < " + genfile + " >.\n"); 152 | System.err.println("A gen file is required for each gene!"); 153 | System.exit(1); 154 | } 155 | 156 | String nucRefAl = null; 157 | if(nuc != null) 158 | nucRefAl = nuc.getRefAlleleName(); 159 | String genRefAl = gen.getRefAlleleName(); 160 | 161 | System.err.println("nucRefAl:\t" + nucRefAl + "\tgenRefAl:\t" + genRefAl); 162 | //if both ref alleles are same 163 | if(nuc == null || nucRefAl.equals(genRefAl)){ 164 | if(nuc != null) 165 | System.err.println("\trefGeneName on nuc and gen are same"); 166 | System.err.println("\tWrting to :\t" + genoutfile); 167 | gen.outToFile(genoutfile); 168 | if(nuc !=null){ 169 | System.err.println("\tWrting to :\t" + nucoutfile); 170 | nuc.outToFile(nucoutfile); 171 | } 172 | } 173 | //if NOT same, then genRefAl must be in nucRefAl 174 | //because nucRef is the superset. 
175 | else{ 176 | System.err.println("\trefGeneName on nuc and gen are NOT same"); 177 | 178 | if(nuc.contains(genRefAl)){ 179 | System.err.println("\tSWAPPING refGenes on nuc and gen"); 180 | nuc.swap(genRefAl); 181 | System.err.println("\tWrting to :\t" + genoutfile); 182 | gen.outToFile(genoutfile); 183 | System.err.println("\tWrting to :\t" + nucoutfile); 184 | nuc.outToFile(nucoutfile); 185 | }else if(FormatIMGT.isExtraDRB(nuc.getGeneName())){ 186 | System.err.println("\tExtra DRB genes."); 187 | gen.outToFile(genoutfile); 188 | nuc.outToFile(nucoutfile); 189 | }else{ 190 | System.err.println("Reference sequence entry [" + genRefAl + "] is " 191 | + "NOT found in nuc alignments.\n" 192 | + "Check the alignment files."); 193 | System.exit(1); 194 | } 195 | ;//swap then outToFile(); 196 | } 197 | 198 | MergeMSFs mm = new MergeMSFs(); 199 | if(FormatIMGT.isExtraDRB(nuc.getGeneName())) 200 | mm.setDRBMode(nuc.getGeneName()); 201 | 202 | if(!mm.merge(nucoutfile, genoutfile, false)){ 203 | System.err.println("ERROR in MSA merging. CANNOT proceed further. 
Exiting.."); 204 | System.exit(1); 205 | }else{ 206 | //if merging is successful, write the output as fasta 207 | mm.outToFasta(outpath + File.separator, false); 208 | } 209 | } 210 | 211 | public static boolean isExtraDRB(String genename){ 212 | if( genename.equals("DRB2") || genename.equals("DRB6") || genename.equals("DRB7") 213 | || genename.equals("DRB8") || genename.equals("DRB9") ) 214 | return true; 215 | return false; 216 | } 217 | 218 | public static final String[] list = {"A" , "B" , "C" , "DQA1" , "DQB1" , "DRB1"}; 219 | 220 | /* list of genes used in DB */ 221 | public static final String[] expList = {"A", "B", "C", "DMA", "DMB", "DOA" 222 | , "DPA1", "DPB1", "DPB2", "DQA1", "DQB1", "DRA" 223 | , "DRB1", "DRB3", "DRB4", "DRB5", "DRB2", "DRB6", "DRB7", "DRB8", "DRB9" 224 | , "E", "F", "G", "H", "HFE", "J", "K" 225 | , "L", "MICA", "MICB", "TAP1", "TAP2", "V", "Y"}; 226 | 227 | 228 | } 229 | 230 | class IMGTReformatter{ 231 | 232 | private ArrayList alleles; 233 | 234 | private StringBuffer header; 235 | private StringBuffer dnaCoordinate; 236 | private StringBuffer AACoordinate; 237 | private StringBuffer tick; 238 | 239 | private int startPos; 240 | 241 | private Allele newRef; 242 | private int newRefIndex; 243 | 244 | private String geneName; 245 | 246 | public IMGTReformatter(){ 247 | this.alleles = new ArrayList(); 248 | this.header = new StringBuffer(); 249 | this.dnaCoordinate = null; 250 | this.AACoordinate = null; 251 | this.tick = null; 252 | 253 | this.startPos = -1; 254 | 255 | this.newRef = null; 256 | this.newRefIndex = 0; 257 | 258 | this.geneName = null; 259 | } 260 | 261 | public IMGTReformatter(String msf, String gn){ 262 | this(); 263 | this.geneName = gn; 264 | this.loadAlleles(msf); 265 | } 266 | 267 | public String getGeneName(){ 268 | return this.geneName; 269 | } 270 | 271 | public int getNewRefIndex(){ 272 | return this.newRefIndex; 273 | } 274 | 275 | public boolean contains(String newRefName){ 276 | for(int i=1; i < 
this.alleles.size(); i++){ 277 | if(this.alleles.get(i).getName().equals(newRefName)){ 278 | this.newRef = this.alleles.get(i); 279 | this.newRefIndex = i; 280 | return true; 281 | } 282 | } 283 | return false; 284 | } 285 | 286 | //performs the swap of curRef to newRef 287 | public void swap(String newRefName){ 288 | Allele curRef = this.alleles.get(0); 289 | for(int i=1; i < this.alleles.size(); i++){ 290 | if(i!=this.newRefIndex) 291 | this.alleles.get(i).update(curRef, this.newRef); 292 | } 293 | Allele.swapRef(curRef, newRef); 294 | } 295 | 296 | public void swapAndDiscard(String newRefName){ 297 | this.swap(newRefName); 298 | 299 | } 300 | 301 | private boolean isAlleleLine(String[] tokens){ 302 | //tokens[0].startsWith(this.geneName + "*") && tokens[0].matches("[A-Z]+\\d{0,1}\\*\\d+(:\\d+){0,3}[A-Z]*") 303 | if(tokens.length > 0){ 304 | if(tokens[0].matches("[A-Z]+\\d{0,1}\\*\\d+(:\\d+){0,3}[A-Z]*")) 305 | return true; 306 | /*else{ 307 | System.err.print("[" + this.geneName + "]: " + tokens[0] + "\t"); 308 | //System.err.print(tokens[0].startsWith(this.geneName + "*") + "\t"); 309 | System.err.println(tokens[0].matches("[A-Z]+\\d{0,1}\\*\\d+(:\\d+){0,3}[A-Z]*")); 310 | }*/ 311 | } 312 | return false; 313 | } 314 | 315 | /* 316 | * Parses alignment format from IMGT db 317 | */ 318 | public void loadAlleles(String msf){ 319 | BufferedReader br = null; 320 | String curline = null; 321 | try{ 322 | br = new BufferedReader(new FileReader(msf)); 323 | boolean inMSF = false; //flag to split header and post-header 324 | int alleleIndex = 0; 325 | while((curline=br.readLine())!=null){ 326 | String stripped = curline.trim(); 327 | String[] tokens = stripped.split("\\s+"); 328 | 329 | //parse headers 330 | if(!inMSF){ 331 | if(stripped.startsWith("cDNA") || stripped.startsWith("gDNA")){ 332 | inMSF = true;//turn it on 333 | this.dnaCoordinate = new StringBuffer(curline); 334 | this.startPos = this.getStartIndex(curline, tokens); 335 | if(this.startPos < 0){ 336 | 
System.err.println("Check the input file:\t" + msf); 337 | System.exit(1); 338 | } 339 | 340 | }else//must be headers 341 | header.append(curline + "\n"); 342 | }else{ 343 | if(stripped.startsWith("|")){ 344 | if(this.tick == null) 345 | this.tick = new StringBuffer(curline); 346 | else 347 | this.tick.append(stripped); 348 | } 349 | // if it's cDNA gDNA tag 350 | else if(stripped.startsWith("cDNA") || stripped.startsWith("gDNA")){ 351 | this.dnaCoordinate.append(this.stripHeader(curline, this.startPos)); 352 | alleleIndex = 0; // reset alleleIndex 353 | } 354 | else if(stripped.startsWith("AA codon")){ 355 | if(this.AACoordinate == null) 356 | this.AACoordinate = new StringBuffer(curline); 357 | else 358 | this.AACoordinate.append(this.stripHeader(curline, this.startPos)); 359 | } 360 | else if(isAlleleLine(tokens)){ 361 | //System.err.println("allelesSize:\t" + this.alleles.size() + "\talleleIndex:\t" + alleleIndex ); 362 | // -------------------------------------------- 363 | // ONLY add or update if allele is the reference allele (alleleIndex == 0) 364 | // in the given msf or it is from same gene 365 | // -------------------------------------------- 366 | if(alleleIndex == 0 || tokens[0].startsWith(this.geneName + "*")){ 367 | //add 368 | if(this.alleles.size() == alleleIndex){ 369 | //System.err.println("ADDING "); 370 | this.alleles.add(new Allele(curline.substring(0 , this.startPos), curline.substring(this.startPos))); 371 | }else//update 372 | this.alleles.get(alleleIndex).appendSequence(tokens[0], curline.substring(this.startPos)); 373 | //append correct number of white spaces for coordinate and tick lines 374 | if(alleleIndex == 0) 375 | this.appendNPadding(this.alleles.get(alleleIndex).curStringLength(this.startPos)); 376 | alleleIndex++; 377 | } 378 | } 379 | } 380 | } 381 | }catch(IOException ioe){ 382 | ioe.printStackTrace(); 383 | } 384 | } 385 | 386 | //if newRefIndex == 0 --> no swapping was necessary 387 | //if newRefIndex > 0 --> oldRefIndex is 
0, and newRefIndex is given 388 | public void outToFile(String merged){ 389 | BufferedWriter bw = null; 390 | try{ 391 | bw = new BufferedWriter(new FileWriter(merged)); 392 | bw.write("Nuc+Gen merged MSA for Kourami\n"); 393 | bw.write(this.header.toString()); 394 | bw.write(this.dnaCoordinate.toString().replaceFirst("\\s++$", "") + "\n"); 395 | if(this.AACoordinate != null) 396 | bw.write(this.AACoordinate.toString().replaceFirst("\\s++$", "") + "\n"); 397 | bw.write(this.tick.toString().replaceFirst("\\s++$", "") + "\n"); 398 | 399 | //if no swapping was applied 400 | if(this.newRefIndex == 0){ 401 | for(Allele a : this.alleles) 402 | bw.write(a.toString() + "\n"); 403 | } 404 | //if there was a swapping 405 | else if(this.newRefIndex > 0){ 406 | //write newRefAllele first 407 | bw.write(this.alleles.get(this.newRefIndex).toString() + "\n"); 408 | for(int i=0; i 1) 445 | return dnaLine.indexOf(tokens[1]); 446 | else{ 447 | System.err.println("Unusual gDNA header line. Check the input file file"); 448 | return -1; 449 | } 450 | 451 | } 452 | 453 | } 454 | 455 | class Allele{ 456 | 457 | private String nameWS; 458 | private String name; 459 | private StringBuffer sequence; 460 | 461 | //n may have leading and trailing whitespaces 462 | public Allele(String n, String seq){ 463 | this.nameWS = n; 464 | this.name = n.trim(); 465 | this.sequence = new StringBuffer(seq); 466 | } 467 | 468 | public boolean appendSequence(String n, String seq){ 469 | if(this.name.equals(n)){ 470 | this.sequence.append(seq); 471 | return true; 472 | } 473 | return false; 474 | } 475 | 476 | public boolean isFromGene(String genename){ 477 | if(this.name.startsWith(genename + "*")) 478 | return true; 479 | return false; 480 | } 481 | 482 | public int curStringLength(int startPos){ 483 | return startPos + this.sequence.length(); 484 | } 485 | 486 | public String toString(int startPos){ 487 | StringBuffer bf = new StringBuffer(name); 488 | int numSpaces = startPos - name.length(); 489 | 
for(int i = 0; i < numSpaces; i++) 490 | bf.append(" "); 491 | bf.append(sequence); 492 | return bf.toString(); 493 | } 494 | 495 | public String toString(){ 496 | return this.nameWS + this.sequence; 497 | } 498 | 499 | public String getName(){ 500 | return this.name; 501 | } 502 | 503 | public StringBuffer getSequenceBuffer(){ 504 | return this.sequence; 505 | } 506 | 507 | //symbols 508 | // . --> gap 509 | // - --> same as curRef 510 | // * --> unknownbase 511 | // [aAcCgGtT] --> base 512 | public void update(Allele curref, Allele newref){ 513 | 514 | for(int i=0; i meaing same as nrc 534 | this.setCharAt(i, '-'); 535 | else//base is different from crc and also difeferent from nrc, so keep it as it is 536 | ; 537 | } 538 | } 539 | } 540 | } 541 | 542 | public static void swapRef(Allele curref, Allele newref){ 543 | for(int i=0; i(); 61 | this.hlaName2typingSequences = new HashMap>(); 62 | this.loadGraphs(hlaList, nomGFile); 63 | } 64 | 65 | //loads HLAGraphs as well as nomG typing sequences 66 | private void loadGraphs(String[] hlaList, String nomGFile){ 67 | 68 | HLA.log.appendln("Merging HLA sequences and building HLA graphs"); 69 | 70 | int i; 71 | NomG nomG = new NomG(); 72 | nomG.loadHlaGene2Groups(nomGFile); 73 | 74 | String tmpDir = null; 75 | 76 | tmpDir = HLA.MSAFILELOC; 77 | 78 | for(i=0; i groups = nomG.getGroups(hlaList[i]); 90 | if(groups != null) 91 | this.hlaName2typingSequences.put(hlaList[i], mm.formDataBase(nomG.getGroups(hlaList[i]))); 92 | else 93 | this.hlaName2typingSequences.put(hlaList[i], mm.formDataBaseAll()); 94 | this.hlaName2Graph.get(hlaList[i]).setTypingSequences(this.hlaName2typingSequences.get(hlaList[i])); 95 | if(HLA.OUTPUT_MERGED_MSA) 96 | this.outputTypingSequences(hlaList[i]); 97 | } 98 | HLA.log.appendln("Done building\t" + i + "\tgraphs."); 99 | } 100 | 101 | 102 | 103 | public void outputTypingSequences(String hgn){ 104 | ArrayList typingSeqs = this.hlaName2typingSequences.get(hgn); 105 | 106 | BufferedWriter bw = null; 
107 | try{ 108 | bw = new BufferedWriter(new FileWriter(hgn + "_typingDB.fa")); 109 | for(HLASequence h : typingSeqs){ 110 | bw.write(Bubble.stripPadding(h.toString())); 111 | //bw.write(">" + h.getGroup().getGroupString() + "\n"); 112 | //bw.write(); 113 | } 114 | bw.close(); 115 | }catch(IOException ioe){ 116 | ioe.printStackTrace(); 117 | } 118 | } 119 | 120 | //kourami bam checker added 121 | private boolean checkHeader(SAMFileHeader header){ 122 | List sequences = header.getSequenceDictionary().getSequences(); 123 | HashSet map = new HashSet(); 124 | 125 | //load kourami panel sequence names 126 | BufferedReader br; 127 | try{ 128 | br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(HLA.MSAFILELOC + File.separator + "All_FINAL_with_Decoy.fa.gz")))); 129 | String curline = ""; 130 | while((curline = br.readLine())!=null){ 131 | if(curline.charAt(0) == ('>')) 132 | map.add(curline.substring(1)); 133 | } 134 | br.close(); 135 | }catch(IOException ioe){ 136 | ioe.printStackTrace(); 137 | } 138 | 139 | //check if input bam has sequences to kourami panel 140 | for(SAMSequenceRecord ssr : sequences){ 141 | if(!map.contains(ssr.getSequenceName())) 142 | return false; 143 | } 144 | return true; 145 | } 146 | 147 | 148 | public void loadReads(File[] bams) throws IOException{ 149 | 150 | int count = 0; 151 | int numOp = 0; 152 | 153 | for(File bam : bams){ 154 | HLA.log.appendln("Loading reads from:\t" + bam.getName()); 155 | Object2IntOpenHashMap readLoadingSet = new Object2IntOpenHashMap(); 156 | readLoadingSet.defaultReturnValue(0); 157 | 158 | final SamReader reader = SamReaderFactory.makeDefault().open(bam); 159 | 160 | //Kourami bam checker added 161 | if(!checkHeader(reader.getFileHeader())){ 162 | HLA.log.appendln("Unexpected BAM :\t"+ bam.getName() 163 | +"\nThe input BAM MUST be aligned to the set of IMGT/HLA alleles in " + HLA.MSAFILELOC + "\n" 164 | + "Please use the recommended preprocessing steps explained on the github 
page:\n" 165 | + "https://github.com/Kingsford-Group/kourami"); 166 | System.err.println("Unexpected BAM :\t"+ bam.getName() 167 | +"\nThe input BAM MUST be aligned to the set of IMGT/HLA alleles in " + HLA.MSAFILELOC + "\n" 168 | + "Please use the recommended preprocessing steps explained on the github page:\n" 169 | + "https://github.com/Kingsford-Group/kourami"); 170 | HLA.log.outToFile(); 171 | System.exit(1); 172 | } 173 | 174 | for(final SAMRecord samRecord : reader){ 175 | if(count == 0){ 176 | HLA.READ_LENGTH = samRecord.getReadLength(); 177 | HLA.log.appendln("Setting HLA.READ_LEGNTH = " + HLA.READ_LENGTH); 178 | } 179 | //added checking to process reads matching to HLA-type sequences 180 | //discarding decoy hits (DQB2, DQA2) 181 | boolean qc = false; 182 | if( (samRecord.getReferenceName().indexOf("*") > -1) 183 | && !samRecord.getReadUnmappedFlag() 184 | && !samRecord.isSecondaryOrSupplementary() 185 | && !this.startWIns(samRecord)){ 186 | count++; 187 | if(samRecord.getReadPairedFlag()) 188 | numOp += processRecord(samRecord, readLoadingSet); 189 | else 190 | numOp += processRecordUnpaired(samRecord); 191 | } 192 | 193 | if(HLA.DEBUG && count%10000 == 0) 194 | HLA.log.appendln("Processed 10000 reads..."); 195 | } 196 | reader.close(); 197 | } 198 | HLA.log.appendln("Loaded a total of " + count + " mapped reads."); 199 | HLA.log.appendln("A total of " + numOp + " bases"); 200 | } 201 | 202 | public void updateErrorProb(){ 203 | HLA.log.appendln("------------ UPDATING error probabilities of each edge ---------"); 204 | Iterator itr = this.hlaName2Graph.keySet().iterator(); 205 | while(itr.hasNext()){ 206 | this.hlaName2Graph.get(itr.next()).updateEdgeWeightProb(); 207 | } 208 | HLA.log.appendln("------------ DONE UPDATING error probabilities ---------"); 209 | } 210 | 211 | //assume interleaved SAMRecord 212 | public int processRecord(SAMRecord sr, Object2IntOpenHashMap readLoadingSet){ 213 | int totalOp = 0; 214 | String hlagene = 
HLA.extractHLAGeneName(sr.getReferenceName()); 215 | HLAGraph hg = this.hlaName2Graph.get(hlagene); 216 | //hg.traverse(); 217 | if(hg != null){ 218 | if(hg.isClassI()){ 219 | boolean qc = this.qcCheck(sr); 220 | if(!qc) 221 | return 0; 222 | } 223 | int readnum = readLoadingSet.getInt(sr.getReadName()); 224 | //no such read has been read. return value of 0 means the hashSet doesn't have the read 225 | if(readnum == 0){ 226 | readnum = sr.getFirstOfPairFlag() ? HLA.readNum : 0-HLA.readNum; 227 | 228 | readLoadingSet.put(sr.getReadName(), HLA.readNum); 229 | HLA.readNum++; 230 | }else 231 | readnum = sr.getFirstOfPairFlag() ? readnum : 0-readnum; 232 | 233 | totalOp += hg.addWeight(sr, readnum);//HLA.readNum); 234 | //HLA.readNum++; 235 | }else{ 236 | ;//HLA.log.appendln("UNKNOWN HLA GENE: " + hlagene); 237 | } 238 | return totalOp; 239 | } 240 | 241 | public boolean startWIns(SAMRecord sr){ 242 | Cigar cigar = sr.getCigar(); 243 | if(cigar == null){ 244 | return true; 245 | }else{ 246 | CigarOperator op = cigar.getCigarElements().get(0).getOperator(); 247 | if(op == CigarOperator.I){ 248 | if(HLA.DEBUG) 249 | HLA.log.appendln("SKIPPING(Start with Insertion):\t" + sr.getReadName()); 250 | return true; 251 | 252 | } 253 | } 254 | return false; 255 | } 256 | 257 | public boolean qcCheck(SAMRecord sr){ 258 | Cigar cigar = sr.getCigar(); 259 | int rLen = sr.getReadLength(); 260 | int effectiveLen = 0; 261 | if(cigar==null) 262 | return false; 263 | else{ 264 | for(final CigarElement ce : cigar.getCigarElements()){ 265 | CigarOperator op = ce.getOperator(); 266 | int cigarLen = ce.getLength(); 267 | switch(op) 268 | { 269 | case M: 270 | { 271 | effectiveLen += cigarLen; 272 | break; 273 | } 274 | case I: 275 | { 276 | effectiveLen += cigarLen; 277 | break; 278 | } 279 | default: 280 | break; 281 | } 282 | } 283 | } 284 | boolean readdebug = false; 285 | if(readdebug){ 286 | HLA.log.appendln(sr.getSAMString()); 287 | HLA.log.appendln("EffectiveLen:\t" + effectiveLen); 
288 | HLA.log.appendln("ReadLen:\t" + rLen); 289 | } 290 | Integer i = sr.getIntegerAttribute("NM"); 291 | int nm = 0; 292 | if(i!=null) 293 | nm = i.intValue(); 294 | if(readdebug) 295 | HLA.log.appendln("NM=\t" + nm); 296 | if(nm < 16){ 297 | if(readdebug) 298 | HLA.log.appendln("PASSWED QC"); 299 | return true; 300 | } 301 | if(readdebug){ 302 | HLA.log.appendln("FAILED QC"); 303 | HLA.log.appendln(sr.getSAMString()); 304 | } 305 | return false; 306 | } 307 | 308 | public int processRecordUnpaired(SAMRecord sr){ 309 | int totalOp = 0; 310 | String hlagene = HLA.extractHLAGeneName(sr.getReferenceName()); 311 | HLAGraph hg = this.hlaName2Graph.get(hlagene); 312 | //hg.traverse(); 313 | if(hg != null){ 314 | if(hg.isClassI()){ 315 | boolean qc = this.qcCheck(sr); 316 | if(!qc) 317 | return 0; 318 | } 319 | totalOp += hg.addWeight(sr, HLA.readNum); 320 | HLA.readNum++; 321 | } 322 | return totalOp; 323 | } 324 | 325 | public void printWeights(String[] list){ 326 | for(String g:list) 327 | this.hlaName2Graph.get(g).traverseAndWeights(); 328 | } 329 | 330 | public void printBoundaries(String[] list){ 331 | for(String g:list) 332 | this.hlaName2Graph.get(g).getRefAllele().printBoundaries(); 333 | } 334 | 335 | public void removeUnused(String[] list){ 336 | for(String g:list) 337 | this.hlaName2Graph.get(g).removeUnused(); 338 | } 339 | 340 | public void flattenInsertionNodes(String[] list){ 341 | for(String g:list) 342 | this.hlaName2Graph.get(g).flattenInsertionNodes(); 343 | } 344 | 345 | public void printStartEndNodes(String[] list){ 346 | for(String g:list) 347 | this.hlaName2Graph.get(g).printStartEndNodeInfo(); 348 | } 349 | 350 | public void countBubbles(String[] list){ 351 | for(String g:list) 352 | this.hlaName2Graph.get(g).countBubbles(); 353 | } 354 | 355 | public void countBubblesAndMerge(String[] list, StringBuffer rb){ 356 | for(String g:list) 357 | this.hlaName2Graph.get(g).countBubblesAndMerge(rb); 358 | } 359 | 360 | public void countStems(String[] 
list){ 361 | for(String g:list) 362 | this.hlaName2Graph.get(g).countStems(); 363 | } 364 | 365 | public void removeStems(String[] list){ 366 | for(String g:list) 367 | this.hlaName2Graph.get(g).removeStems(); 368 | } 369 | 370 | public void writeResults(StringBuffer rb, BufferedWriter resultWriter){ 371 | try{ 372 | resultWriter.write(rb.toString()); 373 | resultWriter.close(); 374 | }catch(IOException ioe){ 375 | ioe.printStackTrace(); 376 | } 377 | } 378 | 379 | private static Options createHelpOption(){ 380 | Options options = new Options(); 381 | Option help = new Option("h", "help", false, "print this message"); 382 | options.addOption(help); 383 | return options; 384 | } 385 | 386 | private static Options createOption(){ 387 | Options options = new Options(); 388 | Option help = new Option("h", "help", false, "print this message"); 389 | 390 | Option buildFromMSA = Option.builder("d") 391 | .longOpt("msaDirectory") 392 | .required(true) 393 | .argName("path") 394 | .hasArg() 395 | .desc("build HLA-Graph from gen and nuc MSAs provided by IMGT/HLA DB from given directory (required)") 396 | .build(); 397 | 398 | Option outfile = Option.builder("o") 399 | .longOpt("outfilePrefix") 400 | .required(true) 401 | .hasArg() 402 | .desc("use given outfile prefix for all output files (required)") 403 | .argName("outfile") 404 | .build(); 405 | 406 | Option additionalLoci = Option.builder("a") 407 | .longOpt("additionalLoci") 408 | .required(false) 409 | .hasArg(false) 410 | .desc("type additional loci (optional)") 411 | .build(); 412 | 413 | //options.addOption(help); 414 | options.addOption(buildFromMSA); 415 | options.addOption(outfile); 416 | options.addOption(additionalLoci); 417 | 418 | return options; 419 | } 420 | 421 | private static void help(Options options){ 422 | //String R = "\u001B[30m"; 423 | HelpFormatter formatter = new HelpFormatter(); 424 | formatter.setDescPadding(0); 425 | String header = "\n" 426 | + "Program: Kourami - Graph-guided assembly of HLA 
typing exons\n" 427 | + "Version: " + HLA.VERSION + "\n" 428 | + "Contact: Heewook Lee \n\n" 429 | + "Usage: java -jar /Kourami.jar [options] ... \n\n" 430 | + " -h,--help print this message\n"; 431 | 432 | String footer = "\n"; 433 | System.err.println(header); 434 | PrintWriter tmp = new PrintWriter(System.err); 435 | formatter.printOptions(tmp, 80, options, 3, 3); 436 | tmp.println("\n"); 437 | tmp.println(" -hhy+. o o o o o o o o o o"); 438 | tmp.println(".` -syss:---.` o o o o o o o o o o o o o"); 439 | tmp.println(":+:` .:/o+++++///ommy+` o _ __ _"); 440 | tmp.println("`yhs/..:osssooooo++++dmNNNdo` o | |/ /___ _ _ _ __ __ _ _ __ ___ (_)"); 441 | tmp.println(" /syy///++++ooooooooodNMdNdmh: o | ' // _ \\| | | | '__/ _` | '_ ` _ \\| |"); 442 | tmp.println(" -do/` .://++++++++oodmmmmmmd- | . \\ (_) | |_| | | | (_| | | | | | | |"); 443 | tmp.println(" .+: `.://///+///ommmmdy- |_|\\_\\___/ \\__,_|_| \\__,_|_| |_| |_|_|"); 444 | tmp.println(" . -syo----..`` "); 445 | tmp.println(" +y+. \n\n"); 446 | 447 | tmp.flush(); 448 | tmp.close(); 449 | System.exit(1); 450 | } 451 | 452 | public static void main(String[] args) throws IOException{ 453 | 454 | if(!isVersionOrHigher()){ 455 | System.err.println("JRE of 1.8+ is required to run Kourami. 
Exiting."); 456 | System.exit(1); 457 | } 458 | 459 | CommandLineParser parser = new DefaultParser(); 460 | Options options = HLA.createOption(); 461 | Options helponlyOpts = HLA.createHelpOption(); 462 | String[] bams = null; 463 | CommandLine line = null; 464 | boolean exitRun = false; 465 | try{ 466 | CommandLine helpcheck = new DefaultParser().parse(helponlyOpts, args, true); 467 | if(helpcheck.getOptions().length > 0) 468 | HLA.help(options); 469 | else{ 470 | line = parser.parse( options, args); 471 | if(line.hasOption("h"))//help")) 472 | HLA.help(options); 473 | else{ 474 | if(line.hasOption("a")) 475 | HLA.TYPEADDITIONAL = true; 476 | 477 | HLA.OUTPREFIX = line.getOptionValue("o");//outfilePrefix"); 478 | String tmploc = line.getOptionValue("d");//msaDirectory"); 479 | HLA.MSAFILELOC = tmploc; 480 | if(tmploc.endsWith(File.separator)) 481 | HLA.MSAFILELOC = tmploc.substring(0,tmploc.length()-1); 482 | if(! new File(HLA.MSAFILELOC).exists() || ! new File(HLA.MSAFILELOC).isDirectory()){ 483 | System.err.println("Given msaDirectory: " + HLA.MSAFILELOC + "\t does NOT exist or is NOT a directory."); 484 | exitRun = true; 485 | }else if(! new File(HLA.MSAFILELOC + File.separator + "hla_nom_g.txt").exists()){ 486 | System.err.println("hla_nom_g.txt NOT FOUND in " + HLA.MSAFILELOC ); 487 | System.err.println("Please download hla_nom_g.txt from the same IMGT Release as msa files."); 488 | exitRun = true; 489 | } 490 | } 491 | bams = line.getArgs(); 492 | 493 | if(bams.length <1 || (bams.length == 1 && bams[bams.length - 1].equals("DEBUG1228"))) 494 | throw new ParseException("At least 1 bam file is required. See Usage:"); 495 | else{ 496 | if(bams.length > 1 && bams[bams.length - 1].equals("DEBUG1228")){ 497 | String[] tmpbams = new String[bams.length - 1]; 498 | for(int i=0;i.result is writable 536 | //if not exit. 
537 | BufferedWriter resultWriter = null; 538 | try{ 539 | resultWriter = new BufferedWriter(new FileWriter(HLA.OUTPREFIX + ".result")); 540 | }catch(IOException ioe){ 541 | ioe.printStackTrace(); 542 | System.err.println("\n\n>>> CANNOT open output file: " + HLA.OUTPREFIX + ".result <<<\n\n"); 543 | HLA.help(options); 544 | } 545 | 546 | 547 | HLA.log = new LogHandler(); 548 | for(int i =0; i= HLA.MIN_JRE_VERSION) 612 | return true; 613 | else{ 614 | System.err.println("You are using Java Runtime Environment (version: " + version + " )."); 615 | return false; 616 | } 617 | } 618 | 619 | public static int readNum = 1; 620 | private HashMap hlaName2Graph; 621 | private HashMap> hlaName2typingSequences; 622 | } 623 | --------------------------------------------------------------------------------