├── .gitignore ├── bioinformatics-toolkit-apps ├── LICENSE ├── Setup.hs ├── app │ ├── MergeMotifs.hs │ ├── MkIndex.hs │ └── MotifScan.hs └── bioinformatics-toolkit-apps.cabal └── bioinformatics-toolkit ├── LICENSE ├── README.md ├── Setup.hs ├── benchmarks └── bench.hs ├── bioinformatics-toolkit.cabal ├── cabal.project ├── docs └── main.tex ├── src └── Bio │ ├── ChIPSeq │ └── FragLen.hs │ ├── Data │ ├── Bam.hs │ ├── Bed.hs │ ├── Bed │ │ ├── Types.hs │ │ └── Utils.hs │ ├── BigWig.hs │ ├── Fasta.hs │ ├── Fastq.hs │ └── Types.hs │ ├── GO.hs │ ├── GO │ └── Parser.hs │ ├── HiC.hs │ ├── HiC │ ├── Visualize.hs │ └── Visualize │ │ └── Internal.hs │ ├── Motif.hs │ ├── Motif │ ├── Alignment.hs │ ├── Merge.hs │ └── Search.hs │ ├── RealWorld │ ├── BioGRID.hs │ ├── ENCODE.hs │ ├── Ensembl.hs │ ├── GDC.hs │ ├── GENCODE.hs │ ├── ID.hs │ ├── Reactome.hs │ └── Uniprot.hs │ ├── Seq.hs │ ├── Seq │ └── IO.hs │ └── Utils │ ├── BitVector.hs │ ├── Functions.hs │ ├── Misc.hs │ ├── Overlap.hs │ └── Types.hs └── tests ├── Tests ├── Bam.hs ├── Bed.hs ├── Fastq.hs ├── Motif.hs ├── Seq.hs └── Tools.hs ├── data ├── example.bam ├── example.bed.gz ├── example_copy.bam ├── example_intersect_peaks.bed.gz ├── genes.gtf.gz ├── motifs.fasta ├── motifs.meme ├── pairedend.bam ├── pairedend.bedpe ├── peaks.bed.gz ├── peaks.sorted.bed.gz ├── test.fastq.gz └── test_wrap.fastq.gz └── test.hs /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | cabal-dev 3 | *.o 4 | *.hi 5 | *.chi 6 | *.chs.h 7 | *.swp 8 | .virtualenv 9 | .hsenv 10 | .cabal-sandbox/ 11 | cabal.sandbox.config 12 | cabal.config 13 | .stack-work 14 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Author name here (c) 2017 2 | 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Author name here nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/app/MergeMotifs.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Main where 4 | 5 | import AI.Clustering.Hierarchical hiding (drawDendrogram) 6 | import Control.Monad (forM, forM_) 7 | import qualified Data.ByteString.Char8 as B 8 | import Data.Default 9 | import Data.Double.Conversion.ByteString (toShortest) 10 | import Data.List.Split (splitOn) 11 | import Data.Monoid ((<>)) 12 | {- 13 | import Diagrams.Backend.Cairo 14 | import Diagrams.Plots.Dendrogram 15 | import Diagrams.Prelude (dims2D, strutX, (|||)) 16 | -} 17 | import Options.Applicative 18 | import System.IO 19 | import Text.Printf 20 | 21 | import Bio.Data.Fasta 22 | import Bio.Motif 23 | import Bio.Motif.Alignment 24 | import Bio.Motif.Merge 25 | import Bio.Seq (toBS) 26 | import Bio.Utils.Functions 27 | 28 | data CMD = Merge { mergeInput :: FilePath 29 | , mode :: String 30 | , thres :: Double 31 | , alignOption :: AlignOption 32 | , mergeOutput :: FilePath 33 | } 34 | | Dist { inputA :: FilePath 35 | , inputB :: Maybe FilePath 36 | , alignOption :: AlignOption 37 | } 38 | | Cluster { clusterInput :: !FilePath 39 | , cutoff :: !Double 40 | , alignOption :: !AlignOption 41 | } 42 | 43 | mergeParser :: Parser CMD 44 | mergeParser = Merge 45 | <$> argument str (metavar "INPUT") 46 | <*> strOption 47 | ( long "mode" 48 | <> short 'm' 49 | <> value "iter" 50 | <> metavar "MODE" 51 | <> help "Merging algorithm, could be iter or tree, default is iter" ) 52 | <*> option auto 53 | ( long "thres" 54 | <> short 't' 55 | <> value 0.2 56 | <> metavar "THRESHOLD" 
57 | <> help "two motifs that have distance belowing threshold would be merged, default is 0.2" ) 58 | <*> alignParser 59 | <*> strOption 60 | ( long "output" 61 | <> short 'o' 62 | <> value "merged_output.meme" 63 | <> metavar "OUTPUT" ) 64 | 65 | distParser :: Parser CMD 66 | distParser = Dist 67 | <$> strOption 68 | ( short 'a' 69 | <> metavar "INPUT_A" ) 70 | <*> (optional . strOption) 71 | ( short 'b' 72 | <> metavar "INPUT_B" ) 73 | <*> alignParser 74 | 75 | clusterParser :: Parser CMD 76 | clusterParser = Cluster 77 | <$> argument str (metavar "INPUT") 78 | <*> option auto 79 | ( long "height" 80 | <> short 'h' 81 | <> value 0.2 82 | <> metavar "HEIGHT" 83 | <> help "Cut hierarchical tree at given height. Default: 0.2" ) 84 | <*> alignParser 85 | 86 | data AlignOption = AlignOption 87 | { gap :: Double 88 | , gapMode :: String 89 | , avgMode :: String 90 | } 91 | 92 | alignParser :: Parser AlignOption 93 | alignParser = AlignOption 94 | <$> option auto 95 | ( long "gap" 96 | <> short 'g' 97 | <> value 0.05 98 | <> metavar "GAP_PENALTY" 99 | <> help "Gap penalty, default: 0.05" ) 100 | <*> strOption 101 | ( long "gap_mode" 102 | <> value "exp" 103 | <> metavar "GAP_MODE" 104 | <> help "Gap penalty mode, one of linear, quadratic, cubic, and exp. default: exp." ) 105 | <*> strOption 106 | ( long "avg_mode" 107 | <> value "l1" 108 | <> metavar "AVERAGE_MODE" 109 | <> help "Averaging function, one of l1, l2, l3, max. default: l1." ) 110 | 111 | treeMerge :: Double -> String -> [Motif] -> AlignOption 112 | -> ([Motif], Dendrogram Motif) 113 | treeMerge th pre ms alignOpt = (zipWith f [0::Int ..] 
$ map merge $ tree `cutAt` th, tree) 114 | where 115 | f i (suffix, pwm) = Motif ((B.pack $ pre ++ "_" ++ show i ++ "_" ++ show (toIUPAC pwm)) 116 | `B.append` "(" 117 | `B.append` suffix 118 | `B.append` ")" ) pwm 119 | merge tr = ( B.intercalate "+" $ map _name $ flatten tr 120 | , dilute $ mergeTreeWeighted align tr) 121 | tree = buildTree align ms 122 | align = mkAlignFn alignOpt 123 | {-# INLINE treeMerge #-} 124 | 125 | getSuffix :: String -> String 126 | getSuffix = last . splitOn "." 127 | {-# INLINE getSuffix #-} 128 | 129 | mkAlignFn :: AlignOption -> AlignFn 130 | mkAlignFn (AlignOption gap gapMode avgMode) = alignmentBy jsd (pFn gap) avgFn 131 | where 132 | pFn = case gapMode of 133 | "linear" -> linPenal 134 | "quadratic" -> quadPenal 135 | "cubic" -> cubPenal 136 | "exp" -> expPenal 137 | _ -> error "Unknown gap mode" 138 | avgFn = case avgMode of 139 | "l1" -> l1 140 | "l2" -> l2 141 | "l3" -> l3 142 | "max" -> lInf 143 | _ -> error "Unknown average mode" 144 | {-# INLINE mkAlignFn #-} 145 | 146 | readMotif :: FilePath -> IO [Motif] 147 | readMotif fl = case getSuffix fl of 148 | "fasta" -> readFasta' fl 149 | _ -> readMEME fl 150 | 151 | writeMotif :: FilePath -> [Motif] -> IO () 152 | writeMotif fl motifs = case getSuffix fl of 153 | "fasta" -> writeFasta fl motifs 154 | _ -> writeMEME fl motifs def 155 | 156 | 157 | defaultMain :: CMD -> IO () 158 | defaultMain (Dist a b alignOpt) = do 159 | motifsA <- readMotif a 160 | pairs <- case b of 161 | Nothing -> return $ comb motifsA 162 | Just b' -> do 163 | motifsB <- readMotif b' 164 | return [(x,y) | x <- motifsA, y <- motifsB] 165 | forM_ pairs $ \(x,y) -> do 166 | let (d,_) = alignFn (_pwm x) $ (_pwm y) 167 | B.putStrLn $ B.intercalate "\t" [_name x, _name y, toShortest d] 168 | where 169 | alignFn = mkAlignFn alignOpt 170 | 171 | defaultMain (Merge inFl m th alignOpt outFl)= do 172 | motifs <- readMotif inFl 173 | let motifNumber = length motifs 174 | 175 | hPutStrLn stderr $ printf "Merging Mode: 
%s" m 176 | hPutStrLn stderr $ printf "Read %d motifs" motifNumber 177 | 178 | motifs' <- case m of 179 | "tree" -> do 180 | let (newMotifs, tree) = treeMerge th "" motifs alignOpt 181 | fn x = B.unpack (_name x) ++ ": " ++ B.unpack (toBS $ toIUPAC $ _pwm x) 182 | 183 | {- 184 | case svg of 185 | Just fl -> do 186 | let w = 80 187 | h = 5 * fromIntegral motifNumber 188 | renderCairo fl (dims2D (10*w) (10*h)) $ drawDendrogram w h th tree fn ||| strutX 40 189 | return newMotifs 190 | -} 191 | return newMotifs 192 | "iter" -> do 193 | let rs = iterativeMerge (mkAlignFn alignOpt) th motifs 194 | forM rs $ \(nm, pwm, ws) -> do 195 | let pwm' = dilute (pwm, ws) 196 | return $ Motif (B.intercalate "+" nm) pwm 197 | -- _ -> error "Unkown mode!" 198 | 199 | hPutStrLn stderr $ printf "Write %d motifs" (length motifs') 200 | 201 | writeMotif outFl motifs' 202 | 203 | defaultMain (Cluster inFl c alignOpt) = do 204 | motifs <- readMotif inFl 205 | let tree = buildTree align motifs 206 | align = mkAlignFn alignOpt 207 | forM_ (tree `cutAt` c) $ \t -> 208 | B.putStrLn $ B.intercalate "\t" $ map _name $ flatten t 209 | {-# INLINE defaultMain #-} 210 | 211 | comb :: [a] -> [(a,a)] 212 | comb (y:ys) = zip (repeat y) ys ++ comb ys 213 | comb _ = [] 214 | {-# INLINE comb #-} 215 | 216 | main :: IO () 217 | main = execParser opts >>= defaultMain 218 | where 219 | opts = info (helper <*> parser) 220 | ( fullDesc 221 | <> header (printf "Compare, align and merge motifs, version %s" v)) 222 | v = "0.2.0" :: String 223 | parser = subparser $ ( 224 | command "merge" (info (helper <*> mergeParser) $ 225 | fullDesc <> progDesc "Merge motifs") 226 | <> command "dist" (info (helper <*> distParser) $ 227 | fullDesc <> progDesc "Align and compute pairwise distances") 228 | <> command "cluster" (info (helper <*> clusterParser) $ 229 | fullDesc <> progDesc "Perform hierarchical clustering on motifs") 230 | ) 231 | -------------------------------------------------------------------------------- 
/bioinformatics-toolkit-apps/app/MkIndex.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Bio.Seq.IO (mkIndex) 4 | import System.Environment 5 | 6 | main :: IO () 7 | main = do 8 | (outF: inputs) <- getArgs 9 | mkIndex inputs outF 10 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/app/MotifScan.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Bio.Data.Bed 4 | import Bio.Data.Bed.Utils 5 | import Bio.Motif 6 | import Bio.Seq.IO 7 | import Conduit 8 | import Data.Default (def) 9 | import Data.Semigroup ((<>)) 10 | import Data.Version (showVersion) 11 | import Options.Applicative 12 | import Paths_bioinformatics_toolkit_apps (version) 13 | import System.IO (stdout) 14 | import Text.Printf 15 | 16 | 17 | data Options = Options 18 | { genomeFile :: FilePath 19 | , motifFile :: FilePath 20 | , input :: FilePath 21 | , p :: Double 22 | } deriving (Show, Read) 23 | 24 | parser :: Parser Options 25 | parser = Options 26 | <$> strArgument (metavar "GENOME") 27 | <*> strArgument (metavar "MOTIF_MEME") 28 | <*> strArgument (metavar "INPUT") 29 | <*> option auto 30 | ( long "p-value" 31 | <> short 'p' 32 | <> value 1e-5 33 | <> metavar "P-Value" 34 | <> help "p-value cutoff. 
(default: 1e-5)" ) 35 | 36 | defaultMain :: Options -> IO () 37 | defaultMain opts = do 38 | withGenome (genomeFile opts) $ \genome -> do 39 | motifs <- map (mkCutoffMotif def (p opts)) <$> readMEME (motifFile opts) 40 | runResourceT $ runConduit $ 41 | (streamBed (input opts) :: Source (ResourceT IO) BED3) .| 42 | scanMotif genome motifs .| sinkHandleBed stdout 43 | 44 | main :: IO () 45 | main = execParser opts >>= defaultMain 46 | where 47 | opts = info (helper <*> parser) ( fullDesc <> 48 | header (printf "bioinformatics-toolkit-apps-v%s" (showVersion version)) ) 49 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/bioinformatics-toolkit-apps.cabal: -------------------------------------------------------------------------------- 1 | name: bioinformatics-toolkit-apps 2 | version: 0.1.0 3 | -- synopsis: 4 | -- description: 5 | -- homepage: 6 | license: BSD3 7 | license-file: LICENSE 8 | author: Kai Zhang 9 | maintainer: kai@kzhang.org 10 | copyright: (c) 2017-2018 Kai Zhang 11 | category: Bioinformatics 12 | build-type: Simple 13 | extra-source-files: README.md 14 | cabal-version: >=1.10 15 | 16 | executable scan_motif 17 | hs-source-dirs: app 18 | main-is: MotifScan.hs 19 | -- ghc-options: -threaded -Wall -optl-static -optc-static -optl-pthread 20 | ghc-options: -threaded -Wall 21 | build-depends: base 22 | , bioinformatics-toolkit 23 | , conduit >=1.3.0 24 | , data-default 25 | , optparse-applicative 26 | default-language: Haskell2010 27 | 28 | executable mkindex 29 | hs-source-dirs: app 30 | main-is: MkIndex.hs 31 | -- ghc-options: -threaded -Wall -optl-static -optc-static -optl-pthread 32 | ghc-options: -threaded -Wall 33 | build-depends: base 34 | , bioinformatics-toolkit 35 | default-language: Haskell2010 36 | 37 | executable merge_motifs 38 | hs-source-dirs: app 39 | main-is: MergeMotifs.hs 40 | -- ghc-options: -threaded -Wall -optl-static -optc-static -optl-pthread 41 | ghc-options: -threaded 
-Wall 42 | build-depends: base 43 | , bioinformatics-toolkit 44 | , bytestring 45 | , conduit >=1.3.0 46 | , clustering 47 | , double-conversion 48 | , split 49 | , data-default 50 | , optparse-applicative 51 | default-language: Haskell2010 52 | 53 | source-repository head 54 | type: git 55 | location: https://github.com/kaizhang/bioinformatics-toolkit.git 56 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2021 Kai Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/README.md: -------------------------------------------------------------------------------- 1 | Bioinformatics Algorithms 2 | ========================= 3 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/benchmarks/bench.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import Criterion.Main 4 | import System.Random 5 | import Data.Conduit.Zlib (ungzip, multiple, gzip) 6 | import qualified Data.ByteString.Char8 as B 7 | import Data.Default.Class 8 | import Conduit 9 | import Control.Monad.Identity 10 | import System.IO.Unsafe 11 | import AI.Clustering.Hierarchical 12 | import Data.Either 13 | 14 | import Bio.Data.Fasta 15 | import Bio.Data.Fastq 16 | import Bio.Motif 17 | import Bio.Motif.Search 18 | import Bio.Motif.Alignment 19 | import Bio.Seq 20 | 21 | dna :: DNA Basic 22 | dna = fromRight undefined $ fromBS $ B.pack $ map f $ take 5000 $ randomRs (0, 3) (mkStdGen 2) 23 | where 24 | f :: Int -> Char 25 | f x = case x of 26 | 0 -> 'A' 27 | 1 -> 'C' 28 | 2 -> 'G' 29 | 3 -> 'T' 30 | _ -> undefined 31 | 32 | pwm :: PWM 33 | pwm = toPWM [ "0.3 0.3 0.3 0.1" 34 | , "0 0.5 0 0.5" 35 | , "0.1 0.2 0.5 0.3" 36 | , "0.1 0.1 0.1 0.7" 37 | , "0 0 0 1" 38 | , "0.25 0.25 0.25 0.25" 39 | , "0.1 0.1 0.3 0.5" 40 | , "0.25 0.25 0 0.5" 41 | , "0.1 0.1 0.7 0.1" 42 | , "0 0 0 1" 43 | ] 44 | 45 | motifs :: [Motif] 46 | motifs = unsafePerformIO $ readFasta' "data/motifs.fasta" 47 | 48 | main :: IO () 49 | main = do 50 | content <- fmap B.concat $ runResourceT $ runConduit $ 51 | sourceFile 
"tests/data/test.fastq.gz" .| multiple ungzip .| sinkList 52 | let fq = B.concat $ replicate 50 content 53 | defaultMain 54 | [ bench "motif score" $ nf (scores def pwm) dna 55 | --, bgroup "TFBS scanning" [ bench "Naive" $ nf (\x -> runIdentity $ findTFBSSlow def pwm x (0.6 * optimalScore def pwm) $$ CL.consume) dna 56 | -- , bench "look ahead" $ nf (\x -> runIdentity $ findTFBS def pwm x (0.6 * optimalScore def pwm) $$ CL.consume) dna 57 | -- ] 58 | --, bench "motif merge" $ nf (\x -> fmap show $ flatten $ buildTree x) motifs 59 | , bench "Read FASTQ Fast" $ nfAppIO (\x -> runConduit $ yield x .| parseFastqC .| sinkList) fq 60 | , bench "Read FASTQ Slow" $ nfAppIO (\x -> runConduit $ yield x .| parseFastqC' .| sinkList) fq 61 | ] 62 | 63 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/bioinformatics-toolkit.cabal: -------------------------------------------------------------------------------- 1 | name: bioinformatics-toolkit 2 | version: 0.10.0 3 | synopsis: A collection of bioinformatics tools 4 | description: A collection of bioinformatics tools 5 | license: MIT 6 | license-file: LICENSE 7 | author: Kai Zhang 8 | maintainer: kai@kzhang.org 9 | copyright: (c) 2014-2022 Kai Zhang 10 | category: Bio 11 | build-type: Simple 12 | extra-source-files: README.md 13 | cabal-version: 1.18 14 | data-files: 15 | tests/data/example.bam 16 | tests/data/example.bed.gz 17 | tests/data/pairedend.bam 18 | tests/data/pairedend.bedpe 19 | tests/data/peaks.bed.gz 20 | tests/data/peaks.sorted.bed.gz 21 | tests/data/example_intersect_peaks.bed.gz 22 | tests/data/motifs.fasta 23 | tests/data/motifs.meme 24 | tests/data/test.fastq.gz 25 | tests/data/test_wrap.fastq.gz 26 | tests/data/genes.gtf.gz 27 | 28 | library 29 | hs-source-dirs: src 30 | ghc-options: -Wall 31 | 32 | exposed-modules: 33 | Bio.ChIPSeq.FragLen 34 | Bio.Data.Bed 35 | Bio.Data.Bed.Types 36 | Bio.Data.Bed.Utils 37 | Bio.Data.Bam 38 | Bio.Data.Fasta 39 | 
Bio.Data.Fastq 40 | Bio.GO 41 | Bio.GO.Parser 42 | Bio.Motif 43 | Bio.Motif.Alignment 44 | Bio.Motif.Merge 45 | Bio.Motif.Search 46 | Bio.RealWorld.BioGRID 47 | Bio.RealWorld.ENCODE 48 | Bio.RealWorld.Ensembl 49 | Bio.RealWorld.GENCODE 50 | Bio.RealWorld.GDC 51 | Bio.RealWorld.ID 52 | Bio.RealWorld.Reactome 53 | Bio.RealWorld.Uniprot 54 | Bio.Seq 55 | Bio.Seq.IO 56 | Bio.Utils.BitVector 57 | Bio.Utils.Functions 58 | Bio.Utils.Misc 59 | Bio.Utils.Overlap 60 | Bio.Utils.Types 61 | 62 | build-depends: 63 | base >=4.11 && <5.0 64 | , aeson >= 2.0 65 | , aeson-pretty 66 | , attoparsec >= 0.13.0.0 67 | , bytestring >= 0.10 68 | , bytestring-lexing >= 0.5 69 | , case-insensitive 70 | , clustering 71 | , conduit >= 1.3.0 72 | , conduit-extra 73 | , containers >= 0.5 74 | , data-ordlist 75 | , data-default-class 76 | , double-conversion 77 | , deepseq 78 | , HsHTSLib >= 1.9.2 79 | , http-conduit >= 2.1.8 80 | , hexpat 81 | , IntervalMap >= 0.5.0.0 82 | , microlens 83 | , microlens-th 84 | , matrices >= 0.5.0 85 | , mtl >= 2.1.3.1 86 | , math-functions 87 | , parallel >= 3.2 88 | , primitive 89 | , split 90 | , statistics >= 0.13.2.1 91 | , text >= 0.11 92 | , transformers >= 0.3.0.0 93 | , unordered-containers >= 0.2 94 | , word8 95 | , vector 96 | , vector-algorithms 97 | 98 | default-language: Haskell2010 99 | 100 | benchmark bench 101 | type: exitcode-stdio-1.0 102 | main-is: benchmarks/bench.hs 103 | default-language: Haskell2010 104 | build-depends: 105 | base >=4.8 && <5.0 106 | , bioinformatics-toolkit 107 | , random 108 | , criterion 109 | , clustering 110 | , bytestring 111 | , data-default-class 112 | , conduit 113 | , conduit-extra 114 | , mtl 115 | 116 | test-suite tests 117 | type: exitcode-stdio-1.0 118 | hs-source-dirs: tests 119 | main-is: test.hs 120 | other-modules: 121 | Tests.Bed 122 | , Tests.Bam 123 | , Tests.Fastq 124 | , Tests.Motif 125 | , Tests.Seq 126 | , Tests.Tools 127 | 128 | default-language: Haskell2010 129 | build-depends: 130 | base 131 | , 
bytestring 132 | , random 133 | , vector 134 | , data-default-class 135 | , microlens 136 | , tasty 137 | , tasty-golden 138 | , tasty-hunit 139 | , bioinformatics-toolkit 140 | , conduit 141 | , conduit-extra 142 | , unordered-containers 143 | , mtl 144 | , matrices 145 | 146 | source-repository head 147 | type: git 148 | location: https://github.com/kaizhang/bioinformatics-toolkit.git 149 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/cabal.project: -------------------------------------------------------------------------------- 1 | packages: *.cabal 2 | 3 | package cryptonite 4 | flags: -use_target_attributes -------------------------------------------------------------------------------- /bioinformatics-toolkit/docs/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt]{scrartcl} 2 | 3 | \usepackage[english]{babel} 4 | \usepackage[sorting=none,backend=biber]{biblatex} 5 | \usepackage{amsmath} 6 | \usepackage{graphicx} 7 | \usepackage[noend]{algpseudocode} 8 | \usepackage{algorithm} 9 | \usepackage{hyperref} 10 | \usepackage[backend=cairo, outputdir=diagrams]{diagrams-latex} 11 | 12 | \newtheorem{dfn}{Definition} 13 | 14 | \title{Document} 15 | \author{Kai Zhang} 16 | \date{} 17 | 18 | \begin{document} 19 | 20 | \maketitle 21 | 22 | \tableofcontents 23 | \newpage 24 | 25 | 1-based coordinate system A coordinate system where the first base of a sequence 26 | is one. In this system, a region is specified by a closed interval. For example, the 27 | region between the 3rd and the 7th bases inclusive is [3, 7]. The SAM, VCF, GFF and Wiggle 28 | formats are using the 1-based coordinate system. 29 | 30 | 0-based coordinate system A coordinate system where the first base of a sequence 31 | is zero. In this coordinate system, a region is specified by a half-closed-half-open interval. 
32 | For example, the region between the 3rd and the 7th bases inclusive is [2, 7). 33 | The BAM, BCFv2, BED, and PSL formats are using the 0-based coordinate system. 34 | 35 | \printbibliography 36 | 37 | \end{document} 38 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/ChIPSeq/FragLen.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | module Bio.ChIPSeq.FragLen 4 | ( fragLength 5 | , naiveCCWithSmooth 6 | ) where 7 | 8 | import Bio.Data.Bed 9 | import Lens.Micro ((^.)) 10 | import Control.Parallel.Strategies (parMap, rpar) 11 | import qualified Data.ByteString.Char8 as B 12 | import qualified Data.HashMap.Strict as M 13 | import qualified Data.HashSet as S 14 | import Data.List (foldl', maximumBy) 15 | import Data.Ord (comparing) 16 | 17 | -- | estimate fragment length for a ChIP-seq experiment 18 | fragLength :: (Int, Int) -> [BED] -> Int 19 | fragLength (start, end) beds = fst $ maximumBy (comparing snd) $ 20 | naiveCCWithSmooth 4 beds [start, start+2 .. end] 21 | {-# INLINE fragLength #-} 22 | 23 | -- sizeDistribution :: (Int, Int) -> [BED] -> (Int, Int) 24 | 25 | fromBED :: [BED] -> [(B.ByteString, (S.HashSet Int, S.HashSet Int))] 26 | fromBED = map toSet . M.toList . M.fromListWith f . map parseLine 27 | where 28 | parseLine :: BED -> (B.ByteString, ([Int], [Int])) 29 | parseLine x = case x^.strand of 30 | Just True -> (x^.chrom, ([x^.chromStart], [])) 31 | Just False -> (x^.chrom, ([], [x^.chromEnd])) 32 | _ -> error "Unknown Strand!" 
33 | f (a,b) (a',b') = (a++a', b++b') 34 | toSet (chr, (forwd, rev)) = (chr, (S.fromList forwd, S.fromList rev)) 35 | {-# INLINE fromBED #-} 36 | 37 | -- | fast relative cross-correlation with smoothing 38 | apprxCorr :: S.HashSet Int -> S.HashSet Int -> Int -> Int -> Int 39 | apprxCorr forwd rev smooth d = S.foldl' f 0 rev 40 | where 41 | f :: Int -> Int -> Int 42 | f !acc r | any (`S.member` forwd) [r-d-smooth..r-d+smooth] = acc + 1 43 | | otherwise = acc 44 | {-# INLINE apprxCorr #-} 45 | 46 | -- | calcuate cross corrlation with different shifts 47 | naiveCCWithSmooth :: Int -> [BED] -> [Int] -> [(Int, Int)] 48 | naiveCCWithSmooth smooth input range = f . fromBED $ input 49 | where 50 | cc :: [(B.ByteString, (S.HashSet Int, S.HashSet Int))] -> Int -> Int 51 | cc xs d = foldl' (+) 0 . map ((\(forwd, rev) -> apprxCorr forwd rev smooth d).snd) $ xs 52 | f xs = zip range $ parMap rpar (cc xs) range 53 | {-# INLINE naiveCCWithSmooth #-} 54 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bam.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE LambdaCase #-} 3 | module Bio.Data.Bam 4 | ( BAM 5 | , getBamHeader 6 | , streamBam 7 | , sinkBam 8 | , bamToBedC 9 | , bamToBed 10 | , bamToFragmentC 11 | , bamToFragment 12 | , bamToFastqC 13 | , bamToFastq 14 | , sortedBamToBedPE 15 | ) where 16 | 17 | import Control.Monad (mzero) 18 | import Data.List (foldl') 19 | import Bio.Data.Bed 20 | import Bio.Data.Fastq 21 | import Bio.HTS 22 | import Conduit 23 | 24 | -- | Convert bam record to bed record. Unmapped reads will be discarded. 25 | bamToBedC :: MonadIO m => BAMHeader -> ConduitT BAM BED m () 26 | bamToBedC header = mapC (bamToBed header) .| concatC 27 | {-# INLINE bamToBedC #-} 28 | 29 | -- | Convert bam record to fastq record. 
30 | bamToFastqC :: Monad m => ConduitT BAM Fastq m () 31 | bamToFastqC = mapC bamToFastq .| concatC 32 | {-# INLINE bamToFastqC #-} 33 | 34 | -- | Convert pairedend bam to fragment. 35 | bamToFragmentC :: Monad m => BAMHeader -> ConduitT BAM BED m () 36 | bamToFragmentC header = mapC (bamToFragment header) .| concatC 37 | {-# INLINE bamToFragmentC #-} 38 | 39 | bamToFragment :: BAMHeader -> BAM -> Maybe BED 40 | bamToFragment header bam 41 | | not (isFirstSegment flg) = Nothing 42 | | otherwise = do 43 | chr1 <- refName header bam 44 | chr2 <- mateRefName header bam 45 | if chr1 == chr2 46 | then return $ BED chr1 (min start1 start2) (max end1 end2) 47 | (Just $ queryName bam) Nothing Nothing 48 | else mzero 49 | where 50 | start1 = startLoc bam 51 | end1 = endLoc bam 52 | start2 = mateStartLoc bam 53 | end2 = mateStartLoc bam + ciglen cig 54 | cig = case queryAuxData ('M', 'C') bam of 55 | Just (AuxString x) -> string2Cigar x 56 | _ -> error "No MC tag. Please run samtools fixmate on file first." 57 | ciglen (CIGAR c) = foldl' f 0 c 58 | where f acc (n,x) = if x `elem` "MDN=X" then n + acc else acc 59 | flg = flag bam 60 | {-# INLINE bamToFragment #-} 61 | 62 | -- | Convert pairedend bam file to bed. the bam file must be sorted by names, 63 | -- e.g., using "samtools sort -n". This condition is checked from Bam header. 64 | sortedBamToBedPE :: Monad m => BAMHeader -> ConduitT BAM (BED, BED) m () 65 | sortedBamToBedPE header = case getSortOrder header of 66 | Queryname -> loopBedPE .| concatC 67 | _ -> error "Bam file must be sorted by NAME." 68 | where 69 | loopBedPE = (,) <$$> await <***> await >>= \case 70 | Nothing -> return () 71 | Just (bam1, bam2) -> if queryName bam1 /= queryName bam2 72 | then error "Adjacent records have different query names. Aborted." 73 | else do 74 | yield $ (,) <$> bamToBed header bam1 <*> bamToBed header bam2 75 | loopBedPE 76 | where 77 | (<$$>) = fmap . fmap 78 | (<***>) = (<*>) . 
fmap (<*>) 79 | {-# INLINE sortedBamToBedPE #-} 80 | 81 | -- | Convert BAM to BED. 82 | bamToBed :: BAMHeader -> BAM -> Maybe BED 83 | bamToBed header bam = mkBed <$> refName header bam 84 | where 85 | mkBed chr = BED chr start end nm sc str 86 | start = startLoc bam 87 | end = endLoc bam 88 | nm = Just $ queryName bam 89 | str = Just $ not $ isRev bam 90 | sc = Just $ fromIntegral $ mapq bam 91 | {-# INLINE bamToBed #-} 92 | 93 | -- | Convert BAM to Fastq. 94 | bamToFastq :: BAM -> Maybe Fastq 95 | bamToFastq bam = Fastq (queryName bam) <$> getSeq bam <*> qualityS bam 96 | {-# INLINE bamToFastq #-} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bed.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Bio.Data.Bed 5 | ( BEDLike(..) 6 | , BEDConvert(..) 7 | , BED(..) 8 | , BED3(..) 9 | , BEDGraph(..) 10 | , bdgValue 11 | , NarrowPeak(..) 12 | , npSignal 13 | , npPvalue 14 | , npQvalue 15 | , npPeak 16 | , BEDExt(..) 
17 | , _bed 18 | , _data 19 | 20 | , BEDTree 21 | , bedToTree 22 | , sortedBedToTree 23 | , queryIntersect 24 | , intersecting 25 | , within 26 | , isIntersected 27 | , sizeOverlapped 28 | , splitBed 29 | , splitBedBySize 30 | , splitBedBySizeLeft 31 | , splitBedBySizeOverlap 32 | , sortBed 33 | , intersectBed 34 | , intersectBedWith 35 | , intersectSortedBed 36 | , intersectSortedBedWith 37 | , isOverlapped 38 | , mergeBed 39 | , mergeBedWith 40 | , mergeSortedBed 41 | , mergeSortedBedWith 42 | -- , splitOverlapped 43 | , countOverlapped 44 | 45 | -- * IO 46 | , streamBed 47 | , streamBedGzip 48 | , readBed 49 | , sinkFileBed 50 | , sinkFileBedGzip 51 | , sinkHandleBed 52 | , writeBed 53 | 54 | , compareBed 55 | ) where 56 | 57 | import Conduit 58 | import Control.Arrow ((***), first) 59 | import Lens.Micro 60 | import qualified Data.ByteString.Char8 as B 61 | import qualified Data.Foldable as F 62 | import Data.Function (on) 63 | import qualified Data.HashMap.Strict as M 64 | import qualified Data.IntervalMap.Strict as IM 65 | import Data.List (groupBy, sortBy, group) 66 | import Data.Conduit.Zlib (gzip, ungzip, multiple) 67 | import qualified Data.Vector as V 68 | import qualified Data.Vector.Algorithms.Intro as I 69 | import System.IO 70 | 71 | import Bio.Data.Bed.Types 72 | import Bio.Utils.Misc (binBySize, binBySizeLeft, 73 | binBySizeOverlap, bins) 74 | 75 | -- | Convert a set of sorted bed records to interval tree, with combining 76 | -- function for equal keys. 77 | sortedBedToTree :: (BEDLike b, F.Foldable f) 78 | => (a -> a -> a) 79 | -> Sorted (f (b, a)) 80 | -> BEDTree a 81 | sortedBedToTree f (Sorted xs) = M.fromList $ 82 | map ((head *** IM.fromAscListWith f) . 
unzip) $ groupBy ((==) `on` fst) $
    map (\(b, x) -> (b^.chrom, (IM.IntervalCO (b^.chromStart) (b^.chromEnd), x))) $
    F.toList xs
{-# INLINE sortedBedToTree #-}

-- | Convert a list of (possibly unsorted) bed records to an interval tree,
-- with a combining function for values stored under equal intervals.
bedToTree :: BEDLike b
          => (a -> a -> a)   -- ^ combining function for equal keys
          -> [(b, a)]
          -> BEDTree a
bedToTree f xs = M.fromList $ map ((head *** IM.fromAscListWith f) . unzip) $
    groupBy ((==) `on` fst) $
    map (\(b, x) -> (b^.chrom, (IM.IntervalCO (b^.chromStart) (b^.chromEnd), x))) $
    -- Sort in-place via a mutable vector before grouping by chromosome.
    V.toList $ V.create $ do
        v <- V.unsafeThaw $ V.fromList xs
        I.sortBy (compareBed `on` fst) v
        return v
{-# INLINE bedToTree #-}

-- | Return all records in the tree that intersect the query region,
-- together with their associated values.
queryIntersect :: BEDLike b => b -> BEDTree a -> [(BED3, a)]
queryIntersect x tree = map (first f) $ IM.assocs $ intersecting tree x
  where
    -- Trees built by this module contain only 'IntervalCO' keys. Fail with a
    -- diagnosable message (rather than 'undefined') should that ever change.
    f (IM.IntervalCO lo hi) = BED3 chr lo hi
    f _ = error "queryIntersect: unexpected interval type (expected IntervalCO)"
    chr = x^.chrom
{-# INLINE queryIntersect #-}

-- | Return the submap of key intervals intersecting the given interval.
intersecting :: BEDLike b => BEDTree a -> b -> IM.IntervalMap Int a
intersecting tree x = IM.intersecting (M.lookupDefault IM.empty (x^.chrom) tree) $
    IM.IntervalCO (x^.chromStart) $ x^.chromEnd
{-# INLINE intersecting #-}

-- | Return the submap of key intervals completely inside the given interval.
within :: BEDLike b => BEDTree a -> b -> IM.IntervalMap Int a
within tree x = IM.within (M.lookupDefault IM.empty (x^.chrom) tree) $
    IM.IntervalCO (x^.chromStart) $ x^.chromEnd
{-# INLINE within #-}

-- | Whether the query region intersects any record in the tree.
isIntersected :: BEDLike b => BEDTree a -> b -> Bool
isIntersected tree = not . IM.null .
intersecting tree
{-# INLINE isIntersected #-}

-- | The number of overlapping bases between two bed records. Returns 0 when
-- the records are on different chromosomes or do not overlap.
sizeOverlapped :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Int
sizeOverlapped b1 b2 | b1^.chrom /= b2^.chrom = 0
                     | overlap < 0 = 0
                     | otherwise = overlap
  where
    -- The overlap of two intervals is bounded both by each interval's own
    -- size and by the distance between the opposing endpoints.
    overlap = minimum [ b1^.chromEnd - b2^.chromStart
                      , b2^.chromEnd - b1^.chromStart
                      , b1^.chromEnd - b1^.chromStart
                      , b2^.chromEnd - b2^.chromStart ]
{-# INLINE sizeOverlapped #-}

-- | split a bed region into k consecutive subregions, discarding leftovers
splitBed :: BEDConvert b => Int -> b -> [b]
splitBed k bed = map (uncurry (asBed (bed^.chrom))) $
    bins k (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBed #-}

-- | split a bed region into consecutive fixed size subregions, discarding leftovers
splitBedBySize :: BEDConvert b => Int -> b -> [b]
splitBedBySize k bed = map (uncurry (asBed (bed^.chrom))) $
    binBySize k (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBedBySize #-}

-- | split a bed region into consecutive fixed size subregions, including leftovers
splitBedBySizeLeft :: BEDConvert b => Int -> b -> [b]
splitBedBySizeLeft k bed = map (uncurry (asBed (bed^.chrom))) $
    binBySizeLeft k (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBedBySizeLeft #-}

-- | split a bed region into consecutive fixed-size subregions that overlap
-- each other by a fixed amount
splitBedBySizeOverlap :: BEDConvert b
                      => Int -- ^ bin size
                      -> Int -- ^ overlap size
                      -> b -> [b]
splitBedBySizeOverlap k o bed = map (uncurry (asBed (bed^.chrom))) $
    binBySizeOverlap k o (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBedBySizeOverlap #-}

-- | Compare bed records using only the chromosome, start and end positions.
-- Unlike the ``compare'' from the Ord type class, this function can compare
-- different types of BED data types.
-- | Compare bed records using only the chromosome, start and end positions.
-- Unlike the ``compare'' from the Ord type class, this function can compare
-- different types of BED data types.
compareBed :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Ordering
compareBed u v = compare (key u) (key v)
  where
    key r = (r^.chrom, r^.chromStart, r^.chromEnd)
{-# INLINE compareBed #-}

-- | sort BED, first by chromosome (alphabetical order), then by chromStart, last by chromEnd
sortBed :: BEDLike b => [b] -> Sorted (V.Vector b)
sortBed records = Sorted $ V.create $ do
    mv <- V.unsafeThaw $ V.fromList records
    I.sortBy compareBed mv
    return mv
{-# INLINE sortBed #-}

-- | return records in A that are overlapped with records in B
intersectBed :: (BEDLike b1, BEDLike b2, Monad m) => [b2] -> ConduitT b1 b1 m ()
intersectBed = intersectSortedBed . sortBed
{-# INLINE intersectBed #-}

-- | return records in A that are overlapped with records in B
intersectSortedBed :: (BEDLike b1, BEDLike b2, Monad m)
                   => Sorted (V.Vector b2) -> ConduitT b1 b1 m ()
intersectSortedBed (Sorted b) = filterC hitsAny
  where
    -- Keep a record iff it intersects at least one reference interval.
    hitsAny = not . IM.null . intersecting tree
    tree = sortedBedToTree (\_ _ -> ()) $ Sorted $ V.map (\x -> (x, ())) b
{-# INLINE intersectSortedBed #-}

intersectBedWith :: (BEDLike b1, BEDLike b2, Monad m)
                 => (b1 -> [b2] -> a)
                 -> [b2]
                 -> ConduitT b1 a m ()
intersectBedWith fn = intersectSortedBedWith fn .
sortBed
{-# INLINE intersectBedWith #-}

intersectSortedBedWith :: (BEDLike b1, BEDLike b2, Monad m)
                       => (b1 -> [b2] -> a)
                       -> Sorted (V.Vector b2)
                       -> ConduitT b1 a m ()
intersectSortedBedWith fn (Sorted b) = mapC apply
  where
    -- Apply the user function to each query and all reference records it hits.
    apply query = fn query $ concat $ IM.elems $ intersecting tree query
    tree = sortedBedToTree (++) $ Sorted $ V.map (\x -> (x, [x])) b
{-# INLINE intersectSortedBedWith #-}

-- | Whether two records overlap, i.e. they share a chromosome and their
-- coordinate ranges intersect.
isOverlapped :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Bool
isOverlapped b1 b2 = b1^.chrom == b2^.chrom &&
    b1^.chromStart < b2^.chromEnd && b2^.chromStart < b1^.chromEnd

-- | Merge overlapping regions.
mergeBed :: (BEDConvert b, Monad m) => [b] -> ConduitT i b m ()
mergeBed xs = yieldMany (fromSorted $ sortBed xs) .| mergeSortedBed
{-# INLINE mergeBed #-}

-- | Merge overlapping regions according to a merging function.
mergeBedWith :: (BEDLike b, Monad m)
             => ([b] -> a) -> [b] -> ConduitT i a m ()
mergeBedWith f xs = yieldMany (fromSorted $ sortBed xs) .| mergeSortedBedWith f
{-# INLINE mergeBedWith #-}

-- | Merge overlapping regions. The input stream must be sorted first.
mergeSortedBed :: (BEDConvert b, Monad m) => ConduitT b b m ()
mergeSortedBed = mergeSortedBedWith combine
  where
    -- Collapse a cluster of overlapping records into its bounding region.
    combine regions = asBed (head regions ^. chrom)
        (minimum $ map (^.chromStart) regions)
        (maximum $ map (^.chromEnd) regions)
{-# INLINE mergeSortedBed #-}

-- | Merge overlapping regions according to a merging function. The input
-- stream must be sorted first.
mergeSortedBedWith :: (BEDLike b, Monad m)
                   => ([b] -> a) -> ConduitT b a m ()
mergeSortedBedWith mergeFn = headC >>= ( maybe mempty $ \x0 ->
    loop ((x0^.chrom, x0^.chromStart, x0^.chromEnd), [x0]) )
  where
    -- Track the bounding interval of the current cluster together with the
    -- records accumulated so far; emit the cluster when it can no longer grow.
    loop ((chr, lo, hi), acc) = headC >>= maybe (yield $ mergeFn acc) step
      where
        step cur
            | chr /= chr' || lo' > hi =
                yield (mergeFn acc) >> loop ((chr', lo', hi'), [cur])
            | lo' < lo = error "input stream is not sorted"
            | hi' > hi = loop ((chr', lo, hi'), cur : acc)
            | otherwise = loop ((chr, lo, hi), cur : acc)
          where
            chr' = cur^.chrom
            lo' = cur^.chromStart
            hi' = cur^.chromEnd
{-# INLINE mergeSortedBedWith #-}

{-
-- | Split overlapped regions into non-overlapped regions. The input must be overlapped.
-- This function is usually used with `mergeBedWith`.
splitOverlapped :: BEDLike b => ([b] -> a) -> [b] -> [(BED3, a)]
splitOverlapped fun xs = filter ((>0) . size . fst) $
    evalState (F.foldrM f [] $ init anchors) $
    (\(a,b) -> (fromEither a, M.singleton (b^.chromStart, b^.chromEnd) b)) $
    last anchors
  where
    anchors = sortBy (comparing (fromEither . fst)) $ concatMap
        ( \x -> [(Left $ x^.chromStart, x), (Right $ x^.chromEnd, x)] ) xs
    f (i, x) acc = do
        (j, s) <- get
        let bed = (asBed chr (fromEither i) j, fun $ M.elems s)
            s' = case i of
                Left _ -> M.delete (x^.chromStart, x^.chromEnd) s
                Right _ -> M.insert (x^.chromStart, x^.chromEnd) x s
        put (fromEither i, s')
        return (bed:acc)
    fromEither (Left x) = x
    fromEither (Right x) = x
    chr = head xs ^. chrom
{-# INLINE splitOverlapped #-}
-}

-- | Split overlapped regions into non-overlapped regions. The input must be overlapped.
-- This function is usually used with `mergeBedWith`.
countOverlapped :: BEDLike b => [b] -> [(BED3, Int)]
-- Robustness: 'head anchors' and 'head xs' below are partial; an empty input
-- has no overlaps, so return the empty result instead of crashing.
countOverlapped [] = []
countOverlapped xs = reverse $ (\(_,_,x) -> x) $
    F.foldl' go (p0, c0, []) $ tail anchors
  where
    (Left p0, c0) = head anchors
    -- Sweep over the sorted anchor points, maintaining the running coverage
    -- count. 'Left' marks a region start, 'Right' a (closed) region end.
    go (x, accum, result) (Left p, count)
        | x == p = (p, accum + count, result)
        | otherwise = (p, accum + count, (asBed chr x $ p, accum):result)
    go (x, accum, result) (Right p, count) =
        (p+1, accum - count, (asBed chr x $ p + 1, accum):result)
    anchors = map (\x -> (head x, length x)) $ group $ sortBy cmp $ concatMap
        (\x -> [Left $ x^.chromStart, Right $ x^.chromEnd - 1]) xs
    -- At equal coordinates, starts ('Left') sort before ends ('Right').
    cmp (Left x) (Left y) = compare x y
    cmp (Right x) (Right y) = compare x y
    cmp (Left x) (Right y) = case compare x y of
        EQ -> LT
        o -> o
    cmp (Right x) (Left y) = case compare x y of
        EQ -> GT
        o -> o
    chr = head xs ^. chrom
{-# INLINE countOverlapped #-}

-- | Stream BED records from a file.
streamBed :: (MonadResource m, BEDConvert b, MonadIO m)
          => FilePath -> ConduitT i b m ()
streamBed input = sourceFile input .| bsToBed
{-# INLINE streamBed #-}

-- | Stream BED records from a gzip-compressed file (multi-member gzip is
-- supported via 'multiple').
streamBedGzip :: (BEDConvert b, MonadResource m, MonadThrow m, PrimMonad m)
              => FilePath -> ConduitT i b m ()
streamBedGzip input = sourceFile input .| multiple ungzip .| bsToBed
{-# INLINE streamBedGzip #-}

-- | Read all BED records from a file into memory.
readBed :: BEDConvert b => FilePath -> IO [b]
readBed fl = runResourceT $ runConduit $ streamBed fl .| sinkList
{-# INLINE readBed #-}

-- | Write BED records to a file.
sinkFileBed :: (BEDConvert b, MonadResource m) => FilePath -> ConduitT b o m ()
sinkFileBed output = bedToBS .| sinkFile output
{-# INLINE sinkFileBed #-}

-- | Write BED records to a gzip-compressed file.
sinkFileBedGzip :: (BEDConvert b, MonadResource m, MonadThrow m, PrimMonad m)
                => FilePath -> ConduitT b o m ()
sinkFileBedGzip output = bedToBS .| gzip .| sinkFile output
{-# INLINE sinkFileBedGzip #-}

sinkHandleBed :: (BEDConvert b, MonadIO m) => Handle ->
ConduitT b o m () 331 | sinkHandleBed hdl = bedToBS .| sinkHandle hdl 332 | {-# INLINE sinkHandleBed #-} 333 | 334 | writeBed :: BEDConvert b => FilePath -> [b] -> IO () 335 | writeBed fl beds = runResourceT $ runConduit $ yieldMany beds .| sinkFileBed fl 336 | {-# INLINE writeBed #-} 337 | 338 | bedToBS :: (BEDConvert b, Monad m) => ConduitT b B.ByteString m () 339 | bedToBS = mapC toLine .| unlinesAsciiC 340 | {-# INLINE bedToBS #-} 341 | 342 | bsToBed :: (BEDConvert b, Monad m) => ConduitT B.ByteString b m () 343 | bsToBed = linesUnboundedAsciiC .| mapC fromLine 344 | {-# INLINE bsToBed #-} 345 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bed/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE TemplateHaskell #-} 3 | {-# LANGUAGE StrictData #-} 4 | {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-} 5 | 6 | module Bio.Data.Bed.Types 7 | ( BEDLike(..) 8 | , BEDConvert(..) 9 | , BED(..) 10 | , BED3(..) 11 | , BEDGraph(..) 12 | , bdgValue 13 | , NarrowPeak(..) 14 | , npSignal 15 | , npPvalue 16 | , npQvalue 17 | , npPeak 18 | , BroadPeak(..) 19 | , bpSignal 20 | , bpPvalue 21 | , bpQvalue 22 | , BEDExt(..) 23 | , _bed 24 | , _data 25 | , BEDTree 26 | , Sorted(..) 
27 | ) where 28 | 29 | import Lens.Micro 30 | import Lens.Micro.TH (makeLensesFor) 31 | import qualified Data.ByteString.Char8 as B 32 | import Data.ByteString.Lex.Integral (packDecimal) 33 | import Data.Double.Conversion.ByteString (toShortest) 34 | import qualified Data.HashMap.Strict as M 35 | import qualified Data.IntervalMap.Strict as IM 36 | import Data.Maybe (fromJust, fromMaybe) 37 | import GHC.Generics (Generic) 38 | import Control.DeepSeq (NFData) 39 | 40 | import Bio.Utils.Misc (readDouble, readInt) 41 | 42 | readDoubleNonnegative :: B.ByteString -> Maybe Double 43 | readDoubleNonnegative x | v < 0 = Nothing 44 | | otherwise = Just v 45 | where 46 | v = readDouble x 47 | {-# INLINE readDoubleNonnegative #-} 48 | 49 | readIntNonnegative :: B.ByteString -> Maybe Int 50 | readIntNonnegative x | v < 0 = Nothing 51 | | otherwise = Just v 52 | where 53 | v = readInt x 54 | {-# INLINE readIntNonnegative #-} 55 | 56 | -- | A class representing BED-like data, e.g., BED3, BED6 and BED12. BED format 57 | -- uses 0-based index (see documentation). 58 | class BEDLike b where 59 | -- | Field lens 60 | chrom :: Lens' b B.ByteString 61 | chromStart :: Lens' b Int 62 | chromEnd :: Lens' b Int 63 | name :: Lens' b (Maybe B.ByteString) 64 | score :: Lens' b (Maybe Int) 65 | strand :: Lens' b (Maybe Bool) 66 | 67 | -- | Return the size of a bed region. 
68 | size :: b -> Int 69 | size bed = bed^.chromEnd - bed^.chromStart 70 | {-# INLINE size #-} 71 | 72 | {-# MINIMAL chrom, chromStart, chromEnd, name, score, strand #-} 73 | 74 | class BEDLike b => BEDConvert b where 75 | -- | Construct bed record from chromsomoe, start location and end location 76 | asBed :: B.ByteString -> Int -> Int -> b 77 | 78 | -- | Convert bytestring to bed format 79 | fromLine :: B.ByteString -> b 80 | 81 | -- | Convert bed to bytestring 82 | toLine :: b -> B.ByteString 83 | 84 | convert :: BEDLike b' => b' -> b 85 | convert bed = asBed (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) 86 | {-# INLINE convert #-} 87 | 88 | {-# MINIMAL asBed, fromLine, toLine #-} 89 | 90 | -- * BED6 format 91 | 92 | -- | BED6 format, as described in http://genome.ucsc.edu/FAQ/FAQformat.html#format1.7 93 | data BED = BED 94 | { _bed_chrom :: B.ByteString 95 | , _bed_chromStart :: Int 96 | , _bed_chromEnd :: Int 97 | , _bed_name :: Maybe B.ByteString 98 | , _bed_score :: Maybe Int 99 | , _bed_strand :: Maybe Bool -- ^ True: "+", False: "-" 100 | } deriving (Eq, Show, Read, Generic, NFData) 101 | 102 | instance Ord BED where 103 | compare (BED x1 x2 x3 x4 x5 x6) (BED y1 y2 y3 y4 y5 y6) = 104 | compare (x1,x2,x3,x4,x5,x6) (y1,y2,y3,y4,y5,y6) 105 | 106 | instance BEDLike BED where 107 | chrom = lens _bed_chrom (\bed x -> bed { _bed_chrom = x }) 108 | chromStart = lens _bed_chromStart (\bed x -> bed { _bed_chromStart = x }) 109 | chromEnd = lens _bed_chromEnd (\bed x -> bed { _bed_chromEnd = x }) 110 | name = lens _bed_name (\bed x -> bed { _bed_name = x }) 111 | score = lens _bed_score (\bed x -> bed { _bed_score = x }) 112 | strand = lens _bed_strand (\bed x -> bed { _bed_strand = x }) 113 | 114 | instance BEDConvert BED where 115 | asBed chr s e = BED chr s e Nothing Nothing Nothing 116 | 117 | fromLine l = f $ take 6 $ B.split '\t' l 118 | where 119 | f [f1,f2,f3,f4,f5,f6] = BED f1 (readInt f2) (readInt f3) (getName f4) 120 | (getScore f5) (getStrand f6) 121 | f 
[f1,f2,f3,f4,f5] = BED f1 (readInt f2) (readInt f3) (getName f4)
            (getScore f5) Nothing
        f [f1,f2,f3,f4] = BED f1 (readInt f2) (readInt f3) (getName f4)
            Nothing Nothing
        f [f1,f2,f3] = asBed f1 (readInt f2) (readInt f3)
        f _ = error "Read BED fail: Not enough fields!"
        -- "." denotes a missing field in BED.
        getName x | x == "." = Nothing
                  | otherwise = Just x
        -- The guard accepts 0, so the error message must say "non-negative"
        -- (the old message "greater than 0" contradicted the check).
        getScore x | x == "." = Nothing
                   | s >= 0 = Just s
                   | otherwise = error "Read BED fail: score must be non-negative"
          where
            s = readInt x
        getStrand str | str == "-" = Just False
                      | str == "+" = Just True
                      | otherwise = Nothing
    {-# INLINE fromLine #-}

    toLine (BED f1 f2 f3 f4 f5 f6) = B.intercalate "\t"
        [ f1, fromJust $ packDecimal f2, fromJust $ packDecimal f3
        , fromMaybe "." f4, score', strand' ]
      where
        strand' | f6 == Just True = "+"
                | f6 == Just False = "-"
                | otherwise = "."
        score' = case f5 of
            Just x -> fromJust $ packDecimal x
            _ -> "."
149 | {-# INLINE toLine #-} 150 | 151 | convert bed = BED (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) (bed^.name) 152 | (bed^.score) (bed^.strand) 153 | 154 | -- | BED3 format 155 | data BED3 = BED3 156 | { _bed3_chrom :: B.ByteString 157 | , _bed3_chrom_start :: Int 158 | , _bed3_chrom_end :: Int 159 | } deriving (Eq, Show, Read, Generic, NFData) 160 | 161 | instance Ord BED3 where 162 | compare (BED3 x1 x2 x3) (BED3 y1 y2 y3) = compare (x1,x2,x3) (y1,y2,y3) 163 | 164 | instance BEDLike BED3 where 165 | chrom = lens _bed3_chrom (\bed x -> bed { _bed3_chrom = x }) 166 | chromStart = lens _bed3_chrom_start (\bed x -> bed { _bed3_chrom_start = x }) 167 | chromEnd = lens _bed3_chrom_end (\bed x -> bed { _bed3_chrom_end = x }) 168 | name = lens (const Nothing) (\bed _ -> bed) 169 | score = lens (const Nothing) (\bed _ -> bed) 170 | strand = lens (const Nothing) (\bed _ -> bed) 171 | 172 | instance BEDConvert BED3 where 173 | asBed = BED3 174 | 175 | fromLine l = case B.split '\t' l of 176 | (a:b:c:_) -> BED3 a (readInt b) $ readInt c 177 | _ -> error "Read BED fail: Incorrect number of fields" 178 | {-# INLINE fromLine #-} 179 | 180 | toLine (BED3 a b c) = B.intercalate "\t" 181 | [a, fromJust $ packDecimal b, fromJust $ packDecimal c] 182 | {-# INLINE toLine #-} 183 | 184 | -- | Bedgraph format. 
185 | data BEDGraph = BEDGraph 186 | { _bdg_chrom :: B.ByteString 187 | , _bdg_chrom_start :: Int 188 | , _bdg_chrom_end :: Int 189 | , _bdg_value :: Double 190 | } deriving (Eq, Show, Read, Generic, NFData) 191 | 192 | makeLensesFor [("_bdg_value", "bdgValue")] ''BEDGraph 193 | 194 | instance Ord BEDGraph where 195 | compare (BEDGraph x1 x2 x3 x4) (BEDGraph y1 y2 y3 y4) = 196 | compare (x1,x2,x3,x4) (y1,y2,y3,y4) 197 | 198 | instance BEDLike BEDGraph where 199 | chrom = lens _bdg_chrom (\bed x -> bed { _bdg_chrom = x }) 200 | chromStart = lens _bdg_chrom_start (\bed x -> bed { _bdg_chrom_start = x }) 201 | chromEnd = lens _bdg_chrom_end (\bed x -> bed { _bdg_chrom_end = x }) 202 | name = lens (const Nothing) (\bed _ -> bed) 203 | score = lens (const Nothing) (\bed _ -> bed) 204 | strand = lens (const Nothing) (\bed _ -> bed) 205 | 206 | instance BEDConvert BEDGraph where 207 | asBed a b c = BEDGraph a b c 0 208 | {-# INLINE asBed #-} 209 | 210 | fromLine l = case B.split '\t' l of 211 | (a:b:c:d:_) -> BEDGraph a (readInt b) (readInt c) $ readDouble d 212 | _ -> error "Read BEDGraph fail: Incorrect number of fields" 213 | {-# INLINE fromLine #-} 214 | 215 | toLine (BEDGraph a b c d) = B.intercalate "\t" 216 | [a, fromJust $ packDecimal b, fromJust $ packDecimal c, toShortest d] 217 | {-# INLINE toLine #-} 218 | 219 | 220 | -- | ENCODE narrowPeak format: https://genome.ucsc.edu/FAQ/FAQformat.html#format12 221 | data NarrowPeak = NarrowPeak 222 | { _npChrom :: B.ByteString 223 | , _npStart :: Int 224 | , _npEnd :: Int 225 | , _npName :: Maybe B.ByteString 226 | , _npScore :: Int 227 | , _npStrand :: Maybe Bool 228 | , _npSignal :: Double 229 | , _npPvalue :: Maybe Double 230 | , _npQvalue :: Maybe Double 231 | , _npPeak :: Maybe Int 232 | } deriving (Eq, Show, Read, Generic, NFData) 233 | 234 | makeLensesFor [ ("_npSignal", "npSignal") 235 | , ("_npPvalue", "npPvalue") 236 | , ("_npQvalue", "npQvalue") 237 | , ("_npPeak", "npPeak") 238 | ] ''NarrowPeak 239 | 240 | 
instance BEDLike NarrowPeak where 241 | chrom = lens _npChrom (\bed x -> bed { _npChrom = x }) 242 | chromStart = lens _npStart (\bed x -> bed { _npStart = x }) 243 | chromEnd = lens _npEnd (\bed x -> bed { _npEnd = x }) 244 | name = lens _npName (\bed x -> bed { _npName = x }) 245 | score = lens (Just . _npScore) (\bed x -> bed { _npScore = fromJust x }) 246 | strand = lens _npStrand (\bed x -> bed { _npStrand = x }) 247 | 248 | instance BEDConvert NarrowPeak where 249 | asBed chr s e = NarrowPeak chr s e Nothing 0 Nothing 0 Nothing Nothing Nothing 250 | 251 | fromLine = go . B.split '\t' 252 | where 253 | go [a,b,c] = convert $ BED3 a (readInt b) $ readInt c 254 | go (a:b:c:d:e:f:g:h:i:j:_) = NarrowPeak a (readInt b) (readInt c) 255 | (if d == "." then Nothing else Just d) 256 | (readInt e) 257 | (if f == "." then Nothing else if f == "+" then Just True else Just False) 258 | (readDouble g) 259 | (readDoubleNonnegative h) 260 | (readDoubleNonnegative i) 261 | (readIntNonnegative j) 262 | go x = error $ "Cannot parse line: " <> show x 263 | {-# INLINE fromLine #-} 264 | 265 | toLine (NarrowPeak a b c d e f g h i j) = B.intercalate "\t" 266 | [ a, fromJust $ packDecimal b, fromJust $ packDecimal c, fromMaybe "." d 267 | , fromJust $ packDecimal e 268 | , case f of 269 | Nothing -> "." 270 | Just True -> "+" 271 | _ -> "-" 272 | , toShortest g, fromMaybe "-1" $ fmap toShortest h 273 | , fromMaybe "-1" $ fmap toShortest i 274 | , fromMaybe "-1" $ fmap (fromJust . 
packDecimal) j 275 | ] 276 | {-# INLINE toLine #-} 277 | 278 | convert bed = NarrowPeak (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) (bed^.name) 279 | (fromMaybe 0 $ bed^.score) (bed^.strand) 0 Nothing Nothing Nothing 280 | 281 | -- | ENCODE broadPeak format: https://genome.ucsc.edu/FAQ/FAQformat.html#format13 282 | data BroadPeak = BroadPeak 283 | { _bpChrom :: B.ByteString 284 | , _bpStart :: Int 285 | , _bpEnd :: Int 286 | , _bpName :: Maybe B.ByteString 287 | , _bpScore :: Int 288 | , _bpStrand :: Maybe Bool 289 | , _bpSignal :: Double 290 | , _bpPvalue :: Maybe Double 291 | , _bpQvalue :: Maybe Double 292 | } deriving (Eq, Show, Read, Generic, NFData) 293 | 294 | makeLensesFor [ ("_bpSignal", "bpSignal") 295 | , ("_bpPvalue", "bpPvalue") 296 | , ("_bpQvalue", "bpQvalue") 297 | ] ''BroadPeak 298 | 299 | instance BEDLike BroadPeak where 300 | chrom = lens _bpChrom (\bed x -> bed { _bpChrom = x }) 301 | chromStart = lens _bpStart (\bed x -> bed { _bpStart = x }) 302 | chromEnd = lens _bpEnd (\bed x -> bed { _bpEnd = x }) 303 | name = lens _bpName (\bed x -> bed { _bpName = x }) 304 | score = lens (Just . _bpScore) (\bed x -> bed { _bpScore = fromJust x }) 305 | strand = lens _bpStrand (\bed x -> bed { _bpStrand = x }) 306 | 307 | instance BEDConvert BroadPeak where 308 | asBed chr s e = BroadPeak chr s e Nothing 0 Nothing 0 Nothing Nothing 309 | 310 | fromLine l = BroadPeak a (readInt b) (readInt c) 311 | (if d == "." then Nothing else Just d) 312 | (readInt e) 313 | (if f == "." then Nothing else if f == "+" then Just True else Just False) 314 | (readDouble g) 315 | (readDoubleNonnegative h) 316 | (readDoubleNonnegative i) 317 | where 318 | (a:b:c:d:e:f:g:h:i:_) = B.split '\t' l 319 | {-# INLINE fromLine #-} 320 | 321 | toLine (BroadPeak a b c d e f g h i) = B.intercalate "\t" 322 | [ a, fromJust $ packDecimal b, fromJust $ packDecimal c, fromMaybe "." d 323 | , fromJust $ packDecimal e 324 | , case f of 325 | Nothing -> "." 
326 | Just True -> "+" 327 | _ -> "-" 328 | , toShortest g, fromMaybe "-1" $ fmap toShortest h 329 | , fromMaybe "-1" $ fmap toShortest i 330 | ] 331 | {-# INLINE toLine #-} 332 | 333 | convert bed = BroadPeak (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) (bed^.name) 334 | (fromMaybe 0 $ bed^.score) (bed^.strand) 0 Nothing Nothing 335 | 336 | 337 | data BEDExt bed a = BEDExt 338 | { _ext_bed :: bed 339 | , _ext_data :: a 340 | } deriving (Eq, Show, Read, Generic, NFData) 341 | 342 | makeLensesFor [("_ext_bed", "_bed"), ("_ext_data", "_data")] ''BEDExt 343 | 344 | instance BEDLike bed => BEDLike (BEDExt bed a) where 345 | chrom = _bed . chrom 346 | chromStart = _bed . chromStart 347 | chromEnd = _bed . chromEnd 348 | name = _bed . name 349 | score = _bed . score 350 | strand = _bed . strand 351 | 352 | instance (Read a, Show a, BEDConvert bed) => BEDConvert (BEDExt bed a) where 353 | asBed _ _ _ = error "Unable to transform arbitrary record to BEDExt" 354 | 355 | fromLine l = let (a, b) = B.breakEnd (=='\t') l 356 | in BEDExt (fromLine $ B.init a) $ read $ B.unpack b 357 | {-# INLINE fromLine #-} 358 | 359 | toLine (BEDExt bed a) = toLine bed <> "\t" <> B.pack (show a) 360 | {-# INLINE toLine #-} 361 | 362 | type BEDTree a = M.HashMap B.ByteString (IM.IntervalMap Int a) 363 | 364 | -- | a type to imply that underlying data structure is sorted 365 | newtype Sorted b = Sorted {fromSorted :: b} deriving (Show, Read, Eq) -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bed/Utils.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE LambdaCase #-} 4 | {-# LANGUAGE RecordWildCards #-} 5 | {-# LANGUAGE BangPatterns #-} 6 | 7 | module Bio.Data.Bed.Utils 8 | ( fetchSeq 9 | , clipBed 10 | , CutoffMotif(..) 11 | , mkCutoffMotif 12 | , scanMotif 13 | , monoColonalize 14 | , BaseMap(..) 
, baseMap
    , queryBaseMap
    , rpkmBed
    , rpkmSortedBed
    , countTagsBed
    , countTagsBinBed
    , countTagsBinBed'
    , tagCountDistr
    , peakCluster
    ) where

import Conduit
import Lens.Micro
import Control.Monad.State.Strict
import qualified Data.ByteString.Char8 as B
import qualified Data.Foldable as F
import Data.Function (on)
import qualified Data.HashMap.Strict as M
import qualified Data.IntervalMap.Strict as IM
import Data.Maybe (fromJust, fromMaybe)
import qualified Data.Vector as V
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as GM
import qualified Data.Vector.Unboxed as U
import System.IO

import Bio.Data.Bed
import Bio.Data.Bed.Types
import Bio.Motif (Bkgd (..), Motif (..), CDF, PWM)
import qualified Bio.Motif as Motif
import qualified Bio.Motif.Search as Motif
import Bio.Seq hiding (length)
import Bio.Seq.IO
import qualified Bio.Utils.BitVector as BV

-- | Clip bed records to the given chromosome sizes. Records on unknown
-- chromosomes or lying entirely outside @[0, size)@ are dropped.
clipBed :: (BEDLike b, Monad m)
        => [(B.ByteString, Int)]   -- ^ Chromosome sizes
        -> ConduitT b b m ()
clipBed chrsize = concatMapC f
  where
    f x = case M.lookup (x^.chrom) chrsize' of
        Nothing -> Nothing
        Just n
            -- Fix: also drop records ending at or before 0; previously such a
            -- record survived clipping with chromEnd < chromStart.
            | x^.chromStart >= n || x^.chromEnd <= 0 -> Nothing
            | otherwise -> Just $ chromStart %~ max 0 $ chromEnd %~ min n $ x
    chrsize' = M.fromListWith (error "redundant chromosomes") chrsize
{-# INLINE clipBed #-}

-- | retreive sequences; reverse-complements the result for minus-strand
-- records.
fetchSeq :: BioSeq DNA a
         => Genome
         -> BED
         -> IO (Either String (DNA a))
fetchSeq g bed = do
    dna <- getSeq g (bed^.chrom, bed^.chromStart, bed^.chromEnd)
    return $ case bed^.strand of
        Just False -> rc <$> dna
        _ -> dna
{-# INLINE fetchSeq #-}

-- | Motif with predefined cutoff score.
All necessary intermediate data 77 | -- structure for motif scanning are stored. 78 | data CutoffMotif = CutoffMotif 79 | { _motif_name :: B.ByteString 80 | , _motif_pwm :: PWM 81 | , _motif_sigma :: U.Vector Double 82 | , _motif_pwm_rc :: PWM 83 | , _motif_sigma_rc :: U.Vector Double 84 | , _background :: Bkgd 85 | , _cutoff :: Double 86 | , _cdf :: CDF } 87 | 88 | mkCutoffMotif :: Bkgd 89 | -> Double -- ^ p-value 90 | -> Motif -> CutoffMotif 91 | mkCutoffMotif bg p motif = CutoffMotif (_name motif) (_pwm motif) sigma pwm' 92 | sigma' bg sc $ Motif.truncateCDF (1 - p * 10) cdf 93 | where 94 | cdf = Motif.scoreCDF bg $ _pwm motif 95 | sc = Motif.cdf' cdf $ 1 - p 96 | sigma = Motif.optimalScoresSuffix bg $ _pwm motif 97 | pwm' = Motif.rcPWM $ _pwm motif 98 | sigma' = Motif.optimalScoresSuffix bg pwm' 99 | 100 | -- | Motif score is in [0, 1000]: ( 1 / (1 + exp (-(-logP - 5))) ) * 1000. 101 | scanMotif :: (BEDLike b, MonadIO m) 102 | => Genome -> [CutoffMotif] -> ConduitT b BED m () 103 | scanMotif g motifs = awaitForever $ \bed -> do 104 | let (chr, start, end) = (bed^.chrom, bed^.chromStart, bed^.chromEnd) 105 | liftIO (getSeq g (chr, start, end)) >>= \case 106 | Left _ -> liftIO $ hPutStrLn stderr $ 107 | "Warning: no sequence for region: " ++ show (chr, start, end) 108 | Right dna -> forM_ motifs $ \CutoffMotif{..} -> do 109 | let mkBed str (i, sc) = BED chr (start + i) (start + i + n) 110 | (Just $ _motif_name) (Just $ toAffinity $ 1 - Motif.cdf _cdf sc) 111 | (Just str) 112 | n = Motif.size _motif_pwm 113 | -- Scan forward strand 114 | Motif.findTFBSWith _motif_sigma _background _motif_pwm 115 | (dna :: DNA IUPAC) _cutoff True .| mapC (mkBed True) 116 | -- Scan reverse strand 117 | Motif.findTFBSWith _motif_sigma_rc _background _motif_pwm_rc 118 | dna _cutoff True .| mapC (mkBed False) 119 | where 120 | toAffinity x' = round $ sc * 1000 121 | where 122 | sc = 1 / (1 + exp (-(x - 5))) 123 | x = negate $ logBase 10 $ max 1e-20 x' 124 | {-# INLINE scanMotif #-} 125 
-- | Process a sorted BED stream, keeping only mono-colonal tags.  Two tags
-- are clonal duplicates when they compare equal under 'compareBed' and lie on
-- the same strand; only the first of such a run is kept.  The input MUST be
-- sorted: an out-of-order record is a fatal error.
monoColonalize :: Monad m => ConduitT BED BED m ()
monoColonalize = do
    x <- headC
    case x of
        Just b -> yield b >> concatMapAccumC f b
        Nothing -> return ()
  where
    -- The accumulator is the previous record; the current record is emitted
    -- only when it differs from the previous one in position or strand.
    f cur prev = case compareBed prev cur of
        GT -> error $
            "Input is not sorted: " ++ show prev ++ " > " ++ show cur
        LT -> (cur, [cur])
        _ -> if prev^.strand == cur^.strand then (cur, []) else (cur, [cur])
{-# INLINE monoColonalize #-}

-- | One bit per genomic position per chromosome; a bit is set when at least
-- one tag starts at that position.
newtype BaseMap = BaseMap (M.HashMap B.ByteString BV.BitVector)

-- | Count the tags (starting positions) at each position in the genome.
baseMap :: PrimMonad m
        => [(B.ByteString, Int)]   -- ^ chromosomes and their sizes
        -> ConduitT BED o m BaseMap
baseMap chrs = do
    bvs <- lift $ fmap M.fromList $ forM chrs $ \(chr, n) -> do
        bv <- BV.zeros n
        return (chr, bv)

    -- Tags on chromosomes absent from the size list are silently ignored.
    -- The marked position is the 5' end: chromStart on the plus strand,
    -- chromEnd - 1 on the minus strand (strand-less tags count as plus).
    mapM_C $ \bed -> case M.lookup (bed^.chrom) bvs of
        Nothing -> return ()
        Just bv -> if fromMaybe True $ bed^.strand
            then BV.set bv $ bed^.chromStart
            else BV.set bv $ bed^.chromEnd - 1

    lift $ fmap BaseMap $ sequence $ fmap BV.unsafeFreeze bvs

-- | Look up a region in a 'BaseMap'.  Returns one Bool per base; the list is
-- reversed for minus-strand queries so it always reads 5'->3' with respect to
-- the query.  'Nothing' when the chromosome is absent from the map.
queryBaseMap :: BEDLike b => b -> BaseMap -> Maybe [Bool]
queryBaseMap bed (BaseMap bm) = case M.lookup (bed^.chrom) bm of
    Nothing -> Nothing
    Just bv ->
        let res = map (bv BV.!) [bed^.chromStart .. bed^.chromEnd - 1]
        in case bed^.strand of
            Just False -> Just $ reverse res
            _ -> Just res

-- | calculate RPKM on a set of unique regions. Regions (in bed format) would be kept in
-- memory but not tag file.
-- RPKM: Readcounts per kilobase per million reads. Only counts the starts of tags
rpkmBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
        => [b] -> ConduitT BED o m (v Double)
rpkmBed regions = do
    -- Sort the regions (remembering their original indices) so that
    -- 'rpkmSortedBed' can be used; results are scattered back afterwards.
    v <- lift $ do
        v' <- V.unsafeThaw . V.fromList . zip [0..] $ regions
        I.sortBy (compareBed `on` snd) v'
        V.unsafeFreeze v'
    let (idx, sortedRegions) = V.unzip v
        n = G.length idx
    readCount <- rpkmSortedBed $ Sorted sortedRegions

    lift $ do
        result <- GM.new n
        G.sequence_ . G.imap (\x i -> GM.unsafeWrite result i (readCount U.! x)) $ idx
        G.unsafeFreeze result
{-# INLINE rpkmBed #-}

-- | calculate RPKM on a set of regions. Regions must be sorted. The Sorted data
-- type is used to remind users to sort their data.
rpkmSortedBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
              => Sorted (V.Vector b) -> ConduitT BED o m (v Double)
rpkmSortedBed (Sorted regions) = do
    vec <- lift $ GM.replicate l 0
    n <- foldMC (count vec) (0 :: Int)
    -- FIX: guard against an empty tag stream.  With n == 0 every count is
    -- zero and the result should be all zeros rather than NaN (0/0).
    let factor = fromIntegral (max 1 n) / 1e9
    lift $ fmap (G.imap (\i x -> x / factor / (fromIntegral . size) (regions V.! i)))
         $ G.unsafeFreeze vec
  where
    count v nTags tag = do
        -- A tag is counted at its 5' end: chromStart on the plus strand,
        -- chromEnd - 1 on the minus strand.
        let p | tag^.strand == Just True = tag^.chromStart
              | tag^.strand == Just False = tag^.chromEnd - 1
              | otherwise = error "Unknown strand"
            xs = concat $ IM.elems $
                IM.containing (M.lookupDefault IM.empty (tag^.chrom) intervalMap) p
        addOne v xs
        return $ succ nTags

    intervalMap = sortedBedToTree (++) . Sorted . G.toList . G.zip regions .
        G.map return . G.enumFromN 0 $ l
    addOne v' = mapM_ $ \x -> GM.unsafeRead v' x >>= GM.unsafeWrite v' x . (+1)
    l = G.length regions
{-# INLINE rpkmSortedBed #-}

-- | Count, for each region, the number of tags whose 5' end (strand-aware)
-- falls inside the region.  Returns the per-region counts together with the
-- total number of tags seen on the stream.
countTagsBed :: (PrimMonad m, BEDLike b, G.Vector v Int)
             => [b]   -- ^ regions of interest
             -> ConduitT BED o m (v Int, Int)
countTagsBed regions = do
    vec <- lift $ GM.replicate l 0
    n <- foldMC (count vec) (0 :: Int)
    vec' <- lift $ G.unsafeFreeze vec
    return (vec', n)
  where
    count v nTags tag = do
        let p | tag^.strand == Just True = tag^.chromStart
              | tag^.strand == Just False = tag^.chromEnd - 1
              | otherwise = error "Unknown strand"
            xs = concat $ IM.elems $
                IM.containing (M.lookupDefault IM.empty (tag^.chrom) intervalMap) p
        addOne v xs
        return $ succ nTags
    intervalMap = bedToTree (++) $ zip regions $ map return [0..]
    addOne v' = mapM_ $ \x -> GM.unsafeRead v' x >>= GM.unsafeWrite v' x . (+1)
    l = length regions
{-# INLINE countTagsBed #-}

-- | divide each region into consecutive bins, and count tags for each bin and
-- return the number of all tags. Note: a tag is considered to be overlapped
-- with a region only if the starting position of the tag is in the region. For
-- the common sense overlapping, use countTagsBinBed'.
countTagsBinBed :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b)
                => Int   -- ^ bin size
                -> [b]   -- ^ regions
                -> ConduitT BED o m ([v a], Int)
countTagsBinBed k beds = do
    vs <- lift $ fmap V.fromList $ forM beds $ \bed -> do
        let start = bed^.chromStart
            -- FIX: guarantee at least one bin.  A region shorter than the
            -- bin size previously produced a zero-length vector, and the
            -- clamped index below (l - 1 == -1) became an out-of-bounds
            -- unsafe write.
            num = max 1 $ ((bed^.chromEnd) - start) `div` k
            index i = (i - start) `div` k
        v <- GM.replicate num 0
        return (v, index)
    nTags <- foldMC (f vs) 0
    readCount <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
    return (readCount, nTags)
  where
    f vs n bed = do
        let pos | bed^.strand == Just True = bed^.chromStart
                | bed^.strand == Just False = bed^.chromEnd - 1
                | otherwise = error "Unknown strand"
            overlaps = concat $ IM.elems $ IM.containing
                (M.lookupDefault IM.empty (bed^.chrom) intervalMap) pos
        forM_ overlaps $ \x -> do
            let (v, idxFn) = vs `G.unsafeIndex` x
                -- Clamp into the last bin when the tag sits in the trailing
                -- partial bin of the region.
                i = let i' = idxFn pos
                        l = GM.length v
                    in if i' >= l then l - 1 else i'
            GM.unsafeModify v (+1) i
        return $ n + 1
    intervalMap = bedToTree (++) $ zip beds $ map return [0..]
{-# INLINE countTagsBinBed #-}

-- | Same as countTagsBinBed, except that tags are treated as complete intervals
-- instead of single points.
countTagsBinBed' :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b1, BEDLike b2)
                 => Int    -- ^ bin size
                 -> [b1]   -- ^ regions
                 -> ConduitT b2 o m ([v a], Int)
countTagsBinBed' k beds = do
    initRC <- lift $ forM beds $ \bed -> do
        let start = bed^.chromStart
            end = bed^.chromEnd
            -- FIX: at least one bin, for the same reason as countTagsBinBed.
            num = max 1 $ (end - start) `div` k
            index i = (i - start) `div` k
        v <- GM.replicate num 0
        return (v, index)

    sink 0 $ V.fromList initRC
  where
    sink !nTags vs = do
        tag <- await
        case tag of
            Just bed -> do
                let chr = bed^.chrom
                    start = bed^.chromStart
                    end = bed^.chromEnd
                    overlaps = concat $ IM.elems $ IM.intersecting
                        (M.lookupDefault IM.empty chr intervalMap) $ IM.IntervalCO start end
                lift $ forM_ overlaps $ \x -> do
                    let (v, idxFn) = vs `G.unsafeIndex` x
                        -- Clamp the tag's span to the region's bin range.
                        lo = let i = idxFn start
                             in if i < 0 then 0 else i
                        hi = let i = idxFn end
                                 l = GM.length v
                             in if i >= l then l - 1 else i
                    forM_ [lo..hi] $ \i ->
                        GM.unsafeRead v i >>= GM.unsafeWrite v i . (+1)
                sink (nTags+1) vs

            _ -> do readCount <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
                    return (readCount, nTags)

    intervalMap = bedToTree (++) $ zip beds $ map return [0..]
{-# INLINE countTagsBinBed' #-}

-- | Distribution of per-position tag counts: result index i (0..99) holds the
-- number of genomic positions with exactly i tags assigned to them; counts of
-- 99 or more are clamped into the last bucket.
tagCountDistr :: PrimMonad m => G.Vector v Int => ConduitT BED o m (v Int)
tagCountDistr = loop M.empty
  where
    loop m = do
        x <- await
        case x of
            Just bed -> do
                -- Key each tag by its 5' position; minus-strand tags use a
                -- transformed coordinate (1 - chromEnd), which keeps the two
                -- strands in disjoint key ranges so they are tallied
                -- separately.
                let p | fromMaybe True (bed^.strand) = bed^.chromStart
                      | otherwise = 1 - bed^.chromEnd
                case M.lookup (bed^.chrom) m of
                    Just table -> loop $ M.insert (bed^.chrom) (M.insertWith (+) p 1 table) m
                    _ -> loop $ M.insert (bed^.chrom) (M.fromList [(p,1)]) m
            -- End of stream: histogram the accumulated per-position counts.
            _ -> lift $ do
                vec <- GM.replicate 100 0
                F.forM_ m $ \table ->
                    F.forM_ table $ \v -> do
                        let i = min 99 v
                        GM.unsafeRead vec i >>= GM.unsafeWrite vec i . (+1)
                G.unsafeFreeze vec
{-# INLINE tagCountDistr #-}

-- | cluster peaks
peakCluster :: (BEDLike b, Monad m)
            => [b]   -- ^ peaks
            -> Int   -- ^ radius
            -> Int   -- ^ cutoff
            -> ConduitT o BED m ()
peakCluster peaks r th = mergeBedWith mergeFn peaks' .| filterC g
  where
    -- Replace every peak by a fixed-width window of 2*r centred on its
    -- midpoint before merging.
    peaks' = map f peaks
    f b = let c = (b^.chromStart + b^.chromEnd) `div` 2
          in asBed (b^.chrom) (c-r) (c+r) :: BED3
    -- A merged cluster spans all member windows; its score records how many
    -- peaks it absorbed, which the cutoff filter 'g' tests against.
    mergeFn xs = asBed (head xs ^.
chrom) lo hi & score .~ Just (fromIntegral $ length xs) 346 | where 347 | lo = minimum $ map (^.chromStart) xs 348 | hi = maximum $ map (^.chromEnd) xs 349 | g b = fromJust (b^.score) >= fromIntegral th 350 | {-# INLINE peakCluster #-} 351 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/BigWig.hs: -------------------------------------------------------------------------------- 1 | module Bio.Data.BigWig 2 | ( module Data.BBI.BigWig 3 | , accumScores 4 | , normalizedScores 5 | ) where 6 | 7 | import Data.BBI.BigWig 8 | import Data.Conduit 9 | import qualified Data.Conduit.List as CL 10 | import Bio.Data.Bed 11 | 12 | -- | for each BED region, return total scores of all wig records overlapped with it 13 | accumScores :: BEDLike b => BWFile -> Conduit b IO Double 14 | accumScores bwF = CL.mapM (helper bwF) 15 | -} 16 | 17 | -- | for each BED region, return normalized scores (divided by the length) of 18 | -- all wig records overlapped with it 19 | normalizedScores :: BEDLike b => BWFile -> Conduit b IO Double 20 | normalizedScores bwF = CL.mapM $ \bed -> do 21 | x <- helper bwF bed 22 | return $ x / (fromIntegral . size) bed 23 | 24 | helper :: BEDLike b => BWFile -> b -> IO Double 25 | helper bwF bed = queryBWFile bwF (chr, start, end) $$ CL.fold f 0.0 26 | where 27 | f acc (_, s, e, v) = acc + fromIntegral (e - s) * v 28 | chr = chrom bed 29 | start = chromStart bed 30 | end = chromEnd bed 31 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Fasta.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleInstances #-} 2 | {-# LANGUAGE UndecidableInstances #-} 3 | 4 | module Bio.Data.Fasta 5 | ( FastaLike(..) 
    , fastaReader
    ) where

import Bio.Motif
import Bio.Seq
import qualified Data.ByteString.Char8 as B
import Conduit

-- | Types that can be built from a FASTA record (a header plus body lines).
class FastaLike f where
    -- | Convert a FASTA record, consisting of a record header and a record body,
    -- to a specific data type
    fromFastaRecord :: (B.ByteString, [B.ByteString]) -> f

    -- | Stream records from a FASTA file, converting each one.
    readFasta :: FilePath -> ConduitT i f (ResourceT IO) ()
    readFasta fl = fastaReader fl .| mapC fromFastaRecord

    -- | non-stream version, read whole file in memory
    readFasta' :: FilePath -> IO [f]
    readFasta' fl = runResourceT $ runConduit $ readFasta fl .| sinkList
    {-# MINIMAL fromFastaRecord #-}

-- Biological sequences: the body lines are concatenated and decoded; a decode
-- failure is fatal.
instance BioSeq s a => FastaLike (s a) where
    fromFastaRecord (_, xs) = case fromBS (B.concat xs) of
        Left err -> error err
        Right x -> x
    {-# INLINE fromFastaRecord #-}

-- Motifs: the header is the motif name, the body is fed to 'toPWM'.
instance FastaLike Motif where
    fromFastaRecord (name, mat) = Motif name (toPWM mat)
    {-# INLINE fromFastaRecord #-}

-- | Stream (header, body-lines) pairs from a FASTA file.  The leading '>' is
-- stripped from headers; empty lines are skipped.  Lines appearing before the
-- first '>' would be emitted as a malformed record -- assumes well-formed
-- input starting with a header line.
fastaReader :: FilePath
            -> ConduitT i (B.ByteString, [B.ByteString]) (ResourceT IO) ()
fastaReader fl = sourceFile fl .| linesUnboundedAsciiC .| loop []
  where
    -- acc holds the current record's lines in reverse (header last).
    loop acc = do
        x <- await
        case x of
            Just l -> case () of
               _ | B.null l -> loop acc  -- empty line, go to next line
                 | B.head l == '>' -> output (reverse acc) >> loop [B.tail l]
                 | otherwise -> loop (l:acc)
            Nothing -> output $ reverse acc
    output (x:xs) = yield (x, xs)
    output _ = return ()
{-# INLINE fastaReader #-}
--------------------------------------------------------------------------------
/bioinformatics-toolkit/src/Bio/Data/Fastq.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
module Bio.Data.Fastq
    ( Fastq(..)
6 | , streamFastqGzip 7 | , streamFastq 8 | , sinkFastqGzip 9 | , sinkFastq 10 | , parseFastqC 11 | , parseFastqC' 12 | , fastqToByteString 13 | , qualitySummary 14 | , trimPolyA 15 | ) where 16 | 17 | import Conduit 18 | import Data.Conduit.Zlib (ungzip, multiple, gzip) 19 | import qualified Data.ByteString.Char8 as B 20 | import qualified Data.ByteString as BS 21 | import qualified Data.Attoparsec.ByteString as A 22 | import Data.Attoparsec.ByteString.Char8 23 | import Data.Conduit.Attoparsec 24 | import GHC.Generics (Generic) 25 | import Control.DeepSeq 26 | 27 | -- | A FASTQ file normally uses four lines per sequence. 28 | -- 29 | -- * Line 1 begins with a '@' character and is followed by a sequence 30 | -- identifier and an optional description (like a FASTA title line). 31 | -- 32 | -- * Line 2 is the raw sequence letters. 33 | -- 34 | -- * Line 3 begins with a '+' character and is optionally followed by the 35 | -- same sequence identifier (and any description) again. 36 | -- 37 | -- * Line 4 encodes the quality values for the sequence in Line 2, and must 38 | -- contain the same number of symbols as letters in the sequence. 39 | data Fastq = Fastq 40 | { fastqSeqId :: B.ByteString 41 | , fastqSeq :: B.ByteString 42 | , fastqSeqQual :: B.ByteString 43 | } deriving (Show, Eq, Generic, NFData) 44 | 45 | -- | Read gzipped fastq file. 
46 | streamFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m) 47 | => FilePath -> ConduitT i Fastq m () 48 | streamFastqGzip fl = sourceFileBS fl .| multiple ungzip .| parseFastqC 49 | 50 | streamFastq :: (MonadResource m, MonadThrow m) 51 | => FilePath -> ConduitT i Fastq m () 52 | streamFastq fl = sourceFileBS fl .| parseFastqC 53 | 54 | sinkFastq :: (MonadResource m, MonadThrow m) 55 | => FilePath -> ConduitT Fastq o m () 56 | sinkFastq fl = mapC fastqToByteString .| unlinesAsciiC .| sinkFileBS fl 57 | 58 | sinkFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m) 59 | => FilePath -> ConduitT Fastq o m () 60 | sinkFastqGzip fl = mapC fastqToByteString .| unlinesAsciiC .| gzip .| sinkFileBS fl 61 | 62 | data FQBuilder = Init FQBuilder 63 | | FQ1 B.ByteString FQBuilder 64 | | FQ2 B.ByteString FQBuilder 65 | | FQ3 B.ByteString FQBuilder 66 | | Complete 67 | 68 | fqBuilder :: FQBuilder -> Fastq 69 | fqBuilder = go ([], [], []) 70 | where 71 | go acc (Init bldr) = go acc bldr 72 | go (f1,f2,f3) (FQ1 x bldr) = go (x:f1, f2, f3) bldr 73 | go (f1,f2,f3) (FQ2 x bldr) = go (f1, x:f2, f3) bldr 74 | go (f1,f2,f3) (FQ3 x bldr) = go (f1, f2, x:f3) bldr 75 | go (f1,f2,f3) Complete = Fastq (B.concat $ reverse f1) 76 | (B.concat $ reverse f2) (B.concat $ reverse f3) 77 | {-# INLINE fqBuilder #-} 78 | 79 | parseFastqC :: Monad m => ConduitT B.ByteString Fastq m () 80 | parseFastqC = await >>= maybe (error "Empty input") ( \x -> do 81 | if B.head x == '@' 82 | then loop Init 'a' $ B.tail x 83 | else error "Record does not start with \'@\'" ) 84 | where 85 | tryRead1 input | B.null input = await >>= maybe (error "Unexpected EOF") return 86 | | otherwise = return input 87 | loop acc st input = case st of 88 | 'a' -> do 89 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 90 | if B.null rest 91 | then loop (acc . FQ1 x) 'a' rest 92 | else loop (acc . 
FQ1 x) 'b' $ B.tail rest 93 | 'b' -> do 94 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 95 | if B.null rest 96 | then loop (acc . FQ2 x) 'b' rest 97 | else loop (acc . FQ2 x) 'B' $ B.tail rest 98 | 'B' -> do 99 | input' <- tryRead1 input 100 | if B.head input' == '+' 101 | then loop acc 'c' $ B.tail input' 102 | else do 103 | let (x, rest) = B.break (=='\n') input' 104 | if B.null rest 105 | then loop (acc . FQ2 x) 'b' rest 106 | else loop (acc . FQ2 x) 'B' $ B.tail rest 107 | 'c' -> do 108 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 109 | if B.null rest 110 | then loop acc 'c' rest 111 | else loop acc 'd' $ B.tail rest 112 | 'd' -> do 113 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 114 | if B.null rest 115 | then loop (acc . FQ3 x) 'd' rest 116 | else loop (acc . FQ3 x) 'D' $ B.tail rest 117 | 'D' -> if B.null input 118 | then await >>= \case 119 | Nothing -> yield $ fqBuilder $ acc Complete 120 | Just input' -> if B.head input' == '@' 121 | then do 122 | yield $ fqBuilder $ acc Complete 123 | loop Init 'a' $ B.tail input' 124 | else loop acc 'd' input' 125 | else if B.head input == '@' 126 | then do 127 | yield $ fqBuilder $ acc Complete 128 | loop Init 'a' $ B.tail input 129 | else loop acc 'd' input 130 | {-# INLINE parseFastqC #-} 131 | 132 | parseFastqC' :: MonadThrow m => ConduitT B.ByteString Fastq m () 133 | parseFastqC' = conduitParser fastqParser .| mapC snd 134 | where 135 | fastqParser = do 136 | _ <- skipWhile (/='@') >> char8 '@' 137 | ident <- A.takeTill isEndOfLine 138 | endOfLine 139 | sequ <- BS.filter (not . isEndOfLine) <$> takeTill (=='+') 140 | char8 '+' >> A.skipWhile (not . isEndOfLine) >> endOfLine 141 | score <- BS.filter (not . 
isEndOfLine) <$> 142 | A.scan 0 (f (B.length sequ)) 143 | skipWhile (/='@') 144 | return $ Fastq ident sequ score 145 | where 146 | f n i x | i >= n = Nothing 147 | | isEndOfLine x = Just i 148 | | otherwise = Just $ i + 1 149 | {-# INLINE parseFastqC' #-} 150 | 151 | fastqToByteString :: Fastq -> B.ByteString 152 | fastqToByteString (Fastq a b c) = B.concat ["@", a, "\n", b, "\n+\n", c] 153 | {-# INLINE fastqToByteString #-} 154 | 155 | -- | Get the mean and variance of quality scores at every position. 156 | qualitySummary :: Monad m => ConduitT Fastq o m [(Double, Double)] 157 | qualitySummary = mapC (map fromIntegral . decodeQualSc) .| meanVarianceC 158 | 159 | meanVarianceC :: Monad m => ConduitT [Double] o m [(Double, Double)] 160 | meanVarianceC = peekC >>= \case 161 | Nothing -> error "Empty input" 162 | Just x -> fst <$> foldlC f (replicate (length x) (0,0), 0 :: Int) 163 | where 164 | f (acc, n) xs = let acc' = zipWith g acc xs in (acc', n') 165 | where 166 | n' = n + 1 167 | g (m, s) x = (m', s') 168 | where 169 | m' = m + d / fromIntegral n' 170 | s' = s + d * (x - m') 171 | d = x - m 172 | {-# INLINE meanVarianceC #-} 173 | 174 | decodeQualSc :: Fastq -> [Int] 175 | decodeQualSc = map (fromIntegral . (\x -> x - 33)) . 
BS.unpack .fastqSeqQual 176 | {-# INLINE decodeQualSc #-} 177 | 178 | pError :: Int -> Double 179 | pError x = 10 ** (negate (fromIntegral x) / 10) 180 | {-# INLINE pError #-} 181 | 182 | {- 183 | mkFastqRecord l1 l2 l3 l4 = Fastq (parseLine1 l1) (parseLine2 l2) 184 | (parseLine3 l3) (parseLine4 l4) 185 | where 186 | parseLine1 x | B.head x == '@' = B.tail x 187 | | otherwise = error $ "Parse line 1 failed: " ++ B.unpack x 188 | parseLine2 x | B.all f x = x 189 | | otherwise = error $ "Parse line 2 failed: " ++ B.unpack x 190 | where 191 | f 'C' = True 192 | f 'G' = True 193 | f 'T' = True 194 | f 'A' = True 195 | f 'N' = True 196 | f _ = False 197 | parseLine3 x | B.head x == '+' = B.tail x 198 | | otherwise = error $ "Parse line 3 failed: " ++ B.unpack x 199 | parseLine4 x | BS.all f x = x 200 | | otherwise = error $ "Parse line 4 failed: " ++ B.unpack x 201 | where 202 | f b = let b' = fromIntegral b :: Int 203 | in b' >= 33 && b' <= 126 204 | -} 205 | 206 | -- | Remove trailing 'A' 207 | trimPolyA :: Int -> Fastq -> Fastq 208 | trimPolyA n f@(Fastq a b c) 209 | | B.length trailing >= n = Fastq a b' $ B.take (B.length b') c 210 | | otherwise = f 211 | where 212 | (b', trailing) = B.spanEnd (=='A') b -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TemplateHaskell #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FunctionalDependencies #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | 8 | module Bio.Data.Types where 9 | ( FileFormat(..) 10 | , File(..) 11 | , emptyFile 12 | , location 13 | , replication 14 | , format 15 | , keywords 16 | , Experiment(..) 
17 | , eid 18 | , control 19 | , celltype 20 | , target 21 | , files 22 | , info 23 | ) where 24 | 25 | import Data.Aeson 26 | import Data.Aeson.TH (deriveJSON, defaultOptions) 27 | import Shelly hiding (FilePath) 28 | import qualified Data.ByteString as B 29 | import qualified Data.ByteString.Char8 as BC 30 | import qualified Data.Text as T 31 | import Control.Lens (makeFields, (^.), (.~)) 32 | import qualified Data.HashMap.Strict as M 33 | import Crypto.Hash.MD5 (hash) 34 | import Numeric (showHex) 35 | 36 | data FileFormat = BamFile 37 | | BaiFile 38 | | BedFile 39 | | BedGZip 40 | | FastqFile 41 | | FastqGZip 42 | | BedgraphFile 43 | | BigWigFile 44 | | NarrowPeakFile 45 | | BroadPeakFile 46 | | Other 47 | deriving (Show, Read, Eq) 48 | 49 | data File = File 50 | { fileLocation :: !FilePath 51 | , fileReplication :: !Int 52 | , fileFormat :: !FileFormat 53 | , fileKeywords :: ![String] 54 | , fileInfo :: !(M.HashMap String String) 55 | } deriving (Show, Read, Eq) 56 | 57 | makeFields ''File 58 | 59 | emptyFile :: File 60 | emptyFile = File 61 | { fileLocation = "" 62 | , fileReplication = 0 63 | , fileFormat = Other 64 | , fileKeywords = [] 65 | , fileInfo = M.empty 66 | } 67 | 68 | data Experiment = Experiment 69 | { experimentEid :: !String 70 | , experimentCelltype :: !String 71 | , experimentTarget :: !String 72 | , experimentFiles :: ![File] 73 | , experimentInfo :: !(M.HashMap String String) 74 | , experimentControl :: !(Maybe String) 75 | } deriving (Show, Read, Eq) 76 | 77 | makeFields ''Experiment 78 | 79 | deriveJSON defaultOptions ''Format 80 | 81 | instance FromJSON File where 82 | parseJSON (Object obj) = do 83 | path <- obj .: "path" 84 | File <$> return path <*> 85 | obj .:? "rep" .!= 1 <*> 86 | obj .:? "format" .!= getFormat path <*> 87 | return [] <*> 88 | return M.empty 89 | 90 | instance FromJSON Experiment where 91 | parseJSON (Object obj) = do 92 | fls <- obj .: "files" 93 | let eid' = concat . 
map (flip showHex "") $ B.unpack $ hash $ BC.pack $ 94 | unlines $ map (^.location) fls 95 | Experiment <$> obj .:? "id" .!= eid' <*> 96 | obj .:? "celltype" .!= "" <*> 97 | obj .: "target" <*> 98 | return fls <*> 99 | return M.empty <*> 100 | obj .:? "control" 101 | 102 | getFormat :: FilePath -> Format 103 | getFormat fl = case suf of 104 | "bam" -> Bam 105 | "bed" -> Bed 106 | "gz" -> case (snd $ T.breakOnEnd "." $ T.init pre) of 107 | "bed" -> BedGZip 108 | _ -> error "Unknown file format" 109 | _ -> error "Unknown file format" 110 | where 111 | (pre, suf) = T.breakOnEnd "." $ T.toLower $ T.pack fl 112 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/GO.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Bio.GO 3 | ( GO(..) 4 | , GOId 5 | , GOMap 6 | , getGOLevel 7 | ) where 8 | 9 | import qualified Data.HashMap.Strict as M 10 | import Data.Maybe 11 | import qualified Data.Text as T 12 | 13 | data GO = GO 14 | { _oboId :: !GOId 15 | , _label :: !T.Text 16 | , _subProcessOf :: ![GOId] 17 | , _oboNS :: !T.Text 18 | } deriving (Show, Read) 19 | 20 | type GOId = Int 21 | 22 | type GOMap = M.HashMap GOId GO 23 | 24 | -- | The top level is 0. 25 | getGOLevel :: GOId -> GOMap -> Int 26 | getGOLevel gid gm = loop 0 [gid] 27 | where 28 | loop l ids | null parents = l 29 | | otherwise = loop (l+1) parents 30 | where 31 | parents = concatMap _subProcessOf $ flip mapMaybe ids $ \i -> M.lookup i gm 32 | 33 | {- 34 | getParentById :: GOId -> GOMap -> Maybe GO 35 | getParentById gid goMap = M.lookup gid goMap >>= _subProcessOf 36 | >>= (`M.lookup` goMap) 37 | {-# INLINE getParentById #-} 38 | 39 | -- | Add a GO term to the count table. Term counts will propogate from child to 40 | -- its parents. This function works for cyclical graph as well. 
41 | addTerm :: GO -> GOMap -> TermCount -> TermCount 42 | addTerm g m t = loop S.empty g t 43 | where 44 | loop visited go table 45 | | _oboId go `S.member` visited = table 46 | | otherwise = case _subProcessOf go of 47 | Nothing -> table' 48 | Just gid -> loop (S.insert (_oboId go) visited) 49 | (M.lookupDefault undefined gid m) table' 50 | where 51 | table' = M.insertWith (+) (_oboId go) 1 table 52 | 53 | enrichment :: (TermCount, Int) -- ^ Background frequency and the total number 54 | -> (TermCount, Int) -- ^ Foreground 55 | -> [(GOId, Double, Double)] 56 | enrichment (bg, bg_total) (fg, fg_total) = 57 | flip map (M.toList fg) $ \(gid, fg_count) -> 58 | let enrich = fromIntegral (fg_count * bg_total) / 59 | fromIntegral (fg_total * bg_count) 60 | bg_count = M.lookupDefault undefined gid bg 61 | p = 1 - hyperquick fg_count bg_count fg_total bg_total 62 | in (gid, enrich, p) 63 | -} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/GO/Parser.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE StrictData #-} 4 | module Bio.GO.Parser 5 | ( readOWL 6 | , readOWLAsMap 7 | , GAF(..) 
8 | , readGAF 9 | ) where 10 | 11 | import Control.Arrow ((&&&)) 12 | import Conduit 13 | import qualified Data.ByteString.Lazy.Char8 as L 14 | import qualified Data.ByteString.Char8 as B 15 | import qualified Data.HashMap.Strict as M 16 | import Data.Text.Encoding (decodeUtf8) 17 | import Text.XML.Expat.Proc 18 | import Text.XML.Expat.Tree 19 | import Data.Maybe 20 | import qualified Data.CaseInsensitive as CI 21 | 22 | import Bio.GO 23 | import Bio.Utils.Misc (readInt) 24 | 25 | readOWL :: FilePath -> IO [GO] 26 | readOWL fl = do 27 | xml <- parseThrowing defaultParseOptions <$> L.readFile fl :: IO (Node B.ByteString B.ByteString) 28 | return $ map process $ filterChildren (\x -> "owl:Class" == getName x && not (isDeprecated x)) xml 29 | where 30 | isDeprecated x = isJust $ findChild "owl:deprecated" x 31 | process record = GO id' label parent namespace 32 | where 33 | id' = case findChild "oboInOwl:id" record of 34 | Nothing -> error $ "Cannot find id field for: " <> show record 35 | Just i -> readInt $ snd $ B.breakEnd (==':') $ B.concat $ map getText $ getChildren i 36 | label = case findChild "rdfs:label" record of 37 | Nothing -> error "readOWL: cannot find label field" 38 | Just l -> decodeUtf8 $ B.concat $ map getText $ getChildren l 39 | namespace = case findChild "oboInOwl:hasOBONamespace" record of 40 | Nothing -> error "readOWL: cannot find namespace field" 41 | Just ns -> decodeUtf8 $ B.concat $ map getText $ getChildren ns 42 | parent = 43 | let f p = case lookup "rdf:resource" (getAttributes p) of 44 | Nothing -> Nothing 45 | Just at -> Just $ readInt $ snd $ B.breakEnd (=='_') at 46 | in mapMaybe f $ findChildren "rdfs:subClassOf" record 47 | 48 | readOWLAsMap :: FilePath -> IO GOMap 49 | readOWLAsMap fl = M.fromListWith errMsg . map (_oboId &&& id) <$> readOWL fl 50 | where 51 | errMsg = error "readOWLAsMap: Duplicate records." 

-- | One row of a GO Annotation File.  Field names follow the GAF 2.1 column
-- definitions; columns that the spec marks optional are Maybe.
data GAF = GAF
    { gafDb :: B.ByteString
    , gafDbId :: B.ByteString
    , gafSymbol :: CI.CI B.ByteString
    , gafQualifier :: Maybe [B.ByteString]
    , gafGoId :: GOId
    , gafDbRef :: [B.ByteString]
    , gafEvidenceCode :: B.ByteString
    , gafWithOrFrom :: Maybe [B.ByteString]
    , gafAspect :: B.ByteString
    , gafName :: Maybe B.ByteString
    , gafSynonym :: Maybe [B.ByteString]
    , gafType :: B.ByteString
    , gafTaxon :: [B.ByteString]
    , gafDate :: B.ByteString
    , gafAssignedBy :: B.ByteString
    , gafAnnotationExtension :: Maybe [B.ByteString]
    , gafGeneProductID :: Maybe B.ByteString
    } deriving (Show, Ord, Eq)

-- | GO Annotation File (GAF) Format 2.1 Parser. For details read:
-- http://geneontology.org/page/go-annotation-file-gaf-format-21.
readGAF :: FilePath -> ConduitT i GAF (ResourceT IO) ()
readGAF input = sourceFileBS input .| linesUnboundedAsciiC .|
    (dropWhileC isCom >> mapC parseLine)
  where
    -- FIX: test for emptiness before B.head.  The original order,
    -- B.head l == '!' || B.null l, evaluated the partial B.head first and
    -- crashed on blank leading lines.
    isCom l = B.null l || B.head l == '!'
{-# INLINE readGAF #-}

-- | Split one tab-separated GAF line into its 17 columns.  Fails with a
-- non-exhaustive-pattern error if the line does not have exactly 17 fields.
parseLine :: B.ByteString -> GAF
parseLine l = GAF f1 f2 (CI.mk f3) (optionals f4)
    (readInt $ snd $ B.breakEnd (==':') f5) (B.split '|' f6) f7 (optionals f8)
    f9 (optional f10) (optionals f11) f12 (B.split '|' f13) f14 f15
    (optionals f16) (optional f17)
  where
    [f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17] = B.split '\t' l
    optional x | B.null x = Nothing
               | otherwise = Just x
    optionals x | B.null x = Nothing
                | otherwise = Just $ B.split '|' x
{-# INLINE parseLine #-}
--------------------------------------------------------------------------------
/bioinformatics-toolkit/src/Bio/HiC.hs:
--------------------------------------------------------------------------------
module Bio.HiC
    ( ContactMap(..)
3 | , fromAL 4 | , mkContactMap 5 | , mkContactMap' 6 | 7 | -- * contact matrix normalization 8 | , vcNorm 9 | , vcNormVector 10 | , sqrtNorm 11 | , sqrtNormVector 12 | , obsDivExp 13 | , expectedVector 14 | ) where 15 | 16 | import Bio.SamTools.Bam 17 | import qualified Bio.SamTools.BamIndex as BI 18 | import Control.Monad (forM_, when, liftM, replicateM) 19 | import Control.Monad.Primitive 20 | import Control.Monad.Trans (lift) 21 | import qualified Data.ByteString.Char8 as B 22 | import Data.Binary (Binary(..)) 23 | import Data.Conduit 24 | import qualified Data.Conduit.List as CL 25 | import Data.List (foldl') 26 | import Data.Maybe (fromJust) 27 | import Data.Bits (shiftR) 28 | import qualified Data.Vector.Generic as G 29 | import qualified Data.Vector.Generic.Mutable as GM 30 | import qualified Data.Vector.Unboxed as U 31 | import qualified Data.Matrix.Symmetric as MS 32 | import qualified Data.Matrix.Symmetric.Mutable as MSM 33 | import Statistics.Sample (mean) 34 | 35 | import Bio.Data.Bam 36 | import Bio.Data.Bed 37 | 38 | type Matrix a = MS.SymMatrix U.Vector a 39 | 40 | -- | a specific base in the genome 41 | type Site = (B.ByteString, Int) 42 | 43 | -- | two sites interacting with each other form a contact 44 | type Contact = (Site, Site) 45 | 46 | data ContactMap = ContactMap 47 | { _chroms :: [(B.ByteString, Int)] 48 | , _resolution :: !Int 49 | , _matrix :: !(Matrix Double) 50 | } 51 | 52 | instance Binary ContactMap where 53 | put (ContactMap chroms res mat) = do 54 | put n 55 | mapM_ put chroms 56 | put res 57 | put mat 58 | where 59 | n = length chroms 60 | 61 | get = do 62 | n <- get 63 | chroms <- replicateM n get 64 | res <- get 65 | mat <- get 66 | return $ ContactMap chroms res mat 67 | 68 | -- | Read HiC contact map from associate list 69 | fromAL :: PrimMonad m 70 | => [(B.ByteString, Int)] 71 | -> Int 72 | -> Sink ((Int, Int), Double) m ContactMap 73 | fromAL chrs res = do 74 | mat <- lift $ MSM.replicate (matSize,matSize) 0 75 | CL.mapM_ 
$ \((i,j), v) -> MSM.write mat (i `div` res, j `div` res) v 76 | mat' <- lift $ MS.unsafeFreeze mat 77 | return $ ContactMap chrs res mat' 78 | where 79 | matSize = foldl' (+) 0 $ map (\(_,x) -> (x-1) `div` res + 1) chrs 80 | {-# INLINE fromAL #-} 81 | 82 | {- 83 | mkContactMap' :: FilePath -> [BED3] -> Int -> Int -> Source IO .. 84 | mkContactMap' bamFl regions w extend = do 85 | handle <- liftIO $ BI.open bamFl 86 | forM_ [0..n-1] 87 | where 88 | positions = V.scanl f 0 regions' 89 | n = V.last positions 90 | f acc (BED3 chr s e) = acc + (e - s) `div` w 91 | regions' = V.fromList regions 92 | g i = binarySearch positions i 93 | -} 94 | 95 | -- | O(n * (logN + k)). n = number of bins, N = number of tags. Generate contanct 96 | -- map using constant memory 97 | mkContactMap :: FilePath -> BED3 -> Int -> Int -> Source IO ((Int, Int), Int) 98 | mkContactMap bamFl (BED3 chr s e) w extend = do 99 | handle <- lift $ BI.open bamFl 100 | forM_ [0..n-1] $ \i -> 101 | forM_ [i..n-1] $ \j -> do 102 | let s1 = s + i * w 103 | c1 = s1 + w' 104 | s2 = s + j * w 105 | c2 = s2 + w' 106 | r <- lift $ readCount handle (w'+extend) ((chr, c1), (chr, c2)) 107 | if i == j 108 | then yield ((s1, s2), r `div` 2) 109 | else yield ((s1, s2), r) 110 | where 111 | n = (e - s) `div` w 112 | w' = w `div` 2 113 | 114 | -- | O(N + n). Store matrix in memory, faster when region is big 115 | mkContactMap' :: PrimMonad m => BED3 -> Int -> Int -> Sink Bam1 m (U.Vector Int) 116 | mkContactMap' (BED3 chr s e) w extend = do 117 | vec <- lift $ GM.replicate vecLen 0 118 | loop vec 119 | lift . liftM (G.map (`div` 2)) . 
G.unsafeFreeze $ vec 120 | where 121 | n = (e - s) `div` w 122 | e' = n * w 123 | vecLen = n * (n+1) `div` 2 124 | loop v = do 125 | x <- await 126 | case x of 127 | Just bam -> 128 | let flag = do bamChr <- targetName bam 129 | matChr <- mateTargetName bam 130 | return $ bamChr == chr && matChr == chr 131 | in case flag of 132 | Just True -> do 133 | let (p, pMate) = getStarts bam 134 | a = (p - s) `div` w 135 | b = (pMate - s) `div` w 136 | i | a < b = idx a b 137 | | otherwise = idx b a 138 | when (p >= s && pMate >= s && p < e' && pMate < e') $ 139 | lift $ GM.read v i >>= GM.write v i . (+1) 140 | loop v 141 | _ -> loop v 142 | _ -> return () 143 | idx i j = i * (2 * n - i + 1) `div` 2 + j - i 144 | 145 | -- | get starting location of bam and its mate 146 | getStarts :: Bam1 -> (Int, Int) 147 | getStarts bam = let p1 = fromIntegral . fromJust . position $ bam 148 | p2 = fromIntegral . fromJust . matePosition $ bam 149 | l = fromIntegral . fromJust . queryLength $ bam 150 | p1' | isReverse bam = p1 + l 151 | | otherwise = p1 152 | p2' | isReverse bam = p2 + l 153 | | otherwise = p2 154 | in (p1', p2') 155 | {-# INLINE getStarts #-} 156 | 157 | -- | the number of tag pairs that overlap with the region 158 | -- covered by the contact 159 | readCount :: BI.IdxHandle -- ^ bam file handler 160 | -> Int -- ^ half window width 161 | -> Contact 162 | -> IO Int 163 | readCount handle w ((c1, p1), (c2, p2)) = viewBam handle (c1, p1-w, p1+w) $$ CL.fold f 0 164 | where 165 | f acc x = 166 | case mateTargetName x of 167 | Just chr -> if chr == c2 168 | then let mp = fromIntegral . fromJust . matePosition $ x 169 | l = fromIntegral . fromJust . 
queryLength $ x 170 | in if isOverlapped r2 (mp, mp+l) 171 | then acc + 1 172 | else acc 173 | else acc 174 | _ -> acc 175 | isOverlapped (lo,hi) (lo',hi') = lo' < hi && hi' > lo 176 | r2 = (p2-w, p2+w) 177 | {-# INLINE readCount #-} 178 | 179 | -- | Vanilla coverage normalization (Lieberman-Aiden et al., 2009) 180 | vcNorm :: ContactMap -> ContactMap 181 | vcNorm c = normalizeBy (vcNormVector c) c 182 | 183 | sqrtNorm :: ContactMap -> ContactMap 184 | sqrtNorm c = normalizeBy (sqrtNormVector c) c 185 | 186 | normalizeBy :: U.Vector Double -> ContactMap -> ContactMap 187 | normalizeBy normVec c = c{_matrix = MS.SymMatrix n vec'} 188 | where 189 | (MS.SymMatrix n vec) = _matrix c 190 | vec' = U.create $ do 191 | v <- U.thaw vec 192 | loop 0 0 v 193 | return v 194 | loop i j v | i < n && j < n = do 195 | let i' = idx n i j 196 | x = (normVec U.! i) * (normVec U.! j) 197 | x' | x == 0 = 0 198 | | otherwise = 1 / x 199 | GM.read v i' >>= GM.write v i' . (*x') 200 | loop i (j+1) v 201 | | i < n = loop (i+1) (i+1) v 202 | | otherwise = return () 203 | {-# INLINE normalizeBy #-} 204 | 205 | vcNormVector :: ContactMap -> U.Vector Double 206 | vcNormVector c = U.generate n $ \i -> (U.foldl1' (+) $ mat `MS.takeRow` i) / 1e6 207 | where 208 | mat = _matrix c 209 | n = fst $ MS.dim mat 210 | {-# INLINE vcNormVector #-} 211 | 212 | sqrtNormVector :: ContactMap -> U.Vector Double 213 | sqrtNormVector = U.map sqrt . vcNormVector 214 | {-# INLINE sqrtNormVector #-} 215 | 216 | -- | O/E 217 | obsDivExp :: ContactMap -> ContactMap 218 | obsDivExp c = c{_matrix = MS.SymMatrix n vec'} 219 | where 220 | (MS.SymMatrix n vec) = _matrix c 221 | vec' = U.create $ do 222 | v <- U.thaw vec 223 | loop 0 0 v 224 | return v 225 | loop i j v | i < n && j < n = do 226 | let i' = idx n i j 227 | x = expect `U.unsafeIndex` (j-i) 228 | x' | x == 0 = 0 229 | | otherwise = 1 / x 230 | GM.read v i' >>= GM.write v i' . 
(*x') 231 | loop i (j+1) v 232 | | i < n = loop (i+1) (i+1) v 233 | | otherwise = return () 234 | expect = expectedVector c 235 | 236 | -- | Expected contact frequency between two locus with distance d. 237 | expectedAt :: Int -> ContactMap -> Double 238 | expectedAt d c = mean $ U.generate (n-d) $ \i -> MS.unsafeIndex mat (i,i+d) 239 | where 240 | mat = _matrix c 241 | n = MS.rows mat 242 | {-# INLINE expectedAt #-} 243 | 244 | expectedVector :: ContactMap -> U.Vector Double 245 | expectedVector c = U.generate n $ \d -> expectedAt d c 246 | where 247 | n = MS.rows $ _matrix c 248 | {-# INLINE expectedVector #-} 249 | 250 | -- row major upper triangular indexing 251 | idx :: Int -> Int -> Int -> Int 252 | idx n i j = (i * (2 * n - i - 1)) `shiftR` 1 + j 253 | {-# INLINE idx #-} 254 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/HiC/Visualize.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | module Bio.HiC.Visualize 3 | ( drawHiC 4 | , DrawOpt(..) 
5 | , reds 6 | , blueRed 7 | ) where 8 | 9 | import qualified Data.ByteString.Lazy as L 10 | import qualified Data.Matrix.Generic as MG 11 | import Data.Matrix.Symmetric (SymMatrix(..)) 12 | import Codec.Picture 13 | import Data.Colour 14 | import Data.Colour.Names 15 | import Data.Colour.SRGB 16 | import Data.Default.Class (Default(..)) 17 | import qualified Data.Vector as V 18 | import qualified Data.Vector.Unboxed as U 19 | 20 | import Bio.HiC 21 | 22 | data DrawOpt = DrawOpt 23 | { _range :: !(Maybe (Double, Double)) 24 | , _palette :: !(V.Vector (Colour Double)) 25 | } 26 | 27 | instance Default DrawOpt where 28 | def = DrawOpt 29 | { _range = Nothing 30 | , _palette = reds 31 | } 32 | 33 | reds :: V.Vector (Colour Double) 34 | reds = V.fromList $ interpolate 62 white red 35 | 36 | blueRed :: V.Vector (Colour Double) 37 | blueRed = V.fromList $ interpolate 30 blue white ++ interpolate 30 white red 38 | 39 | drawHiC :: FilePath -> ContactMap -> DrawOpt -> IO () 40 | drawHiC fl m opt = case encodePalettedPng pal pic of 41 | Left e -> error e 42 | Right png -> L.writeFile fl png 43 | where 44 | pic = matToImage nCol (lo,hi) mat 45 | pal = generateImage fn nCol 1 46 | where 47 | fn i _ = colorConvert $ cols V.! i 48 | cols = _palette opt 49 | nCol = V.length $ _palette opt 50 | mat@(SymMatrix _ v) = _matrix m 51 | (lo,hi) = case _range opt of 52 | Just (a,b) -> (a,b) 53 | _ -> (U.minimum v, U.maximum v) 54 | 55 | {- 56 | drawMatrix :: MG.Matrix m v Double => FilePath -> m v Double -> DrawOpt -> IO () 57 | drawMatrix fl mat opt = case encodePalettedPng pal pic of 58 | Left e -> error e 59 | Right png -> L.writeFile fl png 60 | where 61 | pic = matToImage nCol (lo,hi) mat 62 | pal = generateImage fn nCol 1 63 | where 64 | fn i _ = colorConvert $ cols V.! 
i 65 | cols = _palette opt 66 | nCol = V.length $ _palette opt 67 | (lo,hi) = case _range opt of 68 | Just (a,b) -> (a,b) 69 | _ -> (U.minimum v, U.maximum v) 70 | -} 71 | 72 | matToImage :: MG.Matrix m v Double => Int -> (Double, Double) -> m v Double -> Image Pixel8 73 | matToImage n (lo,hi) mat = generateImage drawPixel r c 74 | where 75 | drawPixel i j | x <= lo = 0 76 | | x >= hi = fromIntegral $ n - 1 77 | | otherwise = truncate $ (x - lo) / step 78 | where 79 | x = mat `MG.unsafeIndex` (i,j) 80 | step = (hi - lo) / fromIntegral n 81 | (r,c) = MG.dim mat 82 | {-# INLINE matToImage #-} 83 | 84 | indexedMatToImage :: MG.Matrix m v Pixel8 => m v Pixel8 -> Image Pixel8 85 | indexedMatToImage mat = generateImage fn r c 86 | where 87 | fn i j = mat `MG.unsafeIndex` (i,j) 88 | (r,c) = MG.dim mat 89 | {-# INLINE indexedMatToImage #-} 90 | 91 | colorConvert :: Colour Double -> PixelRGB8 92 | colorConvert c = let RGB r g b = toSRGB24 c 93 | in PixelRGB8 r g b 94 | {-# INLINE colorConvert #-} 95 | 96 | {- 97 | colorBlend :: Int -> V.Vector (Colour Double) -> V.Vector (Colour Double) 98 | colorBlend n colors | n <= m = colors 99 | | otherwise = 100 | where 101 | m = V.length colors 102 | -} 103 | 104 | interpolate :: Int -> Colour Double -> Colour Double -> [Colour Double] 105 | interpolate n c1 c2 = loop 1 106 | where 107 | loop i | i > n = [] 108 | | otherwise = blend (fromIntegral i * step) c2 c1 : loop (i+1) 109 | step = 1 / fromIntegral (n+1) 110 | {-# INLINE interpolate #-} 111 | 112 | -- | map numbers to colors 113 | colorMapSmooth :: Double -- a value from 0 to 1 114 | -> V.Vector (Colour Double) -> Colour Double 115 | colorMapSmooth x colors = blend p (colors V.! i) $ colors V.! 
(i+1) 116 | where 117 | p = fromIntegral i - x * (fromIntegral n - 1) + 1 118 | i | x == 1 = n - 2 119 | | otherwise = truncate $ x * (fromIntegral n - 1) 120 | n = V.length colors 121 | {-# INLINE colorMapSmooth #-} 122 | 123 | linearMapBound :: (Double, Double) -> (Double, Double) -> Double -> Double 124 | linearMapBound (l, u) (l', u') x 125 | | isNaN x || x < l = l' 126 | | x > u = u' 127 | | otherwise = (x - l) / (u - l) * (u' - l') + l' 128 | {-# INLINE linearMapBound #-} 129 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/HiC/Visualize/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE Rank2Types #-} 3 | {-# LANGUAGE CPP #-} 4 | -- most of the codes in this file are directly copied from JuicyPixel 5 | 6 | module Bio.HiC.Visualize.Internal where 7 | 8 | #if !MIN_VERSION_base(4,8,0) 9 | import Foreign.ForeignPtr.Safe( ForeignPtr, castForeignPtr ) 10 | #else 11 | import Foreign.ForeignPtr( ForeignPtr, castForeignPtr ) 12 | #endif 13 | 14 | import Foreign.Storable( Storable, sizeOf ) 15 | import Control.Monad (when) 16 | import Data.Binary (Binary(..), Get) 17 | import Data.Binary.Get( getWord8 18 | , getWord32be 19 | , getLazyByteString 20 | ) 21 | import Data.Binary.Put( runPut 22 | , putWord8 23 | , putWord32be 24 | , putLazyByteString 25 | ) 26 | import Data.Bits( xor, (.&.), unsafeShiftR ) 27 | import qualified Data.Vector.Unboxed as U 28 | import Data.Word 29 | import Data.List (foldl') 30 | import qualified Data.ByteString as B 31 | import qualified Data.ByteString.Lazy as L 32 | import qualified Data.ByteString.Lazy.Char8 as LS 33 | import Data.Vector.Storable ( Vector, unsafeToForeignPtr, unsafeFromForeignPtr0 ) 34 | import qualified Data.ByteString.Internal as S 35 | import qualified Data.Vector.Generic as G 36 | 37 | import Data.Conduit 38 | 39 | -- | Value used to identify a png chunk, 
must be 4 bytes long. 40 | type ChunkSignature = L.ByteString 41 | 42 | -- | Generic header used in PNG images. 43 | data PngIHdr = PngIHdr 44 | { width :: !Word32 -- ^ Image width in number of pixel 45 | , height :: !Word32 -- ^ Image height in number of pixel 46 | , bitDepth :: !Word8 -- ^ Number of bit per sample 47 | , colourType :: !PngImageType -- ^ Kind of png image (greyscale, true color, indexed...) 48 | , compressionMethod :: !Word8 -- ^ Compression method used 49 | , filterMethod :: !Word8 -- ^ Must be 0 50 | , interlaceMethod :: !PngInterlaceMethod -- ^ If the image is interlaced (for progressive rendering) 51 | } 52 | deriving Show 53 | 54 | -- | Data structure during real png loading/parsing 55 | data PngRawChunk = PngRawChunk 56 | { chunkLength :: Word32 57 | , chunkType :: ChunkSignature 58 | , chunkCRC :: Word32 59 | , chunkData :: L.ByteString 60 | } 61 | 62 | -- | What kind of information is encoded in the IDAT section 63 | -- of the PngFile 64 | data PngImageType = 65 | PngGreyscale 66 | | PngTrueColour 67 | | PngIndexedColor 68 | | PngGreyscaleWithAlpha 69 | | PngTrueColourWithAlpha 70 | deriving Show 71 | 72 | -- | Different known interlace methods for PNG image 73 | data PngInterlaceMethod = 74 | -- | No interlacing, basic data ordering, line by line 75 | -- from left to right. 
76 | PngNoInterlace 77 | 78 | -- | Use the Adam7 ordering, see `adam7Reordering` 79 | | PngInterlaceAdam7 80 | deriving (Enum, Show) 81 | 82 | preparePngHeader :: Int -> Int -> PngImageType -> Word8 -> PngIHdr 83 | preparePngHeader w h imgType depth = PngIHdr 84 | { width = fromIntegral w 85 | , height = fromIntegral h 86 | , bitDepth = depth 87 | , colourType = imgType 88 | , compressionMethod = 0 89 | , filterMethod = 0 90 | , interlaceMethod = PngNoInterlace 91 | } 92 | 93 | instance Binary PngRawChunk where 94 | put chunk = do 95 | putWord32be $ chunkLength chunk 96 | putLazyByteString $ chunkType chunk 97 | when (chunkLength chunk /= 0) 98 | (putLazyByteString $ chunkData chunk) 99 | putWord32be $ chunkCRC chunk 100 | 101 | get = do 102 | size <- getWord32be 103 | chunkSig <- getLazyByteString (fromIntegral $ L.length iHDRSignature) 104 | imgData <- if size == 0 105 | then return L.empty 106 | else getLazyByteString (fromIntegral size) 107 | crc <- getWord32be 108 | 109 | let computedCrc = pngComputeCrc [chunkSig, imgData] 110 | when (computedCrc `xor` crc /= 0) 111 | (fail $ "Invalid CRC : " ++ show computedCrc ++ ", " 112 | ++ show crc) 113 | return PngRawChunk { 114 | chunkLength = size, 115 | chunkData = imgData, 116 | chunkCRC = crc, 117 | chunkType = chunkSig 118 | } 119 | 120 | instance Binary PngImageType where 121 | put PngGreyscale = putWord8 0 122 | put PngTrueColour = putWord8 2 123 | put PngIndexedColor = putWord8 3 124 | put PngGreyscaleWithAlpha = putWord8 4 125 | put PngTrueColourWithAlpha = putWord8 6 126 | 127 | get = get >>= imageTypeOfCode 128 | 129 | imageTypeOfCode :: Word8 -> Get PngImageType 130 | imageTypeOfCode 0 = return PngGreyscale 131 | imageTypeOfCode 2 = return PngTrueColour 132 | imageTypeOfCode 3 = return PngIndexedColor 133 | imageTypeOfCode 4 = return PngGreyscaleWithAlpha 134 | imageTypeOfCode 6 = return PngTrueColourWithAlpha 135 | imageTypeOfCode _ = fail "Invalid png color code" 136 | 137 | instance Binary PngIHdr where 
138 | put hdr = do 139 | putWord32be 13 140 | let inner = runPut $ do 141 | putLazyByteString iHDRSignature 142 | putWord32be $ width hdr 143 | putWord32be $ height hdr 144 | putWord8 $ bitDepth hdr 145 | put $ colourType hdr 146 | put $ compressionMethod hdr 147 | put $ filterMethod hdr 148 | put $ interlaceMethod hdr 149 | crc = pngComputeCrc [inner] 150 | putLazyByteString inner 151 | putWord32be crc 152 | 153 | get = do 154 | _size <- getWord32be 155 | ihdrSig <- getLazyByteString (L.length iHDRSignature) 156 | when (ihdrSig /= iHDRSignature) 157 | (fail "Invalid PNG file, wrong ihdr") 158 | w <- getWord32be 159 | h <- getWord32be 160 | depth <- get 161 | colorType <- get 162 | compression <- get 163 | filtermethod <- get 164 | interlace <- get 165 | _crc <- getWord32be 166 | return PngIHdr { 167 | width = w, 168 | height = h, 169 | bitDepth = depth, 170 | colourType = colorType, 171 | compressionMethod = compression, 172 | filterMethod = filtermethod, 173 | interlaceMethod = interlace 174 | } 175 | 176 | instance Binary PngInterlaceMethod where 177 | get = getWord8 >>= \w -> case w of 178 | 0 -> return PngNoInterlace 179 | 1 -> return PngInterlaceAdam7 180 | _ -> fail "Invalid interlace method" 181 | 182 | put PngNoInterlace = putWord8 0 183 | put PngInterlaceAdam7 = putWord8 1 184 | 185 | -- signature 186 | 187 | -- | Signature signalling that the following data will be a png image 188 | -- in the png bit stream 189 | pngSignature :: ChunkSignature 190 | pngSignature = L.pack [137, 80, 78, 71, 13, 10, 26, 10] 191 | 192 | -- | Helper function to help pack signatures. 193 | signature :: String -> ChunkSignature 194 | signature = LS.pack 195 | 196 | -- | Signature for the header chunk of png (must be the first) 197 | iHDRSignature :: ChunkSignature 198 | iHDRSignature = signature "IHDR" 199 | 200 | -- | Signature for a palette chunk in the pgn file. Must 201 | -- occure before iDAT. 
202 | pLTESignature :: ChunkSignature 203 | pLTESignature = signature "PLTE" 204 | 205 | -- | Signature for a data chuck (with image parts in it) 206 | iDATSignature :: ChunkSignature 207 | iDATSignature = signature "IDAT" 208 | 209 | -- | Signature for the last chunk of a png image, telling 210 | -- the end. 211 | iENDSignature :: ChunkSignature 212 | iENDSignature = signature "IEND" 213 | 214 | -- | Compute the CRC of a raw buffer, as described in annex D of the PNG 215 | -- specification. 216 | pngComputeCrc :: [L.ByteString] -> Word32 217 | pngComputeCrc = (0xFFFFFFFF `xor`) . L.foldl' updateCrc 0xFFFFFFFF . L.concat 218 | where updateCrc crc val = 219 | let u32Val = fromIntegral val 220 | lutVal = pngCrcTable U.! fromIntegral ((crc `xor` u32Val) .&. 0xFF) 221 | in lutVal `xor` (crc `unsafeShiftR` 8) 222 | 223 | -- | From the Annex D of the png specification. 224 | pngCrcTable :: U.Vector Word32 225 | pngCrcTable = U.fromListN 256 [ foldl' updateCrcConstant c [zero .. 7] | c <- [0 .. 255] ] 226 | where zero = 0 :: Int -- To avoid defaulting to Integer 227 | updateCrcConstant c _ | c .&. 1 /= 0 = magicConstant `xor` (c `unsafeShiftR` 1) 228 | | otherwise = c `unsafeShiftR` 1 229 | magicConstant = 0xedb88320 :: Word32 230 | 231 | 232 | 233 | ---------------------------------------------------------------------------- 234 | 235 | endChunk :: PngRawChunk 236 | endChunk = PngRawChunk { chunkLength = 0 237 | , chunkType = iENDSignature 238 | , chunkCRC = pngComputeCrc [iENDSignature] 239 | , chunkData = L.empty 240 | } 241 | 242 | type Palette = Vector Word8 243 | 244 | preparePalette :: Palette -> PngRawChunk 245 | preparePalette pal = PngRawChunk 246 | { chunkLength = fromIntegral $ G.length pal 247 | , chunkType = pLTESignature 248 | , chunkCRC = pngComputeCrc [pLTESignature, binaryData] 249 | , chunkData = binaryData 250 | } 251 | where binaryData = L.fromChunks [toByteString pal] 252 | 253 | toByteString :: forall a. 
(Storable a) => Vector a -> B.ByteString 254 | toByteString vec = S.PS (castForeignPtr ptr) offset (len * size) 255 | where (ptr, offset, len) = unsafeToForeignPtr vec 256 | size = sizeOf (undefined :: a) 257 | 258 | generatePng :: Int -> Int -> Palette -> (Int -> Int -> Word8) -> Source m L.ByteString 259 | generatePng w h pal fn = do 260 | yield pngSignature 261 | yield $ encode header 262 | yield $ encode $ preparePalette pal 263 | loop 0 0 264 | yield $ encode endChunk 265 | where 266 | header = preparePngHeader w h PngIndexedColor 8 267 | loop i j = do 268 | fn i j 269 | 270 | genericEncodePng :: forall px. (Pixel px, PixelBaseComponent px ~ Word8) 271 | => Int -> Int -> Maybe Palette -> PngImageType -> Image px 272 | -> L.ByteString 273 | genericEncodePng w h palette imgKind 274 | image@(Image { imageWidth = w, imageHeight = h, imageData = arr }) = 275 | encode PngRawImage { header = hdr 276 | , chunks = prependPalette palette [prepareIDatChunk imgEncodedData, endChunk]} 277 | where hdr = preparePngHeader w h image imgKind 8 278 | zero = B.singleton 0 279 | 280 | compCount = componentCount (undefined :: px) 281 | 282 | prependPalette Nothing l = l 283 | prependPalette (Just p) l = preparePalette p : l 284 | 285 | lineSize = compCount * w 286 | encodeLine line = blitVector arr (line * lineSize) lineSize 287 | imgEncodedData = Z.compress . L.fromChunks 288 | $ concat [[zero, encodeLine line] | line <- [0 .. 
h - 1]] 289 | 290 | prepareIDatChunk :: Lb.ByteString -> PngRawChunk 291 | prepareIDatChunk imgData = PngRawChunk 292 | { chunkLength = fromIntegral $ Lb.length imgData 293 | , chunkType = iDATSignature 294 | , chunkCRC = pngComputeCrc [iDATSignature, imgData] 295 | , chunkData = imgData 296 | } 297 | 298 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Motif/Alignment.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE Rank2Types #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE BangPatterns #-} 5 | module Bio.Motif.Alignment 6 | ( alignment 7 | , alignmentBy 8 | , linPenal 9 | , quadPenal 10 | , cubPenal 11 | , expPenal 12 | , l1 13 | , l2 14 | , l3 15 | , lInf 16 | , AlignFn 17 | , CombineFn 18 | ) where 19 | 20 | import qualified Data.Vector.Generic as G 21 | import qualified Data.Vector.Unboxed as U 22 | import qualified Data.Matrix.Unboxed as M 23 | import Statistics.Sample (mean) 24 | 25 | import Bio.Motif 26 | import Bio.Utils.Functions 27 | 28 | -- | penalty function takes the number of gaps and matched positions as input, 29 | -- return penalty value 30 | type PenalFn = Int -> Int -> Double 31 | 32 | type DistanceFn = forall v. (G.Vector v Double, G.Vector v (Double, Double)) 33 | => v Double -> v Double -> Double 34 | 35 | type AlignFn = PWM 36 | -> PWM 37 | -> (Double, (Bool, Int)) -- ^ (distance, (on same direction, 38 | -- position w.r.t. 
the first pwm)) 39 | 40 | -- | combine distances from different positions of alignment 41 | type CombineFn = U.Vector Double -> Double 42 | 43 | alignment :: AlignFn 44 | alignment = alignmentBy jsd (expPenal 0.05) l1 45 | 46 | -- | linear penalty 47 | linPenal :: Double -> PenalFn 48 | linPenal x nGap nMatch = fromIntegral nGap * x / fromIntegral nMatch 49 | {-# INLINE linPenal #-} 50 | 51 | -- | quadratic penalty 52 | quadPenal :: Double -> PenalFn 53 | quadPenal x nGap nMatch = fromIntegral (nGap ^ (2 :: Int)) * x / fromIntegral nMatch 54 | {-# INLINE quadPenal #-} 55 | 56 | -- | cubic penalty 57 | cubPenal :: Double -> PenalFn 58 | cubPenal x nGap nMatch = fromIntegral (nGap ^ (3 :: Int)) * x / fromIntegral nMatch 59 | {-# INLINE cubPenal #-} 60 | 61 | -- | exponential penalty 62 | expPenal :: Double -> PenalFn 63 | expPenal x nGap nMatch = fromIntegral (2^nGap - 1 :: Int) * x / fromIntegral nMatch 64 | {-# INLINE expPenal #-} 65 | 66 | l1 :: CombineFn 67 | l1 = mean 68 | {-# INLINE l1 #-} 69 | 70 | l2 :: CombineFn 71 | l2 = sqrt . mean . U.map (**2) 72 | {-# INLINE l2 #-} 73 | 74 | l3 :: CombineFn 75 | l3 = (**(1/3)) . mean . 
U.map (**3) 76 | {-# INLINE l3 #-} 77 | 78 | lInf :: CombineFn 79 | lInf = U.maximum 80 | {-# INLINE lInf #-} 81 | 82 | -- internal gaps are not allowed, larger score means larger distance, so the smaller the better 83 | alignmentBy :: DistanceFn -- ^ compute the distance between two aligned pwms 84 | -> PenalFn -- ^ gap penalty 85 | -> CombineFn 86 | -> AlignFn 87 | alignmentBy fn pFn combFn m1 m2 88 | | fst forwardAlign <= fst reverseAlign = 89 | (fst forwardAlign, (True, snd forwardAlign)) 90 | | otherwise = (fst reverseAlign, (False, snd reverseAlign)) 91 | where 92 | forwardAlign | d1 < d2 = (d1,i1) 93 | | otherwise = (d2,-i2) 94 | where 95 | (d1,i1) = loop opti2 (1/0,-1) s2 s1 0 96 | (d2,i2) = loop opti1 (1/0,-1) s1 s2 0 97 | reverseAlign | d1 < d2 = (d1,i1) 98 | | otherwise = (d2,-i2) 99 | where 100 | (d1,i1) = loop opti2 (1/0,-1) s2' s1 0 101 | (d2,i2) = loop opti1 (1/0,-1) s1 s2' 0 102 | 103 | loop opti (min',i') a b@(_:xs) !i 104 | | opti U.! i >= min' = (min',i') 105 | | d < min' = loop opti (d,i) a xs (i+1) 106 | | otherwise = loop opti (min',i') a xs (i+1) 107 | where 108 | d = combFn sc + pFn nGap nMatch 109 | sc = U.fromList $ zipWith fn a b 110 | nMatch = U.length sc 111 | nGap = n1 + n2 - 2 * nMatch 112 | loop _ acc _ _ _ = acc 113 | 114 | opti1 = optimalSc n1 n2 115 | opti2 = optimalSc n2 n1 116 | 117 | optimalSc x y = U.fromList $ scanr1 f $ go 0 118 | where 119 | f v min' = min v min' 120 | go i | nM == 0 = [] 121 | | otherwise = pFn nG nM : go (i+1) 122 | where 123 | nM = min x $ y - i 124 | nG = i + abs (x - (y-i)) 125 | 126 | s1 = M.toRows . _mat $ m1 127 | s2 = M.toRows . _mat $ m2 128 | s2' = M.toRows . 
_mat $ m2' 129 | m2' = rcPWM m2 130 | n1 = length s1 131 | n2 = length s2 132 | {-# INLINE alignmentBy #-} 133 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Motif/Merge.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | module Bio.Motif.Merge 4 | ( mergePWM 5 | , mergePWMWeighted 6 | , dilute 7 | , trim 8 | , mergeTree 9 | , mergeTreeWeighted 10 | , iterativeMerge 11 | , buildTree 12 | , cutTreeBy 13 | )where 14 | 15 | import AI.Clustering.Hierarchical hiding (size) 16 | import Control.Arrow (first) 17 | import Control.Monad (forM_, when) 18 | import Control.Monad.ST (runST, ST) 19 | import qualified Data.ByteString.Char8 as B 20 | import Data.List (dropWhileEnd) 21 | import qualified Data.Matrix.Symmetric.Generic.Mutable as MSU 22 | import qualified Data.Matrix.Unboxed as MU 23 | import Data.Maybe 24 | import qualified Data.Vector as V 25 | import qualified Data.Vector.Mutable as VM 26 | import qualified Data.Vector.Unboxed as U 27 | 28 | import Bio.Motif 29 | import Bio.Motif.Alignment 30 | import Bio.Utils.Functions (kld) 31 | 32 | mergePWM :: (PWM, PWM, Int) -> PWM 33 | mergePWM (m1, m2, shift) | shift >= 0 = merge shift (_mat m1) $ _mat m2 34 | | otherwise = merge (-shift) (_mat m2) $ _mat m1 35 | where 36 | merge s a b = PWM Nothing $ MU.fromRows $ loop 0 37 | where 38 | n1 = MU.rows a 39 | n2 = MU.rows b 40 | loop i | i' < 0 || (i < n1 && i' >= n2) = MU.takeRow a i : loop (i+1) 41 | | i < n1 && i' < n2 = f (MU.takeRow a i) (MU.takeRow b i') : loop (i+1) 42 | | i >= n1 && i' < n2 = MU.takeRow b i' : loop (i+1) 43 | | otherwise = [] 44 | where 45 | i' = i - s 46 | f = U.zipWith (\x y -> (x+y)/2) 47 | 48 | mergePWMWeighted :: (PWM, [Int]) -- ^ pwm and weights at each position 49 | -> (PWM, [Int]) 50 | -> Int -- ^ shift 51 | -> (PWM, [Int]) 52 | mergePWMWeighted m1 m2 shift 53 | | 
shift >= 0 = merge shift (first _mat m1) $ first _mat m2 54 | | otherwise = merge (-shift) (first _mat m2) $ first _mat m1 55 | 56 | where 57 | merge s (p1,w1) (p2,w2) = first (PWM Nothing . MU.fromRows) $ unzip $ loop 0 58 | where 59 | a = V.fromList $ zip (MU.toRows p1) w1 60 | b = V.fromList $ zip (MU.toRows p2) w2 61 | n1 = V.length a 62 | n2 = V.length b 63 | loop i | i' < 0 || (i < n1 && i' >= n2) = a V.! i : loop (i+1) 64 | | i < n1 && i' < n2 = f (a V.! i) (b V.! i') : loop (i+1) 65 | | i >= n1 && i' < n2 = b V.! i' : loop (i+1) 66 | | otherwise = [] 67 | where 68 | i' = i - s 69 | f (xs, wx) (ys, wy) = (U.zipWith (\x y -> 70 | (fromIntegral wx * x + fromIntegral wy * y) / 71 | fromIntegral (wx + wy)) xs ys, wx + wy) 72 | 73 | -- | dilute positions in a PWM that are associated with low weights 74 | dilute :: (PWM, [Int]) -> PWM 75 | dilute (pwm, ws) = PWM Nothing $ MU.fromRows $ zipWith f ws $ MU.toRows $ _mat pwm 76 | where 77 | f w r | w < n = let d = fromIntegral $ n - w 78 | in U.map (\x -> (fromIntegral w * x + 0.25 * d) / fromIntegral n) r 79 | | otherwise = r 80 | n = maximum ws 81 | {-# INLINE dilute #-} 82 | 83 | trim :: Bkgd -> Double -> PWM -> PWM 84 | trim (BG (a,c,g,t)) cutoff pwm = PWM Nothing $ MU.fromRows $ dropWhileEnd f $ 85 | dropWhile f rs 86 | where 87 | f x = kld x bg < cutoff 88 | rs = MU.toRows $ _mat pwm 89 | bg = U.fromList [a,c,g,t] 90 | {-# INLINE trim #-} 91 | 92 | mergeTree :: AlignFn -> Dendrogram Motif -> PWM 93 | mergeTree align t = case t of 94 | Branch _ _ left right -> f (mergeTree align left) $ mergeTree align right 95 | Leaf a -> _pwm a 96 | where 97 | f a b | isSame = mergePWM (a, b, i) 98 | | otherwise = mergePWM (a, rcPWM b, i) 99 | where (_, (isSame, i)) = align a b 100 | 101 | mergeTreeWeighted :: AlignFn -> Dendrogram Motif -> (PWM, [Int]) 102 | mergeTreeWeighted align t = case t of 103 | Branch _ _ left right -> f (mergeTreeWeighted align left) $ 104 | mergeTreeWeighted align right 105 | Leaf a -> (_pwm a, 
replicate (size $ _pwm a) 1) 106 | where 107 | f (a,w1) (b,w2) 108 | | isSame = mergePWMWeighted (a,w1) (b,w2) i 109 | | otherwise = mergePWMWeighted (a,w1) (rcPWM b, reverse w2) i 110 | where (_, (isSame, i)) = align a b 111 | {-# INLINE mergeTreeWeighted #-} 112 | 113 | iterativeMerge :: AlignFn 114 | -> Double -- cutoff 115 | -> [Motif] -- ^ Motifs to be merged. Motifs must have unique name. 116 | -> [([B.ByteString], PWM, [Int])] 117 | iterativeMerge align th motifs = runST $ do 118 | motifs' <- V.unsafeThaw $ V.fromList $ flip map motifs $ \x -> 119 | Just ([_name x], _pwm x, replicate (size $ _pwm x) 1) 120 | 121 | let n = VM.length motifs' 122 | iter mat = do 123 | -- retrieve the minimum value 124 | ((i, j), (d, (isSame, pos))) <- loop ((-1,-1), (1/0, undefined)) 0 1 125 | if d < th 126 | then do 127 | Just (nm1, pwm1, w1) <- VM.unsafeRead motifs' i 128 | Just (nm2, pwm2, w2) <- VM.unsafeRead motifs' j 129 | let merged = (nm1 ++ nm2, pwm', w') 130 | (pwm',w') | isSame = mergePWMWeighted (pwm1, w1) (pwm2, w2) pos 131 | | otherwise = mergePWMWeighted (pwm1, w1) 132 | (rcPWM $ pwm2, reverse w2) pos 133 | 134 | -- update 135 | forM_ [0..n-1] $ \i' -> MSU.unsafeWrite mat (i',j) Nothing 136 | VM.unsafeWrite motifs' i $ Just merged 137 | VM.unsafeWrite motifs' j Nothing 138 | forM_ [0..n-1] $ \j' -> when (i /= j') $ do 139 | x <- VM.unsafeRead motifs' j' 140 | case x of 141 | Just (_, pwm2',_) -> do 142 | let ali | i < j' = Just $ align pwm' pwm2' 143 | | otherwise = Just $ align pwm2' pwm' 144 | MSU.unsafeWrite mat (i,j') ali 145 | _ -> return () 146 | iter mat 147 | else return () 148 | where 149 | loop ((i_min, j_min), d_min) i j 150 | | i >= n = return ((i_min, j_min), d_min) 151 | | j >= n = loop ((i_min, j_min), d_min) (i+1) (i+2) 152 | | otherwise = do 153 | x <- MSU.unsafeRead mat (i,j) 154 | case x of 155 | Just d -> if fst d < fst d_min 156 | then loop ((i,j), d) i (j+1) 157 | else loop ((i_min, j_min), d_min) i (j+1) 158 | _ -> loop ((i_min, j_min), 
d_min) i (j+1) 159 | 160 | -- initialization 161 | mat <- MSU.replicate (n,n) Nothing :: ST s (MSU.SymMMatrix VM.MVector s (Maybe (Double, (Bool, Int)))) 162 | forM_ [0..n-1] $ \i -> forM_ [i+1 .. n-1] $ \j -> do 163 | Just (_, pwm1, _) <- VM.unsafeRead motifs' i 164 | Just (_, pwm2, _) <- VM.unsafeRead motifs' j 165 | MSU.unsafeWrite mat (i,j) $ Just $ align pwm1 pwm2 166 | 167 | iter mat 168 | results <- V.unsafeFreeze motifs' 169 | return $ V.toList $ V.map fromJust $ V.filter isJust results 170 | {-# INLINE iterativeMerge #-} 171 | 172 | -- | build a guide tree from a set of motifs 173 | buildTree :: AlignFn -> [Motif] -> Dendrogram Motif 174 | buildTree align motifs = hclust Average (V.fromList motifs) δ 175 | where 176 | δ (Motif _ x) (Motif _ y) = fst $ align x y 177 | 178 | cutTreeBy :: Double -- ^ start 179 | -> Double -- ^ step 180 | -> ([Dendrogram a] -> Bool) -> Dendrogram a -> [Dendrogram a] 181 | cutTreeBy start step fn tree = go start 182 | where 183 | go x | fn clusters = clusters 184 | | x - step > 0 = go $ x - step 185 | | otherwise = clusters 186 | where clusters = cutAt tree x 187 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Motif/Search.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | module Bio.Motif.Search 5 | ( findTFBS 6 | , findTFBSWith 7 | , findTFBSSlow 8 | , maxMatchSc 9 | , optimalScoresSuffix 10 | ) where 11 | 12 | import Conduit 13 | import qualified Data.ByteString.Char8 as B 14 | import qualified Data.Matrix.Unboxed as M 15 | import Data.Ord (comparing) 16 | import qualified Data.Vector.Unboxed as U 17 | 18 | import Bio.Motif (Bkgd (..), PWM (..), scores', size) 19 | import Bio.Seq (DNA, toBS) 20 | import qualified Bio.Seq as Seq (length) 21 | 22 | -- | given a user defined threshold, look for TF binding sites on a DNA 23 | -- sequence, using look 
ahead search. This function doesn't search for binding 24 | -- sites on the reverse strand 25 | findTFBS :: Monad m 26 | => Bkgd 27 | -> PWM 28 | -> DNA a 29 | -> Double 30 | -> Bool -- ^ whether to skip ambiguous sequences. Recommend: True 31 | -- in most cases 32 | -> ConduitT i (Int, Double) m () 33 | findTFBS bg pwm dna thres skip = findTFBSWith sigma bg pwm dna thres skip 34 | where 35 | sigma = optimalScoresSuffix bg pwm 36 | {-# INLINE findTFBS #-} 37 | 38 | findTFBSWith :: Monad m 39 | => U.Vector Double -- ^ best possible match score of suffixes 40 | -> Bkgd 41 | -> PWM 42 | -> DNA a 43 | -> Double 44 | -> Bool -- ^ whether to skip ambiguous sequences. Recommend: True 45 | -- in most cases 46 | -> ConduitT i (Int, Double) m () 47 | findTFBSWith sigma bg pwm dna thres skip = loop 0 48 | where 49 | loop !i | i >= l - n + 1 = return () 50 | | otherwise = do let (d, sc) = searchFn bg pwm sigma dna i thres 51 | if d == n - 1 52 | then yield (i, sc) >> loop (i+1) 53 | else loop (i+1) 54 | l = Seq.length dna 55 | n = size pwm 56 | searchFn | skip = lookAheadSearch' 57 | | otherwise = lookAheadSearch 58 | {-# INLINE findTFBSWith #-} 59 | 60 | {- 61 | -- | a parallel version of findTFBS 62 | findTFBS' :: Bkgd 63 | -> PWM 64 | -> DNA a 65 | -> Double 66 | -> Bool 67 | -> [Int] 68 | findTFBS' bg pwm dna th skip = concat $ parMap rdeepseq f [0,step..l-n+1] 69 | where 70 | f x = loop x 71 | where 72 | loop i | i >= x+step || i >= l-n+1 = [] 73 | | otherwise = let d = fst $ searchFn bg pwm sigma dna i th 74 | in if d == n-1 75 | then i : loop (i+1) 76 | else loop (i+1) 77 | sigma = optimalScoresSuffix bg pwm 78 | l = Seq.length dna 79 | n = size pwm 80 | step = 500000 81 | searchFn | skip = lookAheadSearch' 82 | | otherwise = lookAheadSearch 83 | {-# INLINE findTFBS' #-} 84 | -} 85 | 86 | -- | use naive search 87 | findTFBSSlow :: Monad m => Bkgd -> PWM -> DNA a -> Double -> ConduitT i (Int, Double) m () 88 | findTFBSSlow bg pwm dna thres = scores' bg pwm dna .| loop 0 
89 | where 90 | loop i = do v <- await 91 | case v of 92 | Just v' -> if v' >= thres then yield (i, v') >> loop (i+1) 93 | else loop (i+1) 94 | _ -> return () 95 | {-# INLINE findTFBSSlow #-} 96 | 97 | -- | the largest possible match scores starting from every position of a DNA sequence 98 | maxMatchSc :: Bkgd -> PWM -> DNA a -> Double 99 | maxMatchSc bg pwm dna = loop (-1/0) 0 100 | where 101 | loop !max' !i | i >= l - n + 1 = max' 102 | | otherwise = if d == n - 1 then loop sc (i+1) 103 | else loop max' (i+1) 104 | where 105 | (d, sc) = lookAheadSearch bg pwm sigma dna i max' 106 | sigma = optimalScoresSuffix bg pwm 107 | l = Seq.length dna 108 | n = size pwm 109 | {-# INLINE maxMatchSc #-} 110 | 111 | optimalScoresSuffix :: Bkgd -> PWM -> U.Vector Double 112 | optimalScoresSuffix (BG (a, c, g, t)) (PWM _ pwm) = 113 | U.fromList . tail . map (last sigma -) $ sigma 114 | where 115 | sigma = scanl f 0 $ M.toRows pwm 116 | f !acc xs = let (i, s) = U.maximumBy (comparing snd) . 117 | U.zip (U.fromList ([0..3] :: [Int])) $ xs 118 | in acc + case i of 119 | 0 -> log $ s / a 120 | 1 -> log $ s / c 121 | 2 -> log $ s / g 122 | 3 -> log $ s / t 123 | _ -> undefined 124 | {-# INLINE optimalScoresSuffix #-} 125 | 126 | lookAheadSearch :: Bkgd -- ^ background nucleotide distribution 127 | -> PWM -- ^ pwm 128 | -> U.Vector Double -- ^ best possible match score of suffixes 129 | -> DNA a -- ^ DNA sequence 130 | -> Int -- ^ starting location on the DNA 131 | -> Double -- ^ threshold 132 | -> (Int, Double) -- ^ (d, sc_d), the largest d such that sc_d > th_d 133 | lookAheadSearch (BG (a, c, g, t)) pwm sigma dna start thres = loop (0, -1) 0 134 | where 135 | loop (!acc, !th_d) !d 136 | | acc < th_d = (d-2, acc) 137 | | otherwise = if d >= n 138 | then (d-1, acc) 139 | else loop (acc + sc, thres - sigma U.! d) (d+1) 140 | where 141 | sc = case toBS dna `B.index` (start + d) of 142 | 'A' -> log $! matchA / a 143 | 'C' -> log $! matchC / c 144 | 'G' -> log $! 
matchG / g 145 | 'T' -> log $! matchT / t 146 | 'N' -> 0 147 | 'V' -> log $! (matchA + matchC + matchG) / (a + c + g) 148 | 'H' -> log $! (matchA + matchC + matchT) / (a + c + t) 149 | 'D' -> log $! (matchA + matchG + matchT) / (a + g + t) 150 | 'B' -> log $! (matchC + matchG + matchT) / (c + g + t) 151 | 'M' -> log $! (matchA + matchC) / (a + c) 152 | 'K' -> log $! (matchG + matchT) / (g + t) 153 | 'W' -> log $! (matchA + matchT) / (a + t) 154 | 'S' -> log $! (matchC + matchG) / (c + g) 155 | 'Y' -> log $! (matchC + matchT) / (c + t) 156 | 'R' -> log $! (matchA + matchG) / (a + g) 157 | _ -> error "Bio.Motif.Search.lookAheadSearch: invalid nucleotide" 158 | matchA = addSome $ M.unsafeIndex (_mat pwm) (d,0) 159 | matchC = addSome $ M.unsafeIndex (_mat pwm) (d,1) 160 | matchG = addSome $ M.unsafeIndex (_mat pwm) (d,2) 161 | matchT = addSome $ M.unsafeIndex (_mat pwm) (d,3) 162 | addSome !x | x == 0 = pseudoCount 163 | | otherwise = x 164 | pseudoCount = 0.0001 165 | n = size pwm 166 | {-# INLINE lookAheadSearch #-} 167 | 168 | -- | this version skip sequences contain ambiguous bases, like "N" 169 | lookAheadSearch' :: Bkgd -- ^ background nucleotide distribution 170 | -> PWM -- ^ pwm 171 | -> U.Vector Double -- ^ best possible match score of suffixes 172 | -> DNA a -- ^ DNA sequence 173 | -> Int -- ^ starting location on the DNA 174 | -> Double -- ^ threshold 175 | -> (Int, Double) -- ^ (d, sc_d), the largest d such that sc_d > th_d 176 | lookAheadSearch' (BG (a, c, g, t)) pwm sigma dna start thres = loop (0, -1) 0 177 | where 178 | loop (!acc, !th_d) !d 179 | | acc < th_d = (d-2, acc) 180 | | otherwise = if d >= n 181 | then (d-1, acc) 182 | else loop (acc + sc, thres - sigma U.! d) (d+1) 183 | where 184 | sc = case toBS dna `B.index` (start + d) of 185 | 'A' -> log $! matchA / a 186 | 'C' -> log $! matchC / c 187 | 'G' -> log $! matchG / g 188 | 'T' -> log $! 
-- | this version skip sequences contain ambiguous bases, like "N".
-- Any non-ACGT base scores -infinity, so a window containing one can never
-- reach the threshold and is effectively discarded by the caller.
lookAheadSearch' :: Bkgd             -- ^ background nucleotide distribution
                 -> PWM              -- ^ pwm
                 -> U.Vector Double  -- ^ best possible match score of suffixes
                 -> DNA a            -- ^ DNA sequence
                 -> Int              -- ^ starting location on the DNA
                 -> Double           -- ^ threshold
                 -> (Int, Double)    -- ^ (d, sc_d), the largest d such that sc_d > th_d
lookAheadSearch' (BG (a, c, g, t)) pwm sigma dna start thres = loop (0, -1) 0
  where
    -- Same pruning scheme as 'lookAheadSearch': stop as soon as the running
    -- score plus the best possible suffix score falls below the threshold.
    loop (!acc, !th_d) !d
        | acc < th_d = (d-2, acc)
        | otherwise = if d >= n
                          then (d-1, acc)
                          else loop (acc + sc, thres - sigma U.! d) (d+1)
      where
        sc = case toBS dna `B.index` (start + d) of
            'A' -> log $! matchA / a
            'C' -> log $! matchC / c
            'G' -> log $! matchG / g
            'T' -> log $! matchT / t
            -- Ambiguous base: -infinity, so this window can never match.
            _ -> -1 / 0
        matchA = addSome $ M.unsafeIndex (_mat pwm) (d,0)
        matchC = addSome $ M.unsafeIndex (_mat pwm) (d,1)
        matchG = addSome $ M.unsafeIndex (_mat pwm) (d,2)
        matchT = addSome $ M.unsafeIndex (_mat pwm) (d,3)
        -- Replace zero-probability PWM entries so log never sees 0.
        addSome !x | x == 0 = pseudoCount
                   | otherwise = x
        pseudoCount = 0.0001
        n = size pwm
{-# INLINE lookAheadSearch' #-}

-- Experimental spacing-constraint analysis, currently disabled.
{-
data SpaceDistribution = SpaceDistribution
    { _motif1 :: Motif
    , _nSites1 :: (Int, Int)
    , _motif2 :: Motif
    , _nSites2 :: (Int, Int)
    , _same :: [(Int, Int)]
    , _opposite :: [(Int, Int)]
    } deriving (Show, Read)

-- | search for spacing constraint between two TFs
spaceConstraint :: [(Motif, Motif)]  -- ^ motifs, names must be unique
                -> Bkgd              -- ^ backgroud nucleotide distribution
                -> Double            -- ^ p-Value for motif finding
                -> Int               -- ^ half window size, typical 5
                -> Int               -- ^ max distance to search, typical 300
                -> DNA a -> [SpaceDistribution]
spaceConstraint pairs bg th w k dna = flip map pairs $ \(a, b) ->
    let (m1, site1) = HM.lookupDefault undefined (_name a) sites
        (m2, site2) = HM.lookupDefault undefined (_name b) sites
        (same, opposite) = spaceConstraintHelper site1 site2 w k
    in SpaceDistribution m1 ((U.length *** U.length) site1) m2
        ((U.length *** U.length) site2) same opposite
  where
    motifs = nubBy ((==) `on` _name) $ concatMap (\(a,b) -> [a,b]) pairs
    findSites (Motif _ pwm) = (fwd, rev)
      where
        fwd = runST $ runConduit $ findTFBS bg pwm dna cutoff True .| sinkVector
        rev = runST $ runConduit $ findTFBS bg pwm' dna cutoff' True .|
                mapC (+ (size pwm - 1)) .| sinkVector
        cutoff = pValueToScore th bg pwm
        cutoff' = pValueToScore th bg pwm'
        pwm' = rcPWM pwm
    sites = HM.fromList $ map (\m -> (_name m, (m, findSites m))) motifs
{-# INLINE spaceConstraint #-}

spaceConstraintHelper :: (U.Vector Int, U.Vector Int)
                      -> (U.Vector Int, U.Vector Int)
                      -> Int
                      -> Int
                      -> ([(Int, Int)], [(Int, Int)])
spaceConstraintHelper (fw1, rv1) (fw2, rv2) w k = (same, oppose)
  where
    rs = let rs' = [-k, -k+2*w+1 .. 0]
         in rs' ++ map (*(-1)) (reverse rs')
    fw2' = S.fromList $ U.toList fw2
    rv2' = S.fromList $ U.toList rv2
    nOverlap :: U.Vector Int -> S.HashSet Int -> Int -> Int -> Int
    nOverlap xs ys w' i = U.foldl' f 0 xs
      where
        f acc x | any (`S.member` ys) [x + i - w' .. x + i + w'] = acc + 1
                | otherwise = acc
    same = zip rs $ zipWith (+) nFF nRR
      where
        nFF = map (nOverlap fw1 fw2' w) rs
        nRR = map (nOverlap rv1 rv2' w) $ reverse rs
    oppose = zip rs $ zipWith (+) nFR nRF
      where
        nFR = map (nOverlap fw1 rv2' w) rs
        nRF = map (nOverlap rv1 fw2' w) $ reverse rs
{-# INLINE spaceConstraintHelper #-}

computePValue :: Double -> [Int] -> [(Int, Double)]
computePValue p xs = zip xs $ map (pValue n p) xs
  where
    n = foldl' (+) 0 xs

pValue :: Int -> Double -> Int -> Double
pValue n p x | n > 2000 = complCumulative (poisson (fromIntegral n * p)) $ fromIntegral x
             | otherwise = complCumulative (binomial n p) $ fromIntegral x
-}
{-# LANGUAGE OverloadedStrings #-}

module Bio.RealWorld.BioGRID
    ( TAB2(..)
    , fetchByGeneNames
    ) where

import Network.HTTP.Conduit
import Data.List
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.Char8 as B
import qualified Data.Text as T

-- | API access key appended to every request.
accessKey :: String
accessKey = "accessKey=6168b8d02b2aa2e9a45af6f3afac4461"

-- | BioGRID REST service endpoint.  No trailing slash: request paths below
-- start with '/', so a trailing slash here produced "//" in the URL.
base :: String
base = "http://webservice.thebiogrid.org"

-- | BioGRID tab2 format
data TAB2 = TAB2
    { _biogridId :: B.ByteString
    , _entrezIdA :: B.ByteString
    , _entrezIdB :: B.ByteString
    , _biogridIdA :: B.ByteString
    , _biogridIdB :: B.ByteString
    , _systematicNameA :: T.Text
    , _systematicNameB :: T.Text
    , _symbolA :: T.Text
    , _symbolB :: T.Text
    , _synonymsA :: [T.Text]
    , _synonymsB :: [T.Text]
    , _experimentalSystemName :: T.Text
    , _experimentalSystemType :: T.Text
    , _firstAuthor :: T.Text
    , _pubmedId :: B.ByteString
    , _organismIdA :: B.ByteString
    , _organismIdB :: B.ByteString
    , _throughput :: T.Text
    , _score :: Maybe Double
    , _ptm :: T.Text
    , _phenotypes :: [T.Text]
    , _qualifications :: [T.Text]
    , _tags :: [T.Text]
    , _source :: T.Text
    } deriving (Show)

-- | Parse one tab-delimited line in BioGRID TAB 2.0 format (24 columns).
parseAsTab2 :: BL.ByteString -> TAB2
parseAsTab2 l = TAB2 (bs 0) (bs 1) (bs 2) (bs 3) (bs 4)
                     (txt 5) (txt 6) (txt 7) (txt 8)
                     (multi 9) (multi 10)
                     (txt 11) (txt 12) (txt 13)
                     (bs 14) (bs 15) (bs 16)
                     (txt 17)
                     (getScore $ BL.unpack $ xs!!18)
                     (txt 19)
                     (multi 20) (multi 21) (multi 22)
                     (txt 23)
  where
    xs = BL.split '\t' l
    -- Field accessors: raw strict bytes, decoded text, '|'-separated list.
    bs i = BL.toStrict $ xs!!i
    txt i = T.pack $ BL.unpack $ xs!!i
    multi i = T.splitOn "|" $ txt i
    -- "-" marks a missing score.
    getScore "-" = Nothing
    getScore x = Just $ read x

-- | retreive first 10,000 records for the given official gene symbols
-- (human interactions only, taxId=9606).
fetchByGeneNames :: [String] -> IO [TAB2]
fetchByGeneNames genes = do
    initReq <- parseRequest $ intercalate "&" [url, geneList, tax, accessKey]
    let request = initReq { method = "GET"
                          , requestHeaders = [("Content-type", "text/plain")]
                          }
    manager <- newManager tlsManagerSettings
    r <- httpLbs request manager
    return $ map parseAsTab2 $ BL.lines $ responseBody r
  where
    -- BUG FIX: "searchNames" was previously misspelled "ture"; the service
    -- ignores unknown parameter values, silently disabling symbol search.
    url = base ++ "/interactions/?searchNames=true&includeInteractors=false"
    geneList = "geneList=" ++ intercalate "|" genes
    tax = "taxId=9606"
{-# INLINE fetchByGeneNames #-}
22 | , as 23 | , (&) 24 | 25 | -- * printing 26 | , showResult 27 | ) where 28 | 29 | import Data.Aeson 30 | import Data.Aeson.Types 31 | import Data.Aeson.Encode.Pretty (encodePretty) 32 | import qualified Data.Aeson.KeyMap as M 33 | import qualified Data.Aeson.Key as M 34 | import qualified Data.ByteString.Lazy.Char8 as B 35 | import qualified Data.ByteString.Char8 as BS 36 | import qualified Data.Sequence as S 37 | import qualified Data.Text as T 38 | import qualified Data.Vector as V 39 | import Data.Maybe (fromMaybe) 40 | import Network.HTTP.Conduit 41 | import Data.Default.Class 42 | 43 | import Bio.RealWorld.ID 44 | 45 | -- | Terms and constraints. 46 | data KeyWords = KeyWords (S.Seq String) (S.Seq String) 47 | 48 | instance Default KeyWords where 49 | def = KeyWords S.empty $ S.fromList ["frame=object", "limit=all"] 50 | 51 | instance Show KeyWords where 52 | show (KeyWords x y) = f x ++ g y 53 | where 54 | f x' | S.null x' = "" 55 | | otherwise = "searchTerm=" ++ foldr1 (\a b -> b ++ ('+':a)) x' ++ "&" 56 | g y' | S.null y' = "" 57 | | otherwise = foldr1 (\a b -> b ++ ('&':a)) y' 58 | 59 | instance Semigroup KeyWords where 60 | (<>) (KeyWords a b) (KeyWords a' b') = KeyWords (a S.>< a') (b S.>< b') 61 | 62 | base :: String 63 | base = "https://www.encodeproject.org/" 64 | 65 | -- | general search using keywords and a set of constraints. Example: 66 | -- search ["chip", "sp1"] ["type=experiment"] 67 | search :: KeyWords -> IO (Either String [Value]) 68 | search kw = do 69 | initReq <- parseRequest url 70 | let request = initReq { method = "GET" 71 | , requestHeaders = [("accept", "application/json")] 72 | } 73 | manager <- newManager tlsManagerSettings 74 | r <- httpLbs request manager 75 | return $ (eitherDecode . responseBody) r >>= 76 | parseEither (withObject "ENCODE_JSON" (.: "@graph")) 77 | where 78 | url = base ++ "search/?" ++ show kw 79 | 80 | showResult :: Value -> IO () 81 | showResult = B.putStrLn . 
encodePretty 82 | 83 | terms :: [String] -> KeyWords 84 | terms xs = KeyWords (S.fromList xs) S.empty 85 | 86 | assayIs :: String -> KeyWords 87 | assayIs x = KeyWords S.empty $ 88 | S.fromList ["type=experiment", "assay_term_name=" ++ x] 89 | 90 | organismIs :: String -> KeyWords 91 | organismIs x = KeyWords S.empty $ 92 | S.fromList ["replicates.library.biosample.donor.organism.scientific_name=" ++ x] 93 | 94 | cellIs :: String -> KeyWords 95 | cellIs x = KeyWords S.empty $ S.fromList ["biosample_term_name=" ++ x] 96 | 97 | -- | accession 98 | queryById :: EncodeAcc -> IO (Either String Value) 99 | queryById acc = jsonFromUrl $ "experiments/" ++ BS.unpack (fromID acc) 100 | 101 | getFile :: FilePath -> String -> IO () 102 | getFile out url = openUrl (base ++ url) "application/octet-stream" >>= 103 | B.writeFile out 104 | 105 | openUrl :: String -> String -> IO B.ByteString 106 | openUrl url datatype = do 107 | initReq <- parseRequest url 108 | let request = initReq { method = "GET" 109 | , requestHeaders = [("accept", BS.pack datatype)] 110 | } 111 | manager <- newManager tlsManagerSettings 112 | r <- httpLbs request manager 113 | return $ responseBody r 114 | 115 | jsonFromUrl :: String -> IO (Either String Value) 116 | jsonFromUrl url = eitherDecode <$> openUrl (base ++ url) "application/json" 117 | 118 | 119 | (|@) :: Value -> T.Text -> Value 120 | (|@) (Object obj) key = fromMaybe (error errMsg) $ M.lookup (M.fromText key) obj 121 | where 122 | errMsg = "No such key: " ++ T.unpack key ++ " In: " ++ show obj 123 | (|@) _ _ = error "not an object" 124 | {-# INLINE (|@) #-} 125 | 126 | (|!) :: Value -> Int -> Value 127 | (|!) (Array ar) i = ar V.! i 128 | (|!) _ _ = error "not an array" 129 | {-# INLINE (|!) #-} 130 | 131 | (&) :: a -> (a -> b) -> b 132 | (&) = flip ($) 133 | {-# INLINE (&) #-} 134 | 135 | as :: FromJSON a => Value -> a 136 | as = getResult . 
{-# LANGUAGE OverloadedStrings #-}
module Bio.RealWorld.Ensembl
    ( lookup
    ) where

import Prelude hiding (lookup)
import Data.Aeson
import Data.List.Split (chunksOf)
import qualified Data.ByteString.Char8 as B
import qualified Data.Aeson.KeyMap as M
import Network.HTTP.Conduit

import Bio.RealWorld.ID (BioID(..), EnsemblID)

base :: String
base = "http://rest.ensembl.org/"

-- | Batch lookup of Ensembl IDs via the POST /lookup/id endpoint, issuing
-- one request per 1000 ids and merging the resulting objects.
lookup :: [EnsemblID] -> IO (Either String Object)
-- BUG FIX: the empty list previously crashed (foldl1 over []); it now
-- returns an empty result object.
lookup [] = return $ Right M.empty
lookup xs = do
    rs <- mapM lookupHelp $ chunksOf 1000 xs
    return $ foldl1 f rs
  where
    -- Merge two Either results; the first decode error wins.
    f a b = do
        a' <- a
        b' <- b
        return $ M.union a' b'

-- | One POST request for at most 1000 ids.
lookupHelp :: [EnsemblID] -> IO (Either String Object)
lookupHelp xs = do
    initReq <- parseRequest url
    let request = initReq { method = "POST"
                          , requestHeaders = [("Content-type", "application/json")]
                          , requestBody = body
                          }
    manager <- newManager tlsManagerSettings
    r <- httpLbs request manager
    return . eitherDecode . responseBody $ r
  where
    url = base ++ "/lookup/id/"
    -- 'show' on a list of ByteStrings happens to produce a quoted,
    -- bracketed list that doubles as a JSON string array.
    ids = B.pack $ show $ map fromID xs
    body = RequestBodyBS $ B.intercalate "" ["{ \"ids\" :", ids, "}"]
{-# INLINE lookupHelp #-}

-- NIH Genomic Data Commons
{-# LANGUAGE OverloadedStrings #-}

module Bio.RealWorld.GDC
    (downloadData) where

import Conduit
import qualified Data.Text as T
import qualified Data.ByteString.Char8 as B
import Data.Maybe (fromJust)
import Network.HTTP.Conduit

baseurl :: String
baseurl = "https://api.gdc.cancer.gov/"

-- | Download data for a UUID into the given directory; the file name is
-- taken from the Content-Disposition response header.
-- NOTE(review): 'fromJust' fails if the header is absent — confirm the API
-- always sets it.
downloadData :: String     -- ^ UUID
             -> FilePath   -- ^ Output dir
             -> IO FilePath
downloadData uuid dir = do
    request <- parseRequest url
    manager <- newManager tlsManagerSettings
    runResourceT $ do
        response <- http request manager
        let filename = T.unpack $ snd $ T.breakOnEnd "filename=" $ T.pack $
                B.unpack $ fromJust $ lookup "Content-Disposition" $
                responseHeaders response
        runConduit $ responseBody response .| sinkFileBS (dir ++ "/" ++ filename)
        return filename
  where
    url = baseurl ++ "data/" ++ uuid
{-# INLINE downloadData #-}
9 | , readGenes 10 | , readGenesC 11 | , getPromoters 12 | , getDomains 13 | ) where 14 | 15 | import Conduit 16 | import qualified Data.ByteString.Char8 as B 17 | import Data.CaseInsensitive (CI, mk) 18 | import qualified Data.HashMap.Strict as M 19 | import Data.List.Ordered (nubSort) 20 | import Data.Maybe (fromMaybe, fromJust, isNothing) 21 | import Lens.Micro 22 | import Data.List (foldl') 23 | import Data.Char (toLower) 24 | import qualified Data.Vector as V 25 | 26 | import Bio.Data.Bed 27 | import Bio.Data.Bed.Types 28 | import Bio.Utils.Misc (readInt) 29 | 30 | data TranscriptType = Coding 31 | | NonCoding 32 | deriving (Show, Eq, Ord) 33 | 34 | -- | GTF's position is 1-based, but here we convert it to 0-based indexing. 35 | data Gene = Gene 36 | { geneName :: !(CI B.ByteString) 37 | , geneId :: !B.ByteString 38 | , geneChrom :: !B.ByteString 39 | , geneLeft :: !Int 40 | , geneRight :: !Int 41 | , geneStrand :: !Bool 42 | , geneTranscripts :: ![Transcript] 43 | } deriving (Show, Eq, Ord) 44 | 45 | data Transcript = Transcript 46 | { transId :: !B.ByteString 47 | , transLeft :: !Int 48 | , transRight :: !Int 49 | , transStrand :: !Bool 50 | , transExon :: ![(Int, Int)] 51 | , transUTR :: ![(Int, Int)] 52 | , transType :: TranscriptType 53 | } deriving (Show, Eq, Ord) 54 | 55 | -- | Read gene information from Gencode GTF file 56 | readGenes :: FilePath -> IO [Gene] 57 | readGenes input = runResourceT $ runConduit $ sourceFile input .| readGenesC 58 | 59 | readGenesC :: Monad m => ConduitT B.ByteString o m [Gene] 60 | readGenesC = do 61 | (genes, transcripts, exons, utrs) <- readElements 62 | let t = M.fromList $ map (\(a,b) -> (transId b, (a,b))) transcripts 63 | return $ nubGene $ M.elems $ foldl' addTranscript 64 | (M.fromList $ map (\x -> (geneId x, x)) genes) $ 65 | M.elems $ foldl' addUTR (foldl' addExon t exons) utrs 66 | {-# INLINE readGenesC #-} 67 | 68 | nubGene :: [Gene] -> [Gene] 69 | nubGene gs = nubSort $ map nubG gs 70 | where 71 | nubG g = g { 
-- | Deduplicate and sort genes, and within each gene its transcripts,
-- exons and UTRs.
nubGene :: [Gene] -> [Gene]
nubGene gs = nubSort $ map nubG gs
  where
    nubG g = g { geneTranscripts = nubSort $ map nubT $ geneTranscripts g}
    nubT t = t { transExon = nubSort $ transExon t
               , transUTR = nubSort $ transUTR t }
{-# INLINE nubGene #-}

-- | Single pass over the GTF stream collecting genes, transcripts, exons
-- and UTRs as separate association lists keyed by gene/transcript id.
readElements :: Monad m => ConduitT B.ByteString o m
    ( [Gene], [(B.ByteString, Transcript)]
    , [(B.ByteString, (Int, Int))], [(B.ByteString, (Int, Int))] )
readElements = linesUnboundedAsciiC .| foldlC f ([], [], [], [])
  where
    f acc l
        -- Skip header/comment lines.
        -- NOTE(review): B.head errors on empty lines — confirm inputs.
        | B.head l == '#' = acc
        | featType == "gene" = _1 %~ (gene:) $ acc
        | featType == "transcript" = _2 %~ ((gid, transcript):) $ acc
        | featType == "exon" = _3 %~ ((tid, exon):) $ acc
        | featType == "utr" = _4 %~ ((tid, utr):) $ acc
        | otherwise = acc
      where
        gene = Gene (mk $ fromMaybe (error "could not find \"gene_name\"") $
            getField "gene_name") gid chr lPos rPos (f7=="+") []
        transcript = Transcript tid lPos rPos (f7=="+") [] [] tTy
        exon = (lPos, rPos)
        utr = (lPos, rPos)
        [chr,_,f3,f4,f5,_,f7,_,f9] = B.split '\t' l
        gid = fromMaybe (error "could not find \"gene_id\"") $ getField "gene_id"
        tid = fromMaybe (error "could not find \"transcript_id\"") $ getField "transcript_id"
        -- Missing "transcript_type" defaults to Coding.
        tTy = case getField "transcript_type" of
            Just "protein_coding" -> Coding
            Nothing -> Coding
            _ -> NonCoding
        -- GTF is 1-based; both coordinates are shifted by -1.
        -- NOTE(review): the end also gets -1, i.e. 0-based *inclusive*
        -- (unlike BED's half-open convention) — confirm downstream usage.
        lPos = readInt f4 - 1
        rPos = readInt f5 - 1
        featType = B.map toLower f3
        -- Attribute field format: key "value"; key "value"; …
        -- B.drop 2 removes the space and opening quote, B.init the closing quote.
        getField x = fmap (B.init . B.drop 2) $ lookup x $
            map (B.break isSpace . strip) $ B.split ';' f9
        strip = fst . B.spanEnd isSpace . B.dropWhile isSpace
        isSpace = (== ' ')
{-# INLINE readElements #-}

-- | Attach an exon interval to its transcript (no-op for unknown ids).
addExon :: M.HashMap B.ByteString (a, Transcript)
        -> (B.ByteString, (Int, Int))
        -> M.HashMap B.ByteString (a, Transcript)
addExon m (key, val) = M.adjust (\(x, trans) ->
    (x, trans{transExon = val : transExon trans})) key m
{-# INLINE addExon #-}

-- | Attach a UTR interval to its transcript (no-op for unknown ids).
addUTR :: M.HashMap B.ByteString (a, Transcript)
       -> (B.ByteString, (Int, Int))
       -> M.HashMap B.ByteString (a, Transcript)
addUTR m (key, val) = M.adjust (\(x, trans) ->
    (x, trans{transUTR = val : transUTR trans})) key m
{-# INLINE addUTR #-}

-- | Attach a transcript to its gene (no-op for unknown gene ids).
addTranscript :: M.HashMap B.ByteString Gene
              -> (B.ByteString, Transcript)
              -> M.HashMap B.ByteString Gene
addTranscript m (key, val) = M.adjust (\gene ->
    gene{geneTranscripts = val : geneTranscripts gene}) key m
{-# INLINE addTranscript #-}

-- | One promoter window per distinct TSS (the gene boundary plus every
-- transcript start), oriented by strand and clipped at 0 on the left.
getPromoters :: Int   -- ^ upstream
             -> Int   -- ^ downstream
             -> Gene
             -> [BEDExt BED3 (Int, CI B.ByteString)]
getPromoters up down Gene{..} = map g $ nubSort tss
  where
    g x | geneStrand = BEDExt (asBed geneChrom (max 0 $ x - up) (x + down)) (x, geneName)
        | otherwise = BEDExt (asBed geneChrom (max 0 $ x - down) (x + up)) (x, geneName)
    tss | geneStrand = geneLeft : map transLeft geneTranscripts
        | otherwise = geneRight : map transRight geneTranscripts
{-# INLINE getPromoters #-}
size) $ concatMap f $ triplet $ 152 | [Nothing] ++ map Just basal ++ [Nothing] 153 | where 154 | f (left, Just bed, right) = 155 | [ chromStart .~ leftPos $ chromEnd .~ s $ bed 156 | , chromStart .~ e $ chromEnd .~ rightPos $ bed ] 157 | where 158 | chr = bed^.chrom 159 | s = bed^.chromStart 160 | e = bed^.chromEnd 161 | leftPos 162 | | isNothing left || chr /= fromJust left ^. chrom = max (s - ext) 0 163 | | otherwise = min s $ max (s - ext) $ fromJust left ^. chromEnd 164 | rightPos 165 | | isNothing right || chr /= fromJust right ^. chrom = e + ext -- TODO: bound check 166 | | otherwise = max e $ min (e + ext) $ fromJust right ^. chromStart 167 | f _ = undefined 168 | triplet (x1:x2:x3:xs) = (x1,x2,x3) : triplet xs 169 | triplet _ = [] 170 | basal = V.toList $ fromSorted $ sortBed genes 171 | {-# INLINE getDomains #-} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/RealWorld/ID.hs: -------------------------------------------------------------------------------- 1 | module Bio.RealWorld.ID where 2 | 3 | import qualified Data.ByteString.Char8 as B 4 | 5 | class BioID a where 6 | fromID :: a -> B.ByteString 7 | toID :: B.ByteString -> a 8 | 9 | newtype UniprotID = UniprotID B.ByteString deriving (Show, Eq) 10 | 11 | newtype UCSCID = UCSCID B.ByteString deriving (Show, Eq) 12 | 13 | newtype GOID = GOID B.ByteString deriving (Show, Eq) 14 | 15 | -- | ENCODE Accession 16 | newtype EncodeAcc = EncodeAcc B.ByteString deriving (Show, Eq) 17 | 18 | -- | Ensembl ID 19 | newtype EnsemblID = EnsemblID B.ByteString deriving (Show, Eq) 20 | 21 | instance BioID EncodeAcc where 22 | fromID (EncodeAcc x) = x 23 | toID = EncodeAcc 24 | 25 | instance BioID EnsemblID where 26 | fromID (EnsemblID x) = x 27 | toID = EnsemblID 28 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/RealWorld/Reactome.hs: 
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}

module Bio.RealWorld.Reactome
    ( getPathways
    ) where

import Data.Aeson
import GHC.Generics (Generic)
import qualified Data.Text as T
import Network.HTTP.Simple

base :: String
base = "https://reactome.org/ContentService"

-- | A generic Reactome content-service object; field names mirror the
-- JSON keys, so (de)serialization is derived via Generic.
data Obj = Obj
    { className :: Maybe T.Text
    , dbId :: Int
    , displayName :: T.Text
    , schemaClass :: Maybe T.Text
    , stId :: Maybe T.Text
    , stIdVersion :: Maybe T.Text
    } deriving (Show, Generic)

instance ToJSON Obj
instance FromJSON Obj

-- | All Reactome top level pathways for a species (name or taxon id).
getPathways :: String -> IO [Obj]
getPathways species = do
    req <- parseRequest url
    response <- httpJSON req
    return $ getResponseBody response
  where
    url = base ++ "/data/pathways/top/" ++ species

-- Unfinished pathway-analysis draft, intentionally disabled.
{-
pathwayAnalysis :: [B.ByteString]   -- ^ A list of identifiers
                -> IO B.ByteString
pathwayAnalysis ids = do
    initReq <- parseRequest base
    let request = urlEncodedBody [] initReq
    manager <- newManager tlsManagerSettings
    r <- fmap M.fromList $ runResourceT $ do
        response <- http request manager
        runConduit $ responseBody response .| linesUnboundedAsciiC .|
            (dropC 1 >> mapC ((\[a,b] -> (a,b)) . B.split '\t')) .| sinkList
    return $ map (flip M.lookup r) ids
  where
    params =
-}

{-# LANGUAGE OverloadedStrings #-}

module Bio.RealWorld.Uniprot
    ( mapID
    ) where

import Conduit
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Strict as M
import Network.HTTP.Conduit

-- NOTE(review): the uploadlists endpoint is the legacy Uniprot ID-mapping
-- service — verify it is still reachable.
base :: String
base = "http://www.uniprot.org/uploadlists/"

-- | Map identifiers between databases via Uniprot's ID-mapping service.
-- The tab-separated response (header dropped) is loaded into a map; the
-- result preserves the order of the query, Nothing for unmapped ids.
mapID :: [B.ByteString]   -- ^ A list of IDs
      -> B.ByteString     -- ^ From database
      -> B.ByteString     -- ^ To database
      -> IO [Maybe B.ByteString]
mapID ids from to = do
    initReq <- parseRequest base
    let request = setQueryString query initReq
            { method = "GET"
            , requestHeaders = [("User-Agent", "kk@test.org")]
            }
    manager <- newManager tlsManagerSettings
    r <- fmap M.fromList $ runResourceT $ do
        response <- http request manager
        runConduit $ responseBody response .| linesUnboundedAsciiC .|
            (dropC 1 >> mapC ((\[a,b] -> (a,b)) . B.split '\t')) .| sinkList
    return $ map (flip M.lookup r) ids
  where
    query = [ ("from", Just from)
            , ("to", Just to)
            , ("format", Just "tab")
            , ("query", Just $ B.unwords ids)
            ]
17 | -- * DNA related functions 18 | , rc 19 | , gcContent 20 | , nucleotideFreq 21 | ) where 22 | 23 | import qualified Data.ByteString.Char8 as B 24 | import Data.Char8 (toUpper) 25 | import qualified Data.HashMap.Strict as M 26 | import qualified Data.HashSet as S 27 | import Data.Proxy (Proxy (..)) 28 | import Prelude hiding (length) 29 | 30 | -- | Alphabet defined by http://www.chem.qmul.ac.uk/iupac/ 31 | -- | Standard unambiguous alphabet 32 | data Basic 33 | 34 | -- | full IUPAC alphabet, including ambiguous letters 35 | data IUPAC 36 | 37 | -- | extended alphabet 38 | data Ext 39 | 40 | -- | DNA sequence 41 | newtype DNA alphabet = DNA B.ByteString 42 | 43 | -- | RNA sequence 44 | newtype RNA alphabet = RNA B.ByteString 45 | 46 | -- | Peptide sequence 47 | newtype Peptide alphabet = Peptide B.ByteString 48 | 49 | instance Show (DNA a) where 50 | show (DNA s) = B.unpack s 51 | 52 | instance Semigroup (DNA a) where 53 | (<>) (DNA x) (DNA y) = DNA (x <> y) 54 | 55 | instance Monoid (DNA a) where 56 | mempty = DNA B.empty 57 | mconcat dnas = DNA . B.concat . map toBS $ dnas 58 | 59 | class BioSeq' s where 60 | toBS :: s a -> B.ByteString 61 | unsafeFromBS :: B.ByteString -> s a 62 | 63 | slice :: Int -> Int -> s a -> s a 64 | 65 | length :: s a -> Int 66 | length = B.length . toBS 67 | {-# MINIMAL toBS, slice, unsafeFromBS #-} 68 | 69 | instance BioSeq' DNA where 70 | toBS (DNA s) = s 71 | unsafeFromBS = DNA 72 | slice i l (DNA s) = DNA . B.take l . B.drop i $ s 73 | 74 | instance BioSeq' RNA where 75 | toBS (RNA s) = s 76 | unsafeFromBS = RNA 77 | slice i l (RNA s) = RNA . B.take l . B.drop i $ s 78 | 79 | instance BioSeq' Peptide where 80 | toBS (Peptide s) = s 81 | unsafeFromBS = Peptide 82 | slice i l (Peptide s) = Peptide . B.take l . 
B.drop i $ s 83 | 84 | class BioSeq' seq => BioSeq seq alphabet where 85 | alphabet :: Proxy (seq alphabet) -> S.HashSet Char 86 | fromBS :: B.ByteString -> Either String (seq alphabet) 87 | fromBS input = case B.mapAccumL fun Nothing input of 88 | (Nothing, r) -> Right $ unsafeFromBS r 89 | (Just e, _) -> Left $ "Bio.Seq.fromBS: unknown character: " ++ [e] 90 | where 91 | fun (Just e) x = (Just e, x) 92 | fun Nothing x = let x' = toUpper x 93 | in if x' `S.member` alphabet (Proxy :: Proxy (seq alphabet)) 94 | then (Nothing, x') 95 | else (Just x', x') 96 | {-# MINIMAL alphabet #-} 97 | 98 | instance BioSeq DNA Basic where 99 | alphabet _ = S.fromList "ACGT" 100 | 101 | instance BioSeq DNA IUPAC where 102 | alphabet _ = S.fromList "ACGTNVHDBMKWSYR" 103 | 104 | instance BioSeq DNA Ext where 105 | alphabet _ = undefined 106 | 107 | instance BioSeq RNA Basic where 108 | alphabet _ = S.fromList "ACGU" 109 | 110 | -- | O(n) Reverse complementary of DNA sequence. 111 | rc :: DNA alphabet -> DNA alphabet 112 | rc (DNA s) = DNA . B.map f . B.reverse $ s 113 | where 114 | f x = case x of 115 | 'A' -> 'T' 116 | 'C' -> 'G' 117 | 'G' -> 'C' 118 | 'T' -> 'A' 119 | _ -> x 120 | 121 | -- | O(n) Compute GC content. 122 | gcContent :: DNA alphabet -> Double 123 | gcContent = (\(a,b) -> a / fromIntegral b) . B.foldl' f (0.0,0::Int) . toBS 124 | where 125 | f (!x,!n) c = 126 | let x' = case c of 127 | 'A' -> x 128 | 'C' -> x + 1 129 | 'G' -> x + 1 130 | 'T' -> x 131 | 'H' -> x + 0.25 132 | 'D' -> x + 0.25 133 | 'V' -> x + 0.75 134 | 'B' -> x + 0.75 135 | 'S' -> x + 1 136 | 'W' -> x 137 | _ -> x + 0.5 -- "NMKYR" 138 | in (x', n+1) 139 | 140 | -- | O(n) Compute single nucleotide frequency. 141 | nucleotideFreq :: forall a . BioSeq DNA a => DNA a -> M.HashMap Char Int 142 | nucleotideFreq dna = B.foldl' f m0 . toBS $ dna 143 | where 144 | m0 = M.fromList . zip (S.toList $ alphabet (Proxy :: Proxy (DNA a))) . 
-- | The first 2048 bytes are header. Header consists of a magic string,
-- followed by chromosome information. Example:
-- \nCHR1 START SIZE
-- Genome = open handle + chromosome index + byte size of the header.
data Genome = Genome !Handle !IndexTable !Int

-- | chromosome -> (byte offset within the sequence data, chromosome size)
type IndexTable = M.HashMap B.ByteString (Int, Int)

-- NOTE(review): the magic string appears empty in this view — it may have
-- been stripped from the source; verify against the index files on disk.
magic :: B.ByteString
magic = ""

-- | Open an indexed genome file, validating the magic line and parsing the
-- chromosome index from the second line.
openGenome :: FilePath -> IO Genome
openGenome fl = do
    h <- openFile fl ReadMode
    sig <- B.hGetLine h
    if sig == magic
        then do
            header <- B.hGetLine h
            -- +2 accounts for the two newline characters consumed by hGetLine.
            return $ Genome h (getIndex header) (B.length sig + B.length header + 2)
        else error "Bio.Seq.Query.openGenome: Incorrect format"
  where
    -- The header is a flat list of (name, offset, size) triples.
    getIndex = M.fromList . map f . chunksOf 3 . B.words
      where
        f [k, v, l] = (k, (readInt v, readInt l))
        f _ = error "error"
{-# INLINE openGenome #-}

closeGenome :: Genome -> IO ()
closeGenome (Genome h _ _) = hClose h
{-# INLINE closeGenome #-}

-- | Bracketed open/close around an action on the genome.
withGenome :: FilePath -> (Genome -> IO a) -> IO a
withGenome fl fn = bracket (openGenome fl) closeGenome fn
{-# INLINE withGenome #-}

-- | A query is represented by a tuple: (chr, start, end) and is
-- zero-based index, half-close-half-open
type Query = (B.ByteString, Int, Int)

-- | Retrieve sequence.  Seeks to header + chromosome offset + start and
-- reads (end - start) bytes; the result is alphabet-validated by 'fromBS'.
-- NOTE(review): only the upper bound is checked; a negative start would
-- read from the wrong position.
getSeq :: BioSeq s a => Genome -> Query -> IO (Either String (s a))
getSeq (Genome h index headerSize) (chr, start, end) = case M.lookup chr index of
    Just (chrStart, chrSize) ->
        if end > chrSize
            then return $ Left $ "Bio.Seq.getSeq: out of index: " ++ show end ++
                ">" ++ show chrSize
            else do
                hSeek h AbsoluteSeek $ fromIntegral $ headerSize + chrStart + start
                fromBS <$> B.hGet h (end - start)
    _ -> return $ Left $ "Bio.Seq.getSeq: Cannot find " ++ show chr
{-# INLINE getSeq #-}

-- | Retrieve whole chromosome.
getChrom :: Genome -> B.ByteString -> IO (Either String (DNA IUPAC))
getChrom g chr = case lookup chr chrSize of
    Just s -> getSeq g (chr, 0, s)
    _ -> return $ Left "Unknown chromosome"
  where
    chrSize = getChrSizes g
{-# INLINE getChrom #-}

-- | Retrieve chromosome size information.
getChrSizes :: Genome -> [(B.ByteString, Int)]
getChrSizes (Genome _ table _) = map (\(k, (_, l)) -> (k, l)) . M.toList $ table
{-# INLINE getChrSizes #-}

-- | Indexing a genome.  Writes magic line, header line, then the raw
-- concatenated chromosome sequences.
mkIndex :: [FilePath]   -- ^ A list of fasta files. Each file can have multiple
                        -- chromosomes.
        -> FilePath     -- ^ output file
        -> IO ()
mkIndex fls outFl = withFile outFl WriteMode $ \outH -> do
    (chrs, dnas) <- (unzip . concat) <$> mapM readSeq fls
    B.hPutStr outH $ B.unlines [magic, mkHeader chrs]
    mapM_ (B.hPutStr outH) dnas
  where
    -- Each fasta record becomes ((chromosome name, length), sequence);
    -- only the first word of the fasta header is kept as the name.
    readSeq fl = runResourceT $ runConduit $ fastaReader fl .| conduit .| sinkList
      where
        conduit = awaitForever $ \(chrName, seqs) -> do
            let dna = B.concat seqs
            yield ((head $ B.words chrName, B.length dna), dna)
{-# INLINE mkIndex #-}
concat) <$> mapM readSeq fls 98 | B.hPutStr outH $ B.unlines [magic, mkHeader chrs] 99 | mapM_ (B.hPutStr outH) dnas 100 | where 101 | readSeq fl = runResourceT $ runConduit $ fastaReader fl .| conduit .| sinkList 102 | where 103 | conduit = awaitForever $ \(chrName, seqs) -> do 104 | let dna = B.concat seqs 105 | yield ((head $ B.words chrName, B.length dna), dna) 106 | {-# INLINE mkIndex #-} 107 | 108 | mkHeader :: [(B.ByteString, Int)] -> B.ByteString 109 | mkHeader xs = B.unwords.fst $ foldl f ([], 0) xs 110 | where 111 | f (s, i) (s', i') = (s ++ [s', B.pack $ show i, B.pack $ show i'], i + i') 112 | {-# INLINE mkHeader #-} 113 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/BitVector.hs: -------------------------------------------------------------------------------- 1 | module Bio.Utils.BitVector 2 | ( BitVector(..) 3 | , BitMVector 4 | , size 5 | , (!) 6 | , set 7 | , clear 8 | , unsafeFreeze 9 | , zeros 10 | , toList 11 | ) where 12 | 13 | import qualified Data.Vector.Unboxed as U 14 | import Control.Monad.Primitive 15 | import qualified Data.Vector.Unboxed.Mutable as UM 16 | import Data.Word 17 | import Data.Bits 18 | import Text.Printf (printf) 19 | 20 | data BitVector = BitVector Int (U.Vector Word8) 21 | 22 | data BitMVector s = BitMVector Int (UM.MVector s Word8) 23 | 24 | size :: BitVector -> Int 25 | size (BitVector n _) = n 26 | 27 | (!) :: BitVector -> Int -> Bool 28 | (!) 
= index 29 | 30 | index :: BitVector -> Int -> Bool 31 | index (BitVector n v) idx 32 | | idx >= n = error $ printf "index out of bounds (%d,%d)" idx n 33 | | otherwise = testBit (v `U.unsafeIndex` i) j 34 | where 35 | i = idx `div` 8 36 | j = idx `mod` 8 37 | 38 | set :: PrimMonad m => BitMVector (PrimState m) -> Int -> m () 39 | set (BitMVector _ mv) idx = UM.modify mv ((flip setBit) j) i 40 | where 41 | i = idx `div` 8 42 | j = idx `mod` 8 43 | 44 | clear :: PrimMonad m => BitMVector (PrimState m) -> Int -> m () 45 | clear (BitMVector _ mv) idx = UM.modify mv ((flip clearBit) j) i 46 | where 47 | i = idx `div` 8 48 | j = idx `mod` 8 49 | 50 | unsafeFreeze :: PrimMonad m => BitMVector (PrimState m) -> m BitVector 51 | unsafeFreeze (BitMVector n mv) = U.unsafeFreeze mv >>= return . BitVector n 52 | 53 | zeros :: PrimMonad m => Int -> m (BitMVector (PrimState m)) 54 | zeros n = UM.replicate n' 0 >>= return . BitMVector n 55 | where 56 | n' = if j == 0 then i else i + 1 57 | i = n `div` 8 58 | j = n `mod` 8 59 | 60 | toList :: BitVector -> [Bool] 61 | toList bv = flip map [0..n-1] $ \i -> bv ! 
i 62 | where 63 | n = size bv -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/Functions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | module Bio.Utils.Functions ( 5 | ihs 6 | , ihs' 7 | , scale 8 | , filterFDR 9 | , slideAverage 10 | , hyperquick 11 | , gaussianKDE 12 | , kld 13 | , jsd 14 | , binarySearch 15 | , binarySearchBy 16 | , binarySearchByBounds 17 | , quantileNormalization 18 | , quantileNormalization' 19 | ) where 20 | 21 | import Data.Bits (shiftR) 22 | import Data.List (foldl', groupBy) 23 | import Data.Function (on) 24 | import Data.Ord (comparing) 25 | import qualified Data.Vector as V 26 | import qualified Data.Vector.Generic as G 27 | import qualified Data.Vector.Unboxed as U 28 | import qualified Data.Matrix as M 29 | import Statistics.Sample (meanVarianceUnb, mean) 30 | import Statistics.Function (sortBy) 31 | import Control.Parallel.Strategies (parMap, rseq) 32 | import Statistics.Sample.KernelDensity (kde) 33 | 34 | -- | inverse hyperbolic sine transformation 35 | ihs :: Double -- ^ θ, determine the shape of the function 36 | -> Double -- ^ input 37 | -> Double 38 | ihs !θ !x | θ == 0 = x 39 | | otherwise = log (θ * x + sqrt (θ * θ * x * x + 1)) / θ 40 | {-# INLINE ihs #-} 41 | 42 | -- | inverse hyperbolic sine transformation with θ = 1 43 | ihs' :: Double -> Double 44 | ihs' = ihs 1 45 | 46 | -- | scale data to zero mean and 1 variance 47 | scale :: G.Vector v Double => v Double -> v Double 48 | scale xs = G.map (\x -> (x - m) / sqrt s) xs 49 | where 50 | (m,s) = meanVarianceUnb xs 51 | {-# INLINE scale #-} 52 | 53 | -- | given the p-values, filter data by controling FDR 54 | filterFDR :: G.Vector v (a, Double) 55 | => Double -- ^ desired FDR value 56 | -> v (a, Double) -- ^ input data and pvalues 57 | -> v (a, Double) 58 | filterFDR α xs = go n . 
sortBy (comparing snd) $ xs 59 | where 60 | go rank v | rank <= 0 = G.empty 61 | | snd (v `G.unsafeIndex` (rank-1)) <= fromIntegral rank * α / fromIntegral n = G.slice 0 rank v 62 | | otherwise = go (rank-1) v 63 | n = G.length xs 64 | {-# INLINE filterFDR #-} 65 | 66 | -- | Compute the sliding average for each entry in a vector 67 | slideAverage :: (Fractional a, G.Vector v a) 68 | => Int -- ^ size of HALF sliding window, 2 means a total 69 | -- window size is 5 70 | -> v a 71 | -> v a 72 | slideAverage k xs = G.generate n $ \i -> go xs (max 0 (i-k)) (min (n-1) (i+k)) 73 | where 74 | go v i j = let l = j - i + 1 75 | in G.foldl' (+) 0 (G.unsafeSlice i l v) / fromIntegral l 76 | n = G.length xs 77 | {-# INLINE slideAverage #-} 78 | 79 | hyperquick :: Int -> Int -> Int -> Int -> Double 80 | hyperquick x m _n _N = loop (m-2) s s (2*e) 81 | where 82 | loop !k !ak !bk !epsk 83 | | k < _N - (_n-x) - 1 && epsk > e = 84 | let ck = ak / bk 85 | k' = k + 1 86 | jjm = invJm _n x _N k' 87 | bk' = bk * jjm + 1 88 | ak' = ak * jjm 89 | espk' = fromIntegral (_N - (_n - x) - 1 - k') * (ck - ak' / bk') 90 | in loop k' ak' bk' espk' 91 | | otherwise = 1 - (ak / bk - epsk / 2) 92 | s = foldl' (\s' k -> 1 + s' * invJm _n x _N k) 1.0 [x..m-2] 93 | invJm _n _x _N _m = ( 1 - fromIntegral _x / fromIntegral (_m+1) ) / 94 | ( 1 - fromIntegral (_n-1-_x) / fromIntegral (_N-1-_m) ) 95 | e = 1e-20 96 | 97 | -- | Assign weights to the points according to density estimation. 98 | gaussianKDE :: Int -- ^ number of mesh points used in KDE 99 | -> U.Vector Double -> (Double -> Double) 100 | gaussianKDE n xs = \x -> 101 | let i = binarySearch points x 102 | lo = points U.! (i - 1) 103 | lo_d = den U.! (i - 1) 104 | hi = points U.! i 105 | hi_d = den U.! i 106 | hi_w = (x - lo) / (hi - lo) 107 | lo_w = 1 - hi_w 108 | in lo_w * lo_d + hi_w * hi_d 109 | where 110 | (points, den) = kde n xs 111 | 112 | -- | compute the Kullback-Leibler divergence between two valid (not check) probability distributions. 
-- | O(log n).  Return the position of the first element that compares >= the
-- query, searching only within the given inclusive index bounds.
binarySearchByBounds :: G.Vector v e
                     => (e -> a -> Ordering) -> v e -> a -> Int -> Int -> Int
binarySearchByBounds cmp vec e = loop
  where
    loop !l !u
        | u < l = l
        | otherwise = case cmp (vec G.! k) e of
            LT -> loop (k+1) u
            EQ -> loop l (k-1)
            GT -> loop l (k-1)
      where
        -- NOTE: the parentheses are essential.  `shiftR` is infixl 8 and
        -- binds tighter than (+), so the former `u + l `shiftR` 1` parsed as
        -- u + (l `shiftR` 1): since l only changes on the final LT step, the
        -- probe was always the upper bound, silently degrading the search to
        -- a linear scan from the top (O(n), contradicting the O(log n) doc).
        k = (l + u) `shiftR` 1
{-# INLINE binarySearchByBounds #-}
concatMap f . 197 | groupBy ((==) `on` (snd . snd)) . zip averages . G.toList) $ 198 | M.toColumns srtMat 199 | where 200 | f [(a,(b,_))] = [(a,b)] 201 | f xs = let m = mean $ U.fromList $ fst $ unzip xs 202 | in map (\(_,(i,_)) -> (m, i)) xs 203 | srtMat :: M.Matrix (Int, Double) 204 | srtMat = M.fromColumns $ parMap rseq (sortBy (comparing snd) . G.zip (G.enumFromN 0 n)) $ 205 | M.toColumns mat 206 | averages = parMap rseq (mean . snd . G.unzip) $ M.toRows srtMat 207 | n = M.rows mat 208 | {-# INLINE quantileNormalization' #-} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/Misc.hs: -------------------------------------------------------------------------------- 1 | module Bio.Utils.Misc 2 | ( readInt 3 | , readDouble 4 | , bins 5 | , binBySize 6 | , binBySizeLeft 7 | , binBySizeOverlap 8 | ) where 9 | 10 | import Data.ByteString.Char8 (ByteString) 11 | import Data.ByteString.Lex.Fractional (readExponential, readSigned) 12 | import Data.ByteString.Lex.Integral (readDecimal) 13 | import Data.Maybe (fromMaybe) 14 | 15 | readInt :: ByteString -> Int 16 | readInt x = fst . fromMaybe errMsg . readSigned readDecimal $ x 17 | where 18 | errMsg = error $ "readInt: Fail to cast ByteString to Int:" ++ show x 19 | {-# INLINE readInt #-} 20 | 21 | readDouble :: ByteString -> Double 22 | readDouble x = fst . fromMaybe errMsg . readSigned readExponential $ x 23 | where 24 | errMsg = error $ "readDouble: Fail to cast ByteString to Double:" ++ show x 25 | {-# INLINE readDouble #-} 26 | 27 | -- | divide a given half-close-half-open region into fixed size 28 | -- half-close-half-open intervals, discarding leftovers 29 | binBySize :: Int -> (Int, Int) -> [(Int, Int)] 30 | binBySize step (start, end) = let xs = [start, start + step .. end] 31 | in zip xs . 
tail $ xs 32 | {-# INLINE binBySize #-} 33 | 34 | binBySizeOverlap :: Int -> Int -> (Int, Int) -> [(Int, Int)] 35 | binBySizeOverlap step overlap (start, end) 36 | | overlap >= step = error "binBySizeOverlap: overlap > step" 37 | | otherwise = go start 38 | where 39 | go i | i + overlap < end = (i, i + step) : go (i + step - overlap) 40 | | otherwise = [] 41 | {-# INLINE binBySizeOverlap #-} 42 | 43 | -- | Including leftovers, the last bin will be extended to match desirable size 44 | binBySizeLeft :: Int -> (Int, Int) -> [(Int, Int)] 45 | binBySizeLeft step (start, end) = go start 46 | where 47 | go i | i < end = (i, i + step) : go (i + step) 48 | | otherwise = [] 49 | {-# INLINE binBySizeLeft #-} 50 | 51 | -- | divide a given region into k equal size sub-regions, discarding leftovers 52 | bins :: Int -> (Int, Int) -> [(Int, Int)] 53 | bins binNum (start, end) = let k = (end - start) `div` binNum 54 | in take binNum . binBySize k $ (start, end) 55 | {-# INLINE bins #-} 56 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/Overlap.hs: -------------------------------------------------------------------------------- 1 | module Bio.Utils.Overlap 2 | ( overlapFragment 3 | , overlapNucl 4 | , coverage 5 | ) where 6 | 7 | import Bio.Data.Bed 8 | import Conduit 9 | import Lens.Micro ((^.)) 10 | import Control.Monad 11 | import qualified Data.ByteString.Char8 as B 12 | import Data.Function 13 | import qualified Data.HashMap.Strict as M 14 | import qualified Data.IntervalMap.Strict as IM 15 | import Data.List 16 | import qualified Data.Vector.Unboxed as V 17 | import qualified Data.Vector.Unboxed.Mutable as VM 18 | 19 | -- | convert lines of a BED file into a data structure - A hashmap of which the 20 | -- | chromosomes, and values are interval maps. 
21 | toMap :: [(B.ByteString, (Int, Int))] -> M.HashMap B.ByteString (IM.IntervalMap Int Int) 22 | toMap input = M.fromList.map create.groupBy ((==) `on` (fst.fst)) $ zip input [0..] 23 | where 24 | f ((_, x), i) = (toInterval x, i) 25 | create xs = (fst.fst.head $ xs, IM.fromDistinctAscList.map f $ xs) 26 | {-# INLINE toMap #-} 27 | 28 | coverage :: [BED] -- ^ genomic locus in BED format 29 | -> ConduitT () BED IO () -- ^ reads in BED format 30 | -> IO (V.Vector Double, Int) 31 | coverage bin tags = liftM getResult $ runConduit $ tags .| sink 32 | where 33 | sink = do 34 | v <- lift $ VM.replicate (n+1) 0 35 | mapM_C $ \t -> do 36 | let set = M.lookup (t^.chrom) featMap 37 | s = t^.chromStart 38 | e = t^.chromEnd 39 | b = (s, e) 40 | l = e - s + 1 41 | intervals = case set of 42 | Just iMap -> IM.toList . IM.intersecting iMap . toInterval $ b 43 | _ -> [] 44 | forM_ intervals (\interval -> do 45 | let i = snd interval 46 | nucl = overlap b . fst $ interval 47 | VM.write v i . (+nucl) =<< VM.read v i 48 | ) 49 | VM.write v n . (+l) =<< VM.read v n 50 | lift $ V.freeze v 51 | getResult v = (V.zipWith normalize (V.slice 0 n v) featWidth, v V.! n) 52 | featMap = toMap.map (\x -> (x^.chrom, (x^.chromStart, x^.chromEnd))) $ bin 53 | featWidth = V.fromList $ map size bin 54 | n = length bin 55 | overlap (l, u) (IM.ClosedInterval l' u') 56 | | l' >= l = if u' <= u then u'-l'+1 else u-l'+1 57 | | otherwise = if u' <= u then u'-l+1 else u-l+1 58 | overlap _ _ = 0 59 | normalize a b = fromIntegral a / fromIntegral b 60 | 61 | overlapFragment, overlapNucl :: 62 | [(Int, Int)] -- ^ Ascending order list 63 | -> [(Int, Int)] -- ^ tags in any order 64 | -> V.Vector Int 65 | overlapFragment xs ts = V.create (VM.replicate n 0 >>= go ts) 66 | where 67 | n = length xs 68 | iMap = IM.fromAscList $ zip (map toInterval xs) [0..] 69 | go ts' v = do 70 | forM_ ts' (\x -> do 71 | let indices = IM.elems . IM.intersecting iMap . toInterval $ x 72 | forM_ indices (\i -> VM.write v i . 
-- | Wrap already-sorted data in the 'Sorted' type, detecting whether the
-- ordering is ascending, descending, or all-equal.  Raises an error if the
-- data is not sorted in a single direction.
toSorted :: (F.Foldable f, Ord a) => f a -> Sorted f a
toSorted xs = Sorted direction xs
  where
    -- The fold carries (comparison against the previous element,
    -- direction established so far).  EQ means "no direction yet".
    direction = snd $ F.foldl' step (const EQ, EQ) xs
    step (cmpPrev, dir) cur
        | dir == EQ = (compare cur, dir')
        | dir' == dir || dir' == EQ = (compare cur, dir)
        | otherwise = error "data is not sorted"
      where
        dir' = cmpPrev cur
show (head bed, head bed') 44 | 45 | sortedBamToBedPETest :: Assertion 46 | sortedBamToBedPETest = do 47 | bedpe <- readBedPE "tests/data/pairedend.bedpe" :: IO [(BED3, BED3)] 48 | bedpe' <- do 49 | let input = "tests/data/pairedend.bam" 50 | header <- getBamHeader input 51 | runResourceT $ runConduit $ streamBam input .| 52 | sortedBamToBedPE header .| 53 | mapC (\(x,y) -> (convert x, convert y)) .| sinkList 54 | forM_ (zip bedpe bedpe') $ \(b1, b2) -> (b1 == b2 || b1 == swap b2) @? show (b1,b2) 55 | where 56 | readBedPE fl = do 57 | c <- B.readFile fl 58 | return $ map (f . B.split '\t') $ B.lines c 59 | f (f1:f2:f3:f4:f5:f6:_) = ( asBed f1 (readInt f2) (readInt f3) 60 | , asBed f4 (readInt f5) (readInt f6) ) 61 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Bed.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Tests.Bed (tests) where 4 | 5 | import Bio.Data.Bed 6 | import Bio.Data.Bed.Types 7 | import Bio.Data.Bed.Utils 8 | import Bio.Utils.BitVector hiding (size) 9 | import Lens.Micro 10 | import Conduit 11 | import Data.Function (on) 12 | import Data.List (sortBy, sort) 13 | import qualified Data.Vector as V 14 | import Test.Tasty 15 | import Test.Tasty.HUnit 16 | import qualified Data.HashMap.Strict as M 17 | import Data.Ord 18 | import Data.Maybe 19 | import Bio.RealWorld.GENCODE 20 | import Data.Conduit.Zlib 21 | 22 | tests :: TestTree 23 | tests = testGroup "Test: Bio.Data.Bed" 24 | [ testCase "sortBed" sortBedTest 25 | , testCase "split" splitBedTest 26 | , testCase "splitOverlapped" splitOverlappedTest 27 | , testCase "mergeBed" mergeBedTest 28 | , testCase "intersectBed" intersectBedTest 29 | , testCase "baseMap" baseMapTest 30 | , testCase "domain assignment" geneDomainTest 31 | ] 32 | 33 | sortBedTest :: Assertion 34 | sortBedTest = do 35 | beds <- runResourceT $ runConduit $ 36 | 
streamBedGzip "tests/data/peaks.bed.gz" .| sinkList :: IO [BED3] 37 | let (Sorted actual) = sortBed beds 38 | expect <- runResourceT $ runConduit $ 39 | streamBedGzip "tests/data/peaks.sorted.bed.gz" .| sinkVector 40 | expect @=? actual 41 | 42 | splitBedTest :: Assertion 43 | splitBedTest = (s1', s2', s3') @=? (s1, s2, s3) 44 | where 45 | bed = asBed "chr1" 0 99 46 | s1 = splitBedBySize 20 bed 47 | s1' = map f [(0, 20), (20, 40), (40, 60), (60, 80)] 48 | s2 = splitBedBySizeLeft 20 bed 49 | s2' = map f [(0, 20), (20, 40), (40, 60), (60, 80), (80, 100)] 50 | s3 = splitBedBySizeOverlap 20 10 bed 51 | s3' = map f [ ( 0, 20), (10, 30), (20, 40), (30, 50), (40, 60) 52 | , (50, 70), (60, 80), (70, 90), (80, 100) ] 53 | f (a,b) = asBed "chr1" a b :: BED3 54 | 55 | splitOverlappedTest :: Assertion 56 | splitOverlappedTest = expect @=? result 57 | where 58 | input :: [BED3] 59 | input = map (\(a,b) -> asBed "chr1" a b) 60 | [ (0, 100) 61 | , (10, 20) 62 | , (50, 150) 63 | , (120, 160) 64 | , (155, 200) 65 | , (155, 220) 66 | , (0, 10) 67 | , (0, 10) 68 | ] 69 | expect = map (\((a,b), x) -> (asBed "chr1" a b, x)) 70 | [ ((0, 10), 3) 71 | , ((10, 20), 2) 72 | , ((20, 50), 1) 73 | , ((50, 100), 2) 74 | , ((100, 120), 1) 75 | , ((120, 150), 2) 76 | , ((150, 155), 1) 77 | , ((155, 160), 3) 78 | , ((160, 200), 2) 79 | , ((200, 220), 1) 80 | ] 81 | --result = sortBy (compareBed `on` fst) $ splitOverlapped length input 82 | result = sortBy (compareBed `on` fst) $ countOverlapped input 83 | 84 | mergeBedTest :: Assertion 85 | mergeBedTest = expect @=? 
result 86 | where 87 | input :: [BED3] 88 | input = map (\(a,b) -> asBed "chr1" a b) 89 | [ (0, 100) 90 | , (10, 20) 91 | , (50, 150) 92 | , (120, 160) 93 | , (155, 200) 94 | , (155, 220) 95 | , (500, 1000) 96 | ] 97 | expect = map (\(a,b) -> asBed "chr1" a b) 98 | [ (0, 220) 99 | , (500, 1000) 100 | ] 101 | result = runIdentity $ runConduit $ mergeBed input .| sinkList 102 | 103 | intersectBedTest :: Assertion 104 | intersectBedTest = do 105 | expect <- runResourceT $ runConduit $ 106 | streamBedGzip "tests/data/example_intersect_peaks.bed.gz" .| sinkList :: IO [BED3] 107 | peaks <- runResourceT $ runConduit $ 108 | streamBedGzip "tests/data/peaks.bed.gz" .| sinkList :: IO [BED3] 109 | result <- runResourceT $ runConduit $ 110 | streamBedGzip "tests/data/example.bed.gz" .| intersectBed peaks .| sinkList 111 | expect @=? result 112 | 113 | baseMapTest :: Assertion 114 | baseMapTest = do 115 | BaseMap bv <- runResourceT $ runConduit $ streamBedGzip "tests/data/example.bed.gz" .| 116 | baseMap [("chr1", 300000000)] 117 | let res = M.lookupDefault undefined "chr1" $ 118 | fmap (map fst . filter snd . zip [0..] . toList) bv 119 | expect <- runResourceT $ runConduit $ streamBedGzip "tests/data/example.bed.gz" .| 120 | concatMapC f .| sinkList 121 | sort expect @=? 
sort res 122 | where 123 | f :: BED -> Maybe Int 124 | f bed = if bed^.chrom == "chr1" 125 | then Just $ if fromJust (bed^.strand) 126 | then bed^.chromStart 127 | else bed^.chromEnd - 1 128 | else Nothing 129 | 130 | geneDomainTest :: Assertion 131 | geneDomainTest = do 132 | genes <- runResourceT $ runConduit $ sourceFile "tests/data/genes.gtf.gz" .| 133 | multiple ungzip .| readGenesC 134 | let promoters = bedToTree const $ zip (concatMap (getPromoters 5000 1000) genes) $ repeat () 135 | domain = getDomains 1000000 $ concatMap (getPromoters 5000 1000) genes 136 | f x = not (isIntersected promoters x) && (c1 || c2) 137 | where 138 | c1 = isIntersected promoters (chromStart %~ (`subtract` 1) $ x) && 139 | isIntersected promoters (chromEnd %~ (+1) $ x) && 140 | size x < 1000000 141 | c2 = isIntersected promoters (chromStart %~ (`subtract` 1) $ chromEnd %~ (+1) $ x) && 142 | size x == 1000000 143 | all f domain @=? True -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Fastq.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Tests.Fastq (tests) where 4 | 5 | import Bio.Data.Fastq 6 | import Data.Conduit.Zlib (ungzip, multiple, gzip) 7 | import qualified Data.ByteString.Char8 as B 8 | import qualified Data.ByteString.Lazy.Char8 as BL 9 | import Lens.Micro 10 | import Conduit 11 | import Test.Tasty 12 | import Test.Tasty.Golden 13 | import Test.Tasty.HUnit 14 | import qualified Data.HashMap.Strict as M 15 | import Data.Ord 16 | import Data.Maybe 17 | 18 | tests :: TestTree 19 | tests = testGroup "Test: Bio.Data.Fastq" 20 | [ testCase "FASTQ IO" fastqIO 21 | ] 22 | 23 | fastqIO :: Assertion 24 | fastqIO = do 25 | a <- fmap B.concat $ runResourceT $ runConduit $ 26 | sourceFile "tests/data/test.fastq.gz" .| multiple ungzip .| sinkList 27 | b <- fmap B.concat $ runResourceT $ runConduit $ 28 | sourceFile 
"tests/data/test_wrap.fastq.gz" .| multiple ungzip .| sinkList 29 | let fq1 = B.concat $ replicate 1000 a 30 | fq2 = B.concat $ replicate 1000 b 31 | r1 = B.concat $ runIdentity $ runConduit $ yield fq1 .| 32 | parseFastqC .| mapC fastqToByteString .| unlinesAsciiC .| sinkList 33 | r2 = B.concat $ runIdentity $ runConduit $ yield fq2 .| 34 | parseFastqC .| mapC fastqToByteString .| unlinesAsciiC .| sinkList 35 | [True, True] @=? [r1 == fq1, r2 == fq1] -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Motif.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Tests.Motif (tests) where 4 | 5 | import qualified Data.ByteString.Char8 as B 6 | import Data.Conduit 7 | import qualified Data.Conduit.List as CL 8 | import Data.Default.Class 9 | import System.Random 10 | import Test.Tasty 11 | import Test.Tasty.HUnit 12 | 13 | import Bio.Data.Fasta 14 | import Bio.Motif 15 | import Bio.Motif.Search 16 | import Bio.Seq 17 | 18 | import qualified Data.Vector.Unboxed as U 19 | 20 | dna :: DNA Basic 21 | dna = case fromBS (B.pack $ map f $ take 5000 $ randomRs (0, 3) (mkStdGen 2)) of 22 | Left msg -> error msg 23 | Right x -> x 24 | where 25 | f :: Int -> Char 26 | f x = case x of 27 | 0 -> 'A' 28 | 1 -> 'C' 29 | 2 -> 'G' 30 | 3 -> 'T' 31 | _ -> undefined 32 | 33 | motifs :: IO [Motif] 34 | motifs = do 35 | m1 <- readFasta' "tests/data/motifs.fasta" 36 | -- m2 <- readMEME "tests/data/motifs.meme" 37 | return m1 38 | 39 | tests :: TestTree 40 | tests = testGroup "Test: Bio.Motif" 41 | [ --testCase "IUPAC converting" toIUPACTest 42 | testCase "TFBS scanning" findTFBSTest 43 | , testCase "Max matching score" maxScTest 44 | , testCase "pValue calculation" pValueTest 45 | , testCase "CDF truncate test" cdfTruncateTest 46 | , testCase "Reverse Complentary" reverseComplentary 47 | ] 48 | 49 | 50 | {- 51 | toIUPACTest :: Assertion 52 | 
toIUPACTest = assertEqual "toIUPAC check" expect actual 53 | where 54 | expect = "SAA" 55 | actual = show . toIUPAC $ pwm 56 | -} 57 | 58 | findTFBSTest :: Assertion 59 | findTFBSTest = do 60 | ms <- motifs 61 | let (Motif _ pwm) = head ms 62 | expect <- findTFBS def pwm dna (0.6 * optimalScore def pwm) True $$ CL.consume 63 | actual <- findTFBSSlow def pwm dna (0.6 * optimalScore def pwm) $$ CL.consume 64 | assertEqual "findTFBS" expect actual 65 | 66 | maxScTest :: Assertion 67 | maxScTest = do 68 | ms <- motifs 69 | let expect = map (\x -> maximum $ scores def (_pwm x) dna) ms 70 | actual = map (\x -> maxMatchSc def (_pwm x) dna) ms 71 | assertEqual "maxMatchSc" expect actual 72 | 73 | pValueTest :: Assertion 74 | pValueTest = do 75 | ms <- motifs 76 | let expect = map (approx . pValueToScoreExact 1e-4 def . _pwm) ms 77 | actual = map (approx . pValueToScore 1e-4 def . _pwm) ms 78 | assertEqual "pValueToScore" expect actual 79 | where 80 | approx x = round $ 10 * x 81 | 82 | cdfTruncateTest :: Assertion 83 | cdfTruncateTest = do 84 | ms <- motifs 85 | let expect = map (pValueToScore 1e-4 def . _pwm) ms 86 | actual = map (pValueToScore' 1e-4 def . 
_pwm) ms 87 | assertEqual "CDF truncate" expect actual 88 | where 89 | pValueToScore' p bg pwm = cdf' (truncateCDF 0.999 $ scoreCDF bg pwm) $ 1 - p 90 | 91 | reverseComplentary :: Assertion 92 | reverseComplentary = do 93 | ms <- motifs 94 | let expect = map (\x -> map approx $ scores def (_pwm x) dna) ms 95 | actual = map (\x -> map approx $ reverse $ scores def (rcPWM $ _pwm x) $ rc dna) ms 96 | assertEqual "reverse complentary" expect actual 97 | where 98 | approx x = round $ 100000 * x -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Seq.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Tests.Seq (tests) where 3 | 4 | import Bio.Seq 5 | import Data.Either 6 | import qualified Data.HashMap.Strict as M 7 | import Test.Tasty 8 | import Test.Tasty.HUnit 9 | 10 | tests :: TestTree 11 | tests = testGroup "Test: Bio.Seq" 12 | [ testCase "nucleotideFreq" testNuclFreq 13 | ] 14 | 15 | testNuclFreq :: Assertion 16 | testNuclFreq = do 17 | let dna = fromRight undefined $ fromBS "ACTTCCCGGGD" :: DNA IUPAC 18 | test = M.lookupDefault undefined 'C' $ nucleotideFreq dna 19 | expected = 4 20 | test @=? expected 21 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Tools.hs: -------------------------------------------------------------------------------- 1 | module Tests.Tools (tests) where 2 | 3 | import Bio.Utils.Functions 4 | import Test.Tasty 5 | import Test.Tasty.HUnit 6 | import qualified Data.Matrix as MU 7 | import Text.Printf (printf) 8 | 9 | tests :: TestTree 10 | tests = testGroup "Test: Bio.Utils" 11 | [ quantileNormalizationTest 12 | ] 13 | 14 | quantileNormalizationTest :: TestTree 15 | quantileNormalizationTest = testGroup "quantile normalization" 16 | [ testCase "case 1" $ y1 @=? 
quantileNormalization x1 17 | , testCase "case 2" $ MU.map (printf "%.2f") y2 @=? 18 | (MU.map (printf "%.2f") (quantileNormalization x2) :: MU.Matrix String) 19 | ] 20 | where 21 | x1 :: MU.Matrix Double 22 | x1 = MU.fromLists 23 | [ [2, 4, 4, 5] 24 | , [5, 14, 4, 7] 25 | , [4, 8, 6, 9] 26 | , [3, 8, 5, 8] 27 | , [3, 9, 3, 5] ] 28 | y1 :: MU.Matrix Double 29 | y1 = MU.fromLists 30 | [ [ 3.5, 3.5, 5.25, 4.25] 31 | , [ 8.5, 8.5, 5.25, 5.5] 32 | , [ 6.5, 5.25, 8.5, 8.5] 33 | , [5.25, 5.25, 6.5, 6.5] 34 | , [5.25, 6.5, 3.5, 4.25] ] 35 | x2 :: MU.Matrix Double 36 | x2 = MU.fromLists 37 | [ [5, 4, 3] 38 | , [2, 1, 4] 39 | , [3, 4, 6] 40 | , [4, 2, 8] ] 41 | y2 :: MU.Matrix Double 42 | y2 = MU.fromLists 43 | [ [ 5.666667, 5.166667, 2 ] 44 | , [ 2, 2, 3] 45 | , [3, 5.166667, 4.666667] 46 | , [4.666667, 3, 5.666667] ] 47 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example.bam -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example_copy.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example_copy.bam 
-------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example_intersect_peaks.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example_intersect_peaks.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/genes.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/genes.gtf.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/motifs.fasta: -------------------------------------------------------------------------------- 1 | >1_ASCCAGGCKGG 2 | 0.768791 0.07577 0.120456 0.034983 3 | 0.057276 0.532522375 0.282802875 0.12739862500000002 4 | 0.048894125 0.796682375 0.09644475 0.05797825 5 | 0.044861750000000006 0.6375725 0.263928 0.05363775 6 | 0.61916725 0.104690625 0.17939824999999998 0.096743875 7 | 0.086404375 0.0745045 0.807213125 0.031878125 8 | 0.08837187499999999 0.07052475 0.828965875 0.012137499999999999 9 | 0.039856875 0.80558425 0.08105799999999999 0.07350025 10 | 0.079739875 0.110598625 0.40112975 0.40853125 11 | 0.03881925 0.12873025 0.781449 0.05100175 12 | 0.136913 0.15827100000000002 0.5818685 0.1229465 13 | 14 | >2_NGCRGGCKGAS 15 | 0.090506 0.471424 0.2324 0.205671 16 | 0.021978 0.040675 0.927348 0.01 17 | 0.014602750000000001 0.9562965 0.012410250000000001 0.01669075 18 | 0.468245 0.021042 0.29159075 0.21912225 19 | 0.0388475 0.04228675 0.887877 0.03098875 20 | 0.05335675 0.096898 0.8321492500000001 0.017596 21 | 0.15436075 0.7520785 0.052595249999999996 0.040965499999999995 22 | 0.02820475 0.07090675 
0.31607725 0.5848115 23 | 0.02995975 0.01687 0.9428795 0.010290750000000001 24 | 0.6324460000000001 0.0383675 0.260754 0.0684325 25 | 0.056907 0.5814265000000001 0.3313415 0.030325 26 | 27 | >3_GGAGGAGGNG 28 | 0.078664 0.0663 0.823079 0.031957 29 | 0.20135799999999998 0.034205 0.7544365 0.01 30 | 0.82030325 0.07725525 0.0680665 0.034375 31 | 0.08712400000000001 0.054251 0.8422565 0.0163685 32 | 0.283462 0.0698595 0.5971305 0.049547999999999995 33 | 0.65613775 0.22489399999999998 0.0696585 0.04930975 34 | 0.0762455 0.164718 0.74042025 0.0186165 35 | 0.11206625 0.05687675 0.81332925 0.01772775 36 | 0.38844975000000004 0.26271175 0.27699425 0.07184399999999999 37 | 0.10369725 0.07755350000000001 0.7556514999999999 0.06309775000000001 38 | 39 | >4_YCGSGANCGSA 40 | 0.088404 0.484429 0.078568 0.348599 41 | 0.04927675 0.6174805 0.25066225 0.0825805 42 | 0.0349955 0.06153675 0.88377325 0.0196945 43 | 0.051606 0.4338045 0.38140925 0.1331805 44 | 0.09071325 0.10709625 0.7679435 0.03424675 45 | 0.82530675 0.1102275 0.03089375 0.033572 46 | 0.44087375 0.267475 0.246572 0.04507925 47 | 0.050244 0.81758975 0.10140475 0.0307615 48 | 0.038650500000000004 0.085901 0.7673414999999999 0.108107 49 | 0.028267999999999998 0.451347 0.444228 0.076157 50 | 0.805 0.052 0.035 0.108 51 | 52 | >5_SGCGTGCRC 53 | 0.052962999999999996 0.322791 0.4390235 0.1852225 54 | 0.149675 0.089726 0.7472715000000001 0.013327499999999999 55 | 0.044135499999999994 0.8364149999999999 0.0258065 0.0936425 56 | 0.0219595 0.01 0.9580405 0.01 57 | 0.027667 0.01 0.020536 0.941797 58 | 0.0118715 0.040971999999999995 0.937156 0.01 59 | 0.022783499999999998 0.8734415 0.053024999999999996 0.05075 60 | 0.486805 0.0319375 0.469651 0.011606 61 | 0.060225499999999994 0.7693025 0.0367025 0.13376949999999999 62 | 63 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/motifs.meme: -------------------------------------------------------------------------------- 1 
| MEME version 4 2 | 3 | ALPHABET= ACGT 4 | 5 | strands: + - 6 | 7 | Background letter frequencies 8 | A 0.303 C 0.183 G 0.209 T 0.306 9 | 10 | MOTIF lexA 11 | letter-probability matrix: alength= 4 w= 12 nsites= 14 E= 3.2e-035 12 | 0.000000 0.000000 1.000000 0.000000 13 | 0.000000 0.000000 0.000000 1.000000 14 | 0.857143 0.000000 0.071429 0.071429 15 | 0.000000 0.071429 0.000000 0.928571 16 | 0.857143 0.000000 0.071429 0.071429 17 | 0.142857 0.000000 0.000000 0.857143 18 | 0.571429 0.071429 0.214286 0.142857 19 | 0.285714 0.285714 0.000000 0.428571 20 | 1.000000 0.000000 0.000000 0.000000 21 | 0.285714 0.214286 0.000000 0.500000 22 | 0.428571 0.500000 0.000000 0.071429 23 | 0.000000 1.000000 0.000000 0.000000 24 | 25 | MOTIF YPL133C+757.pfm 26 | letter-probability matrix: alength= 4 w= 12 nsites= 1 E= 0 27 | 0.25 0.25 0.25 0.25 28 | 0 0.5 0 0.5 29 | 0 1 0 0 30 | 0 1 0 0 31 | 0.333333 0 0.333333 0.333333 32 | 0 0 0 1 33 | 0 0 0 1 34 | 1 0 0 0 35 | 0 1 0 0 36 | 0 1 0 0 37 | 0 0 1 0 38 | 0.25 0.25 0.25 0.25 39 | 40 | MOTIF 7 V_ELK1_01 41 | 42 | letter-probability matrix: alength= 4 w= 12 nsites= 4 E= 0 43 | 0.250000 0.250000 0.250000 0.250000 44 | 0.500000 0.000000 0.250000 0.250000 45 | 0.500000 0.250000 0.250000 0.000000 46 | 0.750000 0.000000 0.250000 0.000000 47 | 0.000000 0.750000 0.250000 0.000000 48 | 0.000000 0.000000 1.000000 0.000000 49 | 1.000000 0.000000 0.000000 0.000000 50 | 0.000000 0.250000 0.000000 0.750000 51 | 0.250000 0.250000 0.250000 0.250000 52 | 0.250000 0.750000 0.000000 0.000000 53 | 0.250000 0.250000 0.500000 0.000000 54 | 0.250000 0.250000 0.000000 0.500000 55 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/pairedend.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/pairedend.bam 
-------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/peaks.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/peaks.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/peaks.sorted.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/peaks.sorted.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/test.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/test.fastq.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/test_wrap.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/test_wrap.fastq.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/test.hs: -------------------------------------------------------------------------------- 1 | import qualified Tests.Bed as Bed 2 | import qualified Tests.Bam as Bam 3 | import qualified Tests.Motif as Motif 4 | import qualified Tests.Fastq as Fastq 5 | import qualified Tests.Seq as Seq 6 | import qualified Tests.Tools as Tools 7 | import Test.Tasty 8 | 9 | main :: IO () 10 | main = defaultMain $ 
testGroup "Main" 11 | [ Bed.tests 12 | , Bam.tests 13 | , Seq.tests 14 | , Motif.tests 15 | , Tools.tests 16 | , Fastq.tests 17 | ] 18 | --------------------------------------------------------------------------------